]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/utils.c
utils: rework fix_stdio_permissions()
[mirror_lxc.git] / src / lxc / utils.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
e3642c43 2
d38dd64a
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE 1
5#endif
7935833c 6#define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
643c1984 7#include <ctype.h>
a1e5280d 8#include <dirent.h>
e3642c43 9#include <errno.h>
a1e5280d 10#include <fcntl.h>
dbaf55a3 11#include <grp.h>
7935833c 12#include <inttypes.h>
a1e5280d 13#include <libgen.h>
b467714b 14#include <pthread.h>
39293f22 15#include <signal.h>
d983b93c 16#include <stddef.h>
a1e5280d
CB
17#include <stdio.h>
18#include <stdlib.h>
61a1d519 19#include <string.h>
e3642c43 20#include <sys/mman.h>
6e4bb2e0 21#include <sys/mount.h>
066210f0
CB
22/* Needs to be after sys/mount.h header */
23#include <linux/fs.h>
a1e5280d
CB
24#include <sys/param.h>
25#include <sys/prctl.h>
26#include <sys/stat.h>
9be53773
SH
27#include <sys/types.h>
28#include <sys/wait.h>
d38dd64a 29#include <unistd.h>
e3642c43 30
d38dd64a 31#include "config.h"
e3642c43 32#include "log.h"
4fef78bc 33#include "lsm/lsm.h"
025ed0f3 34#include "lxclock.h"
c4382ee2 35#include "memory_utils.h"
51d0854c 36#include "namespace.h"
e3db0162 37#include "parse.h"
38e5c2db 38#include "raw_syscalls.h"
b25291da 39#include "syscall_wrappers.h"
981f6029 40#include "utils.h"
e3642c43 41
43f984ea
DJ
42#ifndef HAVE_STRLCPY
43#include "include/strlcpy.h"
44#endif
45
bd583214
DJ
46#ifndef HAVE_STRLCAT
47#include "include/strlcat.h"
48#endif
49
4928c718
SG
50#ifndef O_PATH
51#define O_PATH 010000000
52#endif
53
54#ifndef O_NOFOLLOW
55#define O_NOFOLLOW 00400000
56#endif
57
ac2cecc4 58lxc_log_define(utils, lxc);
e3642c43 59
4295c5de
SH
60/*
61 * if path is btrfs, tries to remove it and any subvolumes beneath it
62 */
63extern bool btrfs_try_remove_subvol(const char *path);
64
41dc7155 65static int _recursive_rmdir(const char *dirname, dev_t pdev,
0cc417b2 66 const char *exclude, int level, bool onedev)
60bf62d4 67{
f1258455
CB
68 __do_closedir DIR *dir = NULL;
69 int failed = 0;
70 bool hadexclude = false;
71 int ret;
74f96976 72 struct dirent *direntp;
d726953a 73 char pathname[PATH_MAX];
60bf62d4
SH
74
75 dir = opendir(dirname);
f1258455
CB
76 if (!dir)
77 return log_error(-1, "Failed to open \"%s\"", dirname);
60bf62d4 78
74f96976 79 while ((direntp = readdir(dir))) {
60bf62d4 80 int rc;
f1258455 81 struct stat mystat;
60bf62d4 82
60bf62d4
SH
83 if (!strcmp(direntp->d_name, ".") ||
84 !strcmp(direntp->d_name, ".."))
85 continue;
86
d726953a
CB
87 rc = snprintf(pathname, PATH_MAX, "%s/%s", dirname, direntp->d_name);
88 if (rc < 0 || rc >= PATH_MAX) {
7be6bcd5 89 ERROR("The name of path is too long");
f1258455 90 failed = 1;
60bf62d4
SH
91 continue;
92 }
18aa217b
SH
93
94 if (!level && exclude && !strcmp(direntp->d_name, exclude)) {
95 ret = rmdir(pathname);
96 if (ret < 0) {
f1258455 97 switch (errno) {
18aa217b 98 case ENOTEMPTY:
7be6bcd5 99 INFO("Not deleting snapshot \"%s\"", pathname);
18aa217b
SH
100 hadexclude = true;
101 break;
102 case ENOTDIR:
103 ret = unlink(pathname);
104 if (ret)
7be6bcd5 105 INFO("Failed to remove \"%s\"", pathname);
18aa217b
SH
106 break;
107 default:
7be6bcd5 108 SYSERROR("Failed to rmdir \"%s\"", pathname);
18aa217b
SH
109 failed = 1;
110 break;
111 }
112 }
7be6bcd5 113
18aa217b
SH
114 continue;
115 }
116
60bf62d4
SH
117 ret = lstat(pathname, &mystat);
118 if (ret) {
7be6bcd5 119 SYSERROR("Failed to stat \"%s\"", pathname);
4295c5de 120 failed = 1;
60bf62d4
SH
121 continue;
122 }
b14fc100 123
4295c5de 124 if (onedev && mystat.st_dev != pdev) {
4295c5de 125 if (btrfs_try_remove_subvol(pathname))
7be6bcd5 126 INFO("Removed btrfs subvolume at \"%s\"", pathname);
60bf62d4 127 continue;
4295c5de 128 }
b14fc100 129
60bf62d4 130 if (S_ISDIR(mystat.st_mode)) {
f1258455
CB
131 if (_recursive_rmdir(pathname, pdev, exclude, level + 1, onedev) < 0)
132 failed = 1;
60bf62d4 133 } else {
066210f0
CB
134 ret = unlink(pathname);
135 if (ret < 0) {
136 __do_close int fd = -EBADF;
137
138 fd = open(pathname, O_RDONLY | O_CLOEXEC | O_NONBLOCK);
139 if (fd >= 0) {
140 /* The file might be marked immutable. */
141 int attr = 0;
142 ret = ioctl(fd, FS_IOC_GETFLAGS, &attr);
143 if (ret < 0)
144 SYSERROR("Failed to retrieve file flags");
145 attr &= ~FS_IMMUTABLE_FL;
146 ret = ioctl(fd, FS_IOC_SETFLAGS, &attr);
147 if (ret < 0)
148 SYSERROR("Failed to set file flags");
149 }
150
151 ret = unlink(pathname);
152 if (ret < 0) {
153 SYSERROR("Failed to delete \"%s\"", pathname);
154 failed = 1;
155 }
60bf62d4
SH
156 }
157 }
158 }
159
4295c5de 160 if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
7be6bcd5 161 SYSERROR("Failed to delete \"%s\"", dirname);
f1258455 162 failed = 1;
60bf62d4
SH
163 }
164
4355ab5f 165 return failed ? -1 : 0;
60bf62d4
SH
166}
167
f1258455
CB
168/*
169 * In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
170 * lxc_rmdir_onedev().
0cc417b2 171 */
f1258455 172static inline bool is_native_overlayfs(const char *path)
0cc417b2 173{
f1258455
CB
174 return has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
175 has_fs_type(path, OVERLAYFS_SUPER_MAGIC);
0cc417b2
SH
176}
177
/*
 * Recursively remove @path, keeping a non-empty top-level entry named
 * @exclude (if given). Unless @path is on overlayfs the walk is confined to
 * the device @path lives on.
 *
 * Returns 0 on success (a missing @path counts as success), -1 if there were
 * any failures.
 */
extern int lxc_rmdir_onedev(const char *path, const char *exclude)
{
	struct stat sb;
	bool onedev = !is_native_overlayfs(path);

	if (lstat(path, &sb) == 0)
		return _recursive_rmdir(path, sb.st_dev, exclude, 0, onedev);

	/* Nothing there means nothing to delete. */
	if (errno == ENOENT)
		return 0;

	return log_error_errno(-1, errno, "Failed to stat \"%s\"", path);
}
196
/*
 * Parse @arg as an unsigned 16-bit number in the given @base (borrowed from
 * iproute2).
 *
 * @val:  receives the parsed value on success; untouched on failure
 * @arg:  string to parse; NULL and "" are rejected
 * @base: numeric base handed to strtoul()
 *
 * Returns 0 on success, -1 if @arg is empty, has trailing characters, does
 * not fit into 16 bits, or strtoul() reported an error.
 */
extern int get_u16(unsigned short *val, const char *arg, int base)
{
	/* Initialised so it is never read indeterminate if strtoul() bails out
	 * (e.g. on an invalid base) without storing an end pointer.
	 */
	char *end = NULL;
	unsigned long parsed;

	if (!arg || !*arg)
		return -1;

	errno = 0;
	parsed = strtoul(arg, &end, base);
	/* Check errno first: only then is @end known to be meaningful. */
	if (errno != 0)
		return -1;

	if (!end || end == arg || *end != '\0')
		return -1;

	if (parsed > 0xFFFF)
		return -1;

	*val = (unsigned short)parsed;

	return 0;
}
215
6099dd5a 216int mkdir_p(const char *dir, mode_t mode)
1b09f2c0 217{
3ce74686
SH
218 const char *tmp = dir;
219 const char *orig = dir;
7be6bcd5 220
c5e7a7ac 221 do {
f1258455 222 __do_free char *makeme = NULL;
6099dd5a 223 int ret;
6099dd5a 224
860fc865
RW
225 dir = tmp + strspn(tmp, "/");
226 tmp = dir + strcspn(dir, "/");
b14fc100 227
d74325c4 228 makeme = strndup(orig, dir - orig);
6099dd5a 229 if (!makeme)
f1258455 230 return ret_set_errno(-1, ENOMEM);
6099dd5a
CB
231
232 ret = mkdir(makeme, mode);
f1258455
CB
233 if (ret < 0 && errno != EEXIST)
234 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
6099dd5a
CB
235
236 } while (tmp != dir);
1b09f2c0 237
98663823 238 return 0;
1b09f2c0 239}
2a59a681 240
44b9ae4b 241char *get_rundir()
9e60f51d 242{
97a696c6 243 char *rundir;
f1258455 244 size_t len;
97a696c6 245 const char *homedir;
9650c735 246 struct stat sb;
9e60f51d 247
b14fc100 248 if (stat(RUNTIME_PATH, &sb) < 0)
9650c735 249 return NULL;
9650c735 250
f1258455
CB
251 if (geteuid() == sb.st_uid || getegid() == sb.st_gid)
252 return strdup(RUNTIME_PATH);
97a696c6
SG
253
254 rundir = getenv("XDG_RUNTIME_DIR");
f1258455
CB
255 if (rundir)
256 return strdup(rundir);
97a696c6 257
7be6bcd5 258 INFO("XDG_RUNTIME_DIR isn't set in the environment");
44b9ae4b 259 homedir = getenv("HOME");
f1258455
CB
260 if (!homedir)
261 return log_error(NULL, "HOME isn't set in the environment");
97a696c6 262
f1258455
CB
263 len = strlen(homedir) + 17;
264 rundir = malloc(sizeof(char) * len);
b14fc100 265 if (!rundir)
266 return NULL;
267
f1258455 268 snprintf(rundir, len, "%s/.cache/lxc/run/", homedir);
9e60f51d
DE
269 return rundir;
270}
271
9be53773
SH
/*
 * Wait for child @pid to terminate, retrying when interrupted by a signal.
 *
 * Returns 0 only if the child exited normally with exit status 0; -1 if
 * waitpid() failed or the child terminated in any other way.
 */
int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno != EINTR)
				return -1;

			continue;
		}

		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
c797a220 293
39293f22
CB
294int wait_for_pidfd(int pidfd)
295{
296 int ret;
297 siginfo_t info = {
298 .si_signo = 0,
299 };
300
301 do {
302 ret = waitid(P_PIDFD, pidfd, &info, __WALL | WEXITED);
303 } while (ret < 0 && errno == EINTR);
304
305 return !ret && WIFEXITED(info.si_status) && WEXITSTATUS(info.si_status) == 0;
306}
307
c797a220
CS
/*
 * Wait for child @pid to terminate, retrying when interrupted by a signal.
 *
 * Returns the raw wait status as filled in by waitpid(), or -1 if waitpid()
 * failed.
 */
int lxc_wait_for_pid_status(pid_t pid)
{
	int status;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno != EINTR)
				return -1;

			continue;
		}

		if (reaped == pid)
			return status;
	}
}
92f023dc 326
fa2bb6ba
SH
327#ifdef HAVE_OPENSSL
328#include <openssl/evp.h>
41246cee 329
f1258455
CB
330static int do_sha1_hash(const char *buf, int buflen, unsigned char *md_value,
331 unsigned int *md_len)
41246cee 332{
fa2bb6ba
SH
333 EVP_MD_CTX *mdctx;
334 const EVP_MD *md;
335
336 md = EVP_get_digestbyname("sha1");
f1258455
CB
337 if (!md)
338 return log_error(-1, "Unknown message digest: sha1\n");
fa2bb6ba 339
b138bfcf 340 mdctx = EVP_MD_CTX_create();
fa2bb6ba
SH
341 EVP_DigestInit_ex(mdctx, md, NULL);
342 EVP_DigestUpdate(mdctx, buf, buflen);
343 EVP_DigestFinal_ex(mdctx, md_value, md_len);
b138bfcf 344 EVP_MD_CTX_destroy(mdctx);
fa2bb6ba
SH
345
346 return 0;
41246cee
DE
347}
348
7c3d3976 349int sha1sum_file(char *fnam, unsigned char *digest, unsigned int *md_len)
3ce74686 350{
f1258455
CB
351 __do_free char *buf = NULL;
352 __do_fclose FILE *f = NULL;
3ce74686 353 int ret;
3ce74686
SH
354 long flen;
355
356 if (!fnam)
357 return -1;
b14fc100 358
025ed0f3 359 f = fopen_cloexec(fnam, "r");
f1258455
CB
360 if (!f)
361 return log_error_errno(-1, errno, "Failed to open template \"%s\"", fnam);
b14fc100 362
f1258455
CB
363 if (fseek(f, 0, SEEK_END) < 0)
364 return log_error_errno(-1, errno, "Failed to seek to end of template");
b14fc100 365
f1258455
CB
366 flen = ftell(f);
367 if (flen < 0)
368 return log_error_errno(-1, errno, "Failed to tell size of template");
b14fc100 369
f1258455
CB
370 if (fseek(f, 0, SEEK_SET) < 0)
371 return log_error_errno(-1, errno, "Failed to seek to start of template");
b14fc100 372
f1258455
CB
373 buf = malloc(flen + 1);
374 if (!buf)
375 return log_error_errno(-1, ENOMEM, "Out of memory");
b14fc100 376
f1258455
CB
377 if (fread(buf, 1, flen, f) != flen)
378 return log_error_errno(-1, errno, "Failed to read template");
b14fc100 379
3ce74686 380 buf[flen] = '\0';
fa2bb6ba 381 ret = do_sha1_hash(buf, flen, (void *)digest, md_len);
3ce74686
SH
382 return ret;
383}
384#endif
61a1d519 385
8bd8018e 386struct lxc_popen_FILE *lxc_popen(const char *command)
ebec9176 387{
3f323207 388 int ret;
ebec9176
AM
389 int pipe_fds[2];
390 pid_t child_pid;
8bd8018e 391 struct lxc_popen_FILE *fp = NULL;
ebec9176 392
8bd8018e
CB
393 ret = pipe2(pipe_fds, O_CLOEXEC);
394 if (ret < 0)
ebec9176 395 return NULL;
ebec9176
AM
396
397 child_pid = fork();
8bd8018e
CB
398 if (child_pid < 0)
399 goto on_error;
400
401 if (!child_pid) {
402 sigset_t mask;
403
404 close(pipe_fds[0]);
405
406 /* duplicate stdout */
407 if (pipe_fds[1] != STDOUT_FILENO)
408 ret = dup2(pipe_fds[1], STDOUT_FILENO);
409 else
410 ret = fcntl(pipe_fds[1], F_SETFD, 0);
411 if (ret < 0) {
412 close(pipe_fds[1]);
03f618af 413 _exit(EXIT_FAILURE);
3f323207
CB
414 }
415
8bd8018e
CB
416 /* duplicate stderr */
417 if (pipe_fds[1] != STDERR_FILENO)
418 ret = dup2(pipe_fds[1], STDERR_FILENO);
419 else
420 ret = fcntl(pipe_fds[1], F_SETFD, 0);
421 close(pipe_fds[1]);
422 if (ret < 0)
03f618af 423 _exit(EXIT_FAILURE);
8bd8018e
CB
424
425 /* unblock all signals */
426 ret = sigfillset(&mask);
427 if (ret < 0)
03f618af 428 _exit(EXIT_FAILURE);
8bd8018e 429
b467714b 430 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
8bd8018e 431 if (ret < 0)
03f618af 432 _exit(EXIT_FAILURE);
8bd8018e 433
ecfa5693 434 /* check if /bin/sh exist, otherwise try Android location /system/bin/sh */
435 if (file_exists("/bin/sh"))
436 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
437 else
438 execl("/system/bin/sh", "sh", "-c", command, (char *)NULL);
439
03f618af 440 _exit(127);
ebec9176
AM
441 }
442
8bd8018e
CB
443 close(pipe_fds[1]);
444 pipe_fds[1] = -1;
ebec9176 445
8bd8018e
CB
446 fp = malloc(sizeof(*fp));
447 if (!fp)
448 goto on_error;
b14fc100 449
7e50ec0b 450 memset(fp, 0, sizeof(*fp));
ebec9176
AM
451
452 fp->child_pid = child_pid;
8bd8018e 453 fp->pipe = pipe_fds[0];
ebec9176 454
7e50ec0b
CB
455 /* From now on, closing fp->f will also close fp->pipe. So only ever
456 * call fclose(fp->f).
457 */
8bd8018e
CB
458 fp->f = fdopen(pipe_fds[0], "r");
459 if (!fp->f)
460 goto on_error;
ebec9176 461
8bd8018e 462 return fp;
ebec9176 463
8bd8018e 464on_error:
7e50ec0b
CB
465 /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
466 * called yet. Otherwise the fd belongs to the file opened by fdopen()
467 * since it isn't dup()ed.
468 */
469 if (fp && !fp->f && pipe_fds[0] >= 0)
8bd8018e
CB
470 close(pipe_fds[0]);
471
472 if (pipe_fds[1] >= 0)
473 close(pipe_fds[1]);
ebec9176 474
7e50ec0b
CB
475 if (fp && fp->f)
476 fclose(fp->f);
477
478 if (fp)
479 free(fp);
480
ebec9176
AM
481 return NULL;
482}
483
8bd8018e 484int lxc_pclose(struct lxc_popen_FILE *fp)
ebec9176 485{
ebec9176 486 pid_t wait_pid;
8bd8018e 487 int wstatus = 0;
ebec9176 488
8bd8018e 489 if (!fp)
ebec9176 490 return -1;
ebec9176
AM
491
492 do {
8bd8018e
CB
493 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
494 } while (wait_pid < 0 && errno == EINTR);
ebec9176 495
8bd8018e
CB
496 fclose(fp->f);
497 free(fp);
498
499 if (wait_pid < 0)
ebec9176 500 return -1;
ebec9176
AM
501
502 return wstatus;
503}
504
508c263e
SH
505int randseed(bool srand_it)
506{
4110345b 507 __do_fclose FILE *f = NULL;
508c263e 508 /*
7be6bcd5 509 * srand pre-seed function based on /dev/urandom
510 */
091045f8 511 unsigned int seed = time(NULL) + getpid();
508c263e 512
4110345b 513 f = fopen("/dev/urandom", "re");
508c263e
SH
514 if (f) {
515 int ret = fread(&seed, sizeof(seed), 1, f);
516 if (ret != 1)
7be6bcd5 517 SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
508c263e
SH
518 }
519
520 if (srand_it)
521 srand(seed);
522
523 return seed;
524}
5d897655
SH
525
526uid_t get_ns_uid(uid_t orig)
527{
4110345b
CB
528 __do_free char *line = NULL;
529 __do_fclose FILE *f = NULL;
5d897655
SH
530 size_t sz = 0;
531 uid_t nsid, hostid, range;
7be6bcd5 532
4110345b 533 f = fopen("/proc/self/uid_map", "re");
f1258455
CB
534 if (!f)
535 return log_error_errno(0, errno, "Failed to open uid_map");
5d897655
SH
536
537 while (getline(&line, &sz, f) != -1) {
538 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
539 continue;
b14fc100 540
4110345b
CB
541 if (hostid <= orig && hostid + range > orig)
542 return nsid += orig - hostid;
5d897655
SH
543 }
544
4110345b 545 return LXC_INVALID_UID;
b962868f
CB
546}
547
548gid_t get_ns_gid(gid_t orig)
549{
4110345b
CB
550 __do_free char *line = NULL;
551 __do_fclose FILE *f = NULL;
b962868f
CB
552 size_t sz = 0;
553 gid_t nsid, hostid, range;
7be6bcd5 554
4110345b 555 f = fopen("/proc/self/gid_map", "re");
f1258455
CB
556 if (!f)
557 return log_error_errno(0, errno, "Failed to open gid_map");
b962868f
CB
558
559 while (getline(&line, &sz, f) != -1) {
560 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
561 continue;
562
4110345b
CB
563 if (hostid <= orig && hostid + range > orig)
564 return nsid += orig - hostid;
b962868f
CB
565 }
566
4110345b 567 return LXC_INVALID_GID;
5d897655 568}
c476bdce
SH
569
/*
 * Report whether @path exists and is a directory (symlinks are followed).
 * Any stat() failure, whatever the reason, is reported as "no".
 */
bool dir_exists(const char *path)
{
	struct stat sb;

	return stat(path, &sb) == 0 && S_ISDIR(sb.st_mode);
}
93c379f0
ÇO
582
/*
 * 64-bit FNV-1a hash over @len bytes at @buf, continuing from @hval.
 *
 * Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 * FNV has good anti collision properties and we're not worried about
 * pre-image resistance or one-way-ness, we're just trying to make the name
 * unique in the 108 bytes of space we have.
 *
 * Returns the updated hash value; with @len == 0 that is @hval unchanged.
 */
uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
	const unsigned char *bytes = buf;
	size_t i;

	for (i = 0; i < len; i++) {
		/* xor the bottom with the current octet */
		hval ^= (uint64_t)bytes[i];

		/* multiply by the 64 bit FNV magic prime mod 2^64; this is
		 * the same as hval + (hval << 1) + (hval << 4) + (hval << 5)
		 * + (hval << 7) + (hval << 8) + (hval << 40)
		 */
		hval *= UINT64_C(0x100000001b3);
	}

	return hval;
}
2c6f3fc9 605
f6310f18 606bool is_shared_mountpoint(const char *path)
2c6f3fc9 607{
c4382ee2
CB
608 __do_fclose FILE *f = NULL;
609 __do_free char *line = NULL;
2c6f3fc9 610 int i;
c4382ee2 611 size_t len = 0;
2c6f3fc9 612
4110345b 613 f = fopen("/proc/self/mountinfo", "re");
2c6f3fc9
SH
614 if (!f)
615 return 0;
b14fc100 616
c4382ee2
CB
617 while (getline(&line, &len, f) > 0) {
618 char *slider1, *slider2;
619
620 for (slider1 = line, i = 0; slider1 && i < 4; i++)
621 slider1 = strchr(slider1 + 1, ' ');
622
623 if (!slider1)
2c6f3fc9 624 continue;
b14fc100 625
c4382ee2
CB
626 slider2 = strchr(slider1 + 1, ' ');
627 if (!slider2)
2c6f3fc9 628 continue;
b14fc100 629
c4382ee2
CB
630 *slider2 = '\0';
631 if (strcmp(slider1 + 1, path) == 0) {
f6310f18 632 /* This is the path. Is it shared? */
c4382ee2
CB
633 slider1 = strchr(slider2 + 1, ' ');
634 if (slider1 && strstr(slider1, "shared:"))
f6310f18 635 return true;
2c6f3fc9
SH
636 }
637 }
b14fc100 638
f6310f18
LT
639 return false;
640}
641
/*
 * Detect whether / is mounted MS_SHARED. The only way I know of to check that
 * is through /proc/self/mountinfo.
 *
 * Only / is checked. If the container rootfs or mount location is MS_SHARED,
 * but not '/', then you're out of luck - figuring that out would be too much
 * work to be worth it.
 *
 * Returns 1 if / is a shared mount point, 0 otherwise.
 */
int detect_shared_rootfs(void)
{
	return is_shared_mountpoint("/") ? 1 : 0;
}
0e6e3a41 656
37ef15bb
CB
657bool switch_to_ns(pid_t pid, const char *ns)
658{
f62cf1d4 659 __do_close int fd = -EBADF;
b280bc38
CB
660 int ret;
661 char nspath[STRLITERALLEN("/proc//ns/")
662 + INTTYPE_TO_STRLEN(pid_t)
663 + LXC_NAMESPACE_NAME_MAX];
51d0854c
DY
664
665 /* Switch to new ns */
b280bc38
CB
666 ret = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/%s", pid, ns);
667 if (ret < 0 || ret >= sizeof(nspath))
51d0854c
DY
668 return false;
669
b280bc38 670 fd = open(nspath, O_RDONLY | O_CLOEXEC);
f1258455
CB
671 if (fd < 0)
672 return log_error_errno(false, errno, "Failed to open \"%s\"", nspath);
51d0854c
DY
673
674 ret = setns(fd, 0);
f1258455
CB
675 if (ret)
676 return log_error_errno(false, errno, "Failed to set process %d to \"%s\" of %d", pid, ns, fd);
b14fc100 677
51d0854c
DY
678 return true;
679}
680
b7f954bb
SH
681/*
682 * looking at fs/proc_namespace.c, it appears we can
683 * actually expect the rootfs entry to very specifically contain
684 * " - rootfs rootfs "
685 * IIUC, so long as we've chrooted so that rootfs is not our root,
686 * the rootfs entry should always be skipped in mountinfo contents.
687 */
fa454c8e 688bool detect_ramfs_rootfs(void)
b7f954bb 689{
4110345b
CB
690 __do_free char *line = NULL;
691 __do_free void *fopen_cache = NULL;
692 __do_fclose FILE *f = NULL;
fa454c8e 693 size_t len = 0;
b7f954bb 694
4110345b
CB
695 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
696 if (!f)
fa454c8e
CB
697 return false;
698
699 while (getline(&line, &len, f) != -1) {
4110345b
CB
700 int i;
701 char *p, *p2;
702
fa454c8e
CB
703 for (p = line, i = 0; p && i < 4; i++)
704 p = strchr(p + 1, ' ');
b7f954bb
SH
705 if (!p)
706 continue;
b14fc100 707
fa454c8e 708 p2 = strchr(p + 1, ' ');
b7f954bb
SH
709 if (!p2)
710 continue;
711 *p2 = '\0';
fa454c8e 712 if (strcmp(p + 1, "/") == 0) {
1a0e70ac 713 /* This is '/'. Is it the ramfs? */
fa454c8e 714 p = strchr(p2 + 1, '-');
4110345b 715 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
fa454c8e 716 return true;
b7f954bb
SH
717 }
718 }
b14fc100 719
fa454c8e 720 return false;
b7f954bb
SH
721}
722
37ef15bb
CB
723char *on_path(const char *cmd, const char *rootfs)
724{
f1258455
CB
725 __do_free char *path = NULL;
726 char *entry = NULL;
d726953a 727 char cmdpath[PATH_MAX];
0e6e3a41
SG
728 int ret;
729
730 path = getenv("PATH");
731 if (!path)
8afb3e61 732 return NULL;
0e6e3a41
SG
733
734 path = strdup(path);
735 if (!path)
8afb3e61 736 return NULL;
0e6e3a41 737
f1258455 738 lxc_iterate_parts(entry, path, ":") {
9d9c111c 739 if (rootfs)
d726953a 740 ret = snprintf(cmdpath, PATH_MAX, "%s/%s/%s", rootfs,
37ef15bb 741 entry, cmd);
9d9c111c 742 else
d726953a
CB
743 ret = snprintf(cmdpath, PATH_MAX, "%s/%s", entry, cmd);
744 if (ret < 0 || ret >= PATH_MAX)
84c5549b 745 continue;
0e6e3a41 746
f1258455 747 if (access(cmdpath, X_OK) == 0)
8afb3e61 748 return strdup(cmdpath);
0e6e3a41
SG
749 }
750
8afb3e61 751 return NULL;
0e6e3a41 752}
76a26f55 753
12983ba4
SH
/* Returns whatever file_exists() reports for /proc/self/ns/cgroup. */
bool cgns_supported(void)
{
	static const char cgroup_ns_path[] = "/proc/self/ns/cgroup";

	return file_exists(cgroup_ns_path);
}
758
9d9c111c
SH
759/* historically lxc-init has been under /usr/lib/lxc and under
760 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
761 */
762char *choose_init(const char *rootfs)
763{
764 char *retv = NULL;
370ec268
SF
765 const char *empty = "",
766 *tmp;
9d9c111c 767 int ret, env_set = 0;
9d9c111c
SH
768
769 if (!getenv("PATH")) {
770 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
771 SYSERROR("Failed to setenv");
b14fc100 772
9d9c111c
SH
773 env_set = 1;
774 }
775
776 retv = on_path("init.lxc", rootfs);
777
7be6bcd5 778 if (env_set)
9d9c111c
SH
779 if (unsetenv("PATH"))
780 SYSERROR("Failed to unsetenv");
9d9c111c
SH
781
782 if (retv)
783 return retv;
784
785 retv = malloc(PATH_MAX);
786 if (!retv)
787 return NULL;
788
789 if (rootfs)
370ec268 790 tmp = rootfs;
9d9c111c 791 else
370ec268
SF
792 tmp = empty;
793
794 ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
9d9c111c 795 if (ret < 0 || ret >= PATH_MAX) {
7be6bcd5 796 ERROR("The name of path is too long");
9d9c111c
SH
797 goto out1;
798 }
b14fc100 799
e57cd7e9 800 if (access(retv, X_OK) == 0)
9d9c111c
SH
801 return retv;
802
370ec268 803 ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
9d9c111c 804 if (ret < 0 || ret >= PATH_MAX) {
7be6bcd5 805 ERROR("The name of path is too long");
9d9c111c
SH
806 goto out1;
807 }
b14fc100 808
e57cd7e9 809 if (access(retv, X_OK) == 0)
9d9c111c
SH
810 return retv;
811
370ec268 812 ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
9d9c111c 813 if (ret < 0 || ret >= PATH_MAX) {
7be6bcd5 814 ERROR("The name of path is too long");
9d9c111c
SH
815 goto out1;
816 }
b14fc100 817
e57cd7e9 818 if (access(retv, X_OK) == 0)
9d9c111c
SH
819 return retv;
820
370ec268 821 ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
9d9c111c 822 if (ret < 0 || ret >= PATH_MAX) {
7be6bcd5 823 ERROR("The name of path is too long");
9d9c111c
SH
824 goto out1;
825 }
b14fc100 826
e57cd7e9 827 if (access(retv, X_OK) == 0)
9d9c111c
SH
828 return retv;
829
830 /*
831 * Last resort, look for the statically compiled init.lxc which we
832 * hopefully bind-mounted in.
833 * If we are called during container setup, and we get to this point,
834 * then the init.lxc.static from the host will need to be bind-mounted
835 * in. So we return NULL here to indicate that.
836 */
837 if (rootfs)
838 goto out1;
839
840 ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
841 if (ret < 0 || ret >= PATH_MAX) {
842 WARN("Nonsense - name /lxc.init.static too long");
843 goto out1;
844 }
b14fc100 845
e57cd7e9 846 if (access(retv, X_OK) == 0)
9d9c111c
SH
847 return retv;
848
849out1:
850 free(retv);
851 return NULL;
852}
735f2c6e 853
6010a416
SG
854/*
855 * Given the '-t' template option to lxc-create, figure out what to
856 * do. If the template is a full executable path, use that. If it
857 * is something like 'sshd', then return $templatepath/lxc-sshd.
858 * On success return the template, on error return NULL.
859 */
860char *get_template_path(const char *t)
861{
862 int ret, len;
863 char *tpath;
864
b275efe3
RK
865 if (t[0] == '/') {
866 if (access(t, X_OK) == 0) {
867 return strdup(t);
868 } else {
869 SYSERROR("Bad template pathname: %s", t);
870 return NULL;
871 }
6010a416
SG
872 }
873
874 len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
b14fc100 875
6010a416
SG
876 tpath = malloc(len);
877 if (!tpath)
878 return NULL;
b14fc100 879
6010a416
SG
880 ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
881 if (ret < 0 || ret >= len) {
882 free(tpath);
883 return NULL;
884 }
b14fc100 885
6010a416
SG
886 if (access(tpath, X_OK) < 0) {
887 SYSERROR("bad template: %s", t);
888 free(tpath);
889 return NULL;
890 }
891
892 return tpath;
893}
0a4be28d 894
592fd47a
SH
/*
 * @path: a pathname in which every '/' has been replaced with '\0'.
 * @offsetp: pointer to int holding the offset of the segment last seen.
 *           Updated on return to the offset of the next segment (or to
 *           @fulllen when there is none).
 * @fulllen: full original path length.
 *
 * Returns a pointer to the next path segment, or NULL if done.
 */
static char *get_nextpath(char *path, int *offsetp, int fulllen)
{
	int pos = *offsetp;

	if (pos >= fulllen)
		return NULL;

	/* Step over the remainder of the current segment ... */
	for (; pos < fulllen && path[pos] != '\0'; pos++)
		;

	/* ... and over the separator(s) that follow it. */
	for (; pos < fulllen && path[pos] == '\0'; pos++)
		;

	*offsetp = pos;

	if (pos >= fulllen)
		return NULL;

	return &path[pos];
}
919
/*
 * Check that @subdir is @dir itself or lies below it. @len is the length of
 * @dir (to avoid having to recalculate it); dir[len - 1] is read, so @len
 * must be greater than zero.
 */
static bool is_subdir(const char *subdir, const char *dir, size_t len)
{
	size_t subdirlen = strlen(subdir);

	/* @subdir must start with all of @dir. */
	if (subdirlen < len || strncmp(subdir, dir, len) != 0)
		return false;

	/* @dir ends in a slash, so the prefix match is on a component boundary. */
	if (dir[len - 1] == '/')
		return true;

	/* Otherwise the match must end exactly at a component boundary. */
	return subdir[len] == '/' || subdirlen == len;
}
942
/*
 * Check if the open fd is a symlink. Return -ELOOP if it is. Return -ENOENT
 * if we couldn't fstat. Return 0 if the fd is ok.
 */
static int check_symlink(int fd)
{
	struct stat sb;

	if (fstat(fd, &sb) < 0)
		return -ENOENT;

	return S_ISLNK(sb.st_mode) ? -ELOOP : 0;
}
961
962/*
963 * Open a file or directory, provided that it contains no symlinks.
964 *
965 * CAVEAT: This function must not be used for other purposes than container
966 * setup before executing the container's init
967 */
968static int open_if_safe(int dirfd, const char *nextpath)
969{
970 int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
1a0e70ac 971 if (newfd >= 0) /* Was not a symlink, all good. */
592fd47a
SH
972 return newfd;
973
974 if (errno == ELOOP)
975 return newfd;
976
977 if (errno == EPERM || errno == EACCES) {
1a0e70ac
CB
978 /* We're not root (cause we got EPERM) so try opening with
979 * O_PATH.
980 */
592fd47a
SH
981 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
982 if (newfd >= 0) {
1a0e70ac
CB
983 /* O_PATH will return an fd for symlinks. We know
984 * nextpath wasn't a symlink at last openat, so if fd is
985 * now a link, then something * fishy is going on.
592fd47a
SH
986 */
987 int ret = check_symlink(newfd);
988 if (ret < 0) {
989 close(newfd);
990 newfd = ret;
991 }
992 }
993 }
994
995 return newfd;
996}
997
/*
 * Open a path intending for mounting, ensuring that the final path is inside
 * the container's rootfs.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: a part of @target in which to ignore symbolic links. This
 *               would be the container's rootfs. NULL or "" means "/".
 *
 * Every component of @target after @prefix_skip is opened one at a time via
 * open_if_safe(), relative to the previously opened component.
 *
 * Return an open fd for the path, or <0 on error.
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
	int curlen = 0, dirfd, fulllen, i;
	char *dup, *nextpath;

	fulllen = strlen(target);

	/* make sure prefix-skip makes sense */
	if (!prefix_skip || prefix_skip[0] == '\0') {
		prefix_skip = "/";
	} else {
		curlen = strlen(prefix_skip);
		if (!is_subdir(target, prefix_skip, curlen)) {
			ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
			      target, prefix_skip);
			return -EINVAL;
		}

		/*
		 * get_nextpath() expects the curlen argument to be on a
		 * (turned into \0) / or before it, so decrement curlen to
		 * make sure that happens. The prefix is non-empty here, so
		 * curlen is at least 1.
		 */
		curlen--;
	}

	/* Make a copy of target which we can hack up, and tokenize it */
	dup = strdup(target);
	if (!dup) {
		ERROR("Out of memory checking for symbolic link");
		return -ENOMEM;
	}

	for (i = 0; i < fulllen; i++) {
		if (dup[i] == '/')
			dup[i] = '\0';
	}

	dirfd = open(prefix_skip, O_RDONLY);
	if (dirfd < 0) {
		SYSERROR("Failed to open path \"%s\"", prefix_skip);
		free(dup);
		return dirfd;
	}

	/* Descend one component at a time, swapping the parent fd for the child. */
	while ((nextpath = get_nextpath(dup, &curlen, fulllen)) != NULL) {
		int newfd, saved_errno;

		newfd = open_if_safe(dirfd, nextpath);
		/* close() below may clobber errno. */
		saved_errno = errno;
		close(dirfd);

		dirfd = newfd;
		if (newfd < 0) {
			errno = saved_errno;
			if (errno == ELOOP)
				SYSERROR("%s in %s was a symbolic link!", nextpath, target);

			break;
		}
	}

	free(dup);
	return dirfd;
}
1081
/*
 * Safely mount a path into a container, ensuring that the mount target is
 * under the container's @rootfs. (If @rootfs is NULL, then the container uses
 * the host's /)
 *
 * The target (and, for bind mounts with a relative @src, the source) is
 * opened without following symlinks and mount() is then performed through
 * the corresponding /proc/self/fd/<fd> paths.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * Returns 0 on success and a negative value on failure.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
	       unsigned long flags, const void *data, const char *rootfs)
{
	/* Only needs enough for /proc/self/fd/<fd>. */
	char srcbuf[50], destbuf[50];
	const char *mntsrc = src;
	int srcfd = -1;
	int destfd, ret, saved_errno;

	if (!rootfs)
		rootfs = "";

	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
	if ((flags & MS_BIND) && src && src[0] != '/') {
		INFO("This is a relative bind mount");

		srcfd = open_without_symlink(src, NULL);
		if (srcfd < 0)
			return srcfd;

		ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
		if (ret < 0 || ret >= (int)sizeof(srcbuf)) {
			close(srcfd);
			ERROR("Out of memory");
			return -EINVAL;
		}

		mntsrc = srcbuf;
	}

	destfd = open_without_symlink(dest, rootfs);
	if (destfd < 0) {
		if (srcfd != -1) {
			/* Keep the errno of the failed open for the caller. */
			saved_errno = errno;
			close(srcfd);
			errno = saved_errno;
		}

		return destfd;
	}

	ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
	if (ret < 0 || ret >= (int)sizeof(destbuf)) {
		if (srcfd != -1)
			close(srcfd);

		close(destfd);
		ERROR("Out of memory");
		return -EINVAL;
	}

	ret = mount(mntsrc, destbuf, fstype, flags, data);
	/* close() may clobber errno, so remember mount()'s value. */
	saved_errno = errno;

	if (srcfd != -1)
		close(srcfd);

	close(destfd);

	if (ret >= 0)
		return 0;

	errno = saved_errno;
	SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
	return ret;
}
1154
ced03a01
SH
1155/*
1156 * Mount a proc under @rootfs if proc self points to a pid other than
1157 * my own. This is needed to have a known-good proc mount for setting
1158 * up LSMs both at container startup and attach.
1159 *
1160 * @rootfs : the rootfs where proc should be mounted
1161 *
1162 * Returns < 0 on failure, 0 if the correct proc was already mounted
1163 * and 1 if a new proc was mounted.
f267d666
BP
1164 *
1165 * NOTE: not to be called from inside the container namespace!
ced03a01 1166 */
943144d9 1167int lxc_mount_proc_if_needed(const char *rootfs)
ced03a01 1168{
7be6bcd5 1169 char path[PATH_MAX] = {0};
6b1ba5d6 1170 int link_to_pid, linklen, mypid, ret;
40464e8a 1171 char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
ced03a01 1172
d726953a
CB
1173 ret = snprintf(path, PATH_MAX, "%s/proc/self", rootfs);
1174 if (ret < 0 || ret >= PATH_MAX) {
7be6bcd5 1175 SYSERROR("The name of proc path is too long");
ced03a01
SH
1176 return -1;
1177 }
fc2ad9dc 1178
979a0d93 1179 linklen = readlink(path, link, sizeof(link));
fc2ad9dc 1180
d726953a
CB
1181 ret = snprintf(path, PATH_MAX, "%s/proc", rootfs);
1182 if (ret < 0 || ret >= PATH_MAX) {
7be6bcd5 1183 SYSERROR("The name of proc path is too long");
d539a2b2
CB
1184 return -1;
1185 }
fc2ad9dc
CB
1186
1187 /* /proc not mounted */
1188 if (linklen < 0) {
1189 if (mkdir(path, 0755) && errno != EEXIST)
1190 return -1;
b14fc100 1191
ced03a01 1192 goto domount;
979a0d93 1193 } else if (linklen >= sizeof(link)) {
6b1ba5d6 1194 link[linklen - 1] = '\0';
7be6bcd5 1195 ERROR("Readlink returned truncated content: \"%s\"", link);
6b1ba5d6 1196 return -1;
fc2ad9dc
CB
1197 }
1198
0059379f 1199 mypid = lxc_raw_getpid();
6b1ba5d6
CB
1200 INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1201
2d036cca
CB
1202 if (lxc_safe_int(link, &link_to_pid) < 0)
1203 return -1;
fc2ad9dc 1204
6b1ba5d6
CB
1205 /* correct procfs is already mounted */
1206 if (link_to_pid == mypid)
1207 return 0;
fc2ad9dc 1208
6b1ba5d6
CB
1209 ret = umount2(path, MNT_DETACH);
1210 if (ret < 0)
7be6bcd5 1211 SYSWARN("Failed to umount \"%s\" with MNT_DETACH", path);
ced03a01
SH
1212
1213domount:
fc2ad9dc 1214 /* rootfs is NULL */
6b1ba5d6 1215 if (!strcmp(rootfs, ""))
f267d666
BP
1216 ret = mount("proc", path, "proc", 0, NULL);
1217 else
1218 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
f267d666 1219 if (ret < 0)
ced03a01 1220 return -1;
f267d666 1221
7be6bcd5 1222 INFO("Mounted /proc in container for security transition");
ced03a01
SH
1223 return 1;
1224}
69aeabac 1225
/*
 * Open "/dev/null" for reading and writing.
 *
 * Returns the new file descriptor, or the negative result of open() after
 * logging the failure.
 */
int open_devnull(void)
{
	int null_fd;

	null_fd = open("/dev/null", O_RDWR);
	if (null_fd >= 0)
		return null_fd;

	SYSERROR("Can't open /dev/null");
	return null_fd;
}
69aeabac 1234
f8dd0275
AM
/*
 * Duplicate @fd onto stdin, stdout and stderr (in that order).
 *
 * Returns 0 on success and -1 if @fd is negative or any dup2() fails; on
 * failure the descriptors already replaced stay replaced.
 */
int set_stdfds(int fd)
{
	static const int std_targets[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};
	size_t idx;

	if (fd < 0)
		return -1;

	for (idx = 0; idx < sizeof(std_targets) / sizeof(std_targets[0]); idx++) {
		if (dup2(fd, std_targets[idx]) < 0)
			return -1;
	}

	return 0;
}
1256
/*
 * Redirect stdin, stdout and stderr to "/dev/null".
 *
 * Returns the result of set_stdfds(), or -1 if "/dev/null" could not be
 * opened.
 */
int null_stdfds(void)
{
	int null_fd, result;

	null_fd = open_devnull();
	if (null_fd < 0)
		return -1;

	result = set_stdfds(null_fd);
	/* The standard descriptors hold their own duplicates now. */
	close(null_fd);

	return result;
}
ccb4cabe 1270
330ae3d3 1271/* Check whether a signal is blocked by a process. */
de3c491b 1272/* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
40464e8a 1273#define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
573ad77f 1274bool task_blocks_signal(pid_t pid, int signal)
330ae3d3 1275{
4110345b
CB
1276 __do_free char *line = NULL;
1277 __do_fclose FILE *f = NULL;
330ae3d3 1278 int ret;
7be6bcd5 1279 char status[__PROC_STATUS_LEN] = {0};
573ad77f 1280 uint64_t sigblk = 0, one = 1;
eabf1ea9
CB
1281 size_t n = 0;
1282 bool bret = false;
330ae3d3 1283
de3c491b
CB
1284 ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1285 if (ret < 0 || ret >= __PROC_STATUS_LEN)
330ae3d3
CB
1286 return bret;
1287
4110345b 1288 f = fopen(status, "re");
330ae3d3 1289 if (!f)
4110345b 1290 return false;
330ae3d3
CB
1291
1292 while (getline(&line, &n, f) != -1) {
573ad77f
CB
1293 char *numstr;
1294
eabf1ea9 1295 if (strncmp(line, "SigBlk:", 7))
6fbcbe3b
CB
1296 continue;
1297
573ad77f
CB
1298 numstr = lxc_trim_whitespace_in_place(line + 7);
1299 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1300 if (ret < 0)
4110345b 1301 return false;
573ad77f
CB
1302
1303 break;
330ae3d3
CB
1304 }
1305
573ad77f 1306 if (sigblk & (one << (signal - 1)))
330ae3d3
CB
1307 bret = true;
1308
330ae3d3
CB
1309 return bret;
1310}
000dfda7 1311
a687256f
CB
/*
 * Open "/proc/<pid>/ns/<ns>" read-only with O_CLOEXEC and return the file
 * descriptor.
 *
 * Passing NULL or the empty string as @ns opens the "/proc/<pid>/ns"
 * directory itself, which allows callers to probe whether the kernel
 * supports namespaces at all.
 *
 * Returns the result of open(); returns -1 with errno set to EFBIG if the
 * path does not fit into the internal buffer.
 */
int lxc_preserve_ns(const int pid, const char *ns)
{
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
	char ns_path[__NS_PATH_LEN];
	const char *separator = "";
	const char *ns_name = "";
	int len;

	/* Only append "/<ns>" when a non-empty namespace name was given. */
	if (ns && strcmp(ns, "") != 0) {
		separator = "/";
		ns_name = ns;
	}

	len = snprintf(ns_path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
		       separator, ns_name);
	if (len < 0 || (size_t)len >= __NS_PATH_LEN) {
		errno = EFBIG;
		return -1;
	}

	return open(ns_path, O_RDONLY | O_CLOEXEC);
}
6bc2eafe 1333
464c4611 1334bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
dbaf55a3 1335{
db2d1af1
CB
1336 int ret = 0;
1337
1338 if (gid != LXC_INVALID_GID) {
1339 ret = setgid(gid);
1340 if (ret < 0) {
1341 SYSERROR("Failed to switch to gid %d", gid);
464c4611 1342 return false;
db2d1af1
CB
1343 }
1344 NOTICE("Switched to gid %d", gid);
dbaf55a3 1345 }
dbaf55a3 1346
db2d1af1
CB
1347 if (uid != LXC_INVALID_UID) {
1348 ret = setuid(uid);
1349 if (ret < 0) {
1350 SYSERROR("Failed to switch to uid %d", uid);
464c4611 1351 return false;
db2d1af1
CB
1352 }
1353 NOTICE("Switched to uid %d", uid);
dbaf55a3 1354 }
dbaf55a3 1355
464c4611 1356 return true;
dbaf55a3
CB
1357}
1358
/*
 * Thin wrapper around setgroups() that logs the outcome uniformly.
 *
 * Returns true on success, false if setgroups() failed.
 */
bool lxc_setgroups(int size, gid_t list[])
{
	int rc;

	rc = setgroups(size, list);
	if (rc >= 0) {
		NOTICE("Dropped additional groups");
		return true;
	}

	SYSERROR("Failed to setgroups()");
	return false;
}
c6868a1f
CB
1370
1371static int lxc_get_unused_loop_dev_legacy(char *loop_name)
1372{
1373 struct dirent *dp;
1374 struct loop_info64 lo64;
1375 DIR *dir;
1376 int dfd = -1, fd = -1, ret = -1;
1377
1378 dir = opendir("/dev");
2f32e37e 1379 if (!dir) {
1380 SYSERROR("Failed to open \"/dev\"");
c6868a1f 1381 return -1;
2f32e37e 1382 }
c6868a1f
CB
1383
1384 while ((dp = readdir(dir))) {
c6868a1f
CB
1385 if (strncmp(dp->d_name, "loop", 4) != 0)
1386 continue;
1387
1388 dfd = dirfd(dir);
1389 if (dfd < 0)
1390 continue;
1391
1392 fd = openat(dfd, dp->d_name, O_RDWR);
1393 if (fd < 0)
1394 continue;
1395
1396 ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
1397 if (ret < 0) {
1398 if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 ||
1399 errno != ENXIO) {
1400 close(fd);
1401 fd = -1;
1402 continue;
1403 }
1404 }
1405
1406 ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
1407 if (ret < 0 || ret >= LO_NAME_SIZE) {
1408 close(fd);
1409 fd = -1;
1410 continue;
1411 }
1412
1413 break;
1414 }
1415
1416 closedir(dir);
1417
1418 if (fd < 0)
1419 return -1;
1420
1421 return fd;
1422}
1423
1424static int lxc_get_unused_loop_dev(char *name_loop)
1425{
1426 int loop_nr, ret;
1427 int fd_ctl = -1, fd_tmp = -1;
1428
1429 fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
2f32e37e 1430 if (fd_ctl < 0) {
1431 SYSERROR("Failed to open loop control");
c6868a1f 1432 return -ENODEV;
2f32e37e 1433 }
c6868a1f
CB
1434
1435 loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE);
2f32e37e 1436 if (loop_nr < 0) {
1437 SYSERROR("Failed to get loop control");
c6868a1f 1438 goto on_error;
2f32e37e 1439 }
c6868a1f
CB
1440
1441 ret = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr);
1442 if (ret < 0 || ret >= LO_NAME_SIZE)
1443 goto on_error;
1444
1445 fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
b11738d7 1446 if (fd_tmp < 0) {
1447 /* on Android loop devices are moved under /dev/block, give it a shot */
1448 ret = snprintf(name_loop, LO_NAME_SIZE, "/dev/block/loop%d", loop_nr);
1449 if (ret < 0 || ret >= LO_NAME_SIZE)
1450 goto on_error;
1451
1452 fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1453 if (fd_tmp < 0)
1454 SYSERROR("Failed to open loop \"%s\"", name_loop);
1455 }
c6868a1f
CB
1456
1457on_error:
1458 close(fd_ctl);
1459 return fd_tmp;
1460}
1461
1462int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
1463{
1464 int ret;
1465 struct loop_info64 lo64;
1466 int fd_img = -1, fret = -1, fd_loop = -1;
1467
1468 fd_loop = lxc_get_unused_loop_dev(loop_dev);
1469 if (fd_loop < 0) {
2f32e37e 1470 if (fd_loop != -ENODEV)
1471 goto on_error;
1472
1473 fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);
1474 if (fd_loop < 0)
c6868a1f
CB
1475 goto on_error;
1476 }
1477
1478 fd_img = open(source, O_RDWR | O_CLOEXEC);
2f32e37e 1479 if (fd_img < 0) {
1480 SYSERROR("Failed to open source \"%s\"", source);
c6868a1f 1481 goto on_error;
2f32e37e 1482 }
c6868a1f
CB
1483
1484 ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
2f32e37e 1485 if (ret < 0) {
1486 SYSERROR("Failed to set loop fd");
c6868a1f 1487 goto on_error;
2f32e37e 1488 }
c6868a1f
CB
1489
1490 memset(&lo64, 0, sizeof(lo64));
1491 lo64.lo_flags = flags;
1492
a70c9e85
JF
1493 strlcpy((char *)lo64.lo_file_name, source, LO_NAME_SIZE);
1494
c6868a1f 1495 ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
2f32e37e 1496 if (ret < 0) {
1497 SYSERROR("Failed to set loop status64");
c6868a1f 1498 goto on_error;
2f32e37e 1499 }
c6868a1f
CB
1500
1501 fret = 0;
1502
1503on_error:
1504 if (fd_img >= 0)
1505 close(fd_img);
1506
1507 if (fret < 0 && fd_loop >= 0) {
1508 close(fd_loop);
1509 fd_loop = -1;
1510 }
1511
1512 return fd_loop;
1513}
74251e49
CB
1514
/*
 * Unmount everything that is stacked on @path, one layer at a time.
 *
 * @path : mountpoint to pop
 * @lazy : use MNT_DETACH for every umount2() call
 *
 * Returns the number of successful unmounts (capped at INT_MAX), or -errno
 * if umount2() failed with anything other than EINVAL.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
	int popped = 0;

	for (;;) {
		if (umount2(path, lazy ? MNT_DETACH : 0) < 0) {
			/* We consider anything else than EINVAL deadly to
			 * prevent going into an infinite loop. (The other
			 * alternative is constantly parsing
			 * /proc/self/mountinfo which is yucky and probably
			 * racy.)
			 */
			if (errno != EINVAL)
				return -errno;

			/* EINVAL: nothing is mounted here anymore. */
			break;
		}

		/* Just stop counting when this happens. That'd just be so
		 * stupid that we won't even bother trying to report back the
		 * correct value anymore.
		 */
		if (popped != INT_MAX)
			popped++;

		/* We succeeded in umounting. Go around again to make sure
		 * that there's no other mountpoint stacked underneath.
		 */
	}

	return popped;
}
ea3a694f 1546
/*
 * Run @child_fn(@args) in a child process with its stdout and stderr
 * redirected into a pipe, and capture the output in @buf.
 *
 * @buf, @buf_size : optional output buffer; at most @buf_size - 1 bytes are
 *                   read with a single read call and the last byte that was
 *                   read is replaced by the terminating \0
 * @wait_status    : selects lxc_wait_for_pid_status() instead of
 *                   wait_for_pid() for reaping the child
 *
 * Returns -1 if the pipe or the child could not be created, otherwise the
 * result of the selected wait helper.
 */
int run_command_internal(char *buf, size_t buf_size, int (*child_fn)(void *), void *args, bool wait_status)
{
	int pipefd[2];
	int wait_ret;
	pid_t pid;

	/* Make sure our callers do not receive uninitialized memory. */
	if (buf && buf_size > 0)
		buf[0] = '\0';

	if (pipe(pipefd) < 0) {
		SYSERROR("Failed to create pipe");
		return -1;
	}

	pid = lxc_raw_clone(0, NULL);
	if (pid < 0) {
		close(pipefd[0]);
		close(pipefd[1]);
		SYSERROR("Failed to create new process");
		return -1;
	}

	if (pid == 0) {
		int dup_ret;

		/* The child only writes: drop the read-end of the pipe. */
		close(pipefd[0]);

		/* Point std{out,err} at the write-end of the pipe. */
		dup_ret = dup2(pipefd[1], STDOUT_FILENO);
		if (dup_ret >= 0)
			dup_ret = dup2(pipefd[1], STDERR_FILENO);

		/* The original write-end is no longer needed. */
		close(pipefd[1]);

		if (dup_ret < 0) {
			SYSERROR("Failed to duplicate std{err,out} file descriptor");
			_exit(EXIT_FAILURE);
		}

		/* Does not return. */
		child_fn(args);
		ERROR("Failed to exec command");
		_exit(EXIT_FAILURE);
	}

	/* The parent only reads: drop the write-end of the pipe. */
	close(pipefd[1]);

	if (buf && buf_size > 0) {
		ssize_t nread;

		nread = lxc_read_nointr(pipefd[0], buf, buf_size - 1);
		if (nread > 0)
			buf[nread - 1] = '\0';
	}

	wait_ret = wait_status ? lxc_wait_for_pid_status(pid) : wait_for_pid(pid);

	/* Done with the read-end of the pipe. */
	close(pipefd[0]);

	return wait_ret;
}
04ad7ffe 1614
/*
 * Run @child_fn(@args) in a child process and capture its output in @buf.
 * Calls run_command_internal() with wait_status set to false.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
	const bool wait_status = false;

	return run_command_internal(buf, buf_size, child_fn, args, wait_status);
}
1619
/*
 * Run @child_fn(@args) in a child process and capture its output in @buf.
 * Calls run_command_internal() with wait_status set to true.
 */
int run_command_status(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
	const bool wait_status = true;

	return run_command_internal(buf, buf_size, child_fn, args, wait_status);
}
1624
d75c14e2
CB
/*
 * Check whether the network interface @nic exists by looking for
 * "/sys/class/net/<nic>".
 *
 * The special name "none" is always reported as existing. A NULL @nic or a
 * name that does not fit into the path buffer is reported as not existing.
 */
bool lxc_nic_exists(char *nic)
{
/* "/sys/class/net/" (15) + interface name + \0; parenthesized so the macro is
 * safe to use inside larger expressions.
 */
#define __LXC_SYS_CLASS_NET_LEN (15 + IFNAMSIZ + 1)
	char path[__LXC_SYS_CLASS_NET_LEN];
	int ret;
	struct stat sb;

	if (!nic)
		return false;

	if (!strcmp(nic, "none"))
		return true;

	ret = snprintf(path, __LXC_SYS_CLASS_NET_LEN, "/sys/class/net/%s", nic);
	if (ret < 0 || (size_t)ret >= __LXC_SYS_CLASS_NET_LEN)
		return false;

	ret = stat(path, &sb);
	if (ret < 0)
		return false;

	return true;
}
127c6e70 1645
6222c3f4
CB
/*
 * Round @n up to the next power of two.
 *
 * Returns @n itself if it already is a power of two, and 0 for an input of 0
 * (0 is not valid input and 0 is not a valid power of two). Inputs above
 * 2^63 that are not a power of two wrap around to 0 as well.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
	uint64_t highest = n;
	uint64_t rest;

	if (n == 0)
		return 0;

	/* Strip the lowest set bit until only the highest one remains. */
	for (rest = n & (n - 1); rest != 0; rest &= rest - 1)
		highest = rest;

	/* Nothing was stripped: @n has a single bit set. */
	if (highest == n)
		return n;

	return highest << 1;
}
1fd0f41e 1663
4d8bdfa0
CB
1664static int process_dead(/* takes */ int status_fd)
1665{
f62cf1d4 1666 __do_close int dupfd = -EBADF;
4d8bdfa0
CB
1667 __do_free char *line = NULL;
1668 __do_fclose FILE *f = NULL;
1669 int ret = 0;
1670 size_t n = 0;
1671
1672 dupfd = dup(status_fd);
1673 if (dupfd < 0)
1674 return -1;
1675
1676 if (fd_cloexec(dupfd, true) < 0)
1677 return -1;
1678
92bdc593 1679 f = fdopen(dupfd, "re");
4d8bdfa0
CB
1680 if (!f)
1681 return -1;
4110345b
CB
1682
1683 /* Transfer ownership of fd. */
92bdc593 1684 move_fd(dupfd);
4d8bdfa0
CB
1685
1686 ret = 0;
1687 while (getline(&line, &n, f) != -1) {
1688 char *state;
1689
1690 if (strncmp(line, "State:", 6))
1691 continue;
1692
1693 state = lxc_trim_whitespace_in_place(line + 6);
1694 /* only check whether process is dead or zombie for now */
1695 if (*state == 'X' || *state == 'Z')
1696 ret = 1;
1697 }
1698
1699 return ret;
1700}
1701
/*
 * Request that @signal is delivered to the calling process when its parent
 * dies, then verify that the parent has not already gone away.
 *
 * @signal           : signal passed to PR_SET_PDEATHSIG
 * @parent           : pid the caller expects as its parent
 * @parent_status_fd : status file of the parent, used when getppid() reports
 *                     0 (parent outside our pid namespace); a negative value
 *                     skips that check
 *
 * Returns 0 on success and -1 if prctl() failed. If the parent turns out to
 * be gone the process raises SIGKILL on itself and the result of raise() is
 * returned.
 */
int lxc_set_death_signal(int signal, pid_t parent, int parent_status_fd)
{
	pid_t current_parent;
	int prctl_ret;

	prctl_ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
			  prctl_arg(0), prctl_arg(0));

	/* verify that we haven't been orphaned in the meantime */
	current_parent = (pid_t)syscall(SYS_getppid);
	if (current_parent == 0) { /* parent outside our pidns */
		if (parent_status_fd < 0)
			return 0;

		if (process_dead(parent_status_fd) == 1)
			return raise(SIGKILL);
	} else if (current_parent != parent) {
		return raise(SIGKILL);
	}

	/* The prctl() result is only reported once the parent was verified. */
	return prctl_ret < 0 ? -1 : 0;
}
7ad37670 1727
a9d4ebc1
CB
/*
 * Set (@cloexec == true) or clear (@cloexec == false) the FD_CLOEXEC flag of
 * @fd. The flags are only written back when they actually change.
 *
 * Returns 0 on success and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
	int current, wanted;

	current = fcntl(fd, F_GETFD, 0);
	if (current < 0)
		return -errno;

	wanted = cloexec ? (current | FD_CLOEXEC) : (current & ~FD_CLOEXEC);

	/* Skip the second syscall when the flag already has the wanted value. */
	if (wanted != current && fcntl(fd, F_SETFD, wanted) < 0)
		return -errno;

	return 0;
}
d7ab0375 1749
8408a9cc 1750int lxc_rm_rf(const char *dirname)
d7ab0375 1751{
8e64b673
CB
1752 __do_closedir DIR *dir = NULL;
1753 int fret = 0;
d7ab0375 1754 int ret;
1755 struct dirent *direntp;
d7ab0375 1756
1757 dir = opendir(dirname);
8e64b673
CB
1758 if (!dir)
1759 return log_error_errno(-1, errno, "Failed to open dir \"%s\"", dirname);
d7ab0375 1760
1761 while ((direntp = readdir(dir))) {
8e64b673 1762 __do_free char *pathname = NULL;
d7ab0375 1763 struct stat mystat;
1764
1765 if (!strcmp(direntp->d_name, ".") ||
1766 !strcmp(direntp->d_name, ".."))
1767 continue;
1768
1769 pathname = must_make_path(dirname, direntp->d_name, NULL);
d7ab0375 1770 ret = lstat(pathname, &mystat);
1771 if (ret < 0) {
8e64b673 1772 if (!fret)
7be6bcd5 1773 SYSWARN("Failed to stat \"%s\"", pathname);
d7ab0375 1774
8e64b673
CB
1775 fret = -1;
1776 continue;
d7ab0375 1777 }
1778
1779 if (!S_ISDIR(mystat.st_mode))
8e64b673 1780 continue;
d7ab0375 1781
8408a9cc 1782 ret = lxc_rm_rf(pathname);
d7ab0375 1783 if (ret < 0)
8e64b673 1784 fret = -1;
d7ab0375 1785 }
1786
1787 ret = rmdir(dirname);
8e64b673
CB
1788 if (ret < 0)
1789 return log_warn_errno(-1, errno, "Failed to delete \"%s\"", dirname);
d7ab0375 1790
8e64b673 1791 return fret;
d7ab0375 1792}
b25291da 1793
4fef78bc 1794int lxc_setup_keyring(char *keyring_label)
b25291da
CB
1795{
1796 key_serial_t keyring;
1797 int ret = 0;
1798
4fef78bc
MB
1799 if (keyring_label) {
1800 if (lsm_keyring_label_set(keyring_label) < 0) {
1801 ERROR("Couldn't set keyring label");
1802 }
1803 }
1804
b25291da
CB
1805 /* Try to allocate a new session keyring for the container to prevent
1806 * information leaks.
1807 */
1808 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
1809 prctl_arg(0), prctl_arg(0), prctl_arg(0));
1810 if (keyring < 0) {
1811 switch (errno) {
1812 case ENOSYS:
1813 DEBUG("The keyctl() syscall is not supported or blocked");
1814 break;
1815 case EACCES:
1816 __fallthrough;
1817 case EPERM:
1818 DEBUG("Failed to access kernel keyring. Continuing...");
1819 break;
1820 default:
1821 SYSERROR("Failed to create kernel keyring");
b25291da
CB
1822 break;
1823 }
1824 }
1825
1826 return ret;
1827}
39293f22
CB
1828
1829bool lxc_can_use_pidfd(int pidfd)
1830{
1831 int ret;
1832
1833 if (pidfd < 0)
1834 return log_error(false, "Kernel does not support pidfds");
1835
39293f22
CB
1836 /*
1837 * We don't care whether or not children were in a waitable state. We
1838 * just care whether waitid() recognizes P_PIDFD.
1839 *
1840 * Btw, while I have your attention, the above waitid() code is an
1841 * excellent example of how _not_ to do flag-based kernel APIs. So if
1842 * you ever go into kernel development or are already and you add this
1843 * kind of flag potpourri even though you have read this comment shame
1844 * on you. May the gods of operating system development have mercy on
1845 * your soul because I won't.
1846 */
1847 ret = waitid(P_PIDFD, pidfd, NULL,
1848 /* Type of children to wait for. */
1849 __WALL |
1850 /* How to wait for them. */
1851 WNOHANG | WNOWAIT |
1852 /* What state to wait for. */
1853 WEXITED | WSTOPPED | WCONTINUED);
1854 if (ret < 0)
1855 return log_error_errno(false, errno, "Kernel does not support waiting on processes through pidfds");
1856
8ad4fa68
CB
1857 ret = lxc_raw_pidfd_send_signal(pidfd, 0, NULL, 0);
1858 if (ret)
1859 return log_error_errno(false, errno, "Kernel does not support sending singals through pidfds");
1860
39293f22
CB
1861 return log_trace(true, "Kernel supports pidfds");
1862}
6aff5157 1863
c353b0b9 1864int fix_stdio_permissions(uid_t uid)
6aff5157 1865{
c353b0b9
CB
1866 __do_close int devnull_fd = -EBADF;
1867 int fret = 0;
1868 int std_fds[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};
6aff5157 1869 int ret;
c353b0b9 1870 struct stat st, st_null;
6aff5157 1871
1872 devnull_fd = open_devnull();
c353b0b9
CB
1873 if (devnull_fd < 0)
1874 return log_warn_errno(-1, errno, "Failed to open \"/dev/null\"");
1875
1876 ret = fstat(devnull_fd, &st_null);
1877 if (ret)
1878 return log_warn_errno(-errno, errno, "Failed to stat \"/dev/null\"");
6aff5157 1879
c353b0b9 1880 for (int i = 0; i < ARRAY_SIZE(std_fds); i++) {
6aff5157 1881 ret = fstat(std_fds[i], &st);
c353b0b9
CB
1882 if (ret) {
1883 SYSWARN("Failed to stat standard I/O file descriptor %d", std_fds[i]);
1884 fret = -1;
6aff5157 1885 continue;
1886 }
1887
c353b0b9 1888 if (st.st_rdev == st_null.st_rdev)
6aff5157 1889 continue;
6aff5157 1890
1891 ret = fchown(std_fds[i], uid, st.st_gid);
c353b0b9
CB
1892 if (ret) {
1893 SYSWARN("Failed to chown standard I/O file descriptor %d to uid %d and gid %d",
1894 std_fds[i], uid, st.st_gid);
1895 fret = -1;
6aff5157 1896 }
1897
1898 ret = fchmod(std_fds[i], 0700);
c353b0b9
CB
1899 if (ret) {
1900 SYSWARN("Failed to chmod standard I/O file descriptor %d", std_fds[i]);
1901 fret = -1;
6aff5157 1902 }
1903 }
1904
c353b0b9 1905 return fret;
6aff5157 1906}