]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/utils.c
Merge pull request #3956 from brauner/2021-08-27.list
[mirror_lxc.git] / src / lxc / utils.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
7 #include <ctype.h>
8 #include <dirent.h>
9 #include <errno.h>
10 #include <fcntl.h>
11 #include <grp.h>
12 #include <inttypes.h>
13 #include <libgen.h>
14 #include <pthread.h>
15 #include <signal.h>
16 #include <stddef.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <sys/mman.h>
21 #include <sys/mount.h>
22 /* Needs to be after sys/mount.h header */
23 #include <linux/fs.h>
24 #include <sys/param.h>
25 #include <sys/prctl.h>
26 #include <sys/stat.h>
27 #include <sys/types.h>
28 #include <sys/wait.h>
29 #include <unistd.h>
30
31 #include "config.h"
32 #include "log.h"
33 #include "lsm/lsm.h"
34 #include "lxclock.h"
35 #include "memory_utils.h"
36 #include "namespace.h"
37 #include "parse.h"
38 #include "process_utils.h"
39 #include "syscall_wrappers.h"
40 #include "utils.h"
41
42 #ifndef HAVE_STRLCPY
43 #include "include/strlcpy.h"
44 #endif
45
46 #ifndef HAVE_STRLCAT
47 #include "include/strlcat.h"
48 #endif
49
50 #ifndef O_PATH
51 #define O_PATH 010000000
52 #endif
53
54 #ifndef O_NOFOLLOW
55 #define O_NOFOLLOW 00400000
56 #endif
57
58 lxc_log_define(utils, lxc);
59
60 /*
61 * if path is btrfs, tries to remove it and any subvolumes beneath it
62 */
63 extern bool btrfs_try_remove_subvol(const char *path);
64
65 static int _recursive_rmdir(const char *dirname, dev_t pdev,
66 const char *exclude, int level, bool onedev)
67 {
68 __do_closedir DIR *dir = NULL;
69 int failed = 0;
70 bool hadexclude = false;
71 int ret;
72 struct dirent *direntp;
73 char pathname[PATH_MAX];
74
75 dir = opendir(dirname);
76 if (!dir)
77 return log_error(-1, "Failed to open \"%s\"", dirname);
78
79 while ((direntp = readdir(dir))) {
80 int rc;
81 struct stat mystat;
82
83 if (strequal(direntp->d_name, ".") ||
84 strequal(direntp->d_name, ".."))
85 continue;
86
87 rc = strnprintf(pathname, sizeof(pathname), "%s/%s", dirname, direntp->d_name);
88 if (rc < 0) {
89 ERROR("The name of path is too long");
90 failed = 1;
91 continue;
92 }
93
94 if (!level && exclude && strequal(direntp->d_name, exclude)) {
95 ret = rmdir(pathname);
96 if (ret < 0) {
97 switch (errno) {
98 case ENOTEMPTY:
99 INFO("Not deleting snapshot \"%s\"", pathname);
100 hadexclude = true;
101 break;
102 case ENOTDIR:
103 ret = unlink(pathname);
104 if (ret)
105 INFO("Failed to remove \"%s\"", pathname);
106 break;
107 default:
108 SYSERROR("Failed to rmdir \"%s\"", pathname);
109 failed = 1;
110 break;
111 }
112 }
113
114 continue;
115 }
116
117 ret = lstat(pathname, &mystat);
118 if (ret) {
119 SYSERROR("Failed to stat \"%s\"", pathname);
120 failed = 1;
121 continue;
122 }
123
124 if (onedev && mystat.st_dev != pdev) {
125 if (btrfs_try_remove_subvol(pathname))
126 INFO("Removed btrfs subvolume at \"%s\"", pathname);
127 continue;
128 }
129
130 if (S_ISDIR(mystat.st_mode)) {
131 if (_recursive_rmdir(pathname, pdev, exclude, level + 1, onedev) < 0)
132 failed = 1;
133 } else {
134 ret = unlink(pathname);
135 if (ret < 0) {
136 __do_close int fd = -EBADF;
137
138 fd = open(pathname, O_RDONLY | O_CLOEXEC | O_NONBLOCK);
139 if (fd >= 0) {
140 /* The file might be marked immutable. */
141 int attr = 0;
142 ret = ioctl(fd, FS_IOC_GETFLAGS, &attr);
143 if (ret < 0)
144 SYSERROR("Failed to retrieve file flags");
145 attr &= ~FS_IMMUTABLE_FL;
146 ret = ioctl(fd, FS_IOC_SETFLAGS, &attr);
147 if (ret < 0)
148 SYSERROR("Failed to set file flags");
149 }
150
151 ret = unlink(pathname);
152 if (ret < 0) {
153 SYSERROR("Failed to delete \"%s\"", pathname);
154 failed = 1;
155 }
156 }
157 }
158 }
159
160 if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
161 SYSERROR("Failed to delete \"%s\"", dirname);
162 failed = 1;
163 }
164
165 return failed ? -1 : 0;
166 }
167
168 /*
169 * In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
170 * lxc_rmdir_onedev().
171 */
172 static inline bool is_native_overlayfs(const char *path)
173 {
174 return has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
175 has_fs_type(path, OVERLAYFS_SUPER_MAGIC);
176 }
177
/*
 * Recursively delete @path. Unless @path is on overlayfs, entries living on a
 * different device than @path itself are not descended into. @exclude is
 * passed through to _recursive_rmdir().
 *
 * Returns 0 on success (a missing @path counts as success), -1 if there were
 * any failures.
 */
extern int lxc_rmdir_onedev(const char *path, const char *exclude)
{
	struct stat st;
	bool same_dev_only = !is_native_overlayfs(path);

	if (lstat(path, &st) >= 0)
		return _recursive_rmdir(path, st.st_dev, exclude, 0, same_dev_only);

	/* Nothing to remove. */
	if (errno == ENOENT)
		return 0;

	return log_error_errno(-1, errno, "Failed to stat \"%s\"", path);
}
196
/*
 * Parse @arg as an unsigned 16-bit number in @base and store it in @val.
 * Borrowed from iproute2.
 *
 * Returns 0 on success. A NULL or empty @arg yields ret_errno(EINVAL); any
 * parse failure, trailing garbage or a value above 0xFFFF yields
 * ret_errno(ERANGE). @val is only written on success.
 */
extern int get_u16(unsigned short *val, const char *arg, int base)
{
	char *end;
	unsigned long parsed;

	if (!arg || *arg == '\0')
		return ret_errno(EINVAL);

	errno = 0;
	parsed = strtoul(arg, &end, base);

	/* No digits consumed, or something left over after the number. */
	if (!end || end == arg || *end != '\0')
		return ret_errno(ERANGE);

	/* Does not fit into 16 bits, or strtoul() itself complained. */
	if (parsed > 0xFFFF || errno != 0)
		return ret_errno(ERANGE);

	*val = parsed;

	return 0;
}
215
216 int mkdir_p(const char *dir, mode_t mode)
217 {
218 const char *tmp = dir;
219 const char *orig = dir;
220
221 do {
222 __do_free char *makeme = NULL;
223 int ret;
224
225 dir = tmp + strspn(tmp, "/");
226 tmp = dir + strcspn(dir, "/");
227
228 makeme = strndup(orig, dir - orig);
229 if (!makeme)
230 return ret_set_errno(-1, ENOMEM);
231
232 ret = mkdir(makeme, mode);
233 if (ret < 0 && errno != EEXIST)
234 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
235
236 } while (tmp != dir);
237
238 return 0;
239 }
240
241 char *get_rundir(void)
242 {
243 __do_free char *rundir = NULL;
244 char *static_rundir;
245 int ret;
246 size_t len;
247 const char *homedir;
248 struct stat sb;
249
250 if (stat(RUNTIME_PATH, &sb) < 0)
251 return NULL;
252
253 if (geteuid() == sb.st_uid || getegid() == sb.st_gid)
254 return strdup(RUNTIME_PATH);
255
256 static_rundir = getenv("XDG_RUNTIME_DIR");
257 if (static_rundir)
258 return strdup(static_rundir);
259
260 INFO("XDG_RUNTIME_DIR isn't set in the environment");
261 homedir = getenv("HOME");
262 if (!homedir)
263 return log_error(NULL, "HOME isn't set in the environment");
264
265 len = strlen(homedir) + 17;
266 rundir = malloc(sizeof(char) * len);
267 if (!rundir)
268 return NULL;
269
270 ret = strnprintf(rundir, len, "%s/.cache/lxc/run/", homedir);
271 if (ret < 0)
272 return ret_set_errno(NULL, EIO);
273
274 return move_ptr(rundir);
275 }
276
/*
 * Wait for @pid to terminate, restarting waitpid() when it is interrupted by
 * a signal.
 *
 * Returns 0 if the process exited normally with exit status 0, -1 in every
 * other case (waitpid() failure, abnormal termination, non-zero exit status).
 */
int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int waited = waitpid(pid, &status, 0);

		if (waited == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		if (waited == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
298
299 int wait_for_pidfd(int pidfd)
300 {
301 int ret;
302 siginfo_t info = {
303 .si_signo = 0,
304 };
305
306 do {
307 ret = waitid(P_PIDFD, pidfd, &info, __WALL | WEXITED);
308 } while (ret < 0 && errno == EINTR);
309
310 return !ret && WIFEXITED(info.si_status) && WEXITSTATUS(info.si_status) == 0;
311 }
312
313 int lxc_wait_for_pid_status(pid_t pid)
314 {
315 int status, ret;
316
317 again:
318 ret = waitpid(pid, &status, 0);
319 if (ret == -1) {
320 if (errno == EINTR)
321 goto again;
322
323 return -1;
324 }
325
326 if (ret != pid)
327 goto again;
328
329 return status;
330 }
331
332 #ifdef HAVE_OPENSSL
333 #include <openssl/evp.h>
334
335 static int do_sha1_hash(const char *buf, int buflen, unsigned char *md_value,
336 unsigned int *md_len)
337 {
338 EVP_MD_CTX *mdctx;
339 const EVP_MD *md;
340
341 md = EVP_get_digestbyname("sha1");
342 if (!md)
343 return log_error(-1, "Unknown message digest: sha1\n");
344
345 mdctx = EVP_MD_CTX_create();
346 EVP_DigestInit_ex(mdctx, md, NULL);
347 EVP_DigestUpdate(mdctx, buf, buflen);
348 EVP_DigestFinal_ex(mdctx, md_value, md_len);
349 EVP_MD_CTX_destroy(mdctx);
350
351 return 0;
352 }
353
354 int sha1sum_file(char *fnam, unsigned char *digest, unsigned int *md_len)
355 {
356 __do_free char *buf = NULL;
357 __do_fclose FILE *f = NULL;
358 int ret;
359 long flen;
360
361 if (!fnam)
362 return -1;
363
364 f = fopen_cloexec(fnam, "r");
365 if (!f)
366 return log_error_errno(-1, errno, "Failed to open template \"%s\"", fnam);
367
368 if (fseek(f, 0, SEEK_END) < 0)
369 return log_error_errno(-1, errno, "Failed to seek to end of template");
370
371 flen = ftell(f);
372 if (flen < 0)
373 return log_error_errno(-1, errno, "Failed to tell size of template");
374
375 if (fseek(f, 0, SEEK_SET) < 0)
376 return log_error_errno(-1, errno, "Failed to seek to start of template");
377
378 buf = malloc(flen + 1);
379 if (!buf)
380 return log_error_errno(-1, ENOMEM, "Out of memory");
381
382 if (fread(buf, 1, flen, f) != flen)
383 return log_error_errno(-1, errno, "Failed to read template");
384
385 buf[flen] = '\0';
386 ret = do_sha1_hash(buf, flen, (void *)digest, md_len);
387 return ret;
388 }
389 #endif
390
391 struct lxc_popen_FILE *lxc_popen(const char *command)
392 {
393 int ret;
394 int pipe_fds[2];
395 pid_t child_pid;
396 struct lxc_popen_FILE *fp = NULL;
397
398 ret = pipe2(pipe_fds, O_CLOEXEC);
399 if (ret < 0)
400 return NULL;
401
402 child_pid = fork();
403 if (child_pid < 0)
404 goto on_error;
405
406 if (!child_pid) {
407 sigset_t mask;
408
409 close(pipe_fds[0]);
410
411 /* duplicate stdout */
412 if (pipe_fds[1] != STDOUT_FILENO)
413 ret = dup2(pipe_fds[1], STDOUT_FILENO);
414 else
415 ret = fcntl(pipe_fds[1], F_SETFD, 0);
416 if (ret < 0) {
417 close(pipe_fds[1]);
418 _exit(EXIT_FAILURE);
419 }
420
421 /* duplicate stderr */
422 if (pipe_fds[1] != STDERR_FILENO)
423 ret = dup2(pipe_fds[1], STDERR_FILENO);
424 else
425 ret = fcntl(pipe_fds[1], F_SETFD, 0);
426 close(pipe_fds[1]);
427 if (ret < 0)
428 _exit(EXIT_FAILURE);
429
430 /* unblock all signals */
431 ret = sigfillset(&mask);
432 if (ret < 0)
433 _exit(EXIT_FAILURE);
434
435 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
436 if (ret < 0)
437 _exit(EXIT_FAILURE);
438
439 /* check if /bin/sh exist, otherwise try Android location /system/bin/sh */
440 if (file_exists("/bin/sh"))
441 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
442 else
443 execl("/system/bin/sh", "sh", "-c", command, (char *)NULL);
444
445 _exit(127);
446 }
447
448 close(pipe_fds[1]);
449 pipe_fds[1] = -1;
450
451 fp = malloc(sizeof(*fp));
452 if (!fp)
453 goto on_error;
454
455 memset(fp, 0, sizeof(*fp));
456
457 fp->child_pid = child_pid;
458 fp->pipe = pipe_fds[0];
459
460 /* From now on, closing fp->f will also close fp->pipe. So only ever
461 * call fclose(fp->f).
462 */
463 fp->f = fdopen(pipe_fds[0], "r");
464 if (!fp->f)
465 goto on_error;
466
467 return fp;
468
469 on_error:
470 /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
471 * called yet. Otherwise the fd belongs to the file opened by fdopen()
472 * since it isn't dup()ed.
473 */
474 if (fp && !fp->f && pipe_fds[0] >= 0)
475 close(pipe_fds[0]);
476
477 if (pipe_fds[1] >= 0)
478 close(pipe_fds[1]);
479
480 if (fp && fp->f)
481 fclose(fp->f);
482
483 if (fp)
484 free(fp);
485
486 return NULL;
487 }
488
489 int lxc_pclose(struct lxc_popen_FILE *fp)
490 {
491 pid_t wait_pid;
492 int wstatus = 0;
493
494 if (!fp)
495 return -1;
496
497 do {
498 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
499 } while (wait_pid < 0 && errno == EINTR);
500
501 fclose(fp->f);
502 free(fp);
503
504 if (wait_pid < 0)
505 return -1;
506
507 return wstatus;
508 }
509
510 int randseed(bool srand_it)
511 {
512 __do_fclose FILE *f = NULL;
513 /*
514 * srand pre-seed function based on /dev/urandom
515 */
516 unsigned int seed = time(NULL) + getpid();
517
518 f = fopen("/dev/urandom", "re");
519 if (f) {
520 int ret = fread(&seed, sizeof(seed), 1, f);
521 if (ret != 1)
522 SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
523 }
524
525 if (srand_it)
526 srand(seed);
527
528 return seed;
529 }
530
531 uid_t get_ns_uid(uid_t orig)
532 {
533 __do_free char *line = NULL;
534 __do_fclose FILE *f = NULL;
535 size_t sz = 0;
536 uid_t nsid, hostid, range;
537
538 f = fopen("/proc/self/uid_map", "re");
539 if (!f)
540 return log_error_errno(0, errno, "Failed to open uid_map");
541
542 while (getline(&line, &sz, f) != -1) {
543 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
544 continue;
545
546 if (hostid <= orig && hostid + range > orig)
547 return nsid += orig - hostid;
548 }
549
550 return LXC_INVALID_UID;
551 }
552
553 gid_t get_ns_gid(gid_t orig)
554 {
555 __do_free char *line = NULL;
556 __do_fclose FILE *f = NULL;
557 size_t sz = 0;
558 gid_t nsid, hostid, range;
559
560 f = fopen("/proc/self/gid_map", "re");
561 if (!f)
562 return log_error_errno(0, errno, "Failed to open gid_map");
563
564 while (getline(&line, &sz, f) != -1) {
565 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
566 continue;
567
568 if (hostid <= orig && hostid + range > orig)
569 return nsid += orig - hostid;
570 }
571
572 return LXC_INVALID_GID;
573 }
574
/*
 * Convenience wrapper around exists_dir_at() for callers that have no
 * directory file descriptor: -1 is passed as the dirfd argument.
 */
bool dir_exists(const char *path)
{
	const int no_dirfd = -1;

	return exists_dir_at(no_dirfd, path);
}
579
/*
 * 64-bit FNV-1a hash over the @len bytes at @buf, starting from (and
 * continuing) the hash value @hval.
 *
 * Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 * FNV has good anti collision properties and we're not worried about
 * pre-image resistance or one-way-ness, we're just trying to make the name
 * unique in the 108 bytes of space we have.
 */
uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
	const unsigned char *bytes = buf;

	for (size_t i = 0; i < len; i++) {
		/* xor the bottom with the current octet */
		hval ^= (uint64_t)bytes[i];

		/*
		 * Multiply by the 64 bit FNV prime mod 2^64; identical to
		 * the shift-and-add form
		 * hval += (hval << 1) + (hval << 4) + (hval << 5) +
		 *         (hval << 7) + (hval << 8) + (hval << 40).
		 */
		hval *= UINT64_C(0x100000001b3);
	}

	return hval;
}
602
603 bool is_shared_mountpoint(const char *path)
604 {
605 __do_fclose FILE *f = NULL;
606 __do_free char *line = NULL;
607 int i;
608 size_t len = 0;
609
610 f = fopen("/proc/self/mountinfo", "re");
611 if (!f)
612 return 0;
613
614 while (getline(&line, &len, f) > 0) {
615 char *slider1, *slider2;
616
617 for (slider1 = line, i = 0; slider1 && i < 4; i++)
618 slider1 = strchr(slider1 + 1, ' ');
619
620 if (!slider1)
621 continue;
622
623 slider2 = strchr(slider1 + 1, ' ');
624 if (!slider2)
625 continue;
626
627 *slider2 = '\0';
628 if (strequal(slider1 + 1, path)) {
629 /* This is the path. Is it shared? */
630 slider1 = strchr(slider2 + 1, ' ');
631 if (slider1 && strstr(slider1, "shared:"))
632 return true;
633 }
634 }
635
636 return false;
637 }
638
/*
 * Detect whether / is mounted MS_SHARED. The only way I know of to check
 * that is through /proc/self/mountinfo.
 * Only / is checked. If the container rootfs or mount location is MS_SHARED
 * but not '/', then you're out of luck - figuring that out would be too much
 * work to be worth it.
 *
 * Returns 1 if / is a shared mountpoint, 0 otherwise.
 */
int detect_shared_rootfs(void)
{
	return is_shared_mountpoint("/") ? 1 : 0;
}
653
654 bool switch_to_ns(pid_t pid, const char *ns)
655 {
656 __do_close int fd = -EBADF;
657 int ret;
658 char nspath[STRLITERALLEN("/proc//ns/")
659 + INTTYPE_TO_STRLEN(pid_t)
660 + LXC_NAMESPACE_NAME_MAX];
661
662 /* Switch to new ns */
663 ret = strnprintf(nspath, sizeof(nspath), "/proc/%d/ns/%s", pid, ns);
664 if (ret < 0)
665 return false;
666
667 fd = open(nspath, O_RDONLY | O_CLOEXEC);
668 if (fd < 0)
669 return log_error_errno(false, errno, "Failed to open \"%s\"", nspath);
670
671 ret = setns(fd, 0);
672 if (ret)
673 return log_error_errno(false, errno, "Failed to set process %d to \"%s\" of %d", pid, ns, fd);
674
675 return true;
676 }
677
678 /*
679 * looking at fs/proc_namespace.c, it appears we can
680 * actually expect the rootfs entry to very specifically contain
681 * " - rootfs rootfs "
682 * IIUC, so long as we've chrooted so that rootfs is not our root,
683 * the rootfs entry should always be skipped in mountinfo contents.
684 */
685 bool detect_ramfs_rootfs(void)
686 {
687 __do_free char *line = NULL;
688 __do_free void *fopen_cache = NULL;
689 __do_fclose FILE *f = NULL;
690 size_t len = 0;
691
692 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
693 if (!f)
694 return false;
695
696 while (getline(&line, &len, f) != -1) {
697 int i;
698 char *p, *p2;
699
700 for (p = line, i = 0; p && i < 4; i++)
701 p = strchr(p + 1, ' ');
702 if (!p)
703 continue;
704
705 p2 = strchr(p + 1, ' ');
706 if (!p2)
707 continue;
708 *p2 = '\0';
709 if (strequal(p + 1, "/")) {
710 /* This is '/'. Is it the ramfs? */
711 p = strchr(p2 + 1, '-');
712 if (p && strnequal(p, "- rootfs ", 9))
713 return true;
714 }
715 }
716
717 return false;
718 }
719
720 char *on_path(const char *cmd, const char *rootfs)
721 {
722 __do_free char *path = NULL;
723 char *entry = NULL;
724 char cmdpath[PATH_MAX];
725 int ret;
726
727 path = getenv("PATH");
728 if (!path)
729 return NULL;
730
731 path = strdup(path);
732 if (!path)
733 return NULL;
734
735 lxc_iterate_parts(entry, path, ":") {
736 if (rootfs)
737 ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s/%s", rootfs, entry, cmd);
738 else
739 ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s", entry, cmd);
740 if (ret < 0)
741 continue;
742
743 if (access(cmdpath, X_OK) == 0)
744 return strdup(cmdpath);
745 }
746
747 return NULL;
748 }
749
750 /* historically lxc-init has been under /usr/lib/lxc and under
751 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
752 */
753 char *choose_init(const char *rootfs)
754 {
755 char *retv = NULL;
756 const char *empty = "",
757 *tmp;
758 int ret, env_set = 0;
759
760 if (!getenv("PATH")) {
761 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
762 SYSERROR("Failed to setenv");
763
764 env_set = 1;
765 }
766
767 retv = on_path("init.lxc", rootfs);
768
769 if (env_set)
770 if (unsetenv("PATH"))
771 SYSERROR("Failed to unsetenv");
772
773 if (retv)
774 return retv;
775
776 retv = malloc(PATH_MAX);
777 if (!retv)
778 return NULL;
779
780 if (rootfs)
781 tmp = rootfs;
782 else
783 tmp = empty;
784
785 ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
786 if (ret < 0) {
787 ERROR("The name of path is too long");
788 goto out1;
789 }
790
791 if (access(retv, X_OK) == 0)
792 return retv;
793
794 ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
795 if (ret < 0) {
796 ERROR("The name of path is too long");
797 goto out1;
798 }
799
800 if (access(retv, X_OK) == 0)
801 return retv;
802
803 ret = strnprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
804 if (ret < 0) {
805 ERROR("The name of path is too long");
806 goto out1;
807 }
808
809 if (access(retv, X_OK) == 0)
810 return retv;
811
812 ret = strnprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
813 if (ret < 0) {
814 ERROR("The name of path is too long");
815 goto out1;
816 }
817
818 if (access(retv, X_OK) == 0)
819 return retv;
820
821 /*
822 * Last resort, look for the statically compiled init.lxc which we
823 * hopefully bind-mounted in.
824 * If we are called during container setup, and we get to this point,
825 * then the init.lxc.static from the host will need to be bind-mounted
826 * in. So we return NULL here to indicate that.
827 */
828 if (rootfs)
829 goto out1;
830
831 ret = strnprintf(retv, PATH_MAX, "/init.lxc.static");
832 if (ret < 0) {
833 WARN("Nonsense - name /lxc.init.static too long");
834 goto out1;
835 }
836
837 if (access(retv, X_OK) == 0)
838 return retv;
839
840 out1:
841 free(retv);
842 return NULL;
843 }
844
845 /*
846 * Given the '-t' template option to lxc-create, figure out what to
847 * do. If the template is a full executable path, use that. If it
848 * is something like 'sshd', then return $templatepath/lxc-sshd.
849 * On success return the template, on error return NULL.
850 */
851 char *get_template_path(const char *t)
852 {
853 int ret, len;
854 char *tpath;
855
856 if (t[0] == '/') {
857 if (access(t, X_OK) == 0) {
858 return strdup(t);
859 } else {
860 SYSERROR("Bad template pathname: %s", t);
861 return NULL;
862 }
863 }
864
865 len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
866
867 tpath = malloc(len);
868 if (!tpath)
869 return NULL;
870
871 ret = strnprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
872 if (ret < 0) {
873 free(tpath);
874 return NULL;
875 }
876
877 if (access(tpath, X_OK) < 0) {
878 SYSERROR("bad template: %s", t);
879 free(tpath);
880 return NULL;
881 }
882
883 return tpath;
884 }
885
/*
 * @path: a pathname where / replaced with '\0'.
 * @offsetp: pointer to int showing which path segment was last seen.
 *           Updated on return to reflect the next segment; left untouched
 *           if it already is at or beyond @fulllen.
 * @fulllen: full original path length.
 * Returns a pointer to the next path segment, or NULL if done.
 */
static char *get_nextpath(char *path, int *offsetp, int fulllen)
{
	int pos = *offsetp;

	if (pos >= fulllen)
		return NULL;

	/* Walk to the end of the segment we are currently in ... */
	for (; pos < fulllen && path[pos] != '\0'; pos++)
		;

	/* ... and across the separator(s) that follow it. */
	for (; pos < fulllen && path[pos] == '\0'; pos++)
		;

	*offsetp = pos;

	if (pos >= fulllen)
		return NULL;

	return path + pos;
}
910
/*
 * Check that @subdir is a subdir of @dir. @len is the length of @dir (to
 * avoid having to recalculate it) and is expected to be non-zero.
 *
 * @subdir qualifies if it starts with @dir and either @dir ends in '/', or
 * @subdir continues with a '/' right after the prefix, or both are equal.
 */
static bool is_subdir(const char *subdir, const char *dir, size_t len)
{
	size_t subdirlen = strlen(subdir);

	/* Too short to contain @dir, or simply a different prefix. */
	if (subdirlen < len || !strnequal(subdir, dir, len))
		return false;

	if (dir[len - 1] == '/')
		return true;

	return subdir[len] == '/' || subdirlen == len;
}
933
/*
 * Check if the open fd is a symlink. Return -ELOOP if it is. Return -ENOENT
 * if we couldn't fstat. Return 0 if the fd is ok.
 */
static int check_symlink(int fd)
{
	struct stat st;

	if (fstat(fd, &st) < 0)
		return -ENOENT;

	return S_ISLNK(st.st_mode) ? -ELOOP : 0;
}
952
953 /*
954 * Open a file or directory, provided that it contains no symlinks.
955 *
956 * CAVEAT: This function must not be used for other purposes than container
957 * setup before executing the container's init
958 */
959 static int open_if_safe(int dirfd, const char *nextpath)
960 {
961 int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
962 if (newfd >= 0) /* Was not a symlink, all good. */
963 return newfd;
964
965 if (errno == ELOOP)
966 return newfd;
967
968 if (errno == EPERM || errno == EACCES) {
969 /* We're not root (cause we got EPERM) so try opening with
970 * O_PATH.
971 */
972 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
973 if (newfd >= 0) {
974 /* O_PATH will return an fd for symlinks. We know
975 * nextpath wasn't a symlink at last openat, so if fd is
976 * now a link, then something * fishy is going on.
977 */
978 int ret = check_symlink(newfd);
979 if (ret < 0) {
980 close(newfd);
981 newfd = ret;
982 }
983 }
984 }
985
986 return newfd;
987 }
988
/*
 * Open a path intending for mounting, ensuring that the final path is inside
 * the container's rootfs.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: a part of @target in which to ignore symbolic links. This
 *               would be the container's rootfs. NULL or "" means "/".
 *
 * The path is walked component by component, starting from an fd on
 * @prefix_skip, using open_if_safe() for every step.
 *
 * Return an open fd for the path, or <0 on error.
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
	int offset = 0, fulllen, walkfd;
	char *segments, *nextpath;

	fulllen = strlen(target);

	/* make sure prefix-skip makes sense */
	if (prefix_skip && strlen(prefix_skip) > 0) {
		offset = strlen(prefix_skip);
		if (!is_subdir(target, prefix_skip, offset)) {
			ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
			      target, prefix_skip);
			return -EINVAL;
		}

		/*
		 * get_nextpath() expects the offset argument to be on a
		 * (turned into \0) / or before it, so decrement it to make
		 * sure that happens
		 */
		if (offset)
			offset--;
	} else {
		prefix_skip = "/";
		offset = 0;
	}

	/* Make a copy of target which we can hack up, and tokenize it */
	segments = strdup(target);
	if (!segments) {
		ERROR("Out of memory checking for symbolic link");
		return -ENOMEM;
	}

	for (int pos = 0; pos < fulllen; pos++) {
		if (segments[pos] == '/')
			segments[pos] = '\0';
	}

	walkfd = open(prefix_skip, O_RDONLY);
	if (walkfd < 0) {
		SYSERROR("Failed to open path \"%s\"", prefix_skip);
		goto out;
	}

	while ((nextpath = get_nextpath(segments, &offset, fulllen)) != NULL) {
		int saved_errno;
		int stepfd = open_if_safe(walkfd, nextpath);

		/* close() below must not clobber the reason for a failure. */
		saved_errno = errno;
		close(walkfd);

		walkfd = stepfd;
		if (stepfd < 0) {
			errno = saved_errno;
			if (errno == ELOOP)
				SYSERROR("%s in %s was a symbolic link!", nextpath, target);

			break;
		}
	}

out:
	free(segments);
	return walkfd;
}
1072
1073 int __safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
1074 unsigned int flags, const void *data)
1075 {
1076 __do_close int source_fd = -EBADF, target_fd = -EBADF;
1077 struct lxc_open_how how = {
1078 .flags = PROTECT_OPATH_DIRECTORY,
1079 .resolve = PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS,
1080 };
1081 int ret;
1082 char src_buf[LXC_PROC_PID_FD_LEN], tgt_buf[LXC_PROC_PID_FD_LEN];
1083
1084 if (beneath_fd < 0)
1085 return -EINVAL;
1086
1087 if ((flags & MS_BIND) && src && src[0] != '/') {
1088 source_fd = openat2(beneath_fd, src, &how, sizeof(how));
1089 if (source_fd < 0)
1090 return -errno;
1091 ret = strnprintf(src_buf, sizeof(src_buf), "/proc/self/fd/%d", source_fd);
1092 if (ret < 0)
1093 return -EIO;
1094 } else {
1095 src_buf[0] = '\0';
1096 }
1097
1098 target_fd = openat2(beneath_fd, dst, &how, sizeof(how));
1099 if (target_fd < 0)
1100 return log_error_errno(-errno, errno, "Failed to open %d(%s)", beneath_fd, dst);
1101 ret = strnprintf(tgt_buf, sizeof(tgt_buf), "/proc/self/fd/%d", target_fd);
1102 if (ret < 0)
1103 return -EIO;
1104
1105 if (!is_empty_string(src_buf))
1106 ret = mount(src_buf, tgt_buf, fstype, flags, data);
1107 else
1108 ret = mount(src, tgt_buf, fstype, flags, data);
1109
1110 return ret;
1111 }
1112
1113 int safe_mount_beneath(const char *beneath, const char *src, const char *dst, const char *fstype,
1114 unsigned int flags, const void *data)
1115 {
1116 __do_close int beneath_fd = -EBADF;
1117 const char *path = beneath ? beneath : "/";
1118
1119 beneath_fd = openat(-1, path, PROTECT_OPATH_DIRECTORY);
1120 if (beneath_fd < 0)
1121 return log_error_errno(-errno, errno, "Failed to open %s", path);
1122
1123 return __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
1124 }
1125
/*
 * Public entry point that forwards all arguments unchanged to
 * __safe_mount_beneath_at() and returns its result.
 */
int safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
			  unsigned int flags, const void *data)
{
	int ret;

	ret = __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);

	return ret;
}
1131
/*
 * Safely mount a path into a container, ensuring that the mount target is
 * under the container's @rootfs. (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * The destination (and a relative bind-mount source) is opened with
 * open_without_symlink() and the mount is done via /proc/self/fd/<fd>.
 *
 * Returns 0 on success, the negative value from open_without_symlink() if a
 * path cannot be opened, -EINVAL if a /proc/self/fd path cannot be
 * formatted, or the failing mount() result with errno preserved.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
	       unsigned long flags, const void *data, const char *rootfs)
{
	/* Only needs enough for /proc/self/fd/<fd>. */
	char src_proc[50], dest_proc[50];
	const char *mount_source = src;
	int src_fd = -1;
	int dest_fd, ret, saved_errno;

	if (!rootfs)
		rootfs = "";

	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
	if ((flags & MS_BIND) && src && src[0] != '/') {
		INFO("This is a relative bind mount");

		src_fd = open_without_symlink(src, NULL);
		if (src_fd < 0)
			return src_fd;

		ret = strnprintf(src_proc, sizeof(src_proc), "/proc/self/fd/%d", src_fd);
		if (ret < 0) {
			close(src_fd);
			ERROR("Out of memory");
			return -EINVAL;
		}

		mount_source = src_proc;
	}

	dest_fd = open_without_symlink(dest, rootfs);
	if (dest_fd < 0) {
		if (src_fd != -1) {
			/* Keep the errno of the failed open for the caller. */
			saved_errno = errno;
			close(src_fd);
			errno = saved_errno;
		}

		return dest_fd;
	}

	ret = strnprintf(dest_proc, sizeof(dest_proc), "/proc/self/fd/%d", dest_fd);
	if (ret < 0) {
		if (src_fd != -1)
			close(src_fd);

		close(dest_fd);
		ERROR("Out of memory");
		return -EINVAL;
	}

	ret = mount(mount_source, dest_proc, fstype, flags, data);
	saved_errno = errno;

	if (src_fd != -1)
		close(src_fd);
	close(dest_fd);

	if (ret >= 0)
		return 0;

	errno = saved_errno;
	SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
	return ret;
}
1204
/*
 * Open /dev/null for reading and writing.
 *
 * Returns the new fd, or the negative open() result (after logging) on
 * failure.
 */
int open_devnull(void)
{
	int nullfd;

	nullfd = open("/dev/null", O_RDWR);
	if (nullfd >= 0)
		return nullfd;

	SYSERROR("Can't open /dev/null");
	return nullfd;
}
1213
/*
 * Make stdin, stdout and stderr (in that order) duplicates of @fd.
 *
 * Returns 0 on success, -1 if @fd is negative or any dup2() fails; in the
 * latter case the descriptors handled before the failure stay redirected.
 */
int set_stdfds(int fd)
{
	const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };

	if (fd < 0)
		return -1;

	for (size_t i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
		if (dup2(fd, std_fds[i]) < 0)
			return -1;
	}

	return 0;
}
1235
/*
 * Redirect stdin, stdout and stderr to /dev/null.
 *
 * Returns the result of set_stdfds(), or -1 if /dev/null cannot be opened.
 */
int null_stdfds(void)
{
	int nullfd, ret;

	nullfd = open_devnull();
	if (nullfd < 0)
		return -1;

	ret = set_stdfds(nullfd);
	close(nullfd);

	return ret;
}
1249
1250 /* Check whether a signal is blocked by a process. */
1251 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1252 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1253 bool task_blocks_signal(pid_t pid, int signal)
1254 {
1255 __do_free char *line = NULL;
1256 __do_fclose FILE *f = NULL;
1257 int ret;
1258 char status[__PROC_STATUS_LEN] = {0};
1259 uint64_t sigblk = 0, one = 1;
1260 size_t n = 0;
1261 bool bret = false;
1262
1263 ret = strnprintf(status, sizeof(status), "/proc/%d/status", pid);
1264 if (ret < 0)
1265 return bret;
1266
1267 f = fopen(status, "re");
1268 if (!f)
1269 return false;
1270
1271 while (getline(&line, &n, f) != -1) {
1272 char *numstr;
1273
1274 if (!strnequal(line, "SigBlk:", 7))
1275 continue;
1276
1277 numstr = lxc_trim_whitespace_in_place(line + 7);
1278 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1279 if (ret < 0)
1280 return false;
1281
1282 break;
1283 }
1284
1285 if (sigblk & (one << (signal - 1)))
1286 bret = true;
1287
1288 return bret;
1289 }
1290
1291 int lxc_preserve_ns(const int pid, const char *ns)
1292 {
1293 int ret;
1294 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
1295 #define __NS_PATH_LEN 50
1296 char path[__NS_PATH_LEN];
1297
1298 /* This way we can use this function to also check whether namespaces
1299 * are supported by the kernel by passing in the NULL or the empty
1300 * string.
1301 */
1302 ret = strnprintf(path, sizeof(path), "/proc/%d/ns%s%s", pid,
1303 !ns || strequal(ns, "") ? "" : "/",
1304 !ns || strequal(ns, "") ? "" : ns);
1305 if (ret < 0)
1306 return ret_errno(EIO);
1307
1308 return open(path, O_RDONLY | O_CLOEXEC);
1309 }
1310
1311 bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
1312 {
1313 int ret = 0;
1314
1315 if (gid != LXC_INVALID_GID) {
1316 ret = setresgid(gid, gid, gid);
1317 if (ret < 0) {
1318 SYSERROR("Failed to switch to gid %d", gid);
1319 return false;
1320 }
1321 NOTICE("Switched to gid %d", gid);
1322 }
1323
1324 if (uid != LXC_INVALID_UID) {
1325 ret = setresuid(uid, uid, uid);
1326 if (ret < 0) {
1327 SYSERROR("Failed to switch to uid %d", uid);
1328 return false;
1329 }
1330 NOTICE("Switched to uid %d", uid);
1331 }
1332
1333 return true;
1334 }
1335
/*
 * Convenience wrapper around setgroups(0, NULL) that logs the outcome in a
 * uniform way.
 *
 * Returns true when the group list was cleared, false otherwise.
 */
bool lxc_drop_groups(void)
{
	if (setgroups(0, NULL) != 0)
		return log_error_errno(false, errno, "Failed to drop supplimentary groups");

	NOTICE("Dropped supplimentary groups");
	return true;
}
1348
/*
 * Install @list (with @size entries) as the group list of the calling process
 * via setgroups().
 *
 * When trace logging is enabled every installed gid is logged individually.
 *
 * Returns true on success, false if setgroups() failed.
 */
bool lxc_setgroups(gid_t list[], size_t size)
{
	if (setgroups(size, list) != 0)
		return log_error_errno(false, errno, "Failed to set supplimentary groups");

	if (size != 0 && lxc_log_trace()) {
		const gid_t *end = list + size;

		for (const gid_t *cur = list; cur < end; cur++)
			TRACE("Setting supplimentary group %d", *cur);
	}

	NOTICE("Set supplimentary groups");
	return true;
}
1365
1366 static int lxc_get_unused_loop_dev_legacy(char *loop_name)
1367 {
1368 struct dirent *dp;
1369 struct loop_info64 lo64;
1370 DIR *dir;
1371 int dfd = -1, fd = -1, ret = -1;
1372
1373 dir = opendir("/dev");
1374 if (!dir) {
1375 SYSERROR("Failed to open \"/dev\"");
1376 return -1;
1377 }
1378
1379 while ((dp = readdir(dir))) {
1380 if (!strnequal(dp->d_name, "loop", 4))
1381 continue;
1382
1383 dfd = dirfd(dir);
1384 if (dfd < 0)
1385 continue;
1386
1387 fd = openat(dfd, dp->d_name, O_RDWR);
1388 if (fd < 0)
1389 continue;
1390
1391 ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
1392 if (ret < 0) {
1393 if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 ||
1394 errno != ENXIO) {
1395 close(fd);
1396 fd = -1;
1397 continue;
1398 }
1399 }
1400
1401 ret = strnprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
1402 if (ret < 0) {
1403 close(fd);
1404 fd = -1;
1405 continue;
1406 }
1407
1408 break;
1409 }
1410
1411 closedir(dir);
1412
1413 if (fd < 0)
1414 return -1;
1415
1416 return fd;
1417 }
1418
1419 static int lxc_get_unused_loop_dev(char *name_loop)
1420 {
1421 int loop_nr, ret;
1422 int fd_ctl = -1, fd_tmp = -1;
1423
1424 fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
1425 if (fd_ctl < 0) {
1426 SYSERROR("Failed to open loop control");
1427 return -ENODEV;
1428 }
1429
1430 loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE);
1431 if (loop_nr < 0) {
1432 SYSERROR("Failed to get loop control");
1433 goto on_error;
1434 }
1435
1436 ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr);
1437 if (ret < 0)
1438 goto on_error;
1439
1440 fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1441 if (fd_tmp < 0) {
1442 /* on Android loop devices are moved under /dev/block, give it a shot */
1443 ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/block/loop%d", loop_nr);
1444 if (ret < 0)
1445 goto on_error;
1446
1447 fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1448 if (fd_tmp < 0)
1449 SYSERROR("Failed to open loop \"%s\"", name_loop);
1450 }
1451
1452 on_error:
1453 close(fd_ctl);
1454 return fd_tmp;
1455 }
1456
1457 int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
1458 {
1459 int ret;
1460 struct loop_info64 lo64;
1461 int fd_img = -1, fret = -1, fd_loop = -1;
1462
1463 fd_loop = lxc_get_unused_loop_dev(loop_dev);
1464 if (fd_loop < 0) {
1465 if (fd_loop != -ENODEV)
1466 goto on_error;
1467
1468 fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);
1469 if (fd_loop < 0)
1470 goto on_error;
1471 }
1472
1473 fd_img = open(source, O_RDWR | O_CLOEXEC);
1474 if (fd_img < 0) {
1475 SYSERROR("Failed to open source \"%s\"", source);
1476 goto on_error;
1477 }
1478
1479 ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
1480 if (ret < 0) {
1481 SYSERROR("Failed to set loop fd");
1482 goto on_error;
1483 }
1484
1485 memset(&lo64, 0, sizeof(lo64));
1486 lo64.lo_flags = flags;
1487
1488 strlcpy((char *)lo64.lo_file_name, source, LO_NAME_SIZE);
1489
1490 ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
1491 if (ret < 0) {
1492 SYSERROR("Failed to set loop status64");
1493 goto on_error;
1494 }
1495
1496 fret = 0;
1497
1498 on_error:
1499 if (fd_img >= 0)
1500 close(fd_img);
1501
1502 if (fret < 0 && fd_loop >= 0) {
1503 close(fd_loop);
1504 fd_loop = -1;
1505 }
1506
1507 return fd_loop;
1508 }
1509
/*
 * Unmount @path repeatedly until umount2() reports EINVAL, i.e. until nothing
 * is left stacked on it.
 *
 * @path: mountpoint to unstack
 * @lazy: use MNT_DETACH for every umount2() call
 *
 * Returns the number of successful unmounts (capped at INT_MAX), or -errno
 * for the first umount2() failure that is not EINVAL.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
	const int umount_flags = lazy ? MNT_DETACH : 0;
	int count = 0;

	for (;;) {
		if (umount2(path, umount_flags) < 0) {
			/*
			 * Anything but EINVAL is treated as fatal so that we
			 * cannot spin forever. (Parsing /proc/self/mountinfo
			 * over and over would be the alternative, which is
			 * ugly and probably racy.)
			 */
			if (errno != EINVAL)
				return -errno;

			break;
		}

		/*
		 * One layer is gone; loop again in case another mount is
		 * stacked underneath. The counter simply saturates instead of
		 * overflowing.
		 */
		if (count != INT_MAX)
			count++;
	}

	return count;
}
1541
/*
 * Run @child_fn(@args) in a child process whose stdout and stderr are
 * redirected into a pipe, optionally capture its output and wait for it.
 *
 * @buf:         optional buffer receiving the output; may be NULL
 * @buf_size:    size of @buf; at most buf_size - 1 bytes are read
 * @child_fn:    function executed in the child; expected not to return
 * @args:        opaque argument passed to @child_fn
 * @wait_status: use lxc_wait_for_pid_status() instead of wait_for_pid()
 *
 * A single read is performed on the pipe. When it returns data, the last byte
 * that was read is overwritten with the terminating '\0'.
 *
 * Returns -1 if the pipe or the child could not be created, otherwise the
 * value returned by the selected wait helper.
 */
static int run_command_internal(char *buf, size_t buf_size, int (*child_fn)(void *), void *args, bool wait_status)
{
	const bool capture = buf && buf_size > 0;
	int fds[2];
	int status;
	pid_t pid;

	/* Never hand uninitialized memory back to the caller. */
	if (capture)
		buf[0] = '\0';

	if (pipe(fds) < 0) {
		SYSERROR("Failed to create pipe");
		return -1;
	}

	pid = lxc_raw_clone(0, NULL);
	if (pid < 0) {
		close(fds[0]);
		close(fds[1]);
		SYSERROR("Failed to create new process");
		return -1;
	}

	if (pid == 0) {
		int rc;

		/* The child only writes, so drop the read end. */
		close(fds[0]);

		/* Point stdout and stderr at the write end of the pipe. */
		rc = dup2(fds[1], STDOUT_FILENO);
		if (rc >= 0)
			rc = dup2(fds[1], STDERR_FILENO);

		/* The original write end is no longer needed. */
		close(fds[1]);

		if (rc < 0) {
			SYSERROR("Failed to duplicate std{err,out} file descriptor");
			_exit(EXIT_FAILURE);
		}

		/* Does not return. */
		child_fn(args);
		ERROR("Failed to exec command");
		_exit(EXIT_FAILURE);
	}

	/* The parent only reads, so drop the write end. */
	close(fds[1]);

	if (capture) {
		ssize_t nread;

		nread = lxc_read_nointr(fds[0], buf, buf_size - 1);
		if (nread > 0)
			buf[nread - 1] = '\0';
	}

	status = wait_status ? lxc_wait_for_pid_status(pid) : wait_for_pid(pid);

	/* Done with the read end. */
	close(fds[0]);

	return status;
}
1609
/*
 * Run @child_fn(@args) in a child process, capture its output in @buf and
 * wait for it with wait_for_pid(). See run_command_internal() for details.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
	const bool want_status = false;

	return run_command_internal(buf, buf_size, child_fn, args, want_status);
}
1614
/*
 * Run @child_fn(@args) in a child process, capture its output in @buf and
 * wait for it with lxc_wait_for_pid_status(). See run_command_internal() for
 * details.
 */
int run_command_status(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
	const bool want_status = true;

	return run_command_internal(buf, buf_size, child_fn, args, want_status);
}
1619
1620 bool lxc_nic_exists(char *nic)
1621 {
1622 #define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
1623 char path[__LXC_SYS_CLASS_NET_LEN];
1624 int ret;
1625 struct stat sb;
1626
1627 if (strequal(nic, "none"))
1628 return true;
1629
1630 ret = strnprintf(path, sizeof(path), "/sys/class/net/%s", nic);
1631 if (ret < 0)
1632 return false;
1633
1634 ret = stat(path, &sb);
1635 if (ret < 0)
1636 return false;
1637
1638 return true;
1639 }
1640
/*
 * Round @n up to the next power of two.
 *
 * Returns @n itself if it already is a power of two, otherwise twice the
 * value of its highest set bit. 0 is not valid input and yields 0, which is
 * not a valid power of two. If the result does not fit into 64 bits the shift
 * wraps and 0 is returned as well.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
	uint64_t top = n;

	if (n == 0)
		return 0;

	/* Clear the lowest set bit until only the highest one remains. */
	while (top & (top - 1))
		top &= top - 1;

	/* Nothing was cleared: n already was a power of two. */
	if (top == n)
		return n;

	return top << 1;
}
1658
1659 static int process_dead(/* takes */ int status_fd)
1660 {
1661 __do_close int dupfd = -EBADF;
1662 __do_free char *line = NULL;
1663 __do_fclose FILE *f = NULL;
1664 int ret = 0;
1665 size_t n = 0;
1666
1667 dupfd = dup(status_fd);
1668 if (dupfd < 0)
1669 return -1;
1670
1671 if (fd_cloexec(dupfd, true) < 0)
1672 return -1;
1673
1674 f = fdopen(dupfd, "re");
1675 if (!f)
1676 return -1;
1677
1678 /* Transfer ownership of fd. */
1679 move_fd(dupfd);
1680
1681 ret = 0;
1682 while (getline(&line, &n, f) != -1) {
1683 char *state;
1684
1685 if (!strnequal(line, "State:", 6))
1686 continue;
1687
1688 state = lxc_trim_whitespace_in_place(line + 6);
1689 /* only check whether process is dead or zombie for now */
1690 if (*state == 'X' || *state == 'Z')
1691 ret = 1;
1692 }
1693
1694 return ret;
1695 }
1696
/*
 * Request that @signal is delivered to the caller when its parent dies and
 * verify that the parent did not already go away in the meantime.
 *
 * @signal:           value passed to prctl(PR_SET_PDEATHSIG)
 * @parent:           pid the caller expects as its parent
 * @parent_status_fd: descriptor handed to process_dead() when getppid()
 *                    reports 0; a negative value skips that check
 *
 * If the parent turns out to be gone the caller kills itself with SIGKILL and
 * the result of raise() is returned. If getppid() reports 0 and no status
 * descriptor was supplied, 0 is returned without looking at the prctl()
 * result. Otherwise returns -1 if prctl() failed and 0 on success.
 */
int lxc_set_death_signal(int signal, pid_t parent, int parent_status_fd)
{
	int prctl_ret;
	pid_t cur_parent;

	prctl_ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
			  prctl_arg(0), prctl_arg(0));

	/* verify that we haven't been orphaned in the meantime */
	cur_parent = (pid_t)syscall(SYS_getppid);
	if (cur_parent == 0) {
		/* parent outside our pidns */
		if (parent_status_fd < 0)
			return 0;

		if (process_dead(parent_status_fd) == 1)
			return raise(SIGKILL);
	} else if (cur_parent != parent) {
		return raise(SIGKILL);
	}

	return prctl_ret < 0 ? -1 : 0;
}
1722
1723 int lxc_rm_rf(const char *dirname)
1724 {
1725 __do_closedir DIR *dir = NULL;
1726 int fret = 0;
1727 int ret;
1728 struct dirent *direntp;
1729
1730 dir = opendir(dirname);
1731 if (!dir)
1732 return log_error_errno(-1, errno, "Failed to open dir \"%s\"", dirname);
1733
1734 while ((direntp = readdir(dir))) {
1735 __do_free char *pathname = NULL;
1736 struct stat mystat;
1737
1738 if (strequal(direntp->d_name, ".") ||
1739 strequal(direntp->d_name, ".."))
1740 continue;
1741
1742 pathname = must_make_path(dirname, direntp->d_name, NULL);
1743 ret = lstat(pathname, &mystat);
1744 if (ret < 0) {
1745 if (!fret)
1746 SYSWARN("Failed to stat \"%s\"", pathname);
1747
1748 fret = -1;
1749 continue;
1750 }
1751
1752 if (!S_ISDIR(mystat.st_mode))
1753 continue;
1754
1755 ret = lxc_rm_rf(pathname);
1756 if (ret < 0)
1757 fret = -1;
1758 }
1759
1760 ret = rmdir(dirname);
1761 if (ret < 0)
1762 return log_warn_errno(-1, errno, "Failed to delete \"%s\"", dirname);
1763
1764 return fret;
1765 }
1766
1767 bool lxc_can_use_pidfd(int pidfd)
1768 {
1769 int ret;
1770
1771 if (pidfd < 0)
1772 return log_error(false, "Kernel does not support pidfds");
1773
1774 /*
1775 * We don't care whether or not children were in a waitable state. We
1776 * just care whether waitid() recognizes P_PIDFD.
1777 *
1778 * Btw, while I have your attention, the above waitid() code is an
1779 * excellent example of how _not_ to do flag-based kernel APIs. So if
1780 * you ever go into kernel development or are already and you add this
1781 * kind of flag potpourri even though you have read this comment shame
1782 * on you. May the gods of operating system development have mercy on
1783 * your soul because I won't.
1784 */
1785 ret = waitid(P_PIDFD, pidfd, NULL,
1786 /* Type of children to wait for. */
1787 __WALL |
1788 /* How to wait for them. */
1789 WNOHANG | WNOWAIT |
1790 /* What state to wait for. */
1791 WEXITED | WSTOPPED | WCONTINUED);
1792 if (ret < 0)
1793 return log_error_errno(false, errno, "Kernel does not support waiting on processes through pidfds");
1794
1795 ret = lxc_raw_pidfd_send_signal(pidfd, 0, NULL, 0);
1796 if (ret)
1797 return log_error_errno(false, errno, "Kernel does not support sending singals through pidfds");
1798
1799 return log_trace(true, "Kernel supports pidfds");
1800 }
1801
1802 int fix_stdio_permissions(uid_t uid)
1803 {
1804 __do_close int devnull_fd = -EBADF;
1805 int fret = 0;
1806 int std_fds[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};
1807 int ret;
1808 struct stat st, st_null;
1809
1810 devnull_fd = open_devnull();
1811 if (devnull_fd < 0)
1812 return log_trace_errno(-1, errno, "Failed to open \"/dev/null\"");
1813
1814 ret = fstat(devnull_fd, &st_null);
1815 if (ret)
1816 return log_trace_errno(-errno, errno, "Failed to stat \"/dev/null\"");
1817
1818 for (int i = 0; i < ARRAY_SIZE(std_fds); i++) {
1819 ret = fstat(std_fds[i], &st);
1820 if (ret) {
1821 SYSWARN("Failed to stat standard I/O file descriptor %d", std_fds[i]);
1822 fret = -1;
1823 continue;
1824 }
1825
1826 if (st.st_rdev == st_null.st_rdev)
1827 continue;
1828
1829 ret = fchown(std_fds[i], uid, st.st_gid);
1830 if (ret) {
1831 SYSTRACE("Failed to chown standard I/O file descriptor %d to uid %d and gid %d",
1832 std_fds[i], uid, st.st_gid);
1833 fret = -1;
1834 continue;
1835 }
1836
1837 ret = fchmod(std_fds[i], 0700);
1838 if (ret) {
1839 SYSTRACE("Failed to chmod standard I/O file descriptor %d", std_fds[i]);
1840 fret = -1;
1841 }
1842 }
1843
1844 return fret;
1845 }
1846
/*
 * Multiply @base by @mult and store the product in @res if it fits into an
 * int64_t.
 *
 * @base: signed multiplicand
 * @mult: unsigned multiplier
 * @res:  receives the product; left untouched when the product does not fit
 *
 * Returns true if the product was stored, false if it would overflow.
 *
 * All range checks are done on magnitudes in unsigned arithmetic. Mixing
 * int64_t and uint64_t operands in the comparison would convert the signed
 * side to unsigned, which hides negative overflow and divides by zero for
 * @mult == 0.
 */
bool multiply_overflow(int64_t base, uint64_t mult, int64_t *res)
{
	/* 2^63, the magnitude of INT64_MIN. */
	const uint64_t neg_limit = (uint64_t)INT64_MAX + 1;
	uint64_t magnitude, product;

	if (base == 0 || mult == 0) {
		*res = 0;
		return true;
	}

	if (base > 0) {
		if ((uint64_t)base > (uint64_t)INT64_MAX / mult)
			return false;

		*res = (int64_t)((uint64_t)base * mult);
		return true;
	}

	/* |base|; computed in unsigned arithmetic so INT64_MIN is fine too. */
	magnitude = (uint64_t)0 - (uint64_t)base;
	if (magnitude > neg_limit / mult)
		return false;

	product = magnitude * mult;
	if (product == neg_limit)
		*res = INT64_MIN; /* -(int64_t)product would overflow */
	else
		*res = -(int64_t)product;

	return true;
}
1858
1859 int print_r(int fd, const char *path)
1860 {
1861 __do_close int dfd = -EBADF, dfd_dup = -EBADF;
1862 __do_closedir DIR *dir = NULL;
1863 int ret = 0;
1864 struct dirent *direntp;
1865 struct stat st;
1866
1867 if (is_empty_string(path)) {
1868 char buf[LXC_PROC_SELF_FD_LEN];
1869
1870 ret = strnprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
1871 if (ret < 0)
1872 return ret_errno(EIO);
1873
1874 /*
1875 * O_PATH file descriptors can't be used so we need to re-open
1876 * just in case.
1877 */
1878 dfd = openat(-EBADF, buf, O_CLOEXEC | O_DIRECTORY, 0);
1879 } else {
1880 dfd = openat(fd, path, O_CLOEXEC | O_DIRECTORY, 0);
1881 }
1882 if (dfd < 0)
1883 return -1;
1884
1885 dfd_dup = dup_cloexec(dfd);
1886 if (dfd_dup < 0)
1887 return -1;
1888
1889 dir = fdopendir(dfd);
1890 if (!dir)
1891 return -1;
1892 /* Transfer ownership to fdopendir(). */
1893 move_fd(dfd);
1894
1895 while ((direntp = readdir(dir))) {
1896 if (!strcmp(direntp->d_name, ".") ||
1897 !strcmp(direntp->d_name, ".."))
1898 continue;
1899
1900 ret = fstatat(dfd_dup, direntp->d_name, &st, AT_SYMLINK_NOFOLLOW);
1901 if (ret < 0 && errno != ENOENT)
1902 break;
1903
1904 ret = 0;
1905 if (S_ISDIR(st.st_mode))
1906 ret = print_r(dfd_dup, direntp->d_name);
1907 else
1908 INFO("mode(%o):uid(%d):gid(%d) -> %d/%s\n",
1909 (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, dfd_dup,
1910 direntp->d_name);
1911 if (ret < 0 && errno != ENOENT)
1912 break;
1913 }
1914
1915 if (is_empty_string(path))
1916 ret = fstatat(fd, "", &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH);
1917 else
1918 ret = fstatat(fd, path, &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW);
1919 if (ret)
1920 return -1;
1921 else
1922 INFO("mode(%o):uid(%d):gid(%d) -> %s",
1923 (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, maybe_empty(path));
1924 return ret;
1925 }