]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/utils.c
raw_syscalls: add lxc_raw_clone{_cb}()
[mirror_lxc.git] / src / lxc / utils.c
CommitLineData
e3642c43
DL
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
e3642c43
DL
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
e3642c43
DL
22 */
23
d38dd64a
CB
24#ifndef _GNU_SOURCE
25#define _GNU_SOURCE 1
26#endif
7935833c 27#define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
643c1984 28#include <ctype.h>
a1e5280d 29#include <dirent.h>
e3642c43 30#include <errno.h>
a1e5280d 31#include <fcntl.h>
dbaf55a3 32#include <grp.h>
7935833c 33#include <inttypes.h>
a1e5280d 34#include <libgen.h>
b467714b 35#include <pthread.h>
d983b93c 36#include <stddef.h>
a1e5280d
CB
37#include <stdio.h>
38#include <stdlib.h>
61a1d519 39#include <string.h>
e3642c43 40#include <sys/mman.h>
6e4bb2e0 41#include <sys/mount.h>
a1e5280d
CB
42#include <sys/param.h>
43#include <sys/prctl.h>
44#include <sys/stat.h>
9be53773
SH
45#include <sys/types.h>
46#include <sys/wait.h>
d38dd64a 47#include <unistd.h>
e3642c43 48
d38dd64a 49#include "config.h"
e3642c43 50#include "log.h"
025ed0f3 51#include "lxclock.h"
51d0854c 52#include "namespace.h"
e3db0162 53#include "parse.h"
38e5c2db 54#include "raw_syscalls.h"
b25291da 55#include "syscall_wrappers.h"
981f6029 56#include "utils.h"
e3642c43 57
43f984ea
DJ
58#ifndef HAVE_STRLCPY
59#include "include/strlcpy.h"
60#endif
61
bd583214
DJ
62#ifndef HAVE_STRLCAT
63#include "include/strlcat.h"
64#endif
65
4928c718
SG
66#ifndef O_PATH
67#define O_PATH 010000000
68#endif
69
70#ifndef O_NOFOLLOW
71#define O_NOFOLLOW 00400000
72#endif
73
ac2cecc4 74lxc_log_define(utils, lxc);
e3642c43 75
4295c5de
SH
76/*
77 * if path is btrfs, tries to remove it and any subvolumes beneath it
78 */
79extern bool btrfs_try_remove_subvol(const char *path);
80
/*
 * Delete everything below @dirname and then @dirname itself.
 *
 * @dirname: directory to remove
 * @pdev:    device number the caller obtained for the top-level directory
 * @exclude: optional entry name; at the top level only (@level == 0) an entry
 *           with this name is removed with rmdir()/unlink() and is left in
 *           place when it is a non-empty directory
 * @level:   recursion depth, 0 for the initial call
 * @onedev:  when true, entries whose st_dev differs from @pdev are not
 *           descended into; a btrfs subvolume removal is attempted instead
 *
 * Returns 0 if nothing failed, -1 otherwise.
 */
static int _recursive_rmdir(const char *dirname, dev_t pdev,
			    const char *exclude, int level, bool onedev)
{
	char child[MAXPATHLEN];
	struct dirent *entry;
	DIR *handle;
	bool kept_exclude = false;
	int err = 0;

	handle = opendir(dirname);
	if (!handle) {
		ERROR("failed to open %s", dirname);
		return -1;
	}

	for (entry = readdir(handle); entry; entry = readdir(handle)) {
		const char *name = entry->d_name;
		struct stat st;
		int n;

		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
			continue;

		n = snprintf(child, MAXPATHLEN, "%s/%s", dirname, name);
		if (n < 0 || n >= MAXPATHLEN) {
			ERROR("pathname too long");
			err = 1;
			continue;
		}

		/* The excluded entry is only ever removed when it is empty. */
		if (level == 0 && exclude && strcmp(name, exclude) == 0) {
			if (rmdir(child) < 0) {
				if (errno == ENOTEMPTY) {
					INFO("Not deleting snapshot %s", child);
					kept_exclude = true;
				} else if (errno == ENOTDIR) {
					if (unlink(child))
						INFO("Failed to remove %s", child);
				} else {
					SYSERROR("Failed to rmdir %s", child);
					err = 1;
				}
			}
			continue;
		}

		if (lstat(child, &st)) {
			ERROR("Failed to stat %s", child);
			err = 1;
			continue;
		}

		if (onedev && st.st_dev != pdev) {
			/* TODO should we be checking /proc/self/mountinfo for
			 * pathname and not doing this if found? */
			if (btrfs_try_remove_subvol(child))
				INFO("Removed btrfs subvolume at %s\n", child);
			continue;
		}

		if (!S_ISDIR(st.st_mode)) {
			if (unlink(child) < 0) {
				SYSERROR("Failed to delete %s", child);
				err = 1;
			}
			continue;
		}

		if (_recursive_rmdir(child, pdev, exclude, level + 1, onedev) < 0)
			err = 1;
	}

	/* A directory that still holds the excluded entry is allowed to stay. */
	if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !kept_exclude) {
		ERROR("Failed to delete %s", dirname);
		err = 1;
	}

	if (closedir(handle)) {
		ERROR("Failed to close directory %s", dirname);
		err = 1;
	}

	return err ? -1 : 0;
}
172
29a11a7f
CB
173/* In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
174 * lxc_rmdir_onedev()
0cc417b2
SH
175 */
176static bool is_native_overlayfs(const char *path)
177{
29a11a7f
CB
178 if (has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
179 has_fs_type(path, OVERLAYFS_SUPER_MAGIC))
0cc417b2 180 return true;
29a11a7f 181
0cc417b2
SH
182 return false;
183}
184
/*
 * Recursively remove @path; @exclude is handed through to _recursive_rmdir().
 * The single-device restriction is switched off when @path is on overlayfs.
 * A @path that does not exist counts as success.
 *
 * returns 0 on success, -1 if there were any failures
 */
extern int lxc_rmdir_onedev(const char *path, const char *exclude)
{
	struct stat st;
	bool onedev = !is_native_overlayfs(path);

	if (lstat(path, &st) >= 0)
		return _recursive_rmdir(path, st.st_dev, exclude, 0, onedev);

	if (errno == ENOENT)
		return 0;

	ERROR("Failed to stat %s", path);
	return -1;
}
204
/* borrowed from iproute2
 *
 * Parse @arg as an unsigned number in @base and store it in @val.
 * Returns 0 on success. Returns -1 (leaving @val untouched) when @arg is
 * NULL or empty, contains trailing garbage, does not fit in 16 bits or
 * strtoul() reports an error.
 */
extern int get_u16(unsigned short *val, const char *arg, int base)
{
	char *end = NULL;
	unsigned long parsed;

	if (arg == NULL || arg[0] == '\0')
		return -1;

	errno = 0;
	parsed = strtoul(arg, &end, base);

	if (end == NULL || end == arg || *end != '\0')
		return -1;

	if (parsed > 0xFFFF || errno != 0)
		return -1;

	*val = parsed;
	return 0;
}
223
/*
 * Create @dir together with any missing leading directories, each with
 * @mode. Directories that already exist are not an error.
 *
 * Returns 0 on success and -1 if a mkdir() fails for a reason other than
 * EEXIST or if memory for a path prefix cannot be allocated.
 */
extern int mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;

	do {
		char *makeme;

		/* Step over the separator run, then over the next component. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		/* Everything in front of the component we just stepped over. */
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return -1;

		if (*makeme) {
			if (mkdir(makeme, mode) && errno != EEXIST) {
				SYSERROR("failed to create directory '%s'", makeme);
				free(makeme);
				return -1;
			}
		}

		free(makeme);
	} while (tmp != dir);

	return 0;
}
2a59a681 247
44b9ae4b 248char *get_rundir()
9e60f51d 249{
97a696c6
SG
250 char *rundir;
251 const char *homedir;
9650c735 252 struct stat sb;
9e60f51d 253
b14fc100 254 if (stat(RUNTIME_PATH, &sb) < 0)
9650c735 255 return NULL;
9650c735
TA
256
257 if (geteuid() == sb.st_uid || getegid() == sb.st_gid) {
c580b8d2 258 rundir = strdup(RUNTIME_PATH);
d6470e71
SG
259 return rundir;
260 }
97a696c6
SG
261
262 rundir = getenv("XDG_RUNTIME_DIR");
44b9ae4b
SG
263 if (rundir) {
264 rundir = strdup(rundir);
265 return rundir;
266 }
97a696c6 267
44b9ae4b
SG
268 INFO("XDG_RUNTIME_DIR isn't set in the environment.");
269 homedir = getenv("HOME");
270 if (!homedir) {
271 ERROR("HOME isn't set in the environment.");
272 return NULL;
97a696c6
SG
273 }
274
44b9ae4b 275 rundir = malloc(sizeof(char) * (17 + strlen(homedir)));
b14fc100 276 if (!rundir)
277 return NULL;
278
44b9ae4b
SG
279 sprintf(rundir, "%s/.cache/lxc/run/", homedir);
280
9e60f51d
DE
281 return rundir;
282}
283
9be53773
SH
/*
 * Wait for child @pid to terminate, restarting waitpid() when it is
 * interrupted by a signal.
 *
 * Returns 0 only if the child exited normally with status 0. Returns -1 if
 * waitpid() fails or the child terminated in any other way.
 */
int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
c797a220
CS
305
/*
 * Wait for child @pid to terminate, restarting waitpid() when it is
 * interrupted by a signal.
 *
 * Returns the raw wait status filled in by waitpid() (to be inspected with
 * the W*() macros), or -1 if waitpid() fails.
 */
int lxc_wait_for_pid_status(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		if (reaped == pid)
			return status;
	}
}
92f023dc 324
3ce74686
SH
325#if HAVE_LIBGNUTLS
326#include <gnutls/gnutls.h>
327#include <gnutls/crypto.h>
41246cee
DE
328
/* Constructor: runs gnutls_global_init() before the library is first used. */
static void __attribute__((constructor)) gnutls_lxc_init(void)
{
	gnutls_global_init();
}
334
3ce74686
SH
335int sha1sum_file(char *fnam, unsigned char *digest)
336{
337 char *buf;
338 int ret;
339 FILE *f;
340 long flen;
341
342 if (!fnam)
343 return -1;
b14fc100 344
025ed0f3 345 f = fopen_cloexec(fnam, "r");
7be677a8 346 if (!f) {
3ce74686
SH
347 SYSERROR("Error opening template");
348 return -1;
349 }
b14fc100 350
3ce74686
SH
351 if (fseek(f, 0, SEEK_END) < 0) {
352 SYSERROR("Error seeking to end of template");
dd1d77f9 353 fclose(f);
3ce74686
SH
354 return -1;
355 }
b14fc100 356
3ce74686
SH
357 if ((flen = ftell(f)) < 0) {
358 SYSERROR("Error telling size of template");
dd1d77f9 359 fclose(f);
3ce74686
SH
360 return -1;
361 }
b14fc100 362
3ce74686
SH
363 if (fseek(f, 0, SEEK_SET) < 0) {
364 SYSERROR("Error seeking to start of template");
dd1d77f9 365 fclose(f);
3ce74686
SH
366 return -1;
367 }
b14fc100 368
3ce74686
SH
369 if ((buf = malloc(flen+1)) == NULL) {
370 SYSERROR("Out of memory");
dd1d77f9 371 fclose(f);
3ce74686
SH
372 return -1;
373 }
b14fc100 374
3ce74686
SH
375 if (fread(buf, 1, flen, f) != flen) {
376 SYSERROR("Failure reading template");
377 free(buf);
dd1d77f9 378 fclose(f);
3ce74686
SH
379 return -1;
380 }
b14fc100 381
dd1d77f9 382 if (fclose(f) < 0) {
3ce74686
SH
383 SYSERROR("Failre closing template");
384 free(buf);
385 return -1;
386 }
b14fc100 387
3ce74686
SH
388 buf[flen] = '\0';
389 ret = gnutls_hash_fast(GNUTLS_DIG_SHA1, buf, flen, (void *)digest);
390 free(buf);
391 return ret;
392}
393#endif
61a1d519 394
8bd8018e 395struct lxc_popen_FILE *lxc_popen(const char *command)
ebec9176 396{
3f323207 397 int ret;
ebec9176
AM
398 int pipe_fds[2];
399 pid_t child_pid;
8bd8018e 400 struct lxc_popen_FILE *fp = NULL;
ebec9176 401
8bd8018e
CB
402 ret = pipe2(pipe_fds, O_CLOEXEC);
403 if (ret < 0)
ebec9176 404 return NULL;
ebec9176
AM
405
406 child_pid = fork();
8bd8018e
CB
407 if (child_pid < 0)
408 goto on_error;
409
410 if (!child_pid) {
411 sigset_t mask;
412
413 close(pipe_fds[0]);
414
415 /* duplicate stdout */
416 if (pipe_fds[1] != STDOUT_FILENO)
417 ret = dup2(pipe_fds[1], STDOUT_FILENO);
418 else
419 ret = fcntl(pipe_fds[1], F_SETFD, 0);
420 if (ret < 0) {
421 close(pipe_fds[1]);
03f618af 422 _exit(EXIT_FAILURE);
3f323207
CB
423 }
424
8bd8018e
CB
425 /* duplicate stderr */
426 if (pipe_fds[1] != STDERR_FILENO)
427 ret = dup2(pipe_fds[1], STDERR_FILENO);
428 else
429 ret = fcntl(pipe_fds[1], F_SETFD, 0);
430 close(pipe_fds[1]);
431 if (ret < 0)
03f618af 432 _exit(EXIT_FAILURE);
8bd8018e
CB
433
434 /* unblock all signals */
435 ret = sigfillset(&mask);
436 if (ret < 0)
03f618af 437 _exit(EXIT_FAILURE);
8bd8018e 438
b467714b 439 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
8bd8018e 440 if (ret < 0)
03f618af 441 _exit(EXIT_FAILURE);
8bd8018e
CB
442
443 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
03f618af 444 _exit(127);
ebec9176
AM
445 }
446
8bd8018e
CB
447 close(pipe_fds[1]);
448 pipe_fds[1] = -1;
ebec9176 449
8bd8018e
CB
450 fp = malloc(sizeof(*fp));
451 if (!fp)
452 goto on_error;
b14fc100 453
7e50ec0b 454 memset(fp, 0, sizeof(*fp));
ebec9176
AM
455
456 fp->child_pid = child_pid;
8bd8018e 457 fp->pipe = pipe_fds[0];
ebec9176 458
7e50ec0b
CB
459 /* From now on, closing fp->f will also close fp->pipe. So only ever
460 * call fclose(fp->f).
461 */
8bd8018e
CB
462 fp->f = fdopen(pipe_fds[0], "r");
463 if (!fp->f)
464 goto on_error;
ebec9176 465
8bd8018e 466 return fp;
ebec9176 467
8bd8018e 468on_error:
7e50ec0b
CB
469 /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
470 * called yet. Otherwise the fd belongs to the file opened by fdopen()
471 * since it isn't dup()ed.
472 */
473 if (fp && !fp->f && pipe_fds[0] >= 0)
8bd8018e
CB
474 close(pipe_fds[0]);
475
476 if (pipe_fds[1] >= 0)
477 close(pipe_fds[1]);
ebec9176 478
7e50ec0b
CB
479 if (fp && fp->f)
480 fclose(fp->f);
481
482 if (fp)
483 free(fp);
484
ebec9176
AM
485 return NULL;
486}
487
8bd8018e 488int lxc_pclose(struct lxc_popen_FILE *fp)
ebec9176 489{
ebec9176 490 pid_t wait_pid;
8bd8018e 491 int wstatus = 0;
ebec9176 492
8bd8018e 493 if (!fp)
ebec9176 494 return -1;
ebec9176
AM
495
496 do {
8bd8018e
CB
497 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
498 } while (wait_pid < 0 && errno == EINTR);
ebec9176 499
8bd8018e
CB
500 fclose(fp->f);
501 free(fp);
502
503 if (wait_pid < 0)
ebec9176 504 return -1;
ebec9176
AM
505
506 return wstatus;
507}
508
508c263e
SH
/*
 * Produce a seed for rand(): one unsigned int read from /dev/urandom, or
 * time + pid when /dev/urandom cannot be opened or read.
 *
 * @srand_it: when true the seed is also passed to srand().
 *
 * Returns the seed.
 */
int randseed(bool srand_it)
{
	unsigned int seed = time(NULL) + getpid();
	FILE *urandom = fopen("/dev/urandom", "r");

	if (urandom) {
		int nread = fread(&seed, sizeof(seed), 1, urandom);

		if (nread != 1)
			SYSDEBUG("unable to fread /dev/urandom, fallback to time+pid rand seed");

		fclose(urandom);
	}

	if (srand_it)
		srand(seed);

	return seed;
}
5d897655
SH
531
532uid_t get_ns_uid(uid_t orig)
533{
534 char *line = NULL;
535 size_t sz = 0;
536 uid_t nsid, hostid, range;
537 FILE *f = fopen("/proc/self/uid_map", "r");
538 if (!f)
539 return 0;
540
541 while (getline(&line, &sz, f) != -1) {
542 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
543 continue;
b14fc100 544
5d897655
SH
545 if (hostid <= orig && hostid + range > orig) {
546 nsid += orig - hostid;
547 goto found;
548 }
549 }
550
b962868f
CB
551 nsid = LXC_INVALID_UID;
552
553found:
554 fclose(f);
555 free(line);
556 return nsid;
557}
558
559gid_t get_ns_gid(gid_t orig)
560{
561 char *line = NULL;
562 size_t sz = 0;
563 gid_t nsid, hostid, range;
564 FILE *f = fopen("/proc/self/gid_map", "r");
565 if (!f)
566 return 0;
567
568 while (getline(&line, &sz, f) != -1) {
569 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
570 continue;
571
572 if (hostid <= orig && hostid + range > orig) {
573 nsid += orig - hostid;
574 goto found;
575 }
576 }
577
578 nsid = LXC_INVALID_GID;
b14fc100 579
5d897655
SH
580found:
581 fclose(f);
582 free(line);
583 return nsid;
584}
c476bdce
SH
585
/*
 * Returns true if @path can be stat()ed and is a directory. Any stat()
 * failure, whatever the errno, is reported as false.
 */
bool dir_exists(const char *path)
{
	struct stat sb;

	return stat(path, &sb) >= 0 && S_ISDIR(sb.st_mode);
}
93c379f0
ÇO
598
/* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 * FNV has good anti collision properties and we're not worried
 * about pre-image resistance or one-way-ness, we're just trying to make
 * the name unique in the 108 bytes of space we have.
 *
 * Folds the @len bytes at @buf into the running 64 bit FNV-1a hash @hval
 * and returns the updated hash. With @len == 0, @hval is returned unchanged.
 */
uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
	const unsigned char *cur = buf;
	const unsigned char *end = cur + len;

	while (cur < end) {
		/* xor the bottom with the current octet */
		hval ^= (uint64_t)*cur++;

		/* shift-and-add form of multiplying by the 64 bit FNV prime
		 * (mod 2^64)
		 */
		hval += (hval << 1) + (hval << 4) + (hval << 5) +
			(hval << 7) + (hval << 8) + (hval << 40);
	}

	return hval;
}
2c6f3fc9 622
f6310f18 623bool is_shared_mountpoint(const char *path)
2c6f3fc9 624{
f6310f18 625 char buf[LXC_LINELEN];
2c6f3fc9
SH
626 FILE *f;
627 int i;
f6310f18 628 char *p, *p2;
2c6f3fc9
SH
629
630 f = fopen("/proc/self/mountinfo", "r");
631 if (!f)
632 return 0;
b14fc100 633
eab15c1e
CB
634 while (fgets(buf, LXC_LINELEN, f)) {
635 for (p = buf, i = 0; p && i < 4; i++)
636 p = strchr(p + 1, ' ');
2c6f3fc9
SH
637 if (!p)
638 continue;
b14fc100 639
eab15c1e 640 p2 = strchr(p + 1, ' ');
2c6f3fc9
SH
641 if (!p2)
642 continue;
b14fc100 643
2c6f3fc9 644 *p2 = '\0';
f6310f18
LT
645 if (strcmp(p + 1, path) == 0) {
646 /* This is the path. Is it shared? */
eab15c1e 647 p = strchr(p2 + 1, ' ');
2c6f3fc9
SH
648 if (p && strstr(p, "shared:")) {
649 fclose(f);
f6310f18 650 return true;
2c6f3fc9
SH
651 }
652 }
653 }
b14fc100 654
2c6f3fc9 655 fclose(f);
f6310f18
LT
656 return false;
657}
658
/*
 * Detect whether / is mounted MS_SHARED. The only way I know of to
 * check that is through /proc/self/mountinfo.
 * I'm only checking for /. If the container rootfs or mount location
 * is MS_SHARED, but not '/', then you're out of luck - figuring that
 * out would be too much work to be worth it.
 *
 * Returns 1 if "/" is a shared mountpoint, 0 otherwise.
 */
int detect_shared_rootfs(void)
{
	return is_shared_mountpoint("/") ? 1 : 0;
}
0e6e3a41 672
37ef15bb
CB
/*
 * Enter namespace @ns of process @pid by opening /proc/<pid>/ns/<ns> and
 * calling setns() on it.
 *
 * Returns true on success, false if the path does not fit, cannot be opened
 * or setns() fails.
 */
bool switch_to_ns(pid_t pid, const char *ns)
{
	char nspath[MAXPATHLEN];
	int nsfd, rc;
	bool entered;

	/* Switch to new ns */
	rc = snprintf(nspath, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns);
	if (rc < 0 || rc >= MAXPATHLEN)
		return false;

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0) {
		SYSERROR("Failed to open %s", nspath);
		return false;
	}

	entered = (setns(nsfd, 0) == 0);
	if (!entered)
		SYSERROR("Failed to set process %d to %s of %d.", pid, ns, nsfd);

	close(nsfd);
	return entered;
}
699
b7f954bb
SH
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Returns true if /proc/self/mountinfo has a line whose fifth field is "/"
 * and whose first '-' after that field starts "- rootfs rootfs ". Returns
 * false otherwise or if the file cannot be opened.
 */
bool detect_ramfs_rootfs(void)
{
	char *line = NULL;
	size_t cap = 0;
	bool on_ramfs = false;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (getline(&line, &cap, f) != -1) {
		char *field = line, *field_end, *sep;
		int skipped;

		/* Advance to the space in front of the fifth field. */
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		field_end = strchr(field + 1, ' ');
		if (!field_end)
			continue;

		*field_end = '\0';
		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Is it the ramfs? */
		sep = strchr(field_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0) {
			on_ramfs = true;
			break;
		}
	}

	free(line);
	fclose(f);

	if (on_ramfs)
		INFO("Rootfs is located on ramfs");

	return on_ramfs;
}
746
37ef15bb
CB
747char *on_path(const char *cmd, const char *rootfs)
748{
84c5549b 749 char *entry = NULL, *path = NULL;
0e6e3a41
SG
750 char cmdpath[MAXPATHLEN];
751 int ret;
752
753 path = getenv("PATH");
754 if (!path)
8afb3e61 755 return NULL;
0e6e3a41
SG
756
757 path = strdup(path);
758 if (!path)
8afb3e61 759 return NULL;
0e6e3a41 760
37ef15bb 761 lxc_iterate_parts (entry, path, ":") {
9d9c111c 762 if (rootfs)
37ef15bb
CB
763 ret = snprintf(cmdpath, MAXPATHLEN, "%s/%s/%s", rootfs,
764 entry, cmd);
9d9c111c
SH
765 else
766 ret = snprintf(cmdpath, MAXPATHLEN, "%s/%s", entry, cmd);
0e6e3a41 767 if (ret < 0 || ret >= MAXPATHLEN)
84c5549b 768 continue;
0e6e3a41
SG
769
770 if (access(cmdpath, X_OK) == 0) {
771 free(path);
8afb3e61 772 return strdup(cmdpath);
0e6e3a41 773 }
0e6e3a41
SG
774 }
775
776 free(path);
8afb3e61 777 return NULL;
0e6e3a41 778}
76a26f55 779
12983ba4
SH
/* Returns whatever file_exists() reports for /proc/self/ns/cgroup. */
bool cgns_supported(void)
{
	static const char cgroup_ns_path[] = "/proc/self/ns/cgroup";

	return file_exists(cgroup_ns_path);
}
784
9d9c111c
SH
785/* historically lxc-init has been under /usr/lib/lxc and under
786 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
787 */
788char *choose_init(const char *rootfs)
789{
790 char *retv = NULL;
370ec268
SF
791 const char *empty = "",
792 *tmp;
9d9c111c 793 int ret, env_set = 0;
9d9c111c
SH
794
795 if (!getenv("PATH")) {
796 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
797 SYSERROR("Failed to setenv");
b14fc100 798
9d9c111c
SH
799 env_set = 1;
800 }
801
802 retv = on_path("init.lxc", rootfs);
803
804 if (env_set) {
805 if (unsetenv("PATH"))
806 SYSERROR("Failed to unsetenv");
807 }
808
809 if (retv)
810 return retv;
811
812 retv = malloc(PATH_MAX);
813 if (!retv)
814 return NULL;
815
816 if (rootfs)
370ec268 817 tmp = rootfs;
9d9c111c 818 else
370ec268
SF
819 tmp = empty;
820
821 ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
9d9c111c
SH
822 if (ret < 0 || ret >= PATH_MAX) {
823 ERROR("pathname too long");
824 goto out1;
825 }
b14fc100 826
e57cd7e9 827 if (access(retv, X_OK) == 0)
9d9c111c
SH
828 return retv;
829
370ec268 830 ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
9d9c111c
SH
831 if (ret < 0 || ret >= PATH_MAX) {
832 ERROR("pathname too long");
833 goto out1;
834 }
b14fc100 835
e57cd7e9 836 if (access(retv, X_OK) == 0)
9d9c111c
SH
837 return retv;
838
370ec268 839 ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
9d9c111c
SH
840 if (ret < 0 || ret >= PATH_MAX) {
841 ERROR("pathname too long");
842 goto out1;
843 }
b14fc100 844
e57cd7e9 845 if (access(retv, X_OK) == 0)
9d9c111c
SH
846 return retv;
847
370ec268 848 ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
9d9c111c
SH
849 if (ret < 0 || ret >= PATH_MAX) {
850 ERROR("pathname too long");
851 goto out1;
852 }
b14fc100 853
e57cd7e9 854 if (access(retv, X_OK) == 0)
9d9c111c
SH
855 return retv;
856
857 /*
858 * Last resort, look for the statically compiled init.lxc which we
859 * hopefully bind-mounted in.
860 * If we are called during container setup, and we get to this point,
861 * then the init.lxc.static from the host will need to be bind-mounted
862 * in. So we return NULL here to indicate that.
863 */
864 if (rootfs)
865 goto out1;
866
867 ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
868 if (ret < 0 || ret >= PATH_MAX) {
869 WARN("Nonsense - name /lxc.init.static too long");
870 goto out1;
871 }
b14fc100 872
e57cd7e9 873 if (access(retv, X_OK) == 0)
9d9c111c
SH
874 return retv;
875
876out1:
877 free(retv);
878 return NULL;
879}
735f2c6e 880
6010a416
SG
881/*
882 * Given the '-t' template option to lxc-create, figure out what to
883 * do. If the template is a full executable path, use that. If it
884 * is something like 'sshd', then return $templatepath/lxc-sshd.
885 * On success return the template, on error return NULL.
886 */
887char *get_template_path(const char *t)
888{
889 int ret, len;
890 char *tpath;
891
892 if (t[0] == '/' && access(t, X_OK) == 0) {
893 tpath = strdup(t);
894 return tpath;
895 }
896
897 len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
b14fc100 898
6010a416
SG
899 tpath = malloc(len);
900 if (!tpath)
901 return NULL;
b14fc100 902
6010a416
SG
903 ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
904 if (ret < 0 || ret >= len) {
905 free(tpath);
906 return NULL;
907 }
b14fc100 908
6010a416
SG
909 if (access(tpath, X_OK) < 0) {
910 SYSERROR("bad template: %s", t);
911 free(tpath);
912 return NULL;
913 }
914
915 return tpath;
916}
0a4be28d 917
592fd47a
SH
/*
 * @path: a pathname where / replaced with '\0'.
 * @offsetp: pointer to int showing which path segment was last seen.
 *           Updated on return to reflect the next segment.
 * @fulllen: full original path length.
 * Returns a pointer to the next path segment, or NULL if done.
 *
 * If *@offsetp is already at or past @fulllen it is left untouched.
 */
static char *get_nextpath(char *path, int *offsetp, int fulllen)
{
	int pos = *offsetp;

	if (pos >= fulllen)
		return NULL;

	/* Walk to the end of the current segment ... */
	for (; pos < fulllen && path[pos] != '\0'; pos++)
		;

	/* ... and then across the run of separators that follows it. */
	for (; pos < fulllen && path[pos] == '\0'; pos++)
		;

	*offsetp = pos;

	if (pos >= fulllen)
		return NULL;

	return path + pos;
}
941
/*
 * Check that @subdir is a subdir of @dir. @len is the length of
 * @dir (to avoid having to recalculate it).
 *
 * A path counts as a subdir of itself. The prefix match only counts when it
 * ends on a path component boundary. @len must be at least 1.
 */
static bool is_subdir(const char *subdir, const char *dir, size_t len)
{
	size_t subdirlen = strlen(subdir);

	if (subdirlen < len || strncmp(subdir, dir, len) != 0)
		return false;

	/* Either @dir ends in '/', or @subdir continues with '/' or ends. */
	return dir[len - 1] == '/' || subdir[len] == '/' || subdirlen == len;
}
964
/*
 * Check if the open fd is a symlink. Return -ELOOP if it is. Return
 * -ENOENT if we couldn't fstat. Return 0 if the fd is ok.
 */
static int check_symlink(int fd)
{
	struct stat sb;

	if (fstat(fd, &sb) < 0)
		return -ENOENT;

	return S_ISLNK(sb.st_mode) ? -ELOOP : 0;
}
983
984/*
985 * Open a file or directory, provided that it contains no symlinks.
986 *
987 * CAVEAT: This function must not be used for other purposes than container
988 * setup before executing the container's init
989 */
990static int open_if_safe(int dirfd, const char *nextpath)
991{
992 int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
1a0e70ac 993 if (newfd >= 0) /* Was not a symlink, all good. */
592fd47a
SH
994 return newfd;
995
996 if (errno == ELOOP)
997 return newfd;
998
999 if (errno == EPERM || errno == EACCES) {
1a0e70ac
CB
1000 /* We're not root (cause we got EPERM) so try opening with
1001 * O_PATH.
1002 */
592fd47a
SH
1003 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
1004 if (newfd >= 0) {
1a0e70ac
CB
1005 /* O_PATH will return an fd for symlinks. We know
1006 * nextpath wasn't a symlink at last openat, so if fd is
1007 * now a link, then something * fishy is going on.
592fd47a
SH
1008 */
1009 int ret = check_symlink(newfd);
1010 if (ret < 0) {
1011 close(newfd);
1012 newfd = ret;
1013 }
1014 }
1015 }
1016
1017 return newfd;
1018}
1019
/*
 * Open a path intending for mounting, ensuring that the final path
 * is inside the container's rootfs.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: a part of @target in which to ignore symbolic links. This
 *               would be the container's rootfs.
 *
 * The path is walked one component at a time from @prefix_skip (or "/" when
 * that is NULL or empty), opening each component with open_if_safe().
 *
 * Return an open fd for the path, or <0 on error.
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
	int curlen = 0, dirfd, fulllen;
	char *copy, *slash;

	fulllen = strlen(target);

	/* make sure prefix-skip makes sense */
	if (prefix_skip && strlen(prefix_skip) > 0) {
		curlen = strlen(prefix_skip);
		if (!is_subdir(target, prefix_skip, curlen)) {
			ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
			      target, prefix_skip);
			return -EINVAL;
		}

		/*
		 * get_nextpath() expects the curlen argument to be
		 * on a (turned into \0) / or before it, so decrement
		 * curlen to make sure that happens
		 */
		if (curlen)
			curlen--;
	} else {
		prefix_skip = "/";
		curlen = 0;
	}

	/* Make a copy of target which we can hack up, and tokenize it */
	copy = strdup(target);
	if (!copy) {
		SYSERROR("Out of memory checking for symbolic link");
		return -ENOMEM;
	}

	/* Turn every '/' into a NUL byte. */
	for (slash = strchr(copy, '/'); slash; slash = strchr(slash + 1, '/'))
		*slash = '\0';

	dirfd = open(prefix_skip, O_RDONLY);

	/* Descend until the components run out or one fails to open. */
	while (dirfd >= 0) {
		int newfd, saved_errno;
		char *nextpath;

		nextpath = get_nextpath(copy, &curlen, fulllen);
		if (!nextpath)
			break;

		newfd = open_if_safe(dirfd, nextpath);
		saved_errno = errno;
		close(dirfd);

		dirfd = newfd;
		if (newfd < 0) {
			errno = saved_errno;
			if (errno == ELOOP)
				SYSERROR("%s in %s was a symbolic link!", nextpath, target);
		}
	}

	free(copy);
	return dirfd;
}
1101
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs. (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @dest (and @src, for a bind mount with a relative @src) is first opened
 * without following symlinks. The mount is then done through the
 * /proc/self/fd/<fd> path of the opened fd. @fstype, @flags and @data are
 * passed to mount() unchanged.
 *
 * Returns 0 on success and a negative value on failure.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
	       unsigned long flags, const void *data, const char *rootfs)
{
	int destfd, ret, saved_errno;
	/* Only needs enough for /proc/self/fd/<fd>. */
	char srcbuf[50], destbuf[50];
	int srcfd = -1;
	const char *mntsrc = src;

	if (!rootfs)
		rootfs = "";

	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
	if (flags & MS_BIND && src && src[0] != '/') {
		INFO("this is a relative bind mount");

		srcfd = open_without_symlink(src, NULL);
		if (srcfd < 0)
			return srcfd;

		/* snprintf() returning >= the buffer size means truncation. */
		ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
		if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
			close(srcfd);
			ERROR("Out of memory");
			return -EINVAL;
		}
		mntsrc = srcbuf;
	}

	destfd = open_without_symlink(dest, rootfs);
	if (destfd < 0) {
		if (srcfd != -1) {
			saved_errno = errno;
			close(srcfd);
			errno = saved_errno;
		}

		return destfd;
	}

	ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
	if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
		if (srcfd != -1)
			close(srcfd);

		close(destfd);
		ERROR("Out of memory");
		return -EINVAL;
	}

	ret = mount(mntsrc, destbuf, fstype, flags, data);
	/* close() below may clobber errno; keep mount()'s for the log. */
	saved_errno = errno;
	if (srcfd != -1)
		close(srcfd);

	close(destfd);
	if (ret < 0) {
		errno = saved_errno;
		SYSERROR("Failed to mount %s onto %s", src ? src : "(null)", dest);
		return ret;
	}

	return 0;
}
1174
ced03a01
SH
1175/*
1176 * Mount a proc under @rootfs if proc self points to a pid other than
1177 * my own. This is needed to have a known-good proc mount for setting
1178 * up LSMs both at container startup and attach.
1179 *
1180 * @rootfs : the rootfs where proc should be mounted
1181 *
1182 * Returns < 0 on failure, 0 if the correct proc was already mounted
1183 * and 1 if a new proc was mounted.
f267d666
BP
1184 *
1185 * NOTE: not to be called from inside the container namespace!
ced03a01 1186 */
943144d9 1187int lxc_mount_proc_if_needed(const char *rootfs)
ced03a01
SH
1188{
1189 char path[MAXPATHLEN];
6b1ba5d6 1190 int link_to_pid, linklen, mypid, ret;
40464e8a 1191 char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
ced03a01
SH
1192
1193 ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
1194 if (ret < 0 || ret >= MAXPATHLEN) {
1195 SYSERROR("proc path name too long");
1196 return -1;
1197 }
fc2ad9dc 1198
979a0d93 1199 linklen = readlink(path, link, sizeof(link));
fc2ad9dc 1200
ced03a01 1201 ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
d539a2b2
CB
1202 if (ret < 0 || ret >= MAXPATHLEN) {
1203 SYSERROR("proc path name too long");
1204 return -1;
1205 }
fc2ad9dc
CB
1206
1207 /* /proc not mounted */
1208 if (linklen < 0) {
1209 if (mkdir(path, 0755) && errno != EEXIST)
1210 return -1;
b14fc100 1211
ced03a01 1212 goto domount;
979a0d93 1213 } else if (linklen >= sizeof(link)) {
6b1ba5d6
CB
1214 link[linklen - 1] = '\0';
1215 ERROR("readlink returned truncated content: \"%s\"", link);
1216 return -1;
fc2ad9dc
CB
1217 }
1218
0059379f 1219 mypid = lxc_raw_getpid();
6b1ba5d6
CB
1220 INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1221
2d036cca
CB
1222 if (lxc_safe_int(link, &link_to_pid) < 0)
1223 return -1;
fc2ad9dc 1224
6b1ba5d6
CB
1225 /* correct procfs is already mounted */
1226 if (link_to_pid == mypid)
1227 return 0;
fc2ad9dc 1228
6b1ba5d6
CB
1229 ret = umount2(path, MNT_DETACH);
1230 if (ret < 0)
1231 WARN("failed to umount \"%s\" with MNT_DETACH", path);
ced03a01
SH
1232
1233domount:
fc2ad9dc 1234 /* rootfs is NULL */
6b1ba5d6 1235 if (!strcmp(rootfs, ""))
f267d666
BP
1236 ret = mount("proc", path, "proc", 0, NULL);
1237 else
1238 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
f267d666 1239 if (ret < 0)
ced03a01 1240 return -1;
f267d666 1241
fc2ad9dc 1242 INFO("mounted /proc in container for security transition");
ced03a01
SH
1243 return 1;
1244}
69aeabac 1245
/* Open /dev/null for reading and writing.
 *
 * Returns the new file descriptor, or the negative value returned by open()
 * after logging the failure.
 */
int open_devnull(void)
{
	int devnull_fd;

	devnull_fd = open("/dev/null", O_RDWR);
	if (devnull_fd >= 0)
		return devnull_fd;

	SYSERROR("Can't open /dev/null");
	return devnull_fd;
}
69aeabac 1255
f8dd0275
AM
/* Make stdin, stdout and stderr (in that order) duplicates of @fd.
 *
 * Returns 0 when all three dup2() calls succeed and -1 if @fd is negative or
 * any dup2() fails. Descriptors already replaced before a failure stay
 * replaced; @fd itself is never closed here.
 */
int set_stdfds(int fd)
{
	static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
	size_t i;

	if (fd < 0)
		return -1;

	for (i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
		if (dup2(fd, std_fds[i]) < 0)
			return -1;
	}

	return 0;
}
1277
/* Point stdin, stdout and stderr at /dev/null.
 *
 * Returns -1 if /dev/null cannot be opened, otherwise the result of
 * set_stdfds(). The temporary descriptor is closed in either case.
 */
int null_stdfds(void)
{
	int devnull, rc;

	devnull = open_devnull();
	if (devnull < 0)
		return -1;

	rc = set_stdfds(devnull);
	close(devnull);

	return rc;
}
ccb4cabe 1291
330ae3d3 1292/* Check whether a signal is blocked by a process. */
de3c491b 1293/* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
40464e8a 1294#define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
573ad77f 1295bool task_blocks_signal(pid_t pid, int signal)
330ae3d3 1296{
330ae3d3 1297 int ret;
de3c491b 1298 char status[__PROC_STATUS_LEN];
eabf1ea9 1299 FILE *f;
573ad77f 1300 uint64_t sigblk = 0, one = 1;
eabf1ea9
CB
1301 size_t n = 0;
1302 bool bret = false;
1303 char *line = NULL;
330ae3d3 1304
de3c491b
CB
1305 ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1306 if (ret < 0 || ret >= __PROC_STATUS_LEN)
330ae3d3
CB
1307 return bret;
1308
1309 f = fopen(status, "r");
1310 if (!f)
1311 return bret;
1312
1313 while (getline(&line, &n, f) != -1) {
573ad77f
CB
1314 char *numstr;
1315
eabf1ea9 1316 if (strncmp(line, "SigBlk:", 7))
6fbcbe3b
CB
1317 continue;
1318
573ad77f
CB
1319 numstr = lxc_trim_whitespace_in_place(line + 7);
1320 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1321 if (ret < 0)
6fbcbe3b 1322 goto out;
573ad77f
CB
1323
1324 break;
330ae3d3
CB
1325 }
1326
573ad77f 1327 if (sigblk & (one << (signal - 1)))
330ae3d3
CB
1328 bret = true;
1329
1330out:
1331 free(line);
1332 fclose(f);
1333 return bret;
1334}
000dfda7 1335
a687256f
CB
/* Open a namespace file of a process.
 *
 * @pid : process whose namespace entry is opened
 * @ns  : name below /proc/<pid>/ns; when NULL or empty the ns directory
 *        itself is opened, which lets callers probe whether the kernel
 *        exposes namespaces at all
 *
 * Returns the descriptor from open(O_RDONLY | O_CLOEXEC), or -1 with errno
 * set to EFBIG when the path does not fit the local buffer.
 */
int lxc_preserve_ns(const int pid, const char *ns)
{
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
	char path[__NS_PATH_LEN];
	const bool has_name = ns && ns[0] != '\0';
	const char *separator = has_name ? "/" : "";
	const char *name = has_name ? ns : "";
	int len;

	len = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid, separator, name);
	if (len < 0 || (size_t)len >= __NS_PATH_LEN) {
		errno = EFBIG;
		return -1;
	}

	return open(path, O_RDONLY | O_CLOEXEC);
}
6bc2eafe 1357
464c4611 1358bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
dbaf55a3 1359{
db2d1af1
CB
1360 int ret = 0;
1361
1362 if (gid != LXC_INVALID_GID) {
1363 ret = setgid(gid);
1364 if (ret < 0) {
1365 SYSERROR("Failed to switch to gid %d", gid);
464c4611 1366 return false;
db2d1af1
CB
1367 }
1368 NOTICE("Switched to gid %d", gid);
dbaf55a3 1369 }
dbaf55a3 1370
db2d1af1
CB
1371 if (uid != LXC_INVALID_UID) {
1372 ret = setuid(uid);
1373 if (ret < 0) {
1374 SYSERROR("Failed to switch to uid %d", uid);
464c4611 1375 return false;
db2d1af1
CB
1376 }
1377 NOTICE("Switched to uid %d", uid);
dbaf55a3 1378 }
dbaf55a3 1379
464c4611 1380 return true;
dbaf55a3
CB
1381}
1382
/* Simple convenience wrapper around setgroups() which enables uniform logging.
 *
 * Returns true if setgroups(@size, @list) succeeded and false otherwise.
 */
bool lxc_setgroups(int size, gid_t list[])
{
	const bool ok = setgroups(size, list) >= 0;

	if (ok)
		NOTICE("Dropped additional groups");
	else
		SYSERROR("Failed to setgroups()");

	return ok;
}
c6868a1f
CB
1394
/* Find an unused loop device by scanning /dev for entries starting with
 * "loop".
 *
 * @loop_name : buffer of at least LO_NAME_SIZE bytes which receives the
 *              "/dev/<entry>" path of the device that was picked
 *
 * Returns an open read-write descriptor for the device, or -1 if /dev could
 * not be read or no candidate qualified.
 */
static int lxc_get_unused_loop_dev_legacy(char *loop_name)
{
	struct dirent *dp;
	struct loop_info64 lo64;
	DIR *dir;
	int dfd, ret;
	int fd = -1;

	dir = opendir("/dev");
	if (!dir)
		return -1;

	/* The directory descriptor does not change while iterating. */
	dfd = dirfd(dir);
	if (dfd < 0) {
		closedir(dir);
		return -1;
	}

	while ((dp = readdir(dir))) {
		if (strncmp(dp->d_name, "loop", 4) != 0)
			continue;

		/* O_CLOEXEC for consistency with lxc_get_unused_loop_dev(). */
		fd = openat(dfd, dp->d_name, O_RDWR | O_CLOEXEC);
		if (fd < 0)
			continue;

		/* A successful status query means the device already has a
		 * backing file, i.e. it is in use. Only a failure with ENXIO
		 * is taken as "not bound"; anything else (e.g. the entry is
		 * not a loop device at all) disqualifies the candidate.
		 */
		ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
		if (ret == 0 || errno != ENXIO) {
			close(fd);
			fd = -1;
			continue;
		}

		ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
		if (ret < 0 || ret >= LO_NAME_SIZE) {
			close(fd);
			fd = -1;
			continue;
		}

		break;
	}

	closedir(dir);

	/* fd is either a usable descriptor or -1 at this point. */
	return fd;
}
1445
/* Ask /dev/loop-control for a free loop device and open it.
 *
 * @name_loop : buffer of at least LO_NAME_SIZE bytes which receives the
 *              "/dev/loop<N>" path
 *
 * Returns an open descriptor for the loop device, -ENODEV when
 * /dev/loop-control could not be opened, and -1 for every later failure.
 */
static int lxc_get_unused_loop_dev(char *name_loop)
{
	int ctl_fd, free_nr, len;
	int loop_fd = -1;

	ctl_fd = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
	if (ctl_fd < 0)
		return -ENODEV;

	free_nr = ioctl(ctl_fd, LOOP_CTL_GET_FREE);
	if (free_nr >= 0) {
		len = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", free_nr);
		if (len >= 0 && len < LO_NAME_SIZE)
			loop_fd = open(name_loop, O_RDWR | O_CLOEXEC);
	}

	/* The control descriptor is only needed to obtain the number. */
	close(ctl_fd);
	return loop_fd;
}
1471
/* Attach the image @source to a free loop device.
 *
 * @source   : path of the backing file, opened read-write
 * @loop_dev : buffer of at least LO_NAME_SIZE bytes which receives the path
 *             of the loop device that was used
 * @flags    : value stored in lo_flags of the loop device status
 *
 * A free device is obtained via /dev/loop-control; if that control node is
 * unavailable (-ENODEV) the /dev scan is used instead.
 *
 * Returns an open descriptor for the configured loop device or -1 on
 * failure. On failure no descriptor is leaked and a backing file that was
 * already attached is detached again.
 */
int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
{
	int ret;
	struct loop_info64 lo64;
	int fd_img = -1, fret = -1, fd_loop = -1;

	fd_loop = lxc_get_unused_loop_dev(loop_dev);
	if (fd_loop == -ENODEV)
		fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);

	/* Do not go on to issue ioctls on an invalid descriptor. */
	if (fd_loop < 0) {
		fd_loop = -1;
		goto on_error;
	}

	fd_img = open(source, O_RDWR | O_CLOEXEC);
	if (fd_img < 0)
		goto on_error;

	ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
	if (ret < 0)
		goto on_error;

	memset(&lo64, 0, sizeof(lo64));
	lo64.lo_flags = flags;

	ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
	if (ret < 0) {
		/* The image is attached but the requested flags were not
		 * applied. Detach it again so the device is not left bound
		 * once we close our descriptor. Keep the original errno.
		 */
		int saved_errno = errno;

		(void)ioctl(fd_loop, LOOP_CLR_FD, 0);
		errno = saved_errno;
		goto on_error;
	}

	fret = 0;

on_error:
	if (fd_img >= 0)
		close(fd_img);

	if (fret < 0 && fd_loop >= 0) {
		close(fd_loop);
		fd_loop = -1;
	}

	return fd_loop;
}
74251e49
CB
1514
/* Unmount everything that is stacked on @path.
 *
 * @path : mountpoint to pop mounts from
 * @lazy : use MNT_DETACH for each umount2() call when true
 *
 * umount2() is repeated until it fails. Failure with EINVAL ends the loop
 * normally; any other errno is treated as fatal so that we cannot spin
 * forever. (The alternative would be to keep parsing /proc/self/mountinfo.)
 *
 * Returns the number of successful unmounts (saturating at INT_MAX) or
 * -errno of the fatal umount2() failure.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
	const int umount_flags = lazy ? MNT_DETACH : 0;
	int popped = 0;

	for (;;) {
		if (umount2(path, umount_flags) < 0) {
			if (errno != EINVAL)
				return -errno;

			/* Nothing left to unmount at this path. */
			break;
		}

		/* Stop counting instead of overflowing. */
		if (popped < INT_MAX)
			popped++;
	}

	return popped;
}
ea3a694f
CB
1546
/* Run @child_fn in a child process and capture what it writes to stdout and
 * stderr.
 *
 * @buf      : optional buffer for the captured output
 * @buf_size : size of @buf
 * @child_fn : function executed in the child; it is expected not to return
 *             (if it does, the child exits with EXIT_FAILURE)
 * @args     : argument handed to @child_fn
 *
 * At most @buf_size - 1 bytes are read with a single read call. The last
 * byte that was read is replaced by '\0' (presumably to drop a trailing
 * newline - confirm against callers). @buf always holds a valid string on
 * return.
 *
 * Returns -1 if the pipe or the child could not be created, otherwise the
 * result of wait_for_pid() for the child.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
	pid_t child;
	int ret, fret, pipefd[2];
	ssize_t bytes;

	/* Make sure our callers do not receive uninitialized memory. */
	if (buf_size > 0 && buf)
		buf[0] = '\0';

	if (pipe(pipefd) < 0) {
		SYSERROR("failed to create pipe");
		return -1;
	}

	child = lxc_raw_clone(0);
	if (child < 0) {
		/* Log before closing so that errno still belongs to the
		 * failed clone.
		 */
		SYSERROR("failed to create new process");
		close(pipefd[0]);
		close(pipefd[1]);
		return -1;
	}

	if (child == 0) {
		/* Close the read-end of the pipe. */
		close(pipefd[0]);

		/* Redirect std{err,out} to write-end of the
		 * pipe.
		 */
		ret = dup2(pipefd[1], STDOUT_FILENO);
		if (ret >= 0)
			ret = dup2(pipefd[1], STDERR_FILENO);

		if (ret < 0) {
			SYSERROR("failed to duplicate std{err,out} file descriptor");
			_exit(EXIT_FAILURE);
		}

		/* Close the original write-end of the pipe. If the caller
		 * had stdout or stderr closed, the pipe may have been given
		 * that very descriptor number; closing it then would undo
		 * the redirection.
		 */
		if (pipefd[1] != STDOUT_FILENO && pipefd[1] != STDERR_FILENO)
			close(pipefd[1]);

		/* Does not return. */
		child_fn(args);
		ERROR("failed to exec command");
		_exit(EXIT_FAILURE);
	}

	/* close the write-end of the pipe */
	close(pipefd[1]);

	if (buf && buf_size > 0) {
		bytes = lxc_read_nointr(pipefd[0], buf, buf_size - 1);
		if (bytes > 0)
			buf[bytes - 1] = '\0';
	}

	fret = wait_for_pid(child);
	/* close the read-end of the pipe */
	close(pipefd[0]);

	return fret;
}
04ad7ffe 1610
d75c14e2
CB
/* Check whether a network interface exists.
 *
 * @nic : interface name; the literal "none" is always accepted
 *
 * Returns true for "none" or when /sys/class/net/<nic> can be stat()ed, and
 * false when the resulting path does not fit the buffer or stat() fails.
 */
bool lxc_nic_exists(char *nic)
{
/* "/sys/class/net/" (15) + interface name + '\0' (1) */
#define __LXC_SYS_CLASS_NET_LEN (15 + IFNAMSIZ + 1)
	char path[__LXC_SYS_CLASS_NET_LEN];
	struct stat st;
	int len;

	if (strcmp(nic, "none") == 0)
		return true;

	len = snprintf(path, sizeof(path), "/sys/class/net/%s", nic);
	if (len < 0 || (size_t)len >= sizeof(path))
		return false;

	return stat(path, &st) >= 0;
}
127c6e70 1631
6222c3f4
CB
/* Round @n up to the next power of two.
 *
 * Returns @n itself if it already is a power of two, and 0 for @n == 0 since
 * 0 is neither valid input nor a valid power of two. For @n above 2^63 the
 * result does not fit in 64 bits and wraps to 0.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
	uint64_t v;

	if (n == 0)
		return 0;

	/* Set every bit below the highest set bit of (n - 1), then add one.
	 * Starting from n - 1 keeps exact powers of two unchanged.
	 */
	v = n - 1;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	v |= v >> 32;

	return v + 1;
}
1fd0f41e
CB
1649
/* Request @signal to be delivered to the caller when its parent dies.
 *
 * After setting PR_SET_PDEATHSIG the parent pid is checked: if it is
 * already 1 the caller is considered orphaned and sends SIGKILL to itself.
 *
 * Returns 0 on success and -1 if the self-directed kill() or the prctl()
 * failed (only the latter is logged).
 */
int lxc_set_death_signal(int signal)
{
	int prctl_ret;
	pid_t parent;

	prctl_ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
			  prctl_arg(0), prctl_arg(0));

	/* Check whether we have been orphaned. */
	parent = (pid_t)syscall(SYS_getppid);
	if (parent == 1) {
		if (kill(lxc_raw_getpid(), SIGKILL) < 0)
			return -1;

		/* Only reached if the kill() call returned successfully. */
		return 0;
	}

	if (prctl_ret < 0) {
		SYSERROR("Failed to set PR_SET_PDEATHSIG to %d", signal);
		return -1;
	}

	return 0;
}
7ad37670 1676
a9d4ebc1
CB
/* Set or clear the close-on-exec flag of a file descriptor.
 *
 * @fd      : descriptor to modify
 * @cloexec : true to set FD_CLOEXEC, false to clear it
 *
 * The flags are only written back when they actually change.
 *
 * Returns 0 on success and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
	int current, wanted;

	current = fcntl(fd, F_GETFD, 0);
	if (current < 0)
		return -errno;

	wanted = cloexec ? (current | FD_CLOEXEC) : (current & ~FD_CLOEXEC);

	if (wanted != current && fcntl(fd, F_SETFD, wanted) < 0)
		return -errno;

	return 0;
}
d7ab0375 1698
/* Remove a directory tree that consists of directories only.
 *
 * @dirname : directory to remove
 *
 * Every sub-directory is removed recursively, then @dirname itself is
 * removed with rmdir(). Entries that are not directories are NOT unlinked;
 * if such entries remain, the final rmdir() fails with whatever the
 * filesystem reports.
 *
 * Returns 0 if everything succeeded and -1 if the directory could not be
 * opened or any step failed. Only the first failure at each level is logged.
 */
int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		/* lstat(): symlinks are not followed, so a link to a
		 * directory is not descended into.
		 */
		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat \"%s\"", pathname);

			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;

	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			SYSWARN("Failed to delete \"%s\"", dirname);

		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		if (!r)
			SYSWARN("Failed to close directory \"%s\"", dirname);

		r = -1;
	}

	return r;
}
b25291da
CB
1758
1759int lxc_setup_keyring(void)
1760{
1761 key_serial_t keyring;
1762 int ret = 0;
1763
1764 /* Try to allocate a new session keyring for the container to prevent
1765 * information leaks.
1766 */
1767 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
1768 prctl_arg(0), prctl_arg(0), prctl_arg(0));
1769 if (keyring < 0) {
1770 switch (errno) {
1771 case ENOSYS:
1772 DEBUG("The keyctl() syscall is not supported or blocked");
1773 break;
1774 case EACCES:
1775 __fallthrough;
1776 case EPERM:
1777 DEBUG("Failed to access kernel keyring. Continuing...");
1778 break;
1779 default:
1780 SYSERROR("Failed to create kernel keyring");
1781 ret = -1;
1782 break;
1783 }
1784 }
1785
1786 return ret;
1787}