]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/utils.c
spelling: userns
[mirror_lxc.git] / src / lxc / utils.c
CommitLineData
e3642c43
DL
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
e3642c43
DL
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
e3642c43
DL
22 */
23
d38dd64a
CB
24#ifndef _GNU_SOURCE
25#define _GNU_SOURCE 1
26#endif
7935833c 27#define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
643c1984 28#include <ctype.h>
a1e5280d 29#include <dirent.h>
e3642c43 30#include <errno.h>
a1e5280d 31#include <fcntl.h>
dbaf55a3 32#include <grp.h>
7935833c 33#include <inttypes.h>
a1e5280d 34#include <libgen.h>
b467714b 35#include <pthread.h>
d983b93c 36#include <stddef.h>
a1e5280d
CB
37#include <stdio.h>
38#include <stdlib.h>
61a1d519 39#include <string.h>
e3642c43 40#include <sys/mman.h>
6e4bb2e0 41#include <sys/mount.h>
a1e5280d
CB
42#include <sys/param.h>
43#include <sys/prctl.h>
44#include <sys/stat.h>
9be53773
SH
45#include <sys/types.h>
46#include <sys/wait.h>
d38dd64a 47#include <unistd.h>
e3642c43 48
d38dd64a 49#include "config.h"
e3642c43 50#include "log.h"
025ed0f3 51#include "lxclock.h"
51d0854c 52#include "namespace.h"
e3db0162 53#include "parse.h"
38e5c2db 54#include "raw_syscalls.h"
b25291da 55#include "syscall_wrappers.h"
981f6029 56#include "utils.h"
e3642c43 57
43f984ea
DJ
58#ifndef HAVE_STRLCPY
59#include "include/strlcpy.h"
60#endif
61
bd583214
DJ
62#ifndef HAVE_STRLCAT
63#include "include/strlcat.h"
64#endif
65
4928c718
SG
66#ifndef O_PATH
67#define O_PATH 010000000
68#endif
69
70#ifndef O_NOFOLLOW
71#define O_NOFOLLOW 00400000
72#endif
73
ac2cecc4 74lxc_log_define(utils, lxc);
e3642c43 75
4295c5de
SH
76/*
77 * if path is btrfs, tries to remove it and any subvolumes beneath it
78 */
79extern bool btrfs_try_remove_subvol(const char *path);
80
41dc7155 81static int _recursive_rmdir(const char *dirname, dev_t pdev,
0cc417b2 82 const char *exclude, int level, bool onedev)
60bf62d4 83{
74f96976 84 struct dirent *direntp;
60bf62d4
SH
85 DIR *dir;
86 int ret, failed=0;
d726953a 87 char pathname[PATH_MAX];
18aa217b 88 bool hadexclude = false;
60bf62d4
SH
89
90 dir = opendir(dirname);
91 if (!dir) {
b103ceac 92 ERROR("failed to open %s", dirname);
4355ab5f 93 return -1;
60bf62d4
SH
94 }
95
74f96976 96 while ((direntp = readdir(dir))) {
60bf62d4
SH
97 struct stat mystat;
98 int rc;
99
60bf62d4
SH
100 if (!strcmp(direntp->d_name, ".") ||
101 !strcmp(direntp->d_name, ".."))
102 continue;
103
d726953a
CB
104 rc = snprintf(pathname, PATH_MAX, "%s/%s", dirname, direntp->d_name);
105 if (rc < 0 || rc >= PATH_MAX) {
60bf62d4
SH
106 ERROR("pathname too long");
107 failed=1;
108 continue;
109 }
18aa217b
SH
110
111 if (!level && exclude && !strcmp(direntp->d_name, exclude)) {
112 ret = rmdir(pathname);
113 if (ret < 0) {
114 switch(errno) {
115 case ENOTEMPTY:
0cc417b2 116 INFO("Not deleting snapshot %s", pathname);
18aa217b
SH
117 hadexclude = true;
118 break;
119 case ENOTDIR:
120 ret = unlink(pathname);
121 if (ret)
b103ceac 122 INFO("Failed to remove %s", pathname);
18aa217b
SH
123 break;
124 default:
b103ceac 125 SYSERROR("Failed to rmdir %s", pathname);
18aa217b
SH
126 failed = 1;
127 break;
128 }
129 }
130 continue;
131 }
132
60bf62d4
SH
133 ret = lstat(pathname, &mystat);
134 if (ret) {
b103ceac 135 ERROR("Failed to stat %s", pathname);
4295c5de 136 failed = 1;
60bf62d4
SH
137 continue;
138 }
b14fc100 139
4295c5de
SH
140 if (onedev && mystat.st_dev != pdev) {
141 /* TODO should we be checking /proc/self/mountinfo for
142 * pathname and not doing this if found? */
143 if (btrfs_try_remove_subvol(pathname))
144 INFO("Removed btrfs subvolume at %s\n", pathname);
60bf62d4 145 continue;
4295c5de 146 }
b14fc100 147
60bf62d4 148 if (S_ISDIR(mystat.st_mode)) {
0cc417b2 149 if (_recursive_rmdir(pathname, pdev, exclude, level+1, onedev) < 0)
60bf62d4
SH
150 failed=1;
151 } else {
152 if (unlink(pathname) < 0) {
b103ceac 153 SYSERROR("Failed to delete %s", pathname);
60bf62d4
SH
154 failed=1;
155 }
156 }
157 }
158
4295c5de 159 if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
b103ceac 160 ERROR("Failed to delete %s", dirname);
4295c5de 161 failed=1;
60bf62d4
SH
162 }
163
025ed0f3 164 ret = closedir(dir);
025ed0f3 165 if (ret) {
b103ceac 166 ERROR("Failed to close directory %s", dirname);
60bf62d4
SH
167 failed=1;
168 }
169
4355ab5f 170 return failed ? -1 : 0;
60bf62d4
SH
171}
172
29a11a7f
CB
173/* In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
174 * lxc_rmdir_onedev()
0cc417b2
SH
175 */
176static bool is_native_overlayfs(const char *path)
177{
29a11a7f
CB
178 if (has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
179 has_fs_type(path, OVERLAYFS_SUPER_MAGIC))
0cc417b2 180 return true;
29a11a7f 181
0cc417b2
SH
182 return false;
183}
184
/*
 * Recursively delete @path, leaving the top-level entry named @exclude in
 * place if it is a non-empty directory. The walk stays on the device of
 * @path unless @path is on overlayfs.
 *
 * Returns 0 on success (including when @path does not exist), -1 if there
 * were any failures.
 */
extern int lxc_rmdir_onedev(const char *path, const char *exclude)
{
	struct stat st;
	bool onedev = !is_native_overlayfs(path);

	if (lstat(path, &st) == 0)
		return _recursive_rmdir(path, st.st_dev, exclude, 0, onedev);

	/* Nothing to remove counts as success. */
	if (errno == ENOENT)
		return 0;

	ERROR("Failed to stat %s", path);
	return -1;
}
204
/*
 * Parse @arg as an unsigned 16-bit number in the given @base (borrowed from
 * iproute2).
 *
 * On success stores the value in @val and returns 0. Returns -1, leaving
 * @val untouched, when @arg is NULL or empty, contains trailing characters,
 * cannot be converted, or exceeds 0xFFFF.
 */
extern int get_u16(unsigned short *val, const char *arg, int base)
{
	char *end = NULL;
	unsigned long parsed;

	if (arg == NULL || arg[0] == '\0')
		return -1;

	errno = 0;
	parsed = strtoul(arg, &end, base);
	if (errno != 0)
		return -1;

	/* Require that at least one character was consumed and nothing is
	 * left over.
	 */
	if (end == NULL || end == arg || *end != '\0')
		return -1;

	if (parsed > 0xFFFF)
		return -1;

	*val = parsed;
	return 0;
}
223
/*
 * Create @dir and all of its missing parent directories with @mode.
 *
 * Each pass creates the part of the path that precedes the current
 * component; the final pass creates the full path. Components that already
 * exist are not an error.
 *
 * Returns 0 on success and -1 on failure. errno is ENOMEM when the temporary
 * path copy could not be allocated.
 */
int mkdir_p(const char *dir, mode_t mode)
{
	const char *full = dir;
	const char *component;
	const char *rest = dir;

	for (;;) {
		char *prefix;
		int ret;

		/* Skip separators, then find the end of this component. */
		component = rest + strspn(rest, "/");
		rest = component + strcspn(component, "/");

		/* Pre-set errno so a failing strndup() reports ENOMEM. */
		errno = ENOMEM;
		prefix = strndup(full, component - full);
		if (prefix == NULL)
			return -1;

		ret = mkdir(prefix, mode);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", prefix);
			free(prefix);
			return -1;
		}
		free(prefix);

		/* An empty component means the end of the path was reached. */
		if (rest == component)
			break;
	}

	return 0;
}
2a59a681 252
44b9ae4b 253char *get_rundir()
9e60f51d 254{
97a696c6
SG
255 char *rundir;
256 const char *homedir;
9650c735 257 struct stat sb;
9e60f51d 258
b14fc100 259 if (stat(RUNTIME_PATH, &sb) < 0)
9650c735 260 return NULL;
9650c735
TA
261
262 if (geteuid() == sb.st_uid || getegid() == sb.st_gid) {
c580b8d2 263 rundir = strdup(RUNTIME_PATH);
d6470e71
SG
264 return rundir;
265 }
97a696c6
SG
266
267 rundir = getenv("XDG_RUNTIME_DIR");
44b9ae4b
SG
268 if (rundir) {
269 rundir = strdup(rundir);
270 return rundir;
271 }
97a696c6 272
44b9ae4b
SG
273 INFO("XDG_RUNTIME_DIR isn't set in the environment.");
274 homedir = getenv("HOME");
275 if (!homedir) {
276 ERROR("HOME isn't set in the environment.");
277 return NULL;
97a696c6
SG
278 }
279
44b9ae4b 280 rundir = malloc(sizeof(char) * (17 + strlen(homedir)));
b14fc100 281 if (!rundir)
282 return NULL;
283
44b9ae4b
SG
284 sprintf(rundir, "%s/.cache/lxc/run/", homedir);
285
9e60f51d
DE
286 return rundir;
287}
288
9be53773
SH
/*
 * Wait for child @pid to terminate.
 *
 * Interrupted waits (EINTR) are retried, as are returns for any pid other
 * than @pid.
 *
 * Returns 0 only when the child exited normally with status 0; returns -1
 * if waitpid() failed or the child terminated in any other way.
 */
int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
c797a220
CS
310
/*
 * Wait for child @pid to terminate and hand back its raw wait status.
 *
 * Interrupted waits (EINTR) are retried, as are returns for any pid other
 * than @pid.
 *
 * Returns the status word filled in by waitpid() (to be inspected with the
 * W* macros), or -1 if waitpid() failed.
 */
int lxc_wait_for_pid_status(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		if (reaped == pid)
			return status;
	}
}
92f023dc 329
3ce74686
SH
330#if HAVE_LIBGNUTLS
331#include <gnutls/gnutls.h>
332#include <gnutls/crypto.h>
41246cee
DE
333
/*
 * Initialise the GnuTLS library. The constructor attribute makes this run
 * automatically at load time; the result of the call is ignored.
 */
__attribute__((constructor))
static void gnutls_lxc_init(void)
{
	(void)gnutls_global_init();
}
339
3ce74686
SH
340int sha1sum_file(char *fnam, unsigned char *digest)
341{
342 char *buf;
343 int ret;
344 FILE *f;
345 long flen;
346
347 if (!fnam)
348 return -1;
b14fc100 349
025ed0f3 350 f = fopen_cloexec(fnam, "r");
7be677a8 351 if (!f) {
3ce74686
SH
352 SYSERROR("Error opening template");
353 return -1;
354 }
b14fc100 355
3ce74686
SH
356 if (fseek(f, 0, SEEK_END) < 0) {
357 SYSERROR("Error seeking to end of template");
dd1d77f9 358 fclose(f);
3ce74686
SH
359 return -1;
360 }
b14fc100 361
3ce74686
SH
362 if ((flen = ftell(f)) < 0) {
363 SYSERROR("Error telling size of template");
dd1d77f9 364 fclose(f);
3ce74686
SH
365 return -1;
366 }
b14fc100 367
3ce74686
SH
368 if (fseek(f, 0, SEEK_SET) < 0) {
369 SYSERROR("Error seeking to start of template");
dd1d77f9 370 fclose(f);
3ce74686
SH
371 return -1;
372 }
b14fc100 373
3ce74686
SH
374 if ((buf = malloc(flen+1)) == NULL) {
375 SYSERROR("Out of memory");
dd1d77f9 376 fclose(f);
3ce74686
SH
377 return -1;
378 }
b14fc100 379
3ce74686
SH
380 if (fread(buf, 1, flen, f) != flen) {
381 SYSERROR("Failure reading template");
382 free(buf);
dd1d77f9 383 fclose(f);
3ce74686
SH
384 return -1;
385 }
b14fc100 386
dd1d77f9 387 if (fclose(f) < 0) {
3ce74686
SH
388 SYSERROR("Failre closing template");
389 free(buf);
390 return -1;
391 }
b14fc100 392
3ce74686
SH
393 buf[flen] = '\0';
394 ret = gnutls_hash_fast(GNUTLS_DIG_SHA1, buf, flen, (void *)digest);
395 free(buf);
396 return ret;
397}
398#endif
61a1d519 399
8bd8018e 400struct lxc_popen_FILE *lxc_popen(const char *command)
ebec9176 401{
3f323207 402 int ret;
ebec9176
AM
403 int pipe_fds[2];
404 pid_t child_pid;
8bd8018e 405 struct lxc_popen_FILE *fp = NULL;
ebec9176 406
8bd8018e
CB
407 ret = pipe2(pipe_fds, O_CLOEXEC);
408 if (ret < 0)
ebec9176 409 return NULL;
ebec9176
AM
410
411 child_pid = fork();
8bd8018e
CB
412 if (child_pid < 0)
413 goto on_error;
414
415 if (!child_pid) {
416 sigset_t mask;
417
418 close(pipe_fds[0]);
419
420 /* duplicate stdout */
421 if (pipe_fds[1] != STDOUT_FILENO)
422 ret = dup2(pipe_fds[1], STDOUT_FILENO);
423 else
424 ret = fcntl(pipe_fds[1], F_SETFD, 0);
425 if (ret < 0) {
426 close(pipe_fds[1]);
03f618af 427 _exit(EXIT_FAILURE);
3f323207
CB
428 }
429
8bd8018e
CB
430 /* duplicate stderr */
431 if (pipe_fds[1] != STDERR_FILENO)
432 ret = dup2(pipe_fds[1], STDERR_FILENO);
433 else
434 ret = fcntl(pipe_fds[1], F_SETFD, 0);
435 close(pipe_fds[1]);
436 if (ret < 0)
03f618af 437 _exit(EXIT_FAILURE);
8bd8018e
CB
438
439 /* unblock all signals */
440 ret = sigfillset(&mask);
441 if (ret < 0)
03f618af 442 _exit(EXIT_FAILURE);
8bd8018e 443
b467714b 444 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
8bd8018e 445 if (ret < 0)
03f618af 446 _exit(EXIT_FAILURE);
8bd8018e
CB
447
448 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
03f618af 449 _exit(127);
ebec9176
AM
450 }
451
8bd8018e
CB
452 close(pipe_fds[1]);
453 pipe_fds[1] = -1;
ebec9176 454
8bd8018e
CB
455 fp = malloc(sizeof(*fp));
456 if (!fp)
457 goto on_error;
b14fc100 458
7e50ec0b 459 memset(fp, 0, sizeof(*fp));
ebec9176
AM
460
461 fp->child_pid = child_pid;
8bd8018e 462 fp->pipe = pipe_fds[0];
ebec9176 463
7e50ec0b
CB
464 /* From now on, closing fp->f will also close fp->pipe. So only ever
465 * call fclose(fp->f).
466 */
8bd8018e
CB
467 fp->f = fdopen(pipe_fds[0], "r");
468 if (!fp->f)
469 goto on_error;
ebec9176 470
8bd8018e 471 return fp;
ebec9176 472
8bd8018e 473on_error:
7e50ec0b
CB
474 /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
475 * called yet. Otherwise the fd belongs to the file opened by fdopen()
476 * since it isn't dup()ed.
477 */
478 if (fp && !fp->f && pipe_fds[0] >= 0)
8bd8018e
CB
479 close(pipe_fds[0]);
480
481 if (pipe_fds[1] >= 0)
482 close(pipe_fds[1]);
ebec9176 483
7e50ec0b
CB
484 if (fp && fp->f)
485 fclose(fp->f);
486
487 if (fp)
488 free(fp);
489
ebec9176
AM
490 return NULL;
491}
492
8bd8018e 493int lxc_pclose(struct lxc_popen_FILE *fp)
ebec9176 494{
ebec9176 495 pid_t wait_pid;
8bd8018e 496 int wstatus = 0;
ebec9176 497
8bd8018e 498 if (!fp)
ebec9176 499 return -1;
ebec9176
AM
500
501 do {
8bd8018e
CB
502 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
503 } while (wait_pid < 0 && errno == EINTR);
ebec9176 504
8bd8018e
CB
505 fclose(fp->f);
506 free(fp);
507
508 if (wait_pid < 0)
ebec9176 509 return -1;
ebec9176
AM
510
511 return wstatus;
512}
513
508c263e
SH
/*
 * Produce a seed for rand(), read from /dev/urandom.
 *
 * The seed starts out as time + pid, which stays in effect when
 * /dev/urandom cannot be opened. When @srand_it is true the seed is also
 * passed to srand().
 *
 * Returns the seed.
 */
int randseed(bool srand_it)
{
	unsigned int seed = time(NULL) + getpid();
	FILE *urandom = fopen("/dev/urandom", "r");

	if (urandom != NULL) {
		int nread = fread(&seed, sizeof(seed), 1, urandom);

		if (nread != 1)
			SYSDEBUG("unable to fread /dev/urandom, fallback to time+pid rand seed");

		fclose(urandom);
	}

	if (srand_it)
		srand(seed);

	return seed;
}
5d897655
SH
536
537uid_t get_ns_uid(uid_t orig)
538{
539 char *line = NULL;
540 size_t sz = 0;
541 uid_t nsid, hostid, range;
542 FILE *f = fopen("/proc/self/uid_map", "r");
543 if (!f)
544 return 0;
545
546 while (getline(&line, &sz, f) != -1) {
547 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
548 continue;
b14fc100 549
5d897655
SH
550 if (hostid <= orig && hostid + range > orig) {
551 nsid += orig - hostid;
552 goto found;
553 }
554 }
555
b962868f
CB
556 nsid = LXC_INVALID_UID;
557
558found:
559 fclose(f);
560 free(line);
561 return nsid;
562}
563
564gid_t get_ns_gid(gid_t orig)
565{
566 char *line = NULL;
567 size_t sz = 0;
568 gid_t nsid, hostid, range;
569 FILE *f = fopen("/proc/self/gid_map", "r");
570 if (!f)
571 return 0;
572
573 while (getline(&line, &sz, f) != -1) {
574 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
575 continue;
576
577 if (hostid <= orig && hostid + range > orig) {
578 nsid += orig - hostid;
579 goto found;
580 }
581 }
582
583 nsid = LXC_INVALID_GID;
b14fc100 584
5d897655
SH
585found:
586 fclose(f);
587 free(line);
588 return nsid;
589}
c476bdce
SH
590
/*
 * Report whether @path names a directory (symlinks are followed).
 *
 * Any stat() failure, whatever the reason, is reported as "not a
 * directory".
 */
bool dir_exists(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return false;

	return S_ISDIR(st.st_mode) ? true : false;
}
93c379f0
ÇO
603
/* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 * FNV has good anti collision properties and we're not worried
 * about pre-image resistance or one-way-ness, we're just trying to make
 * the name unique in the 108 bytes of space we have.
 *
 * Folds @len bytes starting at @buf into the running hash @hval and returns
 * the updated hash. Each byte is xor-ed into the hash, which is then
 * multiplied by the 64-bit FNV prime (mod 2^64).
 */
uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
	/* 2^40 + 2^8 + 2^7 + 2^5 + 2^4 + 2^1 + 2^0 */
	const uint64_t fnv_prime = 0x100000001b3ULL;
	const unsigned char *bytes = buf;
	size_t idx;

	for (idx = 0; idx < len; idx++) {
		hval ^= (uint64_t)bytes[idx];
		hval *= fnv_prime;
	}

	return hval;
}
2c6f3fc9 627
f6310f18 628bool is_shared_mountpoint(const char *path)
2c6f3fc9 629{
f6310f18 630 char buf[LXC_LINELEN];
2c6f3fc9
SH
631 FILE *f;
632 int i;
f6310f18 633 char *p, *p2;
2c6f3fc9
SH
634
635 f = fopen("/proc/self/mountinfo", "r");
636 if (!f)
637 return 0;
b14fc100 638
eab15c1e
CB
639 while (fgets(buf, LXC_LINELEN, f)) {
640 for (p = buf, i = 0; p && i < 4; i++)
641 p = strchr(p + 1, ' ');
2c6f3fc9
SH
642 if (!p)
643 continue;
b14fc100 644
eab15c1e 645 p2 = strchr(p + 1, ' ');
2c6f3fc9
SH
646 if (!p2)
647 continue;
b14fc100 648
2c6f3fc9 649 *p2 = '\0';
f6310f18
LT
650 if (strcmp(p + 1, path) == 0) {
651 /* This is the path. Is it shared? */
eab15c1e 652 p = strchr(p2 + 1, ' ');
2c6f3fc9
SH
653 if (p && strstr(p, "shared:")) {
654 fclose(f);
f6310f18 655 return true;
2c6f3fc9
SH
656 }
657 }
658 }
b14fc100 659
2c6f3fc9 660 fclose(f);
f6310f18
LT
661 return false;
662}
663
/*
 * Detect whether / is mounted MS_SHARED. The only way I know of to
 * check that is through /proc/self/mountinfo.
 * I'm only checking for /. If the container rootfs or mount location
 * is MS_SHARED, but not '/', then you're out of luck - figuring that
 * out would be too much work to be worth it.
 *
 * Returns 1 if / is a shared mount point, 0 otherwise.
 */
int detect_shared_rootfs(void)
{
	return is_shared_mountpoint("/") ? 1 : 0;
}
0e6e3a41 677
37ef15bb
CB
678bool switch_to_ns(pid_t pid, const char *ns)
679{
51d0854c 680 int fd, ret;
d726953a 681 char nspath[PATH_MAX];
51d0854c
DY
682
683 /* Switch to new ns */
d726953a
CB
684 ret = snprintf(nspath, PATH_MAX, "/proc/%d/ns/%s", pid, ns);
685 if (ret < 0 || ret >= PATH_MAX)
51d0854c
DY
686 return false;
687
688 fd = open(nspath, O_RDONLY);
689 if (fd < 0) {
a9cb0fb8 690 SYSERROR("Failed to open %s", nspath);
51d0854c
DY
691 return false;
692 }
693
694 ret = setns(fd, 0);
695 if (ret) {
a9cb0fb8 696 SYSERROR("Failed to set process %d to %s of %d.", pid, ns, fd);
51d0854c
DY
697 close(fd);
698 return false;
699 }
b14fc100 700
51d0854c
DY
701 close(fd);
702 return true;
703}
704
b7f954bb
SH
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Returns true when /proc/self/mountinfo has an entry whose fifth field is
 * "/" and whose text from the first following '-' starts with
 * "- rootfs rootfs ". Returns false otherwise, including when mountinfo
 * cannot be opened.
 */
bool detect_ramfs_rootfs(void)
{
	bool on_ramfs = false;
	char *buf = NULL;
	size_t buflen = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL)
		return false;

	while (getline(&buf, &buflen, mountinfo) != -1) {
		char *field = buf;
		char *field_end, *fsinfo;
		int skipped = 0;

		/* Advance to the space in front of the fifth field. */
		while (field != NULL && skipped < 4) {
			field = strchr(field + 1, ' ');
			skipped++;
		}
		if (field == NULL)
			continue;

		field_end = strchr(field + 1, ' ');
		if (field_end == NULL)
			continue;

		*field_end = '\0';
		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Is it the ramfs? */
		fsinfo = strchr(field_end + 1, '-');
		if (fsinfo != NULL && strncmp(fsinfo, "- rootfs rootfs ", 16) == 0) {
			on_ramfs = true;
			break;
		}
	}

	free(buf);
	fclose(mountinfo);

	if (on_ramfs)
		INFO("Rootfs is located on ramfs");

	return on_ramfs;
}
751
37ef15bb
CB
752char *on_path(const char *cmd, const char *rootfs)
753{
84c5549b 754 char *entry = NULL, *path = NULL;
d726953a 755 char cmdpath[PATH_MAX];
0e6e3a41
SG
756 int ret;
757
758 path = getenv("PATH");
759 if (!path)
8afb3e61 760 return NULL;
0e6e3a41
SG
761
762 path = strdup(path);
763 if (!path)
8afb3e61 764 return NULL;
0e6e3a41 765
37ef15bb 766 lxc_iterate_parts (entry, path, ":") {
9d9c111c 767 if (rootfs)
d726953a 768 ret = snprintf(cmdpath, PATH_MAX, "%s/%s/%s", rootfs,
37ef15bb 769 entry, cmd);
9d9c111c 770 else
d726953a
CB
771 ret = snprintf(cmdpath, PATH_MAX, "%s/%s", entry, cmd);
772 if (ret < 0 || ret >= PATH_MAX)
84c5549b 773 continue;
0e6e3a41
SG
774
775 if (access(cmdpath, X_OK) == 0) {
776 free(path);
8afb3e61 777 return strdup(cmdpath);
0e6e3a41 778 }
0e6e3a41
SG
779 }
780
781 free(path);
8afb3e61 782 return NULL;
0e6e3a41 783}
76a26f55 784
12983ba4
SH
/*
 * Report whether cgroup namespaces are available, judged by the existence
 * of /proc/self/ns/cgroup.
 */
bool cgns_supported(void)
{
	static const char cgroup_ns_path[] = "/proc/self/ns/cgroup";

	return file_exists(cgroup_ns_path);
}
789
9d9c111c
SH
790/* historically lxc-init has been under /usr/lib/lxc and under
791 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
792 */
793char *choose_init(const char *rootfs)
794{
795 char *retv = NULL;
370ec268
SF
796 const char *empty = "",
797 *tmp;
9d9c111c 798 int ret, env_set = 0;
9d9c111c
SH
799
800 if (!getenv("PATH")) {
801 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
802 SYSERROR("Failed to setenv");
b14fc100 803
9d9c111c
SH
804 env_set = 1;
805 }
806
807 retv = on_path("init.lxc", rootfs);
808
809 if (env_set) {
810 if (unsetenv("PATH"))
811 SYSERROR("Failed to unsetenv");
812 }
813
814 if (retv)
815 return retv;
816
817 retv = malloc(PATH_MAX);
818 if (!retv)
819 return NULL;
820
821 if (rootfs)
370ec268 822 tmp = rootfs;
9d9c111c 823 else
370ec268
SF
824 tmp = empty;
825
826 ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
9d9c111c
SH
827 if (ret < 0 || ret >= PATH_MAX) {
828 ERROR("pathname too long");
829 goto out1;
830 }
b14fc100 831
e57cd7e9 832 if (access(retv, X_OK) == 0)
9d9c111c
SH
833 return retv;
834
370ec268 835 ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
9d9c111c
SH
836 if (ret < 0 || ret >= PATH_MAX) {
837 ERROR("pathname too long");
838 goto out1;
839 }
b14fc100 840
e57cd7e9 841 if (access(retv, X_OK) == 0)
9d9c111c
SH
842 return retv;
843
370ec268 844 ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
9d9c111c
SH
845 if (ret < 0 || ret >= PATH_MAX) {
846 ERROR("pathname too long");
847 goto out1;
848 }
b14fc100 849
e57cd7e9 850 if (access(retv, X_OK) == 0)
9d9c111c
SH
851 return retv;
852
370ec268 853 ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
9d9c111c
SH
854 if (ret < 0 || ret >= PATH_MAX) {
855 ERROR("pathname too long");
856 goto out1;
857 }
b14fc100 858
e57cd7e9 859 if (access(retv, X_OK) == 0)
9d9c111c
SH
860 return retv;
861
862 /*
863 * Last resort, look for the statically compiled init.lxc which we
864 * hopefully bind-mounted in.
865 * If we are called during container setup, and we get to this point,
866 * then the init.lxc.static from the host will need to be bind-mounted
867 * in. So we return NULL here to indicate that.
868 */
869 if (rootfs)
870 goto out1;
871
872 ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
873 if (ret < 0 || ret >= PATH_MAX) {
874 WARN("Nonsense - name /lxc.init.static too long");
875 goto out1;
876 }
b14fc100 877
e57cd7e9 878 if (access(retv, X_OK) == 0)
9d9c111c
SH
879 return retv;
880
881out1:
882 free(retv);
883 return NULL;
884}
735f2c6e 885
6010a416
SG
886/*
887 * Given the '-t' template option to lxc-create, figure out what to
888 * do. If the template is a full executable path, use that. If it
889 * is something like 'sshd', then return $templatepath/lxc-sshd.
890 * On success return the template, on error return NULL.
891 */
892char *get_template_path(const char *t)
893{
894 int ret, len;
895 char *tpath;
896
897 if (t[0] == '/' && access(t, X_OK) == 0) {
898 tpath = strdup(t);
899 return tpath;
900 }
901
902 len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
b14fc100 903
6010a416
SG
904 tpath = malloc(len);
905 if (!tpath)
906 return NULL;
b14fc100 907
6010a416
SG
908 ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
909 if (ret < 0 || ret >= len) {
910 free(tpath);
911 return NULL;
912 }
b14fc100 913
6010a416
SG
914 if (access(tpath, X_OK) < 0) {
915 SYSERROR("bad template: %s", t);
916 free(tpath);
917 return NULL;
918 }
919
920 return tpath;
921}
0a4be28d 922
592fd47a
SH
/*
 * @path: a pathname where / replaced with '\0'.
 * @offsetp: pointer to int showing which path segment was last seen.
 *           Updated on return to reflect the next segment.
 * @fulllen: full original path length.
 * Returns a pointer to the next path segment, or NULL if done.
 *
 * When *@offsetp is already at or past @fulllen, NULL is returned and
 * *@offsetp is left unchanged.
 */
static char *get_nextpath(char *path, int *offsetp, int fulllen)
{
	int pos = *offsetp;

	if (pos >= fulllen)
		return NULL;

	/* Step over the remainder of the current segment ... */
	for (; pos < fulllen; pos++)
		if (path[pos] == '\0')
			break;

	/* ... and over the separator(s) that follow it. */
	for (; pos < fulllen; pos++)
		if (path[pos] != '\0')
			break;

	*offsetp = pos;

	if (pos >= fulllen)
		return NULL;

	return path + pos;
}
946
/*
 * Check that @subdir is a subdir of @dir. @len is the length of
 * @dir (to avoid having to recalculate it).
 *
 * @subdir counts as a subdir when it starts with @dir and either @dir ends
 * in '/', or the match ends exactly at a '/' or at the end of @subdir.
 * @len is expected to be non-zero, since dir[len - 1] is inspected.
 */
static bool is_subdir(const char *subdir, const char *dir, size_t len)
{
	size_t sublen = strlen(subdir);

	if (sublen < len || strncmp(subdir, dir, len) != 0)
		return false;

	if (dir[len - 1] == '/')
		return true;

	/* Reject partial component matches such as "/ab" against "/a". */
	return subdir[len] == '/' || sublen == len;
}
969
/*
 * Check if the open fd is a symlink. Return -ELOOP if it is. Return
 * -ENOENT if we couldn't fstat. Return 0 if the fd is ok.
 */
static int check_symlink(int fd)
{
	struct stat st;

	if (fstat(fd, &st) < 0)
		return -ENOENT;

	return S_ISLNK(st.st_mode) ? -ELOOP : 0;
}
988
989/*
990 * Open a file or directory, provided that it contains no symlinks.
991 *
992 * CAVEAT: This function must not be used for other purposes than container
993 * setup before executing the container's init
994 */
995static int open_if_safe(int dirfd, const char *nextpath)
996{
997 int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
1a0e70ac 998 if (newfd >= 0) /* Was not a symlink, all good. */
592fd47a
SH
999 return newfd;
1000
1001 if (errno == ELOOP)
1002 return newfd;
1003
1004 if (errno == EPERM || errno == EACCES) {
1a0e70ac
CB
1005 /* We're not root (cause we got EPERM) so try opening with
1006 * O_PATH.
1007 */
592fd47a
SH
1008 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
1009 if (newfd >= 0) {
1a0e70ac
CB
1010 /* O_PATH will return an fd for symlinks. We know
1011 * nextpath wasn't a symlink at last openat, so if fd is
1012 * now a link, then something * fishy is going on.
592fd47a
SH
1013 */
1014 int ret = check_symlink(newfd);
1015 if (ret < 0) {
1016 close(newfd);
1017 newfd = ret;
1018 }
1019 }
1020 }
1021
1022 return newfd;
1023}
1024
/*
 * Open a path intending for mounting, ensuring that the final path
 * is inside the container's rootfs.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: a part of @target in which to ignore symbolic links. This
 *               would be the container's rootfs. NULL or "" means "/".
 *
 * The part of @target after @prefix_skip is opened one component at a
 * time with open_if_safe().
 *
 * Return an open fd for the path, or <0 on error (-EINVAL if @target does
 * not start with @prefix_skip, -ENOMEM on allocation failure).
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
	int offset = 0, fd, idx, total;
	char *segments, *seg;

	total = strlen(target);

	if (prefix_skip == NULL || strlen(prefix_skip) == 0) {
		prefix_skip = "/";
	} else {
		/* make sure prefix-skip makes sense */
		offset = strlen(prefix_skip);
		if (!is_subdir(target, prefix_skip, offset)) {
			ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
			      target, prefix_skip);
			return -EINVAL;
		}

		/*
		 * get_nextpath() expects the offset argument to be
		 * on a (turned into \0) / or before it, so step back
		 * by one to make sure that happens
		 */
		offset--;
	}

	/* Make a copy of target which we can hack up, and tokenize it */
	segments = strdup(target);
	if (segments == NULL) {
		SYSERROR("Out of memory checking for symbolic link");
		return -ENOMEM;
	}

	for (idx = 0; idx < total; idx++)
		if (segments[idx] == '/')
			segments[idx] = '\0';

	fd = open(prefix_skip, O_RDONLY);

	/* Descend component by component; each step replaces fd with the fd
	 * of the next component, or with a negative value on failure.
	 */
	while (fd >= 0 && (seg = get_nextpath(segments, &offset, total)) != NULL) {
		int next, saved_errno;

		next = open_if_safe(fd, seg);
		saved_errno = errno;
		close(fd);

		fd = next;
		if (next < 0) {
			errno = saved_errno;
			if (errno == ELOOP)
				SYSERROR("%s in %s was a symbolic link!", seg, target);
		}
	}

	free(segments);
	return fd;
}
1106
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs. (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @src, @fstype, @flags, @data: passed on to mount()
 * @dest:   mount target; opened without following symlinks below @rootfs
 * @rootfs: prefix of @dest in which symlinks are tolerated
 *
 * The target, and the source of a relative bind mount, are opened first and
 * then handed to mount() as /proc/self/fd/<fd> paths.
 *
 * Returns 0 on success and a negative value on failure.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
	       unsigned long flags, const void *data, const char *rootfs)
{
	int destfd, ret, saved_errno;
	/* Only needs enough for /proc/self/fd/<fd>. */
	char srcbuf[50], destbuf[50];
	int srcfd = -1;
	const char *mntsrc = src;

	if (!rootfs)
		rootfs = "";

	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
	if ((flags & MS_BIND) && src && src[0] != '/') {
		INFO("this is a relative bind mount");

		srcfd = open_without_symlink(src, NULL);
		if (srcfd < 0)
			return srcfd;

		/* A return value equal to the buffer size already means the
		 * output was truncated.
		 */
		ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
		if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
			close(srcfd);
			ERROR("Out of memory");
			return -EINVAL;
		}
		mntsrc = srcbuf;
	}

	destfd = open_without_symlink(dest, rootfs);
	if (destfd < 0) {
		if (srcfd != -1) {
			/* Keep the errno of the failed open for the caller. */
			saved_errno = errno;
			close(srcfd);
			errno = saved_errno;
		}

		return destfd;
	}

	ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
	if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
		if (srcfd != -1)
			close(srcfd);

		close(destfd);
		ERROR("Out of memory");
		return -EINVAL;
	}

	ret = mount(mntsrc, destbuf, fstype, flags, data);
	saved_errno = errno;
	if (srcfd != -1)
		close(srcfd);

	close(destfd);
	if (ret < 0) {
		/* close() may have clobbered errno; restore it for the log. */
		errno = saved_errno;
		SYSERROR("Failed to mount %s onto %s", src ? src : "(null)", dest);
		return ret;
	}

	return 0;
}
1179
ced03a01
SH
1180/*
1181 * Mount a proc under @rootfs if proc self points to a pid other than
1182 * my own. This is needed to have a known-good proc mount for setting
1183 * up LSMs both at container startup and attach.
1184 *
1185 * @rootfs : the rootfs where proc should be mounted
1186 *
1187 * Returns < 0 on failure, 0 if the correct proc was already mounted
1188 * and 1 if a new proc was mounted.
f267d666
BP
1189 *
1190 * NOTE: not to be called from inside the container namespace!
ced03a01 1191 */
943144d9 1192int lxc_mount_proc_if_needed(const char *rootfs)
ced03a01 1193{
d726953a 1194 char path[PATH_MAX];
6b1ba5d6 1195 int link_to_pid, linklen, mypid, ret;
40464e8a 1196 char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
ced03a01 1197
d726953a
CB
1198 ret = snprintf(path, PATH_MAX, "%s/proc/self", rootfs);
1199 if (ret < 0 || ret >= PATH_MAX) {
ced03a01
SH
1200 SYSERROR("proc path name too long");
1201 return -1;
1202 }
fc2ad9dc 1203
979a0d93 1204 linklen = readlink(path, link, sizeof(link));
fc2ad9dc 1205
d726953a
CB
1206 ret = snprintf(path, PATH_MAX, "%s/proc", rootfs);
1207 if (ret < 0 || ret >= PATH_MAX) {
d539a2b2
CB
1208 SYSERROR("proc path name too long");
1209 return -1;
1210 }
fc2ad9dc
CB
1211
1212 /* /proc not mounted */
1213 if (linklen < 0) {
1214 if (mkdir(path, 0755) && errno != EEXIST)
1215 return -1;
b14fc100 1216
ced03a01 1217 goto domount;
979a0d93 1218 } else if (linklen >= sizeof(link)) {
6b1ba5d6
CB
1219 link[linklen - 1] = '\0';
1220 ERROR("readlink returned truncated content: \"%s\"", link);
1221 return -1;
fc2ad9dc
CB
1222 }
1223
0059379f 1224 mypid = lxc_raw_getpid();
6b1ba5d6
CB
1225 INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1226
2d036cca
CB
1227 if (lxc_safe_int(link, &link_to_pid) < 0)
1228 return -1;
fc2ad9dc 1229
6b1ba5d6
CB
1230 /* correct procfs is already mounted */
1231 if (link_to_pid == mypid)
1232 return 0;
fc2ad9dc 1233
6b1ba5d6
CB
1234 ret = umount2(path, MNT_DETACH);
1235 if (ret < 0)
1236 WARN("failed to umount \"%s\" with MNT_DETACH", path);
ced03a01
SH
1237
1238domount:
fc2ad9dc 1239 /* rootfs is NULL */
6b1ba5d6 1240 if (!strcmp(rootfs, ""))
f267d666
BP
1241 ret = mount("proc", path, "proc", 0, NULL);
1242 else
1243 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
f267d666 1244 if (ret < 0)
ced03a01 1245 return -1;
f267d666 1246
fc2ad9dc 1247 INFO("mounted /proc in container for security transition");
ced03a01
SH
1248 return 1;
1249}
69aeabac 1250
/*
 * Open /dev/null for reading and writing.
 *
 * Returns the new file descriptor, or the negative value returned by
 * open() after logging the failure.
 */
int open_devnull(void)
{
	int fd;

	fd = open("/dev/null", O_RDWR);
	if (fd >= 0)
		return fd;

	SYSERROR("Can't open /dev/null");
	return fd;
}
69aeabac 1260
f8dd0275
AM
/*
 * Duplicate @fd onto stdin, stdout and stderr (in that order).
 *
 * Returns 0 on success and -1 if @fd is negative or any dup2() fails.
 * Descriptors that were already replaced before a failure are not restored.
 */
int set_stdfds(int fd)
{
	static const int std_fds[] = {
		STDIN_FILENO,
		STDOUT_FILENO,
		STDERR_FILENO,
	};
	size_t i;

	if (fd < 0)
		return -1;

	for (i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
		if (dup2(fd, std_fds[i]) < 0)
			return -1;
	}

	return 0;
}
1282
/*
 * Point stdin, stdout and stderr at /dev/null.
 *
 * Returns the result of set_stdfds(), or -1 if /dev/null could not be
 * opened.
 */
int null_stdfds(void)
{
	int devnull, rc;

	devnull = open_devnull();
	if (devnull < 0)
		return -1;

	rc = set_stdfds(devnull);
	/* The standard descriptors hold their own duplicates now. */
	close(devnull);

	return rc;
}
ccb4cabe 1296
330ae3d3 1297/* Check whether a signal is blocked by a process. */
de3c491b 1298/* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
40464e8a 1299#define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
573ad77f 1300bool task_blocks_signal(pid_t pid, int signal)
330ae3d3 1301{
330ae3d3 1302 int ret;
de3c491b 1303 char status[__PROC_STATUS_LEN];
eabf1ea9 1304 FILE *f;
573ad77f 1305 uint64_t sigblk = 0, one = 1;
eabf1ea9
CB
1306 size_t n = 0;
1307 bool bret = false;
1308 char *line = NULL;
330ae3d3 1309
de3c491b
CB
1310 ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1311 if (ret < 0 || ret >= __PROC_STATUS_LEN)
330ae3d3
CB
1312 return bret;
1313
1314 f = fopen(status, "r");
1315 if (!f)
1316 return bret;
1317
1318 while (getline(&line, &n, f) != -1) {
573ad77f
CB
1319 char *numstr;
1320
eabf1ea9 1321 if (strncmp(line, "SigBlk:", 7))
6fbcbe3b
CB
1322 continue;
1323
573ad77f
CB
1324 numstr = lxc_trim_whitespace_in_place(line + 7);
1325 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1326 if (ret < 0)
6fbcbe3b 1327 goto out;
573ad77f
CB
1328
1329 break;
330ae3d3
CB
1330 }
1331
573ad77f 1332 if (sigblk & (one << (signal - 1)))
330ae3d3
CB
1333 bret = true;
1334
1335out:
1336 free(line);
1337 fclose(f);
1338 return bret;
1339}
000dfda7 1340
a687256f
CB
/*
 * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC set.
 *
 * Passing NULL or the empty string as @ns opens the /proc/<pid>/ns
 * directory itself, which allows callers to probe whether namespaces are
 * supported by the kernel at all.
 *
 * Returns the file descriptor from open(), or -1 with errno set to EFBIG
 * when the path does not fit into the internal buffer.
 */
int lxc_preserve_ns(const int pid, const char *ns)
{
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
	char path[__NS_PATH_LEN];
	const char *sep = "/";
	const char *name = ns;
	int len;

	if (!ns || ns[0] == '\0') {
		sep = "";
		name = "";
	}

	len = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid, sep, name);
	if (len < 0 || (size_t)len >= __NS_PATH_LEN) {
		errno = EFBIG;
		return -1;
	}

	return open(path, O_RDONLY | O_CLOEXEC);
}
6bc2eafe 1362
464c4611 1363bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
dbaf55a3 1364{
db2d1af1
CB
1365 int ret = 0;
1366
1367 if (gid != LXC_INVALID_GID) {
1368 ret = setgid(gid);
1369 if (ret < 0) {
1370 SYSERROR("Failed to switch to gid %d", gid);
464c4611 1371 return false;
db2d1af1
CB
1372 }
1373 NOTICE("Switched to gid %d", gid);
dbaf55a3 1374 }
dbaf55a3 1375
db2d1af1
CB
1376 if (uid != LXC_INVALID_UID) {
1377 ret = setuid(uid);
1378 if (ret < 0) {
1379 SYSERROR("Failed to switch to uid %d", uid);
464c4611 1380 return false;
db2d1af1
CB
1381 }
1382 NOTICE("Switched to uid %d", uid);
dbaf55a3 1383 }
dbaf55a3 1384
464c4611 1385 return true;
dbaf55a3
CB
1386}
1387
/*
 * Thin wrapper around setgroups() that logs the outcome uniformly.
 *
 * Returns true if setgroups(@size, @list) succeeded, false otherwise.
 */
bool lxc_setgroups(int size, gid_t list[])
{
	int rc;

	rc = setgroups(size, list);
	if (rc < 0) {
		SYSERROR("Failed to setgroups()");
		return false;
	}

	NOTICE("Dropped additional groups");
	return true;
}
c6868a1f
CB
1399
/*
 * Find an unused loop device by scanning /dev for entries whose name
 * starts with "loop".
 *
 * A device counts as unused only when LOOP_GET_STATUS64 fails with ENXIO.
 * A successful status query means a backing file is attached, so the
 * device is skipped; any other error skips the entry as well.
 *
 * @loop_name : buffer of at least LO_NAME_SIZE bytes that receives the
 *              "/dev/<name>" path of the device that was found
 *
 * Returns an open O_RDWR | O_CLOEXEC file descriptor for the device, or -1
 * if /dev cannot be read or no unused device exists.
 */
static int lxc_get_unused_loop_dev_legacy(char *loop_name)
{
	struct dirent *dp;
	struct loop_info64 lo64;
	DIR *dir;
	int dfd = -1, fd = -1, ret = -1;

	dir = opendir("/dev");
	if (!dir)
		return -1;

	dfd = dirfd(dir);
	if (dfd < 0) {
		closedir(dir);
		return -1;
	}

	while ((dp = readdir(dir))) {
		if (strncmp(dp->d_name, "loop", 4) != 0)
			continue;

		fd = openat(dfd, dp->d_name, O_RDWR | O_CLOEXEC);
		if (fd < 0)
			continue;

		ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
		if (ret == 0 || errno != ENXIO) {
			/* In use, or not something we can query: skip it. */
			close(fd);
			fd = -1;
			continue;
		}

		ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
		if (ret < 0 || ret >= LO_NAME_SIZE) {
			close(fd);
			fd = -1;
			continue;
		}

		break;
	}

	closedir(dir);

	if (fd < 0)
		return -1;

	return fd;
}
1450
/*
 * Ask /dev/loop-control for a free loop device and open it.
 *
 * @name_loop : buffer of at least LO_NAME_SIZE bytes that receives the
 *              "/dev/loop<N>" path
 *
 * Returns an open file descriptor for the loop device, -ENODEV if
 * /dev/loop-control cannot be opened, and -1 on any later failure.
 */
static int lxc_get_unused_loop_dev(char *name_loop)
{
	int ctl_fd, free_nr, len;
	int loop_fd = -1;

	ctl_fd = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
	if (ctl_fd < 0)
		return -ENODEV;

	free_nr = ioctl(ctl_fd, LOOP_CTL_GET_FREE);
	if (free_nr >= 0) {
		len = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", free_nr);
		if (len >= 0 && len < LO_NAME_SIZE)
			loop_fd = open(name_loop, O_RDWR | O_CLOEXEC);
	}

	close(ctl_fd);
	return loop_fd;
}
1476
/*
 * Attach the image @source to an unused loop device.
 *
 * The device is obtained through /dev/loop-control; only when that control
 * node is unavailable (-ENODEV) is the legacy /dev scan used instead.
 *
 * @source   : path of the image file, opened O_RDWR | O_CLOEXEC
 * @loop_dev : buffer of at least LO_NAME_SIZE bytes that receives the path
 *             of the loop device
 * @flags    : value stored in lo_flags via LOOP_SET_STATUS64
 *
 * Returns an open file descriptor for the configured loop device, or -1
 * on failure.
 */
int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
{
	int ret;
	struct loop_info64 lo64;
	int fd_img = -1, fret = -1, fd_loop = -1;

	fd_loop = lxc_get_unused_loop_dev(loop_dev);
	if (fd_loop == -ENODEV)
		fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);

	/* Without a loop device there is nothing to attach the image to. */
	if (fd_loop < 0)
		return -1;

	fd_img = open(source, O_RDWR | O_CLOEXEC);
	if (fd_img < 0)
		goto on_error;

	ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
	if (ret < 0)
		goto on_error;

	memset(&lo64, 0, sizeof(lo64));
	lo64.lo_flags = flags;

	ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
	if (ret < 0)
		goto on_error;

	fret = 0;

on_error:
	/* The image descriptor is closed on both the success and error path. */
	if (fd_img >= 0)
		close(fd_img);

	if (fret < 0 && fd_loop >= 0) {
		close(fd_loop);
		fd_loop = -1;
	}

	return fd_loop;
}
74251e49
CB
1519
/*
 * Unmount everything stacked on @path.
 *
 * umount2() is called repeatedly (with MNT_DETACH when @lazy is set) until
 * it fails. A failure with EINVAL ends the loop normally; any other errno
 * is treated as fatal so that we can never loop forever. (The alternative
 * would be to keep parsing /proc/self/mountinfo, which is yucky and
 * probably racy.)
 *
 * Returns the number of successful unmounts (saturating at INT_MAX), or
 * -errno on a fatal error.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
	const int umount_flags = lazy ? MNT_DETACH : 0;
	int count = 0;

	for (;;) {
		if (umount2(path, umount_flags) < 0) {
			if (errno != EINVAL)
				return -errno;

			break;
		}

		/* Stop counting rather than overflow; a stack that deep is
		 * not worth reporting precisely.
		 */
		if (count != INT_MAX)
			count++;
	}

	return count;
}
ea3a694f
CB
1551
/*
 * Run @child_fn(@args) in a child process and capture its output.
 *
 * The child's stdout and stderr are redirected into a pipe. When @buf is
 * non-NULL and @buf_size > 0 the parent performs a single read of at most
 * @buf_size - 1 bytes from that pipe and overwrites the last byte read
 * with '\0' (presumably to drop a trailing newline - confirm with callers).
 * @buf is always initialised to the empty string first.
 *
 * @child_fn is expected not to return; if it does, the child logs an
 * error and exits with EXIT_FAILURE.
 *
 * Returns -1 if the pipe or the child could not be created, otherwise the
 * value returned by wait_for_pid() for the child.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
	int fds[2];
	int status;
	pid_t pid;

	/* Make sure our callers do not receive uninitialized memory. */
	if (buf && buf_size > 0)
		buf[0] = '\0';

	if (pipe(fds) < 0) {
		SYSERROR("failed to create pipe");
		return -1;
	}

	pid = lxc_raw_clone(0);
	if (pid < 0) {
		close(fds[0]);
		close(fds[1]);
		SYSERROR("failed to create new process");
		return -1;
	}

	if (pid == 0) {
		int rc;

		/* The child only writes; drop the read-end. */
		close(fds[0]);

		/* Send std{out,err} into the write-end of the pipe. */
		rc = dup2(fds[1], STDOUT_FILENO);
		if (rc >= 0)
			rc = dup2(fds[1], STDERR_FILENO);

		/* The duplicates keep the pipe open; drop the original. */
		close(fds[1]);

		if (rc < 0) {
			SYSERROR("failed to duplicate std{err,out} file descriptor");
			_exit(EXIT_FAILURE);
		}

		/* Does not return. */
		child_fn(args);
		ERROR("failed to exec command");
		_exit(EXIT_FAILURE);
	}

	/* The parent only reads; drop the write-end. */
	close(fds[1]);

	if (buf && buf_size > 0) {
		ssize_t nread;

		nread = lxc_read_nointr(fds[0], buf, buf_size - 1);
		if (nread > 0)
			buf[nread - 1] = '\0';
	}

	status = wait_for_pid(pid);

	/* Done with the read-end of the pipe. */
	close(fds[0]);

	return status;
}
04ad7ffe 1615
d75c14e2
CB
/*
 * Check whether the network interface @nic exists.
 *
 * The special name "none" is always reported as existing. Any other name
 * exists if stat() on /sys/class/net/<nic> succeeds.
 *
 * Returns false for a NULL @nic, for names that do not fit into the path
 * buffer and for interfaces that are not present.
 */
bool lxc_nic_exists(char *nic)
{
/* strlen("/sys/class/net/") + interface name + \0; parenthesised so the
 * macro stays correct inside larger expressions.
 */
#define __LXC_SYS_CLASS_NET_LEN (15 + IFNAMSIZ + 1)
	char path[__LXC_SYS_CLASS_NET_LEN];
	int ret;
	struct stat sb;

	if (!nic)
		return false;

	if (!strcmp(nic, "none"))
		return true;

	ret = snprintf(path, __LXC_SYS_CLASS_NET_LEN, "/sys/class/net/%s", nic);
	if (ret < 0 || (size_t)ret >= __LXC_SYS_CLASS_NET_LEN)
		return false;

	ret = stat(path, &sb);
	if (ret < 0)
		return false;

	return true;
}
127c6e70 1636
6222c3f4
CB
/*
 * Round @n up to the nearest power of two.
 *
 * A value that already is a power of two is returned unchanged. 0 is not
 * valid input and yields 0, since 0 is not a power of two. Values above
 * 2^63 have no representable next power of two; the result wraps to 0.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
	uint64_t v;

	if (n == 0)
		return 0;

	/* Copy the highest set bit of n - 1 into every lower position, so
	 * that adding one carries into the next power of two.
	 */
	v = n - 1;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	v |= v >> 32;

	return v + 1;
}
1fd0f41e 1654
/*
 * Arrange for @signal to be delivered to the caller when its parent dies.
 *
 * @signal : signal passed to prctl(PR_SET_PDEATHSIG)
 * @parent : pid the caller expects to be its parent
 *
 * After the prctl() call the real parent pid is compared against @parent.
 * If they differ the parent has already gone away and the caller sends
 * SIGKILL to itself.
 *
 * Returns 0 on success and -1 if prctl() or raise() failed.
 */
int lxc_set_death_signal(int signal, pid_t parent)
{
	int rc;
	pid_t cur_parent;

	rc = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
		   prctl_arg(0), prctl_arg(0));

	/* Check whether we have been orphaned. */
	cur_parent = (pid_t)syscall(SYS_getppid);
	if (cur_parent == parent) {
		if (rc < 0) {
			SYSERROR("Failed to set PR_SET_PDEATHSIG to %d", signal);
			return -1;
		}

		return 0;
	}

	/* Orphaned: from here on only the outcome of raise() is reported. */
	rc = raise(SIGKILL);
	if (rc < 0)
		return -1;

	return 0;
}
7ad37670 1678
a9d4ebc1
CB
/*
 * Set or clear the close-on-exec flag of @fd.
 *
 * @cloexec : true to set FD_CLOEXEC, false to clear it
 *
 * The flags are only written back when they actually change.
 *
 * Returns 0 on success and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
	int cur, want;

	cur = fcntl(fd, F_GETFD, 0);
	if (cur < 0)
		return -errno;

	want = cloexec ? (cur | FD_CLOEXEC) : (cur & ~FD_CLOEXEC);
	if (want != cur && fcntl(fd, F_SETFD, want) < 0)
		return -errno;

	return 0;
}
d7ab0375 1700
/*
 * Remove the directory tree rooted at @dirname.
 *
 * Only directories are handled: every sub-directory is destroyed
 * recursively and then @dirname itself is removed with rmdir().
 * Non-directory entries are skipped, so rmdir() fails if any remain.
 *
 * Processing continues after an error; only the first failure at each
 * level is logged.
 *
 * Returns 0 if everything was removed and -1 if anything failed.
 */
int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		/* lstat() so that symlinks to directories are not followed. */
		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				SYSWARN("Failed to stat \"%s\"", pathname);

			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;

	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			SYSWARN("Failed to delete \"%s\"", dirname);

		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		if (!r)
			SYSWARN("Failed to close directory \"%s\"", dirname);

		r = -1;
	}

	return r;
}
b25291da
CB
1760
1761int lxc_setup_keyring(void)
1762{
1763 key_serial_t keyring;
1764 int ret = 0;
1765
1766 /* Try to allocate a new session keyring for the container to prevent
1767 * information leaks.
1768 */
1769 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
1770 prctl_arg(0), prctl_arg(0), prctl_arg(0));
1771 if (keyring < 0) {
1772 switch (errno) {
1773 case ENOSYS:
1774 DEBUG("The keyctl() syscall is not supported or blocked");
1775 break;
1776 case EACCES:
1777 __fallthrough;
1778 case EPERM:
1779 DEBUG("Failed to access kernel keyring. Continuing...");
1780 break;
1781 default:
1782 SYSERROR("Failed to create kernel keyring");
1783 ret = -1;
1784 break;
1785 }
1786 }
1787
1788 return ret;
1789}