]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/initutils.c
build: add src/include to build and simplify header inclusions
[mirror_lxc.git] / src / lxc / initutils.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #include <pthread.h>
7 #include <signal.h>
8 #include <sys/prctl.h>
9 #include <sys/syscall.h>
10 #include <sys/wait.h>
11 #include <unistd.h>
12
13 #include "compiler.h"
14 #include "config.h"
15 #include "error.h"
16 #include "file_utils.h"
17 #include "initutils.h"
18 #include "macro.h"
19 #include "memory_utils.h"
20 #include "process_utils.h"
21
22 #ifndef HAVE_STRLCPY
23 #include "strlcpy.h"
24 #endif
25
/*
 * Duplicate a configuration value into freshly allocated memory.
 *
 * A single trailing newline, if present, is stripped from @p in place before
 * the value is copied.
 *
 * Returns NULL when @p is the empty string or when the allocation fails.
 * Otherwise the caller owns the returned NUL-terminated buffer.
 */
static char *copy_global_config_value(char *p)
{
	/* strlen() yields a size_t; keep it unsigned to avoid narrowing. */
	size_t len = strlen(p);
	char *retbuf;

	if (len < 1)
		return NULL;

	if (p[len - 1] == '\n') {
		p[len - 1] = '\0';
		len--;
	}

	retbuf = malloc(len + 1);
	if (!retbuf)
		return NULL;

	/* The length is already known, so copy exactly that many bytes. */
	memcpy(retbuf, p, len);
	retbuf[len] = '\0';
	return retbuf;
}
46
47 const char *lxc_global_config_value(const char *option_name)
48 {
49 static const char * const options[][2] = {
50 { "lxc.bdev.lvm.vg", DEFAULT_VG },
51 { "lxc.bdev.lvm.thin_pool", DEFAULT_THIN_POOL },
52 { "lxc.bdev.zfs.root", DEFAULT_ZFSROOT },
53 { "lxc.bdev.rbd.rbdpool", DEFAULT_RBDPOOL },
54 { "lxc.lxcpath", NULL },
55 { "lxc.default_config", NULL },
56 { "lxc.cgroup.pattern", NULL },
57 { "lxc.cgroup.use", NULL },
58 { NULL, NULL },
59 };
60
61 /* placed in the thread local storage pool for non-bionic targets */
62 static thread_local const char *values[sizeof(options) / sizeof(options[0])] = {0};
63
64 /* user_config_path is freed as soon as it is used */
65 char *user_config_path = NULL;
66
67 /*
68 * The following variables are freed at bottom unconditionally.
69 * So NULL the value if it is to be returned to the caller
70 */
71 char *user_default_config_path = NULL;
72 char *user_lxc_path = NULL;
73 char *user_cgroup_pattern = NULL;
74
75 if (geteuid() > 0) {
76 const char *user_home = getenv("HOME");
77 if (!user_home)
78 user_home = "/";
79
80 user_config_path = malloc(sizeof(char) * (22 + strlen(user_home)));
81 user_default_config_path = malloc(sizeof(char) * (26 + strlen(user_home)));
82 user_lxc_path = malloc(sizeof(char) * (19 + strlen(user_home)));
83
84 sprintf(user_config_path, "%s/.config/lxc/lxc.conf", user_home);
85 sprintf(user_default_config_path, "%s/.config/lxc/default.conf", user_home);
86 sprintf(user_lxc_path, "%s/.local/share/lxc/", user_home);
87 }
88 else {
89 user_config_path = strdup(LXC_GLOBAL_CONF);
90 user_default_config_path = strdup(LXC_DEFAULT_CONFIG);
91 user_lxc_path = strdup(LXCPATH);
92 if (!strequal(DEFAULT_CGROUP_PATTERN, ""))
93 user_cgroup_pattern = strdup(DEFAULT_CGROUP_PATTERN);
94 }
95
96 const char * const (*ptr)[2];
97 size_t i;
98 FILE *fin = NULL;
99
100 for (i = 0, ptr = options; (*ptr)[0]; ptr++, i++) {
101 if (strequal(option_name, (*ptr)[0]))
102 break;
103 }
104 if (!(*ptr)[0]) {
105 free(user_config_path);
106 free(user_default_config_path);
107 free(user_lxc_path);
108 free(user_cgroup_pattern);
109 errno = EINVAL;
110 return NULL;
111 }
112
113 if (values[i]) {
114 free(user_config_path);
115 free(user_default_config_path);
116 free(user_lxc_path);
117 free(user_cgroup_pattern);
118 return values[i];
119 }
120
121 fin = fopen_cloexec(user_config_path, "r");
122 free(user_config_path);
123 if (fin) {
124 __do_free char *line = NULL;
125 size_t len = 0;
126 char *slider1, *slider2;
127
128 while (getline(&line, &len, fin) > 0) {
129 if (*line == '#')
130 continue;
131
132 slider1 = strstr(line, option_name);
133 if (!slider1)
134 continue;
135
136 /* see if there was just white space in front
137 * of the option name
138 */
139 for (slider2 = line; slider2 < slider1; slider2++)
140 if (*slider2 != ' ' && *slider2 != '\t')
141 break;
142
143 if (slider2 < slider1)
144 continue;
145
146 slider1 = strchr(slider1, '=');
147 if (!slider1)
148 continue;
149
150 /* see if there was just white space after
151 * the option name
152 */
153 for (slider2 += strlen(option_name); slider2 < slider1;
154 slider2++)
155 if (*slider2 != ' ' && *slider2 != '\t')
156 break;
157
158 if (slider2 < slider1)
159 continue;
160
161 slider1++;
162 while (*slider1 && (*slider1 == ' ' || *slider1 == '\t'))
163 slider1++;
164
165 if (!*slider1)
166 continue;
167
168 if (strequal(option_name, "lxc.lxcpath")) {
169 free(user_lxc_path);
170 user_lxc_path = copy_global_config_value(slider1);
171 remove_trailing_slashes(user_lxc_path);
172 values[i] = move_ptr(user_lxc_path);
173 goto out;
174 }
175
176 values[i] = copy_global_config_value(slider1);
177 goto out;
178 }
179 }
180
181 /* could not find value, use default */
182 if (strequal(option_name, "lxc.lxcpath")) {
183 remove_trailing_slashes(user_lxc_path);
184 values[i] = move_ptr(user_lxc_path);
185 } else if (strequal(option_name, "lxc.default_config")) {
186 values[i] = move_ptr(user_default_config_path);
187 } else if (strequal(option_name, "lxc.cgroup.pattern")) {
188 values[i] = move_ptr(user_cgroup_pattern);
189 } else {
190 values[i] = (*ptr)[1];
191 }
192
193 /* special case: if default value is NULL,
194 * and there is no config, don't view that
195 * as an error... */
196 if (!values[i])
197 errno = 0;
198
199 out:
200 if (fin)
201 fclose(fin);
202
203 free(user_cgroup_pattern);
204 free(user_default_config_path);
205 free(user_lxc_path);
206
207 return values[i];
208 }
209
210 /*
211 * Sets the process title to the specified title. Note that this may fail if
212 * the kernel doesn't support PR_SET_MM_MAP (kernels <3.18).
213 */
214 int setproctitle(char *title)
215 {
216 __do_fclose FILE *f = NULL;
217 int i, fd, len;
218 char *buf_ptr, *tmp_proctitle;
219 char buf[LXC_LINELEN];
220 int ret = 0;
221 ssize_t bytes_read = 0;
222 static char *proctitle = NULL;
223
224 /*
225 * We don't really need to know all of this stuff, but unfortunately
226 * PR_SET_MM_MAP requires us to set it all at once, so we have to
227 * figure it out anyway.
228 */
229 unsigned long start_data, end_data, start_brk, start_code, end_code,
230 start_stack, arg_start, arg_end, env_start, env_end, brk_val;
231 struct prctl_mm_map prctl_map;
232
233 f = fopen_cloexec("/proc/self/stat", "r");
234 if (!f)
235 return -1;
236
237 fd = fileno(f);
238 if (fd < 0)
239 return -1;
240
241 bytes_read = lxc_read_nointr(fd, buf, sizeof(buf) - 1);
242 if (bytes_read <= 0)
243 return -1;
244
245 buf[bytes_read] = '\0';
246
247 /* Skip the first 25 fields, column 26-28 are start_code, end_code,
248 * and start_stack */
249 buf_ptr = strchr(buf, ' ');
250 for (i = 0; i < 24; i++) {
251 if (!buf_ptr)
252 return -1;
253 buf_ptr = strchr(buf_ptr + 1, ' ');
254 }
255 if (!buf_ptr)
256 return -1;
257
258 i = sscanf(buf_ptr, "%lu %lu %lu", &start_code, &end_code, &start_stack);
259 if (i != 3)
260 return -1;
261
262 /* Skip the next 19 fields, column 45-51 are start_data to arg_end */
263 for (i = 0; i < 19; i++) {
264 if (!buf_ptr)
265 return -1;
266 buf_ptr = strchr(buf_ptr + 1, ' ');
267 }
268
269 if (!buf_ptr)
270 return -1;
271
272 i = sscanf(buf_ptr, "%lu %lu %lu %*u %*u %lu %lu", &start_data,
273 &end_data, &start_brk, &env_start, &env_end);
274 if (i != 5)
275 return -1;
276
277 /* Include the null byte here, because in the calculations below we
278 * want to have room for it. */
279 len = strlen(title) + 1;
280
281 tmp_proctitle = realloc(proctitle, len);
282 if (!tmp_proctitle)
283 return -1;
284
285 proctitle = tmp_proctitle;
286
287 arg_start = (unsigned long)proctitle;
288 arg_end = arg_start + len;
289
290 brk_val = syscall(__NR_brk, 0);
291
292 prctl_map = (struct prctl_mm_map){
293 .start_code = start_code,
294 .end_code = end_code,
295 .start_stack = start_stack,
296 .start_data = start_data,
297 .end_data = end_data,
298 .start_brk = start_brk,
299 .brk = brk_val,
300 .arg_start = arg_start,
301 .arg_end = arg_end,
302 .env_start = env_start,
303 .env_end = env_end,
304 .auxv = NULL,
305 .auxv_size = 0,
306 .exe_fd = -1,
307 };
308
309 ret = prctl(PR_SET_MM, prctl_arg(PR_SET_MM_MAP), prctl_arg(&prctl_map),
310 prctl_arg(sizeof(prctl_map)), prctl_arg(0));
311 if (ret == 0)
312 (void)strlcpy((char *)arg_start, title, len);
313
314 return ret;
315 }
316
317 static void prevent_forking(void)
318 {
319 __do_free char *line = NULL;
320 __do_fclose FILE *f = NULL;
321 char path[PATH_MAX];
322 size_t len = 0;
323
324 f = fopen("/proc/self/cgroup", "re");
325 if (!f)
326 return;
327
328 while (getline(&line, &len, f) != -1) {
329 __do_close int fd = -EBADF;
330 int ret;
331 char *p, *p2;
332
333 p = strchr(line, ':');
334 if (!p)
335 continue;
336 p++;
337 p2 = strchr(p, ':');
338 if (!p2)
339 continue;
340 *p2 = '\0';
341
342 /* This is a cgroup v2 entry. Skip it. */
343 if ((p2 - p) == 0)
344 continue;
345
346 if (strcmp(p, "pids") != 0)
347 continue;
348 p2++;
349
350 p2 += lxc_char_left_gc(p2, strlen(p2));
351 p2[lxc_char_right_gc(p2, strlen(p2))] = '\0';
352
353 ret = snprintf(path, sizeof(path),
354 "/sys/fs/cgroup/pids/%s/pids.max", p2);
355 if (ret < 0 || (size_t)ret >= sizeof(path)) {
356 fprintf(stderr, "Failed to create string\n");
357 return;
358 }
359
360 fd = open(path, O_WRONLY | O_CLOEXEC);
361 if (fd < 0) {
362 fprintf(stderr, "Failed to open \"%s\"\n", path);
363 return;
364 }
365
366 ret = write(fd, "1", 1);
367 if (ret != 1)
368 fprintf(stderr, "Failed to write to \"%s\"\n", path);
369
370 return;
371 }
372 }
373
374 static void kill_children(pid_t pid)
375 {
376 __do_fclose FILE *f = NULL;
377 char path[PATH_MAX];
378 int ret;
379
380 ret = snprintf(path, sizeof(path), "/proc/%d/task/%d/children", pid, pid);
381 if (ret < 0 || (size_t)ret >= sizeof(path)) {
382 fprintf(stderr, "Failed to create string\n");
383 return;
384 }
385
386 f = fopen(path, "re");
387 if (!f) {
388 fprintf(stderr, "Failed to open %s\n", path);
389 return;
390 }
391
392 while (!feof(f)) {
393 pid_t find_pid;
394
395 if (fscanf(f, "%d ", &find_pid) != 1) {
396 fprintf(stderr, "Failed to retrieve pid\n");
397 return;
398 }
399
400 (void)kill_children(find_pid);
401 (void)kill(find_pid, SIGKILL);
402 }
403 }
404
405 static void remove_self(void)
406 {
407 int ret;
408 ssize_t n;
409 char path[PATH_MAX] = {0};
410
411 n = readlink("/proc/self/exe", path, sizeof(path));
412 if (n < 0 || n >= PATH_MAX)
413 return;
414 path[n] = '\0';
415
416 ret = umount2(path, MNT_DETACH);
417 if (ret < 0)
418 return;
419
420 ret = unlink(path);
421 if (ret < 0)
422 return;
423 }
424
/*
 * Number of the first signal caught since the flag was last cleared, or 0 if
 * none is pending. It is written from the signal handler and read by normal
 * code, so it has to be a volatile sig_atomic_t.
 */
static volatile sig_atomic_t was_interrupted;

/*
 * Signal handler: remember @sig unless an earlier signal is still pending.
 * Signals that arrive while the flag is non-zero are dropped.
 */
static void interrupt_handler(int sig)
{
	if (!was_interrupted)
		was_interrupted = sig;
}
432
/*
 * Close every open file descriptor of the calling process except stderr.
 *
 * The descriptors are enumerated through /proc/self/fd. After each
 * successful close() the directory is reopened and the scan starts over, so
 * a directory stream is never read further once the descriptor table has
 * changed underneath it.
 *
 * Returns 0 on success or a negative errno value if the directory cannot be
 * opened or a descriptor cannot be closed.
 *
 * NOTE(review): unlike the close_range(STDERR_FILENO + 1, ...) call this
 * serves as a fallback for, stdin and stdout are closed here as well,
 * because the condition below only exempts STDERR_FILENO — confirm this is
 * intended.
 */
static int close_inherited(void)
{
	int fddir;
	DIR *dir;
	struct dirent *direntp;

restart:
	dir = opendir("/proc/self/fd");
	if (!dir)
		return -errno;

	fddir = dirfd(dir);

	while ((direntp = readdir(dir))) {
		int fd, ret;

		if (strcmp(direntp->d_name, ".") == 0)
			continue;

		if (strcmp(direntp->d_name, "..") == 0)
			continue;

		ret = lxc_safe_int(direntp->d_name, &fd);
		if (ret < 0)
			continue;

		/*
		 * Keep stderr and the descriptor backing this very scan, but
		 * carry on with the remaining entries. Stopping here would
		 * leave every later descriptor open.
		 */
		if (fd == STDERR_FILENO || fd == fddir)
			continue;

		if (close(fd)) {
			/* closedir() may overwrite errno; save it first. */
			int saved_errno = errno;

			closedir(dir);
			return -saved_errno;
		}

		closedir(dir);
		goto restart;
	}

	closedir(dir);
	return 0;
}
474
475 __noreturn int lxc_container_init(int argc, char *const *argv, bool quiet)
476 {
477 int i, logfd, ret;
478 pid_t pid;
479 struct sigaction act;
480 sigset_t mask, omask;
481 int have_status = 0, exit_with = 1, shutdown = 0;
482
483 /* Mask all the signals so we are safe to install a signal handler and
484 * to fork.
485 */
486 ret = sigfillset(&mask);
487 if (ret < 0)
488 exit(EXIT_FAILURE);
489
490 ret = sigdelset(&mask, SIGILL);
491 if (ret < 0)
492 exit(EXIT_FAILURE);
493
494 ret = sigdelset(&mask, SIGSEGV);
495 if (ret < 0)
496 exit(EXIT_FAILURE);
497
498 ret = sigdelset(&mask, SIGBUS);
499 if (ret < 0)
500 exit(EXIT_FAILURE);
501
502 ret = pthread_sigmask(SIG_SETMASK, &mask, &omask);
503 if (ret < 0)
504 exit(EXIT_FAILURE);
505
506 ret = sigfillset(&act.sa_mask);
507 if (ret < 0)
508 exit(EXIT_FAILURE);
509
510 ret = sigdelset(&act.sa_mask, SIGILL);
511 if (ret < 0)
512 exit(EXIT_FAILURE);
513
514 ret = sigdelset(&act.sa_mask, SIGSEGV);
515 if (ret < 0)
516 exit(EXIT_FAILURE);
517
518 ret = sigdelset(&act.sa_mask, SIGBUS);
519 if (ret < 0)
520 exit(EXIT_FAILURE);
521
522 ret = sigdelset(&act.sa_mask, SIGSTOP);
523 if (ret < 0)
524 exit(EXIT_FAILURE);
525
526 ret = sigdelset(&act.sa_mask, SIGKILL);
527 if (ret < 0)
528 exit(EXIT_FAILURE);
529
530 act.sa_flags = 0;
531 act.sa_handler = interrupt_handler;
532
533 for (i = 1; i < NSIG; i++) {
534 /* Exclude some signals: ILL, SEGV and BUS are likely to reveal
535 * a bug and we want a core. STOP and KILL cannot be handled
536 * anyway: they're here for documentation. 32 and 33 are not
537 * defined.
538 */
539 if (i == SIGILL || i == SIGSEGV || i == SIGBUS ||
540 i == SIGSTOP || i == SIGKILL || i == 32 || i == 33)
541 continue;
542
543 ret = sigaction(i, &act, NULL);
544 if (ret < 0) {
545 if (errno == EINVAL)
546 continue;
547
548 if (!quiet)
549 fprintf(stderr, "Failed to change signal action\n");
550 exit(EXIT_FAILURE);
551 }
552 }
553
554 remove_self();
555
556 pid = vfork();
557 if (pid < 0)
558 exit(EXIT_FAILURE);
559
560 if (!pid) {
561 /* restore default signal handlers */
562 for (i = 1; i < NSIG; i++) {
563 sighandler_t sigerr;
564
565 if (i == SIGILL || i == SIGSEGV || i == SIGBUS ||
566 i == SIGSTOP || i == SIGKILL || i == 32 || i == 33)
567 continue;
568
569 sigerr = signal(i, SIG_DFL);
570 if (sigerr == SIG_ERR && !quiet)
571 fprintf(stderr, "Failed to reset to default action for signal \"%d\": %d\n", i, pid);
572 }
573
574 ret = pthread_sigmask(SIG_SETMASK, &omask, NULL);
575 if (ret < 0) {
576 if (quiet)
577 fprintf(stderr, "Failed to set signal mask\n");
578 exit(EXIT_FAILURE);
579 }
580
581 (void)setsid();
582
583 (void)ioctl(STDIN_FILENO, TIOCSCTTY, 0);
584
585 ret = execvp(argv[0], argv);
586 if (!quiet)
587 fprintf(stderr, "Failed to exec \"%s\"\n", argv[0]);
588 exit(ret);
589 }
590 logfd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
591 if (logfd >= 0) {
592 ret = dup3(logfd, STDERR_FILENO, O_CLOEXEC);
593 if (ret < 0)
594 exit(EXIT_FAILURE);
595 }
596
597 (void)setproctitle("init");
598
599 /* Let's process the signals now. */
600 ret = sigdelset(&omask, SIGALRM);
601 if (ret < 0)
602 exit(EXIT_FAILURE);
603
604 ret = pthread_sigmask(SIG_SETMASK, &omask, NULL);
605 if (ret < 0) {
606 if (!quiet)
607 fprintf(stderr, "Failed to set signal mask\n");
608 exit(EXIT_FAILURE);
609 }
610
611 ret = close_range(STDERR_FILENO + 1, UINT_MAX, CLOSE_RANGE_UNSHARE);
612 if (ret) {
613 /*
614 * Fallback to close_inherited() when the syscall is not
615 * available or when CLOSE_RANGE_UNSHARE isn't supported.
616 * On a regular kernel CLOSE_RANGE_UNSHARE should always be
617 * available but openSUSE Leap 15.3 seems to have a partial
618 * backport without CLOSE_RANGE_UNSHARE support.
619 */
620 if (errno == ENOSYS || errno == EINVAL)
621 ret = close_inherited();
622 }
623 if (ret) {
624 fprintf(stderr, "Aborting attach to prevent leaking file descriptors into container\n");
625 exit(EXIT_FAILURE);
626 }
627
628 for (;;) {
629 int status;
630 pid_t waited_pid;
631
632 switch (was_interrupted) {
633 case 0:
634 /* Some applications send SIGHUP in order to get init to reload
635 * its configuration. We don't want to forward this onto the
636 * application itself, because it probably isn't expecting this
637 * signal since it was expecting init to do something with it.
638 *
639 * Instead, let's explicitly ignore it here. The actual
640 * terminal case is handled in the monitor's handler, which
641 * sends this task a SIGTERM in the case of a SIGHUP, which is
642 * what we want.
643 */
644 case SIGHUP:
645 break;
646 case SIGPWR:
647 case SIGTERM:
648 if (!shutdown) {
649 pid_t mypid = lxc_raw_getpid();
650
651 shutdown = 1;
652 prevent_forking();
653 if (mypid != 1) {
654 kill_children(mypid);
655 } else {
656 ret = kill(-1, SIGTERM);
657 if (ret < 0 && !quiet)
658 fprintf(stderr, "Failed to send SIGTERM to all children\n");
659 }
660 alarm(1);
661 }
662 break;
663 case SIGALRM: {
664 pid_t mypid = lxc_raw_getpid();
665
666 prevent_forking();
667 if (mypid != 1) {
668 kill_children(mypid);
669 } else {
670 ret = kill(-1, SIGKILL);
671 if (ret < 0 && !quiet)
672 fprintf(stderr, "Failed to send SIGTERM to all children\n");
673 }
674 break;
675 }
676 default:
677 kill(pid, was_interrupted);
678 break;
679 }
680 ret = EXIT_SUCCESS;
681
682 was_interrupted = 0;
683 waited_pid = wait(&status);
684 if (waited_pid < 0) {
685 if (errno == ECHILD)
686 goto out;
687
688 if (errno == EINTR)
689 continue;
690
691 if (!quiet)
692 fprintf(stderr, "Failed to wait on child %d\n", pid);
693 ret = -1;
694 goto out;
695 }
696
697 /* Reset timer each time a process exited. */
698 if (shutdown)
699 alarm(1);
700
701 /* Keep the exit code of the started application (not wrapped
702 * pid) and continue to wait for the end of the orphan group.
703 */
704 if (waited_pid == pid && !have_status) {
705 exit_with = lxc_error_set_and_log(waited_pid, status);
706 have_status = 1;
707 }
708 }
709 out:
710 if (ret < 0)
711 exit(EXIT_FAILURE);
712 exit(exit_with);
713 }