]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
tree-wide: harden mount option parsing
[mirror_lxc.git] / src / lxc / criu.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #include <inttypes.h>
7 #include <linux/limits.h>
8 #include <sched.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <sys/mount.h>
13 #include <sys/types.h>
14 #include <sys/wait.h>
15 #include <unistd.h>
16
17 #include "cgroup.h"
18 #include "commands.h"
19 #include "conf.h"
20 #include "config.h"
21 #include "criu.h"
22 #include "log.h"
23 #include "lxc.h"
24 #include "lxclock.h"
25 #include "network.h"
26 #include "storage.h"
27 #include "syscall_wrappers.h"
28 #include "utils.h"
29
30 #if IS_BIONIC
31 #include <../include/lxcmntent.h>
32 #else
33 #include <mntent.h>
34 #endif
35
36 #ifndef HAVE_STRLCPY
37 #include "include/strlcpy.h"
38 #endif
39
/* Minimum criu release accepted by criu_version_ok(); also quoted in its
 * "must have criu ... or greater" error message.
 */
40 #define CRIU_VERSION "2.0"
41
/* Fallback minimum checked by criu_version_ok() against criu's
 * "GitID: v<version>-<patch>" output line when the release version is too
 * old: both the version and the patch level must be at least these values.
 */
42 #define CRIU_GITID_VERSION "2.0"
43 #define CRIU_GITID_PATCHLEVEL 0
44
/* Version thresholds consulted by exec_criu(): CRIU_IN_FLIGHT_SUPPORT gates
 * passing --skip-in-flight on dumps, CRIU_EXTERNAL_NOT_VETH selects
 * --external instead of the deprecated --veth-pair on restore.
 */
45 #define CRIU_IN_FLIGHT_SUPPORT "2.4"
46 #define CRIU_EXTERNAL_NOT_VETH "2.8"
47
/* NOTE(review): presumably sets up the "criu" log category used by the
 * ERROR/SYSERROR/INFO calls in this file - confirm against log.h.
 */
48 lxc_log_define(criu, lxc);
49
/* Everything exec_criu() needs to assemble and run a single criu command. */
struct criu_opts {
	/* Descriptor that criu's stdout and stderr are redirected to, so the
	 * caller can capture whatever criu prints.
	 */
	int pipefd;

	/* The criu sub-command to run: "dump", "pre-dump" or "restore". */
	char *action;

	/* The user-provided migrate options relevant to this action. */
	struct migrate_opts *user;

	/* The container being checkpointed or restored. */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means there is no console to hand to criu.
	 */
	char tty_id[32];

	/* restore: the handler set up for the restored container; the dump
	 * path leaves this NULL.
	 */
	struct lxc_handler *handler;

	/* restore: console descriptor passed to criu via --inherit-fd. */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu. */
	char *criu_version;
};
79
/* Read the saved criu tty id from "<directory>/tty.info" into output.
 *
 * directory: checkpoint directory that may contain tty.info.
 * output:    buffer receiving the first line of the file (at most len - 1
 *            characters); left untouched when the file does not exist.
 * len:       size of output in bytes.
 *
 * Returns 0 on success or when tty.info is absent, -1 on any other error.
 */
static int load_tty_major_minor(char *directory, char *output, int len)
{
	FILE *f;
	char path[PATH_MAX];
	int ret;

	ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("snprintf'd too many characters: %d", ret);
		return -1;
	}

	f = fopen(path, "re");
	if (!f) {
		/* This means we're coming from a liblxc which didn't export
		 * the tty info. In this case they had to have lxc.console.path
		 * = none, so there's no problem restoring.
		 */
		if (errno == ENOENT)
			return 0;

		SYSERROR("couldn't open %s", path);
		return -1;
	}

	if (!fgets(output, len, f)) {
		/* Log before fclose() so the errno left by the failed read is
		 * what gets reported.
		 */
		SYSERROR("couldn't read %s", path);
		fclose(f);
		return -1;
	}

	fclose(f);
	return 0;
}
114
/* Compare two dotted version strings of the form "major[.minor[.patch]]".
 *
 * Components missing from a string count as -1, so "2.8" sorts before
 * "2.8.0". Returns 1 if v1 is newer than v2, 0 if they are equal and -1 if
 * v1 is older; -1 is also returned when either string does not start with a
 * number.
 */
static int cmp_version(const char *v1, const char *v2)
{
	int lhs[3] = { -1, -1, -1 };
	int rhs[3] = { -1, -1, -1 };
	int idx;

	if (sscanf(v1, "%d.%d.%d", &lhs[0], &lhs[1], &lhs[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &rhs[0], &rhs[1], &rhs[2]) < 1)
		return -1;

	/* The first component that differs (major, then minor, then patch)
	 * decides the ordering.
	 */
	for (idx = 0; idx < 3; idx++) {
		if (lhs[idx] != rhs[idx])
			return lhs[idx] > rhs[idx] ? 1 : -1;
	}

	return 0;
}
155
156 static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
157 struct criu_opts *opts)
158 {
159 char **argv, log[PATH_MAX];
160 int static_args = 23, argc = 0, i, ret;
161 int netnr = 0;
162 struct lxc_list *it;
163 FILE *mnts;
164 struct mntent mntent;
165
166 char buf[4096], ttys[32];
167 size_t pos;
168
169 /* If we are currently in a cgroup /foo/bar, and the container is in a
170 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
171 * container has an open fd that points to one of the cgroup files
172 * (systemd always opens its "root" cgroup). So, let's escape to the
173 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
174 * see all cgroups.
175 */
176 if (!cgroup_ops->escape(cgroup_ops, conf)) {
177 ERROR("failed to escape cgroups");
178 return;
179 }
180
181 /* The command line always looks like:
182 * criu $(action) --tcp-established --file-locks --link-remap \
183 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
184 * -o $(directory)/$(action).log --ext-mount-map auto
185 * --enable-external-sharing --enable-external-masters
186 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
187 * +1 for final NULL */
188
189 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
190 /* -t pid --freeze-cgroup /lxc/ct */
191 static_args += 4;
192
193 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
194 if (opts->user->predump_dir)
195 static_args += 2;
196
197 /* --page-server --address <address> --port <port> */
198 if (opts->user->pageserver_address && opts->user->pageserver_port)
199 static_args += 5;
200
201 /* --leave-running (only for final dump) */
202 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
203 static_args++;
204
205 /* --external tty[88,4] */
206 if (opts->tty_id[0])
207 static_args += 2;
208
209 /* --force-irmap */
210 if (!opts->user->preserves_inodes)
211 static_args++;
212
213 /* --ghost-limit 1024 */
214 if (opts->user->ghost_limit)
215 static_args += 2;
216 } else if (strcmp(opts->action, "restore") == 0) {
217 /* --root $(lxc_mount_point) --restore-detached
218 * --restore-sibling
219 * --lsm-profile apparmor:whatever
220 */
221 static_args += 6;
222
223 ttys[0] = 0;
224 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
225 return;
226
227 /* --inherit-fd fd[%d]:tty[%s] */
228 if (ttys[0])
229 static_args += 2;
230 } else {
231 return;
232 }
233
234 if (cgroup_ops->num_hierarchies(cgroup_ops) > 0)
235 static_args += 2 * cgroup_ops->num_hierarchies(cgroup_ops);
236
237 if (opts->user->verbose)
238 static_args++;
239
240 if (opts->user->action_script)
241 static_args += 2;
242
243 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
244
245 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
246 if (ret < 0 || ret >= PATH_MAX) {
247 ERROR("logfile name too long");
248 return;
249 }
250
251 argv = malloc(static_args * sizeof(*argv));
252 if (!argv)
253 return;
254
255 memset(argv, 0, static_args * sizeof(*argv));
256
257 #define DECLARE_ARG(arg) \
258 do { \
259 if (arg == NULL) { \
260 ERROR("Got NULL argument for criu"); \
261 goto err; \
262 } \
263 argv[argc++] = strdup(arg); \
264 if (!argv[argc-1]) \
265 goto err; \
266 } while (0)
267
268 argv[argc++] = on_path("criu", NULL);
269 if (!argv[argc-1]) {
270 ERROR("Couldn't find criu binary");
271 goto err;
272 }
273
274 DECLARE_ARG(opts->action);
275 DECLARE_ARG("--tcp-established");
276 DECLARE_ARG("--file-locks");
277 DECLARE_ARG("--link-remap");
278 DECLARE_ARG("--manage-cgroups=full");
279 DECLARE_ARG("--ext-mount-map");
280 DECLARE_ARG("auto");
281 DECLARE_ARG("--enable-external-sharing");
282 DECLARE_ARG("--enable-external-masters");
283 DECLARE_ARG("--enable-fs");
284 DECLARE_ARG("hugetlbfs");
285 DECLARE_ARG("--enable-fs");
286 DECLARE_ARG("tracefs");
287 DECLARE_ARG("-D");
288 DECLARE_ARG(opts->user->directory);
289 DECLARE_ARG("-o");
290 DECLARE_ARG(log);
291
292 for (i = 0; i < cgroup_ops->num_hierarchies(cgroup_ops); i++) {
293 char **controllers = NULL, *fullname;
294 char *path, *tmp;
295
296 if (!cgroup_ops->get_hierarchies(cgroup_ops, i, &controllers)) {
297 ERROR("failed to get hierarchy %d", i);
298 goto err;
299 }
300
301 /* if we are in a dump, we have to ask the monitor process what
302 * the right cgroup is. if this is a restore, we can just use
303 * the handler the restore task created.
304 */
305 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
306 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
307 if (!path) {
308 ERROR("failed to get cgroup path for %s", controllers[0]);
309 goto err;
310 }
311 } else {
312 const char *p;
313
314 p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
315 if (!p) {
316 ERROR("failed to get cgroup path for %s", controllers[0]);
317 goto err;
318 }
319
320 path = strdup(p);
321 if (!path) {
322 ERROR("strdup failed");
323 goto err;
324 }
325 }
326
327 tmp = lxc_deslashify(path);
328 if (!tmp) {
329 ERROR("Failed to remove extraneous slashes from \"%s\"",
330 path);
331 free(path);
332 goto err;
333 }
334 free(path);
335 path = tmp;
336
337 fullname = lxc_string_join(",", (const char **) controllers, false);
338 if (!fullname) {
339 ERROR("failed to join controllers");
340 free(path);
341 goto err;
342 }
343
344 ret = sprintf(buf, "%s:%s", fullname, path);
345 free(path);
346 free(fullname);
347 if (ret < 0 || ret >= sizeof(buf)) {
348 ERROR("sprintf of cgroup root arg failed");
349 goto err;
350 }
351
352 DECLARE_ARG("--cgroup-root");
353 DECLARE_ARG(buf);
354 }
355
356 if (opts->user->verbose)
357 DECLARE_ARG("-v4");
358
359 if (opts->user->action_script) {
360 DECLARE_ARG("--action-script");
361 DECLARE_ARG(opts->user->action_script);
362 }
363
364 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list,
365 opts->c->lxc_conf->lsm_aa_allow_nesting);
366 if (!mnts)
367 goto err;
368
369 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
370 unsigned long flags = 0;
371 char *mntdata = NULL;
372 char arg[2 * PATH_MAX + 2];
373
374 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
375 goto err;
376
377 free(mntdata);
378
379 /* only add --ext-mount-map for actual bind mounts */
380 if (!(flags & MS_BIND))
381 continue;
382
383 if (strcmp(opts->action, "dump") == 0)
384 ret = snprintf(arg, sizeof(arg), "/%s:%s",
385 mntent.mnt_dir, mntent.mnt_dir);
386 else
387 ret = snprintf(arg, sizeof(arg), "%s:%s",
388 mntent.mnt_dir, mntent.mnt_fsname);
389 if (ret < 0 || ret >= sizeof(arg)) {
390 fclose(mnts);
391 ERROR("snprintf failed");
392 goto err;
393 }
394
395 DECLARE_ARG("--ext-mount-map");
396 DECLARE_ARG(arg);
397 }
398 fclose(mnts);
399
400 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
401 char pid[32], *freezer_relative;
402
403 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
404 goto err;
405
406 DECLARE_ARG("-t");
407 DECLARE_ARG(pid);
408
409 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
410 opts->c->config_path,
411 "freezer");
412 if (!freezer_relative) {
413 ERROR("failed getting freezer path");
414 goto err;
415 }
416
417 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
418 if (ret < 0 || ret >= sizeof(log))
419 goto err;
420
421 if (!opts->user->disable_skip_in_flight &&
422 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
423 DECLARE_ARG("--skip-in-flight");
424
425 DECLARE_ARG("--freeze-cgroup");
426 DECLARE_ARG(log);
427
428 if (opts->tty_id[0]) {
429 DECLARE_ARG("--ext-mount-map");
430 DECLARE_ARG("/dev/console:console");
431
432 DECLARE_ARG("--external");
433 DECLARE_ARG(opts->tty_id);
434 }
435
436 if (opts->user->predump_dir) {
437 DECLARE_ARG("--prev-images-dir");
438 DECLARE_ARG(opts->user->predump_dir);
439 DECLARE_ARG("--track-mem");
440 }
441
442 if (opts->user->pageserver_address && opts->user->pageserver_port) {
443 DECLARE_ARG("--page-server");
444 DECLARE_ARG("--address");
445 DECLARE_ARG(opts->user->pageserver_address);
446 DECLARE_ARG("--port");
447 DECLARE_ARG(opts->user->pageserver_port);
448 }
449
450 if (!opts->user->preserves_inodes)
451 DECLARE_ARG("--force-irmap");
452
453 if (opts->user->ghost_limit) {
454 char ghost_limit[32];
455
456 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
457 if (ret < 0 || ret >= sizeof(ghost_limit)) {
458 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
459 goto err;
460 }
461
462 DECLARE_ARG("--ghost-limit");
463 DECLARE_ARG(ghost_limit);
464 }
465
466 /* only for final dump */
467 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
468 DECLARE_ARG("--leave-running");
469 } else if (strcmp(opts->action, "restore") == 0) {
470 void *m;
471 int additional;
472 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
473
474 DECLARE_ARG("--root");
475 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
476 DECLARE_ARG("--restore-detached");
477 DECLARE_ARG("--restore-sibling");
478
479 if (ttys[0]) {
480 if (opts->console_fd < 0) {
481 ERROR("lxc.console.path configured on source host but not target");
482 goto err;
483 }
484
485 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
486 if (ret < 0 || ret >= sizeof(buf))
487 goto err;
488
489 DECLARE_ARG("--inherit-fd");
490 DECLARE_ARG(buf);
491 }
492 if (opts->console_name) {
493 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
494 SYSERROR("sprintf'd too many bytes");
495 }
496 DECLARE_ARG("--ext-mount-map");
497 DECLARE_ARG(buf);
498 }
499
500 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
501
502 if (lxc_conf->lsm_aa_profile)
503 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
504 else
505 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
506
507 if (ret < 0 || ret >= sizeof(buf))
508 goto err;
509
510 DECLARE_ARG("--lsm-profile");
511 DECLARE_ARG(buf);
512 }
513
514 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
515
516 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
517 if (!m)
518 goto err;
519 argv = m;
520
521 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
522 size_t retlen;
523 char eth[128], *veth;
524 struct lxc_netdev *n = it->elem;
525 bool external_not_veth;
526
527 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
528 /* Since criu version 2.8 the usage of --veth-pair
529 * has been deprecated:
530 * git tag --contains f2037e6d3445fc400
531 * v2.8 */
532 external_not_veth = true;
533 } else {
534 external_not_veth = false;
535 }
536
537 if (n->name[0] != '\0') {
538 retlen = strlcpy(eth, n->name, sizeof(eth));
539 if (retlen >= sizeof(eth))
540 goto err;
541 } else {
542 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
543 if (ret < 0 || ret >= sizeof(eth))
544 goto err;
545 }
546
547 switch (n->type) {
548 case LXC_NET_VETH:
549 veth = n->priv.veth_attr.pair;
550 if (veth[0] == '\0')
551 veth = n->priv.veth_attr.veth1;
552
553 if (n->link[0] != '\0') {
554 if (external_not_veth)
555 ret = snprintf(buf, sizeof(buf),
556 "veth[%s]:%s@%s",
557 eth, veth,
558 n->link);
559 else
560 ret = snprintf(buf, sizeof(buf),
561 "%s=%s@%s", eth,
562 veth, n->link);
563 } else {
564 if (external_not_veth)
565 ret = snprintf(buf, sizeof(buf),
566 "veth[%s]:%s",
567 eth, veth);
568 else
569 ret = snprintf(buf, sizeof(buf),
570 "%s=%s", eth,
571 veth);
572 }
573 if (ret < 0 || ret >= sizeof(buf))
574 goto err;
575 break;
576 case LXC_NET_MACVLAN:
577 if (n->link[0] == '\0') {
578 ERROR("no host interface for macvlan %s", n->name);
579 goto err;
580 }
581
582 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
583 if (ret < 0 || ret >= sizeof(buf))
584 goto err;
585 break;
586 case LXC_NET_NONE:
587 case LXC_NET_EMPTY:
588 break;
589 default:
590 /* we have screened for this earlier... */
591 ERROR("unexpected network type %d", n->type);
592 goto err;
593 }
594
595 if (external_not_veth)
596 DECLARE_ARG("--external");
597 else
598 DECLARE_ARG("--veth-pair");
599 DECLARE_ARG(buf);
600 netnr++;
601 }
602
603 }
604
605 argv[argc] = NULL;
606
607 buf[0] = 0;
608 pos = 0;
609
610 for (i = 0; argv[i]; i++) {
611 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
612 if (ret < 0 || ret >= sizeof(buf) - pos)
613 goto err;
614 else
615 pos += ret;
616 }
617
618 INFO("execing: %s", buf);
619
620 /* before criu inits its log, it sometimes prints things to stdout/err;
621 * let's be sure we capture that.
622 */
623 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
624 SYSERROR("dup2 stdout failed");
625 goto err;
626 }
627
628 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
629 SYSERROR("dup2 stderr failed");
630 goto err;
631 }
632
633 close(opts->pipefd);
634
635 #undef DECLARE_ARG
636 execv(argv[0], argv);
637 err:
638 for (i = 0; argv[i]; i++)
639 free(argv[i]);
640 free(argv);
641 }
642
643 /*
644 * Function to check if the checks activated in 'features_to_check' are
645 * available with the current architecture/kernel/criu combination.
646 *
647 * Parameter features_to_check is a bit mask of all features that should be
648 * checked (see feature check defines in lxc/lxccontainer.h).
649 *
650 * If the return value is true, all requested features are supported. If
651 * the return value is false the features_to_check parameter is updated
652 * to reflect which features are available. '0' means no feature but
653 * also that something went totally wrong.
654 *
655 * Some of the code flow of criu_version_ok() is duplicated and maybe it
656 * is a good candidate for refactoring.
657 */
658 bool __criu_check_feature(uint64_t *features_to_check)
659 {
660 pid_t pid;
661 uint64_t current_bit = 0;
662 int ret;
663 uint64_t features = *features_to_check;
664 /* Feature checking is currently always like
665 * criu check --feature <feature-name>
666 */
667 char *args[] = { "criu", "check", "--feature", NULL, NULL };
668
669 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
670 /* There are feature bits activated we do not understand.
671 * Refusing to answer at all */
672 *features_to_check = 0;
673 return false;
674 }
675
676 while (current_bit < (sizeof(uint64_t) * 8 - 1)) {
677 /* only test requested features */
678 if (!(features & (1ULL << current_bit))) {
679 /* skip this */
680 current_bit++;
681 continue;
682 }
683
684 pid = fork();
685 if (pid < 0) {
686 SYSERROR("fork() failed");
687 *features_to_check = 0;
688 return false;
689 }
690
691 if (pid == 0) {
692 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
693 /* This is needed for pre-dump support, which
694 * enables pre-copy migration. */
695 args[3] = "mem_dirty_track";
696 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
697 /* CRIU has two checks for userfaultfd support.
698 *
699 * The simpler check is only for 'uffd'. If the
700 * kernel supports userfaultfd without noncoop
701 * then only process can be lazily restored
702 * which do not fork. With 'uffd-noncoop'
703 * it is also possible to lazily restore processes
704 * which do fork. For a container runtime like
705 * LXC checking only for 'uffd' makes not much sense. */
706 args[3] = "uffd-noncoop";
707 else
708 _exit(EXIT_FAILURE);
709
710 null_stdfds();
711
712 execvp("criu", args);
713 SYSERROR("Failed to exec \"criu\"");
714 _exit(EXIT_FAILURE);
715 }
716
717 ret = wait_for_pid(pid);
718
719 if (ret == -1) {
720 /* It is not known why CRIU failed. Either
721 * CRIU is not available, the feature check
722 * does not exist or the feature is not
723 * supported. */
724 INFO("feature not supported");
725 /* Clear not supported feature bit */
726 features &= ~(1ULL << current_bit);
727 }
728
729 current_bit++;
730 /* no more checks requested; exit check loop */
731 if (!(features & ~((1ULL << current_bit)-1)))
732 break;
733 }
734 if (features != *features_to_check) {
735 *features_to_check = features;
736 return false;
737 }
738 return true;
739 }
740
741 /*
742 * Check to see if the criu version is recent enough for all the features we
743 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
744 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
745 * things potentially before a version is released with a particular feature.
746 *
747 * The intent is that when criu development slows down, we can drop this, but
748 * for now we shouldn't attempt to c/r with versions that we know won't work.
749 *
750 * Note: If version != NULL criu_version() stores the detected criu version in
751 * version. Allocates memory for version which must be freed by caller.
752 */
753 static bool criu_version_ok(char **version)
754 {
755 int pipes[2];
756 pid_t pid;
757
758 if (pipe(pipes) < 0) {
759 SYSERROR("pipe() failed");
760 return false;
761 }
762
763 pid = fork();
764 if (pid < 0) {
765 SYSERROR("fork() failed");
766 return false;
767 }
768
769 if (pid == 0) {
770 char *args[] = { "criu", "--version", NULL };
771 char *path;
772 close(pipes[0]);
773
774 close(STDERR_FILENO);
775 if (dup2(pipes[1], STDOUT_FILENO) < 0)
776 _exit(EXIT_FAILURE);
777
778 path = on_path("criu", NULL);
779 if (!path)
780 _exit(EXIT_FAILURE);
781
782 execv(path, args);
783 _exit(EXIT_FAILURE);
784 } else {
785 FILE *f;
786 char *tmp;
787 int patch;
788
789 close(pipes[1]);
790 if (wait_for_pid(pid) < 0) {
791 close(pipes[0]);
792 SYSERROR("execing criu failed, is it installed?");
793 return false;
794 }
795
796 f = fdopen(pipes[0], "re");
797 if (!f) {
798 close(pipes[0]);
799 return false;
800 }
801
802 tmp = malloc(1024);
803 if (!tmp) {
804 fclose(f);
805 return false;
806 }
807
808 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
809 goto version_error;
810
811 if (fgetc(f) != '\n')
812 goto version_error;
813
814 if (strcmp(tmp, CRIU_VERSION) >= 0)
815 goto version_match;
816
817 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
818 goto version_error;
819
820 if (fgetc(f) != '-')
821 goto version_error;
822
823 if (fscanf(f, "%d", &patch) != 1)
824 goto version_error;
825
826 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
827 goto version_error;
828
829 if (patch < CRIU_GITID_PATCHLEVEL)
830 goto version_error;
831
832 version_match:
833 fclose(f);
834 if (!version)
835 free(tmp);
836 else
837 *version = tmp;
838 return true;
839
840 version_error:
841 fclose(f);
842 free(tmp);
843 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
844 return false;
845 }
846 }
847
848 /* Check and make sure the container has a configuration that we know CRIU can
849 * dump. */
850 static bool criu_ok(struct lxc_container *c, char **criu_version)
851 {
852 struct lxc_list *it;
853
854 if (geteuid()) {
855 ERROR("Must be root to checkpoint");
856 return false;
857 }
858
859 if (!criu_version_ok(criu_version))
860 return false;
861
862 /* We only know how to restore containers with veth networks. */
863 lxc_list_for_each(it, &c->lxc_conf->network) {
864 struct lxc_netdev *n = it->elem;
865 switch(n->type) {
866 case LXC_NET_VETH:
867 case LXC_NET_NONE:
868 case LXC_NET_EMPTY:
869 case LXC_NET_MACVLAN:
870 break;
871 default:
872 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
873 if (criu_version) {
874 free(*criu_version);
875 *criu_version = NULL;
876 }
877 return false;
878 }
879 }
880
881 return true;
882 }
883
884 static bool restore_net_info(struct lxc_container *c)
885 {
886 int ret;
887 struct lxc_list *it;
888 bool has_error = true;
889
890 if (container_mem_lock(c))
891 return false;
892
893 lxc_list_for_each(it, &c->lxc_conf->network) {
894 struct lxc_netdev *netdev = it->elem;
895 char template[IFNAMSIZ];
896
897 if (netdev->type != LXC_NET_VETH)
898 continue;
899
900 ret = snprintf(template, sizeof(template), "vethXXXXXX");
901 if (ret < 0 || ret >= sizeof(template))
902 goto out_unlock;
903
904 if (netdev->priv.veth_attr.pair[0] == '\0' &&
905 netdev->priv.veth_attr.veth1[0] == '\0') {
906 if (!lxc_ifname_alnum_case_sensitive(template))
907 goto out_unlock;
908
909 (void)strlcpy(netdev->priv.veth_attr.veth1, template, IFNAMSIZ);
910 }
911 }
912
913 has_error = false;
914
915 out_unlock:
916 container_mem_unlock(c);
917 return !has_error;
918 }
919
920 /* do_restore never returns, the calling process is used as the monitor process.
921 * do_restore calls _exit() if it fails.
922 */
923 static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
924 {
925 int fd, ret;
926 pid_t pid;
927 struct lxc_handler *handler;
928 int status = 0;
929 int pipes[2] = {-1, -1};
930 struct cgroup_ops *cgroup_ops;
931
932 /* Try to detach from the current controlling tty if it exists.
933 * Otherwise, lxc_init (via lxc_console) will attach the container's
934 * console output to the current tty, which is probably not what any
935 * library user wants, and if they do, they can just manually configure
936 * it :)
937 */
938 fd = open("/dev/tty", O_RDWR);
939 if (fd >= 0) {
940 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
941 SYSERROR("couldn't detach from tty");
942 close(fd);
943 }
944
945 handler = lxc_init_handler(c->name, c->lxc_conf, c->config_path, false);
946 if (!handler)
947 goto out;
948
949 if (lxc_init(c->name, handler) < 0)
950 goto out;
951
952 cgroup_ops = cgroup_init(c->lxc_conf);
953 if (!cgroup_ops)
954 goto out_fini_handler;
955 handler->cgroup_ops = cgroup_ops;
956
957 if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
958 ERROR("failed creating groups");
959 goto out_fini_handler;
960 }
961
962 if (!restore_net_info(c)) {
963 ERROR("failed restoring network info");
964 goto out_fini_handler;
965 }
966
967 ret = resolve_clone_flags(handler);
968 if (ret < 0) {
969 SYSERROR("Unsupported clone flag specified");
970 goto out_fini_handler;
971 }
972
973 if (pipe2(pipes, O_CLOEXEC) < 0) {
974 SYSERROR("pipe() failed");
975 goto out_fini_handler;
976 }
977
978 pid = fork();
979 if (pid < 0)
980 goto out_fini_handler;
981
982 if (pid == 0) {
983 struct criu_opts os;
984 struct lxc_rootfs *rootfs;
985 int flags;
986
987 close(status_pipe);
988 status_pipe = -1;
989
990 close(pipes[0]);
991 pipes[0] = -1;
992
993 if (unshare(CLONE_NEWNS))
994 goto out_fini_handler;
995
996 /* CRIU needs the lxc root bind mounted so that it is the root of some
997 * mount. */
998 rootfs = &c->lxc_conf->rootfs;
999
1000 if (rootfs_is_blockdev(c->lxc_conf)) {
1001 if (lxc_setup_rootfs_prepare_root(c->lxc_conf, c->name,
1002 c->config_path) < 0)
1003 goto out_fini_handler;
1004 } else {
1005 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
1006 goto out_fini_handler;
1007
1008 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1009 SYSERROR("remount / to private failed");
1010 goto out_fini_handler;
1011 }
1012
1013 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
1014 rmdir(rootfs->mount);
1015 goto out_fini_handler;
1016 }
1017 }
1018
1019 os.pipefd = pipes[1];
1020 os.action = "restore";
1021 os.user = opts;
1022 os.c = c;
1023 os.console_fd = c->lxc_conf->console.slave;
1024 os.criu_version = criu_version;
1025 os.handler = handler;
1026
1027 if (os.console_fd >= 0) {
1028 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1029 * via --inherit-fd, so we don't want it to close.
1030 */
1031 flags = fcntl(os.console_fd, F_GETFD);
1032 if (flags < 0) {
1033 SYSERROR("F_GETFD failed: %d", os.console_fd);
1034 goto out_fini_handler;
1035 }
1036
1037 flags &= ~FD_CLOEXEC;
1038
1039 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
1040 SYSERROR("F_SETFD failed");
1041 goto out_fini_handler;
1042 }
1043 }
1044 os.console_name = c->lxc_conf->console.name;
1045
1046 /* exec_criu() returning is an error */
1047 exec_criu(cgroup_ops, c->lxc_conf, &os);
1048 umount(rootfs->mount);
1049 rmdir(rootfs->mount);
1050 goto out_fini_handler;
1051 } else {
1052 char title[2048];
1053
1054 close(pipes[1]);
1055 pipes[1] = -1;
1056
1057 pid_t w = waitpid(pid, &status, 0);
1058 if (w == -1) {
1059 SYSERROR("waitpid");
1060 goto out_fini_handler;
1061 }
1062
1063 if (WIFEXITED(status)) {
1064 char buf[4096];
1065
1066 if (WEXITSTATUS(status)) {
1067 int n;
1068
1069 n = lxc_read_nointr(pipes[0], buf, sizeof(buf));
1070 if (n < 0) {
1071 SYSERROR("failed reading from criu stderr");
1072 goto out_fini_handler;
1073 }
1074
1075 if (n == sizeof(buf))
1076 n--;
1077 buf[n] = 0;
1078
1079 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
1080 goto out_fini_handler;
1081 } else {
1082 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
1083 if (ret < 0 || ret >= sizeof(buf)) {
1084 ERROR("snprintf'd too many characters: %d", ret);
1085 goto out_fini_handler;
1086 }
1087
1088 FILE *f = fopen(buf, "re");
1089 if (!f) {
1090 SYSERROR("couldn't read restore's children file %s", buf);
1091 goto out_fini_handler;
1092 }
1093
1094 ret = fscanf(f, "%d", (int*) &handler->pid);
1095 fclose(f);
1096 if (ret != 1) {
1097 ERROR("reading restore pid failed");
1098 goto out_fini_handler;
1099 }
1100
1101 if (lxc_set_state(c->name, handler, RUNNING)) {
1102 ERROR("error setting running state after restore");
1103 goto out_fini_handler;
1104 }
1105 }
1106 } else {
1107 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
1108 goto out_fini_handler;
1109 }
1110
1111 close(pipes[0]);
1112
1113 ret = lxc_write_nointr(status_pipe, &status, sizeof(status));
1114 close(status_pipe);
1115 status_pipe = -1;
1116
1117 if (sizeof(status) != ret) {
1118 SYSERROR("failed to write all of status");
1119 goto out_fini_handler;
1120 }
1121
1122 /*
1123 * See comment in lxcapi_start; we don't care if these
1124 * fail because it's just a beauty thing. We just
1125 * assign the return here to silence potential.
1126 */
1127 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
1128 if (ret < 0 || (size_t)ret >= sizeof(title))
1129 INFO("Setting truncated process name");
1130
1131 ret = setproctitle(title);
1132 if (ret < 0)
1133 INFO("Failed to set process name");
1134
1135 ret = lxc_poll(c->name, handler);
1136 if (ret)
1137 lxc_abort(handler);
1138 lxc_end(handler);
1139 _exit(ret);
1140 }
1141
1142 out_fini_handler:
1143 if (pipes[0] >= 0)
1144 close(pipes[0]);
1145 if (pipes[1] >= 0)
1146 close(pipes[1]);
1147
1148 lxc_end(handler);
1149
1150 out:
1151 if (status_pipe >= 0) {
1152 /* ensure getting here was a failure, e.g. if we failed to
1153 * parse the child pid or something, even after a successful
1154 * restore
1155 */
1156 if (!status)
1157 status = 1;
1158
1159 if (lxc_write_nointr(status_pipe, &status, sizeof(status)) != sizeof(status))
1160 SYSERROR("writing status failed");
1161 close(status_pipe);
1162 }
1163
1164 _exit(EXIT_FAILURE);
1165 }
1166
1167 static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1168 {
1169 FILE *f;
1170 char path[PATH_MAX];
1171 int ret;
1172 struct stat sb;
1173
1174 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1175 tty_id[0] = 0;
1176 return 0;
1177 }
1178
1179 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1180 if (ret < 0 || ret >= sizeof(path)) {
1181 ERROR("snprintf'd too many characters: %d", ret);
1182 return -1;
1183 }
1184
1185 ret = stat(path, &sb);
1186 if (ret < 0) {
1187 SYSERROR("stat of %s failed", path);
1188 return -1;
1189 }
1190
1191 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1192 if (ret < 0 || ret >= sizeof(path)) {
1193 ERROR("snprintf'd too many characters: %d", ret);
1194 return -1;
1195 }
1196
1197 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1198 (long long unsigned) sb.st_rdev,
1199 (long long unsigned) sb.st_dev);
1200 if (ret < 0 || ret >= sizeof(path)) {
1201 ERROR("snprintf'd too many characters: %d", ret);
1202 return -1;
1203 }
1204
1205 f = fopen(path, "we");
1206 if (!f) {
1207 SYSERROR("failed to open %s", path);
1208 return -1;
1209 }
1210
1211 ret = fprintf(f, "%s", tty_id);
1212 fclose(f);
1213 if (ret < 0)
1214 SYSERROR("failed to write to %s", path);
1215 return ret;
1216 }
1217
1218 /* do one of either predump or a regular dump */
1219 static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
1220 {
1221 int ret;
1222 pid_t pid;
1223 int criuout[2];
1224 char *criu_version = NULL;
1225
1226 if (!criu_ok(c, &criu_version))
1227 return false;
1228
1229 ret = pipe(criuout);
1230 if (ret < 0) {
1231 SYSERROR("pipe() failed");
1232 free(criu_version);
1233 return false;
1234 }
1235
1236 if (mkdir_p(opts->directory, 0700) < 0)
1237 goto fail;
1238
1239 pid = fork();
1240 if (pid < 0) {
1241 SYSERROR("fork failed");
1242 goto fail;
1243 }
1244
1245 if (pid == 0) {
1246 struct criu_opts os;
1247 struct cgroup_ops *cgroup_ops;
1248
1249 close(criuout[0]);
1250
1251 cgroup_ops = cgroup_init(c->lxc_conf);
1252 if (!cgroup_ops) {
1253 ERROR("failed to cgroup_init()");
1254 _exit(EXIT_FAILURE);
1255 }
1256
1257 os.pipefd = criuout[1];
1258 os.action = mode;
1259 os.user = opts;
1260 os.c = c;
1261 os.console_name = c->lxc_conf->console.path;
1262 os.criu_version = criu_version;
1263 os.handler = NULL;
1264
1265 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1266 if (ret < 0) {
1267 free(criu_version);
1268 _exit(EXIT_FAILURE);
1269 }
1270
1271 /* exec_criu() returning is an error */
1272 exec_criu(cgroup_ops, c->lxc_conf, &os);
1273 free(criu_version);
1274 _exit(EXIT_FAILURE);
1275 } else {
1276 int status;
1277 ssize_t n;
1278 char buf[4096];
1279
1280 close(criuout[1]);
1281
1282 pid_t w = waitpid(pid, &status, 0);
1283 if (w == -1) {
1284 SYSERROR("waitpid");
1285 close(criuout[0]);
1286 free(criu_version);
1287 return false;
1288 }
1289
1290 n = lxc_read_nointr(criuout[0], buf, sizeof(buf));
1291 close(criuout[0]);
1292 if (n < 0) {
1293 SYSERROR("read");
1294 n = 0;
1295 }
1296
1297 if (n == sizeof(buf))
1298 buf[n-1] = 0;
1299 else
1300 buf[n] = 0;
1301
1302 if (WIFEXITED(status)) {
1303 if (WEXITSTATUS(status)) {
1304 ERROR("dump failed with %d", WEXITSTATUS(status));
1305 ret = false;
1306 } else {
1307 ret = true;
1308 }
1309 } else if (WIFSIGNALED(status)) {
1310 ERROR("dump signaled with %d", WTERMSIG(status));
1311 ret = false;
1312 } else {
1313 ERROR("unknown dump exit %d", status);
1314 ret = false;
1315 }
1316
1317 if (!ret)
1318 ERROR("criu output: %s", buf);
1319
1320 free(criu_version);
1321 return ret;
1322 }
1323 fail:
1324 close(criuout[0]);
1325 close(criuout[1]);
1326 rmdir(opts->directory);
1327 free(criu_version);
1328 return false;
1329 }
1330
/*
 * Run do_dump() for container @c with the "pre-dump" action and the migrate
 * options @opts. Returns do_dump()'s result unchanged.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped;

	dumped = do_dump(c, "pre-dump", opts);

	return dumped;
}
1335
1336 bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
1337 {
1338 char path[PATH_MAX];
1339 int ret;
1340
1341 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
1342 if (ret < 0 || ret >= sizeof(path))
1343 return false;
1344
1345 if (access(path, F_OK) == 0) {
1346 ERROR("please use a fresh directory for the dump directory");
1347 return false;
1348 }
1349
1350 return do_dump(c, "dump", opts);
1351 }
1352
/*
 * Restore container @c from a criu dump described by @opts.
 *
 * Requires an effective uid of 0. Forks a child which calls do_restore() and
 * reports a wait()-style status back over a pipe.
 *
 * Returns true if a complete status was read and it denotes a normal exit
 * with code 0. In every other case after the fork the child is waited for and
 * false is returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	char *criu_version = NULL;
	int pipefd[2];
	int nread, status;
	pid_t pid;

	if (geteuid() != 0) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(pipefd) != 0) {
		ERROR("failed to create pipe");
		return false;
	}

	if (!criu_ok(c, &criu_version)) {
		close(pipefd[0]);
		close(pipefd[1]);
		return false;
	}

	pid = fork();
	if (pid < 0) {
		close(pipefd[0]);
		close(pipefd[1]);
		free(criu_version);
		return false;
	}

	if (pid == 0) {
		/* child keeps only the write end; do_restore() never returns */
		close(pipefd[0]);
		do_restore(c, pipefd[1], opts, criu_version);
	}

	/* parent keeps only the read end */
	close(pipefd[1]);
	free(criu_version);

	nread = lxc_read_nointr(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);

	/* On a clean exit status there is nothing to wait for, since the child
	 * becomes the monitor process.
	 */
	if (nread == sizeof(status) && WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return true;

	if (nread != sizeof(status))
		ERROR("reading status from pipe failed");

	/* The criu process was killed or exited nonzero (or its status could
	 * not be read): the restore process died, so reap it.
	 */
	if (wait_for_pid(pid))
		ERROR("restore process died");

	return false;
}