]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
Merge pull request #1376 from brauner/2017-01-04/sic_semper_assertis
[mirror_lxc.git] / src / lxc / criu.c
1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23 #define _GNU_SOURCE
24 #include <inttypes.h>
25 #include <linux/limits.h>
26 #include <sched.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/mount.h>
31 #include <sys/types.h>
32 #include <sys/wait.h>
33 #include <unistd.h>
34
35 #include "config.h"
36
37 #include "bdev.h"
38 #include "cgroup.h"
39 #include "conf.h"
40 #include "commands.h"
41 #include "criu.h"
42 #include "log.h"
43 #include "lxc.h"
44 #include "lxclock.h"
45 #include "network.h"
46 #include "utils.h"
47
48 #if IS_BIONIC
49 #include <../include/lxcmntent.h>
50 #else
51 #include <mntent.h>
52 #endif
53
/* Oldest released criu version accepted by criu_version_ok(). */
#define CRIU_VERSION "2.0"

/*
 * Floor for criu builds made from git: the "GitID" tag and the patch count
 * that follows it must be at least these values.
 */
#define CRIU_GITID_VERSION "2.0"
#define CRIU_GITID_PATCHLEVEL 0

/* First criu version for which exec_criu() passes --skip-in-flight. */
#define CRIU_IN_FLIGHT_SUPPORT "2.4"
/*
 * First criu version for which exec_criu() describes network devices with
 * --external instead of --veth-pair.
 */
#define CRIU_EXTERNAL_NOT_VETH "2.8"
61
62 lxc_log_define(lxc_criu, lxc);
63
/* Everything exec_criu() needs to build and run one criu command line. */
struct criu_opts {
	/* fd that is dup2()'d onto stdout and stderr before criu is exec'd,
	 * so that early criu output can be captured
	 */
	int pipefd;

	/* The criu action to run: "dump", "pre-dump" or "restore" */
	char *action;

	/* The user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container being dumped or restored */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e.
	 * "tty[${rdev}:${dev}]"; an empty string means there is none
	 */
	char tty_id[32];

	/* restore: the handler of the restore task, used to look up the
	 * cgroup paths handed to criu
	 */
	struct lxc_handler *handler;

	/* restore: console fd passed to criu via --inherit-fd; negative if
	 * the container has no console
	 */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
93
94 static int load_tty_major_minor(char *directory, char *output, int len)
95 {
96 FILE *f;
97 char path[PATH_MAX];
98 int ret;
99
100 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
101 if (ret < 0 || ret >= sizeof(path)) {
102 ERROR("snprintf'd too many chacters: %d", ret);
103 return -1;
104 }
105
106 f = fopen(path, "r");
107 if (!f) {
108 /* This means we're coming from a liblxc which didn't export
109 * the tty info. In this case they had to have lxc.console =
110 * none, so there's no problem restoring.
111 */
112 if (errno == ENOENT)
113 return 0;
114
115 SYSERROR("couldn't open %s", path);
116 return -1;
117 }
118
119 if (!fgets(output, len, f)) {
120 fclose(f);
121 SYSERROR("couldn't read %s", path);
122 return -1;
123 }
124
125 fclose(f);
126 return 0;
127 }
128
129 static void exec_criu(struct criu_opts *opts)
130 {
131 char **argv, log[PATH_MAX];
132 int static_args = 23, argc = 0, i, ret;
133 int netnr = 0;
134 struct lxc_list *it;
135 FILE *mnts;
136 struct mntent mntent;
137
138 char buf[4096], tty_info[32];
139 size_t pos;
140
141 /* If we are currently in a cgroup /foo/bar, and the container is in a
142 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
143 * container has an open fd that points to one of the cgroup files
144 * (systemd always opens its "root" cgroup). So, let's escape to the
145 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
146 * see all cgroups.
147 */
148 if (!cgroup_escape()) {
149 ERROR("failed to escape cgroups");
150 return;
151 }
152
153 /* The command line always looks like:
154 * criu $(action) --tcp-established --file-locks --link-remap \
155 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
156 * -o $(directory)/$(action).log --ext-mount-map auto
157 * --enable-external-sharing --enable-external-masters
158 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
159 * +1 for final NULL */
160
161 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
162 /* -t pid --freeze-cgroup /lxc/ct */
163 static_args += 4;
164
165 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
166 if (opts->user->predump_dir)
167 static_args += 2;
168
169 /* --page-server --address <address> --port <port> */
170 if (opts->user->pageserver_address && opts->user->pageserver_port)
171 static_args += 5;
172
173 /* --leave-running (only for final dump) */
174 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
175 static_args++;
176
177 /* --external tty[88,4] */
178 if (opts->tty_id[0])
179 static_args += 2;
180
181 /* --force-irmap */
182 if (!opts->user->preserves_inodes)
183 static_args++;
184
185 /* --ghost-limit 1024 */
186 if (opts->user->ghost_limit)
187 static_args += 2;
188 } else if (strcmp(opts->action, "restore") == 0) {
189 /* --root $(lxc_mount_point) --restore-detached
190 * --restore-sibling
191 * --lsm-profile apparmor:whatever
192 */
193 static_args += 6;
194
195 tty_info[0] = 0;
196 if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
197 return;
198
199 /* --inherit-fd fd[%d]:tty[%s] */
200 if (tty_info[0])
201 static_args += 2;
202 } else {
203 return;
204 }
205
206 if (cgroup_num_hierarchies() > 0)
207 static_args += 2 * cgroup_num_hierarchies();
208
209 if (opts->user->verbose)
210 static_args++;
211
212 if (opts->user->action_script)
213 static_args += 2;
214
215 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
216
217 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
218 if (ret < 0 || ret >= PATH_MAX) {
219 ERROR("logfile name too long");
220 return;
221 }
222
223 argv = malloc(static_args * sizeof(*argv));
224 if (!argv)
225 return;
226
227 memset(argv, 0, static_args * sizeof(*argv));
228
229 #define DECLARE_ARG(arg) \
230 do { \
231 if (arg == NULL) { \
232 ERROR("Got NULL argument for criu"); \
233 goto err; \
234 } \
235 argv[argc++] = strdup(arg); \
236 if (!argv[argc-1]) \
237 goto err; \
238 } while (0)
239
240 argv[argc++] = on_path("criu", NULL);
241 if (!argv[argc-1]) {
242 ERROR("Couldn't find criu binary");
243 goto err;
244 }
245
246 DECLARE_ARG(opts->action);
247 DECLARE_ARG("--tcp-established");
248 DECLARE_ARG("--file-locks");
249 DECLARE_ARG("--link-remap");
250 DECLARE_ARG("--manage-cgroups=full");
251 DECLARE_ARG("--ext-mount-map");
252 DECLARE_ARG("auto");
253 DECLARE_ARG("--enable-external-sharing");
254 DECLARE_ARG("--enable-external-masters");
255 DECLARE_ARG("--enable-fs");
256 DECLARE_ARG("hugetlbfs");
257 DECLARE_ARG("--enable-fs");
258 DECLARE_ARG("tracefs");
259 DECLARE_ARG("-D");
260 DECLARE_ARG(opts->user->directory);
261 DECLARE_ARG("-o");
262 DECLARE_ARG(log);
263
264 for (i = 0; i < cgroup_num_hierarchies(); i++) {
265 char **controllers = NULL, *fullname;
266 char *path;
267
268 if (!cgroup_get_hierarchies(i, &controllers)) {
269 ERROR("failed to get hierarchy %d", i);
270 goto err;
271 }
272
273 /* if we are in a dump, we have to ask the monitor process what
274 * the right cgroup is. if this is a restore, we can just use
275 * the handler the restore task created.
276 */
277 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
278 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
279 if (!path) {
280 ERROR("failed to get cgroup path for %s", controllers[0]);
281 goto err;
282 }
283 } else {
284 const char *p;
285
286 p = cgroup_get_cgroup(opts->handler, controllers[0]);
287 if (!p) {
288 ERROR("failed to get cgroup path for %s", controllers[0]);
289 goto err;
290 }
291
292 path = strdup(p);
293 if (!path) {
294 ERROR("strdup failed");
295 goto err;
296 }
297 }
298
299 if (!lxc_deslashify(&path)) {
300 ERROR("failed to deslashify %s", path);
301 free(path);
302 goto err;
303 }
304
305 fullname = lxc_string_join(",", (const char **) controllers, false);
306 if (!fullname) {
307 ERROR("failed to join controllers");
308 free(path);
309 goto err;
310 }
311
312 ret = sprintf(buf, "%s:%s", fullname, path);
313 free(path);
314 free(fullname);
315 if (ret < 0 || ret >= sizeof(buf)) {
316 ERROR("sprintf of cgroup root arg failed");
317 goto err;
318 }
319
320 DECLARE_ARG("--cgroup-root");
321 DECLARE_ARG(buf);
322 }
323
324 if (opts->user->verbose)
325 DECLARE_ARG("-vvvvvv");
326
327 if (opts->user->action_script) {
328 DECLARE_ARG("--action-script");
329 DECLARE_ARG(opts->user->action_script);
330 }
331
332 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list);
333 if (!mnts)
334 goto err;
335
336 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
337 char *fmt, *key, *val;
338 char arg[2 * PATH_MAX + 2];
339
340 if (strcmp(opts->action, "dump") == 0) {
341 fmt = "/%s:%s";
342 key = mntent.mnt_dir;
343 val = mntent.mnt_dir;
344 } else {
345 fmt = "%s:%s";
346 key = mntent.mnt_dir;
347 val = mntent.mnt_fsname;
348 }
349
350 ret = snprintf(arg, sizeof(arg), fmt, key, val);
351 if (ret < 0 || ret >= sizeof(arg)) {
352 fclose(mnts);
353 ERROR("snprintf failed");
354 goto err;
355 }
356
357 DECLARE_ARG("--ext-mount-map");
358 DECLARE_ARG(arg);
359 }
360 fclose(mnts);
361
362 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
363 char pid[32], *freezer_relative;
364
365 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
366 goto err;
367
368 DECLARE_ARG("-t");
369 DECLARE_ARG(pid);
370
371 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
372 opts->c->config_path,
373 "freezer");
374 if (!freezer_relative) {
375 ERROR("failed getting freezer path");
376 goto err;
377 }
378
379 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
380 if (ret < 0 || ret >= sizeof(log))
381 goto err;
382
383 if (!opts->user->disable_skip_in_flight &&
384 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
385 DECLARE_ARG("--skip-in-flight");
386
387 DECLARE_ARG("--freeze-cgroup");
388 DECLARE_ARG(log);
389
390 if (opts->tty_id[0]) {
391 DECLARE_ARG("--ext-mount-map");
392 DECLARE_ARG("/dev/console:console");
393
394 DECLARE_ARG("--external");
395 DECLARE_ARG(opts->tty_id);
396 }
397
398 if (opts->user->predump_dir) {
399 DECLARE_ARG("--prev-images-dir");
400 DECLARE_ARG(opts->user->predump_dir);
401 DECLARE_ARG("--track-mem");
402 }
403
404 if (opts->user->pageserver_address && opts->user->pageserver_port) {
405 DECLARE_ARG("--page-server");
406 DECLARE_ARG("--address");
407 DECLARE_ARG(opts->user->pageserver_address);
408 DECLARE_ARG("--port");
409 DECLARE_ARG(opts->user->pageserver_port);
410 }
411
412 if (!opts->user->preserves_inodes)
413 DECLARE_ARG("--force-irmap");
414
415 if (opts->user->ghost_limit) {
416 char ghost_limit[32];
417
418 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
419 if (ret < 0 || ret >= sizeof(ghost_limit)) {
420 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
421 goto err;
422 }
423
424 DECLARE_ARG("--ghost-limit");
425 DECLARE_ARG(ghost_limit);
426 }
427
428 /* only for final dump */
429 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
430 DECLARE_ARG("--leave-running");
431 } else if (strcmp(opts->action, "restore") == 0) {
432 void *m;
433 int additional;
434 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
435
436 DECLARE_ARG("--root");
437 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
438 DECLARE_ARG("--restore-detached");
439 DECLARE_ARG("--restore-sibling");
440
441 if (tty_info[0]) {
442 if (opts->console_fd < 0) {
443 ERROR("lxc.console configured on source host but not target");
444 goto err;
445 }
446
447 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info);
448 if (ret < 0 || ret >= sizeof(buf))
449 goto err;
450
451 DECLARE_ARG("--inherit-fd");
452 DECLARE_ARG(buf);
453 }
454 if (opts->console_name) {
455 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
456 SYSERROR("sprintf'd too many bytes");
457 }
458 DECLARE_ARG("--ext-mount-map");
459 DECLARE_ARG(buf);
460 }
461
462 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
463
464 if (lxc_conf->lsm_aa_profile)
465 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
466 else
467 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
468
469 if (ret < 0 || ret >= sizeof(buf))
470 goto err;
471
472 DECLARE_ARG("--lsm-profile");
473 DECLARE_ARG(buf);
474 }
475
476 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
477
478 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
479 if (!m)
480 goto err;
481 argv = m;
482
483 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
484 char eth[128], *veth;
485 char *fmt;
486 struct lxc_netdev *n = it->elem;
487 bool external_not_veth;
488
489 if (strcmp(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
490 /* Since criu version 2.8 the usage of --veth-pair
491 * has been deprecated:
492 * git tag --contains f2037e6d3445fc400
493 * v2.8 */
494 external_not_veth = true;
495 } else {
496 external_not_veth = false;
497 }
498
499 if (n->name) {
500 if (strlen(n->name) >= sizeof(eth))
501 goto err;
502 strncpy(eth, n->name, sizeof(eth));
503 } else {
504 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
505 if (ret < 0 || ret >= sizeof(eth))
506 goto err;
507 }
508
509 switch (n->type) {
510 case LXC_NET_VETH:
511 veth = n->priv.veth_attr.pair;
512
513 if (n->link) {
514 if (external_not_veth)
515 fmt = "veth[%s]:%s@%s";
516 else
517 fmt = "%s=%s@%s";
518
519 ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
520 } else {
521 if (external_not_veth)
522 fmt = "veth[%s]:%s";
523 else
524 fmt = "%s=%s";
525
526 ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
527 }
528 if (ret < 0 || ret >= sizeof(buf))
529 goto err;
530 break;
531 case LXC_NET_MACVLAN:
532 if (!n->link) {
533 ERROR("no host interface for macvlan %s", n->name);
534 goto err;
535 }
536
537 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
538 if (ret < 0 || ret >= sizeof(buf))
539 goto err;
540 break;
541 case LXC_NET_NONE:
542 case LXC_NET_EMPTY:
543 break;
544 default:
545 /* we have screened for this earlier... */
546 ERROR("unexpected network type %d", n->type);
547 goto err;
548 }
549
550 if (external_not_veth)
551 DECLARE_ARG("--external");
552 else
553 DECLARE_ARG("--veth-pair");
554 DECLARE_ARG(buf);
555 netnr++;
556 }
557
558 }
559
560 argv[argc] = NULL;
561
562 buf[0] = 0;
563 pos = 0;
564
565 for (i = 0; argv[i]; i++) {
566 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
567 if (ret < 0 || ret >= sizeof(buf) - pos)
568 goto err;
569 else
570 pos += ret;
571 }
572
573 INFO("execing: %s", buf);
574
575 /* before criu inits its log, it sometimes prints things to stdout/err;
576 * let's be sure we capture that.
577 */
578 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
579 SYSERROR("dup2 stdout failed");
580 goto err;
581 }
582
583 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
584 SYSERROR("dup2 stderr failed");
585 goto err;
586 }
587
588 close(opts->pipefd);
589
590 #undef DECLARE_ARG
591 execv(argv[0], argv);
592 err:
593 for (i = 0; argv[i]; i++)
594 free(argv[i]);
595 free(argv);
596 }
597
598 /*
599 * Check to see if the criu version is recent enough for all the features we
600 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
601 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
602 * things potentially before a version is released with a particular feature.
603 *
604 * The intent is that when criu development slows down, we can drop this, but
605 * for now we shouldn't attempt to c/r with versions that we know won't work.
606 *
607 * Note: If version != NULL criu_version() stores the detected criu version in
608 * version. Allocates memory for version which must be freed by caller.
609 */
610 static bool criu_version_ok(char **version)
611 {
612 int pipes[2];
613 pid_t pid;
614
615 if (pipe(pipes) < 0) {
616 SYSERROR("pipe() failed");
617 return false;
618 }
619
620 pid = fork();
621 if (pid < 0) {
622 SYSERROR("fork() failed");
623 return false;
624 }
625
626 if (pid == 0) {
627 char *args[] = { "criu", "--version", NULL };
628 char *path;
629 close(pipes[0]);
630
631 close(STDERR_FILENO);
632 if (dup2(pipes[1], STDOUT_FILENO) < 0)
633 exit(1);
634
635 path = on_path("criu", NULL);
636 if (!path)
637 exit(1);
638
639 execv(path, args);
640 exit(1);
641 } else {
642 FILE *f;
643 char *tmp;
644 int patch;
645
646 close(pipes[1]);
647 if (wait_for_pid(pid) < 0) {
648 close(pipes[0]);
649 SYSERROR("execing criu failed, is it installed?");
650 return false;
651 }
652
653 f = fdopen(pipes[0], "r");
654 if (!f) {
655 close(pipes[0]);
656 return false;
657 }
658
659 tmp = malloc(1024);
660 if (!tmp) {
661 fclose(f);
662 return false;
663 }
664
665 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
666 goto version_error;
667
668 if (fgetc(f) != '\n')
669 goto version_error;
670
671 if (strcmp(tmp, CRIU_VERSION) >= 0)
672 goto version_match;
673
674 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
675 goto version_error;
676
677 if (fgetc(f) != '-')
678 goto version_error;
679
680 if (fscanf(f, "%d", &patch) != 1)
681 goto version_error;
682
683 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
684 goto version_error;
685
686 if (patch < CRIU_GITID_PATCHLEVEL)
687 goto version_error;
688
689 version_match:
690 fclose(f);
691 if (!version)
692 free(tmp);
693 else
694 *version = tmp;
695 return true;
696
697 version_error:
698 fclose(f);
699 free(tmp);
700 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
701 return false;
702 }
703 }
704
705 /* Check and make sure the container has a configuration that we know CRIU can
706 * dump. */
707 static bool criu_ok(struct lxc_container *c, char **criu_version)
708 {
709 struct lxc_list *it;
710
711 if (!criu_version_ok(criu_version))
712 return false;
713
714 if (geteuid()) {
715 ERROR("Must be root to checkpoint");
716 return false;
717 }
718
719 /* We only know how to restore containers with veth networks. */
720 lxc_list_for_each(it, &c->lxc_conf->network) {
721 struct lxc_netdev *n = it->elem;
722 switch(n->type) {
723 case LXC_NET_VETH:
724 case LXC_NET_NONE:
725 case LXC_NET_EMPTY:
726 case LXC_NET_MACVLAN:
727 break;
728 default:
729 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
730 return false;
731 }
732 }
733
734 return true;
735 }
736
737 static bool restore_net_info(struct lxc_container *c)
738 {
739 struct lxc_list *it;
740 bool has_error = true;
741
742 if (container_mem_lock(c))
743 return false;
744
745 lxc_list_for_each(it, &c->lxc_conf->network) {
746 struct lxc_netdev *netdev = it->elem;
747 char template[IFNAMSIZ];
748
749 if (netdev->type != LXC_NET_VETH)
750 continue;
751
752 snprintf(template, sizeof(template), "vethXXXXXX");
753
754 if (!netdev->priv.veth_attr.pair)
755 netdev->priv.veth_attr.pair = lxc_mkifname(template);
756
757 if (!netdev->priv.veth_attr.pair)
758 goto out_unlock;
759 }
760
761 has_error = false;
762
763 out_unlock:
764 container_mem_unlock(c);
765 return !has_error;
766 }
767
768 // do_restore never returns, the calling process is used as the
769 // monitor process. do_restore calls exit() if it fails.
770 static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
771 {
772 pid_t pid;
773 struct lxc_handler *handler;
774 int status, fd;
775 int pipes[2] = {-1, -1};
776
777 /* Try to detach from the current controlling tty if it exists.
778 * Othwerise, lxc_init (via lxc_console) will attach the container's
779 * console output to the current tty, which is probably not what any
780 * library user wants, and if they do, they can just manually configure
781 * it :)
782 */
783 fd = open("/dev/tty", O_RDWR);
784 if (fd >= 0) {
785 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
786 SYSERROR("couldn't detach from tty");
787 close(fd);
788 }
789
790 handler = lxc_init(c->name, c->lxc_conf, c->config_path);
791 if (!handler)
792 goto out;
793
794 if (!cgroup_init(handler)) {
795 ERROR("failed initing cgroups");
796 goto out_fini_handler;
797 }
798
799 if (!cgroup_create(handler)) {
800 ERROR("failed creating groups");
801 goto out_fini_handler;
802 }
803
804 if (!restore_net_info(c)) {
805 ERROR("failed restoring network info");
806 goto out_fini_handler;
807 }
808
809 resolve_clone_flags(handler);
810
811 if (pipe(pipes) < 0) {
812 SYSERROR("pipe() failed");
813 goto out_fini_handler;
814 }
815
816 pid = fork();
817 if (pid < 0)
818 goto out_fini_handler;
819
820 if (pid == 0) {
821 struct criu_opts os;
822 struct lxc_rootfs *rootfs;
823 int flags;
824
825 close(status_pipe);
826 status_pipe = -1;
827
828 close(pipes[0]);
829 pipes[0] = -1;
830
831 if (unshare(CLONE_NEWNS))
832 goto out_fini_handler;
833
834 /* CRIU needs the lxc root bind mounted so that it is the root of some
835 * mount. */
836 rootfs = &c->lxc_conf->rootfs;
837
838 if (rootfs_is_blockdev(c->lxc_conf)) {
839 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
840 goto out_fini_handler;
841 } else {
842 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
843 goto out_fini_handler;
844
845 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
846 SYSERROR("remount / to private failed");
847 goto out_fini_handler;
848 }
849
850 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
851 rmdir(rootfs->mount);
852 goto out_fini_handler;
853 }
854 }
855
856 os.pipefd = pipes[1];
857 os.action = "restore";
858 os.user = opts;
859 os.c = c;
860 os.console_fd = c->lxc_conf->console.slave;
861 os.criu_version = criu_version;
862 os.handler = handler;
863
864 if (os.console_fd >= 0) {
865 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
866 * via --inherit-fd, so we don't want it to close.
867 */
868 flags = fcntl(os.console_fd, F_GETFD);
869 if (flags < 0) {
870 SYSERROR("F_GETFD failed: %d", os.console_fd);
871 goto out_fini_handler;
872 }
873
874 flags &= ~FD_CLOEXEC;
875
876 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
877 SYSERROR("F_SETFD failed");
878 goto out_fini_handler;
879 }
880 }
881 os.console_name = c->lxc_conf->console.name;
882
883 /* exec_criu() returning is an error */
884 exec_criu(&os);
885 umount(rootfs->mount);
886 rmdir(rootfs->mount);
887 goto out_fini_handler;
888 } else {
889 int ret;
890 char title[2048];
891
892 close(pipes[1]);
893 pipes[1] = -1;
894
895 pid_t w = waitpid(pid, &status, 0);
896 if (w == -1) {
897 SYSERROR("waitpid");
898 goto out_fini_handler;
899 }
900
901 if (WIFEXITED(status)) {
902 char buf[4096];
903
904 if (WEXITSTATUS(status)) {
905 int n;
906
907 n = read(pipes[0], buf, sizeof(buf));
908 if (n < 0) {
909 SYSERROR("failed reading from criu stderr");
910 goto out_fini_handler;
911 }
912
913 if (n == sizeof(buf))
914 n--;
915 buf[n] = 0;
916
917 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
918 goto out_fini_handler;
919 } else {
920 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
921 if (ret < 0 || ret >= sizeof(buf)) {
922 ERROR("snprintf'd too many characters: %d", ret);
923 goto out_fini_handler;
924 }
925
926 FILE *f = fopen(buf, "r");
927 if (!f) {
928 SYSERROR("couldn't read restore's children file %s", buf);
929 goto out_fini_handler;
930 }
931
932 ret = fscanf(f, "%d", (int*) &handler->pid);
933 fclose(f);
934 if (ret != 1) {
935 ERROR("reading restore pid failed");
936 goto out_fini_handler;
937 }
938
939 if (lxc_set_state(c->name, handler, RUNNING)) {
940 ERROR("error setting running state after restore");
941 goto out_fini_handler;
942 }
943 }
944 } else {
945 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
946 goto out_fini_handler;
947 }
948
949 close(pipes[0]);
950
951 ret = write(status_pipe, &status, sizeof(status));
952 close(status_pipe);
953 status_pipe = -1;
954
955 if (sizeof(status) != ret) {
956 SYSERROR("failed to write all of status");
957 goto out_fini_handler;
958 }
959
960 /*
961 * See comment in lxcapi_start; we don't care if these
962 * fail because it's just a beauty thing. We just
963 * assign the return here to silence potential.
964 */
965 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
966 ret = setproctitle(title);
967
968 ret = lxc_poll(c->name, handler);
969 if (ret)
970 lxc_abort(c->name, handler);
971 lxc_fini(c->name, handler);
972 exit(ret);
973 }
974
975 out_fini_handler:
976 if (pipes[0] >= 0)
977 close(pipes[0]);
978 if (pipes[1] >= 0)
979 close(pipes[1]);
980
981 lxc_fini(c->name, handler);
982
983 out:
984 if (status_pipe >= 0) {
985 /* ensure getting here was a failure, e.g. if we failed to
986 * parse the child pid or something, even after a successful
987 * restore
988 */
989 if (!status)
990 status = 1;
991 if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) {
992 SYSERROR("writing status failed");
993 }
994 close(status_pipe);
995 }
996
997 exit(1);
998 }
999
1000 static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1001 {
1002 FILE *f;
1003 char path[PATH_MAX];
1004 int ret;
1005 struct stat sb;
1006
1007 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1008 tty_id[0] = 0;
1009 return 0;
1010 }
1011
1012 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1013 if (ret < 0 || ret >= sizeof(path)) {
1014 ERROR("snprintf'd too many chacters: %d", ret);
1015 return -1;
1016 }
1017
1018 ret = stat(path, &sb);
1019 if (ret < 0) {
1020 SYSERROR("stat of %s failed", path);
1021 return -1;
1022 }
1023
1024 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1025 if (ret < 0 || ret >= sizeof(path)) {
1026 ERROR("snprintf'd too many characters: %d", ret);
1027 return -1;
1028 }
1029
1030 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1031 (long long unsigned) sb.st_rdev,
1032 (long long unsigned) sb.st_dev);
1033 if (ret < 0 || ret >= sizeof(path)) {
1034 ERROR("snprintf'd too many characters: %d", ret);
1035 return -1;
1036 }
1037
1038 f = fopen(path, "w");
1039 if (!f) {
1040 SYSERROR("failed to open %s", path);
1041 return -1;
1042 }
1043
1044 ret = fprintf(f, "%s", tty_id);
1045 fclose(f);
1046 if (ret < 0)
1047 SYSERROR("failed to write to %s", path);
1048 return ret;
1049 }
1050
1051 /* do one of either predump or a regular dump */
1052 static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
1053 {
1054 pid_t pid;
1055 char *criu_version = NULL;
1056 int criuout[2];
1057
1058 if (!criu_ok(c, &criu_version))
1059 return false;
1060
1061 if (pipe(criuout) < 0) {
1062 SYSERROR("pipe() failed");
1063 return false;
1064 }
1065
1066 if (mkdir_p(opts->directory, 0700) < 0)
1067 goto fail;
1068
1069 pid = fork();
1070 if (pid < 0) {
1071 SYSERROR("fork failed");
1072 goto fail;
1073 }
1074
1075 if (pid == 0) {
1076 struct criu_opts os;
1077 struct lxc_handler h;
1078
1079 close(criuout[0]);
1080
1081 h.name = c->name;
1082 if (!cgroup_init(&h)) {
1083 ERROR("failed to cgroup_init()");
1084 exit(1);
1085 }
1086
1087 os.pipefd = criuout[1];
1088 os.action = mode;
1089 os.user = opts;
1090 os.c = c;
1091 os.console_name = c->lxc_conf->console.path;
1092 os.criu_version = criu_version;
1093
1094 if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
1095 exit(1);
1096
1097 /* exec_criu() returning is an error */
1098 exec_criu(&os);
1099 exit(1);
1100 } else {
1101 int status;
1102 ssize_t n;
1103 char buf[4096];
1104 bool ret;
1105
1106 close(criuout[1]);
1107
1108 pid_t w = waitpid(pid, &status, 0);
1109 if (w == -1) {
1110 SYSERROR("waitpid");
1111 close(criuout[0]);
1112 return false;
1113 }
1114
1115 n = read(criuout[0], buf, sizeof(buf));
1116 close(criuout[0]);
1117 if (n < 0) {
1118 SYSERROR("read");
1119 n = 0;
1120 }
1121 buf[n] = 0;
1122
1123 if (WIFEXITED(status)) {
1124 if (WEXITSTATUS(status)) {
1125 ERROR("dump failed with %d", WEXITSTATUS(status));
1126 ret = false;
1127 } else {
1128 ret = true;
1129 }
1130 } else if (WIFSIGNALED(status)) {
1131 ERROR("dump signaled with %d", WTERMSIG(status));
1132 ret = false;
1133 } else {
1134 ERROR("unknown dump exit %d", status);
1135 ret = false;
1136 }
1137
1138 if (!ret)
1139 ERROR("criu output: %s", buf);
1140 return ret;
1141 }
1142 fail:
1143 close(criuout[0]);
1144 close(criuout[1]);
1145 rmdir(opts->directory);
1146 return false;
1147 }
1148
/*
 * Run a criu "pre-dump" of container c using the given migrate options.
 * Returns whatever do_dump() returns for that action.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	char *mode = "pre-dump";

	return do_dump(c, mode, opts);
}
1153
1154 bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
1155 {
1156 char path[PATH_MAX];
1157 int ret;
1158
1159 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
1160 if (ret < 0 || ret >= sizeof(path))
1161 return false;
1162
1163 if (access(path, F_OK) == 0) {
1164 ERROR("please use a fresh directory for the dump directory");
1165 return false;
1166 }
1167
1168 return do_dump(c, "dump", opts);
1169 }
1170
/*
 * Restore container c from the criu images described by opts.
 *
 * The restore itself runs in a forked child (see do_restore()), which stays
 * around as the container's monitor process on success. The child reports the
 * outcome as an int wait status over a pipe.
 *
 * Returns true if the child reported that criu exited with status 0, false
 * otherwise; on failure the child is waited for before returning.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status, nread;
	int pipefd[2];
	char *criu_version = NULL;

	if (!criu_ok(c, &criu_version)) {
		/* still NULL unless a version string was handed back */
		free(criu_version);
		return false;
	}

	if (geteuid()) {
		ERROR("Must be root to restore");
		free(criu_version);
		return false;
	}

	if (pipe(pipefd)) {
		ERROR("failed to create pipe");
		free(criu_version);
		return false;
	}

	pid = fork();
	if (pid < 0) {
		close(pipefd[0]);
		close(pipefd[1]);
		free(criu_version);
		return false;
	}

	if (pid == 0) {
		close(pipefd[0]);
		// this never returns
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);

	/* The child works on its own copy of the version string. */
	free(criu_version);

	nread = read(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (sizeof(status) != nread) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	// If the criu process was killed or exited nonzero, wait() for the
	// handler, since the restore process died. Otherwise, we don't need to
	// wait, since the child becomes the monitor process.
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;
	return true;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");
	return false;
}