]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
Merge pull request #3059 from brauner/2019-06-21/seccomp_notify
[mirror_lxc.git] / src / lxc / criu.c
1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #ifndef _GNU_SOURCE
25 #define _GNU_SOURCE 1
26 #endif
27 #include <inttypes.h>
28 #include <linux/limits.h>
29 #include <sched.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <sys/mount.h>
34 #include <sys/types.h>
35 #include <sys/wait.h>
36 #include <unistd.h>
37
38 #include "cgroup.h"
39 #include "commands.h"
40 #include "conf.h"
41 #include "config.h"
42 #include "criu.h"
43 #include "log.h"
44 #include "lxc.h"
45 #include "lxclock.h"
46 #include "network.h"
47 #include "storage.h"
48 #include "syscall_wrappers.h"
49 #include "utils.h"
50
51 #if IS_BIONIC
52 #include <../include/lxcmntent.h>
53 #else
54 #include <mntent.h>
55 #endif
56
57 #ifndef HAVE_STRLCPY
58 #include "include/strlcpy.h"
59 #endif
60
/* Version thresholds that decide which options go on the criu command line. */
#define CRIU_IN_FLIGHT_SUPPORT "2.4"
#define CRIU_EXTERNAL_NOT_VETH "2.8"

/* Oldest criu release accepted, matched against criu's "Version:" line. */
#define CRIU_VERSION "2.0"

/* Fallback for git builds, matched against criu's "GitID:" line. */
#define CRIU_GITID_VERSION "2.0"
#define CRIU_GITID_PATCHLEVEL 0

lxc_log_define(criu, lxc);
70
/* Everything exec_criu() needs in order to build and run one criu command. */
struct criu_opts {
	/* the thing to hook to stdout and stderr for logging */
	int pipefd;

	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means there is no console to pass to criu.
	 */
	char tty_id[32];

	/* restore: the handler of the container being restored (NULL on dump) */
	struct lxc_handler *handler;

	/* restore: the console fd handed to criu via --inherit-fd; a negative
	 * value means no console is configured on this host.
	 */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
100
101 static int load_tty_major_minor(char *directory, char *output, int len)
102 {
103 FILE *f;
104 char path[PATH_MAX];
105 int ret;
106
107 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
108 if (ret < 0 || ret >= sizeof(path)) {
109 ERROR("snprintf'd too many characters: %d", ret);
110 return -1;
111 }
112
113 f = fopen(path, "r");
114 if (!f) {
115 /* This means we're coming from a liblxc which didn't export
116 * the tty info. In this case they had to have lxc.console.path
117 * = * none, so there's no problem restoring.
118 */
119 if (errno == ENOENT)
120 return 0;
121
122 SYSERROR("couldn't open %s", path);
123 return -1;
124 }
125
126 if (!fgets(output, len, f)) {
127 fclose(f);
128 SYSERROR("couldn't read %s", path);
129 return -1;
130 }
131
132 fclose(f);
133 return 0;
134 }
135
/*
 * Compare two version strings of the form "major[.minor[.patch]]".
 *
 * Components missing from a string count as -1, so "2.8" sorts before
 * "2.8.0". Returns 1 if v1 is newer than v2, 0 if they are equal and -1 if v1
 * is older; -1 is also returned when either string does not start with a
 * number.
 */
static int cmp_version(const char *v1, const char *v2)
{
	int lhs[3] = { -1, -1, -1 };
	int rhs[3] = { -1, -1, -1 };
	size_t idx;

	if (sscanf(v1, "%d.%d.%d", &lhs[0], &lhs[1], &lhs[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &rhs[0], &rhs[1], &rhs[2]) < 1)
		return -1;

	/* The first component that differs (major, minor, patch) decides. */
	for (idx = 0; idx < 3; idx++) {
		if (lhs[idx] != rhs[idx])
			return lhs[idx] > rhs[idx] ? 1 : -1;
	}

	return 0;
}
176
177 static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
178 struct criu_opts *opts)
179 {
180 char **argv, log[PATH_MAX];
181 int static_args = 23, argc = 0, i, ret;
182 int netnr = 0;
183 struct lxc_list *it;
184 FILE *mnts;
185 struct mntent mntent;
186
187 char buf[4096], ttys[32];
188 size_t pos;
189
190 /* If we are currently in a cgroup /foo/bar, and the container is in a
191 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
192 * container has an open fd that points to one of the cgroup files
193 * (systemd always opens its "root" cgroup). So, let's escape to the
194 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
195 * see all cgroups.
196 */
197 if (!cgroup_ops->escape(cgroup_ops, conf)) {
198 ERROR("failed to escape cgroups");
199 return;
200 }
201
202 /* The command line always looks like:
203 * criu $(action) --tcp-established --file-locks --link-remap \
204 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
205 * -o $(directory)/$(action).log --ext-mount-map auto
206 * --enable-external-sharing --enable-external-masters
207 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
208 * +1 for final NULL */
209
210 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
211 /* -t pid --freeze-cgroup /lxc/ct */
212 static_args += 4;
213
214 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
215 if (opts->user->predump_dir)
216 static_args += 2;
217
218 /* --page-server --address <address> --port <port> */
219 if (opts->user->pageserver_address && opts->user->pageserver_port)
220 static_args += 5;
221
222 /* --leave-running (only for final dump) */
223 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
224 static_args++;
225
226 /* --external tty[88,4] */
227 if (opts->tty_id[0])
228 static_args += 2;
229
230 /* --force-irmap */
231 if (!opts->user->preserves_inodes)
232 static_args++;
233
234 /* --ghost-limit 1024 */
235 if (opts->user->ghost_limit)
236 static_args += 2;
237 } else if (strcmp(opts->action, "restore") == 0) {
238 /* --root $(lxc_mount_point) --restore-detached
239 * --restore-sibling
240 * --lsm-profile apparmor:whatever
241 */
242 static_args += 6;
243
244 ttys[0] = 0;
245 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
246 return;
247
248 /* --inherit-fd fd[%d]:tty[%s] */
249 if (ttys[0])
250 static_args += 2;
251 } else {
252 return;
253 }
254
255 if (cgroup_ops->num_hierarchies(cgroup_ops) > 0)
256 static_args += 2 * cgroup_ops->num_hierarchies(cgroup_ops);
257
258 if (opts->user->verbose)
259 static_args++;
260
261 if (opts->user->action_script)
262 static_args += 2;
263
264 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
265
266 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
267 if (ret < 0 || ret >= PATH_MAX) {
268 ERROR("logfile name too long");
269 return;
270 }
271
272 argv = malloc(static_args * sizeof(*argv));
273 if (!argv)
274 return;
275
276 memset(argv, 0, static_args * sizeof(*argv));
277
278 #define DECLARE_ARG(arg) \
279 do { \
280 if (arg == NULL) { \
281 ERROR("Got NULL argument for criu"); \
282 goto err; \
283 } \
284 argv[argc++] = strdup(arg); \
285 if (!argv[argc-1]) \
286 goto err; \
287 } while (0)
288
289 argv[argc++] = on_path("criu", NULL);
290 if (!argv[argc-1]) {
291 ERROR("Couldn't find criu binary");
292 goto err;
293 }
294
295 DECLARE_ARG(opts->action);
296 DECLARE_ARG("--tcp-established");
297 DECLARE_ARG("--file-locks");
298 DECLARE_ARG("--link-remap");
299 DECLARE_ARG("--manage-cgroups=full");
300 DECLARE_ARG("--ext-mount-map");
301 DECLARE_ARG("auto");
302 DECLARE_ARG("--enable-external-sharing");
303 DECLARE_ARG("--enable-external-masters");
304 DECLARE_ARG("--enable-fs");
305 DECLARE_ARG("hugetlbfs");
306 DECLARE_ARG("--enable-fs");
307 DECLARE_ARG("tracefs");
308 DECLARE_ARG("-D");
309 DECLARE_ARG(opts->user->directory);
310 DECLARE_ARG("-o");
311 DECLARE_ARG(log);
312
313 for (i = 0; i < cgroup_ops->num_hierarchies(cgroup_ops); i++) {
314 char **controllers = NULL, *fullname;
315 char *path, *tmp;
316
317 if (!cgroup_ops->get_hierarchies(cgroup_ops, i, &controllers)) {
318 ERROR("failed to get hierarchy %d", i);
319 goto err;
320 }
321
322 /* if we are in a dump, we have to ask the monitor process what
323 * the right cgroup is. if this is a restore, we can just use
324 * the handler the restore task created.
325 */
326 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
327 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
328 if (!path) {
329 ERROR("failed to get cgroup path for %s", controllers[0]);
330 goto err;
331 }
332 } else {
333 const char *p;
334
335 p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
336 if (!p) {
337 ERROR("failed to get cgroup path for %s", controllers[0]);
338 goto err;
339 }
340
341 path = strdup(p);
342 if (!path) {
343 ERROR("strdup failed");
344 goto err;
345 }
346 }
347
348 tmp = lxc_deslashify(path);
349 if (!tmp) {
350 ERROR("Failed to remove extraneous slashes from \"%s\"",
351 path);
352 free(path);
353 goto err;
354 }
355 free(path);
356 path = tmp;
357
358 fullname = lxc_string_join(",", (const char **) controllers, false);
359 if (!fullname) {
360 ERROR("failed to join controllers");
361 free(path);
362 goto err;
363 }
364
365 ret = sprintf(buf, "%s:%s", fullname, path);
366 free(path);
367 free(fullname);
368 if (ret < 0 || ret >= sizeof(buf)) {
369 ERROR("sprintf of cgroup root arg failed");
370 goto err;
371 }
372
373 DECLARE_ARG("--cgroup-root");
374 DECLARE_ARG(buf);
375 }
376
377 if (opts->user->verbose)
378 DECLARE_ARG("-v4");
379
380 if (opts->user->action_script) {
381 DECLARE_ARG("--action-script");
382 DECLARE_ARG(opts->user->action_script);
383 }
384
385 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list,
386 opts->c->lxc_conf->lsm_aa_allow_nesting);
387 if (!mnts)
388 goto err;
389
390 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
391 char *mntdata;
392 char arg[2 * PATH_MAX + 2];
393 unsigned long flags;
394
395 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
396 goto err;
397
398 free(mntdata);
399
400 /* only add --ext-mount-map for actual bind mounts */
401 if (!(flags & MS_BIND))
402 continue;
403
404 if (strcmp(opts->action, "dump") == 0)
405 ret = snprintf(arg, sizeof(arg), "/%s:%s",
406 mntent.mnt_dir, mntent.mnt_dir);
407 else
408 ret = snprintf(arg, sizeof(arg), "%s:%s",
409 mntent.mnt_dir, mntent.mnt_fsname);
410 if (ret < 0 || ret >= sizeof(arg)) {
411 fclose(mnts);
412 ERROR("snprintf failed");
413 goto err;
414 }
415
416 DECLARE_ARG("--ext-mount-map");
417 DECLARE_ARG(arg);
418 }
419 fclose(mnts);
420
421 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
422 char pid[32], *freezer_relative;
423
424 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
425 goto err;
426
427 DECLARE_ARG("-t");
428 DECLARE_ARG(pid);
429
430 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
431 opts->c->config_path,
432 "freezer");
433 if (!freezer_relative) {
434 ERROR("failed getting freezer path");
435 goto err;
436 }
437
438 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
439 if (ret < 0 || ret >= sizeof(log))
440 goto err;
441
442 if (!opts->user->disable_skip_in_flight &&
443 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
444 DECLARE_ARG("--skip-in-flight");
445
446 DECLARE_ARG("--freeze-cgroup");
447 DECLARE_ARG(log);
448
449 if (opts->tty_id[0]) {
450 DECLARE_ARG("--ext-mount-map");
451 DECLARE_ARG("/dev/console:console");
452
453 DECLARE_ARG("--external");
454 DECLARE_ARG(opts->tty_id);
455 }
456
457 if (opts->user->predump_dir) {
458 DECLARE_ARG("--prev-images-dir");
459 DECLARE_ARG(opts->user->predump_dir);
460 DECLARE_ARG("--track-mem");
461 }
462
463 if (opts->user->pageserver_address && opts->user->pageserver_port) {
464 DECLARE_ARG("--page-server");
465 DECLARE_ARG("--address");
466 DECLARE_ARG(opts->user->pageserver_address);
467 DECLARE_ARG("--port");
468 DECLARE_ARG(opts->user->pageserver_port);
469 }
470
471 if (!opts->user->preserves_inodes)
472 DECLARE_ARG("--force-irmap");
473
474 if (opts->user->ghost_limit) {
475 char ghost_limit[32];
476
477 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
478 if (ret < 0 || ret >= sizeof(ghost_limit)) {
479 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
480 goto err;
481 }
482
483 DECLARE_ARG("--ghost-limit");
484 DECLARE_ARG(ghost_limit);
485 }
486
487 /* only for final dump */
488 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
489 DECLARE_ARG("--leave-running");
490 } else if (strcmp(opts->action, "restore") == 0) {
491 void *m;
492 int additional;
493 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
494
495 DECLARE_ARG("--root");
496 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
497 DECLARE_ARG("--restore-detached");
498 DECLARE_ARG("--restore-sibling");
499
500 if (ttys[0]) {
501 if (opts->console_fd < 0) {
502 ERROR("lxc.console.path configured on source host but not target");
503 goto err;
504 }
505
506 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
507 if (ret < 0 || ret >= sizeof(buf))
508 goto err;
509
510 DECLARE_ARG("--inherit-fd");
511 DECLARE_ARG(buf);
512 }
513 if (opts->console_name) {
514 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
515 SYSERROR("sprintf'd too many bytes");
516 }
517 DECLARE_ARG("--ext-mount-map");
518 DECLARE_ARG(buf);
519 }
520
521 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
522
523 if (lxc_conf->lsm_aa_profile)
524 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
525 else
526 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
527
528 if (ret < 0 || ret >= sizeof(buf))
529 goto err;
530
531 DECLARE_ARG("--lsm-profile");
532 DECLARE_ARG(buf);
533 }
534
535 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
536
537 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
538 if (!m)
539 goto err;
540 argv = m;
541
542 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
543 size_t retlen;
544 char eth[128], *veth;
545 struct lxc_netdev *n = it->elem;
546 bool external_not_veth;
547
548 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
549 /* Since criu version 2.8 the usage of --veth-pair
550 * has been deprecated:
551 * git tag --contains f2037e6d3445fc400
552 * v2.8 */
553 external_not_veth = true;
554 } else {
555 external_not_veth = false;
556 }
557
558 if (n->name[0] != '\0') {
559 retlen = strlcpy(eth, n->name, sizeof(eth));
560 if (retlen >= sizeof(eth))
561 goto err;
562 } else {
563 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
564 if (ret < 0 || ret >= sizeof(eth))
565 goto err;
566 }
567
568 switch (n->type) {
569 case LXC_NET_VETH:
570 veth = n->priv.veth_attr.pair;
571 if (veth[0] == '\0')
572 veth = n->priv.veth_attr.veth1;
573
574 if (n->link[0] != '\0') {
575 if (external_not_veth)
576 ret = snprintf(buf, sizeof(buf),
577 "veth[%s]:%s@%s",
578 eth, veth,
579 n->link);
580 else
581 ret = snprintf(buf, sizeof(buf),
582 "%s=%s@%s", eth,
583 veth, n->link);
584 } else {
585 if (external_not_veth)
586 ret = snprintf(buf, sizeof(buf),
587 "veth[%s]:%s",
588 eth, veth);
589 else
590 ret = snprintf(buf, sizeof(buf),
591 "%s=%s", eth,
592 veth);
593 }
594 if (ret < 0 || ret >= sizeof(buf))
595 goto err;
596 break;
597 case LXC_NET_MACVLAN:
598 if (n->link[0] == '\0') {
599 ERROR("no host interface for macvlan %s", n->name);
600 goto err;
601 }
602
603 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
604 if (ret < 0 || ret >= sizeof(buf))
605 goto err;
606 break;
607 case LXC_NET_NONE:
608 case LXC_NET_EMPTY:
609 break;
610 default:
611 /* we have screened for this earlier... */
612 ERROR("unexpected network type %d", n->type);
613 goto err;
614 }
615
616 if (external_not_veth)
617 DECLARE_ARG("--external");
618 else
619 DECLARE_ARG("--veth-pair");
620 DECLARE_ARG(buf);
621 netnr++;
622 }
623
624 }
625
626 argv[argc] = NULL;
627
628 buf[0] = 0;
629 pos = 0;
630
631 for (i = 0; argv[i]; i++) {
632 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
633 if (ret < 0 || ret >= sizeof(buf) - pos)
634 goto err;
635 else
636 pos += ret;
637 }
638
639 INFO("execing: %s", buf);
640
641 /* before criu inits its log, it sometimes prints things to stdout/err;
642 * let's be sure we capture that.
643 */
644 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
645 SYSERROR("dup2 stdout failed");
646 goto err;
647 }
648
649 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
650 SYSERROR("dup2 stderr failed");
651 goto err;
652 }
653
654 close(opts->pipefd);
655
656 #undef DECLARE_ARG
657 execv(argv[0], argv);
658 err:
659 for (i = 0; argv[i]; i++)
660 free(argv[i]);
661 free(argv);
662 }
663
664 /*
665 * Function to check if the checks activated in 'features_to_check' are
666 * available with the current architecture/kernel/criu combination.
667 *
668 * Parameter features_to_check is a bit mask of all features that should be
669 * checked (see feature check defines in lxc/lxccontainer.h).
670 *
671 * If the return value is true, all requested features are supported. If
672 * the return value is false the features_to_check parameter is updated
673 * to reflect which features are available. '0' means no feature but
674 * also that something went totally wrong.
675 *
676 * Some of the code flow of criu_version_ok() is duplicated and maybe it
677 * is a good candidate for refactoring.
678 */
679 bool __criu_check_feature(uint64_t *features_to_check)
680 {
681 pid_t pid;
682 uint64_t current_bit = 0;
683 int ret;
684 uint64_t features = *features_to_check;
685 /* Feature checking is currently always like
686 * criu check --feature <feature-name>
687 */
688 char *args[] = { "criu", "check", "--feature", NULL, NULL };
689
690 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
691 /* There are feature bits activated we do not understand.
692 * Refusing to answer at all */
693 *features_to_check = 0;
694 return false;
695 }
696
697 while (current_bit < (sizeof(uint64_t) * 8 - 1)) {
698 /* only test requested features */
699 if (!(features & (1ULL << current_bit))) {
700 /* skip this */
701 current_bit++;
702 continue;
703 }
704
705 pid = fork();
706 if (pid < 0) {
707 SYSERROR("fork() failed");
708 *features_to_check = 0;
709 return false;
710 }
711
712 if (pid == 0) {
713 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
714 /* This is needed for pre-dump support, which
715 * enables pre-copy migration. */
716 args[3] = "mem_dirty_track";
717 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
718 /* CRIU has two checks for userfaultfd support.
719 *
720 * The simpler check is only for 'uffd'. If the
721 * kernel supports userfaultfd without noncoop
722 * then only process can be lazily restored
723 * which do not fork. With 'uffd-noncoop'
724 * it is also possible to lazily restore processes
725 * which do fork. For a container runtime like
726 * LXC checking only for 'uffd' makes not much sense. */
727 args[3] = "uffd-noncoop";
728 else
729 _exit(EXIT_FAILURE);
730
731 null_stdfds();
732
733 execvp("criu", args);
734 SYSERROR("Failed to exec \"criu\"");
735 _exit(EXIT_FAILURE);
736 }
737
738 ret = wait_for_pid(pid);
739
740 if (ret == -1) {
741 /* It is not known why CRIU failed. Either
742 * CRIU is not available, the feature check
743 * does not exist or the feature is not
744 * supported. */
745 INFO("feature not supported");
746 /* Clear not supported feature bit */
747 features &= ~(1ULL << current_bit);
748 }
749
750 current_bit++;
751 /* no more checks requested; exit check loop */
752 if (!(features & ~((1ULL << current_bit)-1)))
753 break;
754 }
755 if (features != *features_to_check) {
756 *features_to_check = features;
757 return false;
758 }
759 return true;
760 }
761
762 /*
763 * Check to see if the criu version is recent enough for all the features we
764 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
765 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
766 * things potentially before a version is released with a particular feature.
767 *
768 * The intent is that when criu development slows down, we can drop this, but
769 * for now we shouldn't attempt to c/r with versions that we know won't work.
770 *
771 * Note: If version != NULL criu_version() stores the detected criu version in
772 * version. Allocates memory for version which must be freed by caller.
773 */
774 static bool criu_version_ok(char **version)
775 {
776 int pipes[2];
777 pid_t pid;
778
779 if (pipe(pipes) < 0) {
780 SYSERROR("pipe() failed");
781 return false;
782 }
783
784 pid = fork();
785 if (pid < 0) {
786 SYSERROR("fork() failed");
787 return false;
788 }
789
790 if (pid == 0) {
791 char *args[] = { "criu", "--version", NULL };
792 char *path;
793 close(pipes[0]);
794
795 close(STDERR_FILENO);
796 if (dup2(pipes[1], STDOUT_FILENO) < 0)
797 _exit(EXIT_FAILURE);
798
799 path = on_path("criu", NULL);
800 if (!path)
801 _exit(EXIT_FAILURE);
802
803 execv(path, args);
804 _exit(EXIT_FAILURE);
805 } else {
806 FILE *f;
807 char *tmp;
808 int patch;
809
810 close(pipes[1]);
811 if (wait_for_pid(pid) < 0) {
812 close(pipes[0]);
813 SYSERROR("execing criu failed, is it installed?");
814 return false;
815 }
816
817 f = fdopen(pipes[0], "r");
818 if (!f) {
819 close(pipes[0]);
820 return false;
821 }
822
823 tmp = malloc(1024);
824 if (!tmp) {
825 fclose(f);
826 return false;
827 }
828
829 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
830 goto version_error;
831
832 if (fgetc(f) != '\n')
833 goto version_error;
834
835 if (strcmp(tmp, CRIU_VERSION) >= 0)
836 goto version_match;
837
838 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
839 goto version_error;
840
841 if (fgetc(f) != '-')
842 goto version_error;
843
844 if (fscanf(f, "%d", &patch) != 1)
845 goto version_error;
846
847 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
848 goto version_error;
849
850 if (patch < CRIU_GITID_PATCHLEVEL)
851 goto version_error;
852
853 version_match:
854 fclose(f);
855 if (!version)
856 free(tmp);
857 else
858 *version = tmp;
859 return true;
860
861 version_error:
862 fclose(f);
863 free(tmp);
864 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
865 return false;
866 }
867 }
868
869 /* Check and make sure the container has a configuration that we know CRIU can
870 * dump. */
871 static bool criu_ok(struct lxc_container *c, char **criu_version)
872 {
873 struct lxc_list *it;
874
875 if (geteuid()) {
876 ERROR("Must be root to checkpoint");
877 return false;
878 }
879
880 if (!criu_version_ok(criu_version))
881 return false;
882
883 /* We only know how to restore containers with veth networks. */
884 lxc_list_for_each(it, &c->lxc_conf->network) {
885 struct lxc_netdev *n = it->elem;
886 switch(n->type) {
887 case LXC_NET_VETH:
888 case LXC_NET_NONE:
889 case LXC_NET_EMPTY:
890 case LXC_NET_MACVLAN:
891 break;
892 default:
893 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
894 if (criu_version) {
895 free(*criu_version);
896 *criu_version = NULL;
897 }
898 return false;
899 }
900 }
901
902 return true;
903 }
904
905 static bool restore_net_info(struct lxc_container *c)
906 {
907 int ret;
908 struct lxc_list *it;
909 bool has_error = true;
910
911 if (container_mem_lock(c))
912 return false;
913
914 lxc_list_for_each(it, &c->lxc_conf->network) {
915 struct lxc_netdev *netdev = it->elem;
916 char template[IFNAMSIZ];
917
918 if (netdev->type != LXC_NET_VETH)
919 continue;
920
921 ret = snprintf(template, sizeof(template), "vethXXXXXX");
922 if (ret < 0 || ret >= sizeof(template))
923 goto out_unlock;
924
925 if (netdev->priv.veth_attr.pair[0] == '\0' &&
926 netdev->priv.veth_attr.veth1[0] == '\0') {
927 if (!lxc_mkifname(template))
928 goto out_unlock;
929
930 (void)strlcpy(netdev->priv.veth_attr.veth1, template, IFNAMSIZ);
931 }
932 }
933
934 has_error = false;
935
936 out_unlock:
937 container_mem_unlock(c);
938 return !has_error;
939 }
940
941 /* do_restore never returns, the calling process is used as the monitor process.
942 * do_restore calls _exit() if it fails.
943 */
944 static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
945 {
946 int fd, ret;
947 pid_t pid;
948 struct lxc_handler *handler;
949 int status = 0;
950 int pipes[2] = {-1, -1};
951 struct cgroup_ops *cgroup_ops;
952
953 /* Try to detach from the current controlling tty if it exists.
954 * Otherwise, lxc_init (via lxc_console) will attach the container's
955 * console output to the current tty, which is probably not what any
956 * library user wants, and if they do, they can just manually configure
957 * it :)
958 */
959 fd = open("/dev/tty", O_RDWR);
960 if (fd >= 0) {
961 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
962 SYSERROR("couldn't detach from tty");
963 close(fd);
964 }
965
966 handler = lxc_init_handler(c->name, c->lxc_conf, c->config_path, false);
967 if (!handler)
968 goto out;
969
970 if (lxc_init(c->name, handler) < 0)
971 goto out;
972
973 cgroup_ops = cgroup_init(c->lxc_conf);
974 if (!cgroup_ops)
975 goto out_fini_handler;
976 handler->cgroup_ops = cgroup_ops;
977
978 if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
979 ERROR("failed creating groups");
980 goto out_fini_handler;
981 }
982
983 if (!restore_net_info(c)) {
984 ERROR("failed restoring network info");
985 goto out_fini_handler;
986 }
987
988 ret = resolve_clone_flags(handler);
989 if (ret < 0) {
990 SYSERROR("Unsupported clone flag specified");
991 goto out_fini_handler;
992 }
993
994 if (pipe2(pipes, O_CLOEXEC) < 0) {
995 SYSERROR("pipe() failed");
996 goto out_fini_handler;
997 }
998
999 pid = fork();
1000 if (pid < 0)
1001 goto out_fini_handler;
1002
1003 if (pid == 0) {
1004 struct criu_opts os;
1005 struct lxc_rootfs *rootfs;
1006 int flags;
1007
1008 close(status_pipe);
1009 status_pipe = -1;
1010
1011 close(pipes[0]);
1012 pipes[0] = -1;
1013
1014 if (unshare(CLONE_NEWNS))
1015 goto out_fini_handler;
1016
1017 /* CRIU needs the lxc root bind mounted so that it is the root of some
1018 * mount. */
1019 rootfs = &c->lxc_conf->rootfs;
1020
1021 if (rootfs_is_blockdev(c->lxc_conf)) {
1022 if (lxc_setup_rootfs_prepare_root(c->lxc_conf, c->name,
1023 c->config_path) < 0)
1024 goto out_fini_handler;
1025 } else {
1026 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
1027 goto out_fini_handler;
1028
1029 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1030 SYSERROR("remount / to private failed");
1031 goto out_fini_handler;
1032 }
1033
1034 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
1035 rmdir(rootfs->mount);
1036 goto out_fini_handler;
1037 }
1038 }
1039
1040 os.pipefd = pipes[1];
1041 os.action = "restore";
1042 os.user = opts;
1043 os.c = c;
1044 os.console_fd = c->lxc_conf->console.slave;
1045 os.criu_version = criu_version;
1046 os.handler = handler;
1047
1048 if (os.console_fd >= 0) {
1049 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1050 * via --inherit-fd, so we don't want it to close.
1051 */
1052 flags = fcntl(os.console_fd, F_GETFD);
1053 if (flags < 0) {
1054 SYSERROR("F_GETFD failed: %d", os.console_fd);
1055 goto out_fini_handler;
1056 }
1057
1058 flags &= ~FD_CLOEXEC;
1059
1060 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
1061 SYSERROR("F_SETFD failed");
1062 goto out_fini_handler;
1063 }
1064 }
1065 os.console_name = c->lxc_conf->console.name;
1066
1067 /* exec_criu() returning is an error */
1068 exec_criu(cgroup_ops, c->lxc_conf, &os);
1069 umount(rootfs->mount);
1070 rmdir(rootfs->mount);
1071 goto out_fini_handler;
1072 } else {
1073 char title[2048];
1074
1075 close(pipes[1]);
1076 pipes[1] = -1;
1077
1078 pid_t w = waitpid(pid, &status, 0);
1079 if (w == -1) {
1080 SYSERROR("waitpid");
1081 goto out_fini_handler;
1082 }
1083
1084 if (WIFEXITED(status)) {
1085 char buf[4096];
1086
1087 if (WEXITSTATUS(status)) {
1088 int n;
1089
1090 n = lxc_read_nointr(pipes[0], buf, sizeof(buf));
1091 if (n < 0) {
1092 SYSERROR("failed reading from criu stderr");
1093 goto out_fini_handler;
1094 }
1095
1096 if (n == sizeof(buf))
1097 n--;
1098 buf[n] = 0;
1099
1100 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
1101 goto out_fini_handler;
1102 } else {
1103 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
1104 if (ret < 0 || ret >= sizeof(buf)) {
1105 ERROR("snprintf'd too many characters: %d", ret);
1106 goto out_fini_handler;
1107 }
1108
1109 FILE *f = fopen(buf, "r");
1110 if (!f) {
1111 SYSERROR("couldn't read restore's children file %s", buf);
1112 goto out_fini_handler;
1113 }
1114
1115 ret = fscanf(f, "%d", (int*) &handler->pid);
1116 fclose(f);
1117 if (ret != 1) {
1118 ERROR("reading restore pid failed");
1119 goto out_fini_handler;
1120 }
1121
1122 if (lxc_set_state(c->name, handler, RUNNING)) {
1123 ERROR("error setting running state after restore");
1124 goto out_fini_handler;
1125 }
1126 }
1127 } else {
1128 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
1129 goto out_fini_handler;
1130 }
1131
1132 close(pipes[0]);
1133
1134 ret = lxc_write_nointr(status_pipe, &status, sizeof(status));
1135 close(status_pipe);
1136 status_pipe = -1;
1137
1138 if (sizeof(status) != ret) {
1139 SYSERROR("failed to write all of status");
1140 goto out_fini_handler;
1141 }
1142
1143 /*
1144 * See comment in lxcapi_start; we don't care if these
1145 * fail because it's just a beauty thing. We just
1146 * assign the return here to silence potential.
1147 */
1148 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
1149 if (ret < 0 || (size_t)ret >= sizeof(title))
1150 INFO("Setting truncated process name");
1151
1152 ret = setproctitle(title);
1153 if (ret < 0)
1154 INFO("Failed to set process name");
1155
1156 ret = lxc_poll(c->name, handler);
1157 if (ret)
1158 lxc_abort(c->name, handler);
1159 lxc_fini(c->name, handler);
1160 _exit(ret);
1161 }
1162
1163 out_fini_handler:
1164 if (pipes[0] >= 0)
1165 close(pipes[0]);
1166 if (pipes[1] >= 0)
1167 close(pipes[1]);
1168
1169 lxc_fini(c->name, handler);
1170
1171 out:
1172 if (status_pipe >= 0) {
1173 /* ensure getting here was a failure, e.g. if we failed to
1174 * parse the child pid or something, even after a successful
1175 * restore
1176 */
1177 if (!status)
1178 status = 1;
1179
1180 if (lxc_write_nointr(status_pipe, &status, sizeof(status)) != sizeof(status))
1181 SYSERROR("writing status failed");
1182 close(status_pipe);
1183 }
1184
1185 _exit(EXIT_FAILURE);
1186 }
1187
1188 static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1189 {
1190 FILE *f;
1191 char path[PATH_MAX];
1192 int ret;
1193 struct stat sb;
1194
1195 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1196 tty_id[0] = 0;
1197 return 0;
1198 }
1199
1200 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1201 if (ret < 0 || ret >= sizeof(path)) {
1202 ERROR("snprintf'd too many characters: %d", ret);
1203 return -1;
1204 }
1205
1206 ret = stat(path, &sb);
1207 if (ret < 0) {
1208 SYSERROR("stat of %s failed", path);
1209 return -1;
1210 }
1211
1212 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1213 if (ret < 0 || ret >= sizeof(path)) {
1214 ERROR("snprintf'd too many characters: %d", ret);
1215 return -1;
1216 }
1217
1218 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1219 (long long unsigned) sb.st_rdev,
1220 (long long unsigned) sb.st_dev);
1221 if (ret < 0 || ret >= sizeof(path)) {
1222 ERROR("snprintf'd too many characters: %d", ret);
1223 return -1;
1224 }
1225
1226 f = fopen(path, "w");
1227 if (!f) {
1228 SYSERROR("failed to open %s", path);
1229 return -1;
1230 }
1231
1232 ret = fprintf(f, "%s", tty_id);
1233 fclose(f);
1234 if (ret < 0)
1235 SYSERROR("failed to write to %s", path);
1236 return ret;
1237 }
1238
1239 /* do one of either predump or a regular dump */
1240 static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
1241 {
1242 int ret;
1243 pid_t pid;
1244 int criuout[2];
1245 char *criu_version = NULL;
1246
1247 if (!criu_ok(c, &criu_version))
1248 return false;
1249
1250 ret = pipe(criuout);
1251 if (ret < 0) {
1252 SYSERROR("pipe() failed");
1253 free(criu_version);
1254 return false;
1255 }
1256
1257 if (mkdir_p(opts->directory, 0700) < 0)
1258 goto fail;
1259
1260 pid = fork();
1261 if (pid < 0) {
1262 SYSERROR("fork failed");
1263 goto fail;
1264 }
1265
1266 if (pid == 0) {
1267 struct criu_opts os;
1268 struct cgroup_ops *cgroup_ops;
1269
1270 close(criuout[0]);
1271
1272 cgroup_ops = cgroup_init(c->lxc_conf);
1273 if (!cgroup_ops) {
1274 ERROR("failed to cgroup_init()");
1275 _exit(EXIT_FAILURE);
1276 }
1277
1278 os.pipefd = criuout[1];
1279 os.action = mode;
1280 os.user = opts;
1281 os.c = c;
1282 os.console_name = c->lxc_conf->console.path;
1283 os.criu_version = criu_version;
1284 os.handler = NULL;
1285
1286 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1287 if (ret < 0) {
1288 free(criu_version);
1289 _exit(EXIT_FAILURE);
1290 }
1291
1292 /* exec_criu() returning is an error */
1293 exec_criu(cgroup_ops, c->lxc_conf, &os);
1294 free(criu_version);
1295 _exit(EXIT_FAILURE);
1296 } else {
1297 int status;
1298 ssize_t n;
1299 char buf[4096];
1300
1301 close(criuout[1]);
1302
1303 pid_t w = waitpid(pid, &status, 0);
1304 if (w == -1) {
1305 SYSERROR("waitpid");
1306 close(criuout[0]);
1307 free(criu_version);
1308 return false;
1309 }
1310
1311 n = lxc_read_nointr(criuout[0], buf, sizeof(buf));
1312 close(criuout[0]);
1313 if (n < 0) {
1314 SYSERROR("read");
1315 n = 0;
1316 }
1317
1318 if (n == sizeof(buf))
1319 buf[n-1] = 0;
1320 else
1321 buf[n] = 0;
1322
1323 if (WIFEXITED(status)) {
1324 if (WEXITSTATUS(status)) {
1325 ERROR("dump failed with %d", WEXITSTATUS(status));
1326 ret = false;
1327 } else {
1328 ret = true;
1329 }
1330 } else if (WIFSIGNALED(status)) {
1331 ERROR("dump signaled with %d", WTERMSIG(status));
1332 ret = false;
1333 } else {
1334 ERROR("unknown dump exit %d", status);
1335 ret = false;
1336 }
1337
1338 if (!ret)
1339 ERROR("criu output: %s", buf);
1340
1341 free(criu_version);
1342 return ret;
1343 }
1344 fail:
1345 close(criuout[0]);
1346 close(criuout[1]);
1347 rmdir(opts->directory);
1348 free(criu_version);
1349 return false;
1350 }
1351
/* Run a criu pre-dump of container @c: do_dump() with the "pre-dump" action. */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped;

	dumped = do_dump(c, "pre-dump", opts);
	return dumped;
}
1356
1357 bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
1358 {
1359 char path[PATH_MAX];
1360 int ret;
1361
1362 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
1363 if (ret < 0 || ret >= sizeof(path))
1364 return false;
1365
1366 if (access(path, F_OK) == 0) {
1367 ERROR("please use a fresh directory for the dump directory");
1368 return false;
1369 }
1370
1371 return do_dump(c, "dump", opts);
1372 }
1373
/*
 * Restore container @c from a criu dump described by @opts.
 *
 * Requires an effective uid of 0. A child is forked that calls do_restore()
 * (which never returns) with the write end of a pipe; the parent reads one
 * int-sized status value from the read end and interprets it with the
 * wait-status macros.
 *
 * Returns true if a complete status was read and it denotes a normal exit
 * with code 0. In every other case after a successful fork the child is
 * reaped with wait_for_pid() and false is returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status;
	ssize_t nread;
	int pipefd[2];
	char *criu_version = NULL;

	if (geteuid()) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(pipefd) < 0) {
		SYSERROR("failed to create pipe");
		return false;
	}

	if (!criu_ok(c, &criu_version)) {
		close(pipefd[0]);
		close(pipefd[1]);
		return false;
	}

	pid = fork();
	if (pid < 0) {
		/* Log first so that errno still describes the fork() failure. */
		SYSERROR("fork failed");
		close(pipefd[0]);
		close(pipefd[1]);
		free(criu_version);
		return false;
	}

	if (pid == 0) {
		close(pipefd[0]);
		/* this never returns */
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);
	free(criu_version);

	nread = lxc_read_nointr(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (nread != (ssize_t)sizeof(status)) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	/* If the criu process was killed or exited nonzero, wait() for the
	 * handler, since the restore process died. Otherwise, we don't need to
	 * wait, since the child becomes the monitor process.
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;

	return true;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");

	return false;
}