]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
coverity: #1426130
[mirror_lxc.git] / src / lxc / criu.c
1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #ifndef _GNU_SOURCE
25 #define _GNU_SOURCE 1
26 #endif
27 #include <inttypes.h>
28 #include <linux/limits.h>
29 #include <sched.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <sys/mount.h>
34 #include <sys/types.h>
35 #include <sys/wait.h>
36 #include <unistd.h>
37
38 #include "cgroup.h"
39 #include "commands.h"
40 #include "conf.h"
41 #include "config.h"
42 #include "criu.h"
43 #include "log.h"
44 #include "lxc.h"
45 #include "lxclock.h"
46 #include "network.h"
47 #include "storage.h"
48 #include "syscall_wrappers.h"
49 #include "utils.h"
50
51 #if IS_BIONIC
52 #include <../include/lxcmntent.h>
53 #else
54 #include <mntent.h>
55 #endif
56
57 #ifndef HAVE_STRLCPY
58 #include "include/strlcpy.h"
59 #endif
60
61 #define CRIU_VERSION "2.0"
62
63 #define CRIU_GITID_VERSION "2.0"
64 #define CRIU_GITID_PATCHLEVEL 0
65
66 #define CRIU_IN_FLIGHT_SUPPORT "2.4"
67 #define CRIU_EXTERNAL_NOT_VETH "2.8"
68
69 lxc_log_define(criu, lxc);
70
/* Everything exec_criu() needs to build and run one criu invocation. */
struct criu_opts {
	/* The fd that criu's stdout and stderr get redirected to for logging. */
	int pipefd;

	/* The type of criu invocation: "dump", "pre-dump" or "restore". */
	char *action;

	/* The user-provided migrate options relevant to this action. */
	struct migrate_opts *user;

	/* The container to dump or restore. */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means no external tty is announced to criu.
	 */
	char tty_id[32];

	/* restore: the handler created by the restoring process; NULL for
	 * dumps.
	 */
	struct lxc_handler *handler;

	/* restore: the console fd handed to criu via --inherit-fd; a negative
	 * value means there is no console to pass.
	 */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu. */
	char *criu_version;
};
100
101 static int load_tty_major_minor(char *directory, char *output, int len)
102 {
103 FILE *f;
104 char path[PATH_MAX];
105 int ret;
106
107 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
108 if (ret < 0 || ret >= sizeof(path)) {
109 ERROR("snprintf'd too many chacters: %d", ret);
110 return -1;
111 }
112
113 f = fopen(path, "r");
114 if (!f) {
115 /* This means we're coming from a liblxc which didn't export
116 * the tty info. In this case they had to have lxc.console.path
117 * = * none, so there's no problem restoring.
118 */
119 if (errno == ENOENT)
120 return 0;
121
122 SYSERROR("couldn't open %s", path);
123 return -1;
124 }
125
126 if (!fgets(output, len, f)) {
127 fclose(f);
128 SYSERROR("couldn't read %s", path);
129 return -1;
130 }
131
132 fclose(f);
133 return 0;
134 }
135
/*
 * Compare two dotted version strings of the form "major[.minor[.patch]]".
 *
 * Components that are absent from a string count as -1, so "2.8" orders
 * before "2.8.0". Returns 1 if v1 is newer than v2, 0 if both are equal and
 * -1 if v1 is older than v2 or if either string has no leading number.
 */
static int cmp_version(const char *v1, const char *v2)
{
	int lhs[3] = { -1, -1, -1 };
	int rhs[3] = { -1, -1, -1 };
	size_t idx;

	if (sscanf(v1, "%d.%d.%d", &lhs[0], &lhs[1], &lhs[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &rhs[0], &rhs[1], &rhs[2]) < 1)
		return -1;

	/* The first differing component, most significant first, decides. */
	for (idx = 0; idx < sizeof(lhs) / sizeof(lhs[0]); idx++) {
		if (lhs[idx] != rhs[idx])
			return lhs[idx] > rhs[idx] ? 1 : -1;
	}

	return 0;
}
176
177 static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
178 struct criu_opts *opts)
179 {
180 char **argv, log[PATH_MAX];
181 int static_args = 23, argc = 0, i, ret;
182 int netnr = 0;
183 struct lxc_list *it;
184 FILE *mnts;
185 struct mntent mntent;
186
187 char buf[4096], ttys[32];
188 size_t pos;
189
190 /* If we are currently in a cgroup /foo/bar, and the container is in a
191 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
192 * container has an open fd that points to one of the cgroup files
193 * (systemd always opens its "root" cgroup). So, let's escape to the
194 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
195 * see all cgroups.
196 */
197 if (!cgroup_ops->escape(cgroup_ops, conf)) {
198 ERROR("failed to escape cgroups");
199 return;
200 }
201
202 /* The command line always looks like:
203 * criu $(action) --tcp-established --file-locks --link-remap \
204 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
205 * -o $(directory)/$(action).log --ext-mount-map auto
206 * --enable-external-sharing --enable-external-masters
207 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
208 * +1 for final NULL */
209
210 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
211 /* -t pid --freeze-cgroup /lxc/ct */
212 static_args += 4;
213
214 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
215 if (opts->user->predump_dir)
216 static_args += 2;
217
218 /* --page-server --address <address> --port <port> */
219 if (opts->user->pageserver_address && opts->user->pageserver_port)
220 static_args += 5;
221
222 /* --leave-running (only for final dump) */
223 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
224 static_args++;
225
226 /* --external tty[88,4] */
227 if (opts->tty_id[0])
228 static_args += 2;
229
230 /* --force-irmap */
231 if (!opts->user->preserves_inodes)
232 static_args++;
233
234 /* --ghost-limit 1024 */
235 if (opts->user->ghost_limit)
236 static_args += 2;
237 } else if (strcmp(opts->action, "restore") == 0) {
238 /* --root $(lxc_mount_point) --restore-detached
239 * --restore-sibling
240 * --lsm-profile apparmor:whatever
241 */
242 static_args += 6;
243
244 ttys[0] = 0;
245 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
246 return;
247
248 /* --inherit-fd fd[%d]:tty[%s] */
249 if (ttys[0])
250 static_args += 2;
251 } else {
252 return;
253 }
254
255 if (cgroup_ops->num_hierarchies(cgroup_ops) > 0)
256 static_args += 2 * cgroup_ops->num_hierarchies(cgroup_ops);
257
258 if (opts->user->verbose)
259 static_args++;
260
261 if (opts->user->action_script)
262 static_args += 2;
263
264 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
265
266 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
267 if (ret < 0 || ret >= PATH_MAX) {
268 ERROR("logfile name too long");
269 return;
270 }
271
272 argv = malloc(static_args * sizeof(*argv));
273 if (!argv)
274 return;
275
276 memset(argv, 0, static_args * sizeof(*argv));
277
278 #define DECLARE_ARG(arg) \
279 do { \
280 if (arg == NULL) { \
281 ERROR("Got NULL argument for criu"); \
282 goto err; \
283 } \
284 argv[argc++] = strdup(arg); \
285 if (!argv[argc-1]) \
286 goto err; \
287 } while (0)
288
289 argv[argc++] = on_path("criu", NULL);
290 if (!argv[argc-1]) {
291 ERROR("Couldn't find criu binary");
292 goto err;
293 }
294
295 DECLARE_ARG(opts->action);
296 DECLARE_ARG("--tcp-established");
297 DECLARE_ARG("--file-locks");
298 DECLARE_ARG("--link-remap");
299 DECLARE_ARG("--manage-cgroups=full");
300 DECLARE_ARG("--ext-mount-map");
301 DECLARE_ARG("auto");
302 DECLARE_ARG("--enable-external-sharing");
303 DECLARE_ARG("--enable-external-masters");
304 DECLARE_ARG("--enable-fs");
305 DECLARE_ARG("hugetlbfs");
306 DECLARE_ARG("--enable-fs");
307 DECLARE_ARG("tracefs");
308 DECLARE_ARG("-D");
309 DECLARE_ARG(opts->user->directory);
310 DECLARE_ARG("-o");
311 DECLARE_ARG(log);
312
313 for (i = 0; i < cgroup_ops->num_hierarchies(cgroup_ops); i++) {
314 char **controllers = NULL, *fullname;
315 char *path, *tmp;
316
317 if (!cgroup_ops->get_hierarchies(cgroup_ops, i, &controllers)) {
318 ERROR("failed to get hierarchy %d", i);
319 goto err;
320 }
321
322 /* if we are in a dump, we have to ask the monitor process what
323 * the right cgroup is. if this is a restore, we can just use
324 * the handler the restore task created.
325 */
326 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
327 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
328 if (!path) {
329 ERROR("failed to get cgroup path for %s", controllers[0]);
330 goto err;
331 }
332 } else {
333 const char *p;
334
335 p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
336 if (!p) {
337 ERROR("failed to get cgroup path for %s", controllers[0]);
338 goto err;
339 }
340
341 path = strdup(p);
342 if (!path) {
343 ERROR("strdup failed");
344 goto err;
345 }
346 }
347
348 tmp = lxc_deslashify(path);
349 if (!tmp) {
350 ERROR("Failed to remove extraneous slashes from \"%s\"",
351 path);
352 free(path);
353 goto err;
354 }
355 free(path);
356 path = tmp;
357
358 fullname = lxc_string_join(",", (const char **) controllers, false);
359 if (!fullname) {
360 ERROR("failed to join controllers");
361 free(path);
362 goto err;
363 }
364
365 ret = sprintf(buf, "%s:%s", fullname, path);
366 free(path);
367 free(fullname);
368 if (ret < 0 || ret >= sizeof(buf)) {
369 ERROR("sprintf of cgroup root arg failed");
370 goto err;
371 }
372
373 DECLARE_ARG("--cgroup-root");
374 DECLARE_ARG(buf);
375 }
376
377 if (opts->user->verbose)
378 DECLARE_ARG("-vvvvvv");
379
380 if (opts->user->action_script) {
381 DECLARE_ARG("--action-script");
382 DECLARE_ARG(opts->user->action_script);
383 }
384
385 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list,
386 opts->c->lxc_conf->lsm_aa_allow_nesting);
387 if (!mnts)
388 goto err;
389
390 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
391 char *fmt, *key, *val, *mntdata;
392 char arg[2 * PATH_MAX + 2];
393 unsigned long flags;
394
395 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
396 goto err;
397
398 free(mntdata);
399
400 /* only add --ext-mount-map for actual bind mounts */
401 if (!(flags & MS_BIND))
402 continue;
403
404 if (strcmp(opts->action, "dump") == 0) {
405 fmt = "/%s:%s";
406 key = mntent.mnt_dir;
407 val = mntent.mnt_dir;
408 } else {
409 fmt = "%s:%s";
410 key = mntent.mnt_dir;
411 val = mntent.mnt_fsname;
412 }
413
414 ret = snprintf(arg, sizeof(arg), fmt, key, val);
415 if (ret < 0 || ret >= sizeof(arg)) {
416 fclose(mnts);
417 ERROR("snprintf failed");
418 goto err;
419 }
420
421 DECLARE_ARG("--ext-mount-map");
422 DECLARE_ARG(arg);
423 }
424 fclose(mnts);
425
426 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
427 char pid[32], *freezer_relative;
428
429 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
430 goto err;
431
432 DECLARE_ARG("-t");
433 DECLARE_ARG(pid);
434
435 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
436 opts->c->config_path,
437 "freezer");
438 if (!freezer_relative) {
439 ERROR("failed getting freezer path");
440 goto err;
441 }
442
443 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
444 if (ret < 0 || ret >= sizeof(log))
445 goto err;
446
447 if (!opts->user->disable_skip_in_flight &&
448 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
449 DECLARE_ARG("--skip-in-flight");
450
451 DECLARE_ARG("--freeze-cgroup");
452 DECLARE_ARG(log);
453
454 if (opts->tty_id[0]) {
455 DECLARE_ARG("--ext-mount-map");
456 DECLARE_ARG("/dev/console:console");
457
458 DECLARE_ARG("--external");
459 DECLARE_ARG(opts->tty_id);
460 }
461
462 if (opts->user->predump_dir) {
463 DECLARE_ARG("--prev-images-dir");
464 DECLARE_ARG(opts->user->predump_dir);
465 DECLARE_ARG("--track-mem");
466 }
467
468 if (opts->user->pageserver_address && opts->user->pageserver_port) {
469 DECLARE_ARG("--page-server");
470 DECLARE_ARG("--address");
471 DECLARE_ARG(opts->user->pageserver_address);
472 DECLARE_ARG("--port");
473 DECLARE_ARG(opts->user->pageserver_port);
474 }
475
476 if (!opts->user->preserves_inodes)
477 DECLARE_ARG("--force-irmap");
478
479 if (opts->user->ghost_limit) {
480 char ghost_limit[32];
481
482 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
483 if (ret < 0 || ret >= sizeof(ghost_limit)) {
484 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
485 goto err;
486 }
487
488 DECLARE_ARG("--ghost-limit");
489 DECLARE_ARG(ghost_limit);
490 }
491
492 /* only for final dump */
493 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
494 DECLARE_ARG("--leave-running");
495 } else if (strcmp(opts->action, "restore") == 0) {
496 void *m;
497 int additional;
498 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
499
500 DECLARE_ARG("--root");
501 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
502 DECLARE_ARG("--restore-detached");
503 DECLARE_ARG("--restore-sibling");
504
505 if (ttys[0]) {
506 if (opts->console_fd < 0) {
507 ERROR("lxc.console.path configured on source host but not target");
508 goto err;
509 }
510
511 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
512 if (ret < 0 || ret >= sizeof(buf))
513 goto err;
514
515 DECLARE_ARG("--inherit-fd");
516 DECLARE_ARG(buf);
517 }
518 if (opts->console_name) {
519 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
520 SYSERROR("sprintf'd too many bytes");
521 }
522 DECLARE_ARG("--ext-mount-map");
523 DECLARE_ARG(buf);
524 }
525
526 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
527
528 if (lxc_conf->lsm_aa_profile)
529 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
530 else
531 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
532
533 if (ret < 0 || ret >= sizeof(buf))
534 goto err;
535
536 DECLARE_ARG("--lsm-profile");
537 DECLARE_ARG(buf);
538 }
539
540 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
541
542 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
543 if (!m)
544 goto err;
545 argv = m;
546
547 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
548 size_t retlen;
549 char eth[128], *veth;
550 char *fmt;
551 struct lxc_netdev *n = it->elem;
552 bool external_not_veth;
553
554 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
555 /* Since criu version 2.8 the usage of --veth-pair
556 * has been deprecated:
557 * git tag --contains f2037e6d3445fc400
558 * v2.8 */
559 external_not_veth = true;
560 } else {
561 external_not_veth = false;
562 }
563
564 if (n->name[0] != '\0') {
565 retlen = strlcpy(eth, n->name, sizeof(eth));
566 if (retlen >= sizeof(eth))
567 goto err;
568 } else {
569 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
570 if (ret < 0 || ret >= sizeof(eth))
571 goto err;
572 }
573
574 switch (n->type) {
575 case LXC_NET_VETH:
576 veth = n->priv.veth_attr.pair;
577 if (veth[0] == '\0')
578 veth = n->priv.veth_attr.veth1;
579
580 if (n->link[0] != '\0') {
581 if (external_not_veth)
582 fmt = "veth[%s]:%s@%s";
583 else
584 fmt = "%s=%s@%s";
585
586 ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
587 } else {
588 if (external_not_veth)
589 fmt = "veth[%s]:%s";
590 else
591 fmt = "%s=%s";
592
593 ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
594 }
595 if (ret < 0 || ret >= sizeof(buf))
596 goto err;
597 break;
598 case LXC_NET_MACVLAN:
599 if (n->link[0] == '\0') {
600 ERROR("no host interface for macvlan %s", n->name);
601 goto err;
602 }
603
604 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
605 if (ret < 0 || ret >= sizeof(buf))
606 goto err;
607 break;
608 case LXC_NET_NONE:
609 case LXC_NET_EMPTY:
610 break;
611 default:
612 /* we have screened for this earlier... */
613 ERROR("unexpected network type %d", n->type);
614 goto err;
615 }
616
617 if (external_not_veth)
618 DECLARE_ARG("--external");
619 else
620 DECLARE_ARG("--veth-pair");
621 DECLARE_ARG(buf);
622 netnr++;
623 }
624
625 }
626
627 argv[argc] = NULL;
628
629 buf[0] = 0;
630 pos = 0;
631
632 for (i = 0; argv[i]; i++) {
633 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
634 if (ret < 0 || ret >= sizeof(buf) - pos)
635 goto err;
636 else
637 pos += ret;
638 }
639
640 INFO("execing: %s", buf);
641
642 /* before criu inits its log, it sometimes prints things to stdout/err;
643 * let's be sure we capture that.
644 */
645 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
646 SYSERROR("dup2 stdout failed");
647 goto err;
648 }
649
650 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
651 SYSERROR("dup2 stderr failed");
652 goto err;
653 }
654
655 close(opts->pipefd);
656
657 #undef DECLARE_ARG
658 execv(argv[0], argv);
659 err:
660 for (i = 0; argv[i]; i++)
661 free(argv[i]);
662 free(argv);
663 }
664
665 /*
666 * Function to check if the checks activated in 'features_to_check' are
667 * available with the current architecture/kernel/criu combination.
668 *
669 * Parameter features_to_check is a bit mask of all features that should be
670 * checked (see feature check defines in lxc/lxccontainer.h).
671 *
672 * If the return value is true, all requested features are supported. If
673 * the return value is false the features_to_check parameter is updated
674 * to reflect which features are available. '0' means no feature but
675 * also that something went totally wrong.
676 *
677 * Some of the code flow of criu_version_ok() is duplicated and maybe it
678 * is a good candidate for refactoring.
679 */
680 bool __criu_check_feature(uint64_t *features_to_check)
681 {
682 pid_t pid;
683 uint64_t current_bit = 0;
684 int ret;
685 uint64_t features = *features_to_check;
686 /* Feature checking is currently always like
687 * criu check --feature <feature-name>
688 */
689 char *args[] = { "criu", "check", "--feature", NULL, NULL };
690
691 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
692 /* There are feature bits activated we do not understand.
693 * Refusing to answer at all */
694 *features_to_check = 0;
695 return false;
696 }
697
698 while (current_bit < (sizeof(uint64_t) * 8 - 1)) {
699 /* only test requested features */
700 if (!(features & (1ULL << current_bit))) {
701 /* skip this */
702 current_bit++;
703 continue;
704 }
705
706 pid = fork();
707 if (pid < 0) {
708 SYSERROR("fork() failed");
709 *features_to_check = 0;
710 return false;
711 }
712
713 if (pid == 0) {
714 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
715 /* This is needed for pre-dump support, which
716 * enables pre-copy migration. */
717 args[3] = "mem_dirty_track";
718 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
719 /* CRIU has two checks for userfaultfd support.
720 *
721 * The simpler check is only for 'uffd'. If the
722 * kernel supports userfaultfd without noncoop
723 * then only process can be lazily restored
724 * which do not fork. With 'uffd-noncoop'
725 * it is also possible to lazily restore processes
726 * which do fork. For a container runtime like
727 * LXC checking only for 'uffd' makes not much sense. */
728 args[3] = "uffd-noncoop";
729 else
730 _exit(EXIT_FAILURE);
731
732 null_stdfds();
733
734 execvp("criu", args);
735 SYSERROR("Failed to exec \"criu\"");
736 _exit(EXIT_FAILURE);
737 }
738
739 ret = wait_for_pid(pid);
740
741 if (ret == -1) {
742 /* It is not known why CRIU failed. Either
743 * CRIU is not available, the feature check
744 * does not exist or the feature is not
745 * supported. */
746 INFO("feature not supported");
747 /* Clear not supported feature bit */
748 features &= ~(1ULL << current_bit);
749 }
750
751 current_bit++;
752 /* no more checks requested; exit check loop */
753 if (!(features & ~((1ULL << current_bit)-1)))
754 break;
755 }
756 if (features != *features_to_check) {
757 *features_to_check = features;
758 return false;
759 }
760 return true;
761 }
762
763 /*
764 * Check to see if the criu version is recent enough for all the features we
765 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
766 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
767 * things potentially before a version is released with a particular feature.
768 *
769 * The intent is that when criu development slows down, we can drop this, but
770 * for now we shouldn't attempt to c/r with versions that we know won't work.
771 *
772 * Note: If version != NULL criu_version() stores the detected criu version in
773 * version. Allocates memory for version which must be freed by caller.
774 */
775 static bool criu_version_ok(char **version)
776 {
777 int pipes[2];
778 pid_t pid;
779
780 if (pipe(pipes) < 0) {
781 SYSERROR("pipe() failed");
782 return false;
783 }
784
785 pid = fork();
786 if (pid < 0) {
787 SYSERROR("fork() failed");
788 return false;
789 }
790
791 if (pid == 0) {
792 char *args[] = { "criu", "--version", NULL };
793 char *path;
794 close(pipes[0]);
795
796 close(STDERR_FILENO);
797 if (dup2(pipes[1], STDOUT_FILENO) < 0)
798 _exit(EXIT_FAILURE);
799
800 path = on_path("criu", NULL);
801 if (!path)
802 _exit(EXIT_FAILURE);
803
804 execv(path, args);
805 _exit(EXIT_FAILURE);
806 } else {
807 FILE *f;
808 char *tmp;
809 int patch;
810
811 close(pipes[1]);
812 if (wait_for_pid(pid) < 0) {
813 close(pipes[0]);
814 SYSERROR("execing criu failed, is it installed?");
815 return false;
816 }
817
818 f = fdopen(pipes[0], "r");
819 if (!f) {
820 close(pipes[0]);
821 return false;
822 }
823
824 tmp = malloc(1024);
825 if (!tmp) {
826 fclose(f);
827 return false;
828 }
829
830 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
831 goto version_error;
832
833 if (fgetc(f) != '\n')
834 goto version_error;
835
836 if (strcmp(tmp, CRIU_VERSION) >= 0)
837 goto version_match;
838
839 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
840 goto version_error;
841
842 if (fgetc(f) != '-')
843 goto version_error;
844
845 if (fscanf(f, "%d", &patch) != 1)
846 goto version_error;
847
848 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
849 goto version_error;
850
851 if (patch < CRIU_GITID_PATCHLEVEL)
852 goto version_error;
853
854 version_match:
855 fclose(f);
856 if (!version)
857 free(tmp);
858 else
859 *version = tmp;
860 return true;
861
862 version_error:
863 fclose(f);
864 free(tmp);
865 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
866 return false;
867 }
868 }
869
870 /* Check and make sure the container has a configuration that we know CRIU can
871 * dump. */
872 static bool criu_ok(struct lxc_container *c, char **criu_version)
873 {
874 struct lxc_list *it;
875
876 if (geteuid()) {
877 ERROR("Must be root to checkpoint");
878 return false;
879 }
880
881 if (!criu_version_ok(criu_version))
882 return false;
883
884 /* We only know how to restore containers with veth networks. */
885 lxc_list_for_each(it, &c->lxc_conf->network) {
886 struct lxc_netdev *n = it->elem;
887 switch(n->type) {
888 case LXC_NET_VETH:
889 case LXC_NET_NONE:
890 case LXC_NET_EMPTY:
891 case LXC_NET_MACVLAN:
892 break;
893 default:
894 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
895 if (criu_version) {
896 free(*criu_version);
897 *criu_version = NULL;
898 }
899 return false;
900 }
901 }
902
903 return true;
904 }
905
906 static bool restore_net_info(struct lxc_container *c)
907 {
908 int ret;
909 struct lxc_list *it;
910 bool has_error = true;
911
912 if (container_mem_lock(c))
913 return false;
914
915 lxc_list_for_each(it, &c->lxc_conf->network) {
916 struct lxc_netdev *netdev = it->elem;
917 char template[IFNAMSIZ];
918
919 if (netdev->type != LXC_NET_VETH)
920 continue;
921
922 ret = snprintf(template, sizeof(template), "vethXXXXXX");
923 if (ret < 0 || ret >= sizeof(template))
924 goto out_unlock;
925
926 if (netdev->priv.veth_attr.pair[0] == '\0' &&
927 netdev->priv.veth_attr.veth1[0] == '\0') {
928 if (!lxc_mkifname(template))
929 goto out_unlock;
930
931 (void)strlcpy(netdev->priv.veth_attr.veth1, template, IFNAMSIZ);
932 }
933 }
934
935 has_error = false;
936
937 out_unlock:
938 container_mem_unlock(c);
939 return !has_error;
940 }
941
942 /* do_restore never returns, the calling process is used as the monitor process.
943 * do_restore calls _exit() if it fails.
944 */
945 static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
946 {
947 int fd, ret;
948 pid_t pid;
949 struct lxc_handler *handler;
950 int status = 0;
951 int pipes[2] = {-1, -1};
952 struct cgroup_ops *cgroup_ops;
953
954 /* Try to detach from the current controlling tty if it exists.
955 * Othwerise, lxc_init (via lxc_console) will attach the container's
956 * console output to the current tty, which is probably not what any
957 * library user wants, and if they do, they can just manually configure
958 * it :)
959 */
960 fd = open("/dev/tty", O_RDWR);
961 if (fd >= 0) {
962 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
963 SYSERROR("couldn't detach from tty");
964 close(fd);
965 }
966
967 handler = lxc_init_handler(c->name, c->lxc_conf, c->config_path, false);
968 if (!handler)
969 goto out;
970
971 if (lxc_init(c->name, handler) < 0)
972 goto out;
973
974 cgroup_ops = cgroup_init(c->lxc_conf);
975 if (!cgroup_ops)
976 goto out_fini_handler;
977 handler->cgroup_ops = cgroup_ops;
978
979 if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
980 ERROR("failed creating groups");
981 goto out_fini_handler;
982 }
983
984 if (!restore_net_info(c)) {
985 ERROR("failed restoring network info");
986 goto out_fini_handler;
987 }
988
989 ret = resolve_clone_flags(handler);
990 if (ret < 0) {
991 SYSERROR("Unsupported clone flag specified");
992 goto out_fini_handler;
993 }
994
995 if (pipe2(pipes, O_CLOEXEC) < 0) {
996 SYSERROR("pipe() failed");
997 goto out_fini_handler;
998 }
999
1000 pid = fork();
1001 if (pid < 0)
1002 goto out_fini_handler;
1003
1004 if (pid == 0) {
1005 struct criu_opts os;
1006 struct lxc_rootfs *rootfs;
1007 int flags;
1008
1009 close(status_pipe);
1010 status_pipe = -1;
1011
1012 close(pipes[0]);
1013 pipes[0] = -1;
1014
1015 if (unshare(CLONE_NEWNS))
1016 goto out_fini_handler;
1017
1018 /* CRIU needs the lxc root bind mounted so that it is the root of some
1019 * mount. */
1020 rootfs = &c->lxc_conf->rootfs;
1021
1022 if (rootfs_is_blockdev(c->lxc_conf)) {
1023 if (lxc_setup_rootfs_prepare_root(c->lxc_conf, c->name,
1024 c->config_path) < 0)
1025 goto out_fini_handler;
1026 } else {
1027 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
1028 goto out_fini_handler;
1029
1030 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1031 SYSERROR("remount / to private failed");
1032 goto out_fini_handler;
1033 }
1034
1035 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
1036 rmdir(rootfs->mount);
1037 goto out_fini_handler;
1038 }
1039 }
1040
1041 os.pipefd = pipes[1];
1042 os.action = "restore";
1043 os.user = opts;
1044 os.c = c;
1045 os.console_fd = c->lxc_conf->console.slave;
1046 os.criu_version = criu_version;
1047 os.handler = handler;
1048
1049 if (os.console_fd >= 0) {
1050 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1051 * via --inherit-fd, so we don't want it to close.
1052 */
1053 flags = fcntl(os.console_fd, F_GETFD);
1054 if (flags < 0) {
1055 SYSERROR("F_GETFD failed: %d", os.console_fd);
1056 goto out_fini_handler;
1057 }
1058
1059 flags &= ~FD_CLOEXEC;
1060
1061 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
1062 SYSERROR("F_SETFD failed");
1063 goto out_fini_handler;
1064 }
1065 }
1066 os.console_name = c->lxc_conf->console.name;
1067
1068 /* exec_criu() returning is an error */
1069 exec_criu(cgroup_ops, c->lxc_conf, &os);
1070 umount(rootfs->mount);
1071 rmdir(rootfs->mount);
1072 goto out_fini_handler;
1073 } else {
1074 int ret;
1075 char title[2048];
1076
1077 close(pipes[1]);
1078 pipes[1] = -1;
1079
1080 pid_t w = waitpid(pid, &status, 0);
1081 if (w == -1) {
1082 SYSERROR("waitpid");
1083 goto out_fini_handler;
1084 }
1085
1086 if (WIFEXITED(status)) {
1087 char buf[4096];
1088
1089 if (WEXITSTATUS(status)) {
1090 int n;
1091
1092 n = lxc_read_nointr(pipes[0], buf, sizeof(buf));
1093 if (n < 0) {
1094 SYSERROR("failed reading from criu stderr");
1095 goto out_fini_handler;
1096 }
1097
1098 if (n == sizeof(buf))
1099 n--;
1100 buf[n] = 0;
1101
1102 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
1103 goto out_fini_handler;
1104 } else {
1105 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
1106 if (ret < 0 || ret >= sizeof(buf)) {
1107 ERROR("snprintf'd too many characters: %d", ret);
1108 goto out_fini_handler;
1109 }
1110
1111 FILE *f = fopen(buf, "r");
1112 if (!f) {
1113 SYSERROR("couldn't read restore's children file %s", buf);
1114 goto out_fini_handler;
1115 }
1116
1117 ret = fscanf(f, "%d", (int*) &handler->pid);
1118 fclose(f);
1119 if (ret != 1) {
1120 ERROR("reading restore pid failed");
1121 goto out_fini_handler;
1122 }
1123
1124 if (lxc_set_state(c->name, handler, RUNNING)) {
1125 ERROR("error setting running state after restore");
1126 goto out_fini_handler;
1127 }
1128 }
1129 } else {
1130 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
1131 goto out_fini_handler;
1132 }
1133
1134 close(pipes[0]);
1135
1136 ret = lxc_write_nointr(status_pipe, &status, sizeof(status));
1137 close(status_pipe);
1138 status_pipe = -1;
1139
1140 if (sizeof(status) != ret) {
1141 SYSERROR("failed to write all of status");
1142 goto out_fini_handler;
1143 }
1144
1145 /*
1146 * See comment in lxcapi_start; we don't care if these
1147 * fail because it's just a beauty thing. We just
1148 * assign the return here to silence potential.
1149 */
1150 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
1151 if (ret < 0 || (size_t)ret >= sizeof(title))
1152 INFO("Setting truncated process name");
1153
1154 ret = setproctitle(title);
1155 if (ret < 0)
1156 INFO("Failed to set process name");
1157
1158 ret = lxc_poll(c->name, handler);
1159 if (ret)
1160 lxc_abort(c->name, handler);
1161 lxc_fini(c->name, handler);
1162 _exit(ret);
1163 }
1164
1165 out_fini_handler:
1166 if (pipes[0] >= 0)
1167 close(pipes[0]);
1168 if (pipes[1] >= 0)
1169 close(pipes[1]);
1170
1171 lxc_fini(c->name, handler);
1172
1173 out:
1174 if (status_pipe >= 0) {
1175 /* ensure getting here was a failure, e.g. if we failed to
1176 * parse the child pid or something, even after a successful
1177 * restore
1178 */
1179 if (!status)
1180 status = 1;
1181
1182 if (lxc_write_nointr(status_pipe, &status, sizeof(status)) != sizeof(status))
1183 SYSERROR("writing status failed");
1184 close(status_pipe);
1185 }
1186
1187 _exit(EXIT_FAILURE);
1188 }
1189
1190 static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1191 {
1192 FILE *f;
1193 char path[PATH_MAX];
1194 int ret;
1195 struct stat sb;
1196
1197 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1198 tty_id[0] = 0;
1199 return 0;
1200 }
1201
1202 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1203 if (ret < 0 || ret >= sizeof(path)) {
1204 ERROR("snprintf'd too many chacters: %d", ret);
1205 return -1;
1206 }
1207
1208 ret = stat(path, &sb);
1209 if (ret < 0) {
1210 SYSERROR("stat of %s failed", path);
1211 return -1;
1212 }
1213
1214 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1215 if (ret < 0 || ret >= sizeof(path)) {
1216 ERROR("snprintf'd too many characters: %d", ret);
1217 return -1;
1218 }
1219
1220 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1221 (long long unsigned) sb.st_rdev,
1222 (long long unsigned) sb.st_dev);
1223 if (ret < 0 || ret >= sizeof(path)) {
1224 ERROR("snprintf'd too many characters: %d", ret);
1225 return -1;
1226 }
1227
1228 f = fopen(path, "w");
1229 if (!f) {
1230 SYSERROR("failed to open %s", path);
1231 return -1;
1232 }
1233
1234 ret = fprintf(f, "%s", tty_id);
1235 fclose(f);
1236 if (ret < 0)
1237 SYSERROR("failed to write to %s", path);
1238 return ret;
1239 }
1240
1241 /* do one of either predump or a regular dump */
1242 static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
1243 {
1244 int ret;
1245 pid_t pid;
1246 int criuout[2];
1247 char *criu_version = NULL;
1248
1249 if (!criu_ok(c, &criu_version))
1250 return false;
1251
1252 ret = pipe(criuout);
1253 if (ret < 0) {
1254 SYSERROR("pipe() failed");
1255 free(criu_version);
1256 return false;
1257 }
1258
1259 if (mkdir_p(opts->directory, 0700) < 0)
1260 goto fail;
1261
1262 pid = fork();
1263 if (pid < 0) {
1264 SYSERROR("fork failed");
1265 goto fail;
1266 }
1267
1268 if (pid == 0) {
1269 struct criu_opts os;
1270 struct cgroup_ops *cgroup_ops;
1271
1272 close(criuout[0]);
1273
1274 cgroup_ops = cgroup_init(c->lxc_conf);
1275 if (!cgroup_ops) {
1276 ERROR("failed to cgroup_init()");
1277 _exit(EXIT_FAILURE);
1278 return -1;
1279 }
1280
1281 os.pipefd = criuout[1];
1282 os.action = mode;
1283 os.user = opts;
1284 os.c = c;
1285 os.console_name = c->lxc_conf->console.path;
1286 os.criu_version = criu_version;
1287 os.handler = NULL;
1288
1289 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1290 if (ret < 0) {
1291 free(criu_version);
1292 _exit(EXIT_FAILURE);
1293 }
1294
1295 /* exec_criu() returning is an error */
1296 exec_criu(cgroup_ops, c->lxc_conf, &os);
1297 free(criu_version);
1298 _exit(EXIT_FAILURE);
1299 } else {
1300 int status;
1301 ssize_t n;
1302 char buf[4096];
1303 bool ret;
1304
1305 close(criuout[1]);
1306
1307 pid_t w = waitpid(pid, &status, 0);
1308 if (w == -1) {
1309 SYSERROR("waitpid");
1310 close(criuout[0]);
1311 free(criu_version);
1312 return false;
1313 }
1314
1315 n = lxc_read_nointr(criuout[0], buf, sizeof(buf));
1316 close(criuout[0]);
1317 if (n < 0) {
1318 SYSERROR("read");
1319 n = 0;
1320 }
1321
1322 if (n == sizeof(buf))
1323 buf[n-1] = 0;
1324 else
1325 buf[n] = 0;
1326
1327 if (WIFEXITED(status)) {
1328 if (WEXITSTATUS(status)) {
1329 ERROR("dump failed with %d", WEXITSTATUS(status));
1330 ret = false;
1331 } else {
1332 ret = true;
1333 }
1334 } else if (WIFSIGNALED(status)) {
1335 ERROR("dump signaled with %d", WTERMSIG(status));
1336 ret = false;
1337 } else {
1338 ERROR("unknown dump exit %d", status);
1339 ret = false;
1340 }
1341
1342 if (!ret)
1343 ERROR("criu output: %s", buf);
1344
1345 free(criu_version);
1346 return ret;
1347 }
1348 fail:
1349 close(criuout[0]);
1350 close(criuout[1]);
1351 rmdir(opts->directory);
1352 free(criu_version);
1353 return false;
1354 }
1355
/* Run do_dump() for container @c with the "pre-dump" action and report
 * its result unchanged.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool ok;

	ok = do_dump(c, "pre-dump", opts);

	return ok;
}
1360
1361 bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
1362 {
1363 char path[PATH_MAX];
1364 int ret;
1365
1366 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
1367 if (ret < 0 || ret >= sizeof(path))
1368 return false;
1369
1370 if (access(path, F_OK) == 0) {
1371 ERROR("please use a fresh directory for the dump directory");
1372 return false;
1373 }
1374
1375 return do_dump(c, "dump", opts);
1376 }
1377
/*
 * Restore container @c using the migration options in @opts.
 *
 * Requires an effective uid of 0. Forks a child that runs do_restore() and
 * reports an int status back through a pipe; the parent reads that status
 * and interprets it with the wait-status macros.
 *
 * Returns true when the reported status says "exited with 0". In every other
 * case the child is reaped with wait_for_pid() and false is returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status;
	ssize_t nread;
	int pipefd[2];
	char *criu_version = NULL;

	if (geteuid()) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(pipefd)) {
		SYSERROR("failed to create pipe");
		return false;
	}

	if (!criu_ok(c, &criu_version)) {
		close(pipefd[0]);
		close(pipefd[1]);
		return false;
	}

	pid = fork();
	if (pid < 0) {
		SYSERROR("fork failed");
		close(pipefd[0]);
		close(pipefd[1]);
		free(criu_version);
		return false;
	}

	if (pid == 0) {
		close(pipefd[0]);
		/* this never returns */
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);
	free(criu_version);

	nread = lxc_read_nointr(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (nread != (ssize_t)sizeof(status)) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	/* If the criu process was killed or exited nonzero, wait() for the
	 * handler, since the restore process died. Otherwise, we don't need to
	 * wait, since the child becomes the monitor process.
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;

	return true;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");
	return false;
}