]>
git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
7 #include <linux/limits.h>
12 #include <sys/mount.h>
13 #include <sys/types.h>
27 #include "syscall_wrappers.h"
31 #include <../include/lxcmntent.h>
37 #include "include/strlcpy.h"
40 #define CRIU_VERSION "2.0"
42 #define CRIU_GITID_VERSION "2.0"
43 #define CRIU_GITID_PATCHLEVEL 0
45 #define CRIU_IN_FLIGHT_SUPPORT "2.4"
46 #define CRIU_EXTERNAL_NOT_VETH "2.8"
48 lxc_log_define(criu
, lxc
);
51 /* the thing to hook to stdout and stderr for logging */
54 /* The type of criu invocation, one of "dump" or "restore" */
57 /* the user-provided migrate options relevant to this action */
58 struct migrate_opts
*user
;
60 /* The container to dump */
61 struct lxc_container
*c
;
63 /* dump: stop the container or not after dumping? */
64 char tty_id
[32]; /* the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]" */
66 /* restore: the file to write the init process' pid into */
67 struct lxc_handler
*handler
;
69 /* The path that is bind mounted from /dev/console, if any. We don't
70 * want to use `--ext-mount-map auto`'s result here because the pts
71 * device may have a different path (e.g. if the pty number is
72 * different) on the target host. NULL if lxc.console.path = "none".
76 /* The detected version of criu */
80 static int load_tty_major_minor(char *directory
, char *output
, int len
)
86 ret
= snprintf(path
, sizeof(path
), "%s/tty.info", directory
);
87 if (ret
< 0 || ret
>= sizeof(path
)) {
88 ERROR("snprintf'd too many characters: %d", ret
);
92 f
= fopen(path
, "re");
94 /* This means we're coming from a liblxc which didn't export
95 * the tty info. In this case they had to have lxc.console.path
96 * = * none, so there's no problem restoring.
101 SYSERROR("couldn't open %s", path
);
105 if (!fgets(output
, len
, f
)) {
107 SYSERROR("couldn't read %s", path
);
115 static int cmp_version(const char *v1
, const char *v2
)
118 int oct_v1
[3], oct_v2
[3];
120 memset(oct_v1
, -1, sizeof(oct_v1
));
121 memset(oct_v2
, -1, sizeof(oct_v2
));
123 ret
= sscanf(v1
, "%d.%d.%d", &oct_v1
[0], &oct_v1
[1], &oct_v1
[2]);
127 ret
= sscanf(v2
, "%d.%d.%d", &oct_v2
[0], &oct_v2
[1], &oct_v2
[2]);
131 /* Major version is greater. */
132 if (oct_v1
[0] > oct_v2
[0])
135 if (oct_v1
[0] < oct_v2
[0])
138 /* Minor number is greater.*/
139 if (oct_v1
[1] > oct_v2
[1])
142 if (oct_v1
[1] < oct_v2
[1])
145 /* Patch number is greater. */
146 if (oct_v1
[2] > oct_v2
[2])
149 /* Patch numbers are equal. */
150 if (oct_v1
[2] == oct_v2
[2])
156 static void exec_criu(struct cgroup_ops
*cgroup_ops
, struct lxc_conf
*conf
,
157 struct criu_opts
*opts
)
159 char **argv
, log
[PATH_MAX
];
160 int static_args
= 23, argc
= 0, i
, ret
;
164 struct mntent mntent
;
166 char buf
[4096], ttys
[32];
169 /* If we are currently in a cgroup /foo/bar, and the container is in a
170 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
171 * container has an open fd that points to one of the cgroup files
172 * (systemd always opens its "root" cgroup). So, let's escape to the
173 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
176 if (!cgroup_ops
->escape(cgroup_ops
, conf
)) {
177 ERROR("failed to escape cgroups");
181 /* The command line always looks like:
182 * criu $(action) --tcp-established --file-locks --link-remap \
183 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
184 * -o $(directory)/$(action).log --ext-mount-map auto
185 * --enable-external-sharing --enable-external-masters
186 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
187 * +1 for final NULL */
189 if (strcmp(opts
->action
, "dump") == 0 || strcmp(opts
->action
, "pre-dump") == 0) {
190 /* -t pid --freeze-cgroup /lxc/ct */
193 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
194 if (opts
->user
->predump_dir
)
197 /* --page-server --address <address> --port <port> */
198 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
)
201 /* --leave-running (only for final dump) */
202 if (strcmp(opts
->action
, "dump") == 0 && !opts
->user
->stop
)
205 /* --external tty[88,4] */
210 if (!opts
->user
->preserves_inodes
)
213 /* --ghost-limit 1024 */
214 if (opts
->user
->ghost_limit
)
216 } else if (strcmp(opts
->action
, "restore") == 0) {
217 /* --root $(lxc_mount_point) --restore-detached
219 * --lsm-profile apparmor:whatever
224 if (load_tty_major_minor(opts
->user
->directory
, ttys
, sizeof(ttys
)))
227 /* --inherit-fd fd[%d]:tty[%s] */
234 if (cgroup_ops
->num_hierarchies(cgroup_ops
) > 0)
235 static_args
+= 2 * cgroup_ops
->num_hierarchies(cgroup_ops
);
237 if (opts
->user
->verbose
)
240 if (opts
->user
->action_script
)
243 static_args
+= 2 * lxc_list_len(&opts
->c
->lxc_conf
->mount_list
);
245 ret
= snprintf(log
, PATH_MAX
, "%s/%s.log", opts
->user
->directory
, opts
->action
);
246 if (ret
< 0 || ret
>= PATH_MAX
) {
247 ERROR("logfile name too long");
251 argv
= malloc(static_args
* sizeof(*argv
));
255 memset(argv
, 0, static_args
* sizeof(*argv
));
257 #define DECLARE_ARG(arg) \
260 ERROR("Got NULL argument for criu"); \
263 argv[argc++] = strdup(arg); \
268 argv
[argc
++] = on_path("criu", NULL
);
270 ERROR("Couldn't find criu binary");
274 DECLARE_ARG(opts
->action
);
275 DECLARE_ARG("--tcp-established");
276 DECLARE_ARG("--file-locks");
277 DECLARE_ARG("--link-remap");
278 DECLARE_ARG("--manage-cgroups=full");
279 DECLARE_ARG("--ext-mount-map");
281 DECLARE_ARG("--enable-external-sharing");
282 DECLARE_ARG("--enable-external-masters");
283 DECLARE_ARG("--enable-fs");
284 DECLARE_ARG("hugetlbfs");
285 DECLARE_ARG("--enable-fs");
286 DECLARE_ARG("tracefs");
288 DECLARE_ARG(opts
->user
->directory
);
292 for (i
= 0; i
< cgroup_ops
->num_hierarchies(cgroup_ops
); i
++) {
293 char **controllers
= NULL
, *fullname
;
296 if (!cgroup_ops
->get_hierarchies(cgroup_ops
, i
, &controllers
)) {
297 ERROR("failed to get hierarchy %d", i
);
301 /* if we are in a dump, we have to ask the monitor process what
302 * the right cgroup is. if this is a restore, we can just use
303 * the handler the restore task created.
305 if (!strcmp(opts
->action
, "dump") || !strcmp(opts
->action
, "pre-dump")) {
306 path
= lxc_cmd_get_cgroup_path(opts
->c
->name
, opts
->c
->config_path
, controllers
[0]);
308 ERROR("failed to get cgroup path for %s", controllers
[0]);
314 p
= cgroup_ops
->get_cgroup(cgroup_ops
, controllers
[0]);
316 ERROR("failed to get cgroup path for %s", controllers
[0]);
322 ERROR("strdup failed");
327 tmp
= lxc_deslashify(path
);
329 ERROR("Failed to remove extraneous slashes from \"%s\"",
337 fullname
= lxc_string_join(",", (const char **) controllers
, false);
339 ERROR("failed to join controllers");
344 ret
= sprintf(buf
, "%s:%s", fullname
, path
);
347 if (ret
< 0 || ret
>= sizeof(buf
)) {
348 ERROR("sprintf of cgroup root arg failed");
352 DECLARE_ARG("--cgroup-root");
356 if (opts
->user
->verbose
)
359 if (opts
->user
->action_script
) {
360 DECLARE_ARG("--action-script");
361 DECLARE_ARG(opts
->user
->action_script
);
364 mnts
= make_anonymous_mount_file(&opts
->c
->lxc_conf
->mount_list
,
365 opts
->c
->lxc_conf
->lsm_aa_allow_nesting
);
369 while (getmntent_r(mnts
, &mntent
, buf
, sizeof(buf
))) {
370 unsigned long flags
= 0;
371 char *mntdata
= NULL
;
372 char arg
[2 * PATH_MAX
+ 2];
374 if (parse_mntopts(mntent
.mnt_opts
, &flags
, &mntdata
) < 0)
379 /* only add --ext-mount-map for actual bind mounts */
380 if (!(flags
& MS_BIND
))
383 if (strcmp(opts
->action
, "dump") == 0)
384 ret
= snprintf(arg
, sizeof(arg
), "/%s:%s",
385 mntent
.mnt_dir
, mntent
.mnt_dir
);
387 ret
= snprintf(arg
, sizeof(arg
), "%s:%s",
388 mntent
.mnt_dir
, mntent
.mnt_fsname
);
389 if (ret
< 0 || ret
>= sizeof(arg
)) {
391 ERROR("snprintf failed");
395 DECLARE_ARG("--ext-mount-map");
400 if (strcmp(opts
->action
, "dump") == 0 || strcmp(opts
->action
, "pre-dump") == 0) {
401 char pid
[32], *freezer_relative
;
403 if (sprintf(pid
, "%d", opts
->c
->init_pid(opts
->c
)) < 0)
409 freezer_relative
= lxc_cmd_get_cgroup_path(opts
->c
->name
,
410 opts
->c
->config_path
,
412 if (!freezer_relative
) {
413 ERROR("failed getting freezer path");
417 ret
= snprintf(log
, sizeof(log
), "/sys/fs/cgroup/freezer/%s", freezer_relative
);
418 if (ret
< 0 || ret
>= sizeof(log
))
421 if (!opts
->user
->disable_skip_in_flight
&&
422 strcmp(opts
->criu_version
, CRIU_IN_FLIGHT_SUPPORT
) >= 0)
423 DECLARE_ARG("--skip-in-flight");
425 DECLARE_ARG("--freeze-cgroup");
428 if (opts
->tty_id
[0]) {
429 DECLARE_ARG("--ext-mount-map");
430 DECLARE_ARG("/dev/console:console");
432 DECLARE_ARG("--external");
433 DECLARE_ARG(opts
->tty_id
);
436 if (opts
->user
->predump_dir
) {
437 DECLARE_ARG("--prev-images-dir");
438 DECLARE_ARG(opts
->user
->predump_dir
);
439 DECLARE_ARG("--track-mem");
442 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
) {
443 DECLARE_ARG("--page-server");
444 DECLARE_ARG("--address");
445 DECLARE_ARG(opts
->user
->pageserver_address
);
446 DECLARE_ARG("--port");
447 DECLARE_ARG(opts
->user
->pageserver_port
);
450 if (!opts
->user
->preserves_inodes
)
451 DECLARE_ARG("--force-irmap");
453 if (opts
->user
->ghost_limit
) {
454 char ghost_limit
[32];
456 ret
= sprintf(ghost_limit
, "%"PRIu64
, opts
->user
->ghost_limit
);
457 if (ret
< 0 || ret
>= sizeof(ghost_limit
)) {
458 ERROR("failed to print ghost limit %"PRIu64
, opts
->user
->ghost_limit
);
462 DECLARE_ARG("--ghost-limit");
463 DECLARE_ARG(ghost_limit
);
466 /* only for final dump */
467 if (strcmp(opts
->action
, "dump") == 0 && !opts
->user
->stop
)
468 DECLARE_ARG("--leave-running");
469 } else if (strcmp(opts
->action
, "restore") == 0) {
472 struct lxc_conf
*lxc_conf
= opts
->c
->lxc_conf
;
474 DECLARE_ARG("--root");
475 DECLARE_ARG(opts
->c
->lxc_conf
->rootfs
.mount
);
476 DECLARE_ARG("--restore-detached");
477 DECLARE_ARG("--restore-sibling");
480 if (opts
->console_fd
< 0) {
481 ERROR("lxc.console.path configured on source host but not target");
485 ret
= snprintf(buf
, sizeof(buf
), "fd[%d]:%s", opts
->console_fd
, ttys
);
486 if (ret
< 0 || ret
>= sizeof(buf
))
489 DECLARE_ARG("--inherit-fd");
492 if (opts
->console_name
) {
493 if (snprintf(buf
, sizeof(buf
), "console:%s", opts
->console_name
) < 0) {
494 SYSERROR("sprintf'd too many bytes");
496 DECLARE_ARG("--ext-mount-map");
500 if (lxc_conf
->lsm_aa_profile
|| lxc_conf
->lsm_se_context
) {
502 if (lxc_conf
->lsm_aa_profile
)
503 ret
= snprintf(buf
, sizeof(buf
), "apparmor:%s", lxc_conf
->lsm_aa_profile
);
505 ret
= snprintf(buf
, sizeof(buf
), "selinux:%s", lxc_conf
->lsm_se_context
);
507 if (ret
< 0 || ret
>= sizeof(buf
))
510 DECLARE_ARG("--lsm-profile");
514 additional
= lxc_list_len(&opts
->c
->lxc_conf
->network
) * 2;
516 m
= realloc(argv
, (argc
+ additional
+ 1) * sizeof(*argv
));
521 lxc_list_for_each(it
, &opts
->c
->lxc_conf
->network
) {
523 char eth
[128], *veth
;
524 struct lxc_netdev
*n
= it
->elem
;
525 bool external_not_veth
;
527 if (cmp_version(opts
->criu_version
, CRIU_EXTERNAL_NOT_VETH
) >= 0) {
528 /* Since criu version 2.8 the usage of --veth-pair
529 * has been deprecated:
530 * git tag --contains f2037e6d3445fc400
532 external_not_veth
= true;
534 external_not_veth
= false;
537 if (n
->name
[0] != '\0') {
538 retlen
= strlcpy(eth
, n
->name
, sizeof(eth
));
539 if (retlen
>= sizeof(eth
))
542 ret
= snprintf(eth
, sizeof(eth
), "eth%d", netnr
);
543 if (ret
< 0 || ret
>= sizeof(eth
))
549 veth
= n
->priv
.veth_attr
.pair
;
551 veth
= n
->priv
.veth_attr
.veth1
;
553 if (n
->link
[0] != '\0') {
554 if (external_not_veth
)
555 ret
= snprintf(buf
, sizeof(buf
),
560 ret
= snprintf(buf
, sizeof(buf
),
564 if (external_not_veth
)
565 ret
= snprintf(buf
, sizeof(buf
),
569 ret
= snprintf(buf
, sizeof(buf
),
573 if (ret
< 0 || ret
>= sizeof(buf
))
576 case LXC_NET_MACVLAN
:
577 if (n
->link
[0] == '\0') {
578 ERROR("no host interface for macvlan %s", n
->name
);
582 ret
= snprintf(buf
, sizeof(buf
), "macvlan[%s]:%s", eth
, n
->link
);
583 if (ret
< 0 || ret
>= sizeof(buf
))
590 /* we have screened for this earlier... */
591 ERROR("unexpected network type %d", n
->type
);
595 if (external_not_veth
)
596 DECLARE_ARG("--external");
598 DECLARE_ARG("--veth-pair");
610 for (i
= 0; argv
[i
]; i
++) {
611 ret
= snprintf(buf
+ pos
, sizeof(buf
) - pos
, "%s ", argv
[i
]);
612 if (ret
< 0 || ret
>= sizeof(buf
) - pos
)
618 INFO("execing: %s", buf
);
620 /* before criu inits its log, it sometimes prints things to stdout/err;
621 * let's be sure we capture that.
623 if (dup2(opts
->pipefd
, STDOUT_FILENO
) < 0) {
624 SYSERROR("dup2 stdout failed");
628 if (dup2(opts
->pipefd
, STDERR_FILENO
) < 0) {
629 SYSERROR("dup2 stderr failed");
636 execv(argv
[0], argv
);
638 for (i
= 0; argv
[i
]; i
++)
644 * Function to check if the checks activated in 'features_to_check' are
645 * available with the current architecture/kernel/criu combination.
647 * Parameter features_to_check is a bit mask of all features that should be
648 * checked (see feature check defines in lxc/lxccontainer.h).
650 * If the return value is true, all requested features are supported. If
651 * the return value is false the features_to_check parameter is updated
652 * to reflect which features are available. '0' means no feature but
653 * also that something went totally wrong.
655 * Some of the code flow of criu_version_ok() is duplicated and maybe it
656 * is a good candidate for refactoring.
658 bool __criu_check_feature(uint64_t *features_to_check
)
661 uint64_t current_bit
= 0;
663 uint64_t features
= *features_to_check
;
664 /* Feature checking is currently always like
665 * criu check --feature <feature-name>
667 char *args
[] = { "criu", "check", "--feature", NULL
, NULL
};
669 if ((features
& ~FEATURE_MEM_TRACK
& ~FEATURE_LAZY_PAGES
) != 0) {
670 /* There are feature bits activated we do not understand.
671 * Refusing to answer at all */
672 *features_to_check
= 0;
676 while (current_bit
< (sizeof(uint64_t) * 8 - 1)) {
677 /* only test requested features */
678 if (!(features
& (1ULL << current_bit
))) {
686 SYSERROR("fork() failed");
687 *features_to_check
= 0;
692 if ((1ULL << current_bit
) == FEATURE_MEM_TRACK
)
693 /* This is needed for pre-dump support, which
694 * enables pre-copy migration. */
695 args
[3] = "mem_dirty_track";
696 else if ((1ULL << current_bit
) == FEATURE_LAZY_PAGES
)
697 /* CRIU has two checks for userfaultfd support.
699 * The simpler check is only for 'uffd'. If the
700 * kernel supports userfaultfd without noncoop
701 * then only process can be lazily restored
702 * which do not fork. With 'uffd-noncoop'
703 * it is also possible to lazily restore processes
704 * which do fork. For a container runtime like
705 * LXC checking only for 'uffd' makes not much sense. */
706 args
[3] = "uffd-noncoop";
712 execvp("criu", args
);
713 SYSERROR("Failed to exec \"criu\"");
717 ret
= wait_for_pid(pid
);
720 /* It is not known why CRIU failed. Either
721 * CRIU is not available, the feature check
722 * does not exist or the feature is not
724 INFO("feature not supported");
725 /* Clear not supported feature bit */
726 features
&= ~(1ULL << current_bit
);
730 /* no more checks requested; exit check loop */
731 if (!(features
& ~((1ULL << current_bit
)-1)))
734 if (features
!= *features_to_check
) {
735 *features_to_check
= features
;
742 * Check to see if the criu version is recent enough for all the features we
743 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
744 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
745 * things potentially before a version is released with a particular feature.
747 * The intent is that when criu development slows down, we can drop this, but
748 * for now we shouldn't attempt to c/r with versions that we know won't work.
750 * Note: If version != NULL criu_version() stores the detected criu version in
751 * version. Allocates memory for version which must be freed by caller.
753 static bool criu_version_ok(char **version
)
758 if (pipe(pipes
) < 0) {
759 SYSERROR("pipe() failed");
765 SYSERROR("fork() failed");
770 char *args
[] = { "criu", "--version", NULL
};
774 close(STDERR_FILENO
);
775 if (dup2(pipes
[1], STDOUT_FILENO
) < 0)
778 path
= on_path("criu", NULL
);
790 if (wait_for_pid(pid
) < 0) {
792 SYSERROR("execing criu failed, is it installed?");
796 f
= fdopen(pipes
[0], "re");
808 if (fscanf(f
, "Version: %1023[^\n]s", tmp
) != 1)
811 if (fgetc(f
) != '\n')
814 if (strcmp(tmp
, CRIU_VERSION
) >= 0)
817 if (fscanf(f
, "GitID: v%1023[^-]s", tmp
) != 1)
823 if (fscanf(f
, "%d", &patch
) != 1)
826 if (strcmp(tmp
, CRIU_GITID_VERSION
) < 0)
829 if (patch
< CRIU_GITID_PATCHLEVEL
)
843 ERROR("must have criu " CRIU_VERSION
" or greater to checkpoint/restore");
848 /* Check and make sure the container has a configuration that we know CRIU can
850 static bool criu_ok(struct lxc_container
*c
, char **criu_version
)
855 ERROR("Must be root to checkpoint");
859 if (!criu_version_ok(criu_version
))
862 /* We only know how to restore containers with veth networks. */
863 lxc_list_for_each(it
, &c
->lxc_conf
->network
) {
864 struct lxc_netdev
*n
= it
->elem
;
869 case LXC_NET_MACVLAN
:
872 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n
->type
), n
->name
);
875 *criu_version
= NULL
;
884 static bool restore_net_info(struct lxc_container
*c
)
888 bool has_error
= true;
890 if (container_mem_lock(c
))
893 lxc_list_for_each(it
, &c
->lxc_conf
->network
) {
894 struct lxc_netdev
*netdev
= it
->elem
;
895 char template[IFNAMSIZ
];
897 if (netdev
->type
!= LXC_NET_VETH
)
900 ret
= snprintf(template, sizeof(template), "vethXXXXXX");
901 if (ret
< 0 || ret
>= sizeof(template))
904 if (netdev
->priv
.veth_attr
.pair
[0] == '\0' &&
905 netdev
->priv
.veth_attr
.veth1
[0] == '\0') {
906 if (!lxc_ifname_alnum_case_sensitive(template))
909 (void)strlcpy(netdev
->priv
.veth_attr
.veth1
, template, IFNAMSIZ
);
916 container_mem_unlock(c
);
920 /* do_restore never returns, the calling process is used as the monitor process.
921 * do_restore calls _exit() if it fails.
923 static void do_restore(struct lxc_container
*c
, int status_pipe
, struct migrate_opts
*opts
, char *criu_version
)
927 struct lxc_handler
*handler
;
929 int pipes
[2] = {-1, -1};
930 struct cgroup_ops
*cgroup_ops
;
932 /* Try to detach from the current controlling tty if it exists.
933 * Otherwise, lxc_init (via lxc_console) will attach the container's
934 * console output to the current tty, which is probably not what any
935 * library user wants, and if they do, they can just manually configure
938 fd
= open("/dev/tty", O_RDWR
);
940 if (ioctl(fd
, TIOCNOTTY
, NULL
) < 0)
941 SYSERROR("couldn't detach from tty");
945 handler
= lxc_init_handler(c
->name
, c
->lxc_conf
, c
->config_path
, false);
949 if (lxc_init(c
->name
, handler
) < 0)
952 cgroup_ops
= cgroup_init(c
->lxc_conf
);
954 goto out_fini_handler
;
955 handler
->cgroup_ops
= cgroup_ops
;
957 if (!cgroup_ops
->payload_create(cgroup_ops
, handler
)) {
958 ERROR("failed creating groups");
959 goto out_fini_handler
;
962 if (!restore_net_info(c
)) {
963 ERROR("failed restoring network info");
964 goto out_fini_handler
;
967 ret
= resolve_clone_flags(handler
);
969 SYSERROR("Unsupported clone flag specified");
970 goto out_fini_handler
;
973 if (pipe2(pipes
, O_CLOEXEC
) < 0) {
974 SYSERROR("pipe() failed");
975 goto out_fini_handler
;
980 goto out_fini_handler
;
984 struct lxc_rootfs
*rootfs
;
993 if (unshare(CLONE_NEWNS
))
994 goto out_fini_handler
;
996 /* CRIU needs the lxc root bind mounted so that it is the root of some
998 rootfs
= &c
->lxc_conf
->rootfs
;
1000 if (rootfs_is_blockdev(c
->lxc_conf
)) {
1001 if (lxc_setup_rootfs_prepare_root(c
->lxc_conf
, c
->name
,
1002 c
->config_path
) < 0)
1003 goto out_fini_handler
;
1005 if (mkdir(rootfs
->mount
, 0755) < 0 && errno
!= EEXIST
)
1006 goto out_fini_handler
;
1008 if (mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
) < 0) {
1009 SYSERROR("remount / to private failed");
1010 goto out_fini_handler
;
1013 if (mount(rootfs
->path
, rootfs
->mount
, NULL
, MS_BIND
, NULL
) < 0) {
1014 rmdir(rootfs
->mount
);
1015 goto out_fini_handler
;
1019 os
.pipefd
= pipes
[1];
1020 os
.action
= "restore";
1023 os
.console_fd
= c
->lxc_conf
->console
.slave
;
1024 os
.criu_version
= criu_version
;
1025 os
.handler
= handler
;
1027 if (os
.console_fd
>= 0) {
1028 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1029 * via --inherit-fd, so we don't want it to close.
1031 flags
= fcntl(os
.console_fd
, F_GETFD
);
1033 SYSERROR("F_GETFD failed: %d", os
.console_fd
);
1034 goto out_fini_handler
;
1037 flags
&= ~FD_CLOEXEC
;
1039 if (fcntl(os
.console_fd
, F_SETFD
, flags
) < 0) {
1040 SYSERROR("F_SETFD failed");
1041 goto out_fini_handler
;
1044 os
.console_name
= c
->lxc_conf
->console
.name
;
1046 /* exec_criu() returning is an error */
1047 exec_criu(cgroup_ops
, c
->lxc_conf
, &os
);
1048 umount(rootfs
->mount
);
1049 rmdir(rootfs
->mount
);
1050 goto out_fini_handler
;
1057 pid_t w
= waitpid(pid
, &status
, 0);
1059 SYSERROR("waitpid");
1060 goto out_fini_handler
;
1063 if (WIFEXITED(status
)) {
1066 if (WEXITSTATUS(status
)) {
1069 n
= lxc_read_nointr(pipes
[0], buf
, sizeof(buf
));
1071 SYSERROR("failed reading from criu stderr");
1072 goto out_fini_handler
;
1075 if (n
== sizeof(buf
))
1079 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status
), buf
);
1080 goto out_fini_handler
;
1082 ret
= snprintf(buf
, sizeof(buf
), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid
));
1083 if (ret
< 0 || ret
>= sizeof(buf
)) {
1084 ERROR("snprintf'd too many characters: %d", ret
);
1085 goto out_fini_handler
;
1088 FILE *f
= fopen(buf
, "re");
1090 SYSERROR("couldn't read restore's children file %s", buf
);
1091 goto out_fini_handler
;
1094 ret
= fscanf(f
, "%d", (int*) &handler
->pid
);
1097 ERROR("reading restore pid failed");
1098 goto out_fini_handler
;
1101 if (lxc_set_state(c
->name
, handler
, RUNNING
)) {
1102 ERROR("error setting running state after restore");
1103 goto out_fini_handler
;
1107 ERROR("CRIU was killed with signal %d", WTERMSIG(status
));
1108 goto out_fini_handler
;
1113 ret
= lxc_write_nointr(status_pipe
, &status
, sizeof(status
));
1117 if (sizeof(status
) != ret
) {
1118 SYSERROR("failed to write all of status");
1119 goto out_fini_handler
;
1123 * See comment in lxcapi_start; we don't care if these
1124 * fail because it's just a beauty thing. We just
1125 * assign the return here to silence potential.
1127 ret
= snprintf(title
, sizeof(title
), "[lxc monitor] %s %s", c
->config_path
, c
->name
);
1128 if (ret
< 0 || (size_t)ret
>= sizeof(title
))
1129 INFO("Setting truncated process name");
1131 ret
= setproctitle(title
);
1133 INFO("Failed to set process name");
1135 ret
= lxc_poll(c
->name
, handler
);
1151 if (status_pipe
>= 0) {
1152 /* ensure getting here was a failure, e.g. if we failed to
1153 * parse the child pid or something, even after a successful
1159 if (lxc_write_nointr(status_pipe
, &status
, sizeof(status
)) != sizeof(status
))
1160 SYSERROR("writing status failed");
1164 _exit(EXIT_FAILURE
);
1167 static int save_tty_major_minor(char *directory
, struct lxc_container
*c
, char *tty_id
, int len
)
1170 char path
[PATH_MAX
];
1174 if (c
->lxc_conf
->console
.path
&& !strcmp(c
->lxc_conf
->console
.path
, "none")) {
1179 ret
= snprintf(path
, sizeof(path
), "/proc/%d/root/dev/console", c
->init_pid(c
));
1180 if (ret
< 0 || ret
>= sizeof(path
)) {
1181 ERROR("snprintf'd too many characters: %d", ret
);
1185 ret
= stat(path
, &sb
);
1187 SYSERROR("stat of %s failed", path
);
1191 ret
= snprintf(path
, sizeof(path
), "%s/tty.info", directory
);
1192 if (ret
< 0 || ret
>= sizeof(path
)) {
1193 ERROR("snprintf'd too many characters: %d", ret
);
1197 ret
= snprintf(tty_id
, len
, "tty[%llx:%llx]",
1198 (long long unsigned) sb
.st_rdev
,
1199 (long long unsigned) sb
.st_dev
);
1200 if (ret
< 0 || ret
>= sizeof(path
)) {
1201 ERROR("snprintf'd too many characters: %d", ret
);
1205 f
= fopen(path
, "we");
1207 SYSERROR("failed to open %s", path
);
1211 ret
= fprintf(f
, "%s", tty_id
);
1214 SYSERROR("failed to write to %s", path
);
1218 /* do one of either predump or a regular dump */
1219 static bool do_dump(struct lxc_container
*c
, char *mode
, struct migrate_opts
*opts
)
1224 char *criu_version
= NULL
;
1226 if (!criu_ok(c
, &criu_version
))
1229 ret
= pipe(criuout
);
1231 SYSERROR("pipe() failed");
1236 if (mkdir_p(opts
->directory
, 0700) < 0)
1241 SYSERROR("fork failed");
1246 struct criu_opts os
;
1247 struct cgroup_ops
*cgroup_ops
;
1251 cgroup_ops
= cgroup_init(c
->lxc_conf
);
1253 ERROR("failed to cgroup_init()");
1254 _exit(EXIT_FAILURE
);
1257 os
.pipefd
= criuout
[1];
1261 os
.console_name
= c
->lxc_conf
->console
.path
;
1262 os
.criu_version
= criu_version
;
1265 ret
= save_tty_major_minor(opts
->directory
, c
, os
.tty_id
, sizeof(os
.tty_id
));
1268 _exit(EXIT_FAILURE
);
1271 /* exec_criu() returning is an error */
1272 exec_criu(cgroup_ops
, c
->lxc_conf
, &os
);
1274 _exit(EXIT_FAILURE
);
1282 pid_t w
= waitpid(pid
, &status
, 0);
1284 SYSERROR("waitpid");
1290 n
= lxc_read_nointr(criuout
[0], buf
, sizeof(buf
));
1297 if (n
== sizeof(buf
))
1302 if (WIFEXITED(status
)) {
1303 if (WEXITSTATUS(status
)) {
1304 ERROR("dump failed with %d", WEXITSTATUS(status
));
1309 } else if (WIFSIGNALED(status
)) {
1310 ERROR("dump signaled with %d", WTERMSIG(status
));
1313 ERROR("unknown dump exit %d", status
);
1318 ERROR("criu output: %s", buf
);
1326 rmdir(opts
->directory
);
1331 bool __criu_pre_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
1333 return do_dump(c
, "pre-dump", opts
);
1336 bool __criu_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
1338 char path
[PATH_MAX
];
1341 ret
= snprintf(path
, sizeof(path
), "%s/inventory.img", opts
->directory
);
1342 if (ret
< 0 || ret
>= sizeof(path
))
1345 if (access(path
, F_OK
) == 0) {
1346 ERROR("please use a fresh directory for the dump directory");
1350 return do_dump(c
, "dump", opts
);
1353 bool __criu_restore(struct lxc_container
*c
, struct migrate_opts
*opts
)
1358 char *criu_version
= NULL
;
1361 ERROR("Must be root to restore");
1366 ERROR("failed to create pipe");
1370 if (!criu_ok(c
, &criu_version
)) {
1386 /* this never returns */
1387 do_restore(c
, pipefd
[1], opts
, criu_version
);
1393 nread
= lxc_read_nointr(pipefd
[0], &status
, sizeof(status
));
1395 if (sizeof(status
) != nread
) {
1396 ERROR("reading status from pipe failed");
1400 /* If the criu process was killed or exited nonzero, wait() for the
1401 * handler, since the restore process died. Otherwise, we don't need to
1402 * wait, since the child becomes the monitor process.
1404 if (!WIFEXITED(status
) || WEXITSTATUS(status
))
1409 if (wait_for_pid(pid
))
1410 ERROR("restore process died");