]>
git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
2 * lxc: linux Container library
4 * Copyright © 2014-2015 Canonical Ltd.
7 * Tycho Andersen <tycho.andersen@canonical.com>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 #include <linux/limits.h>
31 #include <sys/mount.h>
32 #include <sys/types.h>
49 #define CRIU_VERSION "2.0"
51 #define CRIU_GITID_VERSION "2.0"
52 #define CRIU_GITID_PATCHLEVEL 0
54 #define CRIU_IN_FLIGHT_SUPPORT "2.4"
56 lxc_log_define(lxc_criu
, lxc
);
59 /* The type of criu invocation, one of "dump" or "restore" */
62 /* the user-provided migrate options relevant to this action */
63 struct migrate_opts
*user
;
65 /* The container to dump */
66 struct lxc_container
*c
;
68 /* dump: stop the container or not after dumping? */
69 char tty_id
[32]; /* the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]" */
71 /* restore: the file to write the init process' pid into */
72 const char *cgroup_path
;
74 /* The path that is bind mounted from /dev/console, if any. We don't
75 * want to use `--ext-mount-map auto`'s result here because the pts
76 * device may have a different path (e.g. if the pty number is
77 * different) on the target host. NULL if lxc.console = "none".
81 /* The detected version of criu */
85 static int load_tty_major_minor(char *directory
, char *output
, int len
)
91 ret
= snprintf(path
, sizeof(path
), "%s/tty.info", directory
);
92 if (ret
< 0 || ret
>= sizeof(path
)) {
93 ERROR("snprintf'd too many chacters: %d", ret
);
99 /* This means we're coming from a liblxc which didn't export
100 * the tty info. In this case they had to have lxc.console =
101 * none, so there's no problem restoring.
106 SYSERROR("couldn't open %s", path
);
110 if (!fgets(output
, len
, f
)) {
112 SYSERROR("couldn't read %s", path
);
120 static void exec_criu(struct criu_opts
*opts
)
122 char **argv
, log
[PATH_MAX
];
123 int static_args
= 23, argc
= 0, i
, ret
;
127 char buf
[4096], tty_info
[32];
129 /* If we are currently in a cgroup /foo/bar, and the container is in a
130 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
131 * container has an open fd that points to one of the cgroup files
132 * (systemd always opens its "root" cgroup). So, let's escape to the
133 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
136 if (!cgroup_escape()) {
137 ERROR("failed to escape cgroups");
141 /* The command line always looks like:
142 * criu $(action) --tcp-established --file-locks --link-remap \
143 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
144 * -o $(directory)/$(action).log --ext-mount-map auto
145 * --enable-external-sharing --enable-external-masters
146 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
147 * +1 for final NULL */
149 if (strcmp(opts
->action
, "dump") == 0 || strcmp(opts
->action
, "pre-dump") == 0) {
150 /* -t pid --freeze-cgroup /lxc/ct */
153 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
154 if (opts
->user
->predump_dir
)
157 /* --page-server --address <address> --port <port> */
158 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
)
161 /* --leave-running (only for final dump) */
162 if (strcmp(opts
->action
, "dump") == 0 && !opts
->user
->stop
)
165 /* --external tty[88,4] */
170 if (!opts
->user
->preserves_inodes
)
173 /* --ghost-limit 1024 */
174 if (opts
->user
->ghost_limit
)
176 } else if (strcmp(opts
->action
, "restore") == 0) {
177 /* --root $(lxc_mount_point) --restore-detached
178 * --restore-sibling --cgroup-root $foo
179 * --lsm-profile apparmor:whatever
184 if (load_tty_major_minor(opts
->user
->directory
, tty_info
, sizeof(tty_info
)))
187 /* --inherit-fd fd[%d]:tty[%s] */
194 if (opts
->user
->verbose
)
197 if (opts
->user
->action_script
)
200 ret
= snprintf(log
, PATH_MAX
, "%s/%s.log", opts
->user
->directory
, opts
->action
);
201 if (ret
< 0 || ret
>= PATH_MAX
) {
202 ERROR("logfile name too long\n");
206 argv
= malloc(static_args
* sizeof(*argv
));
210 memset(argv
, 0, static_args
* sizeof(*argv
));
212 #define DECLARE_ARG(arg) \
215 ERROR("Got NULL argument for criu"); \
218 argv[argc++] = strdup(arg); \
223 argv
[argc
++] = on_path("criu", NULL
);
225 ERROR("Couldn't find criu binary\n");
229 DECLARE_ARG(opts
->action
);
230 DECLARE_ARG("--tcp-established");
231 DECLARE_ARG("--file-locks");
232 DECLARE_ARG("--link-remap");
233 DECLARE_ARG("--manage-cgroups=full");
234 DECLARE_ARG("--ext-mount-map");
236 DECLARE_ARG("--enable-external-sharing");
237 DECLARE_ARG("--enable-external-masters");
238 DECLARE_ARG("--enable-fs");
239 DECLARE_ARG("hugetlbfs");
240 DECLARE_ARG("--enable-fs");
241 DECLARE_ARG("tracefs");
243 DECLARE_ARG(opts
->user
->directory
);
247 if (opts
->user
->verbose
)
248 DECLARE_ARG("-vvvvvv");
250 if (opts
->user
->action_script
) {
251 DECLARE_ARG("--action-script");
252 DECLARE_ARG(opts
->user
->action_script
);
255 if (strcmp(opts
->action
, "dump") == 0 || strcmp(opts
->action
, "pre-dump") == 0) {
256 char pid
[32], *freezer_relative
;
258 if (sprintf(pid
, "%d", opts
->c
->init_pid(opts
->c
)) < 0)
264 freezer_relative
= lxc_cmd_get_cgroup_path(opts
->c
->name
,
265 opts
->c
->config_path
,
267 if (!freezer_relative
) {
268 ERROR("failed getting freezer path");
272 ret
= snprintf(log
, sizeof(log
), "/sys/fs/cgroup/freezer/%s", freezer_relative
);
273 if (ret
< 0 || ret
>= sizeof(log
))
276 if (!opts
->user
->disable_skip_in_flight
&&
277 strcmp(opts
->criu_version
, CRIU_IN_FLIGHT_SUPPORT
) >= 0)
278 DECLARE_ARG("--skip-in-flight");
280 DECLARE_ARG("--freeze-cgroup");
283 if (opts
->tty_id
[0]) {
284 DECLARE_ARG("--ext-mount-map");
285 DECLARE_ARG("/dev/console:console");
287 DECLARE_ARG("--external");
288 DECLARE_ARG(opts
->tty_id
);
291 if (opts
->user
->predump_dir
) {
292 DECLARE_ARG("--prev-images-dir");
293 DECLARE_ARG(opts
->user
->predump_dir
);
296 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
) {
297 DECLARE_ARG("--page-server");
298 DECLARE_ARG("--address");
299 DECLARE_ARG(opts
->user
->pageserver_address
);
300 DECLARE_ARG("--port");
301 DECLARE_ARG(opts
->user
->pageserver_port
);
304 if (!opts
->user
->preserves_inodes
)
305 DECLARE_ARG("--force-irmap");
307 if (opts
->user
->ghost_limit
) {
308 char ghost_limit
[32];
310 ret
= sprintf(ghost_limit
, "%"PRIu64
, opts
->user
->ghost_limit
);
311 if (ret
< 0 || ret
>= sizeof(ghost_limit
)) {
312 ERROR("failed to print ghost limit %"PRIu64
, opts
->user
->ghost_limit
);
316 DECLARE_ARG("--ghost-limit");
317 DECLARE_ARG(ghost_limit
);
320 /* only for final dump */
321 if (strcmp(opts
->action
, "dump") == 0 && !opts
->user
->stop
)
322 DECLARE_ARG("--leave-running");
323 } else if (strcmp(opts
->action
, "restore") == 0) {
326 struct lxc_conf
*lxc_conf
= opts
->c
->lxc_conf
;
328 DECLARE_ARG("--root");
329 DECLARE_ARG(opts
->c
->lxc_conf
->rootfs
.mount
);
330 DECLARE_ARG("--restore-detached");
331 DECLARE_ARG("--restore-sibling");
332 DECLARE_ARG("--cgroup-root");
333 DECLARE_ARG(opts
->cgroup_path
);
336 if (opts
->console_fd
< 0) {
337 ERROR("lxc.console configured on source host but not target");
341 ret
= snprintf(buf
, sizeof(buf
), "fd[%d]:%s", opts
->console_fd
, tty_info
);
342 if (ret
< 0 || ret
>= sizeof(buf
))
345 DECLARE_ARG("--inherit-fd");
348 if (opts
->console_name
) {
349 if (snprintf(buf
, sizeof(buf
), "console:%s", opts
->console_name
) < 0) {
350 SYSERROR("sprintf'd too many bytes");
352 DECLARE_ARG("--ext-mount-map");
356 if (lxc_conf
->lsm_aa_profile
|| lxc_conf
->lsm_se_context
) {
358 if (lxc_conf
->lsm_aa_profile
)
359 ret
= snprintf(buf
, sizeof(buf
), "apparmor:%s", lxc_conf
->lsm_aa_profile
);
361 ret
= snprintf(buf
, sizeof(buf
), "selinux:%s", lxc_conf
->lsm_se_context
);
363 if (ret
< 0 || ret
>= sizeof(buf
))
366 DECLARE_ARG("--lsm-profile");
370 additional
= lxc_list_len(&opts
->c
->lxc_conf
->network
) * 2;
372 m
= realloc(argv
, (argc
+ additional
+ 1) * sizeof(*argv
));
377 lxc_list_for_each(it
, &opts
->c
->lxc_conf
->network
) {
378 char eth
[128], *veth
;
379 struct lxc_netdev
*n
= it
->elem
;
381 if (n
->type
!= LXC_NET_VETH
)
385 if (strlen(n
->name
) >= sizeof(eth
))
387 strncpy(eth
, n
->name
, sizeof(eth
));
389 sprintf(eth
, "eth%d", netnr
);
391 veth
= n
->priv
.veth_attr
.pair
;
394 ret
= snprintf(buf
, sizeof(buf
), "%s=%s@%s", eth
, veth
, n
->link
);
396 ret
= snprintf(buf
, sizeof(buf
), "%s=%s", eth
, veth
);
397 if (ret
< 0 || ret
>= sizeof(buf
))
400 DECLARE_ARG("--veth-pair");
411 for (i
= 0; argv
[i
]; i
++) {
412 ret
= snprintf(buf
+ pos
, sizeof(buf
) - pos
, "%s ", argv
[i
]);
413 if (ret
< 0 || ret
>= sizeof(buf
) - pos
)
419 INFO("execing: %s", buf
);
422 execv(argv
[0], argv
);
424 for (i
= 0; argv
[i
]; i
++)
430 * Check to see if the criu version is recent enough for all the features we
431 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
432 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
433 * things potentially before a version is released with a particular feature.
435 * The intent is that when criu development slows down, we can drop this, but
436 * for now we shouldn't attempt to c/r with versions that we know won't work.
438 * Note: If version != NULL criu_version_ok() stores the detected criu version in
439 * version. Allocates memory for version which must be freed by caller.
441 static bool criu_version_ok(char **version
)
446 if (pipe(pipes
) < 0) {
447 SYSERROR("pipe() failed");
453 SYSERROR("fork() failed");
458 char *args
[] = { "criu", "--version", NULL
};
462 close(STDERR_FILENO
);
463 if (dup2(pipes
[1], STDOUT_FILENO
) < 0)
466 path
= on_path("criu", NULL
);
478 if (wait_for_pid(pid
) < 0) {
480 SYSERROR("execing criu failed, is it installed?");
484 f
= fdopen(pipes
[0], "r");
496 if (fscanf(f
, "Version: %1023[^\n]s", tmp
) != 1)
499 if (fgetc(f
) != '\n')
502 if (strcmp(tmp
, CRIU_VERSION
) >= 0)
505 if (fscanf(f
, "GitID: v%1023[^-]s", tmp
) != 1)
511 if (fscanf(f
, "%d", &patch
) != 1)
514 if (strcmp(tmp
, CRIU_GITID_VERSION
) < 0)
517 if (patch
< CRIU_GITID_PATCHLEVEL
)
531 ERROR("must have criu " CRIU_VERSION
" or greater to checkpoint/restore\n");
536 /* Check and make sure the container has a configuration that we know CRIU can
538 static bool criu_ok(struct lxc_container
*c
, char **criu_version
)
542 if (!criu_version_ok(criu_version
))
546 ERROR("Must be root to checkpoint\n");
550 /* We only know how to restore containers with veth networks. */
551 lxc_list_for_each(it
, &c
->lxc_conf
->network
) {
552 struct lxc_netdev
*n
= it
->elem
;
559 ERROR("Found network that is not VETH or NONE\n");
567 static bool restore_net_info(struct lxc_container
*c
)
570 bool has_error
= true;
572 if (container_mem_lock(c
))
575 lxc_list_for_each(it
, &c
->lxc_conf
->network
) {
576 struct lxc_netdev
*netdev
= it
->elem
;
577 char template[IFNAMSIZ
];
579 if (netdev
->type
!= LXC_NET_VETH
)
582 snprintf(template, sizeof(template), "vethXXXXXX");
584 if (!netdev
->priv
.veth_attr
.pair
)
585 netdev
->priv
.veth_attr
.pair
= lxc_mkifname(template);
587 if (!netdev
->priv
.veth_attr
.pair
)
594 container_mem_unlock(c
);
598 // do_restore never returns; the calling process is used as the
599 // monitor process. do_restore calls exit() if it fails.
600 static void do_restore(struct lxc_container
*c
, int status_pipe
, struct migrate_opts
*opts
, char *criu_version
)
603 struct lxc_handler
*handler
;
605 int pipes
[2] = {-1, -1};
607 handler
= lxc_init(c
->name
, c
->lxc_conf
, c
->config_path
);
611 if (!cgroup_init(handler
)) {
612 ERROR("failed initing cgroups");
613 goto out_fini_handler
;
616 if (!cgroup_create(handler
)) {
617 ERROR("failed creating groups");
618 goto out_fini_handler
;
621 if (!restore_net_info(c
)) {
622 ERROR("failed restoring network info");
623 goto out_fini_handler
;
626 resolve_clone_flags(handler
);
628 if (pipe(pipes
) < 0) {
629 SYSERROR("pipe() failed");
630 goto out_fini_handler
;
635 goto out_fini_handler
;
639 struct lxc_rootfs
*rootfs
;
647 if (dup2(pipes
[1], STDERR_FILENO
) < 0) {
648 SYSERROR("dup2 failed");
649 goto out_fini_handler
;
652 if (dup2(pipes
[1], STDOUT_FILENO
) < 0) {
653 SYSERROR("dup2 failed");
654 goto out_fini_handler
;
657 if (unshare(CLONE_NEWNS
))
658 goto out_fini_handler
;
660 /* CRIU needs the lxc root bind mounted so that it is the root of some
662 rootfs
= &c
->lxc_conf
->rootfs
;
664 if (rootfs_is_blockdev(c
->lxc_conf
)) {
665 if (do_rootfs_setup(c
->lxc_conf
, c
->name
, c
->config_path
) < 0)
666 goto out_fini_handler
;
668 if (mkdir(rootfs
->mount
, 0755) < 0 && errno
!= EEXIST
)
669 goto out_fini_handler
;
671 if (mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
) < 0) {
672 SYSERROR("remount / to private failed");
673 goto out_fini_handler
;
676 if (mount(rootfs
->path
, rootfs
->mount
, NULL
, MS_BIND
, NULL
) < 0) {
677 rmdir(rootfs
->mount
);
678 goto out_fini_handler
;
682 os
.action
= "restore";
685 os
.cgroup_path
= cgroup_canonical_path(handler
);
686 os
.console_fd
= c
->lxc_conf
->console
.slave
;
687 os
.criu_version
= criu_version
;
689 if (os
.console_fd
>= 0) {
690 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
691 * via --inherit-fd, so we don't want it to close.
693 flags
= fcntl(os
.console_fd
, F_GETFD
);
695 SYSERROR("F_GETFD failed: %d", os
.console_fd
);
696 goto out_fini_handler
;
699 flags
&= ~FD_CLOEXEC
;
701 if (fcntl(os
.console_fd
, F_SETFD
, flags
) < 0) {
702 SYSERROR("F_SETFD failed");
703 goto out_fini_handler
;
706 os
.console_name
= c
->lxc_conf
->console
.name
;
708 /* exec_criu() returning is an error */
710 umount(rootfs
->mount
);
711 rmdir(rootfs
->mount
);
712 goto out_fini_handler
;
720 pid_t w
= waitpid(pid
, &status
, 0);
723 goto out_fini_handler
;
726 ret
= write(status_pipe
, &status
, sizeof(status
));
730 if (sizeof(status
) != ret
) {
731 SYSERROR("failed to write all of status");
732 goto out_fini_handler
;
735 if (WIFEXITED(status
)) {
738 if (WEXITSTATUS(status
)) {
741 n
= read(pipes
[0], buf
, sizeof(buf
));
743 SYSERROR("failed reading from criu stderr");
744 goto out_fini_handler
;
749 ERROR("criu process exited %d, output:\n%s\n", WEXITSTATUS(status
), buf
);
750 goto out_fini_handler
;
752 ret
= snprintf(buf
, sizeof(buf
), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid
));
753 if (ret
< 0 || ret
>= sizeof(buf
)) {
754 ERROR("snprintf'd too many characters: %d", ret
);
755 goto out_fini_handler
;
758 FILE *f
= fopen(buf
, "r");
760 SYSERROR("couldn't read restore's children file %s\n", buf
);
761 goto out_fini_handler
;
764 ret
= fscanf(f
, "%d", (int*) &handler
->pid
);
767 ERROR("reading restore pid failed");
768 goto out_fini_handler
;
771 if (lxc_set_state(c
->name
, handler
, RUNNING
)) {
772 ERROR("error setting running state after restore");
773 goto out_fini_handler
;
777 ERROR("CRIU was killed with signal %d\n", WTERMSIG(status
));
778 goto out_fini_handler
;
784 * See comment in lxcapi_start; we don't care if these
785 * fail because it's just a beauty thing. We just
786 * assign the return here to silence potential compiler warnings.
788 ret
= snprintf(title
, sizeof(title
), "[lxc monitor] %s %s", c
->config_path
, c
->name
);
789 ret
= setproctitle(title
);
791 ret
= lxc_poll(c
->name
, handler
);
793 lxc_abort(c
->name
, handler
);
794 lxc_fini(c
->name
, handler
);
804 lxc_fini(c
->name
, handler
);
807 if (status_pipe
>= 0) {
809 if (write(status_pipe
, &status
, sizeof(status
)) != sizeof(status
)) {
810 SYSERROR("writing status failed");
818 static int save_tty_major_minor(char *directory
, struct lxc_container
*c
, char *tty_id
, int len
)
825 if (c
->lxc_conf
->console
.path
&& !strcmp(c
->lxc_conf
->console
.path
, "none")) {
830 ret
= snprintf(path
, sizeof(path
), "/proc/%d/root/dev/console", c
->init_pid(c
));
831 if (ret
< 0 || ret
>= sizeof(path
)) {
832 ERROR("snprintf'd too many chacters: %d", ret
);
836 ret
= stat(path
, &sb
);
838 SYSERROR("stat of %s failed", path
);
842 ret
= snprintf(path
, sizeof(path
), "%s/tty.info", directory
);
843 if (ret
< 0 || ret
>= sizeof(path
)) {
844 ERROR("snprintf'd too many characters: %d", ret
);
848 ret
= snprintf(tty_id
, len
, "tty[%llx:%llx]",
849 (long long unsigned) sb
.st_rdev
,
850 (long long unsigned) sb
.st_dev
);
851 if (ret
< 0 || ret
>= sizeof(path
)) {
852 ERROR("snprintf'd too many characters: %d", ret
);
856 f
= fopen(path
, "w");
858 SYSERROR("failed to open %s", path
);
862 ret
= fprintf(f
, "%s", tty_id
);
865 SYSERROR("failed to write to %s", path
);
869 /* do one of either predump or a regular dump */
870 static bool do_dump(struct lxc_container
*c
, char *mode
, struct migrate_opts
*opts
)
873 char *criu_version
= NULL
;
875 if (!criu_ok(c
, &criu_version
))
878 if (mkdir_p(opts
->directory
, 0700) < 0)
883 SYSERROR("fork failed");
893 os
.console_name
= c
->lxc_conf
->console
.path
;
894 os
.criu_version
= criu_version
;
896 if (save_tty_major_minor(opts
->directory
, c
, os
.tty_id
, sizeof(os
.tty_id
)) < 0)
899 /* exec_criu() returning is an error */
904 pid_t w
= waitpid(pid
, &status
, 0);
910 if (WIFEXITED(status
)) {
911 if (WEXITSTATUS(status
)) {
912 ERROR("dump failed with %d\n", WEXITSTATUS(status
));
917 } else if (WIFSIGNALED(status
)) {
918 ERROR("dump signaled with %d\n", WTERMSIG(status
));
921 ERROR("unknown dump exit %d\n", status
);
927 bool __criu_pre_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
929 return do_dump(c
, "pre-dump", opts
);
932 bool __criu_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
937 ret
= snprintf(path
, sizeof(path
), "%s/inventory.img", opts
->directory
);
938 if (ret
< 0 || ret
>= sizeof(path
))
941 if (access(path
, F_OK
) == 0) {
942 ERROR("please use a fresh directory for the dump directory\n");
946 return do_dump(c
, "dump", opts
);
949 bool __criu_restore(struct lxc_container
*c
, struct migrate_opts
*opts
)
954 char *criu_version
= NULL
;
956 if (!criu_ok(c
, &criu_version
))
960 ERROR("Must be root to restore\n");
965 ERROR("failed to create pipe");
978 // this never returns
979 do_restore(c
, pipefd
[1], opts
, criu_version
);
984 nread
= read(pipefd
[0], &status
, sizeof(status
));
986 if (sizeof(status
) != nread
) {
987 ERROR("reading status from pipe failed");
991 // If the criu process was killed or exited nonzero, wait() for the
992 // handler, since the restore process died. Otherwise, we don't need to
993 // wait, since the child becomes the monitor process.
994 if (!WIFEXITED(status
) || WEXITSTATUS(status
))
999 if (wait_for_pid(pid
))
1000 ERROR("restore process died");