]>
git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
7 #include <linux/limits.h>
12 #include <sys/mount.h>
13 #include <sys/types.h>
25 #include "memory_utils.h"
28 #include "syscall_wrappers.h"
32 #include "lxcmntent.h"
41 #define CRIU_VERSION "2.0"
43 #define CRIU_GITID_VERSION "2.0"
44 #define CRIU_GITID_PATCHLEVEL 0
46 #define CRIU_IN_FLIGHT_SUPPORT "2.4"
47 #define CRIU_EXTERNAL_NOT_VETH "2.8"
49 lxc_log_define(criu
, lxc
);
52 /* the thing to hook to stdout and stderr for logging */
55 /* The type of criu invocation, one of "dump" or "restore" */
58 /* the user-provided migrate options relevant to this action */
59 struct migrate_opts
*user
;
61 /* The container to dump */
62 struct lxc_container
*c
;
64 /* dump: stop the container or not after dumping? */
65 char tty_id
[32]; /* the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]" */
67 /* restore: the file to write the init process' pid into */
68 struct lxc_handler
*handler
;
70 /* The path that is bind mounted from /dev/console, if any. We don't
71 * want to use `--ext-mount-map auto`'s result here because the pty
72 * device may have a different path (e.g. if the pty number is
73 * different) on the target host. NULL if lxc.console.path = "none".
77 /* The detected version of criu */
81 static int load_tty_major_minor(char *directory
, char *output
, int len
)
86 ret
= strnprintf(path
, sizeof(path
), "%s/tty.info", directory
);
88 return ret_errno(EIO
);
90 ret
= lxc_read_from_file(path
, output
, len
);
93 * This means we're coming from a liblxc which didn't export
94 * the tty info. In this case they had to have lxc.console.path
95 * = * none, so there's no problem restoring.
100 return log_error_errno(-errno
, errno
, "Failed to open \"%s\"", path
);
106 static int cmp_version(const char *v1
, const char *v2
)
109 int oct_v1
[3], oct_v2
[3];
111 memset(oct_v1
, -1, sizeof(oct_v1
));
112 memset(oct_v2
, -1, sizeof(oct_v2
));
114 ret
= sscanf(v1
, "%d.%d.%d", &oct_v1
[0], &oct_v1
[1], &oct_v1
[2]);
118 ret
= sscanf(v2
, "%d.%d.%d", &oct_v2
[0], &oct_v2
[1], &oct_v2
[2]);
122 /* Major version is greater. */
123 if (oct_v1
[0] > oct_v2
[0])
126 if (oct_v1
[0] < oct_v2
[0])
129 /* Minor number is greater.*/
130 if (oct_v1
[1] > oct_v2
[1])
133 if (oct_v1
[1] < oct_v2
[1])
136 /* Patch number is greater. */
137 if (oct_v1
[2] > oct_v2
[2])
140 /* Patch numbers are equal. */
141 if (oct_v1
[2] == oct_v2
[2])
147 struct criu_exec_args
{
152 static void put_criu_exec_args(struct criu_exec_args
*args
)
155 for (int i
= 0; i
< args
->argc
; i
++)
156 free_disarm(args
->argv
[i
]);
161 define_cleanup_function(struct criu_exec_args
*, put_criu_exec_args
);
163 static int exec_criu(struct cgroup_ops
*cgroup_ops
, struct lxc_conf
*conf
,
164 struct criu_opts
*opts
)
166 call_cleaner(put_criu_exec_args
) struct criu_exec_args
*args
= NULL
;
167 __do_fclose
FILE *f_mnt
= NULL
;
169 int static_args
= 23, ret
;
171 struct mntent mntent
;
172 struct lxc_netdev
*netdev
;
173 struct string_entry
*strentry
;
175 char buf
[4096], ttys
[32];
177 /* If we are currently in a cgroup /foo/bar, and the container is in a
178 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
179 * container has an open fd that points to one of the cgroup files
180 * (systemd always opens its "root" cgroup). So, let's escape to the
181 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
184 if (!cgroup_ops
->criu_escape(cgroup_ops
, conf
))
185 return log_error_errno(-ENOENT
, ENOENT
, "Failed to escape to root cgroup");
187 /* The command line always looks like:
188 * criu $(action) --tcp-established --file-locks --link-remap \
189 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
190 * -o $(directory)/$(action).log --ext-mount-map auto
191 * --enable-external-sharing --enable-external-masters
192 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
193 * +1 for final NULL */
195 if (strequal(opts
->action
, "dump") || strequal(opts
->action
, "pre-dump")) {
196 /* -t pid --freeze-cgroup /lxc/ct */
199 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
200 if (opts
->user
->predump_dir
)
203 /* --page-server --address <address> --port <port> */
204 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
)
207 /* --leave-running (only for final dump) */
208 if (strequal(opts
->action
, "dump") && !opts
->user
->stop
)
211 /* --external tty[88,4] */
216 if (!opts
->user
->preserves_inodes
)
219 /* --ghost-limit 1024 */
220 if (opts
->user
->ghost_limit
)
222 } else if (strequal(opts
->action
, "restore")) {
223 /* --root $(lxc_mount_point) --restore-detached
225 * --lsm-profile apparmor:whatever
230 if (load_tty_major_minor(opts
->user
->directory
, ttys
, sizeof(ttys
)))
231 return log_error_errno(-EINVAL
, EINVAL
, "Failed to load tty information");
233 /* --inherit-fd fd[%d]:tty[%s] */
237 static_args
+= list_len(netdev
, &opts
->c
->lxc_conf
->netdevs
, head
) * 2;
239 return log_error_errno(-EINVAL
, EINVAL
, "Invalid criu operation specified");
242 if (cgroup_ops
->criu_num_hierarchies(cgroup_ops
) > 0)
243 static_args
+= 2 * cgroup_ops
->criu_num_hierarchies(cgroup_ops
);
245 if (opts
->user
->verbose
)
248 if (opts
->user
->action_script
)
251 static_args
+= 2 * list_len(strentry
, &opts
->c
->lxc_conf
->mount_entries
, head
);
253 ret
= strnprintf(log
, sizeof(log
), "%s/%s.log", opts
->user
->directory
, opts
->action
);
255 return ret_errno(EIO
);
257 args
= zalloc(sizeof(struct criu_exec_args
) + (static_args
* sizeof(char **)));
259 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to allocate static arguments");
261 #define DECLARE_ARG(arg) \
264 return log_error_errno(-EINVAL, EINVAL, \
265 "Got NULL argument for criu"); \
266 args->argv[(args->argc)++] = strdup(arg); \
267 if (!args->argv[args->argc - 1]) \
268 return log_error_errno(-ENOMEM, ENOMEM, \
269 "Failed to duplicate argumen %s", arg); \
272 args
->argv
[(args
->argc
)++] = on_path("criu", NULL
);
273 if (!args
->argv
[args
->argc
- 1])
274 return log_error_errno(-ENOENT
, ENOENT
, "Failed to find criu binary");
276 DECLARE_ARG(opts
->action
);
277 DECLARE_ARG("--tcp-established");
278 DECLARE_ARG("--file-locks");
279 DECLARE_ARG("--link-remap");
280 DECLARE_ARG("--manage-cgroups=full");
281 DECLARE_ARG("--ext-mount-map");
283 DECLARE_ARG("--enable-external-sharing");
284 DECLARE_ARG("--enable-external-masters");
285 DECLARE_ARG("--enable-fs");
286 DECLARE_ARG("hugetlbfs");
287 DECLARE_ARG("--enable-fs");
288 DECLARE_ARG("tracefs");
290 DECLARE_ARG(opts
->user
->directory
);
294 for (int i
= 0; i
< cgroup_ops
->criu_num_hierarchies(cgroup_ops
); i
++) {
295 __do_free
char *cgroup_base_path
= NULL
, *controllers
;
296 char **controllers_list
= NULL
;
299 if (!cgroup_ops
->criu_get_hierarchies(cgroup_ops
, i
, &controllers_list
))
300 return log_error_errno(-ENOENT
, ENOENT
, "Failed to retrieve cgroup hierarchies %d", i
);
303 * If we are in a dump, we have to ask the monitor process what
304 * the right cgroup is. if this is a restore, we can just use
305 * the handler the restore task created.
307 if (strequal(opts
->action
, "dump") || strequal(opts
->action
, "pre-dump")) {
308 cgroup_base_path
= lxc_cmd_get_limit_cgroup_path(opts
->c
->name
, opts
->c
->config_path
, controllers_list
[0]);
309 if (!cgroup_base_path
)
310 return log_error_errno(-ENOENT
, ENOENT
, "Failed to retrieve limit cgroup path for %s", controllers_list
[0] ?: "(null)");
314 p
= cgroup_ops
->get_limit_cgroup(cgroup_ops
, controllers_list
[0]);
316 return log_error_errno(-ENOENT
, ENOENT
, "Failed to retrieve limit cgroup path for %s", controllers_list
[0] ?: "(null)");
318 cgroup_base_path
= strdup(p
);
319 if (!cgroup_base_path
)
320 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to duplicate limit cgroup path");
323 tmp
= path_simplify(cgroup_base_path
);
325 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to remove extraneous slashes from \"%s\"", tmp
);
326 free_move_ptr(cgroup_base_path
, tmp
);
328 if (controllers_list
[0]) {
329 controllers
= lxc_string_join(",", (const char **)controllers_list
, false);
331 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to join controllers");
333 ret
= sprintf(buf
, "%s:%s", controllers
, cgroup_base_path
);
335 WARN("No cgroup controllers configured in container's cgroup %s", cgroup_base_path
);
336 ret
= sprintf(buf
, "%s", cgroup_base_path
);
338 if (ret
< 0 || ret
>= sizeof(buf
))
339 return log_error_errno(-EIO
, EIO
, "sprintf of cgroup root arg failed");
341 DECLARE_ARG("--cgroup-root");
345 if (opts
->user
->verbose
)
348 if (opts
->user
->action_script
) {
349 DECLARE_ARG("--action-script");
350 DECLARE_ARG(opts
->user
->action_script
);
353 f_mnt
= make_anonymous_mount_file(&opts
->c
->lxc_conf
->mount_entries
,
354 opts
->c
->lxc_conf
->lsm_aa_allow_nesting
);
356 return log_error_errno(-ENOENT
, ENOENT
, "Failed to create anonymous mount file");
358 while (getmntent_r(f_mnt
, &mntent
, buf
, sizeof(buf
))) {
359 __do_free
char *mnt_options
= NULL
;
360 unsigned long flags
= 0;
361 char arg
[2 * PATH_MAX
+ 2];
363 if (parse_mntopts_legacy(mntent
.mnt_opts
, &flags
, &mnt_options
) < 0)
364 return log_error_errno(-EINVAL
, EINVAL
, "Failed to parse mount options");
366 /* only add --ext-mount-map for actual bind mounts */
367 if (!(flags
& MS_BIND
))
370 if (strequal(opts
->action
, "dump"))
371 ret
= strnprintf(arg
, sizeof(arg
), "/%s:%s", mntent
.mnt_dir
, mntent
.mnt_dir
);
373 ret
= strnprintf(arg
, sizeof(arg
), "%s:%s", mntent
.mnt_dir
, mntent
.mnt_fsname
);
375 return log_error_errno(-EIO
, EIO
, "Failed to create mount entry");
377 DECLARE_ARG("--ext-mount-map");
381 if (strequal(opts
->action
, "dump") || strequal(opts
->action
, "pre-dump")) {
383 char init_pid_str
[INTTYPE_TO_STRLEN(int)];
384 char *freezer_relative
;
386 init_pid
= opts
->c
->init_pid(opts
->c
);
388 return log_error_errno(-ESRCH
, ESRCH
, "Failed to retrieve init pid of container");
390 ret
= strnprintf(init_pid_str
, sizeof(init_pid_str
), "%d", init_pid
);
392 return log_error_errno(-EIO
, EIO
, "Failed to create entry for init pid of container");
395 DECLARE_ARG(init_pid_str
);
397 freezer_relative
= lxc_cmd_get_limit_cgroup_path(opts
->c
->name
,
398 opts
->c
->config_path
,
400 if (!freezer_relative
)
401 return log_error_errno(-ENOENT
, ENOENT
, "Failed getting freezer path");
403 if (pure_unified_layout(cgroup_ops
))
404 ret
= strnprintf(log
, sizeof(log
), "/sys/fs/cgroup/%s", freezer_relative
);
406 ret
= strnprintf(log
, sizeof(log
), "/sys/fs/cgroup/freezer/%s", freezer_relative
);
408 return log_error_errno(-EIO
, EIO
, "Failed to freezer cgroup entry");
410 if (!opts
->user
->disable_skip_in_flight
&&
411 strcmp(opts
->criu_version
, CRIU_IN_FLIGHT_SUPPORT
) >= 0)
412 DECLARE_ARG("--skip-in-flight");
414 DECLARE_ARG("--freeze-cgroup");
417 if (opts
->tty_id
[0]) {
418 DECLARE_ARG("--ext-mount-map");
419 DECLARE_ARG("/dev/console:console");
421 DECLARE_ARG("--external");
422 DECLARE_ARG(opts
->tty_id
);
425 if (opts
->user
->predump_dir
) {
426 DECLARE_ARG("--prev-images-dir");
427 DECLARE_ARG(opts
->user
->predump_dir
);
428 DECLARE_ARG("--track-mem");
431 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
) {
432 DECLARE_ARG("--page-server");
433 DECLARE_ARG("--address");
434 DECLARE_ARG(opts
->user
->pageserver_address
);
435 DECLARE_ARG("--port");
436 DECLARE_ARG(opts
->user
->pageserver_port
);
439 if (!opts
->user
->preserves_inodes
)
440 DECLARE_ARG("--force-irmap");
442 if (opts
->user
->ghost_limit
) {
443 char ghost_limit
[32];
445 ret
= sprintf(ghost_limit
, "%"PRIu64
, opts
->user
->ghost_limit
);
446 if (ret
< 0 || ret
>= sizeof(ghost_limit
))
447 return log_error_errno(-EIO
, EIO
, "Failed to print ghost limit %"PRIu64
, opts
->user
->ghost_limit
);
449 DECLARE_ARG("--ghost-limit");
450 DECLARE_ARG(ghost_limit
);
453 /* only for final dump */
454 if (strequal(opts
->action
, "dump") && !opts
->user
->stop
)
455 DECLARE_ARG("--leave-running");
456 } else if (strequal(opts
->action
, "restore")) {
457 struct lxc_conf
*lxc_conf
= opts
->c
->lxc_conf
;
459 DECLARE_ARG("--root");
460 DECLARE_ARG(opts
->c
->lxc_conf
->rootfs
.mount
);
461 DECLARE_ARG("--restore-detached");
462 DECLARE_ARG("--restore-sibling");
465 if (opts
->console_fd
< 0)
466 return log_error_errno(-EINVAL
, EINVAL
, "lxc.console.path configured on source host but not target");
468 ret
= strnprintf(buf
, sizeof(buf
), "fd[%d]:%s", opts
->console_fd
, ttys
);
470 return log_error_errno(-EIO
, EIO
, "Failed to create console entry");
472 DECLARE_ARG("--inherit-fd");
475 if (opts
->console_name
) {
476 if (strnprintf(buf
, sizeof(buf
), "console:%s", opts
->console_name
) < 0)
477 return log_error_errno(-EIO
, EIO
, "Failed to create console entry");
479 DECLARE_ARG("--ext-mount-map");
483 if (lxc_conf
->lsm_aa_profile
|| lxc_conf
->lsm_se_context
) {
485 if (lxc_conf
->lsm_aa_profile
)
486 ret
= strnprintf(buf
, sizeof(buf
), "apparmor:%s", lxc_conf
->lsm_aa_profile
);
488 ret
= strnprintf(buf
, sizeof(buf
), "selinux:%s", lxc_conf
->lsm_se_context
);
490 return log_error_errno(-EIO
, EIO
, "Failed to create lsm entry");
492 DECLARE_ARG("--lsm-profile");
496 list_for_each_entry(netdev
, &opts
->c
->lxc_conf
->netdevs
, head
) {
498 char eth
[128], *veth
;
499 bool external_not_veth
;
501 if (cmp_version(opts
->criu_version
, CRIU_EXTERNAL_NOT_VETH
) >= 0) {
502 /* Since criu version 2.8 the usage of --veth-pair
503 * has been deprecated:
504 * git tag --contains f2037e6d3445fc400
506 external_not_veth
= true;
508 external_not_veth
= false;
511 if (netdev
->name
[0] != '\0') {
512 retlen
= strlcpy(eth
, netdev
->name
, sizeof(eth
));
513 if (retlen
>= sizeof(eth
))
514 return log_error_errno(-E2BIG
, E2BIG
, "Failed to append veth device name");
516 ret
= strnprintf(eth
, sizeof(eth
), "eth%d", netnr
);
518 return log_error_errno(-E2BIG
, E2BIG
, "Failed to append veth device name");
521 switch (netdev
->type
) {
523 veth
= netdev
->priv
.veth_attr
.pair
;
525 veth
= netdev
->priv
.veth_attr
.veth1
;
527 if (netdev
->link
[0] != '\0') {
528 if (external_not_veth
)
529 ret
= strnprintf(buf
, sizeof(buf
), "veth[%s]:%s@%s", eth
, veth
, netdev
->link
);
531 ret
= strnprintf(buf
, sizeof(buf
), "%s=%s@%s", eth
, veth
, netdev
->link
);
533 if (external_not_veth
)
534 ret
= strnprintf(buf
, sizeof(buf
), "veth[%s]:%s", eth
, veth
);
536 ret
= strnprintf(buf
, sizeof(buf
), "%s=%s", eth
, veth
);
539 return log_error_errno(-EIO
, EIO
, "Failed to append veth device name");
541 TRACE("Added veth device entry %s", buf
);
543 case LXC_NET_MACVLAN
:
544 if (netdev
->link
[0] == '\0')
545 return log_error_errno(-EINVAL
, EINVAL
, "Failed to find host interface for macvlan %s", netdev
->name
);
547 ret
= strnprintf(buf
, sizeof(buf
), "macvlan[%s]:%s", eth
, netdev
->link
);
549 return log_error_errno(-EIO
, EIO
, "Failed to add macvlan entry");
551 TRACE("Added macvlan device entry %s", buf
);
558 /* we have screened for this earlier... */
559 return log_error_errno(-EINVAL
, EINVAL
, "Unsupported network type %d", netdev
->type
);
562 if (external_not_veth
)
563 DECLARE_ARG("--external");
565 DECLARE_ARG("--veth-pair");
572 args
->argv
[args
->argc
] = NULL
;
574 if (lxc_log_trace()) {
576 for (int i
= 0, pos
= 0; i
< args
->argc
&& args
->argv
[i
]; i
++) {
577 ret
= strnprintf(buf
+ pos
, sizeof(buf
) - pos
, "%s ", args
->argv
[i
]);
579 return log_error_errno(-EIO
, EIO
, "Failed to reorder entries");
584 TRACE("Using command line %s", buf
);
587 /* before criu inits its log, it sometimes prints things to stdout/err;
588 * let's be sure we capture that.
590 if (dup2(opts
->pipefd
, STDOUT_FILENO
) < 0)
591 return log_error_errno(-errno
, errno
, "Failed to duplicate stdout");
593 if (dup2(opts
->pipefd
, STDERR_FILENO
) < 0)
594 return log_error_errno(-errno
, errno
, "Failed to duplicate stderr");
599 execv(args
->argv
[0], args
->argv
);
604 * Function to check if the checks activated in 'features_to_check' are
605 * available with the current architecture/kernel/criu combination.
607 * Parameter features_to_check is a bit mask of all features that should be
608 * checked (see feature check defines in lxc/lxccontainer.h).
610 * If the return value is true, all requested features are supported. If
611 * the return value is false the features_to_check parameter is updated
612 * to reflect which features are available. '0' means no feature but
613 * also that something went totally wrong.
615 * Some of the code flow of criu_version_ok() is duplicated and maybe it
616 * is a good candidate for refactoring.
618 bool __criu_check_feature(uint64_t *features_to_check
)
621 uint64_t current_bit
= 0;
623 uint64_t features
= *features_to_check
;
624 /* Feature checking is currently always like
625 * criu check --feature <feature-name>
627 char *args
[] = { "criu", "check", "--feature", NULL
, NULL
};
629 if ((features
& ~FEATURE_MEM_TRACK
& ~FEATURE_LAZY_PAGES
) != 0) {
630 /* There are feature bits activated we do not understand.
631 * Refusing to answer at all */
632 *features_to_check
= 0;
636 while (current_bit
< (sizeof(uint64_t) * 8 - 1)) {
637 /* only test requested features */
638 if (!(features
& (1ULL << current_bit
))) {
646 SYSERROR("fork() failed");
647 *features_to_check
= 0;
652 if ((1ULL << current_bit
) == FEATURE_MEM_TRACK
)
653 /* This is needed for pre-dump support, which
654 * enables pre-copy migration. */
655 args
[3] = "mem_dirty_track";
656 else if ((1ULL << current_bit
) == FEATURE_LAZY_PAGES
)
657 /* CRIU has two checks for userfaultfd support.
659 * The simpler check is only for 'uffd'. If the
660 * kernel supports userfaultfd without noncoop
661 * then only process can be lazily restored
662 * which do not fork. With 'uffd-noncoop'
663 * it is also possible to lazily restore processes
664 * which do fork. For a container runtime like
665 * LXC checking only for 'uffd' makes not much sense. */
666 args
[3] = "uffd-noncoop";
672 execvp("criu", args
);
673 SYSERROR("Failed to exec \"criu\"");
677 ret
= wait_for_pid(pid
);
680 /* It is not known why CRIU failed. Either
681 * CRIU is not available, the feature check
682 * does not exist or the feature is not
684 INFO("feature not supported");
685 /* Clear not supported feature bit */
686 features
&= ~(1ULL << current_bit
);
690 /* no more checks requested; exit check loop */
691 if (!(features
& ~((1ULL << current_bit
)-1)))
694 if (features
!= *features_to_check
) {
695 *features_to_check
= features
;
702 * Check to see if the criu version is recent enough for all the features we
703 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
704 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
705 * things potentially before a version is released with a particular feature.
707 * The intent is that when criu development slows down, we can drop this, but
708 * for now we shouldn't attempt to c/r with versions that we know won't work.
710 * Note: If version != NULL criu_version() stores the detected criu version in
711 * version. Allocates memory for version which must be freed by caller.
713 static bool criu_version_ok(char **version
)
718 if (pipe(pipes
) < 0) {
719 SYSERROR("pipe() failed");
725 SYSERROR("fork() failed");
730 char *args
[] = { "criu", "--version", NULL
};
734 close(STDERR_FILENO
);
735 if (dup2(pipes
[1], STDOUT_FILENO
) < 0)
738 path
= on_path("criu", NULL
);
750 if (wait_for_pid(pid
) < 0) {
752 SYSERROR("execing criu failed, is it installed?");
756 f
= fdopen(pipes
[0], "re");
768 if (fscanf(f
, "Version: %1023[^\n]s", tmp
) != 1)
771 if (fgetc(f
) != '\n')
774 if (strcmp(tmp
, CRIU_VERSION
) >= 0)
777 if (fscanf(f
, "GitID: v%1023[^-]s", tmp
) != 1)
783 if (fscanf(f
, "%d", &patch
) != 1)
786 if (strcmp(tmp
, CRIU_GITID_VERSION
) < 0)
789 if (patch
< CRIU_GITID_PATCHLEVEL
)
803 ERROR("must have criu " CRIU_VERSION
" or greater to checkpoint/restore");
808 /* Check and make sure the container has a configuration that we know CRIU can
810 static bool criu_ok(struct lxc_container
*c
, char **criu_version
)
812 struct lxc_netdev
*netdev
;
815 ERROR("Must be root to checkpoint");
819 if (!criu_version_ok(criu_version
))
822 /* We only know how to restore containers with veth networks. */
823 list_for_each_entry(netdev
, &c
->lxc_conf
->netdevs
, head
) {
824 switch(netdev
->type
) {
828 case LXC_NET_MACVLAN
:
831 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(netdev
->type
), netdev
->name
);
834 *criu_version
= NULL
;
843 static bool restore_net_info(struct lxc_container
*c
)
846 bool has_error
= true;
847 struct lxc_netdev
*netdev
;
849 if (container_mem_lock(c
))
852 list_for_each_entry(netdev
, &c
->lxc_conf
->netdevs
, head
) {
853 char template[IFNAMSIZ
];
855 if (netdev
->type
!= LXC_NET_VETH
)
858 ret
= strnprintf(template, sizeof(template), "vethXXXXXX");
862 if (netdev
->priv
.veth_attr
.pair
[0] == '\0' &&
863 netdev
->priv
.veth_attr
.veth1
[0] == '\0') {
864 if (!lxc_ifname_alnum_case_sensitive(template))
867 (void)strlcpy(netdev
->priv
.veth_attr
.veth1
, template, IFNAMSIZ
);
874 container_mem_unlock(c
);
878 /* do_restore never returns, the calling process is used as the monitor process.
879 * do_restore calls _exit() if it fails.
881 static void do_restore(struct lxc_container
*c
, int status_pipe
, struct migrate_opts
*opts
, char *criu_version
)
885 struct lxc_handler
*handler
;
887 int pipes
[2] = {-1, -1};
888 struct cgroup_ops
*cgroup_ops
;
890 /* Try to detach from the current controlling tty if it exists.
891 * Otherwise, lxc_init (via lxc_console) will attach the container's
892 * console output to the current tty, which is probably not what any
893 * library user wants, and if they do, they can just manually configure
896 fd
= open("/dev/tty", O_RDWR
);
898 if (ioctl(fd
, TIOCNOTTY
, NULL
) < 0)
899 SYSERROR("couldn't detach from tty");
903 handler
= lxc_init_handler(NULL
, c
->name
, c
->lxc_conf
, c
->config_path
, false);
907 if (lxc_init(c
->name
, handler
) < 0)
909 cgroup_ops
= handler
->cgroup_ops
;
911 if (!cgroup_ops
->monitor_create(cgroup_ops
, handler
)) {
912 ERROR("Failed to create monitor cgroup");
913 goto out_fini_handler
;
916 if (!cgroup_ops
->monitor_enter(cgroup_ops
, handler
)) {
917 ERROR("Failed to enter monitor cgroup");
918 goto out_fini_handler
;
921 if (!cgroup_ops
->monitor_delegate_controllers(cgroup_ops
)) {
922 ERROR("Failed to delegate controllers to monitor cgroup");
923 goto out_fini_handler
;
926 if (!cgroup_ops
->payload_create(cgroup_ops
, handler
)) {
927 ERROR("Failed creating cgroups");
928 goto out_fini_handler
;
931 if (!restore_net_info(c
)) {
932 ERROR("failed restoring network info");
933 goto out_fini_handler
;
936 ret
= resolve_clone_flags(handler
);
938 SYSERROR("Unsupported clone flag specified");
939 goto out_fini_handler
;
942 if (pipe2(pipes
, O_CLOEXEC
) < 0) {
943 SYSERROR("pipe() failed");
944 goto out_fini_handler
;
949 goto out_fini_handler
;
953 struct lxc_rootfs
*rootfs
;
962 if (unshare(CLONE_NEWNS
))
963 goto out_fini_handler
;
965 ret
= lxc_storage_prepare(c
->lxc_conf
);
967 goto out_fini_handler
;
969 /* CRIU needs the lxc root bind mounted so that it is the root of some
971 rootfs
= &c
->lxc_conf
->rootfs
;
973 if (rootfs_is_blockdev(c
->lxc_conf
)) {
974 if (lxc_setup_rootfs_prepare_root(c
->lxc_conf
, c
->name
,
976 goto out_fini_handler
;
978 if (mkdir(rootfs
->mount
, 0755) < 0 && errno
!= EEXIST
)
979 goto out_fini_handler
;
981 if (mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
) < 0) {
982 SYSERROR("remount / to private failed");
983 goto out_fini_handler
;
986 if (mount(rootfs
->path
, rootfs
->mount
, NULL
, MS_BIND
, NULL
) < 0) {
987 (void)rmdir(rootfs
->mount
);
988 goto out_fini_handler
;
992 os
.pipefd
= pipes
[1];
993 os
.action
= "restore";
996 os
.console_fd
= c
->lxc_conf
->console
.pty
;
997 os
.criu_version
= criu_version
;
998 os
.handler
= handler
;
1000 if (os
.console_fd
>= 0) {
1001 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1002 * via --inherit-fd, so we don't want it to close.
1004 flags
= fcntl(os
.console_fd
, F_GETFD
);
1006 SYSERROR("F_GETFD failed: %d", os
.console_fd
);
1007 goto out_fini_handler
;
1010 flags
&= ~FD_CLOEXEC
;
1012 if (fcntl(os
.console_fd
, F_SETFD
, flags
) < 0) {
1013 SYSERROR("F_SETFD failed");
1014 goto out_fini_handler
;
1017 os
.console_name
= c
->lxc_conf
->console
.name
;
1019 /* exec_criu() returning is an error */
1020 ret
= exec_criu(handler
->cgroup_ops
, c
->lxc_conf
, &os
);
1022 SYSERROR("Failed to execute criu");
1023 umount(rootfs
->mount
);
1024 (void)rmdir(rootfs
->mount
);
1025 goto out_fini_handler
;
1032 pid_t w
= waitpid(pid
, &status
, 0);
1034 SYSERROR("waitpid");
1035 goto out_fini_handler
;
1038 if (WIFEXITED(status
)) {
1041 if (WEXITSTATUS(status
)) {
1044 n
= lxc_read_nointr(pipes
[0], buf
, sizeof(buf
));
1046 SYSERROR("failed reading from criu stderr");
1047 goto out_fini_handler
;
1050 if (n
== sizeof(buf
))
1054 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status
), buf
);
1055 goto out_fini_handler
;
1057 ret
= strnprintf(buf
, sizeof(buf
), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid
));
1059 ERROR("strnprintf'd too many characters: %d", ret
);
1060 goto out_fini_handler
;
1063 FILE *f
= fopen(buf
, "re");
1065 SYSERROR("couldn't read restore's children file %s", buf
);
1066 goto out_fini_handler
;
1069 ret
= fscanf(f
, "%d", (int*) &handler
->pid
);
1072 ERROR("reading restore pid failed");
1073 goto out_fini_handler
;
1076 if (lxc_set_state(c
->name
, handler
, RUNNING
)) {
1077 ERROR("error setting running state after restore");
1078 goto out_fini_handler
;
1082 ERROR("CRIU was killed with signal %d", WTERMSIG(status
));
1083 goto out_fini_handler
;
1088 ret
= lxc_write_nointr(status_pipe
, &status
, sizeof(status
));
1092 if (sizeof(status
) != ret
) {
1093 SYSERROR("failed to write all of status");
1094 goto out_fini_handler
;
1098 * See comment in lxcapi_start; we don't care if these
1099 * fail because it's just a beauty thing. We just
1100 * assign the return here to silence potential.
1102 ret
= strnprintf(title
, sizeof(title
), "[lxc monitor] %s %s", c
->config_path
, c
->name
);
1104 INFO("Setting truncated process name");
1106 ret
= setproctitle(title
);
1108 INFO("Failed to set process name");
1110 ret
= lxc_poll(c
->name
, handler
);
1126 if (status_pipe
>= 0) {
1127 /* ensure getting here was a failure, e.g. if we failed to
1128 * parse the child pid or something, even after a successful
1134 if (lxc_write_nointr(status_pipe
, &status
, sizeof(status
)) != sizeof(status
))
1135 SYSERROR("writing status failed");
1139 _exit(EXIT_FAILURE
);
1142 static int save_tty_major_minor(char *directory
, struct lxc_container
*c
, char *tty_id
, int len
)
1145 char path
[PATH_MAX
];
1149 if (c
->lxc_conf
->console
.path
&& strequal(c
->lxc_conf
->console
.path
, "none")) {
1154 ret
= strnprintf(path
, sizeof(path
), "/proc/%d/root/dev/console", c
->init_pid(c
));
1156 ERROR("strnprintf'd too many characters: %d", ret
);
1160 ret
= stat(path
, &sb
);
1162 SYSERROR("stat of %s failed", path
);
1166 ret
= strnprintf(path
, sizeof(path
), "%s/tty.info", directory
);
1168 ERROR("strnprintf'd too many characters: %d", ret
);
1172 ret
= strnprintf(tty_id
, len
, "tty[%llx:%llx]",
1173 (long long unsigned) sb
.st_rdev
,
1174 (long long unsigned) sb
.st_dev
);
1176 ERROR("strnprintf'd too many characters: %d", ret
);
1180 f
= fopen(path
, "we");
1182 SYSERROR("failed to open %s", path
);
1186 ret
= fprintf(f
, "%s", tty_id
);
1189 SYSERROR("failed to write to %s", path
);
1193 /* do one of either predump or a regular dump */
1194 static bool do_dump(struct lxc_container
*c
, char *mode
, struct migrate_opts
*opts
)
1199 char *criu_version
= NULL
;
1201 if (!criu_ok(c
, &criu_version
))
1204 ret
= pipe(criuout
);
1206 SYSERROR("pipe() failed");
1211 if (mkdir_p(opts
->directory
, 0700) < 0)
1216 SYSERROR("fork failed");
1221 struct criu_opts os
;
1222 struct cgroup_ops
*cgroup_ops
;
1226 cgroup_ops
= cgroup_init(c
->lxc_conf
);
1228 ERROR("failed to cgroup_init()");
1229 _exit(EXIT_FAILURE
);
1232 os
.pipefd
= criuout
[1];
1236 os
.console_name
= c
->lxc_conf
->console
.path
;
1237 os
.criu_version
= criu_version
;
1240 ret
= save_tty_major_minor(opts
->directory
, c
, os
.tty_id
, sizeof(os
.tty_id
));
1243 _exit(EXIT_FAILURE
);
1246 /* exec_criu() returning is an error */
1247 ret
= exec_criu(cgroup_ops
, c
->lxc_conf
, &os
);
1249 SYSERROR("Failed to execute criu");
1251 _exit(EXIT_FAILURE
);
1259 pid_t w
= waitpid(pid
, &status
, 0);
1261 SYSERROR("waitpid");
1267 n
= lxc_read_nointr(criuout
[0], buf
, sizeof(buf
));
1274 if (n
== sizeof(buf
))
1279 if (WIFEXITED(status
)) {
1280 if (WEXITSTATUS(status
)) {
1281 ERROR("dump failed with %d", WEXITSTATUS(status
));
1286 } else if (WIFSIGNALED(status
)) {
1287 ERROR("dump signaled with %d", WTERMSIG(status
));
1290 ERROR("unknown dump exit %d", status
);
1295 ERROR("criu output: %s", buf
);
1303 (void)rmdir(opts
->directory
);
1308 bool __criu_pre_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
1310 return do_dump(c
, "pre-dump", opts
);
1313 bool __criu_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
1315 char path
[PATH_MAX
];
1318 ret
= strnprintf(path
, sizeof(path
), "%s/inventory.img", opts
->directory
);
1322 if (access(path
, F_OK
) == 0) {
1323 ERROR("please use a fresh directory for the dump directory");
1327 return do_dump(c
, "dump", opts
);
1330 bool __criu_restore(struct lxc_container
*c
, struct migrate_opts
*opts
)
1335 char *criu_version
= NULL
;
1338 ERROR("Must be root to restore");
1343 ERROR("failed to create pipe");
1347 if (!criu_ok(c
, &criu_version
)) {
1363 /* this never returns */
1364 do_restore(c
, pipefd
[1], opts
, criu_version
);
1370 nread
= lxc_read_nointr(pipefd
[0], &status
, sizeof(status
));
1372 if (sizeof(status
) != nread
) {
1373 ERROR("reading status from pipe failed");
1377 /* If the criu process was killed or exited nonzero, wait() for the
1378 * handler, since the restore process died. Otherwise, we don't need to
1379 * wait, since the child becomes the monitor process.
1381 if (!WIFEXITED(status
) || WEXITSTATUS(status
))
1386 if (wait_for_pid(pid
))
1387 ERROR("restore process died");