]>
git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
6 #include <linux/limits.h>
11 #include <sys/mount.h>
12 #include <sys/types.h>
16 #include "attach_options.h"
25 #include "memory_utils.h"
28 #include "syscall_wrappers.h"
32 #include "lxcmntent.h"
41 #define CRIU_VERSION "2.0"
43 #define CRIU_GITID_VERSION "2.0"
44 #define CRIU_GITID_PATCHLEVEL 0
46 #define CRIU_IN_FLIGHT_SUPPORT "2.4"
47 #define CRIU_EXTERNAL_NOT_VETH "2.8"
48 #define CRIU_EXTERNAL_NETDEV "3.15"
50 lxc_log_define(criu
, lxc
);
53 /* the thing to hook to stdout and stderr for logging */
56 /* The type of criu invocation, one of "dump" or "restore" */
59 /* the user-provided migrate options relevant to this action */
60 struct migrate_opts
*user
;
62 /* The container to dump */
63 struct lxc_container
*c
;
65 /* dump: stop the container or not after dumping? */
66 char tty_id
[32]; /* the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]" */
68 /* restore: the file to write the init process' pid into */
69 struct lxc_handler
*handler
;
71 /* The path that is bind mounted from /dev/console, if any. We don't
72 * want to use `--ext-mount-map auto`'s result here because the pty
73 * device may have a different path (e.g. if the pty number is
74 * different) on the target host. NULL if lxc.console.path = "none".
78 /* The detected version of criu */
82 static int load_tty_major_minor(char *directory
, char *output
, int len
)
87 ret
= strnprintf(path
, sizeof(path
), "%s/tty.info", directory
);
89 return ret_errno(EIO
);
91 ret
= lxc_read_from_file(path
, output
, len
);
94 * This means we're coming from a liblxc which didn't export
95 * the tty info. In this case they had to have lxc.console.path
96 * = * none, so there's no problem restoring.
101 return log_error_errno(-errno
, errno
, "Failed to open \"%s\"", path
);
107 static int cmp_version(const char *v1
, const char *v2
)
110 int oct_v1
[3], oct_v2
[3];
112 memset(oct_v1
, -1, sizeof(oct_v1
));
113 memset(oct_v2
, -1, sizeof(oct_v2
));
115 ret
= sscanf(v1
, "%d.%d.%d", &oct_v1
[0], &oct_v1
[1], &oct_v1
[2]);
119 ret
= sscanf(v2
, "%d.%d.%d", &oct_v2
[0], &oct_v2
[1], &oct_v2
[2]);
123 /* Major version is greater. */
124 if (oct_v1
[0] > oct_v2
[0])
127 if (oct_v1
[0] < oct_v2
[0])
130 /* Minor number is greater.*/
131 if (oct_v1
[1] > oct_v2
[1])
134 if (oct_v1
[1] < oct_v2
[1])
137 /* Patch number is greater. */
138 if (oct_v1
[2] > oct_v2
[2])
141 /* Patch numbers are equal. */
142 if (oct_v1
[2] == oct_v2
[2])
148 struct criu_exec_args
{
153 static void put_criu_exec_args(struct criu_exec_args
*args
)
156 for (int i
= 0; i
< args
->argc
; i
++)
157 free_disarm(args
->argv
[i
]);
162 define_cleanup_function(struct criu_exec_args
*, put_criu_exec_args
);
164 static int exec_criu(struct cgroup_ops
*cgroup_ops
, struct lxc_conf
*conf
,
165 struct criu_opts
*opts
)
167 call_cleaner(put_criu_exec_args
) struct criu_exec_args
*args
= NULL
;
168 __do_fclose
FILE *f_mnt
= NULL
;
170 int static_args
= 23, ret
;
172 struct mntent mntent
;
173 struct lxc_netdev
*netdev
;
174 struct string_entry
*strentry
;
176 char buf
[4096], ttys
[32];
178 /* If we are currently in a cgroup /foo/bar, and the container is in a
179 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
180 * container has an open fd that points to one of the cgroup files
181 * (systemd always opens its "root" cgroup). So, let's escape to the
182 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
185 if (!cgroup_ops
->criu_escape(cgroup_ops
, conf
))
186 return log_error_errno(-ENOENT
, ENOENT
, "Failed to escape to root cgroup");
188 /* The command line always looks like:
189 * criu $(action) --tcp-established --file-locks --link-remap \
190 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
191 * -o $(directory)/$(action).log --ext-mount-map auto
192 * --enable-external-sharing --enable-external-masters
193 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
194 * +1 for final NULL */
196 if (strequal(opts
->action
, "dump") || strequal(opts
->action
, "pre-dump")) {
197 /* -t pid --freeze-cgroup /lxc/ct */
200 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
201 if (opts
->user
->predump_dir
)
204 /* --page-server --address <address> --port <port> */
205 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
)
208 /* --leave-running (only for final dump) */
209 if (strequal(opts
->action
, "dump") && !opts
->user
->stop
)
212 /* --external tty[88,4] */
217 if (!opts
->user
->preserves_inodes
)
220 /* --ghost-limit 1024 */
221 if (opts
->user
->ghost_limit
)
223 } else if (strequal(opts
->action
, "restore")) {
224 /* --root $(lxc_mount_point) --restore-detached
226 * --lsm-profile apparmor:whatever
231 if (load_tty_major_minor(opts
->user
->directory
, ttys
, sizeof(ttys
)))
232 return log_error_errno(-EINVAL
, EINVAL
, "Failed to load tty information");
234 /* --inherit-fd fd[%d]:tty[%s] */
238 static_args
+= list_len(netdev
, &opts
->c
->lxc_conf
->netdevs
, head
) * 2;
240 return log_error_errno(-EINVAL
, EINVAL
, "Invalid criu operation specified");
243 if (cgroup_ops
->criu_num_hierarchies(cgroup_ops
) > 0)
244 static_args
+= 2 * cgroup_ops
->criu_num_hierarchies(cgroup_ops
);
246 if (opts
->user
->verbose
)
249 if (opts
->user
->action_script
)
252 static_args
+= 2 * list_len(strentry
, &opts
->c
->lxc_conf
->mount_entries
, head
);
254 ret
= strnprintf(log
, sizeof(log
), "%s/%s.log", opts
->user
->directory
, opts
->action
);
256 return ret_errno(EIO
);
258 args
= zalloc(sizeof(struct criu_exec_args
) + (static_args
* sizeof(char **)));
260 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to allocate static arguments");
262 #define DECLARE_ARG(arg) \
265 return log_error_errno(-EINVAL, EINVAL, \
266 "Got NULL argument for criu"); \
267 args->argv[(args->argc)++] = strdup(arg); \
268 if (!args->argv[args->argc - 1]) \
269 return log_error_errno(-ENOMEM, ENOMEM, \
270 "Failed to duplicate argumen %s", arg); \
273 args
->argv
[(args
->argc
)++] = on_path("criu", NULL
);
274 if (!args
->argv
[args
->argc
- 1])
275 return log_error_errno(-ENOENT
, ENOENT
, "Failed to find criu binary");
277 DECLARE_ARG(opts
->action
);
278 DECLARE_ARG("--tcp-established");
279 DECLARE_ARG("--file-locks");
280 DECLARE_ARG("--link-remap");
281 DECLARE_ARG("--manage-cgroups=full");
282 DECLARE_ARG("--ext-mount-map");
284 DECLARE_ARG("--enable-external-sharing");
285 DECLARE_ARG("--enable-external-masters");
286 DECLARE_ARG("--enable-fs");
287 DECLARE_ARG("hugetlbfs");
288 DECLARE_ARG("--enable-fs");
289 DECLARE_ARG("tracefs");
291 DECLARE_ARG(opts
->user
->directory
);
295 for (int i
= 0; i
< cgroup_ops
->criu_num_hierarchies(cgroup_ops
); i
++) {
296 __do_free
char *cgroup_base_path
= NULL
, *controllers
;
297 char **controllers_list
= NULL
;
300 if (!cgroup_ops
->criu_get_hierarchies(cgroup_ops
, i
, &controllers_list
))
301 return log_error_errno(-ENOENT
, ENOENT
, "Failed to retrieve cgroup hierarchies %d", i
);
304 * If we are in a dump, we have to ask the monitor process what
305 * the right cgroup is. if this is a restore, we can just use
306 * the handler the restore task created.
308 if (strequal(opts
->action
, "dump") || strequal(opts
->action
, "pre-dump")) {
309 cgroup_base_path
= lxc_cmd_get_limit_cgroup_path(opts
->c
->name
, opts
->c
->config_path
, controllers_list
[0]);
310 if (!cgroup_base_path
)
311 return log_error_errno(-ENOENT
, ENOENT
, "Failed to retrieve limit cgroup path for %s", controllers_list
[0] ?: "(null)");
315 p
= cgroup_ops
->get_limit_cgroup(cgroup_ops
, controllers_list
[0]);
317 return log_error_errno(-ENOENT
, ENOENT
, "Failed to retrieve limit cgroup path for %s", controllers_list
[0] ?: "(null)");
319 cgroup_base_path
= strdup(p
);
320 if (!cgroup_base_path
)
321 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to duplicate limit cgroup path");
324 tmp
= lxc_path_simplify(cgroup_base_path
);
326 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to remove extraneous slashes from \"%s\"", cgroup_base_path
);
327 free_move_ptr(cgroup_base_path
, tmp
);
329 if (controllers_list
[0]) {
330 controllers
= lxc_string_join(",", (const char **)controllers_list
, false);
332 return log_error_errno(-ENOMEM
, ENOMEM
, "Failed to join controllers");
334 ret
= sprintf(buf
, "%s:%s", controllers
, cgroup_base_path
);
336 WARN("No cgroup controllers configured in container's cgroup %s", cgroup_base_path
);
337 ret
= sprintf(buf
, "%s", cgroup_base_path
);
339 if (ret
< 0 || (size_t)ret
>= sizeof(buf
))
340 return log_error_errno(-EIO
, EIO
, "sprintf of cgroup root arg failed");
342 DECLARE_ARG("--cgroup-root");
346 if (opts
->user
->verbose
)
349 if (opts
->user
->action_script
) {
350 DECLARE_ARG("--action-script");
351 DECLARE_ARG(opts
->user
->action_script
);
354 f_mnt
= make_anonymous_mount_file(&opts
->c
->lxc_conf
->mount_entries
,
355 opts
->c
->lxc_conf
->lsm_aa_allow_nesting
);
357 return log_error_errno(-ENOENT
, ENOENT
, "Failed to create anonymous mount file");
359 while (getmntent_r(f_mnt
, &mntent
, buf
, sizeof(buf
))) {
360 __do_free
char *mnt_options
= NULL
;
361 unsigned long flags
= 0;
362 char arg
[2 * PATH_MAX
+ 2];
364 if (parse_mntopts_legacy(mntent
.mnt_opts
, &flags
, &mnt_options
) < 0)
365 return log_error_errno(-EINVAL
, EINVAL
, "Failed to parse mount options");
367 /* only add --ext-mount-map for actual bind mounts */
368 if (!(flags
& MS_BIND
))
371 if (strequal(opts
->action
, "dump"))
372 ret
= strnprintf(arg
, sizeof(arg
), "/%s:%s", mntent
.mnt_dir
, mntent
.mnt_dir
);
374 ret
= strnprintf(arg
, sizeof(arg
), "%s:%s", mntent
.mnt_dir
, mntent
.mnt_fsname
);
376 return log_error_errno(-EIO
, EIO
, "Failed to create mount entry");
378 DECLARE_ARG("--ext-mount-map");
382 if (strequal(opts
->action
, "dump") || strequal(opts
->action
, "pre-dump")) {
384 char init_pid_str
[INTTYPE_TO_STRLEN(int)];
385 char *freezer_relative
;
387 init_pid
= opts
->c
->init_pid(opts
->c
);
389 return log_error_errno(-ESRCH
, ESRCH
, "Failed to retrieve init pid of container");
391 ret
= strnprintf(init_pid_str
, sizeof(init_pid_str
), "%d", init_pid
);
393 return log_error_errno(-EIO
, EIO
, "Failed to create entry for init pid of container");
396 DECLARE_ARG(init_pid_str
);
398 freezer_relative
= lxc_cmd_get_limit_cgroup_path(opts
->c
->name
,
399 opts
->c
->config_path
,
401 if (!freezer_relative
)
402 return log_error_errno(-ENOENT
, ENOENT
, "Failed getting freezer path");
404 if (pure_unified_layout(cgroup_ops
))
405 ret
= strnprintf(log
, sizeof(log
), "/sys/fs/cgroup/%s", freezer_relative
);
407 ret
= strnprintf(log
, sizeof(log
), "/sys/fs/cgroup/freezer/%s", freezer_relative
);
409 return log_error_errno(-EIO
, EIO
, "Failed to freezer cgroup entry");
411 if (!opts
->user
->disable_skip_in_flight
&&
412 strcmp(opts
->criu_version
, CRIU_IN_FLIGHT_SUPPORT
) >= 0)
413 DECLARE_ARG("--skip-in-flight");
415 DECLARE_ARG("--freeze-cgroup");
418 if (opts
->tty_id
[0]) {
419 DECLARE_ARG("--ext-mount-map");
420 DECLARE_ARG("/dev/console:console");
422 DECLARE_ARG("--external");
423 DECLARE_ARG(opts
->tty_id
);
426 if (opts
->user
->predump_dir
) {
427 DECLARE_ARG("--prev-images-dir");
428 DECLARE_ARG(opts
->user
->predump_dir
);
429 DECLARE_ARG("--track-mem");
432 if (opts
->user
->pageserver_address
&& opts
->user
->pageserver_port
) {
433 DECLARE_ARG("--page-server");
434 DECLARE_ARG("--address");
435 DECLARE_ARG(opts
->user
->pageserver_address
);
436 DECLARE_ARG("--port");
437 DECLARE_ARG(opts
->user
->pageserver_port
);
440 if (!opts
->user
->preserves_inodes
)
441 DECLARE_ARG("--force-irmap");
443 if (opts
->user
->ghost_limit
) {
444 char ghost_limit
[32];
446 ret
= sprintf(ghost_limit
, "%"PRIu64
, opts
->user
->ghost_limit
);
447 if (ret
< 0 || (size_t)ret
>= sizeof(ghost_limit
))
448 return log_error_errno(-EIO
, EIO
, "Failed to print ghost limit %"PRIu64
, opts
->user
->ghost_limit
);
450 DECLARE_ARG("--ghost-limit");
451 DECLARE_ARG(ghost_limit
);
454 /* only for final dump */
455 if (strequal(opts
->action
, "dump") && !opts
->user
->stop
)
456 DECLARE_ARG("--leave-running");
457 } else if (strequal(opts
->action
, "restore")) {
458 struct lxc_conf
*lxc_conf
= opts
->c
->lxc_conf
;
460 DECLARE_ARG("--root");
461 DECLARE_ARG(opts
->c
->lxc_conf
->rootfs
.mount
);
462 DECLARE_ARG("--restore-detached");
463 DECLARE_ARG("--restore-sibling");
466 if (opts
->console_fd
< 0)
467 return log_error_errno(-EINVAL
, EINVAL
, "lxc.console.path configured on source host but not target");
469 ret
= strnprintf(buf
, sizeof(buf
), "fd[%d]:%s", opts
->console_fd
, ttys
);
471 return log_error_errno(-EIO
, EIO
, "Failed to create console entry");
473 DECLARE_ARG("--inherit-fd");
476 if (opts
->console_name
) {
477 if (strnprintf(buf
, sizeof(buf
), "console:%s", opts
->console_name
) < 0)
478 return log_error_errno(-EIO
, EIO
, "Failed to create console entry");
480 DECLARE_ARG("--ext-mount-map");
484 if (lxc_conf
->lsm_aa_profile
|| lxc_conf
->lsm_se_context
) {
486 if (lxc_conf
->lsm_aa_profile
)
487 ret
= strnprintf(buf
, sizeof(buf
), "apparmor:%s", lxc_conf
->lsm_aa_profile
);
489 ret
= strnprintf(buf
, sizeof(buf
), "selinux:%s", lxc_conf
->lsm_se_context
);
491 return log_error_errno(-EIO
, EIO
, "Failed to create lsm entry");
493 DECLARE_ARG("--lsm-profile");
497 list_for_each_entry(netdev
, &opts
->c
->lxc_conf
->netdevs
, head
) {
499 char eth
[128], *veth
;
500 bool external_not_veth
;
502 if (cmp_version(opts
->criu_version
, CRIU_EXTERNAL_NOT_VETH
) >= 0) {
503 /* Since criu version 2.8 the usage of --veth-pair
504 * has been deprecated:
505 * git tag --contains f2037e6d3445fc400
507 external_not_veth
= true;
509 external_not_veth
= false;
512 if (netdev
->name
[0] != '\0') {
513 retlen
= strlcpy(eth
, netdev
->name
, sizeof(eth
));
514 if (retlen
>= sizeof(eth
))
515 return log_error_errno(-E2BIG
, E2BIG
, "Failed to append veth device name");
517 ret
= strnprintf(eth
, sizeof(eth
), "eth%d", netnr
);
519 return log_error_errno(-E2BIG
, E2BIG
, "Failed to append veth device name");
522 switch (netdev
->type
) {
524 veth
= netdev
->priv
.veth_attr
.pair
;
526 veth
= netdev
->priv
.veth_attr
.veth1
;
528 if (netdev
->link
[0] != '\0') {
529 if (external_not_veth
)
530 ret
= strnprintf(buf
, sizeof(buf
), "veth[%s]:%s@%s", eth
, veth
, netdev
->link
);
532 ret
= strnprintf(buf
, sizeof(buf
), "%s=%s@%s", eth
, veth
, netdev
->link
);
534 if (external_not_veth
)
535 ret
= strnprintf(buf
, sizeof(buf
), "veth[%s]:%s", eth
, veth
);
537 ret
= strnprintf(buf
, sizeof(buf
), "%s=%s", eth
, veth
);
540 return log_error_errno(-EIO
, EIO
, "Failed to append veth device name");
542 TRACE("Added veth device entry %s", buf
);
544 case LXC_NET_MACVLAN
:
545 if (netdev
->link
[0] == '\0')
546 return log_error_errno(-EINVAL
, EINVAL
, "Failed to find host interface for macvlan %s", netdev
->name
);
548 ret
= strnprintf(buf
, sizeof(buf
), "macvlan[%s]:%s", eth
, netdev
->link
);
550 return log_error_errno(-EIO
, EIO
, "Failed to add macvlan entry");
552 TRACE("Added macvlan device entry %s", buf
);
556 if (cmp_version(opts
->criu_version
, CRIU_EXTERNAL_NETDEV
) < 0)
557 return syserror_set(-EOPNOTSUPP
, "Restoring physical network devices not supported");
559 if (is_empty_string(netdev
->link
))
560 return syserror_set(-EINVAL
, "Specifying link is required");
562 ret
= strnprintf(buf
, sizeof(buf
), "netdev[%s]:%s", eth
, netdev
->link
);
564 return syserror_set(-EIO
, "Failed to append phys device name");
566 TRACE("Added phys device entry %s", buf
);
573 /* we have screened for this earlier... */
574 return log_error_errno(-EINVAL
, EINVAL
, "Unsupported network type %d", netdev
->type
);
577 if (external_not_veth
)
578 DECLARE_ARG("--external");
580 DECLARE_ARG("--veth-pair");
587 args
->argv
[args
->argc
] = NULL
;
589 if (lxc_log_trace()) {
591 for (int i
= 0, pos
= 0; i
< args
->argc
&& args
->argv
[i
]; i
++) {
592 ret
= strnprintf(buf
+ pos
, sizeof(buf
) - pos
, "%s ", args
->argv
[i
]);
594 return log_error_errno(-EIO
, EIO
, "Failed to reorder entries");
599 TRACE("Using command line %s", buf
);
602 /* before criu inits its log, it sometimes prints things to stdout/err;
603 * let's be sure we capture that.
605 if (dup2(opts
->pipefd
, STDOUT_FILENO
) < 0)
606 return log_error_errno(-errno
, errno
, "Failed to duplicate stdout");
608 if (dup2(opts
->pipefd
, STDERR_FILENO
) < 0)
609 return log_error_errno(-errno
, errno
, "Failed to duplicate stderr");
614 execv(args
->argv
[0], args
->argv
);
619 * Function to check if the checks activated in 'features_to_check' are
620 * available with the current architecture/kernel/criu combination.
622 * Parameter features_to_check is a bit mask of all features that should be
623 * checked (see feature check defines in lxc/lxccontainer.h).
625 * If the return value is true, all requested features are supported. If
626 * the return value is false the features_to_check parameter is updated
627 * to reflect which features are available. '0' means no feature but
628 * also that something went totally wrong.
630 * Some of the code flow of criu_version_ok() is duplicated and maybe it
631 * is a good candidate for refactoring.
633 bool __criu_check_feature(uint64_t *features_to_check
)
636 uint64_t current_bit
= 0;
638 uint64_t features
= *features_to_check
;
639 /* Feature checking is currently always like
640 * criu check --feature <feature-name>
642 char *args
[] = { "criu", "check", "--feature", NULL
, NULL
};
644 if ((features
& ~FEATURE_MEM_TRACK
& ~FEATURE_LAZY_PAGES
) != 0) {
645 /* There are feature bits activated we do not understand.
646 * Refusing to answer at all */
647 *features_to_check
= 0;
651 while (current_bit
< (sizeof(uint64_t) * 8 - 1)) {
652 /* only test requested features */
653 if (!(features
& (1ULL << current_bit
))) {
661 SYSERROR("fork() failed");
662 *features_to_check
= 0;
667 if ((1ULL << current_bit
) == FEATURE_MEM_TRACK
)
668 /* This is needed for pre-dump support, which
669 * enables pre-copy migration. */
670 args
[3] = "mem_dirty_track";
671 else if ((1ULL << current_bit
) == FEATURE_LAZY_PAGES
)
672 /* CRIU has two checks for userfaultfd support.
674 * The simpler check is only for 'uffd'. If the
675 * kernel supports userfaultfd without noncoop
676 * then only process can be lazily restored
677 * which do not fork. With 'uffd-noncoop'
678 * it is also possible to lazily restore processes
679 * which do fork. For a container runtime like
680 * LXC checking only for 'uffd' makes not much sense. */
681 args
[3] = "uffd-noncoop";
687 execvp("criu", args
);
688 SYSERROR("Failed to exec \"criu\"");
692 ret
= wait_for_pid(pid
);
695 /* It is not known why CRIU failed. Either
696 * CRIU is not available, the feature check
697 * does not exist or the feature is not
699 INFO("feature not supported");
700 /* Clear not supported feature bit */
701 features
&= ~(1ULL << current_bit
);
705 /* no more checks requested; exit check loop */
706 if (!(features
& ~((1ULL << current_bit
)-1)))
709 if (features
!= *features_to_check
) {
710 *features_to_check
= features
;
717 * Check to see if the criu version is recent enough for all the features we
718 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
719 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
720 * things potentially before a version is released with a particular feature.
722 * The intent is that when criu development slows down, we can drop this, but
723 * for now we shouldn't attempt to c/r with versions that we know won't work.
725 * Note: If version != NULL criu_version() stores the detected criu version in
726 * version. Allocates memory for version which must be freed by caller.
728 static bool criu_version_ok(char **version
)
733 if (pipe(pipes
) < 0) {
734 SYSERROR("pipe() failed");
740 SYSERROR("fork() failed");
745 char *args
[] = { "criu", "--version", NULL
};
749 close(STDERR_FILENO
);
750 if (dup2(pipes
[1], STDOUT_FILENO
) < 0)
753 path
= on_path("criu", NULL
);
765 if (wait_for_pid(pid
) < 0) {
767 SYSERROR("execing criu failed, is it installed?");
771 f
= fdopen(pipes
[0], "re");
783 if (fscanf(f
, "Version: %1023[^\n]s", tmp
) != 1)
786 if (fgetc(f
) != '\n')
789 if (strcmp(tmp
, CRIU_VERSION
) >= 0)
792 if (fscanf(f
, "GitID: v%1023[^-]s", tmp
) != 1)
798 if (fscanf(f
, "%d", &patch
) != 1)
801 if (strcmp(tmp
, CRIU_GITID_VERSION
) < 0)
804 if (patch
< CRIU_GITID_PATCHLEVEL
)
818 ERROR("must have criu " CRIU_VERSION
" or greater to checkpoint/restore");
823 /* Check and make sure the container has a configuration that we know CRIU can
825 static bool criu_ok(struct lxc_container
*c
, char **criu_version
)
827 struct lxc_netdev
*netdev
;
830 ERROR("Must be root to checkpoint");
834 if (!criu_version_ok(criu_version
))
837 /* We only know how to restore containers with veth networks. */
838 list_for_each_entry(netdev
, &c
->lxc_conf
->netdevs
, head
) {
839 switch(netdev
->type
) {
844 case LXC_NET_MACVLAN
:
847 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(netdev
->type
), netdev
->name
);
850 *criu_version
= NULL
;
859 static bool restore_net_info(struct lxc_container
*c
)
862 bool has_error
= true;
863 struct lxc_netdev
*netdev
;
865 if (container_mem_lock(c
))
868 list_for_each_entry(netdev
, &c
->lxc_conf
->netdevs
, head
) {
869 char template[IFNAMSIZ
];
871 if (netdev
->type
!= LXC_NET_VETH
)
874 ret
= strnprintf(template, sizeof(template), "vethXXXXXX");
878 if (netdev
->priv
.veth_attr
.pair
[0] == '\0' &&
879 netdev
->priv
.veth_attr
.veth1
[0] == '\0') {
880 if (!lxc_ifname_alnum_case_sensitive(template))
883 (void)strlcpy(netdev
->priv
.veth_attr
.veth1
, template, IFNAMSIZ
);
890 container_mem_unlock(c
);
894 /* do_restore never returns, the calling process is used as the monitor process.
895 * do_restore calls _exit() if it fails.
897 static void do_restore(struct lxc_container
*c
, int status_pipe
, struct migrate_opts
*opts
, char *criu_version
)
901 struct lxc_handler
*handler
;
903 int pipes
[2] = {-1, -1};
904 struct cgroup_ops
*cgroup_ops
;
906 /* Try to detach from the current controlling tty if it exists.
907 * Otherwise, lxc_init (via lxc_console) will attach the container's
908 * console output to the current tty, which is probably not what any
909 * library user wants, and if they do, they can just manually configure
912 fd
= open("/dev/tty", O_RDWR
);
914 if (ioctl(fd
, TIOCNOTTY
, NULL
) < 0)
915 SYSERROR("couldn't detach from tty");
919 handler
= lxc_init_handler(NULL
, c
->name
, c
->lxc_conf
, c
->config_path
, false);
923 if (lxc_init(c
->name
, handler
) < 0)
925 cgroup_ops
= handler
->cgroup_ops
;
927 if (!cgroup_ops
->monitor_create(cgroup_ops
, handler
)) {
928 ERROR("Failed to create monitor cgroup");
929 goto out_fini_handler
;
932 if (!cgroup_ops
->monitor_enter(cgroup_ops
, handler
)) {
933 ERROR("Failed to enter monitor cgroup");
934 goto out_fini_handler
;
937 if (!cgroup_ops
->monitor_delegate_controllers(cgroup_ops
)) {
938 ERROR("Failed to delegate controllers to monitor cgroup");
939 goto out_fini_handler
;
942 if (!cgroup_ops
->payload_create(cgroup_ops
, handler
)) {
943 ERROR("Failed creating cgroups");
944 goto out_fini_handler
;
947 if (!restore_net_info(c
)) {
948 ERROR("failed restoring network info");
949 goto out_fini_handler
;
952 ret
= resolve_clone_flags(handler
);
954 SYSERROR("Unsupported clone flag specified");
955 goto out_fini_handler
;
958 if (pipe2(pipes
, O_CLOEXEC
) < 0) {
959 SYSERROR("pipe() failed");
960 goto out_fini_handler
;
965 goto out_fini_handler
;
969 struct lxc_rootfs
*rootfs
;
978 if (unshare(CLONE_NEWNS
))
979 goto out_fini_handler
;
981 ret
= lxc_storage_prepare(c
->lxc_conf
);
983 goto out_fini_handler
;
985 /* CRIU needs the lxc root bind mounted so that it is the root of some
987 rootfs
= &c
->lxc_conf
->rootfs
;
989 if (rootfs_is_blockdev(c
->lxc_conf
)) {
990 if (lxc_setup_rootfs_prepare_root(c
->lxc_conf
, c
->name
,
992 goto out_fini_handler
;
994 if (mkdir(rootfs
->mount
, 0755) < 0 && errno
!= EEXIST
)
995 goto out_fini_handler
;
997 if (mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
) < 0) {
998 SYSERROR("remount / to private failed");
999 goto out_fini_handler
;
1002 if (mount(rootfs
->path
, rootfs
->mount
, NULL
, MS_BIND
, NULL
) < 0) {
1003 (void)rmdir(rootfs
->mount
);
1004 goto out_fini_handler
;
1008 os
.pipefd
= pipes
[1];
1009 os
.action
= "restore";
1012 os
.console_fd
= c
->lxc_conf
->console
.pty
;
1013 os
.criu_version
= criu_version
;
1014 os
.handler
= handler
;
1016 if (os
.console_fd
>= 0) {
1017 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1018 * via --inherit-fd, so we don't want it to close.
1020 flags
= fcntl(os
.console_fd
, F_GETFD
);
1022 SYSERROR("F_GETFD failed: %d", os
.console_fd
);
1023 goto out_fini_handler
;
1026 flags
&= ~FD_CLOEXEC
;
1028 if (fcntl(os
.console_fd
, F_SETFD
, flags
) < 0) {
1029 SYSERROR("F_SETFD failed");
1030 goto out_fini_handler
;
1033 os
.console_name
= c
->lxc_conf
->console
.name
;
1035 /* exec_criu() returning is an error */
1036 ret
= exec_criu(handler
->cgroup_ops
, c
->lxc_conf
, &os
);
1038 SYSERROR("Failed to execute criu");
1039 umount(rootfs
->mount
);
1040 (void)rmdir(rootfs
->mount
);
1041 goto out_fini_handler
;
1048 pid_t w
= waitpid(pid
, &status
, 0);
1050 SYSERROR("waitpid");
1051 goto out_fini_handler
;
1054 if (WIFEXITED(status
)) {
1057 if (WEXITSTATUS(status
)) {
1060 n
= lxc_read_nointr(pipes
[0], buf
, sizeof(buf
));
1062 SYSERROR("failed reading from criu stderr");
1063 goto out_fini_handler
;
1066 if (n
== sizeof(buf
))
1070 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status
), buf
);
1071 goto out_fini_handler
;
1073 ret
= strnprintf(buf
, sizeof(buf
), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid
));
1075 ERROR("strnprintf'd too many characters: %d", ret
);
1076 goto out_fini_handler
;
1079 FILE *f
= fopen(buf
, "re");
1081 SYSERROR("couldn't read restore's children file %s", buf
);
1082 goto out_fini_handler
;
1085 ret
= fscanf(f
, "%d", (int*) &handler
->pid
);
1088 ERROR("reading restore pid failed");
1089 goto out_fini_handler
;
1092 if (lxc_set_state(c
->name
, handler
, RUNNING
)) {
1093 ERROR("error setting running state after restore");
1094 goto out_fini_handler
;
1098 ERROR("CRIU was killed with signal %d", WTERMSIG(status
));
1099 goto out_fini_handler
;
1104 ret
= lxc_write_nointr(status_pipe
, &status
, sizeof(status
));
1108 if (sizeof(status
) != ret
) {
1109 SYSERROR("failed to write all of status");
1110 goto out_fini_handler
;
1114 * See comment in lxcapi_start; we don't care if these
1115 * fail because it's just a beauty thing. We just
1116 * assign the return here to silence potential.
1118 ret
= strnprintf(title
, sizeof(title
), "[lxc monitor] %s %s", c
->config_path
, c
->name
);
1120 INFO("Setting truncated process name");
1122 ret
= setproctitle(title
);
1124 INFO("Failed to set process name");
1126 ret
= lxc_poll(c
->name
, handler
);
1142 if (status_pipe
>= 0) {
1143 /* ensure getting here was a failure, e.g. if we failed to
1144 * parse the child pid or something, even after a successful
1150 if (lxc_write_nointr(status_pipe
, &status
, sizeof(status
)) != sizeof(status
))
1151 SYSERROR("writing status failed");
1155 _exit(EXIT_FAILURE
);
1158 static int save_tty_major_minor(char *directory
, struct lxc_container
*c
, char *tty_id
, int len
)
1161 char path
[PATH_MAX
];
1165 if (c
->lxc_conf
->console
.path
&& strequal(c
->lxc_conf
->console
.path
, "none")) {
1170 ret
= strnprintf(path
, sizeof(path
), "/proc/%d/root/dev/console", c
->init_pid(c
));
1172 ERROR("strnprintf'd too many characters: %d", ret
);
1176 ret
= stat(path
, &sb
);
1178 SYSERROR("stat of %s failed", path
);
1182 ret
= strnprintf(path
, sizeof(path
), "%s/tty.info", directory
);
1184 ERROR("strnprintf'd too many characters: %d", ret
);
1188 ret
= strnprintf(tty_id
, len
, "tty[%llx:%llx]",
1189 (long long unsigned) sb
.st_rdev
,
1190 (long long unsigned) sb
.st_dev
);
1192 ERROR("strnprintf'd too many characters: %d", ret
);
1196 f
= fopen(path
, "we");
1198 SYSERROR("failed to open %s", path
);
1202 ret
= fprintf(f
, "%s", tty_id
);
1205 SYSERROR("failed to write to %s", path
);
1209 /* do one of either predump or a regular dump */
1210 static bool do_dump(struct lxc_container
*c
, char *mode
, struct migrate_opts
*opts
)
1215 char *criu_version
= NULL
;
1217 if (!criu_ok(c
, &criu_version
))
1220 ret
= pipe(criuout
);
1222 SYSERROR("pipe() failed");
1227 if (lxc_mkdir_p(opts
->directory
, 0700) < 0)
1232 SYSERROR("fork failed");
1237 struct criu_opts os
;
1238 struct cgroup_ops
*cgroup_ops
;
1242 cgroup_ops
= cgroup_init(c
->lxc_conf
);
1244 ERROR("failed to cgroup_init()");
1245 _exit(EXIT_FAILURE
);
1248 os
.pipefd
= criuout
[1];
1252 os
.console_name
= c
->lxc_conf
->console
.path
;
1253 os
.criu_version
= criu_version
;
1256 ret
= save_tty_major_minor(opts
->directory
, c
, os
.tty_id
, sizeof(os
.tty_id
));
1259 _exit(EXIT_FAILURE
);
1262 /* exec_criu() returning is an error */
1263 ret
= exec_criu(cgroup_ops
, c
->lxc_conf
, &os
);
1265 SYSERROR("Failed to execute criu");
1267 _exit(EXIT_FAILURE
);
1275 pid_t w
= waitpid(pid
, &status
, 0);
1277 SYSERROR("waitpid");
1283 n
= lxc_read_nointr(criuout
[0], buf
, sizeof(buf
));
1290 if (n
== sizeof(buf
))
1295 if (WIFEXITED(status
)) {
1296 if (WEXITSTATUS(status
)) {
1297 ERROR("dump failed with %d", WEXITSTATUS(status
));
1302 } else if (WIFSIGNALED(status
)) {
1303 ERROR("dump signaled with %d", WTERMSIG(status
));
1306 ERROR("unknown dump exit %d", status
);
1311 ERROR("criu output: %s", buf
);
1319 (void)rmdir(opts
->directory
);
1324 bool __criu_pre_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
1326 return do_dump(c
, "pre-dump", opts
);
1329 bool __criu_dump(struct lxc_container
*c
, struct migrate_opts
*opts
)
1331 char path
[PATH_MAX
];
1334 ret
= strnprintf(path
, sizeof(path
), "%s/inventory.img", opts
->directory
);
1338 if (access(path
, F_OK
) == 0) {
1339 ERROR("please use a fresh directory for the dump directory");
1343 return do_dump(c
, "dump", opts
);
1346 bool __criu_restore(struct lxc_container
*c
, struct migrate_opts
*opts
)
1351 char *criu_version
= NULL
;
1354 ERROR("Must be root to restore");
1359 ERROR("failed to create pipe");
1363 if (!criu_ok(c
, &criu_version
)) {
1379 /* this never returns */
1380 do_restore(c
, pipefd
[1], opts
, criu_version
);
1386 nread
= lxc_read_nointr(pipefd
[0], &status
, sizeof(status
));
1388 if (sizeof(status
) != nread
) {
1389 ERROR("reading status from pipe failed");
1393 /* If the criu process was killed or exited nonzero, wait() for the
1394 * handler, since the restore process died. Otherwise, we don't need to
1395 * wait, since the child becomes the monitor process.
1397 if (!WIFEXITED(status
) || WEXITSTATUS(status
))
1402 if (wait_for_pid(pid
))
1403 ERROR("restore process died");