]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
github: Update for main branch
[mirror_lxc.git] / src / lxc / criu.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <inttypes.h>
6 #include <linux/limits.h>
7 #include <sched.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <sys/mount.h>
12 #include <sys/types.h>
13 #include <sys/wait.h>
14 #include <unistd.h>
15
16 #include "attach_options.h"
17
18 #include "cgroup.h"
19 #include "commands.h"
20 #include "conf.h"
21 #include "criu.h"
22 #include "log.h"
23 #include "lxc.h"
24 #include "lxclock.h"
25 #include "memory_utils.h"
26 #include "network.h"
27 #include "storage.h"
28 #include "syscall_wrappers.h"
29 #include "utils.h"
30
31 #if IS_BIONIC
32 #include "lxcmntent.h"
33 #else
34 #include <mntent.h>
35 #endif
36
37 #if !HAVE_STRLCPY
38 #include "strlcpy.h"
39 #endif
40
41 #define CRIU_VERSION "2.0"
42
43 #define CRIU_GITID_VERSION "2.0"
44 #define CRIU_GITID_PATCHLEVEL 0
45
46 #define CRIU_IN_FLIGHT_SUPPORT "2.4"
47 #define CRIU_EXTERNAL_NOT_VETH "2.8"
48 #define CRIU_EXTERNAL_NETDEV "3.15"
49
50 lxc_log_define(criu, lxc);
51
/* Everything exec_criu() needs to assemble and run one criu invocation. */
struct criu_opts {
	/* the thing to hook to stdout and stderr for logging */
	int pipefd;

	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: passed to criu via --external when non-empty */
	char tty_id[32]; /* the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]" */

	/* restore: the handler created by the restoring task */
	struct lxc_handler *handler;
	/* restore: console fd handed to criu via --inherit-fd, negative if none */
	int console_fd;
	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pty
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
81
82 static int load_tty_major_minor(char *directory, char *output, int len)
83 {
84 char path[PATH_MAX];
85 ssize_t ret;
86
87 ret = strnprintf(path, sizeof(path), "%s/tty.info", directory);
88 if (ret < 0)
89 return ret_errno(EIO);
90
91 ret = lxc_read_from_file(path, output, len);
92 if (ret < 0) {
93 /*
94 * This means we're coming from a liblxc which didn't export
95 * the tty info. In this case they had to have lxc.console.path
96 * = * none, so there's no problem restoring.
97 */
98 if (errno == ENOENT)
99 return 0;
100
101 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
102 }
103
104 return 0;
105 }
106
/*
 * Compare two dotted version strings of the form "major[.minor[.patch]]".
 *
 * A component that is missing from a string counts as -1 and therefore sorts
 * before an explicit 0, i.e. "2.0" is considered older than "2.0.0".
 *
 * Returns 1 if v1 is newer than v2, 0 if both are equal and -1 if v1 is older
 * than v2 or if either string does not start with a number.
 */
static int cmp_version(const char *v1, const char *v2)
{
	int lhs[3] = { -1, -1, -1 };
	int rhs[3] = { -1, -1, -1 };

	if (sscanf(v1, "%d.%d.%d", &lhs[0], &lhs[1], &lhs[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &rhs[0], &rhs[1], &rhs[2]) < 1)
		return -1;

	/* The first component that differs, most significant first, decides. */
	for (int i = 0; i < 3; i++) {
		if (lhs[i] != rhs[i])
			return lhs[i] > rhs[i] ? 1 : -1;
	}

	return 0;
}
147
/* Argument vector that exec_criu() builds and passes to execv(). */
struct criu_exec_args {
	/* number of argv slots currently filled */
	int argc;
	/* individually heap-allocated argument strings, freed by put_criu_exec_args() */
	char *argv[];
};
152
153 static void put_criu_exec_args(struct criu_exec_args *args)
154 {
155 if (args) {
156 for (int i = 0; i < args->argc; i++)
157 free_disarm(args->argv[i]);
158 free_disarm(args);
159 }
160 }
161
162 define_cleanup_function(struct criu_exec_args *, put_criu_exec_args);
163
164 static int exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
165 struct criu_opts *opts)
166 {
167 call_cleaner(put_criu_exec_args) struct criu_exec_args *args = NULL;
168 __do_fclose FILE *f_mnt = NULL;
169 char log[PATH_MAX];
170 int static_args = 23, ret;
171 int netnr = 0;
172 struct mntent mntent;
173 struct lxc_netdev *netdev;
174 struct string_entry *strentry;
175
176 char buf[4096], ttys[32];
177
178 /* If we are currently in a cgroup /foo/bar, and the container is in a
179 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
180 * container has an open fd that points to one of the cgroup files
181 * (systemd always opens its "root" cgroup). So, let's escape to the
182 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
183 * see all cgroups.
184 */
185 if (!cgroup_ops->criu_escape(cgroup_ops, conf))
186 return log_error_errno(-ENOENT, ENOENT, "Failed to escape to root cgroup");
187
188 /* The command line always looks like:
189 * criu $(action) --tcp-established --file-locks --link-remap \
190 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
191 * -o $(directory)/$(action).log --ext-mount-map auto
192 * --enable-external-sharing --enable-external-masters
193 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
194 * +1 for final NULL */
195
196 if (strequal(opts->action, "dump") || strequal(opts->action, "pre-dump")) {
197 /* -t pid --freeze-cgroup /lxc/ct */
198 static_args += 4;
199
200 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
201 if (opts->user->predump_dir)
202 static_args += 2;
203
204 /* --page-server --address <address> --port <port> */
205 if (opts->user->pageserver_address && opts->user->pageserver_port)
206 static_args += 5;
207
208 /* --leave-running (only for final dump) */
209 if (strequal(opts->action, "dump") && !opts->user->stop)
210 static_args++;
211
212 /* --external tty[88,4] */
213 if (opts->tty_id[0])
214 static_args += 2;
215
216 /* --force-irmap */
217 if (!opts->user->preserves_inodes)
218 static_args++;
219
220 /* --ghost-limit 1024 */
221 if (opts->user->ghost_limit)
222 static_args += 2;
223 } else if (strequal(opts->action, "restore")) {
224 /* --root $(lxc_mount_point) --restore-detached
225 * --restore-sibling
226 * --lsm-profile apparmor:whatever
227 */
228 static_args += 6;
229
230 ttys[0] = 0;
231 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
232 return log_error_errno(-EINVAL, EINVAL, "Failed to load tty information");
233
234 /* --inherit-fd fd[%d]:tty[%s] */
235 if (ttys[0])
236 static_args += 2;
237
238 static_args += list_len(netdev, &opts->c->lxc_conf->netdevs, head) * 2;
239 } else {
240 return log_error_errno(-EINVAL, EINVAL, "Invalid criu operation specified");
241 }
242
243 if (cgroup_ops->criu_num_hierarchies(cgroup_ops) > 0)
244 static_args += 2 * cgroup_ops->criu_num_hierarchies(cgroup_ops);
245
246 if (opts->user->verbose)
247 static_args++;
248
249 if (opts->user->action_script)
250 static_args += 2;
251
252 static_args += 2 * list_len(strentry, &opts->c->lxc_conf->mount_entries, head);
253
254 ret = strnprintf(log, sizeof(log), "%s/%s.log", opts->user->directory, opts->action);
255 if (ret < 0)
256 return ret_errno(EIO);
257
258 args = zalloc(sizeof(struct criu_exec_args) + (static_args * sizeof(char **)));
259 if (!args)
260 return log_error_errno(-ENOMEM, ENOMEM, "Failed to allocate static arguments");
261
262 #define DECLARE_ARG(arg) \
263 do { \
264 if (arg == NULL) \
265 return log_error_errno(-EINVAL, EINVAL, \
266 "Got NULL argument for criu"); \
267 args->argv[(args->argc)++] = strdup(arg); \
268 if (!args->argv[args->argc - 1]) \
269 return log_error_errno(-ENOMEM, ENOMEM, \
270 "Failed to duplicate argumen %s", arg); \
271 } while (0)
272
273 args->argv[(args->argc)++] = on_path("criu", NULL);
274 if (!args->argv[args->argc - 1])
275 return log_error_errno(-ENOENT, ENOENT, "Failed to find criu binary");
276
277 DECLARE_ARG(opts->action);
278 DECLARE_ARG("--tcp-established");
279 DECLARE_ARG("--file-locks");
280 DECLARE_ARG("--link-remap");
281 DECLARE_ARG("--manage-cgroups=full");
282 DECLARE_ARG("--ext-mount-map");
283 DECLARE_ARG("auto");
284 DECLARE_ARG("--enable-external-sharing");
285 DECLARE_ARG("--enable-external-masters");
286 DECLARE_ARG("--enable-fs");
287 DECLARE_ARG("hugetlbfs");
288 DECLARE_ARG("--enable-fs");
289 DECLARE_ARG("tracefs");
290 DECLARE_ARG("-D");
291 DECLARE_ARG(opts->user->directory);
292 DECLARE_ARG("-o");
293 DECLARE_ARG(log);
294
295 for (int i = 0; i < cgroup_ops->criu_num_hierarchies(cgroup_ops); i++) {
296 __do_free char *cgroup_base_path = NULL, *controllers;
297 char **controllers_list = NULL;
298 char *tmp;
299
300 if (!cgroup_ops->criu_get_hierarchies(cgroup_ops, i, &controllers_list))
301 return log_error_errno(-ENOENT, ENOENT, "Failed to retrieve cgroup hierarchies %d", i);
302
303 /*
304 * If we are in a dump, we have to ask the monitor process what
305 * the right cgroup is. if this is a restore, we can just use
306 * the handler the restore task created.
307 */
308 if (strequal(opts->action, "dump") || strequal(opts->action, "pre-dump")) {
309 cgroup_base_path = lxc_cmd_get_limit_cgroup_path(opts->c->name, opts->c->config_path, controllers_list[0]);
310 if (!cgroup_base_path)
311 return log_error_errno(-ENOENT, ENOENT, "Failed to retrieve limit cgroup path for %s", controllers_list[0] ?: "(null)");
312 } else {
313 const char *p;
314
315 p = cgroup_ops->get_limit_cgroup(cgroup_ops, controllers_list[0]);
316 if (!p)
317 return log_error_errno(-ENOENT, ENOENT, "Failed to retrieve limit cgroup path for %s", controllers_list[0] ?: "(null)");
318
319 cgroup_base_path = strdup(p);
320 if (!cgroup_base_path)
321 return log_error_errno(-ENOMEM, ENOMEM, "Failed to duplicate limit cgroup path");
322 }
323
324 tmp = lxc_path_simplify(cgroup_base_path);
325 if (!tmp)
326 return log_error_errno(-ENOMEM, ENOMEM, "Failed to remove extraneous slashes from \"%s\"", cgroup_base_path);
327 free_move_ptr(cgroup_base_path, tmp);
328
329 if (controllers_list[0]) {
330 controllers = lxc_string_join(",", (const char **)controllers_list, false);
331 if (!controllers)
332 return log_error_errno(-ENOMEM, ENOMEM, "Failed to join controllers");
333
334 ret = sprintf(buf, "%s:%s", controllers, cgroup_base_path);
335 } else {
336 WARN("No cgroup controllers configured in container's cgroup %s", cgroup_base_path);
337 ret = sprintf(buf, "%s", cgroup_base_path);
338 }
339 if (ret < 0 || (size_t)ret >= sizeof(buf))
340 return log_error_errno(-EIO, EIO, "sprintf of cgroup root arg failed");
341
342 DECLARE_ARG("--cgroup-root");
343 DECLARE_ARG(buf);
344 }
345
346 if (opts->user->verbose)
347 DECLARE_ARG("-v4");
348
349 if (opts->user->action_script) {
350 DECLARE_ARG("--action-script");
351 DECLARE_ARG(opts->user->action_script);
352 }
353
354 f_mnt = make_anonymous_mount_file(&opts->c->lxc_conf->mount_entries,
355 opts->c->lxc_conf->lsm_aa_allow_nesting);
356 if (!f_mnt)
357 return log_error_errno(-ENOENT, ENOENT, "Failed to create anonymous mount file");
358
359 while (getmntent_r(f_mnt, &mntent, buf, sizeof(buf))) {
360 __do_free char *mnt_options = NULL;
361 unsigned long flags = 0;
362 char arg[2 * PATH_MAX + 2];
363
364 if (parse_mntopts_legacy(mntent.mnt_opts, &flags, &mnt_options) < 0)
365 return log_error_errno(-EINVAL, EINVAL, "Failed to parse mount options");
366
367 /* only add --ext-mount-map for actual bind mounts */
368 if (!(flags & MS_BIND))
369 continue;
370
371 if (strequal(opts->action, "dump"))
372 ret = strnprintf(arg, sizeof(arg), "/%s:%s", mntent.mnt_dir, mntent.mnt_dir);
373 else
374 ret = strnprintf(arg, sizeof(arg), "%s:%s", mntent.mnt_dir, mntent.mnt_fsname);
375 if (ret < 0)
376 return log_error_errno(-EIO, EIO, "Failed to create mount entry");
377
378 DECLARE_ARG("--ext-mount-map");
379 DECLARE_ARG(arg);
380 }
381
382 if (strequal(opts->action, "dump") || strequal(opts->action, "pre-dump")) {
383 pid_t init_pid;
384 char init_pid_str[INTTYPE_TO_STRLEN(int)];
385 char *freezer_relative;
386
387 init_pid = opts->c->init_pid(opts->c);
388 if (init_pid < 0)
389 return log_error_errno(-ESRCH, ESRCH, "Failed to retrieve init pid of container");
390
391 ret = strnprintf(init_pid_str, sizeof(init_pid_str), "%d", init_pid);
392 if (ret < 0)
393 return log_error_errno(-EIO, EIO, "Failed to create entry for init pid of container");
394
395 DECLARE_ARG("-t");
396 DECLARE_ARG(init_pid_str);
397
398 freezer_relative = lxc_cmd_get_limit_cgroup_path(opts->c->name,
399 opts->c->config_path,
400 "freezer");
401 if (!freezer_relative)
402 return log_error_errno(-ENOENT, ENOENT, "Failed getting freezer path");
403
404 if (pure_unified_layout(cgroup_ops))
405 ret = strnprintf(log, sizeof(log), "/sys/fs/cgroup/%s", freezer_relative);
406 else
407 ret = strnprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
408 if (ret < 0)
409 return log_error_errno(-EIO, EIO, "Failed to freezer cgroup entry");
410
411 if (!opts->user->disable_skip_in_flight &&
412 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
413 DECLARE_ARG("--skip-in-flight");
414
415 DECLARE_ARG("--freeze-cgroup");
416 DECLARE_ARG(log);
417
418 if (opts->tty_id[0]) {
419 DECLARE_ARG("--ext-mount-map");
420 DECLARE_ARG("/dev/console:console");
421
422 DECLARE_ARG("--external");
423 DECLARE_ARG(opts->tty_id);
424 }
425
426 if (opts->user->predump_dir) {
427 DECLARE_ARG("--prev-images-dir");
428 DECLARE_ARG(opts->user->predump_dir);
429 DECLARE_ARG("--track-mem");
430 }
431
432 if (opts->user->pageserver_address && opts->user->pageserver_port) {
433 DECLARE_ARG("--page-server");
434 DECLARE_ARG("--address");
435 DECLARE_ARG(opts->user->pageserver_address);
436 DECLARE_ARG("--port");
437 DECLARE_ARG(opts->user->pageserver_port);
438 }
439
440 if (!opts->user->preserves_inodes)
441 DECLARE_ARG("--force-irmap");
442
443 if (opts->user->ghost_limit) {
444 char ghost_limit[32];
445
446 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
447 if (ret < 0 || (size_t)ret >= sizeof(ghost_limit))
448 return log_error_errno(-EIO, EIO, "Failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
449
450 DECLARE_ARG("--ghost-limit");
451 DECLARE_ARG(ghost_limit);
452 }
453
454 /* only for final dump */
455 if (strequal(opts->action, "dump") && !opts->user->stop)
456 DECLARE_ARG("--leave-running");
457 } else if (strequal(opts->action, "restore")) {
458 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
459
460 DECLARE_ARG("--root");
461 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
462 DECLARE_ARG("--restore-detached");
463 DECLARE_ARG("--restore-sibling");
464
465 if (ttys[0]) {
466 if (opts->console_fd < 0)
467 return log_error_errno(-EINVAL, EINVAL, "lxc.console.path configured on source host but not target");
468
469 ret = strnprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
470 if (ret < 0)
471 return log_error_errno(-EIO, EIO, "Failed to create console entry");
472
473 DECLARE_ARG("--inherit-fd");
474 DECLARE_ARG(buf);
475 }
476 if (opts->console_name) {
477 if (strnprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0)
478 return log_error_errno(-EIO, EIO, "Failed to create console entry");
479
480 DECLARE_ARG("--ext-mount-map");
481 DECLARE_ARG(buf);
482 }
483
484 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
485
486 if (lxc_conf->lsm_aa_profile)
487 ret = strnprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
488 else
489 ret = strnprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
490 if (ret < 0)
491 return log_error_errno(-EIO, EIO, "Failed to create lsm entry");
492
493 DECLARE_ARG("--lsm-profile");
494 DECLARE_ARG(buf);
495 }
496
497 list_for_each_entry(netdev, &opts->c->lxc_conf->netdevs, head) {
498 size_t retlen;
499 char eth[128], *veth;
500 bool external_not_veth;
501
502 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
503 /* Since criu version 2.8 the usage of --veth-pair
504 * has been deprecated:
505 * git tag --contains f2037e6d3445fc400
506 * v2.8 */
507 external_not_veth = true;
508 } else {
509 external_not_veth = false;
510 }
511
512 if (netdev->name[0] != '\0') {
513 retlen = strlcpy(eth, netdev->name, sizeof(eth));
514 if (retlen >= sizeof(eth))
515 return log_error_errno(-E2BIG, E2BIG, "Failed to append veth device name");
516 } else {
517 ret = strnprintf(eth, sizeof(eth), "eth%d", netnr);
518 if (ret < 0)
519 return log_error_errno(-E2BIG, E2BIG, "Failed to append veth device name");
520 }
521
522 switch (netdev->type) {
523 case LXC_NET_VETH:
524 veth = netdev->priv.veth_attr.pair;
525 if (veth[0] == '\0')
526 veth = netdev->priv.veth_attr.veth1;
527
528 if (netdev->link[0] != '\0') {
529 if (external_not_veth)
530 ret = strnprintf(buf, sizeof(buf), "veth[%s]:%s@%s", eth, veth, netdev->link);
531 else
532 ret = strnprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, netdev->link);
533 } else {
534 if (external_not_veth)
535 ret = strnprintf(buf, sizeof(buf), "veth[%s]:%s", eth, veth);
536 else
537 ret = strnprintf(buf, sizeof(buf), "%s=%s", eth, veth);
538 }
539 if (ret < 0)
540 return log_error_errno(-EIO, EIO, "Failed to append veth device name");
541
542 TRACE("Added veth device entry %s", buf);
543 break;
544 case LXC_NET_MACVLAN:
545 if (netdev->link[0] == '\0')
546 return log_error_errno(-EINVAL, EINVAL, "Failed to find host interface for macvlan %s", netdev->name);
547
548 ret = strnprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, netdev->link);
549 if (ret < 0)
550 return log_error_errno(-EIO, EIO, "Failed to add macvlan entry");
551
552 TRACE("Added macvlan device entry %s", buf);
553
554 break;
555 case LXC_NET_PHYS:
556 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NETDEV) < 0)
557 return syserror_set(-EOPNOTSUPP, "Restoring physical network devices not supported");
558
559 if (is_empty_string(netdev->link))
560 return syserror_set(-EINVAL, "Specifying link is required");
561
562 ret = strnprintf(buf, sizeof(buf), "netdev[%s]:%s", eth, netdev->link);
563 if (ret < 0)
564 return syserror_set(-EIO, "Failed to append phys device name");
565
566 TRACE("Added phys device entry %s", buf);
567 break;
568 case LXC_NET_NONE:
569 __fallthrough;
570 case LXC_NET_EMPTY:
571 break;
572 default:
573 /* we have screened for this earlier... */
574 return log_error_errno(-EINVAL, EINVAL, "Unsupported network type %d", netdev->type);
575 }
576
577 if (external_not_veth)
578 DECLARE_ARG("--external");
579 else
580 DECLARE_ARG("--veth-pair");
581 DECLARE_ARG(buf);
582 netnr++;
583 }
584
585 }
586
587 args->argv[args->argc] = NULL;
588
589 if (lxc_log_trace()) {
590 buf[0] = 0;
591 for (int i = 0, pos = 0; i < args->argc && args->argv[i]; i++) {
592 ret = strnprintf(buf + pos, sizeof(buf) - pos, "%s ", args->argv[i]);
593 if (ret < 0)
594 return log_error_errno(-EIO, EIO, "Failed to reorder entries");
595 else
596 pos += ret;
597 }
598
599 TRACE("Using command line %s", buf);
600 }
601
602 /* before criu inits its log, it sometimes prints things to stdout/err;
603 * let's be sure we capture that.
604 */
605 if (dup2(opts->pipefd, STDOUT_FILENO) < 0)
606 return log_error_errno(-errno, errno, "Failed to duplicate stdout");
607
608 if (dup2(opts->pipefd, STDERR_FILENO) < 0)
609 return log_error_errno(-errno, errno, "Failed to duplicate stderr");
610
611 close(opts->pipefd);
612
613 #undef DECLARE_ARG
614 execv(args->argv[0], args->argv);
615 return -ENOEXEC;
616 }
617
618 /*
619 * Function to check if the checks activated in 'features_to_check' are
620 * available with the current architecture/kernel/criu combination.
621 *
622 * Parameter features_to_check is a bit mask of all features that should be
623 * checked (see feature check defines in lxc/lxccontainer.h).
624 *
625 * If the return value is true, all requested features are supported. If
626 * the return value is false the features_to_check parameter is updated
627 * to reflect which features are available. '0' means no feature but
628 * also that something went totally wrong.
629 *
630 * Some of the code flow of criu_version_ok() is duplicated and maybe it
631 * is a good candidate for refactoring.
632 */
633 bool __criu_check_feature(uint64_t *features_to_check)
634 {
635 pid_t pid;
636 uint64_t current_bit = 0;
637 int ret;
638 uint64_t features = *features_to_check;
639 /* Feature checking is currently always like
640 * criu check --feature <feature-name>
641 */
642 char *args[] = { "criu", "check", "--feature", NULL, NULL };
643
644 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
645 /* There are feature bits activated we do not understand.
646 * Refusing to answer at all */
647 *features_to_check = 0;
648 return false;
649 }
650
651 while (current_bit < (sizeof(uint64_t) * 8 - 1)) {
652 /* only test requested features */
653 if (!(features & (1ULL << current_bit))) {
654 /* skip this */
655 current_bit++;
656 continue;
657 }
658
659 pid = fork();
660 if (pid < 0) {
661 SYSERROR("fork() failed");
662 *features_to_check = 0;
663 return false;
664 }
665
666 if (pid == 0) {
667 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
668 /* This is needed for pre-dump support, which
669 * enables pre-copy migration. */
670 args[3] = "mem_dirty_track";
671 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
672 /* CRIU has two checks for userfaultfd support.
673 *
674 * The simpler check is only for 'uffd'. If the
675 * kernel supports userfaultfd without noncoop
676 * then only process can be lazily restored
677 * which do not fork. With 'uffd-noncoop'
678 * it is also possible to lazily restore processes
679 * which do fork. For a container runtime like
680 * LXC checking only for 'uffd' makes not much sense. */
681 args[3] = "uffd-noncoop";
682 else
683 _exit(EXIT_FAILURE);
684
685 null_stdfds();
686
687 execvp("criu", args);
688 SYSERROR("Failed to exec \"criu\"");
689 _exit(EXIT_FAILURE);
690 }
691
692 ret = wait_for_pid(pid);
693
694 if (ret == -1) {
695 /* It is not known why CRIU failed. Either
696 * CRIU is not available, the feature check
697 * does not exist or the feature is not
698 * supported. */
699 INFO("feature not supported");
700 /* Clear not supported feature bit */
701 features &= ~(1ULL << current_bit);
702 }
703
704 current_bit++;
705 /* no more checks requested; exit check loop */
706 if (!(features & ~((1ULL << current_bit)-1)))
707 break;
708 }
709 if (features != *features_to_check) {
710 *features_to_check = features;
711 return false;
712 }
713 return true;
714 }
715
716 /*
717 * Check to see if the criu version is recent enough for all the features we
718 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
719 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
720 * things potentially before a version is released with a particular feature.
721 *
722 * The intent is that when criu development slows down, we can drop this, but
723 * for now we shouldn't attempt to c/r with versions that we know won't work.
724 *
725 * Note: If version != NULL criu_version() stores the detected criu version in
726 * version. Allocates memory for version which must be freed by caller.
727 */
728 static bool criu_version_ok(char **version)
729 {
730 int pipes[2];
731 pid_t pid;
732
733 if (pipe(pipes) < 0) {
734 SYSERROR("pipe() failed");
735 return false;
736 }
737
738 pid = fork();
739 if (pid < 0) {
740 SYSERROR("fork() failed");
741 return false;
742 }
743
744 if (pid == 0) {
745 char *args[] = { "criu", "--version", NULL };
746 char *path;
747 close(pipes[0]);
748
749 close(STDERR_FILENO);
750 if (dup2(pipes[1], STDOUT_FILENO) < 0)
751 _exit(EXIT_FAILURE);
752
753 path = on_path("criu", NULL);
754 if (!path)
755 _exit(EXIT_FAILURE);
756
757 execv(path, args);
758 _exit(EXIT_FAILURE);
759 } else {
760 FILE *f;
761 char *tmp;
762 int patch;
763
764 close(pipes[1]);
765 if (wait_for_pid(pid) < 0) {
766 close(pipes[0]);
767 SYSERROR("execing criu failed, is it installed?");
768 return false;
769 }
770
771 f = fdopen(pipes[0], "re");
772 if (!f) {
773 close(pipes[0]);
774 return false;
775 }
776
777 tmp = malloc(1024);
778 if (!tmp) {
779 fclose(f);
780 return false;
781 }
782
783 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
784 goto version_error;
785
786 if (fgetc(f) != '\n')
787 goto version_error;
788
789 if (strcmp(tmp, CRIU_VERSION) >= 0)
790 goto version_match;
791
792 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
793 goto version_error;
794
795 if (fgetc(f) != '-')
796 goto version_error;
797
798 if (fscanf(f, "%d", &patch) != 1)
799 goto version_error;
800
801 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
802 goto version_error;
803
804 if (patch < CRIU_GITID_PATCHLEVEL)
805 goto version_error;
806
807 version_match:
808 fclose(f);
809 if (!version)
810 free(tmp);
811 else
812 *version = tmp;
813 return true;
814
815 version_error:
816 fclose(f);
817 free(tmp);
818 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
819 return false;
820 }
821 }
822
823 /* Check and make sure the container has a configuration that we know CRIU can
824 * dump. */
825 static bool criu_ok(struct lxc_container *c, char **criu_version)
826 {
827 struct lxc_netdev *netdev;
828
829 if (geteuid()) {
830 ERROR("Must be root to checkpoint");
831 return false;
832 }
833
834 if (!criu_version_ok(criu_version))
835 return false;
836
837 /* We only know how to restore containers with veth networks. */
838 list_for_each_entry(netdev, &c->lxc_conf->netdevs, head) {
839 switch(netdev->type) {
840 case LXC_NET_VETH:
841 case LXC_NET_NONE:
842 case LXC_NET_EMPTY:
843 case LXC_NET_PHYS:
844 case LXC_NET_MACVLAN:
845 break;
846 default:
847 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(netdev->type), netdev->name);
848 if (criu_version) {
849 free(*criu_version);
850 *criu_version = NULL;
851 }
852 return false;
853 }
854 }
855
856 return true;
857 }
858
859 static bool restore_net_info(struct lxc_container *c)
860 {
861 int ret;
862 bool has_error = true;
863 struct lxc_netdev *netdev;
864
865 if (container_mem_lock(c))
866 return false;
867
868 list_for_each_entry(netdev, &c->lxc_conf->netdevs, head) {
869 char template[IFNAMSIZ];
870
871 if (netdev->type != LXC_NET_VETH)
872 continue;
873
874 ret = strnprintf(template, sizeof(template), "vethXXXXXX");
875 if (ret < 0)
876 goto out_unlock;
877
878 if (netdev->priv.veth_attr.pair[0] == '\0' &&
879 netdev->priv.veth_attr.veth1[0] == '\0') {
880 if (!lxc_ifname_alnum_case_sensitive(template))
881 goto out_unlock;
882
883 (void)strlcpy(netdev->priv.veth_attr.veth1, template, IFNAMSIZ);
884 }
885 }
886
887 has_error = false;
888
889 out_unlock:
890 container_mem_unlock(c);
891 return !has_error;
892 }
893
894 /* do_restore never returns, the calling process is used as the monitor process.
895 * do_restore calls _exit() if it fails.
896 */
897 static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
898 {
899 int fd, ret;
900 pid_t pid;
901 struct lxc_handler *handler;
902 int status = 0;
903 int pipes[2] = {-1, -1};
904 struct cgroup_ops *cgroup_ops;
905
906 /* Try to detach from the current controlling tty if it exists.
907 * Otherwise, lxc_init (via lxc_console) will attach the container's
908 * console output to the current tty, which is probably not what any
909 * library user wants, and if they do, they can just manually configure
910 * it :)
911 */
912 fd = open("/dev/tty", O_RDWR);
913 if (fd >= 0) {
914 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
915 SYSERROR("couldn't detach from tty");
916 close(fd);
917 }
918
919 handler = lxc_init_handler(NULL, c->name, c->lxc_conf, c->config_path, false);
920 if (!handler)
921 goto out;
922
923 if (lxc_init(c->name, handler) < 0)
924 goto out;
925 cgroup_ops = handler->cgroup_ops;
926
927 if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
928 ERROR("Failed to create monitor cgroup");
929 goto out_fini_handler;
930 }
931
932 if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
933 ERROR("Failed to enter monitor cgroup");
934 goto out_fini_handler;
935 }
936
937 if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
938 ERROR("Failed to delegate controllers to monitor cgroup");
939 goto out_fini_handler;
940 }
941
942 if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
943 ERROR("Failed creating cgroups");
944 goto out_fini_handler;
945 }
946
947 if (!restore_net_info(c)) {
948 ERROR("failed restoring network info");
949 goto out_fini_handler;
950 }
951
952 ret = resolve_clone_flags(handler);
953 if (ret < 0) {
954 SYSERROR("Unsupported clone flag specified");
955 goto out_fini_handler;
956 }
957
958 if (pipe2(pipes, O_CLOEXEC) < 0) {
959 SYSERROR("pipe() failed");
960 goto out_fini_handler;
961 }
962
963 pid = fork();
964 if (pid < 0)
965 goto out_fini_handler;
966
967 if (pid == 0) {
968 struct criu_opts os;
969 struct lxc_rootfs *rootfs;
970 int flags;
971
972 close(status_pipe);
973 status_pipe = -1;
974
975 close(pipes[0]);
976 pipes[0] = -1;
977
978 if (unshare(CLONE_NEWNS))
979 goto out_fini_handler;
980
981 ret = lxc_storage_prepare(c->lxc_conf);
982 if (ret)
983 goto out_fini_handler;
984
985 /* CRIU needs the lxc root bind mounted so that it is the root of some
986 * mount. */
987 rootfs = &c->lxc_conf->rootfs;
988
989 if (rootfs_is_blockdev(c->lxc_conf)) {
990 if (lxc_setup_rootfs_prepare_root(c->lxc_conf, c->name,
991 c->config_path) < 0)
992 goto out_fini_handler;
993 } else {
994 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
995 goto out_fini_handler;
996
997 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
998 SYSERROR("remount / to private failed");
999 goto out_fini_handler;
1000 }
1001
1002 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
1003 (void)rmdir(rootfs->mount);
1004 goto out_fini_handler;
1005 }
1006 }
1007
1008 os.pipefd = pipes[1];
1009 os.action = "restore";
1010 os.user = opts;
1011 os.c = c;
1012 os.console_fd = c->lxc_conf->console.pty;
1013 os.criu_version = criu_version;
1014 os.handler = handler;
1015
1016 if (os.console_fd >= 0) {
1017 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1018 * via --inherit-fd, so we don't want it to close.
1019 */
1020 flags = fcntl(os.console_fd, F_GETFD);
1021 if (flags < 0) {
1022 SYSERROR("F_GETFD failed: %d", os.console_fd);
1023 goto out_fini_handler;
1024 }
1025
1026 flags &= ~FD_CLOEXEC;
1027
1028 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
1029 SYSERROR("F_SETFD failed");
1030 goto out_fini_handler;
1031 }
1032 }
1033 os.console_name = c->lxc_conf->console.name;
1034
1035 /* exec_criu() returning is an error */
1036 ret = exec_criu(handler->cgroup_ops, c->lxc_conf, &os);
1037 if (ret)
1038 SYSERROR("Failed to execute criu");
1039 umount(rootfs->mount);
1040 (void)rmdir(rootfs->mount);
1041 goto out_fini_handler;
1042 } else {
1043 char title[2048];
1044
1045 close(pipes[1]);
1046 pipes[1] = -1;
1047
1048 pid_t w = waitpid(pid, &status, 0);
1049 if (w == -1) {
1050 SYSERROR("waitpid");
1051 goto out_fini_handler;
1052 }
1053
1054 if (WIFEXITED(status)) {
1055 char buf[4096];
1056
1057 if (WEXITSTATUS(status)) {
1058 int n;
1059
1060 n = lxc_read_nointr(pipes[0], buf, sizeof(buf));
1061 if (n < 0) {
1062 SYSERROR("failed reading from criu stderr");
1063 goto out_fini_handler;
1064 }
1065
1066 if (n == sizeof(buf))
1067 n--;
1068 buf[n] = 0;
1069
1070 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
1071 goto out_fini_handler;
1072 } else {
1073 ret = strnprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
1074 if (ret < 0) {
1075 ERROR("strnprintf'd too many characters: %d", ret);
1076 goto out_fini_handler;
1077 }
1078
1079 FILE *f = fopen(buf, "re");
1080 if (!f) {
1081 SYSERROR("couldn't read restore's children file %s", buf);
1082 goto out_fini_handler;
1083 }
1084
1085 ret = fscanf(f, "%d", (int*) &handler->pid);
1086 fclose(f);
1087 if (ret != 1) {
1088 ERROR("reading restore pid failed");
1089 goto out_fini_handler;
1090 }
1091
1092 if (lxc_set_state(c->name, handler, RUNNING)) {
1093 ERROR("error setting running state after restore");
1094 goto out_fini_handler;
1095 }
1096 }
1097 } else {
1098 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
1099 goto out_fini_handler;
1100 }
1101
1102 close(pipes[0]);
1103
1104 ret = lxc_write_nointr(status_pipe, &status, sizeof(status));
1105 close(status_pipe);
1106 status_pipe = -1;
1107
1108 if (sizeof(status) != ret) {
1109 SYSERROR("failed to write all of status");
1110 goto out_fini_handler;
1111 }
1112
1113 /*
1114 * See comment in lxcapi_start; we don't care if these
1115 * fail because it's just a beauty thing. We just
1116 * assign the return here to silence potential.
1117 */
1118 ret = strnprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
1119 if (ret < 0)
1120 INFO("Setting truncated process name");
1121
1122 ret = setproctitle(title);
1123 if (ret < 0)
1124 INFO("Failed to set process name");
1125
1126 ret = lxc_poll(c->name, handler);
1127 if (ret)
1128 lxc_abort(handler);
1129 lxc_end(handler);
1130 _exit(ret);
1131 }
1132
1133 out_fini_handler:
1134 if (pipes[0] >= 0)
1135 close(pipes[0]);
1136 if (pipes[1] >= 0)
1137 close(pipes[1]);
1138
1139 lxc_end(handler);
1140
1141 out:
1142 if (status_pipe >= 0) {
1143 /* ensure getting here was a failure, e.g. if we failed to
1144 * parse the child pid or something, even after a successful
1145 * restore
1146 */
1147 if (!status)
1148 status = 1;
1149
1150 if (lxc_write_nointr(status_pipe, &status, sizeof(status)) != sizeof(status))
1151 SYSERROR("writing status failed");
1152 close(status_pipe);
1153 }
1154
1155 _exit(EXIT_FAILURE);
1156 }
1157
1158 static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1159 {
1160 FILE *f;
1161 char path[PATH_MAX];
1162 int ret;
1163 struct stat sb;
1164
1165 if (c->lxc_conf->console.path && strequal(c->lxc_conf->console.path, "none")) {
1166 tty_id[0] = 0;
1167 return 0;
1168 }
1169
1170 ret = strnprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1171 if (ret < 0) {
1172 ERROR("strnprintf'd too many characters: %d", ret);
1173 return -1;
1174 }
1175
1176 ret = stat(path, &sb);
1177 if (ret < 0) {
1178 SYSERROR("stat of %s failed", path);
1179 return -1;
1180 }
1181
1182 ret = strnprintf(path, sizeof(path), "%s/tty.info", directory);
1183 if (ret < 0) {
1184 ERROR("strnprintf'd too many characters: %d", ret);
1185 return -1;
1186 }
1187
1188 ret = strnprintf(tty_id, len, "tty[%llx:%llx]",
1189 (long long unsigned) sb.st_rdev,
1190 (long long unsigned) sb.st_dev);
1191 if (ret < 0) {
1192 ERROR("strnprintf'd too many characters: %d", ret);
1193 return -1;
1194 }
1195
1196 f = fopen(path, "we");
1197 if (!f) {
1198 SYSERROR("failed to open %s", path);
1199 return -1;
1200 }
1201
1202 ret = fprintf(f, "%s", tty_id);
1203 fclose(f);
1204 if (ret < 0)
1205 SYSERROR("failed to write to %s", path);
1206 return ret;
1207 }
1208
1209 /* do one of either predump or a regular dump */
1210 static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
1211 {
1212 int ret;
1213 pid_t pid;
1214 int criuout[2];
1215 char *criu_version = NULL;
1216
1217 if (!criu_ok(c, &criu_version))
1218 return false;
1219
1220 ret = pipe(criuout);
1221 if (ret < 0) {
1222 SYSERROR("pipe() failed");
1223 free(criu_version);
1224 return false;
1225 }
1226
1227 if (lxc_mkdir_p(opts->directory, 0700) < 0)
1228 goto fail;
1229
1230 pid = fork();
1231 if (pid < 0) {
1232 SYSERROR("fork failed");
1233 goto fail;
1234 }
1235
1236 if (pid == 0) {
1237 struct criu_opts os;
1238 struct cgroup_ops *cgroup_ops;
1239
1240 close(criuout[0]);
1241
1242 cgroup_ops = cgroup_init(c->lxc_conf);
1243 if (!cgroup_ops) {
1244 ERROR("failed to cgroup_init()");
1245 _exit(EXIT_FAILURE);
1246 }
1247
1248 os.pipefd = criuout[1];
1249 os.action = mode;
1250 os.user = opts;
1251 os.c = c;
1252 os.console_name = c->lxc_conf->console.path;
1253 os.criu_version = criu_version;
1254 os.handler = NULL;
1255
1256 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1257 if (ret < 0) {
1258 free(criu_version);
1259 _exit(EXIT_FAILURE);
1260 }
1261
1262 /* exec_criu() returning is an error */
1263 ret = exec_criu(cgroup_ops, c->lxc_conf, &os);
1264 if (ret)
1265 SYSERROR("Failed to execute criu");
1266 free(criu_version);
1267 _exit(EXIT_FAILURE);
1268 } else {
1269 int status;
1270 ssize_t n;
1271 char buf[4096];
1272
1273 close(criuout[1]);
1274
1275 pid_t w = waitpid(pid, &status, 0);
1276 if (w == -1) {
1277 SYSERROR("waitpid");
1278 close(criuout[0]);
1279 free(criu_version);
1280 return false;
1281 }
1282
1283 n = lxc_read_nointr(criuout[0], buf, sizeof(buf));
1284 close(criuout[0]);
1285 if (n < 0) {
1286 SYSERROR("read");
1287 n = 0;
1288 }
1289
1290 if (n == sizeof(buf))
1291 buf[n-1] = 0;
1292 else
1293 buf[n] = 0;
1294
1295 if (WIFEXITED(status)) {
1296 if (WEXITSTATUS(status)) {
1297 ERROR("dump failed with %d", WEXITSTATUS(status));
1298 ret = false;
1299 } else {
1300 ret = true;
1301 }
1302 } else if (WIFSIGNALED(status)) {
1303 ERROR("dump signaled with %d", WTERMSIG(status));
1304 ret = false;
1305 } else {
1306 ERROR("unknown dump exit %d", status);
1307 ret = false;
1308 }
1309
1310 if (!ret)
1311 ERROR("criu output: %s", buf);
1312
1313 free(criu_version);
1314 return ret;
1315 }
1316 fail:
1317 close(criuout[0]);
1318 close(criuout[1]);
1319 (void)rmdir(opts->directory);
1320 free(criu_version);
1321 return false;
1322 }
1323
/*
 * Run a criu "pre-dump" pass on container @c using the migrate options in
 * @opts. Returns whatever do_dump() reports for that pass.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool predumped;

	predumped = do_dump(c, "pre-dump", opts);

	return predumped;
}
1328
1329 bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
1330 {
1331 char path[PATH_MAX];
1332 int ret;
1333
1334 ret = strnprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
1335 if (ret < 0)
1336 return false;
1337
1338 if (access(path, F_OK) == 0) {
1339 ERROR("please use a fresh directory for the dump directory");
1340 return false;
1341 }
1342
1343 return do_dump(c, "dump", opts);
1344 }
1345
/*
 * Restore container @c with criu using the migrate options in @opts.
 *
 * Requires an effective uid of 0. A child is forked to run do_restore(); it
 * reports an int status back over a pipe. The restore counts as successful
 * only if a complete status was read and it describes a normal exit with
 * code 0. In that case the child is not waited for. In every other case
 * after a successful fork the child is reaped with wait_for_pid().
 *
 * Returns true on success, false otherwise.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	int status_fds[2];
	int restore_status, got;
	pid_t child;
	char *version = NULL;

	if (geteuid() != 0) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(status_fds) != 0) {
		ERROR("failed to create pipe");
		return false;
	}

	if (!criu_ok(c, &version))
		goto close_pipe;

	child = fork();
	if (child < 0) {
		free(version);
		goto close_pipe;
	}

	if (child == 0) {
		close(status_fds[0]);
		/* this never returns */
		do_restore(c, status_fds[1], opts, version);
	}

	/* Parent: keep only the read end and wait for the child's report. */
	close(status_fds[1]);
	free(version);

	got = lxc_read_nointr(status_fds[0], &restore_status, sizeof(restore_status));
	close(status_fds[0]);

	if (got != sizeof(restore_status)) {
		ERROR("reading status from pipe failed");
	} else if (WIFEXITED(restore_status) && WEXITSTATUS(restore_status) == 0) {
		/* The child becomes the monitor process, so it is not waited for. */
		return true;
	}

	/*
	 * The criu process was killed or exited nonzero (or no status arrived):
	 * wait() for the handler, since the restore process died.
	 */
	if (wait_for_pid(child))
		ERROR("restore process died");
	return false;

close_pipe:
	close(status_fds[0]);
	close(status_fds[1]);
	return false;
}