]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/criu.c
build: add src/include to build and simplify header inclusions
[mirror_lxc.git] / src / lxc / criu.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #include <inttypes.h>
7 #include <linux/limits.h>
8 #include <sched.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <sys/mount.h>
13 #include <sys/types.h>
14 #include <sys/wait.h>
15 #include <unistd.h>
16
17 #include "cgroup.h"
18 #include "commands.h"
19 #include "conf.h"
20 #include "config.h"
21 #include "criu.h"
22 #include "log.h"
23 #include "lxc.h"
24 #include "lxclock.h"
25 #include "memory_utils.h"
26 #include "network.h"
27 #include "storage.h"
28 #include "syscall_wrappers.h"
29 #include "utils.h"
30
31 #if IS_BIONIC
32 #include "lxcmntent.h"
33 #else
34 #include <mntent.h>
35 #endif
36
37 #ifndef HAVE_STRLCPY
38 #include "strlcpy.h"
39 #endif
40
/* Minimum released criu version ("Version: ..." line) accepted by
 * criu_version_ok().
 */
#define CRIU_VERSION "2.0"

/* Minimum git build ("GitID: v<version>-<patchlevel>" line) accepted by
 * criu_version_ok() when the released version is too old.
 */
#define CRIU_GITID_VERSION "2.0"
#define CRIU_GITID_PATCHLEVEL 0

/* criu version from which exec_criu() passes --skip-in-flight on dump. */
#define CRIU_IN_FLIGHT_SUPPORT "2.4"
/* criu version from which exec_criu() passes network devices via --external
 * instead of --veth-pair on restore.
 */
#define CRIU_EXTERNAL_NOT_VETH "2.8"

/* NOTE(review): presumably registers the "criu" log category used by the
 * ERROR()/INFO()/TRACE() macros in this file; the macro is defined elsewhere.
 */
lxc_log_define(criu, lxc);
50
/* Everything exec_criu() needs to assemble and run one criu invocation. */
struct criu_opts {
	/* the thing to hook to stdout and stderr for logging */
	int pipefd;

	/* The type of criu invocation; exec_criu() accepts "dump", "pre-dump"
	 * and "restore".
	 */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means no external tty is passed to criu.
	 */
	char tty_id[32];

	/* restore: the handler of the container being restored */
	struct lxc_handler *handler;
	/* restore: fd handed to criu via --inherit-fd; negative if there is none */
	int console_fd;
	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pty
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
80
81 static int load_tty_major_minor(char *directory, char *output, int len)
82 {
83 char path[PATH_MAX];
84 ssize_t ret;
85
86 ret = strnprintf(path, sizeof(path), "%s/tty.info", directory);
87 if (ret < 0)
88 return ret_errno(EIO);
89
90 ret = lxc_read_from_file(path, output, len);
91 if (ret < 0) {
92 /*
93 * This means we're coming from a liblxc which didn't export
94 * the tty info. In this case they had to have lxc.console.path
95 * = * none, so there's no problem restoring.
96 */
97 if (errno == ENOENT)
98 return 0;
99
100 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
101 }
102
103 return 0;
104 }
105
/*
 * Compare two dotted version strings of the form "major[.minor[.patch]]".
 *
 * Components that are missing from a string count as -1, so "2.8" sorts
 * before "2.8.0".
 *
 * Returns 1 if v1 is newer than v2, 0 if both are equal and -1 if v1 is older
 * than v2. -1 is also returned when either string does not start with a
 * number.
 */
static int cmp_version(const char *v1, const char *v2)
{
	int lhs[3] = { -1, -1, -1 };
	int rhs[3] = { -1, -1, -1 };

	if (sscanf(v1, "%d.%d.%d", &lhs[0], &lhs[1], &lhs[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &rhs[0], &rhs[1], &rhs[2]) < 1)
		return -1;

	/* Walk major, minor, patch; the first differing component decides. */
	for (size_t i = 0; i < 3; i++) {
		if (lhs[i] > rhs[i])
			return 1;

		if (lhs[i] < rhs[i])
			return -1;
	}

	return 0;
}
146
/* Argument vector handed to execv() by exec_criu(). */
struct criu_exec_args {
	/* number of entries currently stored in argv */
	int argc;
	/* the arguments themselves; put_criu_exec_args() free()s each entry */
	char *argv[];
};
151
152 static void put_criu_exec_args(struct criu_exec_args *args)
153 {
154 if (args) {
155 for (int i = 0; i < args->argc; i++)
156 free_disarm(args->argv[i]);
157 free_disarm(args);
158 }
159 }
160
161 define_cleanup_function(struct criu_exec_args *, put_criu_exec_args);
162
163 static int exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
164 struct criu_opts *opts)
165 {
166 call_cleaner(put_criu_exec_args) struct criu_exec_args *args = NULL;
167 __do_fclose FILE *f_mnt = NULL;
168 char log[PATH_MAX];
169 int static_args = 23, ret;
170 int netnr = 0;
171 struct mntent mntent;
172 struct lxc_netdev *netdev;
173 struct string_entry *strentry;
174
175 char buf[4096], ttys[32];
176
177 /* If we are currently in a cgroup /foo/bar, and the container is in a
178 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
179 * container has an open fd that points to one of the cgroup files
180 * (systemd always opens its "root" cgroup). So, let's escape to the
181 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
182 * see all cgroups.
183 */
184 if (!cgroup_ops->criu_escape(cgroup_ops, conf))
185 return log_error_errno(-ENOENT, ENOENT, "Failed to escape to root cgroup");
186
187 /* The command line always looks like:
188 * criu $(action) --tcp-established --file-locks --link-remap \
189 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
190 * -o $(directory)/$(action).log --ext-mount-map auto
191 * --enable-external-sharing --enable-external-masters
192 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
193 * +1 for final NULL */
194
195 if (strequal(opts->action, "dump") || strequal(opts->action, "pre-dump")) {
196 /* -t pid --freeze-cgroup /lxc/ct */
197 static_args += 4;
198
199 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
200 if (opts->user->predump_dir)
201 static_args += 2;
202
203 /* --page-server --address <address> --port <port> */
204 if (opts->user->pageserver_address && opts->user->pageserver_port)
205 static_args += 5;
206
207 /* --leave-running (only for final dump) */
208 if (strequal(opts->action, "dump") && !opts->user->stop)
209 static_args++;
210
211 /* --external tty[88,4] */
212 if (opts->tty_id[0])
213 static_args += 2;
214
215 /* --force-irmap */
216 if (!opts->user->preserves_inodes)
217 static_args++;
218
219 /* --ghost-limit 1024 */
220 if (opts->user->ghost_limit)
221 static_args += 2;
222 } else if (strequal(opts->action, "restore")) {
223 /* --root $(lxc_mount_point) --restore-detached
224 * --restore-sibling
225 * --lsm-profile apparmor:whatever
226 */
227 static_args += 6;
228
229 ttys[0] = 0;
230 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
231 return log_error_errno(-EINVAL, EINVAL, "Failed to load tty information");
232
233 /* --inherit-fd fd[%d]:tty[%s] */
234 if (ttys[0])
235 static_args += 2;
236
237 static_args += list_len(netdev, &opts->c->lxc_conf->netdevs, head) * 2;
238 } else {
239 return log_error_errno(-EINVAL, EINVAL, "Invalid criu operation specified");
240 }
241
242 if (cgroup_ops->criu_num_hierarchies(cgroup_ops) > 0)
243 static_args += 2 * cgroup_ops->criu_num_hierarchies(cgroup_ops);
244
245 if (opts->user->verbose)
246 static_args++;
247
248 if (opts->user->action_script)
249 static_args += 2;
250
251 static_args += 2 * list_len(strentry, &opts->c->lxc_conf->mount_entries, head);
252
253 ret = strnprintf(log, sizeof(log), "%s/%s.log", opts->user->directory, opts->action);
254 if (ret < 0)
255 return ret_errno(EIO);
256
257 args = zalloc(sizeof(struct criu_exec_args) + (static_args * sizeof(char **)));
258 if (!args)
259 return log_error_errno(-ENOMEM, ENOMEM, "Failed to allocate static arguments");
260
261 #define DECLARE_ARG(arg) \
262 do { \
263 if (arg == NULL) \
264 return log_error_errno(-EINVAL, EINVAL, \
265 "Got NULL argument for criu"); \
266 args->argv[(args->argc)++] = strdup(arg); \
267 if (!args->argv[args->argc - 1]) \
268 return log_error_errno(-ENOMEM, ENOMEM, \
269 "Failed to duplicate argumen %s", arg); \
270 } while (0)
271
272 args->argv[(args->argc)++] = on_path("criu", NULL);
273 if (!args->argv[args->argc - 1])
274 return log_error_errno(-ENOENT, ENOENT, "Failed to find criu binary");
275
276 DECLARE_ARG(opts->action);
277 DECLARE_ARG("--tcp-established");
278 DECLARE_ARG("--file-locks");
279 DECLARE_ARG("--link-remap");
280 DECLARE_ARG("--manage-cgroups=full");
281 DECLARE_ARG("--ext-mount-map");
282 DECLARE_ARG("auto");
283 DECLARE_ARG("--enable-external-sharing");
284 DECLARE_ARG("--enable-external-masters");
285 DECLARE_ARG("--enable-fs");
286 DECLARE_ARG("hugetlbfs");
287 DECLARE_ARG("--enable-fs");
288 DECLARE_ARG("tracefs");
289 DECLARE_ARG("-D");
290 DECLARE_ARG(opts->user->directory);
291 DECLARE_ARG("-o");
292 DECLARE_ARG(log);
293
294 for (int i = 0; i < cgroup_ops->criu_num_hierarchies(cgroup_ops); i++) {
295 __do_free char *cgroup_base_path = NULL, *controllers;
296 char **controllers_list = NULL;
297 char *tmp;
298
299 if (!cgroup_ops->criu_get_hierarchies(cgroup_ops, i, &controllers_list))
300 return log_error_errno(-ENOENT, ENOENT, "Failed to retrieve cgroup hierarchies %d", i);
301
302 /*
303 * If we are in a dump, we have to ask the monitor process what
304 * the right cgroup is. if this is a restore, we can just use
305 * the handler the restore task created.
306 */
307 if (strequal(opts->action, "dump") || strequal(opts->action, "pre-dump")) {
308 cgroup_base_path = lxc_cmd_get_limit_cgroup_path(opts->c->name, opts->c->config_path, controllers_list[0]);
309 if (!cgroup_base_path)
310 return log_error_errno(-ENOENT, ENOENT, "Failed to retrieve limit cgroup path for %s", controllers_list[0] ?: "(null)");
311 } else {
312 const char *p;
313
314 p = cgroup_ops->get_limit_cgroup(cgroup_ops, controllers_list[0]);
315 if (!p)
316 return log_error_errno(-ENOENT, ENOENT, "Failed to retrieve limit cgroup path for %s", controllers_list[0] ?: "(null)");
317
318 cgroup_base_path = strdup(p);
319 if (!cgroup_base_path)
320 return log_error_errno(-ENOMEM, ENOMEM, "Failed to duplicate limit cgroup path");
321 }
322
323 tmp = path_simplify(cgroup_base_path);
324 if (!tmp)
325 return log_error_errno(-ENOMEM, ENOMEM, "Failed to remove extraneous slashes from \"%s\"", tmp);
326 free_move_ptr(cgroup_base_path, tmp);
327
328 if (controllers_list[0]) {
329 controllers = lxc_string_join(",", (const char **)controllers_list, false);
330 if (!controllers)
331 return log_error_errno(-ENOMEM, ENOMEM, "Failed to join controllers");
332
333 ret = sprintf(buf, "%s:%s", controllers, cgroup_base_path);
334 } else {
335 WARN("No cgroup controllers configured in container's cgroup %s", cgroup_base_path);
336 ret = sprintf(buf, "%s", cgroup_base_path);
337 }
338 if (ret < 0 || ret >= sizeof(buf))
339 return log_error_errno(-EIO, EIO, "sprintf of cgroup root arg failed");
340
341 DECLARE_ARG("--cgroup-root");
342 DECLARE_ARG(buf);
343 }
344
345 if (opts->user->verbose)
346 DECLARE_ARG("-v4");
347
348 if (opts->user->action_script) {
349 DECLARE_ARG("--action-script");
350 DECLARE_ARG(opts->user->action_script);
351 }
352
353 f_mnt = make_anonymous_mount_file(&opts->c->lxc_conf->mount_entries,
354 opts->c->lxc_conf->lsm_aa_allow_nesting);
355 if (!f_mnt)
356 return log_error_errno(-ENOENT, ENOENT, "Failed to create anonymous mount file");
357
358 while (getmntent_r(f_mnt, &mntent, buf, sizeof(buf))) {
359 __do_free char *mnt_options = NULL;
360 unsigned long flags = 0;
361 char arg[2 * PATH_MAX + 2];
362
363 if (parse_mntopts_legacy(mntent.mnt_opts, &flags, &mnt_options) < 0)
364 return log_error_errno(-EINVAL, EINVAL, "Failed to parse mount options");
365
366 /* only add --ext-mount-map for actual bind mounts */
367 if (!(flags & MS_BIND))
368 continue;
369
370 if (strequal(opts->action, "dump"))
371 ret = strnprintf(arg, sizeof(arg), "/%s:%s", mntent.mnt_dir, mntent.mnt_dir);
372 else
373 ret = strnprintf(arg, sizeof(arg), "%s:%s", mntent.mnt_dir, mntent.mnt_fsname);
374 if (ret < 0)
375 return log_error_errno(-EIO, EIO, "Failed to create mount entry");
376
377 DECLARE_ARG("--ext-mount-map");
378 DECLARE_ARG(arg);
379 }
380
381 if (strequal(opts->action, "dump") || strequal(opts->action, "pre-dump")) {
382 pid_t init_pid;
383 char init_pid_str[INTTYPE_TO_STRLEN(int)];
384 char *freezer_relative;
385
386 init_pid = opts->c->init_pid(opts->c);
387 if (init_pid < 0)
388 return log_error_errno(-ESRCH, ESRCH, "Failed to retrieve init pid of container");
389
390 ret = strnprintf(init_pid_str, sizeof(init_pid_str), "%d", init_pid);
391 if (ret < 0)
392 return log_error_errno(-EIO, EIO, "Failed to create entry for init pid of container");
393
394 DECLARE_ARG("-t");
395 DECLARE_ARG(init_pid_str);
396
397 freezer_relative = lxc_cmd_get_limit_cgroup_path(opts->c->name,
398 opts->c->config_path,
399 "freezer");
400 if (!freezer_relative)
401 return log_error_errno(-ENOENT, ENOENT, "Failed getting freezer path");
402
403 if (pure_unified_layout(cgroup_ops))
404 ret = strnprintf(log, sizeof(log), "/sys/fs/cgroup/%s", freezer_relative);
405 else
406 ret = strnprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
407 if (ret < 0)
408 return log_error_errno(-EIO, EIO, "Failed to freezer cgroup entry");
409
410 if (!opts->user->disable_skip_in_flight &&
411 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
412 DECLARE_ARG("--skip-in-flight");
413
414 DECLARE_ARG("--freeze-cgroup");
415 DECLARE_ARG(log);
416
417 if (opts->tty_id[0]) {
418 DECLARE_ARG("--ext-mount-map");
419 DECLARE_ARG("/dev/console:console");
420
421 DECLARE_ARG("--external");
422 DECLARE_ARG(opts->tty_id);
423 }
424
425 if (opts->user->predump_dir) {
426 DECLARE_ARG("--prev-images-dir");
427 DECLARE_ARG(opts->user->predump_dir);
428 DECLARE_ARG("--track-mem");
429 }
430
431 if (opts->user->pageserver_address && opts->user->pageserver_port) {
432 DECLARE_ARG("--page-server");
433 DECLARE_ARG("--address");
434 DECLARE_ARG(opts->user->pageserver_address);
435 DECLARE_ARG("--port");
436 DECLARE_ARG(opts->user->pageserver_port);
437 }
438
439 if (!opts->user->preserves_inodes)
440 DECLARE_ARG("--force-irmap");
441
442 if (opts->user->ghost_limit) {
443 char ghost_limit[32];
444
445 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
446 if (ret < 0 || ret >= sizeof(ghost_limit))
447 return log_error_errno(-EIO, EIO, "Failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
448
449 DECLARE_ARG("--ghost-limit");
450 DECLARE_ARG(ghost_limit);
451 }
452
453 /* only for final dump */
454 if (strequal(opts->action, "dump") && !opts->user->stop)
455 DECLARE_ARG("--leave-running");
456 } else if (strequal(opts->action, "restore")) {
457 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
458
459 DECLARE_ARG("--root");
460 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
461 DECLARE_ARG("--restore-detached");
462 DECLARE_ARG("--restore-sibling");
463
464 if (ttys[0]) {
465 if (opts->console_fd < 0)
466 return log_error_errno(-EINVAL, EINVAL, "lxc.console.path configured on source host but not target");
467
468 ret = strnprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
469 if (ret < 0)
470 return log_error_errno(-EIO, EIO, "Failed to create console entry");
471
472 DECLARE_ARG("--inherit-fd");
473 DECLARE_ARG(buf);
474 }
475 if (opts->console_name) {
476 if (strnprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0)
477 return log_error_errno(-EIO, EIO, "Failed to create console entry");
478
479 DECLARE_ARG("--ext-mount-map");
480 DECLARE_ARG(buf);
481 }
482
483 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
484
485 if (lxc_conf->lsm_aa_profile)
486 ret = strnprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
487 else
488 ret = strnprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
489 if (ret < 0)
490 return log_error_errno(-EIO, EIO, "Failed to create lsm entry");
491
492 DECLARE_ARG("--lsm-profile");
493 DECLARE_ARG(buf);
494 }
495
496 list_for_each_entry(netdev, &opts->c->lxc_conf->netdevs, head) {
497 size_t retlen;
498 char eth[128], *veth;
499 bool external_not_veth;
500
501 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
502 /* Since criu version 2.8 the usage of --veth-pair
503 * has been deprecated:
504 * git tag --contains f2037e6d3445fc400
505 * v2.8 */
506 external_not_veth = true;
507 } else {
508 external_not_veth = false;
509 }
510
511 if (netdev->name[0] != '\0') {
512 retlen = strlcpy(eth, netdev->name, sizeof(eth));
513 if (retlen >= sizeof(eth))
514 return log_error_errno(-E2BIG, E2BIG, "Failed to append veth device name");
515 } else {
516 ret = strnprintf(eth, sizeof(eth), "eth%d", netnr);
517 if (ret < 0)
518 return log_error_errno(-E2BIG, E2BIG, "Failed to append veth device name");
519 }
520
521 switch (netdev->type) {
522 case LXC_NET_VETH:
523 veth = netdev->priv.veth_attr.pair;
524 if (veth[0] == '\0')
525 veth = netdev->priv.veth_attr.veth1;
526
527 if (netdev->link[0] != '\0') {
528 if (external_not_veth)
529 ret = strnprintf(buf, sizeof(buf), "veth[%s]:%s@%s", eth, veth, netdev->link);
530 else
531 ret = strnprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, netdev->link);
532 } else {
533 if (external_not_veth)
534 ret = strnprintf(buf, sizeof(buf), "veth[%s]:%s", eth, veth);
535 else
536 ret = strnprintf(buf, sizeof(buf), "%s=%s", eth, veth);
537 }
538 if (ret < 0)
539 return log_error_errno(-EIO, EIO, "Failed to append veth device name");
540
541 TRACE("Added veth device entry %s", buf);
542 break;
543 case LXC_NET_MACVLAN:
544 if (netdev->link[0] == '\0')
545 return log_error_errno(-EINVAL, EINVAL, "Failed to find host interface for macvlan %s", netdev->name);
546
547 ret = strnprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, netdev->link);
548 if (ret < 0)
549 return log_error_errno(-EIO, EIO, "Failed to add macvlan entry");
550
551 TRACE("Added macvlan device entry %s", buf);
552
553 break;
554 case LXC_NET_NONE:
555 case LXC_NET_EMPTY:
556 break;
557 default:
558 /* we have screened for this earlier... */
559 return log_error_errno(-EINVAL, EINVAL, "Unsupported network type %d", netdev->type);
560 }
561
562 if (external_not_veth)
563 DECLARE_ARG("--external");
564 else
565 DECLARE_ARG("--veth-pair");
566 DECLARE_ARG(buf);
567 netnr++;
568 }
569
570 }
571
572 args->argv[args->argc] = NULL;
573
574 if (lxc_log_trace()) {
575 buf[0] = 0;
576 for (int i = 0, pos = 0; i < args->argc && args->argv[i]; i++) {
577 ret = strnprintf(buf + pos, sizeof(buf) - pos, "%s ", args->argv[i]);
578 if (ret < 0)
579 return log_error_errno(-EIO, EIO, "Failed to reorder entries");
580 else
581 pos += ret;
582 }
583
584 TRACE("Using command line %s", buf);
585 }
586
587 /* before criu inits its log, it sometimes prints things to stdout/err;
588 * let's be sure we capture that.
589 */
590 if (dup2(opts->pipefd, STDOUT_FILENO) < 0)
591 return log_error_errno(-errno, errno, "Failed to duplicate stdout");
592
593 if (dup2(opts->pipefd, STDERR_FILENO) < 0)
594 return log_error_errno(-errno, errno, "Failed to duplicate stderr");
595
596 close(opts->pipefd);
597
598 #undef DECLARE_ARG
599 execv(args->argv[0], args->argv);
600 return -ENOEXEC;
601 }
602
603 /*
604 * Function to check if the checks activated in 'features_to_check' are
605 * available with the current architecture/kernel/criu combination.
606 *
607 * Parameter features_to_check is a bit mask of all features that should be
608 * checked (see feature check defines in lxc/lxccontainer.h).
609 *
610 * If the return value is true, all requested features are supported. If
611 * the return value is false the features_to_check parameter is updated
612 * to reflect which features are available. '0' means no feature but
613 * also that something went totally wrong.
614 *
615 * Some of the code flow of criu_version_ok() is duplicated and maybe it
616 * is a good candidate for refactoring.
617 */
618 bool __criu_check_feature(uint64_t *features_to_check)
619 {
620 pid_t pid;
621 uint64_t current_bit = 0;
622 int ret;
623 uint64_t features = *features_to_check;
624 /* Feature checking is currently always like
625 * criu check --feature <feature-name>
626 */
627 char *args[] = { "criu", "check", "--feature", NULL, NULL };
628
629 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
630 /* There are feature bits activated we do not understand.
631 * Refusing to answer at all */
632 *features_to_check = 0;
633 return false;
634 }
635
636 while (current_bit < (sizeof(uint64_t) * 8 - 1)) {
637 /* only test requested features */
638 if (!(features & (1ULL << current_bit))) {
639 /* skip this */
640 current_bit++;
641 continue;
642 }
643
644 pid = fork();
645 if (pid < 0) {
646 SYSERROR("fork() failed");
647 *features_to_check = 0;
648 return false;
649 }
650
651 if (pid == 0) {
652 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
653 /* This is needed for pre-dump support, which
654 * enables pre-copy migration. */
655 args[3] = "mem_dirty_track";
656 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
657 /* CRIU has two checks for userfaultfd support.
658 *
659 * The simpler check is only for 'uffd'. If the
660 * kernel supports userfaultfd without noncoop
661 * then only process can be lazily restored
662 * which do not fork. With 'uffd-noncoop'
663 * it is also possible to lazily restore processes
664 * which do fork. For a container runtime like
665 * LXC checking only for 'uffd' makes not much sense. */
666 args[3] = "uffd-noncoop";
667 else
668 _exit(EXIT_FAILURE);
669
670 null_stdfds();
671
672 execvp("criu", args);
673 SYSERROR("Failed to exec \"criu\"");
674 _exit(EXIT_FAILURE);
675 }
676
677 ret = wait_for_pid(pid);
678
679 if (ret == -1) {
680 /* It is not known why CRIU failed. Either
681 * CRIU is not available, the feature check
682 * does not exist or the feature is not
683 * supported. */
684 INFO("feature not supported");
685 /* Clear not supported feature bit */
686 features &= ~(1ULL << current_bit);
687 }
688
689 current_bit++;
690 /* no more checks requested; exit check loop */
691 if (!(features & ~((1ULL << current_bit)-1)))
692 break;
693 }
694 if (features != *features_to_check) {
695 *features_to_check = features;
696 return false;
697 }
698 return true;
699 }
700
701 /*
702 * Check to see if the criu version is recent enough for all the features we
703 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
704 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
705 * things potentially before a version is released with a particular feature.
706 *
707 * The intent is that when criu development slows down, we can drop this, but
708 * for now we shouldn't attempt to c/r with versions that we know won't work.
709 *
710 * Note: If version != NULL criu_version() stores the detected criu version in
711 * version. Allocates memory for version which must be freed by caller.
712 */
713 static bool criu_version_ok(char **version)
714 {
715 int pipes[2];
716 pid_t pid;
717
718 if (pipe(pipes) < 0) {
719 SYSERROR("pipe() failed");
720 return false;
721 }
722
723 pid = fork();
724 if (pid < 0) {
725 SYSERROR("fork() failed");
726 return false;
727 }
728
729 if (pid == 0) {
730 char *args[] = { "criu", "--version", NULL };
731 char *path;
732 close(pipes[0]);
733
734 close(STDERR_FILENO);
735 if (dup2(pipes[1], STDOUT_FILENO) < 0)
736 _exit(EXIT_FAILURE);
737
738 path = on_path("criu", NULL);
739 if (!path)
740 _exit(EXIT_FAILURE);
741
742 execv(path, args);
743 _exit(EXIT_FAILURE);
744 } else {
745 FILE *f;
746 char *tmp;
747 int patch;
748
749 close(pipes[1]);
750 if (wait_for_pid(pid) < 0) {
751 close(pipes[0]);
752 SYSERROR("execing criu failed, is it installed?");
753 return false;
754 }
755
756 f = fdopen(pipes[0], "re");
757 if (!f) {
758 close(pipes[0]);
759 return false;
760 }
761
762 tmp = malloc(1024);
763 if (!tmp) {
764 fclose(f);
765 return false;
766 }
767
768 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
769 goto version_error;
770
771 if (fgetc(f) != '\n')
772 goto version_error;
773
774 if (strcmp(tmp, CRIU_VERSION) >= 0)
775 goto version_match;
776
777 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
778 goto version_error;
779
780 if (fgetc(f) != '-')
781 goto version_error;
782
783 if (fscanf(f, "%d", &patch) != 1)
784 goto version_error;
785
786 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
787 goto version_error;
788
789 if (patch < CRIU_GITID_PATCHLEVEL)
790 goto version_error;
791
792 version_match:
793 fclose(f);
794 if (!version)
795 free(tmp);
796 else
797 *version = tmp;
798 return true;
799
800 version_error:
801 fclose(f);
802 free(tmp);
803 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
804 return false;
805 }
806 }
807
808 /* Check and make sure the container has a configuration that we know CRIU can
809 * dump. */
810 static bool criu_ok(struct lxc_container *c, char **criu_version)
811 {
812 struct lxc_netdev *netdev;
813
814 if (geteuid()) {
815 ERROR("Must be root to checkpoint");
816 return false;
817 }
818
819 if (!criu_version_ok(criu_version))
820 return false;
821
822 /* We only know how to restore containers with veth networks. */
823 list_for_each_entry(netdev, &c->lxc_conf->netdevs, head) {
824 switch(netdev->type) {
825 case LXC_NET_VETH:
826 case LXC_NET_NONE:
827 case LXC_NET_EMPTY:
828 case LXC_NET_MACVLAN:
829 break;
830 default:
831 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(netdev->type), netdev->name);
832 if (criu_version) {
833 free(*criu_version);
834 *criu_version = NULL;
835 }
836 return false;
837 }
838 }
839
840 return true;
841 }
842
843 static bool restore_net_info(struct lxc_container *c)
844 {
845 int ret;
846 bool has_error = true;
847 struct lxc_netdev *netdev;
848
849 if (container_mem_lock(c))
850 return false;
851
852 list_for_each_entry(netdev, &c->lxc_conf->netdevs, head) {
853 char template[IFNAMSIZ];
854
855 if (netdev->type != LXC_NET_VETH)
856 continue;
857
858 ret = strnprintf(template, sizeof(template), "vethXXXXXX");
859 if (ret < 0)
860 goto out_unlock;
861
862 if (netdev->priv.veth_attr.pair[0] == '\0' &&
863 netdev->priv.veth_attr.veth1[0] == '\0') {
864 if (!lxc_ifname_alnum_case_sensitive(template))
865 goto out_unlock;
866
867 (void)strlcpy(netdev->priv.veth_attr.veth1, template, IFNAMSIZ);
868 }
869 }
870
871 has_error = false;
872
873 out_unlock:
874 container_mem_unlock(c);
875 return !has_error;
876 }
877
/* do_restore never returns, the calling process is used as the monitor process.
 * do_restore calls _exit() if it fails.
 *
 * @c:            container to restore
 * @status_pipe:  fd the wait status of the criu child (or a non-zero failure
 *                value) is written to
 * @opts:         user-provided migrate options, handed on to exec_criu()
 * @criu_version: detected criu version, handed on to exec_criu()
 *
 * The function forks once: the child prepares the rootfs in a new mount
 * namespace and execs criu, while the parent waits for it, picks up the pid
 * of the restored init from its own "children" file and then runs the
 * monitor loop via lxc_poll().
 */
static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
{
	int fd, ret;
	pid_t pid;
	struct lxc_handler *handler;
	int status = 0;
	/* pipes[1] becomes criu's stdout/stderr, pipes[0] is read by the parent */
	int pipes[2] = {-1, -1};
	struct cgroup_ops *cgroup_ops;

	/* Try to detach from the current controlling tty if it exists.
	 * Otherwise, lxc_init (via lxc_console) will attach the container's
	 * console output to the current tty, which is probably not what any
	 * library user wants, and if they do, they can just manually configure
	 * it :)
	 */
	fd = open("/dev/tty", O_RDWR);
	if (fd >= 0) {
		if (ioctl(fd, TIOCNOTTY, NULL) < 0)
			SYSERROR("couldn't detach from tty");
		close(fd);
	}

	handler = lxc_init_handler(NULL, c->name, c->lxc_conf, c->config_path, false);
	if (!handler)
		goto out;

	if (lxc_init(c->name, handler) < 0)
		goto out;
	cgroup_ops = handler->cgroup_ops;

	/* Set up the monitor and payload cgroups before criu is started. */
	if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
		ERROR("Failed to create monitor cgroup");
		goto out_fini_handler;
	}

	if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
		ERROR("Failed to enter monitor cgroup");
		goto out_fini_handler;
	}

	if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
		ERROR("Failed to delegate controllers to monitor cgroup");
		goto out_fini_handler;
	}

	if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
		ERROR("Failed creating cgroups");
		goto out_fini_handler;
	}

	if (!restore_net_info(c)) {
		ERROR("failed restoring network info");
		goto out_fini_handler;
	}

	ret = resolve_clone_flags(handler);
	if (ret < 0) {
		SYSERROR("Unsupported clone flag specified");
		goto out_fini_handler;
	}

	if (pipe2(pipes, O_CLOEXEC) < 0) {
		SYSERROR("pipe() failed");
		goto out_fini_handler;
	}

	pid = fork();
	if (pid < 0)
		goto out_fini_handler;

	if (pid == 0) {
		/* Child: prepare the rootfs and exec criu. */
		struct criu_opts os;
		struct lxc_rootfs *rootfs;
		int flags;

		/* Only the parent reports through status_pipe; marking it as
		 * closed also keeps the shared error path below from writing
		 * to it.
		 */
		close(status_pipe);
		status_pipe = -1;

		close(pipes[0]);
		pipes[0] = -1;

		if (unshare(CLONE_NEWNS))
			goto out_fini_handler;

		ret = lxc_storage_prepare(c->lxc_conf);
		if (ret)
			goto out_fini_handler;

		/* CRIU needs the lxc root bind mounted so that it is the root of some
		 * mount. */
		rootfs = &c->lxc_conf->rootfs;

		if (rootfs_is_blockdev(c->lxc_conf)) {
			if (lxc_setup_rootfs_prepare_root(c->lxc_conf, c->name,
							  c->config_path) < 0)
				goto out_fini_handler;
		} else {
			if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
				goto out_fini_handler;

			/* Note that the flags passed here are MS_SLAVE | MS_REC. */
			if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
				SYSERROR("remount / to private failed");
				goto out_fini_handler;
			}

			if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
				(void)rmdir(rootfs->mount);
				goto out_fini_handler;
			}
		}

		/* Only the fields set here and console_name below are
		 * initialized; tty_id is left unset.
		 */
		os.pipefd = pipes[1];
		os.action = "restore";
		os.user = opts;
		os.c = c;
		os.console_fd = c->lxc_conf->console.pty;
		os.criu_version = criu_version;
		os.handler = handler;

		if (os.console_fd >= 0) {
			/* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
			 * via --inherit-fd, so we don't want it to close.
			 */
			flags = fcntl(os.console_fd, F_GETFD);
			if (flags < 0) {
				SYSERROR("F_GETFD failed: %d", os.console_fd);
				goto out_fini_handler;
			}

			flags &= ~FD_CLOEXEC;

			if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
				SYSERROR("F_SETFD failed");
				goto out_fini_handler;
			}
		}
		os.console_name = c->lxc_conf->console.name;

		/* exec_criu() returning is an error */
		ret = exec_criu(handler->cgroup_ops, c->lxc_conf, &os);
		if (ret)
			SYSERROR("Failed to execute criu");
		umount(rootfs->mount);
		(void)rmdir(rootfs->mount);
		goto out_fini_handler;
	} else {
		/* Parent: wait for criu and then become the monitor. */
		char title[2048];

		close(pipes[1]);
		pipes[1] = -1;

		pid_t w = waitpid(pid, &status, 0);
		if (w == -1) {
			SYSERROR("waitpid");
			goto out_fini_handler;
		}

		if (WIFEXITED(status)) {
			char buf[4096];

			if (WEXITSTATUS(status)) {
				int n;

				/* criu failed: log whatever it printed. */
				n = lxc_read_nointr(pipes[0], buf, sizeof(buf));
				if (n < 0) {
					SYSERROR("failed reading from criu stderr");
					goto out_fini_handler;
				}

				/* leave room for the terminating NUL byte */
				if (n == sizeof(buf))
					n--;
				buf[n] = 0;

				ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
				goto out_fini_handler;
			} else {
				/* criu succeeded: the first pid listed in this
				 * thread's children file is recorded as the
				 * pid of the restored container.
				 */
				ret = strnprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
				if (ret < 0) {
					ERROR("strnprintf'd too many characters: %d", ret);
					goto out_fini_handler;
				}

				FILE *f = fopen(buf, "re");
				if (!f) {
					SYSERROR("couldn't read restore's children file %s", buf);
					goto out_fini_handler;
				}

				ret = fscanf(f, "%d", (int*) &handler->pid);
				fclose(f);
				if (ret != 1) {
					ERROR("reading restore pid failed");
					goto out_fini_handler;
				}

				if (lxc_set_state(c->name, handler, RUNNING)) {
					ERROR("error setting running state after restore");
					goto out_fini_handler;
				}
			}
		} else {
			ERROR("CRIU was killed with signal %d", WTERMSIG(status));
			goto out_fini_handler;
		}

		close(pipes[0]);

		/* Report the (successful) wait status to whoever holds the
		 * other end of status_pipe.
		 */
		ret = lxc_write_nointr(status_pipe, &status, sizeof(status));
		close(status_pipe);
		status_pipe = -1;

		if (sizeof(status) != ret) {
			SYSERROR("failed to write all of status");
			goto out_fini_handler;
		}

		/*
		 * See comment in lxcapi_start; we don't care if these
		 * fail because it's just a beauty thing. We just
		 * assign the return value here to silence potential
		 * compiler warnings.
		 */
		ret = strnprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
		if (ret < 0)
			INFO("Setting truncated process name");

		ret = setproctitle(title);
		if (ret < 0)
			INFO("Failed to set process name");

		/* Run the monitor loop; this process exits with its result. */
		ret = lxc_poll(c->name, handler);
		if (ret)
			lxc_abort(handler);
		lxc_end(handler);
		_exit(ret);
	}

out_fini_handler:
	/* Shared error path, reached from both the child and the parent. */
	if (pipes[0] >= 0)
		close(pipes[0]);
	if (pipes[1] >= 0)
		close(pipes[1]);

	lxc_end(handler);

out:
	if (status_pipe >= 0) {
		/* ensure getting here was a failure, e.g. if we failed to
		 * parse the child pid or something, even after a successful
		 * restore
		 */
		if (!status)
			status = 1;

		if (lxc_write_nointr(status_pipe, &status, sizeof(status)) != sizeof(status))
			SYSERROR("writing status failed");
		close(status_pipe);
	}

	_exit(EXIT_FAILURE);
}
1141
1142 static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1143 {
1144 FILE *f;
1145 char path[PATH_MAX];
1146 int ret;
1147 struct stat sb;
1148
1149 if (c->lxc_conf->console.path && strequal(c->lxc_conf->console.path, "none")) {
1150 tty_id[0] = 0;
1151 return 0;
1152 }
1153
1154 ret = strnprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1155 if (ret < 0) {
1156 ERROR("strnprintf'd too many characters: %d", ret);
1157 return -1;
1158 }
1159
1160 ret = stat(path, &sb);
1161 if (ret < 0) {
1162 SYSERROR("stat of %s failed", path);
1163 return -1;
1164 }
1165
1166 ret = strnprintf(path, sizeof(path), "%s/tty.info", directory);
1167 if (ret < 0) {
1168 ERROR("strnprintf'd too many characters: %d", ret);
1169 return -1;
1170 }
1171
1172 ret = strnprintf(tty_id, len, "tty[%llx:%llx]",
1173 (long long unsigned) sb.st_rdev,
1174 (long long unsigned) sb.st_dev);
1175 if (ret < 0) {
1176 ERROR("strnprintf'd too many characters: %d", ret);
1177 return -1;
1178 }
1179
1180 f = fopen(path, "we");
1181 if (!f) {
1182 SYSERROR("failed to open %s", path);
1183 return -1;
1184 }
1185
1186 ret = fprintf(f, "%s", tty_id);
1187 fclose(f);
1188 if (ret < 0)
1189 SYSERROR("failed to write to %s", path);
1190 return ret;
1191 }
1192
1193 /* do one of either predump or a regular dump */
1194 static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
1195 {
1196 int ret;
1197 pid_t pid;
1198 int criuout[2];
1199 char *criu_version = NULL;
1200
1201 if (!criu_ok(c, &criu_version))
1202 return false;
1203
1204 ret = pipe(criuout);
1205 if (ret < 0) {
1206 SYSERROR("pipe() failed");
1207 free(criu_version);
1208 return false;
1209 }
1210
1211 if (mkdir_p(opts->directory, 0700) < 0)
1212 goto fail;
1213
1214 pid = fork();
1215 if (pid < 0) {
1216 SYSERROR("fork failed");
1217 goto fail;
1218 }
1219
1220 if (pid == 0) {
1221 struct criu_opts os;
1222 struct cgroup_ops *cgroup_ops;
1223
1224 close(criuout[0]);
1225
1226 cgroup_ops = cgroup_init(c->lxc_conf);
1227 if (!cgroup_ops) {
1228 ERROR("failed to cgroup_init()");
1229 _exit(EXIT_FAILURE);
1230 }
1231
1232 os.pipefd = criuout[1];
1233 os.action = mode;
1234 os.user = opts;
1235 os.c = c;
1236 os.console_name = c->lxc_conf->console.path;
1237 os.criu_version = criu_version;
1238 os.handler = NULL;
1239
1240 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1241 if (ret < 0) {
1242 free(criu_version);
1243 _exit(EXIT_FAILURE);
1244 }
1245
1246 /* exec_criu() returning is an error */
1247 ret = exec_criu(cgroup_ops, c->lxc_conf, &os);
1248 if (ret)
1249 SYSERROR("Failed to execute criu");
1250 free(criu_version);
1251 _exit(EXIT_FAILURE);
1252 } else {
1253 int status;
1254 ssize_t n;
1255 char buf[4096];
1256
1257 close(criuout[1]);
1258
1259 pid_t w = waitpid(pid, &status, 0);
1260 if (w == -1) {
1261 SYSERROR("waitpid");
1262 close(criuout[0]);
1263 free(criu_version);
1264 return false;
1265 }
1266
1267 n = lxc_read_nointr(criuout[0], buf, sizeof(buf));
1268 close(criuout[0]);
1269 if (n < 0) {
1270 SYSERROR("read");
1271 n = 0;
1272 }
1273
1274 if (n == sizeof(buf))
1275 buf[n-1] = 0;
1276 else
1277 buf[n] = 0;
1278
1279 if (WIFEXITED(status)) {
1280 if (WEXITSTATUS(status)) {
1281 ERROR("dump failed with %d", WEXITSTATUS(status));
1282 ret = false;
1283 } else {
1284 ret = true;
1285 }
1286 } else if (WIFSIGNALED(status)) {
1287 ERROR("dump signaled with %d", WTERMSIG(status));
1288 ret = false;
1289 } else {
1290 ERROR("unknown dump exit %d", status);
1291 ret = false;
1292 }
1293
1294 if (!ret)
1295 ERROR("criu output: %s", buf);
1296
1297 free(criu_version);
1298 return ret;
1299 }
1300 fail:
1301 close(criuout[0]);
1302 close(criuout[1]);
1303 (void)rmdir(opts->directory);
1304 free(criu_version);
1305 return false;
1306 }
1307
/*
 * Pre-dump container @c by running do_dump() with the "pre-dump" action.
 *
 * Returns whatever do_dump() returns.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped;

	dumped = do_dump(c, "pre-dump", opts);

	return dumped;
}
1312
1313 bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
1314 {
1315 char path[PATH_MAX];
1316 int ret;
1317
1318 ret = strnprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
1319 if (ret < 0)
1320 return false;
1321
1322 if (access(path, F_OK) == 0) {
1323 ERROR("please use a fresh directory for the dump directory");
1324 return false;
1325 }
1326
1327 return do_dump(c, "dump", opts);
1328 }
1329
/*
 * Restore container @c from a previous dump.
 *
 * Forks a child that runs do_restore() and reports a wait()-style status
 * word back over a pipe. Requires an effective uid of 0.
 *
 * Returns true if a complete status was read and it describes a normal exit
 * with code 0; false otherwise.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	char *version = NULL;
	int fds[2];
	int status, nread;
	pid_t child;

	if (geteuid() != 0) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(fds) != 0) {
		ERROR("failed to create pipe");
		return false;
	}

	if (!criu_ok(c, &version))
		goto err_close;

	child = fork();
	if (child < 0) {
		free(version);
		goto err_close;
	}

	if (child == 0) {
		close(fds[0]);
		/* this never returns */
		do_restore(c, fds[1], opts, version);
	}

	/* Parent: keep only the read end and wait for the status word. */
	close(fds[1]);
	free(version);

	nread = lxc_read_nointr(fds[0], &status, sizeof(status));
	close(fds[0]);

	/*
	 * On a clean exit of criu the child becomes the monitor process, so
	 * there is nothing to wait for. In every other case (short read, criu
	 * killed, criu exited nonzero) the restore process died and has to be
	 * reaped.
	 */
	if (nread != (int)sizeof(status))
		ERROR("reading status from pipe failed");
	else if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return true;

	if (wait_for_pid(child))
		ERROR("restore process died");
	return false;

err_close:
	close(fds[0]);
	close(fds[1]);
	return false;
}