]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
tree-wide: use lxc_drop_groups() instead of lxc_setgroups(0, NULL)
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
d38dd64a
CB
2
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE 1
5#endif
9b945f13 6#include <inttypes.h>
e29fe1dd
TA
7#include <linux/limits.h>
8#include <sched.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12#include <sys/mount.h>
13#include <sys/types.h>
14#include <sys/wait.h>
15#include <unistd.h>
16
e29fe1dd 17#include "cgroup.h"
dc259399 18#include "commands.h"
d38dd64a
CB
19#include "conf.h"
20#include "config.h"
e29fe1dd
TA
21#include "criu.h"
22#include "log.h"
23#include "lxc.h"
24#include "lxclock.h"
25#include "network.h"
28d832c4 26#include "storage.h"
e8f764b6 27#include "syscall_wrappers.h"
e29fe1dd
TA
28#include "utils.h"
29
5f4e44a2
TA
30#if IS_BIONIC
31#include <../include/lxcmntent.h>
32#else
33#include <mntent.h>
34#endif
35
9de31d5a
CB
36#ifndef HAVE_STRLCPY
37#include "include/strlcpy.h"
38#endif
39
c33b0338 40#define CRIU_VERSION "2.0"
73d46752
TA
41
42#define CRIU_GITID_VERSION "2.0"
43#define CRIU_GITID_PATCHLEVEL 0
44
f1954503 45#define CRIU_IN_FLIGHT_SUPPORT "2.4"
46c8ffd5 46#define CRIU_EXTERNAL_NOT_VETH "2.8"
f1954503 47
ac2cecc4 48lxc_log_define(criu, lxc);
e29fe1dd 49
/*
 * Everything exec_criu() needs to build and run one criu command line.
 * Filled in by the dump/restore drivers before exec_criu() is called.
 */
struct criu_opts {
	/* the thing to hook to stdout and stderr for logging */
	int pipefd;

	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means no tty is passed to criu via --external.
	 */
	char tty_id[32];

	/* restore: the handler set up by the restoring process */
	struct lxc_handler *handler;

	/* restore: fd handed to criu via --inherit-fd; negative if there is
	 * no console to pass on.
	 */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pty
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
79
4b54788e
TA
80static int load_tty_major_minor(char *directory, char *output, int len)
81{
4b54788e 82 char path[PATH_MAX];
c3e48967 83 ssize_t ret;
4b54788e
TA
84
85 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
c3e48967
CB
86 if (ret < 0 || (size_t)ret >= sizeof(path))
87 return ret_errno(EIO);
4b54788e 88
c3e48967
CB
89 ret = lxc_read_from_file(path, output, len);
90 if (ret < 0) {
91 /*
92 * This means we're coming from a liblxc which didn't export
3aed4934
CB
93 * the tty info. In this case they had to have lxc.console.path
94 * = * none, so there's no problem restoring.
4b54788e
TA
95 */
96 if (errno == ENOENT)
97 return 0;
98
c3e48967 99 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
4b54788e
TA
100 }
101
4b54788e
TA
102 return 0;
103}
104
74ad3607
FB
/*
 * Compare two dotted version strings of the form "major[.minor[.patch]]".
 *
 * Returns 1 if v1 is newer than v2, 0 if they are equal and -1 if v1 is
 * older than v2. A component that is missing from a string counts as -1,
 * so "2.8" sorts before "2.8.0". If not even a major number can be parsed
 * from either string, -1 is returned.
 */
static int cmp_version(const char *v1, const char *v2)
{
	int lhs[3] = { -1, -1, -1 };
	int rhs[3] = { -1, -1, -1 };
	size_t idx;

	if (sscanf(v1, "%d.%d.%d", &lhs[0], &lhs[1], &lhs[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &rhs[0], &rhs[1], &rhs[2]) < 1)
		return -1;

	/* The first component that differs (major, minor, patch) decides. */
	for (idx = 0; idx < 3; idx++) {
		if (lhs[idx] != rhs[idx])
			return lhs[idx] > rhs[idx] ? 1 : -1;
	}

	return 0;
}
145
e20f46f8
AR
146static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
147 struct criu_opts *opts)
e29fe1dd
TA
148{
149 char **argv, log[PATH_MAX];
19d1509c 150 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
151 int netnr = 0;
152 struct lxc_list *it;
5f4e44a2
TA
153 FILE *mnts;
154 struct mntent mntent;
e29fe1dd 155
0e4be3cf 156 char buf[4096], ttys[32];
a17fa3c0 157 size_t pos;
5af85cb1 158
e9195050
TA
159 /* If we are currently in a cgroup /foo/bar, and the container is in a
160 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
161 * container has an open fd that points to one of the cgroup files
162 * (systemd always opens its "root" cgroup). So, let's escape to the
163 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
164 * see all cgroups.
165 */
e20f46f8 166 if (!cgroup_ops->escape(cgroup_ops, conf)) {
e9195050
TA
167 ERROR("failed to escape cgroups");
168 return;
169 }
170
e29fe1dd 171 /* The command line always looks like:
19d1509c 172 * criu $(action) --tcp-established --file-locks --link-remap \
5f178bc9 173 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
e29fe1dd
TA
174 * -o $(directory)/$(action).log --ext-mount-map auto
175 * --enable-external-sharing --enable-external-masters
4b54788e 176 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
177 * +1 for final NULL */
178
aef3d51e 179 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
180 /* -t pid --freeze-cgroup /lxc/ct */
181 static_args += 4;
e29fe1dd 182
aef3d51e 183 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 184 if (opts->user->predump_dir)
aef3d51e
TA
185 static_args += 2;
186
74eb576c 187 /* --page-server --address <address> --port <port> */
b2c3710f 188 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
189 static_args += 5;
190
aef3d51e 191 /* --leave-running (only for final dump) */
b2c3710f 192 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 193 static_args++;
4b54788e
TA
194
195 /* --external tty[88,4] */
196 if (opts->tty_id[0])
197 static_args += 2;
19d1509c
TA
198
199 /* --force-irmap */
200 if (!opts->user->preserves_inodes)
201 static_args++;
b2b7b0d2
TA
202
203 /* --ghost-limit 1024 */
204 if (opts->user->ghost_limit)
205 static_args += 2;
e29fe1dd
TA
206 } else if (strcmp(opts->action, "restore") == 0) {
207 /* --root $(lxc_mount_point) --restore-detached
0ab5703f 208 * --restore-sibling
13389b29
TA
209 * --lsm-profile apparmor:whatever
210 */
0ab5703f 211 static_args += 6;
4b54788e 212
0e4be3cf
CB
213 ttys[0] = 0;
214 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
4b54788e
TA
215 return;
216
217 /* --inherit-fd fd[%d]:tty[%s] */
0e4be3cf 218 if (ttys[0])
4b54788e 219 static_args += 2;
e29fe1dd
TA
220 } else {
221 return;
222 }
223
2202afc9
CB
224 if (cgroup_ops->num_hierarchies(cgroup_ops) > 0)
225 static_args += 2 * cgroup_ops->num_hierarchies(cgroup_ops);
0ab5703f 226
b2c3710f 227 if (opts->user->verbose)
e29fe1dd
TA
228 static_args++;
229
b9ee6643
TA
230 if (opts->user->action_script)
231 static_args += 2;
232
5f4e44a2
TA
233 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
234
b2c3710f 235 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd 236 if (ret < 0 || ret >= PATH_MAX) {
9f1f54b0 237 ERROR("logfile name too long");
e29fe1dd
TA
238 return;
239 }
240
241 argv = malloc(static_args * sizeof(*argv));
242 if (!argv)
243 return;
244
245 memset(argv, 0, static_args * sizeof(*argv));
246
247#define DECLARE_ARG(arg) \
248 do { \
249 if (arg == NULL) { \
250 ERROR("Got NULL argument for criu"); \
251 goto err; \
252 } \
253 argv[argc++] = strdup(arg); \
254 if (!argv[argc-1]) \
255 goto err; \
256 } while (0)
257
258 argv[argc++] = on_path("criu", NULL);
259 if (!argv[argc-1]) {
9f1f54b0 260 ERROR("Couldn't find criu binary");
e29fe1dd
TA
261 goto err;
262 }
263
264 DECLARE_ARG(opts->action);
265 DECLARE_ARG("--tcp-established");
266 DECLARE_ARG("--file-locks");
267 DECLARE_ARG("--link-remap");
0a5fc6df 268 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
269 DECLARE_ARG("--ext-mount-map");
270 DECLARE_ARG("auto");
271 DECLARE_ARG("--enable-external-sharing");
272 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
273 DECLARE_ARG("--enable-fs");
274 DECLARE_ARG("hugetlbfs");
5b454329
TA
275 DECLARE_ARG("--enable-fs");
276 DECLARE_ARG("tracefs");
e29fe1dd 277 DECLARE_ARG("-D");
b2c3710f 278 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
279 DECLARE_ARG("-o");
280 DECLARE_ARG(log);
281
2202afc9 282 for (i = 0; i < cgroup_ops->num_hierarchies(cgroup_ops); i++) {
0ab5703f 283 char **controllers = NULL, *fullname;
31b204e4 284 char *path, *tmp;
0ab5703f 285
2202afc9 286 if (!cgroup_ops->get_hierarchies(cgroup_ops, i, &controllers)) {
0ab5703f
TA
287 ERROR("failed to get hierarchy %d", i);
288 goto err;
289 }
290
291 /* if we are in a dump, we have to ask the monitor process what
292 * the right cgroup is. if this is a restore, we can just use
293 * the handler the restore task created.
294 */
295 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
a900cbaf 296 path = lxc_cmd_get_limiting_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
0ab5703f
TA
297 if (!path) {
298 ERROR("failed to get cgroup path for %s", controllers[0]);
299 goto err;
300 }
301 } else {
302 const char *p;
303
a900cbaf 304 p = cgroup_ops->get_limiting_cgroup(cgroup_ops, controllers[0]);
0ab5703f
TA
305 if (!p) {
306 ERROR("failed to get cgroup path for %s", controllers[0]);
307 goto err;
308 }
309
310 path = strdup(p);
311 if (!path) {
312 ERROR("strdup failed");
313 goto err;
314 }
315 }
316
31b204e4
CB
317 tmp = lxc_deslashify(path);
318 if (!tmp) {
319 ERROR("Failed to remove extraneous slashes from \"%s\"",
320 path);
0ab5703f
TA
321 free(path);
322 goto err;
323 }
31b204e4
CB
324 free(path);
325 path = tmp;
0ab5703f
TA
326
327 fullname = lxc_string_join(",", (const char **) controllers, false);
328 if (!fullname) {
329 ERROR("failed to join controllers");
330 free(path);
331 goto err;
332 }
333
334 ret = sprintf(buf, "%s:%s", fullname, path);
335 free(path);
336 free(fullname);
337 if (ret < 0 || ret >= sizeof(buf)) {
338 ERROR("sprintf of cgroup root arg failed");
339 goto err;
340 }
341
342 DECLARE_ARG("--cgroup-root");
343 DECLARE_ARG(buf);
344 }
345
b2c3710f 346 if (opts->user->verbose)
582cb478 347 DECLARE_ARG("-v4");
e29fe1dd 348
b9ee6643
TA
349 if (opts->user->action_script) {
350 DECLARE_ARG("--action-script");
351 DECLARE_ARG(opts->user->action_script);
352 }
353
1800f924
WB
354 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list,
355 opts->c->lxc_conf->lsm_aa_allow_nesting);
5f4e44a2
TA
356 if (!mnts)
357 goto err;
358
359 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
a08bfbe3
CB
360 unsigned long flags = 0;
361 char *mntdata = NULL;
5f4e44a2 362 char arg[2 * PATH_MAX + 2];
19d2422b
TA
363
364 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
365 goto err;
366
367 free(mntdata);
368
369 /* only add --ext-mount-map for actual bind mounts */
370 if (!(flags & MS_BIND))
371 continue;
5f4e44a2 372
d07545c7
CB
373 if (strcmp(opts->action, "dump") == 0)
374 ret = snprintf(arg, sizeof(arg), "/%s:%s",
375 mntent.mnt_dir, mntent.mnt_dir);
376 else
377 ret = snprintf(arg, sizeof(arg), "%s:%s",
378 mntent.mnt_dir, mntent.mnt_fsname);
5f4e44a2
TA
379 if (ret < 0 || ret >= sizeof(arg)) {
380 fclose(mnts);
381 ERROR("snprintf failed");
382 goto err;
383 }
384
385 DECLARE_ARG("--ext-mount-map");
386 DECLARE_ARG(arg);
387 }
388 fclose(mnts);
389
aef3d51e 390 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 391 char pid[32], *freezer_relative;
e29fe1dd
TA
392
393 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
394 goto err;
395
396 DECLARE_ARG("-t");
397 DECLARE_ARG(pid);
dc259399 398
a900cbaf
WB
399 freezer_relative = lxc_cmd_get_limiting_cgroup_path(opts->c->name,
400 opts->c->config_path,
401 "freezer");
dc259399
TA
402 if (!freezer_relative) {
403 ERROR("failed getting freezer path");
404 goto err;
405 }
406
928b065d
CB
407 if (pure_unified_layout(cgroup_ops))
408 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/%s", freezer_relative);
409 else
410 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
dc259399
TA
411 if (ret < 0 || ret >= sizeof(log))
412 goto err;
413
f1954503
AR
414 if (!opts->user->disable_skip_in_flight &&
415 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
416 DECLARE_ARG("--skip-in-flight");
417
dc259399
TA
418 DECLARE_ARG("--freeze-cgroup");
419 DECLARE_ARG(log);
420
4b54788e 421 if (opts->tty_id[0]) {
36d2096c
TA
422 DECLARE_ARG("--ext-mount-map");
423 DECLARE_ARG("/dev/console:console");
424
4b54788e
TA
425 DECLARE_ARG("--external");
426 DECLARE_ARG(opts->tty_id);
427 }
428
b2c3710f 429 if (opts->user->predump_dir) {
aef3d51e 430 DECLARE_ARG("--prev-images-dir");
b2c3710f 431 DECLARE_ARG(opts->user->predump_dir);
9f99a33f 432 DECLARE_ARG("--track-mem");
74eb576c 433 }
4c0c0319 434
b2c3710f 435 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
436 DECLARE_ARG("--page-server");
437 DECLARE_ARG("--address");
b2c3710f 438 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 439 DECLARE_ARG("--port");
b2c3710f 440 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 441 }
aef3d51e 442
19d1509c
TA
443 if (!opts->user->preserves_inodes)
444 DECLARE_ARG("--force-irmap");
445
b2b7b0d2
TA
446 if (opts->user->ghost_limit) {
447 char ghost_limit[32];
448
9b945f13 449 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
b2b7b0d2 450 if (ret < 0 || ret >= sizeof(ghost_limit)) {
9b945f13 451 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
b2b7b0d2
TA
452 goto err;
453 }
454
455 DECLARE_ARG("--ghost-limit");
456 DECLARE_ARG(ghost_limit);
457 }
458
aef3d51e 459 /* only for final dump */
b2c3710f 460 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
461 DECLARE_ARG("--leave-running");
462 } else if (strcmp(opts->action, "restore") == 0) {
463 void *m;
464 int additional;
13389b29 465 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
466
467 DECLARE_ARG("--root");
468 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
469 DECLARE_ARG("--restore-detached");
470 DECLARE_ARG("--restore-sibling");
e29fe1dd 471
0e4be3cf 472 if (ttys[0]) {
97e4f1a9 473 if (opts->console_fd < 0) {
3aed4934 474 ERROR("lxc.console.path configured on source host but not target");
97e4f1a9
TA
475 goto err;
476 }
477
0e4be3cf 478 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
4b54788e
TA
479 if (ret < 0 || ret >= sizeof(buf))
480 goto err;
481
482 DECLARE_ARG("--inherit-fd");
483 DECLARE_ARG(buf);
484 }
485 if (opts->console_name) {
486 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
487 SYSERROR("sprintf'd too many bytes");
488 }
489 DECLARE_ARG("--ext-mount-map");
490 DECLARE_ARG(buf);
491 }
492
13389b29
TA
493 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
494
495 if (lxc_conf->lsm_aa_profile)
496 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
497 else
498 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
499
500 if (ret < 0 || ret >= sizeof(buf))
501 goto err;
502
503 DECLARE_ARG("--lsm-profile");
504 DECLARE_ARG(buf);
505 }
506
e29fe1dd
TA
507 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
508
fa071249
TA
509 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
510 if (!m)
511 goto err;
e29fe1dd
TA
512 argv = m;
513
514 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
9de31d5a 515 size_t retlen;
e29fe1dd
TA
516 char eth[128], *veth;
517 struct lxc_netdev *n = it->elem;
46c8ffd5
AR
518 bool external_not_veth;
519
74ad3607 520 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
46c8ffd5
AR
521 /* Since criu version 2.8 the usage of --veth-pair
522 * has been deprecated:
523 * git tag --contains f2037e6d3445fc400
524 * v2.8 */
525 external_not_veth = true;
526 } else {
527 external_not_veth = false;
528 }
e29fe1dd 529
42277b1c 530 if (n->name[0] != '\0') {
9de31d5a
CB
531 retlen = strlcpy(eth, n->name, sizeof(eth));
532 if (retlen >= sizeof(eth))
e29fe1dd 533 goto err;
796a109d
TA
534 } else {
535 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
536 if (ret < 0 || ret >= sizeof(eth))
537 goto err;
538 }
e29fe1dd 539
e2697330
TA
540 switch (n->type) {
541 case LXC_NET_VETH:
542 veth = n->priv.veth_attr.pair;
ea7f6b29
CB
543 if (veth[0] == '\0')
544 veth = n->priv.veth_attr.veth1;
e29fe1dd 545
de4855a8 546 if (n->link[0] != '\0') {
46c8ffd5 547 if (external_not_veth)
d07545c7
CB
548 ret = snprintf(buf, sizeof(buf),
549 "veth[%s]:%s@%s",
550 eth, veth,
551 n->link);
46c8ffd5 552 else
d07545c7
CB
553 ret = snprintf(buf, sizeof(buf),
554 "%s=%s@%s", eth,
555 veth, n->link);
46c8ffd5
AR
556 } else {
557 if (external_not_veth)
d07545c7
CB
558 ret = snprintf(buf, sizeof(buf),
559 "veth[%s]:%s",
560 eth, veth);
46c8ffd5 561 else
d07545c7
CB
562 ret = snprintf(buf, sizeof(buf),
563 "%s=%s", eth,
564 veth);
46c8ffd5 565 }
e2697330
TA
566 if (ret < 0 || ret >= sizeof(buf))
567 goto err;
568 break;
569 case LXC_NET_MACVLAN:
de4855a8 570 if (n->link[0] == '\0') {
9f1f54b0 571 ERROR("no host interface for macvlan %s", n->name);
e2697330
TA
572 goto err;
573 }
574
575 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
576 if (ret < 0 || ret >= sizeof(buf))
577 goto err;
578 break;
579 case LXC_NET_NONE:
580 case LXC_NET_EMPTY:
581 break;
582 default:
583 /* we have screened for this earlier... */
9f1f54b0 584 ERROR("unexpected network type %d", n->type);
e29fe1dd 585 goto err;
e2697330 586 }
e29fe1dd 587
46c8ffd5
AR
588 if (external_not_veth)
589 DECLARE_ARG("--external");
590 else
591 DECLARE_ARG("--veth-pair");
e29fe1dd 592 DECLARE_ARG(buf);
2f3fbc6b 593 netnr++;
e29fe1dd
TA
594 }
595
596 }
597
598 argv[argc] = NULL;
599
cf4b07a5 600 buf[0] = 0;
a17fa3c0 601 pos = 0;
72a30576 602
cf4b07a5 603 for (i = 0; argv[i]; i++) {
72a30576
NE
604 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
605 if (ret < 0 || ret >= sizeof(buf) - pos)
606 goto err;
607 else
608 pos += ret;
cf4b07a5
TA
609 }
610
611 INFO("execing: %s", buf);
612
5af85cb1
TA
613 /* before criu inits its log, it sometimes prints things to stdout/err;
614 * let's be sure we capture that.
615 */
616 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
617 SYSERROR("dup2 stdout failed");
618 goto err;
619 }
620
621 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
622 SYSERROR("dup2 stderr failed");
623 goto err;
624 }
625
626 close(opts->pipefd);
627
e29fe1dd
TA
628#undef DECLARE_ARG
629 execv(argv[0], argv);
630err:
e29fe1dd
TA
631 for (i = 0; argv[i]; i++)
632 free(argv[i]);
633 free(argv);
634}
635
b5b12b9e
AR
636/*
637 * Function to check if the checks activated in 'features_to_check' are
638 * available with the current architecture/kernel/criu combination.
639 *
640 * Parameter features_to_check is a bit mask of all features that should be
641 * checked (see feature check defines in lxc/lxccontainer.h).
642 *
643 * If the return value is true, all requested features are supported. If
644 * the return value is false the features_to_check parameter is updated
645 * to reflect which features are available. '0' means no feature but
646 * also that something went totally wrong.
647 *
648 * Some of the code flow of criu_version_ok() is duplicated and maybe it
649 * is a good candidate for refactoring.
650 */
651bool __criu_check_feature(uint64_t *features_to_check)
652{
653 pid_t pid;
654 uint64_t current_bit = 0;
655 int ret;
fca23691 656 uint64_t features = *features_to_check;
b5b12b9e
AR
657 /* Feature checking is currently always like
658 * criu check --feature <feature-name>
659 */
660 char *args[] = { "criu", "check", "--feature", NULL, NULL };
661
662 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
663 /* There are feature bits activated we do not understand.
664 * Refusing to answer at all */
665 *features_to_check = 0;
666 return false;
667 }
668
6d61f17d 669 while (current_bit < (sizeof(uint64_t) * 8 - 1)) {
b5b12b9e
AR
670 /* only test requested features */
671 if (!(features & (1ULL << current_bit))) {
672 /* skip this */
673 current_bit++;
674 continue;
675 }
676
677 pid = fork();
678 if (pid < 0) {
679 SYSERROR("fork() failed");
680 *features_to_check = 0;
681 return false;
682 }
683
684 if (pid == 0) {
685 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
686 /* This is needed for pre-dump support, which
687 * enables pre-copy migration. */
688 args[3] = "mem_dirty_track";
689 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
690 /* CRIU has two checks for userfaultfd support.
691 *
692 * The simpler check is only for 'uffd'. If the
693 * kernel supports userfaultfd without noncoop
694 * then only process can be lazily restored
695 * which do not fork. With 'uffd-noncoop'
696 * it is also possible to lazily restore processes
697 * which do fork. For a container runtime like
698 * LXC checking only for 'uffd' makes not much sense. */
699 args[3] = "uffd-noncoop";
700 else
4f43526d 701 _exit(EXIT_FAILURE);
b5b12b9e
AR
702
703 null_stdfds();
704
705 execvp("criu", args);
706 SYSERROR("Failed to exec \"criu\"");
4f43526d 707 _exit(EXIT_FAILURE);
b5b12b9e
AR
708 }
709
710 ret = wait_for_pid(pid);
711
712 if (ret == -1) {
713 /* It is not known why CRIU failed. Either
714 * CRIU is not available, the feature check
715 * does not exist or the feature is not
716 * supported. */
717 INFO("feature not supported");
718 /* Clear not supported feature bit */
719 features &= ~(1ULL << current_bit);
720 }
721
722 current_bit++;
723 /* no more checks requested; exit check loop */
724 if (!(features & ~((1ULL << current_bit)-1)))
725 break;
726 }
727 if (features != *features_to_check) {
728 *features_to_check = features;
729 return false;
730 }
731 return true;
732}
733
8ba5ced7
TA
734/*
735 * Check to see if the criu version is recent enough for all the features we
736 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
737 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
738 * things potentially before a version is released with a particular feature.
739 *
740 * The intent is that when criu development slows down, we can drop this, but
741 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
742 *
743 * Note: If version != NULL criu_version() stores the detected criu version in
744 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 745 */
5407e2ab 746static bool criu_version_ok(char **version)
8ba5ced7
TA
747{
748 int pipes[2];
749 pid_t pid;
750
751 if (pipe(pipes) < 0) {
752 SYSERROR("pipe() failed");
753 return false;
754 }
755
756 pid = fork();
757 if (pid < 0) {
758 SYSERROR("fork() failed");
759 return false;
760 }
761
762 if (pid == 0) {
763 char *args[] = { "criu", "--version", NULL };
755fa453 764 char *path;
8ba5ced7
TA
765 close(pipes[0]);
766
767 close(STDERR_FILENO);
768 if (dup2(pipes[1], STDOUT_FILENO) < 0)
665bb114 769 _exit(EXIT_FAILURE);
8ba5ced7 770
755fa453 771 path = on_path("criu", NULL);
d9b32b09 772 if (!path)
665bb114 773 _exit(EXIT_FAILURE);
d9b32b09 774
755fa453 775 execv(path, args);
665bb114 776 _exit(EXIT_FAILURE);
8ba5ced7
TA
777 } else {
778 FILE *f;
5407e2ab 779 char *tmp;
8ba5ced7
TA
780 int patch;
781
782 close(pipes[1]);
783 if (wait_for_pid(pid) < 0) {
784 close(pipes[0]);
4eae4051 785 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
786 return false;
787 }
788
4110345b 789 f = fdopen(pipes[0], "re");
8ba5ced7
TA
790 if (!f) {
791 close(pipes[0]);
792 return false;
793 }
794
5407e2ab
CB
795 tmp = malloc(1024);
796 if (!tmp) {
797 fclose(f);
798 return false;
799 }
800
801 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
802 goto version_error;
803
804 if (fgetc(f) != '\n')
805 goto version_error;
806
5407e2ab 807 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
808 goto version_match;
809
5407e2ab 810 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
811 goto version_error;
812
813 if (fgetc(f) != '-')
814 goto version_error;
815
816 if (fscanf(f, "%d", &patch) != 1)
817 goto version_error;
818
5407e2ab 819 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
820 goto version_error;
821
822 if (patch < CRIU_GITID_PATCHLEVEL)
823 goto version_error;
824
825version_match:
3158ab5b 826 fclose(f);
5407e2ab
CB
827 if (!version)
828 free(tmp);
829 else
830 *version = tmp;
8ba5ced7
TA
831 return true;
832
833version_error:
3158ab5b 834 fclose(f);
5407e2ab 835 free(tmp);
9f1f54b0 836 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
8ba5ced7
TA
837 return false;
838 }
839}
840
e29fe1dd
TA
841/* Check and make sure the container has a configuration that we know CRIU can
842 * dump. */
f1954503 843static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
844{
845 struct lxc_list *it;
e29fe1dd
TA
846
847 if (geteuid()) {
9f1f54b0 848 ERROR("Must be root to checkpoint");
e29fe1dd
TA
849 return false;
850 }
851
7177e6b1
DJ
852 if (!criu_version_ok(criu_version))
853 return false;
854
e29fe1dd
TA
855 /* We only know how to restore containers with veth networks. */
856 lxc_list_for_each(it, &c->lxc_conf->network) {
857 struct lxc_netdev *n = it->elem;
65b20221
TA
858 switch(n->type) {
859 case LXC_NET_VETH:
860 case LXC_NET_NONE:
861 case LXC_NET_EMPTY:
e2697330 862 case LXC_NET_MACVLAN:
65b20221
TA
863 break;
864 default:
9f1f54b0 865 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
7177e6b1
DJ
866 if (criu_version) {
867 free(*criu_version);
868 *criu_version = NULL;
869 }
e29fe1dd
TA
870 return false;
871 }
872 }
873
e29fe1dd
TA
874 return true;
875}
876
e29fe1dd
TA
877static bool restore_net_info(struct lxc_container *c)
878{
7eab8fc6 879 int ret;
e29fe1dd
TA
880 struct lxc_list *it;
881 bool has_error = true;
882
883 if (container_mem_lock(c))
884 return false;
885
886 lxc_list_for_each(it, &c->lxc_conf->network) {
887 struct lxc_netdev *netdev = it->elem;
888 char template[IFNAMSIZ];
65b20221
TA
889
890 if (netdev->type != LXC_NET_VETH)
891 continue;
892
7eab8fc6
CB
893 ret = snprintf(template, sizeof(template), "vethXXXXXX");
894 if (ret < 0 || ret >= sizeof(template))
895 goto out_unlock;
e29fe1dd 896
de4855a8
CB
897 if (netdev->priv.veth_attr.pair[0] == '\0' &&
898 netdev->priv.veth_attr.veth1[0] == '\0') {
3646ffd9 899 if (!lxc_ifname_alnum_case_sensitive(template))
de4855a8
CB
900 goto out_unlock;
901
cbb9c7c7 902 (void)strlcpy(netdev->priv.veth_attr.veth1, template, IFNAMSIZ);
de4855a8 903 }
e29fe1dd
TA
904 }
905
906 has_error = false;
907
908out_unlock:
909 container_mem_unlock(c);
910 return !has_error;
911}
912
1a0e70ac 913/* do_restore never returns, the calling process is used as the monitor process.
5a24adb8 914 * do_restore calls _exit() if it fails.
1a0e70ac 915 */
c33b0338 916static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd 917{
5af9369b 918 int fd, ret;
e29fe1dd 919 pid_t pid;
e29fe1dd 920 struct lxc_handler *handler;
113ebd57 921 int status = 0;
9b1e2e6e 922 int pipes[2] = {-1, -1};
2202afc9 923 struct cgroup_ops *cgroup_ops;
e29fe1dd 924
a7fb6043 925 /* Try to detach from the current controlling tty if it exists.
69e3b3be 926 * Otherwise, lxc_init (via lxc_console) will attach the container's
a7fb6043
TA
927 * console output to the current tty, which is probably not what any
928 * library user wants, and if they do, they can just manually configure
929 * it :)
930 */
931 fd = open("/dev/tty", O_RDWR);
932 if (fd >= 0) {
933 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
934 SYSERROR("couldn't detach from tty");
935 close(fd);
936 }
937
a42abcce 938 handler = lxc_init_handler(NULL, c->name, c->lxc_conf, c->config_path, false);
e29fe1dd
TA
939 if (!handler)
940 goto out;
941
aa460476
CB
942 if (lxc_init(c->name, handler) < 0)
943 goto out;
944
5a087e05 945 cgroup_ops = cgroup_init(c->lxc_conf);
2202afc9 946 if (!cgroup_ops)
e29fe1dd 947 goto out_fini_handler;
2202afc9 948 handler->cgroup_ops = cgroup_ops;
e29fe1dd 949
e8b181f5 950 if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
e29fe1dd
TA
951 ERROR("failed creating groups");
952 goto out_fini_handler;
953 }
954
955 if (!restore_net_info(c)) {
956 ERROR("failed restoring network info");
957 goto out_fini_handler;
958 }
959
5af9369b
CB
960 ret = resolve_clone_flags(handler);
961 if (ret < 0) {
6d1400b5 962 SYSERROR("Unsupported clone flag specified");
5af9369b
CB
963 goto out_fini_handler;
964 }
e29fe1dd 965
de31cb57 966 if (pipe2(pipes, O_CLOEXEC) < 0) {
3d9a5c85
TA
967 SYSERROR("pipe() failed");
968 goto out_fini_handler;
969 }
970
e29fe1dd
TA
971 pid = fork();
972 if (pid < 0)
973 goto out_fini_handler;
974
975 if (pid == 0) {
976 struct criu_opts os;
977 struct lxc_rootfs *rootfs;
4b54788e 978 int flags;
e29fe1dd 979
3d9a5c85
TA
980 close(status_pipe);
981 status_pipe = -1;
982
983 close(pipes[0]);
984 pipes[0] = -1;
e29fe1dd
TA
985
986 if (unshare(CLONE_NEWNS))
987 goto out_fini_handler;
988
989 /* CRIU needs the lxc root bind mounted so that it is the root of some
990 * mount. */
991 rootfs = &c->lxc_conf->rootfs;
992
993 if (rootfs_is_blockdev(c->lxc_conf)) {
8ce1abc2
CB
994 if (lxc_setup_rootfs_prepare_root(c->lxc_conf, c->name,
995 c->config_path) < 0)
e29fe1dd
TA
996 goto out_fini_handler;
997 } else {
998 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
999 goto out_fini_handler;
1000
1001 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1002 SYSERROR("remount / to private failed");
1003 goto out_fini_handler;
1004 }
1005
1006 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
f075e955 1007 (void)rmdir(rootfs->mount);
e29fe1dd
TA
1008 goto out_fini_handler;
1009 }
1010 }
1011
5af85cb1 1012 os.pipefd = pipes[1];
e29fe1dd 1013 os.action = "restore";
b2c3710f 1014 os.user = opts;
e29fe1dd 1015 os.c = c;
41808e20 1016 os.console_fd = c->lxc_conf->console.pty;
f1954503 1017 os.criu_version = criu_version;
0ab5703f 1018 os.handler = handler;
4b54788e 1019
97e4f1a9
TA
1020 if (os.console_fd >= 0) {
1021 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1022 * via --inherit-fd, so we don't want it to close.
1023 */
1024 flags = fcntl(os.console_fd, F_GETFD);
1025 if (flags < 0) {
1026 SYSERROR("F_GETFD failed: %d", os.console_fd);
1027 goto out_fini_handler;
1028 }
4b54788e 1029
97e4f1a9 1030 flags &= ~FD_CLOEXEC;
4b54788e 1031
97e4f1a9
TA
1032 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
1033 SYSERROR("F_SETFD failed");
1034 goto out_fini_handler;
1035 }
4b54788e
TA
1036 }
1037 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
1038
1039 /* exec_criu() returning is an error */
e20f46f8 1040 exec_criu(cgroup_ops, c->lxc_conf, &os);
e29fe1dd 1041 umount(rootfs->mount);
f075e955 1042 (void)rmdir(rootfs->mount);
e29fe1dd
TA
1043 goto out_fini_handler;
1044 } else {
e29fe1dd
TA
1045 char title[2048];
1046
3d9a5c85
TA
1047 close(pipes[1]);
1048 pipes[1] = -1;
1049
e29fe1dd
TA
1050 pid_t w = waitpid(pid, &status, 0);
1051 if (w == -1) {
1052 SYSERROR("waitpid");
1053 goto out_fini_handler;
1054 }
1055
e29fe1dd 1056 if (WIFEXITED(status)) {
75d219f0
TA
1057 char buf[4096];
1058
e29fe1dd 1059 if (WEXITSTATUS(status)) {
3d9a5c85
TA
1060 int n;
1061
668ba602 1062 n = lxc_read_nointr(pipes[0], buf, sizeof(buf));
3d9a5c85
TA
1063 if (n < 0) {
1064 SYSERROR("failed reading from criu stderr");
1065 goto out_fini_handler;
1066 }
1067
2735dfae
TA
1068 if (n == sizeof(buf))
1069 n--;
3d9a5c85
TA
1070 buf[n] = 0;
1071
9f1f54b0 1072 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
e29fe1dd
TA
1073 goto out_fini_handler;
1074 } else {
3eba9b49 1075 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
75d219f0
TA
1076 if (ret < 0 || ret >= sizeof(buf)) {
1077 ERROR("snprintf'd too many characters: %d", ret);
1078 goto out_fini_handler;
1079 }
1080
4110345b 1081 FILE *f = fopen(buf, "re");
e29fe1dd 1082 if (!f) {
9f1f54b0 1083 SYSERROR("couldn't read restore's children file %s", buf);
e29fe1dd
TA
1084 goto out_fini_handler;
1085 }
1086
1087 ret = fscanf(f, "%d", (int*) &handler->pid);
1088 fclose(f);
1089 if (ret != 1) {
1090 ERROR("reading restore pid failed");
1091 goto out_fini_handler;
1092 }
1093
f8a41688
TA
1094 if (lxc_set_state(c->name, handler, RUNNING)) {
1095 ERROR("error setting running state after restore");
e29fe1dd 1096 goto out_fini_handler;
f8a41688 1097 }
e29fe1dd
TA
1098 }
1099 } else {
9f1f54b0 1100 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
e29fe1dd
TA
1101 goto out_fini_handler;
1102 }
1103
3d9a5c85
TA
1104 close(pipes[0]);
1105
614be9bc 1106 ret = lxc_write_nointr(status_pipe, &status, sizeof(status));
f3886023
TA
1107 close(status_pipe);
1108 status_pipe = -1;
1109
1110 if (sizeof(status) != ret) {
1111 SYSERROR("failed to write all of status");
1112 goto out_fini_handler;
1113 }
1114
e29fe1dd
TA
1115 /*
1116 * See comment in lxcapi_start; we don't care if these
1117 * fail because it's just a beauty thing. We just
1118 * assign the return here to silence potential.
1119 */
1120 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
223e30c1
CB
1121 if (ret < 0 || (size_t)ret >= sizeof(title))
1122 INFO("Setting truncated process name");
1123
e29fe1dd 1124 ret = setproctitle(title);
223e30c1
CB
1125 if (ret < 0)
1126 INFO("Failed to set process name");
e29fe1dd
TA
1127
1128 ret = lxc_poll(c->name, handler);
1129 if (ret)
0c5859ff 1130 lxc_abort(handler);
fd5be714 1131 lxc_end(handler);
5a24adb8 1132 _exit(ret);
e29fe1dd
TA
1133 }
1134
1135out_fini_handler:
3d9a5c85
TA
1136 if (pipes[0] >= 0)
1137 close(pipes[0]);
1138 if (pipes[1] >= 0)
1139 close(pipes[1]);
1140
fd5be714 1141 lxc_end(handler);
e29fe1dd
TA
1142
1143out:
3d9a5c85 1144 if (status_pipe >= 0) {
f3886023
TA
1145 /* ensure getting here was a failure, e.g. if we failed to
1146 * parse the child pid or something, even after a successful
1147 * restore
1148 */
1149 if (!status)
1150 status = 1;
113ebd57 1151
614be9bc 1152 if (lxc_write_nointr(status_pipe, &status, sizeof(status)) != sizeof(status))
e29fe1dd 1153 SYSERROR("writing status failed");
3d9a5c85 1154 close(status_pipe);
e29fe1dd
TA
1155 }
1156
5a24adb8 1157 _exit(EXIT_FAILURE);
e29fe1dd 1158}
aef3d51e 1159
4b54788e
TA
1160static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1161{
1162 FILE *f;
1163 char path[PATH_MAX];
1164 int ret;
1165 struct stat sb;
1166
1167 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1168 tty_id[0] = 0;
1169 return 0;
1170 }
1171
1172 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1173 if (ret < 0 || ret >= sizeof(path)) {
f510330c 1174 ERROR("snprintf'd too many characters: %d", ret);
4b54788e
TA
1175 return -1;
1176 }
1177
1178 ret = stat(path, &sb);
1179 if (ret < 0) {
1180 SYSERROR("stat of %s failed", path);
1181 return -1;
1182 }
1183
1184 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1185 if (ret < 0 || ret >= sizeof(path)) {
1186 ERROR("snprintf'd too many characters: %d", ret);
1187 return -1;
1188 }
1189
f03280a7
TA
1190 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1191 (long long unsigned) sb.st_rdev,
1192 (long long unsigned) sb.st_dev);
4b54788e
TA
1193 if (ret < 0 || ret >= sizeof(path)) {
1194 ERROR("snprintf'd too many characters: %d", ret);
1195 return -1;
1196 }
1197
4110345b 1198 f = fopen(path, "we");
4b54788e
TA
1199 if (!f) {
1200 SYSERROR("failed to open %s", path);
1201 return -1;
1202 }
1203
1204 ret = fprintf(f, "%s", tty_id);
1205 fclose(f);
1206 if (ret < 0)
1207 SYSERROR("failed to write to %s", path);
1208 return ret;
1209}
1210
aef3d51e 1211/* do one of either predump or a regular dump */
b2c3710f 1212static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e 1213{
0e4adc1a 1214 int ret;
aef3d51e 1215 pid_t pid;
5af85cb1 1216 int criuout[2];
0e4adc1a 1217 char *criu_version = NULL;
aef3d51e 1218
f1954503 1219 if (!criu_ok(c, &criu_version))
aef3d51e
TA
1220 return false;
1221
0e4adc1a
CB
1222 ret = pipe(criuout);
1223 if (ret < 0) {
5af85cb1 1224 SYSERROR("pipe() failed");
7177e6b1 1225 free(criu_version);
aef3d51e 1226 return false;
5af85cb1
TA
1227 }
1228
1229 if (mkdir_p(opts->directory, 0700) < 0)
1230 goto fail;
aef3d51e
TA
1231
1232 pid = fork();
1233 if (pid < 0) {
1234 SYSERROR("fork failed");
5af85cb1 1235 goto fail;
aef3d51e
TA
1236 }
1237
1238 if (pid == 0) {
1239 struct criu_opts os;
2202afc9 1240 struct cgroup_ops *cgroup_ops;
0ab5703f 1241
5af85cb1
TA
1242 close(criuout[0]);
1243
5a087e05 1244 cgroup_ops = cgroup_init(c->lxc_conf);
2202afc9 1245 if (!cgroup_ops) {
0ab5703f 1246 ERROR("failed to cgroup_init()");
7211378b 1247 _exit(EXIT_FAILURE);
0ab5703f 1248 }
aef3d51e 1249
5af85cb1 1250 os.pipefd = criuout[1];
aef3d51e 1251 os.action = mode;
b2c3710f 1252 os.user = opts;
aef3d51e 1253 os.c = c;
4b54788e 1254 os.console_name = c->lxc_conf->console.path;
f1954503 1255 os.criu_version = criu_version;
e20f46f8 1256 os.handler = NULL;
74eb576c 1257
0e4adc1a
CB
1258 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1259 if (ret < 0) {
1260 free(criu_version);
7211378b 1261 _exit(EXIT_FAILURE);
0e4adc1a 1262 }
aef3d51e
TA
1263
1264 /* exec_criu() returning is an error */
e20f46f8 1265 exec_criu(cgroup_ops, c->lxc_conf, &os);
0e4adc1a 1266 free(criu_version);
7211378b 1267 _exit(EXIT_FAILURE);
aef3d51e
TA
1268 } else {
1269 int status;
5af85cb1
TA
1270 ssize_t n;
1271 char buf[4096];
5af85cb1
TA
1272
1273 close(criuout[1]);
1274
aef3d51e
TA
1275 pid_t w = waitpid(pid, &status, 0);
1276 if (w == -1) {
1277 SYSERROR("waitpid");
5af85cb1 1278 close(criuout[0]);
7177e6b1 1279 free(criu_version);
aef3d51e
TA
1280 return false;
1281 }
1282
668ba602 1283 n = lxc_read_nointr(criuout[0], buf, sizeof(buf));
5af85cb1
TA
1284 close(criuout[0]);
1285 if (n < 0) {
1286 SYSERROR("read");
1287 n = 0;
1288 }
40229e95 1289
1290 if (n == sizeof(buf))
1291 buf[n-1] = 0;
1292 else
1293 buf[n] = 0;
5af85cb1 1294
aef3d51e
TA
1295 if (WIFEXITED(status)) {
1296 if (WEXITSTATUS(status)) {
9f1f54b0 1297 ERROR("dump failed with %d", WEXITSTATUS(status));
5af85cb1
TA
1298 ret = false;
1299 } else {
1300 ret = true;
aef3d51e 1301 }
aef3d51e 1302 } else if (WIFSIGNALED(status)) {
9f1f54b0 1303 ERROR("dump signaled with %d", WTERMSIG(status));
5af85cb1 1304 ret = false;
aef3d51e 1305 } else {
9f1f54b0 1306 ERROR("unknown dump exit %d", status);
5af85cb1 1307 ret = false;
aef3d51e 1308 }
5af85cb1
TA
1309
1310 if (!ret)
1311 ERROR("criu output: %s", buf);
7177e6b1
DJ
1312
1313 free(criu_version);
5af85cb1 1314 return ret;
aef3d51e 1315 }
5af85cb1
TA
1316fail:
1317 close(criuout[0]);
1318 close(criuout[1]);
f075e955 1319 (void)rmdir(opts->directory);
0e4adc1a 1320 free(criu_version);
5af85cb1 1321 return false;
aef3d51e
TA
1322}
1323
/*
 * Run do_dump() for container @c with the "pre-dump" action and the
 * user-provided @opts, returning its result unchanged.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	char *mode = "pre-dump";

	return do_dump(c, mode, opts);
}
1328
b2c3710f 1329bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
1330{
1331 char path[PATH_MAX];
1332 int ret;
1333
b2c3710f 1334 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
1335 if (ret < 0 || ret >= sizeof(path))
1336 return false;
1337
1338 if (access(path, F_OK) == 0) {
9f1f54b0 1339 ERROR("please use a fresh directory for the dump directory");
aef3d51e
TA
1340 return false;
1341 }
1342
b2c3710f 1343 return do_dump(c, "dump", opts);
aef3d51e
TA
1344}
1345
/*
 * Restore container @c using the user-provided @opts.
 *
 * Requires an effective uid of 0. A child is forked which runs do_restore()
 * and reports an int status back through a pipe. The parent reads that status
 * and returns true only if it describes a normal exit with code 0. On any
 * other outcome the child is reaped with wait_for_pid() and false is
 * returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status, nread;
	int pipefd[2];
	char *criu_version = NULL;

	if (geteuid()) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(pipefd)) {
		SYSERROR("failed to create pipe");
		return false;
	}

	if (!criu_ok(c, &criu_version)) {
		close(pipefd[0]);
		close(pipefd[1]);
		return false;
	}

	pid = fork();
	if (pid < 0) {
		/* Log first: the close() calls below may overwrite errno. */
		SYSERROR("fork failed");
		close(pipefd[0]);
		close(pipefd[1]);
		free(criu_version);
		return false;
	}

	if (pid == 0) {
		close(pipefd[0]);
		/* this never returns */
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);
	free(criu_version);

	nread = lxc_read_nointr(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (nread < 0 || (size_t)nread != sizeof(status)) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	/* If the criu process was killed or exited nonzero, wait() for the
	 * handler, since the restore process died. Otherwise, we don't need to
	 * wait, since the child becomes the monitor process.
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;
	return true;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");
	return false;
}