]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
Merge pull request #2516 from 2xsec/bugfix
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
9b945f13 24#include <inttypes.h>
e29fe1dd
TA
25#include <linux/limits.h>
26#include <sched.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/mount.h>
31#include <sys/types.h>
32#include <sys/wait.h>
33#include <unistd.h>
34
35#include "config.h"
36
e29fe1dd
TA
37#include "cgroup.h"
38#include "conf.h"
dc259399 39#include "commands.h"
e29fe1dd
TA
40#include "criu.h"
41#include "log.h"
42#include "lxc.h"
43#include "lxclock.h"
44#include "network.h"
28d832c4 45#include "storage.h"
e29fe1dd
TA
46#include "utils.h"
47
5f4e44a2
TA
48#if IS_BIONIC
49#include <../include/lxcmntent.h>
50#else
51#include <mntent.h>
52#endif
53
9de31d5a
CB
54#ifndef HAVE_STRLCPY
55#include "include/strlcpy.h"
56#endif
57
c33b0338 58#define CRIU_VERSION "2.0"
73d46752
TA
59
60#define CRIU_GITID_VERSION "2.0"
61#define CRIU_GITID_PATCHLEVEL 0
62
f1954503 63#define CRIU_IN_FLIGHT_SUPPORT "2.4"
46c8ffd5 64#define CRIU_EXTERNAL_NOT_VETH "2.8"
f1954503 65
ac2cecc4 66lxc_log_define(criu, lxc);
e29fe1dd 67
/* Everything exec_criu() needs to build and run one criu command line. */
struct criu_opts {
	/* File descriptor that exec_criu() dup2()s onto stdout and stderr so
	 * that early criu output is captured.
	 */
	int pipefd;

	/* The criu action to run. exec_criu() accepts "dump", "pre-dump" and
	 * "restore".
	 */
	char *action;

	/* The caller-supplied migrate options that apply to this action. */
	struct migrate_opts *user;

	/* The container being checkpointed or restored. */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e.
	 * "tty[${rdev}:${dev}]". Passed to criu via --external when non-empty.
	 */
	char tty_id[32];

	/* restore: the handler created by the restoring task. */
	struct lxc_handler *handler;

	/* restore: console fd handed to criu via --inherit-fd. */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected criu version string. */
	char *criu_version;
};
97
4b54788e
TA
/*
 * Read the first line of "<directory>/tty.info" into output.
 *
 * @directory: directory holding the checkpoint images
 * @output:    buffer receiving the line; left untouched if the file is missing
 * @len:       size of output in bytes
 *
 * Returns 0 on success or when tty.info does not exist, -1 on any other
 * failure.
 */
static int load_tty_major_minor(char *directory, char *output, int len)
{
	FILE *f;
	char path[PATH_MAX];
	int ret;

	ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("snprintf'd too many characters: %d", ret);
		return -1;
	}

	f = fopen(path, "r");
	if (!f) {
		/* This means we're coming from a liblxc which didn't export
		 * the tty info. In this case they had to have lxc.console.path
		 * = none, so there's no problem restoring.
		 */
		if (errno == ENOENT)
			return 0;

		SYSERROR("couldn't open %s", path);
		return -1;
	}

	if (!fgets(output, len, f)) {
		/* Log before fclose() so that errno still belongs to the read. */
		SYSERROR("couldn't read %s", path);
		fclose(f);
		return -1;
	}

	fclose(f);
	return 0;
}
132
74ad3607
FB
/*
 * Compare two dotted version strings of the form "major[.minor[.patch]]".
 *
 * Components that are absent from a string count as -1, so "2.8" sorts
 * before "2.8.0".
 *
 * Returns 1 if v1 is newer than v2, 0 if they are equal and -1 if v1 is
 * older. -1 is also returned when either string does not start with a
 * number.
 */
static int cmp_version(const char *v1, const char *v2)
{
	int lhs[3] = { -1, -1, -1 };
	int rhs[3] = { -1, -1, -1 };
	size_t idx;

	if (sscanf(v1, "%d.%d.%d", &lhs[0], &lhs[1], &lhs[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &rhs[0], &rhs[1], &rhs[2]) < 1)
		return -1;

	/* Walk major, minor, patch; the first differing component decides. */
	for (idx = 0; idx < sizeof(lhs) / sizeof(lhs[0]); idx++) {
		if (lhs[idx] > rhs[idx])
			return 1;

		if (lhs[idx] < rhs[idx])
			return -1;
	}

	return 0;
}
173
2202afc9 174static void exec_criu(struct cgroup_ops *cgroup_ops, struct criu_opts *opts)
e29fe1dd
TA
175{
176 char **argv, log[PATH_MAX];
19d1509c 177 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
178 int netnr = 0;
179 struct lxc_list *it;
5f4e44a2
TA
180 FILE *mnts;
181 struct mntent mntent;
e29fe1dd 182
0e4be3cf 183 char buf[4096], ttys[32];
a17fa3c0 184 size_t pos;
5af85cb1 185
e9195050
TA
186 /* If we are currently in a cgroup /foo/bar, and the container is in a
187 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
188 * container has an open fd that points to one of the cgroup files
189 * (systemd always opens its "root" cgroup). So, let's escape to the
190 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
191 * see all cgroups.
192 */
2202afc9 193 if (!cgroup_ops->escape(cgroup_ops)) {
e9195050
TA
194 ERROR("failed to escape cgroups");
195 return;
196 }
197
e29fe1dd 198 /* The command line always looks like:
19d1509c 199 * criu $(action) --tcp-established --file-locks --link-remap \
5f178bc9 200 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
e29fe1dd
TA
201 * -o $(directory)/$(action).log --ext-mount-map auto
202 * --enable-external-sharing --enable-external-masters
4b54788e 203 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
204 * +1 for final NULL */
205
aef3d51e 206 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
207 /* -t pid --freeze-cgroup /lxc/ct */
208 static_args += 4;
e29fe1dd 209
aef3d51e 210 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 211 if (opts->user->predump_dir)
aef3d51e
TA
212 static_args += 2;
213
74eb576c 214 /* --page-server --address <address> --port <port> */
b2c3710f 215 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
216 static_args += 5;
217
aef3d51e 218 /* --leave-running (only for final dump) */
b2c3710f 219 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 220 static_args++;
4b54788e
TA
221
222 /* --external tty[88,4] */
223 if (opts->tty_id[0])
224 static_args += 2;
19d1509c
TA
225
226 /* --force-irmap */
227 if (!opts->user->preserves_inodes)
228 static_args++;
b2b7b0d2
TA
229
230 /* --ghost-limit 1024 */
231 if (opts->user->ghost_limit)
232 static_args += 2;
e29fe1dd
TA
233 } else if (strcmp(opts->action, "restore") == 0) {
234 /* --root $(lxc_mount_point) --restore-detached
0ab5703f 235 * --restore-sibling
13389b29
TA
236 * --lsm-profile apparmor:whatever
237 */
0ab5703f 238 static_args += 6;
4b54788e 239
0e4be3cf
CB
240 ttys[0] = 0;
241 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
4b54788e
TA
242 return;
243
244 /* --inherit-fd fd[%d]:tty[%s] */
0e4be3cf 245 if (ttys[0])
4b54788e 246 static_args += 2;
e29fe1dd
TA
247 } else {
248 return;
249 }
250
2202afc9
CB
251 if (cgroup_ops->num_hierarchies(cgroup_ops) > 0)
252 static_args += 2 * cgroup_ops->num_hierarchies(cgroup_ops);
0ab5703f 253
b2c3710f 254 if (opts->user->verbose)
e29fe1dd
TA
255 static_args++;
256
b9ee6643
TA
257 if (opts->user->action_script)
258 static_args += 2;
259
5f4e44a2
TA
260 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
261
b2c3710f 262 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd 263 if (ret < 0 || ret >= PATH_MAX) {
9f1f54b0 264 ERROR("logfile name too long");
e29fe1dd
TA
265 return;
266 }
267
268 argv = malloc(static_args * sizeof(*argv));
269 if (!argv)
270 return;
271
272 memset(argv, 0, static_args * sizeof(*argv));
273
274#define DECLARE_ARG(arg) \
275 do { \
276 if (arg == NULL) { \
277 ERROR("Got NULL argument for criu"); \
278 goto err; \
279 } \
280 argv[argc++] = strdup(arg); \
281 if (!argv[argc-1]) \
282 goto err; \
283 } while (0)
284
285 argv[argc++] = on_path("criu", NULL);
286 if (!argv[argc-1]) {
9f1f54b0 287 ERROR("Couldn't find criu binary");
e29fe1dd
TA
288 goto err;
289 }
290
291 DECLARE_ARG(opts->action);
292 DECLARE_ARG("--tcp-established");
293 DECLARE_ARG("--file-locks");
294 DECLARE_ARG("--link-remap");
0a5fc6df 295 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
296 DECLARE_ARG("--ext-mount-map");
297 DECLARE_ARG("auto");
298 DECLARE_ARG("--enable-external-sharing");
299 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
300 DECLARE_ARG("--enable-fs");
301 DECLARE_ARG("hugetlbfs");
5b454329
TA
302 DECLARE_ARG("--enable-fs");
303 DECLARE_ARG("tracefs");
e29fe1dd 304 DECLARE_ARG("-D");
b2c3710f 305 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
306 DECLARE_ARG("-o");
307 DECLARE_ARG(log);
308
2202afc9 309 for (i = 0; i < cgroup_ops->num_hierarchies(cgroup_ops); i++) {
0ab5703f 310 char **controllers = NULL, *fullname;
31b204e4 311 char *path, *tmp;
0ab5703f 312
2202afc9 313 if (!cgroup_ops->get_hierarchies(cgroup_ops, i, &controllers)) {
0ab5703f
TA
314 ERROR("failed to get hierarchy %d", i);
315 goto err;
316 }
317
318 /* if we are in a dump, we have to ask the monitor process what
319 * the right cgroup is. if this is a restore, we can just use
320 * the handler the restore task created.
321 */
322 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
323 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
324 if (!path) {
325 ERROR("failed to get cgroup path for %s", controllers[0]);
326 goto err;
327 }
328 } else {
329 const char *p;
330
2202afc9 331 p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
0ab5703f
TA
332 if (!p) {
333 ERROR("failed to get cgroup path for %s", controllers[0]);
334 goto err;
335 }
336
337 path = strdup(p);
338 if (!path) {
339 ERROR("strdup failed");
340 goto err;
341 }
342 }
343
31b204e4
CB
344 tmp = lxc_deslashify(path);
345 if (!tmp) {
346 ERROR("Failed to remove extraneous slashes from \"%s\"",
347 path);
0ab5703f
TA
348 free(path);
349 goto err;
350 }
31b204e4
CB
351 free(path);
352 path = tmp;
0ab5703f
TA
353
354 fullname = lxc_string_join(",", (const char **) controllers, false);
355 if (!fullname) {
356 ERROR("failed to join controllers");
357 free(path);
358 goto err;
359 }
360
361 ret = sprintf(buf, "%s:%s", fullname, path);
362 free(path);
363 free(fullname);
364 if (ret < 0 || ret >= sizeof(buf)) {
365 ERROR("sprintf of cgroup root arg failed");
366 goto err;
367 }
368
369 DECLARE_ARG("--cgroup-root");
370 DECLARE_ARG(buf);
371 }
372
b2c3710f 373 if (opts->user->verbose)
e29fe1dd
TA
374 DECLARE_ARG("-vvvvvv");
375
b9ee6643
TA
376 if (opts->user->action_script) {
377 DECLARE_ARG("--action-script");
378 DECLARE_ARG(opts->user->action_script);
379 }
380
1800f924
WB
381 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list,
382 opts->c->lxc_conf->lsm_aa_allow_nesting);
5f4e44a2
TA
383 if (!mnts)
384 goto err;
385
386 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
19d2422b 387 char *fmt, *key, *val, *mntdata;
5f4e44a2 388 char arg[2 * PATH_MAX + 2];
19d2422b
TA
389 unsigned long flags;
390
391 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
392 goto err;
393
394 free(mntdata);
395
396 /* only add --ext-mount-map for actual bind mounts */
397 if (!(flags & MS_BIND))
398 continue;
5f4e44a2
TA
399
400 if (strcmp(opts->action, "dump") == 0) {
401 fmt = "/%s:%s";
402 key = mntent.mnt_dir;
403 val = mntent.mnt_dir;
404 } else {
405 fmt = "%s:%s";
406 key = mntent.mnt_dir;
407 val = mntent.mnt_fsname;
408 }
409
410 ret = snprintf(arg, sizeof(arg), fmt, key, val);
411 if (ret < 0 || ret >= sizeof(arg)) {
412 fclose(mnts);
413 ERROR("snprintf failed");
414 goto err;
415 }
416
417 DECLARE_ARG("--ext-mount-map");
418 DECLARE_ARG(arg);
419 }
420 fclose(mnts);
421
aef3d51e 422 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 423 char pid[32], *freezer_relative;
e29fe1dd
TA
424
425 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
426 goto err;
427
428 DECLARE_ARG("-t");
429 DECLARE_ARG(pid);
dc259399
TA
430
431 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
432 opts->c->config_path,
433 "freezer");
434 if (!freezer_relative) {
435 ERROR("failed getting freezer path");
436 goto err;
437 }
438
439 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
440 if (ret < 0 || ret >= sizeof(log))
441 goto err;
442
f1954503
AR
443 if (!opts->user->disable_skip_in_flight &&
444 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
445 DECLARE_ARG("--skip-in-flight");
446
dc259399
TA
447 DECLARE_ARG("--freeze-cgroup");
448 DECLARE_ARG(log);
449
4b54788e 450 if (opts->tty_id[0]) {
36d2096c
TA
451 DECLARE_ARG("--ext-mount-map");
452 DECLARE_ARG("/dev/console:console");
453
4b54788e
TA
454 DECLARE_ARG("--external");
455 DECLARE_ARG(opts->tty_id);
456 }
457
b2c3710f 458 if (opts->user->predump_dir) {
aef3d51e 459 DECLARE_ARG("--prev-images-dir");
b2c3710f 460 DECLARE_ARG(opts->user->predump_dir);
9f99a33f 461 DECLARE_ARG("--track-mem");
74eb576c 462 }
4c0c0319 463
b2c3710f 464 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
465 DECLARE_ARG("--page-server");
466 DECLARE_ARG("--address");
b2c3710f 467 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 468 DECLARE_ARG("--port");
b2c3710f 469 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 470 }
aef3d51e 471
19d1509c
TA
472 if (!opts->user->preserves_inodes)
473 DECLARE_ARG("--force-irmap");
474
b2b7b0d2
TA
475 if (opts->user->ghost_limit) {
476 char ghost_limit[32];
477
9b945f13 478 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
b2b7b0d2 479 if (ret < 0 || ret >= sizeof(ghost_limit)) {
9b945f13 480 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
b2b7b0d2
TA
481 goto err;
482 }
483
484 DECLARE_ARG("--ghost-limit");
485 DECLARE_ARG(ghost_limit);
486 }
487
aef3d51e 488 /* only for final dump */
b2c3710f 489 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
490 DECLARE_ARG("--leave-running");
491 } else if (strcmp(opts->action, "restore") == 0) {
492 void *m;
493 int additional;
13389b29 494 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
495
496 DECLARE_ARG("--root");
497 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
498 DECLARE_ARG("--restore-detached");
499 DECLARE_ARG("--restore-sibling");
e29fe1dd 500
0e4be3cf 501 if (ttys[0]) {
97e4f1a9 502 if (opts->console_fd < 0) {
3aed4934 503 ERROR("lxc.console.path configured on source host but not target");
97e4f1a9
TA
504 goto err;
505 }
506
0e4be3cf 507 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
4b54788e
TA
508 if (ret < 0 || ret >= sizeof(buf))
509 goto err;
510
511 DECLARE_ARG("--inherit-fd");
512 DECLARE_ARG(buf);
513 }
514 if (opts->console_name) {
515 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
516 SYSERROR("sprintf'd too many bytes");
517 }
518 DECLARE_ARG("--ext-mount-map");
519 DECLARE_ARG(buf);
520 }
521
13389b29
TA
522 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
523
524 if (lxc_conf->lsm_aa_profile)
525 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
526 else
527 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
528
529 if (ret < 0 || ret >= sizeof(buf))
530 goto err;
531
532 DECLARE_ARG("--lsm-profile");
533 DECLARE_ARG(buf);
534 }
535
e29fe1dd
TA
536 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
537
fa071249
TA
538 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
539 if (!m)
540 goto err;
e29fe1dd
TA
541 argv = m;
542
543 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
9de31d5a 544 size_t retlen;
e29fe1dd 545 char eth[128], *veth;
46c8ffd5 546 char *fmt;
e29fe1dd 547 struct lxc_netdev *n = it->elem;
46c8ffd5
AR
548 bool external_not_veth;
549
74ad3607 550 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
46c8ffd5
AR
551 /* Since criu version 2.8 the usage of --veth-pair
552 * has been deprecated:
553 * git tag --contains f2037e6d3445fc400
554 * v2.8 */
555 external_not_veth = true;
556 } else {
557 external_not_veth = false;
558 }
e29fe1dd 559
42277b1c 560 if (n->name[0] != '\0') {
9de31d5a
CB
561 retlen = strlcpy(eth, n->name, sizeof(eth));
562 if (retlen >= sizeof(eth))
e29fe1dd 563 goto err;
796a109d
TA
564 } else {
565 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
566 if (ret < 0 || ret >= sizeof(eth))
567 goto err;
568 }
e29fe1dd 569
e2697330
TA
570 switch (n->type) {
571 case LXC_NET_VETH:
572 veth = n->priv.veth_attr.pair;
ea7f6b29
CB
573 if (veth[0] == '\0')
574 veth = n->priv.veth_attr.veth1;
e29fe1dd 575
de4855a8 576 if (n->link[0] != '\0') {
46c8ffd5
AR
577 if (external_not_veth)
578 fmt = "veth[%s]:%s@%s";
579 else
580 fmt = "%s=%s@%s";
581
582 ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
583 } else {
584 if (external_not_veth)
585 fmt = "veth[%s]:%s";
586 else
587 fmt = "%s=%s";
588
589 ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
590 }
e2697330
TA
591 if (ret < 0 || ret >= sizeof(buf))
592 goto err;
593 break;
594 case LXC_NET_MACVLAN:
de4855a8 595 if (n->link[0] == '\0') {
9f1f54b0 596 ERROR("no host interface for macvlan %s", n->name);
e2697330
TA
597 goto err;
598 }
599
600 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
601 if (ret < 0 || ret >= sizeof(buf))
602 goto err;
603 break;
604 case LXC_NET_NONE:
605 case LXC_NET_EMPTY:
606 break;
607 default:
608 /* we have screened for this earlier... */
9f1f54b0 609 ERROR("unexpected network type %d", n->type);
e29fe1dd 610 goto err;
e2697330 611 }
e29fe1dd 612
46c8ffd5
AR
613 if (external_not_veth)
614 DECLARE_ARG("--external");
615 else
616 DECLARE_ARG("--veth-pair");
e29fe1dd 617 DECLARE_ARG(buf);
2f3fbc6b 618 netnr++;
e29fe1dd
TA
619 }
620
621 }
622
623 argv[argc] = NULL;
624
cf4b07a5 625 buf[0] = 0;
a17fa3c0 626 pos = 0;
72a30576 627
cf4b07a5 628 for (i = 0; argv[i]; i++) {
72a30576
NE
629 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
630 if (ret < 0 || ret >= sizeof(buf) - pos)
631 goto err;
632 else
633 pos += ret;
cf4b07a5
TA
634 }
635
636 INFO("execing: %s", buf);
637
5af85cb1
TA
638 /* before criu inits its log, it sometimes prints things to stdout/err;
639 * let's be sure we capture that.
640 */
641 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
642 SYSERROR("dup2 stdout failed");
643 goto err;
644 }
645
646 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
647 SYSERROR("dup2 stderr failed");
648 goto err;
649 }
650
651 close(opts->pipefd);
652
e29fe1dd
TA
653#undef DECLARE_ARG
654 execv(argv[0], argv);
655err:
e29fe1dd
TA
656 for (i = 0; argv[i]; i++)
657 free(argv[i]);
658 free(argv);
659}
660
b5b12b9e
AR
661/*
662 * Function to check if the checks activated in 'features_to_check' are
663 * available with the current architecture/kernel/criu combination.
664 *
665 * Parameter features_to_check is a bit mask of all features that should be
666 * checked (see feature check defines in lxc/lxccontainer.h).
667 *
668 * If the return value is true, all requested features are supported. If
669 * the return value is false the features_to_check parameter is updated
670 * to reflect which features are available. '0' means no feature but
671 * also that something went totally wrong.
672 *
673 * Some of the code flow of criu_version_ok() is duplicated and maybe it
674 * is a good candidate for refactoring.
675 */
676bool __criu_check_feature(uint64_t *features_to_check)
677{
678 pid_t pid;
679 uint64_t current_bit = 0;
680 int ret;
681 int features = *features_to_check;
682 /* Feature checking is currently always like
683 * criu check --feature <feature-name>
684 */
685 char *args[] = { "criu", "check", "--feature", NULL, NULL };
686
687 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
688 /* There are feature bits activated we do not understand.
689 * Refusing to answer at all */
690 *features_to_check = 0;
691 return false;
692 }
693
694 while (current_bit < sizeof(uint64_t) * 8) {
695 /* only test requested features */
696 if (!(features & (1ULL << current_bit))) {
697 /* skip this */
698 current_bit++;
699 continue;
700 }
701
702 pid = fork();
703 if (pid < 0) {
704 SYSERROR("fork() failed");
705 *features_to_check = 0;
706 return false;
707 }
708
709 if (pid == 0) {
710 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
711 /* This is needed for pre-dump support, which
712 * enables pre-copy migration. */
713 args[3] = "mem_dirty_track";
714 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
715 /* CRIU has two checks for userfaultfd support.
716 *
717 * The simpler check is only for 'uffd'. If the
718 * kernel supports userfaultfd without noncoop
719 * then only process can be lazily restored
720 * which do not fork. With 'uffd-noncoop'
721 * it is also possible to lazily restore processes
722 * which do fork. For a container runtime like
723 * LXC checking only for 'uffd' makes not much sense. */
724 args[3] = "uffd-noncoop";
725 else
4f43526d 726 _exit(EXIT_FAILURE);
b5b12b9e
AR
727
728 null_stdfds();
729
730 execvp("criu", args);
731 SYSERROR("Failed to exec \"criu\"");
4f43526d 732 _exit(EXIT_FAILURE);
b5b12b9e
AR
733 }
734
735 ret = wait_for_pid(pid);
736
737 if (ret == -1) {
738 /* It is not known why CRIU failed. Either
739 * CRIU is not available, the feature check
740 * does not exist or the feature is not
741 * supported. */
742 INFO("feature not supported");
743 /* Clear not supported feature bit */
744 features &= ~(1ULL << current_bit);
745 }
746
747 current_bit++;
748 /* no more checks requested; exit check loop */
749 if (!(features & ~((1ULL << current_bit)-1)))
750 break;
751 }
752 if (features != *features_to_check) {
753 *features_to_check = features;
754 return false;
755 }
756 return true;
757}
758
8ba5ced7
TA
759/*
760 * Check to see if the criu version is recent enough for all the features we
761 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
762 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
763 * things potentially before a version is released with a particular feature.
764 *
765 * The intent is that when criu development slows down, we can drop this, but
766 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
767 *
768 * Note: If version != NULL criu_version() stores the detected criu version in
769 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 770 */
5407e2ab 771static bool criu_version_ok(char **version)
8ba5ced7
TA
772{
773 int pipes[2];
774 pid_t pid;
775
776 if (pipe(pipes) < 0) {
777 SYSERROR("pipe() failed");
778 return false;
779 }
780
781 pid = fork();
782 if (pid < 0) {
783 SYSERROR("fork() failed");
784 return false;
785 }
786
787 if (pid == 0) {
788 char *args[] = { "criu", "--version", NULL };
755fa453 789 char *path;
8ba5ced7
TA
790 close(pipes[0]);
791
792 close(STDERR_FILENO);
793 if (dup2(pipes[1], STDOUT_FILENO) < 0)
665bb114 794 _exit(EXIT_FAILURE);
8ba5ced7 795
755fa453 796 path = on_path("criu", NULL);
d9b32b09 797 if (!path)
665bb114 798 _exit(EXIT_FAILURE);
d9b32b09 799
755fa453 800 execv(path, args);
665bb114 801 _exit(EXIT_FAILURE);
8ba5ced7
TA
802 } else {
803 FILE *f;
5407e2ab 804 char *tmp;
8ba5ced7
TA
805 int patch;
806
807 close(pipes[1]);
808 if (wait_for_pid(pid) < 0) {
809 close(pipes[0]);
4eae4051 810 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
811 return false;
812 }
813
814 f = fdopen(pipes[0], "r");
815 if (!f) {
816 close(pipes[0]);
817 return false;
818 }
819
5407e2ab
CB
820 tmp = malloc(1024);
821 if (!tmp) {
822 fclose(f);
823 return false;
824 }
825
826 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
827 goto version_error;
828
829 if (fgetc(f) != '\n')
830 goto version_error;
831
5407e2ab 832 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
833 goto version_match;
834
5407e2ab 835 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
836 goto version_error;
837
838 if (fgetc(f) != '-')
839 goto version_error;
840
841 if (fscanf(f, "%d", &patch) != 1)
842 goto version_error;
843
5407e2ab 844 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
845 goto version_error;
846
847 if (patch < CRIU_GITID_PATCHLEVEL)
848 goto version_error;
849
850version_match:
3158ab5b 851 fclose(f);
5407e2ab
CB
852 if (!version)
853 free(tmp);
854 else
855 *version = tmp;
8ba5ced7
TA
856 return true;
857
858version_error:
3158ab5b 859 fclose(f);
5407e2ab 860 free(tmp);
9f1f54b0 861 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
8ba5ced7
TA
862 return false;
863 }
864}
865
e29fe1dd
TA
866/* Check and make sure the container has a configuration that we know CRIU can
867 * dump. */
f1954503 868static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
869{
870 struct lxc_list *it;
e29fe1dd
TA
871
872 if (geteuid()) {
9f1f54b0 873 ERROR("Must be root to checkpoint");
e29fe1dd
TA
874 return false;
875 }
876
7177e6b1
DJ
877 if (!criu_version_ok(criu_version))
878 return false;
879
e29fe1dd
TA
880 /* We only know how to restore containers with veth networks. */
881 lxc_list_for_each(it, &c->lxc_conf->network) {
882 struct lxc_netdev *n = it->elem;
65b20221
TA
883 switch(n->type) {
884 case LXC_NET_VETH:
885 case LXC_NET_NONE:
886 case LXC_NET_EMPTY:
e2697330 887 case LXC_NET_MACVLAN:
65b20221
TA
888 break;
889 default:
9f1f54b0 890 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
7177e6b1
DJ
891 if (criu_version) {
892 free(*criu_version);
893 *criu_version = NULL;
894 }
e29fe1dd
TA
895 return false;
896 }
897 }
898
e29fe1dd
TA
899 return true;
900}
901
e29fe1dd
TA
902static bool restore_net_info(struct lxc_container *c)
903{
7eab8fc6 904 int ret;
e29fe1dd
TA
905 struct lxc_list *it;
906 bool has_error = true;
907
908 if (container_mem_lock(c))
909 return false;
910
911 lxc_list_for_each(it, &c->lxc_conf->network) {
912 struct lxc_netdev *netdev = it->elem;
913 char template[IFNAMSIZ];
65b20221
TA
914
915 if (netdev->type != LXC_NET_VETH)
916 continue;
917
7eab8fc6
CB
918 ret = snprintf(template, sizeof(template), "vethXXXXXX");
919 if (ret < 0 || ret >= sizeof(template))
920 goto out_unlock;
e29fe1dd 921
de4855a8
CB
922 if (netdev->priv.veth_attr.pair[0] == '\0' &&
923 netdev->priv.veth_attr.veth1[0] == '\0') {
966e9f1f 924 if (!lxc_mkifname(template))
de4855a8
CB
925 goto out_unlock;
926
cbb9c7c7 927 (void)strlcpy(netdev->priv.veth_attr.veth1, template, IFNAMSIZ);
de4855a8 928 }
e29fe1dd
TA
929 }
930
931 has_error = false;
932
933out_unlock:
934 container_mem_unlock(c);
935 return !has_error;
936}
937
1a0e70ac 938/* do_restore never returns, the calling process is used as the monitor process.
5a24adb8 939 * do_restore calls _exit() if it fails.
1a0e70ac 940 */
c33b0338 941static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd 942{
5af9369b 943 int fd, ret;
e29fe1dd 944 pid_t pid;
e29fe1dd 945 struct lxc_handler *handler;
113ebd57 946 int status = 0;
9b1e2e6e 947 int pipes[2] = {-1, -1};
2202afc9 948 struct cgroup_ops *cgroup_ops;
e29fe1dd 949
a7fb6043
TA
950 /* Try to detach from the current controlling tty if it exists.
951 * Othwerise, lxc_init (via lxc_console) will attach the container's
952 * console output to the current tty, which is probably not what any
953 * library user wants, and if they do, they can just manually configure
954 * it :)
955 */
956 fd = open("/dev/tty", O_RDWR);
957 if (fd >= 0) {
958 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
959 SYSERROR("couldn't detach from tty");
960 close(fd);
961 }
962
5e5576a4 963 handler = lxc_init_handler(c->name, c->lxc_conf, c->config_path, false);
e29fe1dd
TA
964 if (!handler)
965 goto out;
966
aa460476
CB
967 if (lxc_init(c->name, handler) < 0)
968 goto out;
969
2202afc9
CB
970 cgroup_ops = cgroup_init(NULL);
971 if (!cgroup_ops)
e29fe1dd 972 goto out_fini_handler;
2202afc9 973 handler->cgroup_ops = cgroup_ops;
e29fe1dd 974
2202afc9 975 if (!cgroup_ops->create(cgroup_ops, handler)) {
e29fe1dd
TA
976 ERROR("failed creating groups");
977 goto out_fini_handler;
978 }
979
980 if (!restore_net_info(c)) {
981 ERROR("failed restoring network info");
982 goto out_fini_handler;
983 }
984
5af9369b
CB
985 ret = resolve_clone_flags(handler);
986 if (ret < 0) {
6d1400b5 987 SYSERROR("Unsupported clone flag specified");
5af9369b
CB
988 goto out_fini_handler;
989 }
e29fe1dd 990
de31cb57 991 if (pipe2(pipes, O_CLOEXEC) < 0) {
3d9a5c85
TA
992 SYSERROR("pipe() failed");
993 goto out_fini_handler;
994 }
995
e29fe1dd
TA
996 pid = fork();
997 if (pid < 0)
998 goto out_fini_handler;
999
1000 if (pid == 0) {
1001 struct criu_opts os;
1002 struct lxc_rootfs *rootfs;
4b54788e 1003 int flags;
e29fe1dd 1004
3d9a5c85
TA
1005 close(status_pipe);
1006 status_pipe = -1;
1007
1008 close(pipes[0]);
1009 pipes[0] = -1;
e29fe1dd
TA
1010
1011 if (unshare(CLONE_NEWNS))
1012 goto out_fini_handler;
1013
1014 /* CRIU needs the lxc root bind mounted so that it is the root of some
1015 * mount. */
1016 rootfs = &c->lxc_conf->rootfs;
1017
1018 if (rootfs_is_blockdev(c->lxc_conf)) {
8ce1abc2
CB
1019 if (lxc_setup_rootfs_prepare_root(c->lxc_conf, c->name,
1020 c->config_path) < 0)
e29fe1dd
TA
1021 goto out_fini_handler;
1022 } else {
1023 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
1024 goto out_fini_handler;
1025
1026 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1027 SYSERROR("remount / to private failed");
1028 goto out_fini_handler;
1029 }
1030
1031 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
1032 rmdir(rootfs->mount);
1033 goto out_fini_handler;
1034 }
1035 }
1036
5af85cb1 1037 os.pipefd = pipes[1];
e29fe1dd 1038 os.action = "restore";
b2c3710f 1039 os.user = opts;
e29fe1dd 1040 os.c = c;
4b54788e 1041 os.console_fd = c->lxc_conf->console.slave;
f1954503 1042 os.criu_version = criu_version;
0ab5703f 1043 os.handler = handler;
4b54788e 1044
97e4f1a9
TA
1045 if (os.console_fd >= 0) {
1046 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1047 * via --inherit-fd, so we don't want it to close.
1048 */
1049 flags = fcntl(os.console_fd, F_GETFD);
1050 if (flags < 0) {
1051 SYSERROR("F_GETFD failed: %d", os.console_fd);
1052 goto out_fini_handler;
1053 }
4b54788e 1054
97e4f1a9 1055 flags &= ~FD_CLOEXEC;
4b54788e 1056
97e4f1a9
TA
1057 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
1058 SYSERROR("F_SETFD failed");
1059 goto out_fini_handler;
1060 }
4b54788e
TA
1061 }
1062 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
1063
1064 /* exec_criu() returning is an error */
2202afc9 1065 exec_criu(cgroup_ops, &os);
e29fe1dd
TA
1066 umount(rootfs->mount);
1067 rmdir(rootfs->mount);
1068 goto out_fini_handler;
1069 } else {
1070 int ret;
1071 char title[2048];
1072
3d9a5c85
TA
1073 close(pipes[1]);
1074 pipes[1] = -1;
1075
e29fe1dd
TA
1076 pid_t w = waitpid(pid, &status, 0);
1077 if (w == -1) {
1078 SYSERROR("waitpid");
1079 goto out_fini_handler;
1080 }
1081
e29fe1dd 1082 if (WIFEXITED(status)) {
75d219f0
TA
1083 char buf[4096];
1084
e29fe1dd 1085 if (WEXITSTATUS(status)) {
3d9a5c85
TA
1086 int n;
1087
668ba602 1088 n = lxc_read_nointr(pipes[0], buf, sizeof(buf));
3d9a5c85
TA
1089 if (n < 0) {
1090 SYSERROR("failed reading from criu stderr");
1091 goto out_fini_handler;
1092 }
1093
2735dfae
TA
1094 if (n == sizeof(buf))
1095 n--;
3d9a5c85
TA
1096 buf[n] = 0;
1097
9f1f54b0 1098 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
e29fe1dd
TA
1099 goto out_fini_handler;
1100 } else {
3eba9b49 1101 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
75d219f0
TA
1102 if (ret < 0 || ret >= sizeof(buf)) {
1103 ERROR("snprintf'd too many characters: %d", ret);
1104 goto out_fini_handler;
1105 }
1106
1107 FILE *f = fopen(buf, "r");
e29fe1dd 1108 if (!f) {
9f1f54b0 1109 SYSERROR("couldn't read restore's children file %s", buf);
e29fe1dd
TA
1110 goto out_fini_handler;
1111 }
1112
1113 ret = fscanf(f, "%d", (int*) &handler->pid);
1114 fclose(f);
1115 if (ret != 1) {
1116 ERROR("reading restore pid failed");
1117 goto out_fini_handler;
1118 }
1119
f8a41688
TA
1120 if (lxc_set_state(c->name, handler, RUNNING)) {
1121 ERROR("error setting running state after restore");
e29fe1dd 1122 goto out_fini_handler;
f8a41688 1123 }
e29fe1dd
TA
1124 }
1125 } else {
9f1f54b0 1126 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
e29fe1dd
TA
1127 goto out_fini_handler;
1128 }
1129
3d9a5c85
TA
1130 close(pipes[0]);
1131
614be9bc 1132 ret = lxc_write_nointr(status_pipe, &status, sizeof(status));
f3886023
TA
1133 close(status_pipe);
1134 status_pipe = -1;
1135
1136 if (sizeof(status) != ret) {
1137 SYSERROR("failed to write all of status");
1138 goto out_fini_handler;
1139 }
1140
e29fe1dd
TA
1141 /*
1142 * See comment in lxcapi_start; we don't care if these
1143 * fail because it's just a beauty thing. We just
1144 * assign the return here to silence potential.
1145 */
1146 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
223e30c1
CB
1147 if (ret < 0 || (size_t)ret >= sizeof(title))
1148 INFO("Setting truncated process name");
1149
e29fe1dd 1150 ret = setproctitle(title);
223e30c1
CB
1151 if (ret < 0)
1152 INFO("Failed to set process name");
e29fe1dd
TA
1153
1154 ret = lxc_poll(c->name, handler);
1155 if (ret)
1156 lxc_abort(c->name, handler);
1157 lxc_fini(c->name, handler);
5a24adb8 1158 _exit(ret);
e29fe1dd
TA
1159 }
1160
1161out_fini_handler:
3d9a5c85
TA
1162 if (pipes[0] >= 0)
1163 close(pipes[0]);
1164 if (pipes[1] >= 0)
1165 close(pipes[1]);
1166
e29fe1dd
TA
1167 lxc_fini(c->name, handler);
1168
1169out:
3d9a5c85 1170 if (status_pipe >= 0) {
f3886023
TA
1171 /* ensure getting here was a failure, e.g. if we failed to
1172 * parse the child pid or something, even after a successful
1173 * restore
1174 */
1175 if (!status)
1176 status = 1;
113ebd57 1177
614be9bc 1178 if (lxc_write_nointr(status_pipe, &status, sizeof(status)) != sizeof(status))
e29fe1dd 1179 SYSERROR("writing status failed");
3d9a5c85 1180 close(status_pipe);
e29fe1dd
TA
1181 }
1182
5a24adb8 1183 _exit(EXIT_FAILURE);
e29fe1dd 1184}
aef3d51e 1185
4b54788e
TA
1186static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1187{
1188 FILE *f;
1189 char path[PATH_MAX];
1190 int ret;
1191 struct stat sb;
1192
1193 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1194 tty_id[0] = 0;
1195 return 0;
1196 }
1197
1198 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1199 if (ret < 0 || ret >= sizeof(path)) {
1200 ERROR("snprintf'd too many chacters: %d", ret);
1201 return -1;
1202 }
1203
1204 ret = stat(path, &sb);
1205 if (ret < 0) {
1206 SYSERROR("stat of %s failed", path);
1207 return -1;
1208 }
1209
1210 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1211 if (ret < 0 || ret >= sizeof(path)) {
1212 ERROR("snprintf'd too many characters: %d", ret);
1213 return -1;
1214 }
1215
f03280a7
TA
1216 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1217 (long long unsigned) sb.st_rdev,
1218 (long long unsigned) sb.st_dev);
4b54788e
TA
1219 if (ret < 0 || ret >= sizeof(path)) {
1220 ERROR("snprintf'd too many characters: %d", ret);
1221 return -1;
1222 }
1223
1224 f = fopen(path, "w");
1225 if (!f) {
1226 SYSERROR("failed to open %s", path);
1227 return -1;
1228 }
1229
1230 ret = fprintf(f, "%s", tty_id);
1231 fclose(f);
1232 if (ret < 0)
1233 SYSERROR("failed to write to %s", path);
1234 return ret;
1235}
1236
aef3d51e 1237/* do one of either predump or a regular dump */
b2c3710f 1238static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e 1239{
0e4adc1a 1240 int ret;
aef3d51e 1241 pid_t pid;
5af85cb1 1242 int criuout[2];
0e4adc1a 1243 char *criu_version = NULL;
aef3d51e 1244
f1954503 1245 if (!criu_ok(c, &criu_version))
aef3d51e
TA
1246 return false;
1247
0e4adc1a
CB
1248 ret = pipe(criuout);
1249 if (ret < 0) {
5af85cb1 1250 SYSERROR("pipe() failed");
7177e6b1 1251 free(criu_version);
aef3d51e 1252 return false;
5af85cb1
TA
1253 }
1254
1255 if (mkdir_p(opts->directory, 0700) < 0)
1256 goto fail;
aef3d51e
TA
1257
1258 pid = fork();
1259 if (pid < 0) {
1260 SYSERROR("fork failed");
5af85cb1 1261 goto fail;
aef3d51e
TA
1262 }
1263
1264 if (pid == 0) {
1265 struct criu_opts os;
0ab5703f 1266 struct lxc_handler h;
2202afc9 1267 struct cgroup_ops *cgroup_ops;
0ab5703f 1268
5af85cb1
TA
1269 close(criuout[0]);
1270
41784e4e
CB
1271 lxc_zero_handler(&h);
1272
0ab5703f 1273 h.name = c->name;
2202afc9
CB
1274
1275 cgroup_ops = cgroup_init(NULL);
1276 if (!cgroup_ops) {
0ab5703f 1277 ERROR("failed to cgroup_init()");
7211378b 1278 _exit(EXIT_FAILURE);
2202afc9 1279 return -1;
0ab5703f 1280 }
2202afc9 1281 h.cgroup_ops = cgroup_ops;
aef3d51e 1282
5af85cb1 1283 os.pipefd = criuout[1];
aef3d51e 1284 os.action = mode;
b2c3710f 1285 os.user = opts;
aef3d51e 1286 os.c = c;
4b54788e 1287 os.console_name = c->lxc_conf->console.path;
f1954503 1288 os.criu_version = criu_version;
74eb576c 1289
0e4adc1a
CB
1290 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1291 if (ret < 0) {
1292 free(criu_version);
7211378b 1293 _exit(EXIT_FAILURE);
0e4adc1a 1294 }
aef3d51e
TA
1295
1296 /* exec_criu() returning is an error */
2202afc9 1297 exec_criu(cgroup_ops, &os);
0e4adc1a 1298 free(criu_version);
7211378b 1299 _exit(EXIT_FAILURE);
aef3d51e
TA
1300 } else {
1301 int status;
5af85cb1
TA
1302 ssize_t n;
1303 char buf[4096];
1304 bool ret;
1305
1306 close(criuout[1]);
1307
aef3d51e
TA
1308 pid_t w = waitpid(pid, &status, 0);
1309 if (w == -1) {
1310 SYSERROR("waitpid");
5af85cb1 1311 close(criuout[0]);
7177e6b1 1312 free(criu_version);
aef3d51e
TA
1313 return false;
1314 }
1315
668ba602 1316 n = lxc_read_nointr(criuout[0], buf, sizeof(buf));
5af85cb1
TA
1317 close(criuout[0]);
1318 if (n < 0) {
1319 SYSERROR("read");
1320 n = 0;
1321 }
40229e95 1322
1323 if (n == sizeof(buf))
1324 buf[n-1] = 0;
1325 else
1326 buf[n] = 0;
5af85cb1 1327
aef3d51e
TA
1328 if (WIFEXITED(status)) {
1329 if (WEXITSTATUS(status)) {
9f1f54b0 1330 ERROR("dump failed with %d", WEXITSTATUS(status));
5af85cb1
TA
1331 ret = false;
1332 } else {
1333 ret = true;
aef3d51e 1334 }
aef3d51e 1335 } else if (WIFSIGNALED(status)) {
9f1f54b0 1336 ERROR("dump signaled with %d", WTERMSIG(status));
5af85cb1 1337 ret = false;
aef3d51e 1338 } else {
9f1f54b0 1339 ERROR("unknown dump exit %d", status);
5af85cb1 1340 ret = false;
aef3d51e 1341 }
5af85cb1
TA
1342
1343 if (!ret)
1344 ERROR("criu output: %s", buf);
7177e6b1
DJ
1345
1346 free(criu_version);
5af85cb1 1347 return ret;
aef3d51e 1348 }
5af85cb1
TA
1349fail:
1350 close(criuout[0]);
1351 close(criuout[1]);
1352 rmdir(opts->directory);
0e4adc1a 1353 free(criu_version);
5af85cb1 1354 return false;
aef3d51e
TA
1355}
1356
/*
 * Run do_dump() for container @c in "pre-dump" mode with the options in
 * @opts and hand back its result unchanged.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped;

	dumped = do_dump(c, "pre-dump", opts);

	return dumped;
}
1361
b2c3710f 1362bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
1363{
1364 char path[PATH_MAX];
1365 int ret;
1366
b2c3710f 1367 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
1368 if (ret < 0 || ret >= sizeof(path))
1369 return false;
1370
1371 if (access(path, F_OK) == 0) {
9f1f54b0 1372 ERROR("please use a fresh directory for the dump directory");
aef3d51e
TA
1373 return false;
1374 }
1375
b2c3710f 1376 return do_dump(c, "dump", opts);
aef3d51e
TA
1377}
1378
/*
 * Restore container @c from a CRIU dump described by @opts.
 *
 * Requires an effective uid of 0. A child is forked that runs do_restore()
 * with the write end of a pipe; the parent reads one int-sized status from
 * the read end.
 *
 * Returns true when a full status was read and it denotes a normal exit with
 * code 0. In every other case the child is reaped with wait_for_pid() and
 * false is returned. False is also returned, without forking, when the caller
 * is not root, the pipe cannot be created or criu_ok() fails.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status;
	ssize_t nread;
	int pipefd[2];
	char *criu_version = NULL;

	if (geteuid()) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(pipefd)) {
		SYSERROR("failed to create pipe");
		return false;
	}

	if (!criu_ok(c, &criu_version)) {
		close(pipefd[0]);
		close(pipefd[1]);
		return false;
	}

	pid = fork();
	if (pid < 0) {
		SYSERROR("fork failed");
		close(pipefd[0]);
		close(pipefd[1]);
		free(criu_version);
		return false;
	}

	if (pid == 0) {
		close(pipefd[0]);
		/* this never returns */
		do_restore(c, pipefd[1], opts, criu_version);
		/* Defensive: never let the child continue into parent code. */
		_exit(EXIT_FAILURE);
	}

	close(pipefd[1]);
	free(criu_version);

	nread = lxc_read_nointr(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (nread != (ssize_t)sizeof(status)) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	/* If the criu process was killed or exited nonzero, wait() for the
	 * handler, since the restore process died. Otherwise, we don't need to
	 * wait, since the child becomes the monitor process.
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;
	return true;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");
	return false;
}