]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
cgroups: refactor cgroup handling
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
9b945f13 24#include <inttypes.h>
e29fe1dd
TA
25#include <linux/limits.h>
26#include <sched.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/mount.h>
31#include <sys/types.h>
32#include <sys/wait.h>
33#include <unistd.h>
34
35#include "config.h"
36
e29fe1dd
TA
37#include "cgroup.h"
38#include "conf.h"
dc259399 39#include "commands.h"
e29fe1dd
TA
40#include "criu.h"
41#include "log.h"
42#include "lxc.h"
43#include "lxclock.h"
44#include "network.h"
28d832c4 45#include "storage.h"
e29fe1dd
TA
46#include "utils.h"
47
5f4e44a2
TA
48#if IS_BIONIC
49#include <../include/lxcmntent.h>
50#else
51#include <mntent.h>
52#endif
53
9de31d5a
CB
54#ifndef HAVE_STRLCPY
55#include "include/strlcpy.h"
56#endif
57
c33b0338 58#define CRIU_VERSION "2.0"
73d46752
TA
59
60#define CRIU_GITID_VERSION "2.0"
61#define CRIU_GITID_PATCHLEVEL 0
62
f1954503 63#define CRIU_IN_FLIGHT_SUPPORT "2.4"
46c8ffd5 64#define CRIU_EXTERNAL_NOT_VETH "2.8"
f1954503 65
e29fe1dd
TA
66lxc_log_define(lxc_criu, lxc);
67
/* Everything exec_criu() needs to know to build and run one criu command. */
struct criu_opts {
	/* the thing to hook to stdout and stderr for logging */
	int pipefd;

	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container being dumped or restored */
	struct lxc_container *c;

	/* dump: passed to criu via --external when non-empty */
	char tty_id[32]; /* the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]" */

	/* restore: the handler set up by do_restore() for the restored container */
	struct lxc_handler *handler;
	/* restore: fd handed to criu via --inherit-fd; negative if there is none */
	int console_fd;
	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
97
4b54788e
TA
/* Read the saved tty identifier from "<directory>/tty.info".
 *
 * directory: the checkpoint image directory.
 * output:    buffer receiving the first line of the file (at most len - 1
 *            characters plus the terminating NUL, as done by fgets()).
 * len:       size of output in bytes.
 *
 * Returns 0 on success and also when the file does not exist; in the latter
 * case output is left untouched, so callers must pre-initialise it. Returns
 * -1 if the path does not fit, the file cannot be opened for another reason,
 * or nothing could be read from it.
 */
static int load_tty_major_minor(char *directory, char *output, int len)
{
	FILE *f;
	char path[PATH_MAX];
	int ret;

	ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("snprintf'd too many chacters: %d", ret);
		return -1;
	}

	f = fopen(path, "r");
	if (!f) {
		/* This means we're coming from a liblxc which didn't export
		 * the tty info. In this case they had to have lxc.console.path
		 * = none, so there's no problem restoring.
		 */
		if (errno == ENOENT)
			return 0;

		SYSERROR("couldn't open %s", path);
		return -1;
	}

	if (!fgets(output, len, f)) {
		/* Report first: fclose() may overwrite errno, which SYSERROR
		 * uses to describe the failure.
		 */
		SYSERROR("couldn't read %s", path);
		fclose(f);
		return -1;
	}

	fclose(f);
	return 0;
}
132
74ad3607
FB
/* Compare two dotted version strings of the form "major[.minor[.patch]]".
 *
 * Returns 1 if v1 is newer than v2, 0 if they are equal and -1 if v1 is older
 * than v2. -1 is also returned when either string is NULL or does not start
 * with a number, so callers testing for ">= 0" treat unparsable versions as
 * "too old".
 *
 * A component that is missing from a string counts as -1, i.e. "2.8" sorts
 * before "2.8.0".
 */
static int cmp_version(const char *v1, const char *v2)
{
	int oct_v1[3] = { -1, -1, -1 };
	int oct_v2[3] = { -1, -1, -1 };
	size_t i;

	/* sscanf() on a NULL string is undefined behaviour. */
	if (!v1 || !v2)
		return -1;

	if (sscanf(v1, "%d.%d.%d", &oct_v1[0], &oct_v1[1], &oct_v1[2]) < 1)
		return -1;

	if (sscanf(v2, "%d.%d.%d", &oct_v2[0], &oct_v2[1], &oct_v2[2]) < 1)
		return -1;

	/* The first differing component, most significant first, decides. */
	for (i = 0; i < sizeof(oct_v1) / sizeof(oct_v1[0]); i++) {
		if (oct_v1[i] > oct_v2[i])
			return 1;

		if (oct_v1[i] < oct_v2[i])
			return -1;
	}

	return 0;
}
173
2202afc9 174static void exec_criu(struct cgroup_ops *cgroup_ops, struct criu_opts *opts)
e29fe1dd
TA
175{
176 char **argv, log[PATH_MAX];
19d1509c 177 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
178 int netnr = 0;
179 struct lxc_list *it;
5f4e44a2
TA
180 FILE *mnts;
181 struct mntent mntent;
e29fe1dd 182
0e4be3cf 183 char buf[4096], ttys[32];
a17fa3c0 184 size_t pos;
5af85cb1 185
e9195050
TA
186 /* If we are currently in a cgroup /foo/bar, and the container is in a
187 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
188 * container has an open fd that points to one of the cgroup files
189 * (systemd always opens its "root" cgroup). So, let's escape to the
190 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
191 * see all cgroups.
192 */
2202afc9 193 if (!cgroup_ops->escape(cgroup_ops)) {
e9195050
TA
194 ERROR("failed to escape cgroups");
195 return;
196 }
197
e29fe1dd 198 /* The command line always looks like:
19d1509c 199 * criu $(action) --tcp-established --file-locks --link-remap \
5f178bc9 200 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
e29fe1dd
TA
201 * -o $(directory)/$(action).log --ext-mount-map auto
202 * --enable-external-sharing --enable-external-masters
4b54788e 203 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
204 * +1 for final NULL */
205
aef3d51e 206 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
207 /* -t pid --freeze-cgroup /lxc/ct */
208 static_args += 4;
e29fe1dd 209
aef3d51e 210 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 211 if (opts->user->predump_dir)
aef3d51e
TA
212 static_args += 2;
213
74eb576c 214 /* --page-server --address <address> --port <port> */
b2c3710f 215 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
216 static_args += 5;
217
aef3d51e 218 /* --leave-running (only for final dump) */
b2c3710f 219 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 220 static_args++;
4b54788e
TA
221
222 /* --external tty[88,4] */
223 if (opts->tty_id[0])
224 static_args += 2;
19d1509c
TA
225
226 /* --force-irmap */
227 if (!opts->user->preserves_inodes)
228 static_args++;
b2b7b0d2
TA
229
230 /* --ghost-limit 1024 */
231 if (opts->user->ghost_limit)
232 static_args += 2;
e29fe1dd
TA
233 } else if (strcmp(opts->action, "restore") == 0) {
234 /* --root $(lxc_mount_point) --restore-detached
0ab5703f 235 * --restore-sibling
13389b29
TA
236 * --lsm-profile apparmor:whatever
237 */
0ab5703f 238 static_args += 6;
4b54788e 239
0e4be3cf
CB
240 ttys[0] = 0;
241 if (load_tty_major_minor(opts->user->directory, ttys, sizeof(ttys)))
4b54788e
TA
242 return;
243
244 /* --inherit-fd fd[%d]:tty[%s] */
0e4be3cf 245 if (ttys[0])
4b54788e 246 static_args += 2;
e29fe1dd
TA
247 } else {
248 return;
249 }
250
2202afc9
CB
251 if (cgroup_ops->num_hierarchies(cgroup_ops) > 0)
252 static_args += 2 * cgroup_ops->num_hierarchies(cgroup_ops);
0ab5703f 253
b2c3710f 254 if (opts->user->verbose)
e29fe1dd
TA
255 static_args++;
256
b9ee6643
TA
257 if (opts->user->action_script)
258 static_args += 2;
259
5f4e44a2
TA
260 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
261
b2c3710f 262 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd 263 if (ret < 0 || ret >= PATH_MAX) {
9f1f54b0 264 ERROR("logfile name too long");
e29fe1dd
TA
265 return;
266 }
267
268 argv = malloc(static_args * sizeof(*argv));
269 if (!argv)
270 return;
271
272 memset(argv, 0, static_args * sizeof(*argv));
273
274#define DECLARE_ARG(arg) \
275 do { \
276 if (arg == NULL) { \
277 ERROR("Got NULL argument for criu"); \
278 goto err; \
279 } \
280 argv[argc++] = strdup(arg); \
281 if (!argv[argc-1]) \
282 goto err; \
283 } while (0)
284
285 argv[argc++] = on_path("criu", NULL);
286 if (!argv[argc-1]) {
9f1f54b0 287 ERROR("Couldn't find criu binary");
e29fe1dd
TA
288 goto err;
289 }
290
291 DECLARE_ARG(opts->action);
292 DECLARE_ARG("--tcp-established");
293 DECLARE_ARG("--file-locks");
294 DECLARE_ARG("--link-remap");
0a5fc6df 295 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
296 DECLARE_ARG("--ext-mount-map");
297 DECLARE_ARG("auto");
298 DECLARE_ARG("--enable-external-sharing");
299 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
300 DECLARE_ARG("--enable-fs");
301 DECLARE_ARG("hugetlbfs");
5b454329
TA
302 DECLARE_ARG("--enable-fs");
303 DECLARE_ARG("tracefs");
e29fe1dd 304 DECLARE_ARG("-D");
b2c3710f 305 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
306 DECLARE_ARG("-o");
307 DECLARE_ARG(log);
308
2202afc9 309 for (i = 0; i < cgroup_ops->num_hierarchies(cgroup_ops); i++) {
0ab5703f 310 char **controllers = NULL, *fullname;
31b204e4 311 char *path, *tmp;
0ab5703f 312
2202afc9 313 if (!cgroup_ops->get_hierarchies(cgroup_ops, i, &controllers)) {
0ab5703f
TA
314 ERROR("failed to get hierarchy %d", i);
315 goto err;
316 }
317
318 /* if we are in a dump, we have to ask the monitor process what
319 * the right cgroup is. if this is a restore, we can just use
320 * the handler the restore task created.
321 */
322 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
323 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
324 if (!path) {
325 ERROR("failed to get cgroup path for %s", controllers[0]);
326 goto err;
327 }
328 } else {
329 const char *p;
330
2202afc9 331 p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
0ab5703f
TA
332 if (!p) {
333 ERROR("failed to get cgroup path for %s", controllers[0]);
334 goto err;
335 }
336
337 path = strdup(p);
338 if (!path) {
339 ERROR("strdup failed");
340 goto err;
341 }
342 }
343
31b204e4
CB
344 tmp = lxc_deslashify(path);
345 if (!tmp) {
346 ERROR("Failed to remove extraneous slashes from \"%s\"",
347 path);
0ab5703f
TA
348 free(path);
349 goto err;
350 }
31b204e4
CB
351 free(path);
352 path = tmp;
0ab5703f
TA
353
354 fullname = lxc_string_join(",", (const char **) controllers, false);
355 if (!fullname) {
356 ERROR("failed to join controllers");
357 free(path);
358 goto err;
359 }
360
361 ret = sprintf(buf, "%s:%s", fullname, path);
362 free(path);
363 free(fullname);
364 if (ret < 0 || ret >= sizeof(buf)) {
365 ERROR("sprintf of cgroup root arg failed");
366 goto err;
367 }
368
369 DECLARE_ARG("--cgroup-root");
370 DECLARE_ARG(buf);
371 }
372
b2c3710f 373 if (opts->user->verbose)
e29fe1dd
TA
374 DECLARE_ARG("-vvvvvv");
375
b9ee6643
TA
376 if (opts->user->action_script) {
377 DECLARE_ARG("--action-script");
378 DECLARE_ARG(opts->user->action_script);
379 }
380
5ef5c9a3 381 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list);
5f4e44a2
TA
382 if (!mnts)
383 goto err;
384
385 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
19d2422b 386 char *fmt, *key, *val, *mntdata;
5f4e44a2 387 char arg[2 * PATH_MAX + 2];
19d2422b
TA
388 unsigned long flags;
389
390 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
391 goto err;
392
393 free(mntdata);
394
395 /* only add --ext-mount-map for actual bind mounts */
396 if (!(flags & MS_BIND))
397 continue;
5f4e44a2
TA
398
399 if (strcmp(opts->action, "dump") == 0) {
400 fmt = "/%s:%s";
401 key = mntent.mnt_dir;
402 val = mntent.mnt_dir;
403 } else {
404 fmt = "%s:%s";
405 key = mntent.mnt_dir;
406 val = mntent.mnt_fsname;
407 }
408
409 ret = snprintf(arg, sizeof(arg), fmt, key, val);
410 if (ret < 0 || ret >= sizeof(arg)) {
411 fclose(mnts);
412 ERROR("snprintf failed");
413 goto err;
414 }
415
416 DECLARE_ARG("--ext-mount-map");
417 DECLARE_ARG(arg);
418 }
419 fclose(mnts);
420
aef3d51e 421 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 422 char pid[32], *freezer_relative;
e29fe1dd
TA
423
424 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
425 goto err;
426
427 DECLARE_ARG("-t");
428 DECLARE_ARG(pid);
dc259399
TA
429
430 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
431 opts->c->config_path,
432 "freezer");
433 if (!freezer_relative) {
434 ERROR("failed getting freezer path");
435 goto err;
436 }
437
438 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
439 if (ret < 0 || ret >= sizeof(log))
440 goto err;
441
f1954503
AR
442 if (!opts->user->disable_skip_in_flight &&
443 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
444 DECLARE_ARG("--skip-in-flight");
445
dc259399
TA
446 DECLARE_ARG("--freeze-cgroup");
447 DECLARE_ARG(log);
448
4b54788e 449 if (opts->tty_id[0]) {
36d2096c
TA
450 DECLARE_ARG("--ext-mount-map");
451 DECLARE_ARG("/dev/console:console");
452
4b54788e
TA
453 DECLARE_ARG("--external");
454 DECLARE_ARG(opts->tty_id);
455 }
456
b2c3710f 457 if (opts->user->predump_dir) {
aef3d51e 458 DECLARE_ARG("--prev-images-dir");
b2c3710f 459 DECLARE_ARG(opts->user->predump_dir);
9f99a33f 460 DECLARE_ARG("--track-mem");
74eb576c 461 }
4c0c0319 462
b2c3710f 463 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
464 DECLARE_ARG("--page-server");
465 DECLARE_ARG("--address");
b2c3710f 466 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 467 DECLARE_ARG("--port");
b2c3710f 468 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 469 }
aef3d51e 470
19d1509c
TA
471 if (!opts->user->preserves_inodes)
472 DECLARE_ARG("--force-irmap");
473
b2b7b0d2
TA
474 if (opts->user->ghost_limit) {
475 char ghost_limit[32];
476
9b945f13 477 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
b2b7b0d2 478 if (ret < 0 || ret >= sizeof(ghost_limit)) {
9b945f13 479 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
b2b7b0d2
TA
480 goto err;
481 }
482
483 DECLARE_ARG("--ghost-limit");
484 DECLARE_ARG(ghost_limit);
485 }
486
aef3d51e 487 /* only for final dump */
b2c3710f 488 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
489 DECLARE_ARG("--leave-running");
490 } else if (strcmp(opts->action, "restore") == 0) {
491 void *m;
492 int additional;
13389b29 493 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
494
495 DECLARE_ARG("--root");
496 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
497 DECLARE_ARG("--restore-detached");
498 DECLARE_ARG("--restore-sibling");
e29fe1dd 499
0e4be3cf 500 if (ttys[0]) {
97e4f1a9 501 if (opts->console_fd < 0) {
3aed4934 502 ERROR("lxc.console.path configured on source host but not target");
97e4f1a9
TA
503 goto err;
504 }
505
0e4be3cf 506 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, ttys);
4b54788e
TA
507 if (ret < 0 || ret >= sizeof(buf))
508 goto err;
509
510 DECLARE_ARG("--inherit-fd");
511 DECLARE_ARG(buf);
512 }
513 if (opts->console_name) {
514 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
515 SYSERROR("sprintf'd too many bytes");
516 }
517 DECLARE_ARG("--ext-mount-map");
518 DECLARE_ARG(buf);
519 }
520
13389b29
TA
521 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
522
523 if (lxc_conf->lsm_aa_profile)
524 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
525 else
526 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
527
528 if (ret < 0 || ret >= sizeof(buf))
529 goto err;
530
531 DECLARE_ARG("--lsm-profile");
532 DECLARE_ARG(buf);
533 }
534
e29fe1dd
TA
535 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
536
fa071249
TA
537 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
538 if (!m)
539 goto err;
e29fe1dd
TA
540 argv = m;
541
542 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
9de31d5a 543 size_t retlen;
e29fe1dd 544 char eth[128], *veth;
46c8ffd5 545 char *fmt;
e29fe1dd 546 struct lxc_netdev *n = it->elem;
46c8ffd5
AR
547 bool external_not_veth;
548
74ad3607 549 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
46c8ffd5
AR
550 /* Since criu version 2.8 the usage of --veth-pair
551 * has been deprecated:
552 * git tag --contains f2037e6d3445fc400
553 * v2.8 */
554 external_not_veth = true;
555 } else {
556 external_not_veth = false;
557 }
e29fe1dd 558
42277b1c 559 if (n->name[0] != '\0') {
9de31d5a
CB
560 retlen = strlcpy(eth, n->name, sizeof(eth));
561 if (retlen >= sizeof(eth))
e29fe1dd 562 goto err;
796a109d
TA
563 } else {
564 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
565 if (ret < 0 || ret >= sizeof(eth))
566 goto err;
567 }
e29fe1dd 568
e2697330
TA
569 switch (n->type) {
570 case LXC_NET_VETH:
571 veth = n->priv.veth_attr.pair;
ea7f6b29
CB
572 if (veth[0] == '\0')
573 veth = n->priv.veth_attr.veth1;
e29fe1dd 574
de4855a8 575 if (n->link[0] != '\0') {
46c8ffd5
AR
576 if (external_not_veth)
577 fmt = "veth[%s]:%s@%s";
578 else
579 fmt = "%s=%s@%s";
580
581 ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
582 } else {
583 if (external_not_veth)
584 fmt = "veth[%s]:%s";
585 else
586 fmt = "%s=%s";
587
588 ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
589 }
e2697330
TA
590 if (ret < 0 || ret >= sizeof(buf))
591 goto err;
592 break;
593 case LXC_NET_MACVLAN:
de4855a8 594 if (n->link[0] == '\0') {
9f1f54b0 595 ERROR("no host interface for macvlan %s", n->name);
e2697330
TA
596 goto err;
597 }
598
599 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
600 if (ret < 0 || ret >= sizeof(buf))
601 goto err;
602 break;
603 case LXC_NET_NONE:
604 case LXC_NET_EMPTY:
605 break;
606 default:
607 /* we have screened for this earlier... */
9f1f54b0 608 ERROR("unexpected network type %d", n->type);
e29fe1dd 609 goto err;
e2697330 610 }
e29fe1dd 611
46c8ffd5
AR
612 if (external_not_veth)
613 DECLARE_ARG("--external");
614 else
615 DECLARE_ARG("--veth-pair");
e29fe1dd 616 DECLARE_ARG(buf);
2f3fbc6b 617 netnr++;
e29fe1dd
TA
618 }
619
620 }
621
622 argv[argc] = NULL;
623
cf4b07a5 624 buf[0] = 0;
a17fa3c0 625 pos = 0;
72a30576 626
cf4b07a5 627 for (i = 0; argv[i]; i++) {
72a30576
NE
628 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
629 if (ret < 0 || ret >= sizeof(buf) - pos)
630 goto err;
631 else
632 pos += ret;
cf4b07a5
TA
633 }
634
635 INFO("execing: %s", buf);
636
5af85cb1
TA
637 /* before criu inits its log, it sometimes prints things to stdout/err;
638 * let's be sure we capture that.
639 */
640 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
641 SYSERROR("dup2 stdout failed");
642 goto err;
643 }
644
645 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
646 SYSERROR("dup2 stderr failed");
647 goto err;
648 }
649
650 close(opts->pipefd);
651
e29fe1dd
TA
652#undef DECLARE_ARG
653 execv(argv[0], argv);
654err:
e29fe1dd
TA
655 for (i = 0; argv[i]; i++)
656 free(argv[i]);
657 free(argv);
658}
659
b5b12b9e
AR
660/*
661 * Function to check if the checks activated in 'features_to_check' are
662 * available with the current architecture/kernel/criu combination.
663 *
664 * Parameter features_to_check is a bit mask of all features that should be
665 * checked (see feature check defines in lxc/lxccontainer.h).
666 *
667 * If the return value is true, all requested features are supported. If
668 * the return value is false the features_to_check parameter is updated
669 * to reflect which features are available. '0' means no feature but
670 * also that something went totally wrong.
671 *
672 * Some of the code flow of criu_version_ok() is duplicated and maybe it
673 * is a good candidate for refactoring.
674 */
675bool __criu_check_feature(uint64_t *features_to_check)
676{
677 pid_t pid;
678 uint64_t current_bit = 0;
679 int ret;
680 int features = *features_to_check;
681 /* Feature checking is currently always like
682 * criu check --feature <feature-name>
683 */
684 char *args[] = { "criu", "check", "--feature", NULL, NULL };
685
686 if ((features & ~FEATURE_MEM_TRACK & ~FEATURE_LAZY_PAGES) != 0) {
687 /* There are feature bits activated we do not understand.
688 * Refusing to answer at all */
689 *features_to_check = 0;
690 return false;
691 }
692
693 while (current_bit < sizeof(uint64_t) * 8) {
694 /* only test requested features */
695 if (!(features & (1ULL << current_bit))) {
696 /* skip this */
697 current_bit++;
698 continue;
699 }
700
701 pid = fork();
702 if (pid < 0) {
703 SYSERROR("fork() failed");
704 *features_to_check = 0;
705 return false;
706 }
707
708 if (pid == 0) {
709 if ((1ULL << current_bit) == FEATURE_MEM_TRACK)
710 /* This is needed for pre-dump support, which
711 * enables pre-copy migration. */
712 args[3] = "mem_dirty_track";
713 else if ((1ULL << current_bit) == FEATURE_LAZY_PAGES)
714 /* CRIU has two checks for userfaultfd support.
715 *
716 * The simpler check is only for 'uffd'. If the
717 * kernel supports userfaultfd without noncoop
718 * then only process can be lazily restored
719 * which do not fork. With 'uffd-noncoop'
720 * it is also possible to lazily restore processes
721 * which do fork. For a container runtime like
722 * LXC checking only for 'uffd' makes not much sense. */
723 args[3] = "uffd-noncoop";
724 else
4f43526d 725 _exit(EXIT_FAILURE);
b5b12b9e
AR
726
727 null_stdfds();
728
729 execvp("criu", args);
730 SYSERROR("Failed to exec \"criu\"");
4f43526d 731 _exit(EXIT_FAILURE);
b5b12b9e
AR
732 }
733
734 ret = wait_for_pid(pid);
735
736 if (ret == -1) {
737 /* It is not known why CRIU failed. Either
738 * CRIU is not available, the feature check
739 * does not exist or the feature is not
740 * supported. */
741 INFO("feature not supported");
742 /* Clear not supported feature bit */
743 features &= ~(1ULL << current_bit);
744 }
745
746 current_bit++;
747 /* no more checks requested; exit check loop */
748 if (!(features & ~((1ULL << current_bit)-1)))
749 break;
750 }
751 if (features != *features_to_check) {
752 *features_to_check = features;
753 return false;
754 }
755 return true;
756}
757
8ba5ced7
TA
758/*
759 * Check to see if the criu version is recent enough for all the features we
760 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
761 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
762 * things potentially before a version is released with a particular feature.
763 *
764 * The intent is that when criu development slows down, we can drop this, but
765 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
766 *
767 * Note: If version != NULL criu_version() stores the detected criu version in
768 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 769 */
5407e2ab 770static bool criu_version_ok(char **version)
8ba5ced7
TA
771{
772 int pipes[2];
773 pid_t pid;
774
775 if (pipe(pipes) < 0) {
776 SYSERROR("pipe() failed");
777 return false;
778 }
779
780 pid = fork();
781 if (pid < 0) {
782 SYSERROR("fork() failed");
783 return false;
784 }
785
786 if (pid == 0) {
787 char *args[] = { "criu", "--version", NULL };
755fa453 788 char *path;
8ba5ced7
TA
789 close(pipes[0]);
790
791 close(STDERR_FILENO);
792 if (dup2(pipes[1], STDOUT_FILENO) < 0)
665bb114 793 _exit(EXIT_FAILURE);
8ba5ced7 794
755fa453 795 path = on_path("criu", NULL);
d9b32b09 796 if (!path)
665bb114 797 _exit(EXIT_FAILURE);
d9b32b09 798
755fa453 799 execv(path, args);
665bb114 800 _exit(EXIT_FAILURE);
8ba5ced7
TA
801 } else {
802 FILE *f;
5407e2ab 803 char *tmp;
8ba5ced7
TA
804 int patch;
805
806 close(pipes[1]);
807 if (wait_for_pid(pid) < 0) {
808 close(pipes[0]);
4eae4051 809 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
810 return false;
811 }
812
813 f = fdopen(pipes[0], "r");
814 if (!f) {
815 close(pipes[0]);
816 return false;
817 }
818
5407e2ab
CB
819 tmp = malloc(1024);
820 if (!tmp) {
821 fclose(f);
822 return false;
823 }
824
825 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
826 goto version_error;
827
828 if (fgetc(f) != '\n')
829 goto version_error;
830
5407e2ab 831 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
832 goto version_match;
833
5407e2ab 834 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
835 goto version_error;
836
837 if (fgetc(f) != '-')
838 goto version_error;
839
840 if (fscanf(f, "%d", &patch) != 1)
841 goto version_error;
842
5407e2ab 843 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
844 goto version_error;
845
846 if (patch < CRIU_GITID_PATCHLEVEL)
847 goto version_error;
848
849version_match:
3158ab5b 850 fclose(f);
5407e2ab
CB
851 if (!version)
852 free(tmp);
853 else
854 *version = tmp;
8ba5ced7
TA
855 return true;
856
857version_error:
3158ab5b 858 fclose(f);
5407e2ab 859 free(tmp);
9f1f54b0 860 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
8ba5ced7
TA
861 return false;
862 }
863}
864
e29fe1dd
TA
865/* Check and make sure the container has a configuration that we know CRIU can
866 * dump. */
f1954503 867static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
868{
869 struct lxc_list *it;
e29fe1dd 870
f1954503 871 if (!criu_version_ok(criu_version))
8ba5ced7
TA
872 return false;
873
e29fe1dd 874 if (geteuid()) {
9f1f54b0 875 ERROR("Must be root to checkpoint");
e29fe1dd
TA
876 return false;
877 }
878
879 /* We only know how to restore containers with veth networks. */
880 lxc_list_for_each(it, &c->lxc_conf->network) {
881 struct lxc_netdev *n = it->elem;
65b20221
TA
882 switch(n->type) {
883 case LXC_NET_VETH:
884 case LXC_NET_NONE:
885 case LXC_NET_EMPTY:
e2697330 886 case LXC_NET_MACVLAN:
65b20221
TA
887 break;
888 default:
9f1f54b0 889 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
e29fe1dd
TA
890 return false;
891 }
892 }
893
e29fe1dd
TA
894 return true;
895}
896
e29fe1dd
TA
897static bool restore_net_info(struct lxc_container *c)
898{
899 struct lxc_list *it;
900 bool has_error = true;
901
902 if (container_mem_lock(c))
903 return false;
904
905 lxc_list_for_each(it, &c->lxc_conf->network) {
906 struct lxc_netdev *netdev = it->elem;
907 char template[IFNAMSIZ];
65b20221
TA
908
909 if (netdev->type != LXC_NET_VETH)
910 continue;
911
e29fe1dd
TA
912 snprintf(template, sizeof(template), "vethXXXXXX");
913
de4855a8
CB
914 if (netdev->priv.veth_attr.pair[0] == '\0' &&
915 netdev->priv.veth_attr.veth1[0] == '\0') {
966e9f1f 916 if (!lxc_mkifname(template))
de4855a8
CB
917 goto out_unlock;
918
966e9f1f 919 strcpy(netdev->priv.veth_attr.veth1, template);
de4855a8 920 }
e29fe1dd
TA
921 }
922
923 has_error = false;
924
925out_unlock:
926 container_mem_unlock(c);
927 return !has_error;
928}
929
1a0e70ac 930/* do_restore never returns, the calling process is used as the monitor process.
5a24adb8 931 * do_restore calls _exit() if it fails.
1a0e70ac 932 */
c33b0338 933static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd 934{
5af9369b 935 int fd, ret;
e29fe1dd 936 pid_t pid;
e29fe1dd 937 struct lxc_handler *handler;
113ebd57 938 int status = 0;
9b1e2e6e 939 int pipes[2] = {-1, -1};
2202afc9 940 struct cgroup_ops *cgroup_ops;
e29fe1dd 941
a7fb6043
TA
942 /* Try to detach from the current controlling tty if it exists.
943 * Otherwise, lxc_init (via lxc_console) will attach the container's
944 * console output to the current tty, which is probably not what any
945 * library user wants, and if they do, they can just manually configure
946 * it :)
947 */
948 fd = open("/dev/tty", O_RDWR);
949 if (fd >= 0) {
950 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
951 SYSERROR("couldn't detach from tty");
952 close(fd);
953 }
954
5e5576a4 955 handler = lxc_init_handler(c->name, c->lxc_conf, c->config_path, false);
e29fe1dd
TA
956 if (!handler)
957 goto out;
958
aa460476
CB
959 if (lxc_init(c->name, handler) < 0)
960 goto out;
961
2202afc9
CB
962 cgroup_ops = cgroup_init(NULL);
963 if (!cgroup_ops)
e29fe1dd 964 goto out_fini_handler;
2202afc9 965 handler->cgroup_ops = cgroup_ops;
e29fe1dd 966
2202afc9 967 if (!cgroup_ops->create(cgroup_ops, handler)) {
e29fe1dd
TA
968 ERROR("failed creating groups");
969 goto out_fini_handler;
970 }
971
972 if (!restore_net_info(c)) {
973 ERROR("failed restoring network info");
974 goto out_fini_handler;
975 }
976
5af9369b
CB
977 ret = resolve_clone_flags(handler);
978 if (ret < 0) {
979 ERROR("%s - Unsupported clone flag specified", strerror(errno));
980 goto out_fini_handler;
981 }
e29fe1dd 982
3d9a5c85
TA
983 if (pipe(pipes) < 0) {
984 SYSERROR("pipe() failed");
985 goto out_fini_handler;
986 }
987
e29fe1dd
TA
988 pid = fork();
989 if (pid < 0)
990 goto out_fini_handler;
991
992 if (pid == 0) {
993 struct criu_opts os;
994 struct lxc_rootfs *rootfs;
4b54788e 995 int flags;
e29fe1dd 996
3d9a5c85
TA
997 close(status_pipe);
998 status_pipe = -1;
999
1000 close(pipes[0]);
1001 pipes[0] = -1;
e29fe1dd
TA
1002
1003 if (unshare(CLONE_NEWNS))
1004 goto out_fini_handler;
1005
1006 /* CRIU needs the lxc root bind mounted so that it is the root of some
1007 * mount. */
1008 rootfs = &c->lxc_conf->rootfs;
1009
1010 if (rootfs_is_blockdev(c->lxc_conf)) {
1011 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
1012 goto out_fini_handler;
1013 } else {
1014 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
1015 goto out_fini_handler;
1016
1017 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
1018 SYSERROR("remount / to private failed");
1019 goto out_fini_handler;
1020 }
1021
1022 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
1023 rmdir(rootfs->mount);
1024 goto out_fini_handler;
1025 }
1026 }
1027
5af85cb1 1028 os.pipefd = pipes[1];
e29fe1dd 1029 os.action = "restore";
b2c3710f 1030 os.user = opts;
e29fe1dd 1031 os.c = c;
4b54788e 1032 os.console_fd = c->lxc_conf->console.slave;
f1954503 1033 os.criu_version = criu_version;
0ab5703f 1034 os.handler = handler;
4b54788e 1035
97e4f1a9
TA
1036 if (os.console_fd >= 0) {
1037 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
1038 * via --inherit-fd, so we don't want it to close.
1039 */
1040 flags = fcntl(os.console_fd, F_GETFD);
1041 if (flags < 0) {
1042 SYSERROR("F_GETFD failed: %d", os.console_fd);
1043 goto out_fini_handler;
1044 }
4b54788e 1045
97e4f1a9 1046 flags &= ~FD_CLOEXEC;
4b54788e 1047
97e4f1a9
TA
1048 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
1049 SYSERROR("F_SETFD failed");
1050 goto out_fini_handler;
1051 }
4b54788e
TA
1052 }
1053 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
1054
1055 /* exec_criu() returning is an error */
2202afc9 1056 exec_criu(cgroup_ops, &os);
e29fe1dd
TA
1057 umount(rootfs->mount);
1058 rmdir(rootfs->mount);
1059 goto out_fini_handler;
1060 } else {
1061 int ret;
1062 char title[2048];
1063
3d9a5c85
TA
1064 close(pipes[1]);
1065 pipes[1] = -1;
1066
e29fe1dd
TA
1067 pid_t w = waitpid(pid, &status, 0);
1068 if (w == -1) {
1069 SYSERROR("waitpid");
1070 goto out_fini_handler;
1071 }
1072
e29fe1dd 1073 if (WIFEXITED(status)) {
75d219f0
TA
1074 char buf[4096];
1075
e29fe1dd 1076 if (WEXITSTATUS(status)) {
3d9a5c85
TA
1077 int n;
1078
1079 n = read(pipes[0], buf, sizeof(buf));
1080 if (n < 0) {
1081 SYSERROR("failed reading from criu stderr");
1082 goto out_fini_handler;
1083 }
1084
2735dfae
TA
1085 if (n == sizeof(buf))
1086 n--;
3d9a5c85
TA
1087 buf[n] = 0;
1088
9f1f54b0 1089 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
e29fe1dd
TA
1090 goto out_fini_handler;
1091 } else {
3eba9b49 1092 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
75d219f0
TA
1093 if (ret < 0 || ret >= sizeof(buf)) {
1094 ERROR("snprintf'd too many characters: %d", ret);
1095 goto out_fini_handler;
1096 }
1097
1098 FILE *f = fopen(buf, "r");
e29fe1dd 1099 if (!f) {
9f1f54b0 1100 SYSERROR("couldn't read restore's children file %s", buf);
e29fe1dd
TA
1101 goto out_fini_handler;
1102 }
1103
1104 ret = fscanf(f, "%d", (int*) &handler->pid);
1105 fclose(f);
1106 if (ret != 1) {
1107 ERROR("reading restore pid failed");
1108 goto out_fini_handler;
1109 }
1110
f8a41688
TA
1111 if (lxc_set_state(c->name, handler, RUNNING)) {
1112 ERROR("error setting running state after restore");
e29fe1dd 1113 goto out_fini_handler;
f8a41688 1114 }
e29fe1dd
TA
1115 }
1116 } else {
9f1f54b0 1117 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
e29fe1dd
TA
1118 goto out_fini_handler;
1119 }
1120
3d9a5c85
TA
1121 close(pipes[0]);
1122
f3886023
TA
1123 ret = write(status_pipe, &status, sizeof(status));
1124 close(status_pipe);
1125 status_pipe = -1;
1126
1127 if (sizeof(status) != ret) {
1128 SYSERROR("failed to write all of status");
1129 goto out_fini_handler;
1130 }
1131
e29fe1dd
TA
1132 /*
1133 * See comment in lxcapi_start; we don't care if these
1134 * fail because it's just a beauty thing. We just
1135 * assign the return here to silence potential warnings.
1136 */
1137 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
223e30c1
CB
1138 if (ret < 0 || (size_t)ret >= sizeof(title))
1139 INFO("Setting truncated process name");
1140
e29fe1dd 1141 ret = setproctitle(title);
223e30c1
CB
1142 if (ret < 0)
1143 INFO("Failed to set process name");
e29fe1dd
TA
1144
1145 ret = lxc_poll(c->name, handler);
1146 if (ret)
1147 lxc_abort(c->name, handler);
1148 lxc_fini(c->name, handler);
5a24adb8 1149 _exit(ret);
e29fe1dd
TA
1150 }
1151
1152out_fini_handler:
3d9a5c85
TA
1153 if (pipes[0] >= 0)
1154 close(pipes[0]);
1155 if (pipes[1] >= 0)
1156 close(pipes[1]);
1157
e29fe1dd
TA
1158 lxc_fini(c->name, handler);
1159
1160out:
3d9a5c85 1161 if (status_pipe >= 0) {
f3886023
TA
1162 /* ensure getting here was a failure, e.g. if we failed to
1163 * parse the child pid or something, even after a successful
1164 * restore
1165 */
1166 if (!status)
1167 status = 1;
113ebd57
CB
1168
1169 if (write(status_pipe, &status, sizeof(status)) != sizeof(status))
e29fe1dd 1170 SYSERROR("writing status failed");
3d9a5c85 1171 close(status_pipe);
e29fe1dd
TA
1172 }
1173
5a24adb8 1174 _exit(EXIT_FAILURE);
e29fe1dd 1175}
aef3d51e 1176
4b54788e
TA
1177static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1178{
1179 FILE *f;
1180 char path[PATH_MAX];
1181 int ret;
1182 struct stat sb;
1183
1184 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1185 tty_id[0] = 0;
1186 return 0;
1187 }
1188
1189 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1190 if (ret < 0 || ret >= sizeof(path)) {
1191 ERROR("snprintf'd too many chacters: %d", ret);
1192 return -1;
1193 }
1194
1195 ret = stat(path, &sb);
1196 if (ret < 0) {
1197 SYSERROR("stat of %s failed", path);
1198 return -1;
1199 }
1200
1201 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1202 if (ret < 0 || ret >= sizeof(path)) {
1203 ERROR("snprintf'd too many characters: %d", ret);
1204 return -1;
1205 }
1206
f03280a7
TA
1207 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1208 (long long unsigned) sb.st_rdev,
1209 (long long unsigned) sb.st_dev);
4b54788e
TA
1210 if (ret < 0 || ret >= sizeof(path)) {
1211 ERROR("snprintf'd too many characters: %d", ret);
1212 return -1;
1213 }
1214
1215 f = fopen(path, "w");
1216 if (!f) {
1217 SYSERROR("failed to open %s", path);
1218 return -1;
1219 }
1220
1221 ret = fprintf(f, "%s", tty_id);
1222 fclose(f);
1223 if (ret < 0)
1224 SYSERROR("failed to write to %s", path);
1225 return ret;
1226}
1227
aef3d51e 1228/* do one of either predump or a regular dump */
b2c3710f 1229static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e 1230{
0e4adc1a 1231 int ret;
aef3d51e 1232 pid_t pid;
5af85cb1 1233 int criuout[2];
0e4adc1a 1234 char *criu_version = NULL;
aef3d51e 1235
f1954503 1236 if (!criu_ok(c, &criu_version))
aef3d51e
TA
1237 return false;
1238
0e4adc1a
CB
1239 ret = pipe(criuout);
1240 if (ret < 0) {
5af85cb1 1241 SYSERROR("pipe() failed");
aef3d51e 1242 return false;
5af85cb1
TA
1243 }
1244
1245 if (mkdir_p(opts->directory, 0700) < 0)
1246 goto fail;
aef3d51e
TA
1247
1248 pid = fork();
1249 if (pid < 0) {
1250 SYSERROR("fork failed");
5af85cb1 1251 goto fail;
aef3d51e
TA
1252 }
1253
1254 if (pid == 0) {
1255 struct criu_opts os;
0ab5703f 1256 struct lxc_handler h;
2202afc9 1257 struct cgroup_ops *cgroup_ops;
0ab5703f 1258
5af85cb1
TA
1259 close(criuout[0]);
1260
41784e4e
CB
1261 lxc_zero_handler(&h);
1262
0ab5703f 1263 h.name = c->name;
2202afc9
CB
1264
1265 cgroup_ops = cgroup_init(NULL);
1266 if (!cgroup_ops) {
0ab5703f 1267 ERROR("failed to cgroup_init()");
7211378b 1268 _exit(EXIT_FAILURE);
2202afc9 1269 return -1;
0ab5703f 1270 }
2202afc9 1271 h.cgroup_ops = cgroup_ops;
aef3d51e 1272
5af85cb1 1273 os.pipefd = criuout[1];
aef3d51e 1274 os.action = mode;
b2c3710f 1275 os.user = opts;
aef3d51e 1276 os.c = c;
4b54788e 1277 os.console_name = c->lxc_conf->console.path;
f1954503 1278 os.criu_version = criu_version;
74eb576c 1279
0e4adc1a
CB
1280 ret = save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id));
1281 if (ret < 0) {
1282 free(criu_version);
7211378b 1283 _exit(EXIT_FAILURE);
0e4adc1a 1284 }
aef3d51e
TA
1285
1286 /* exec_criu() returning is an error */
2202afc9 1287 exec_criu(cgroup_ops, &os);
0e4adc1a 1288 free(criu_version);
7211378b 1289 _exit(EXIT_FAILURE);
aef3d51e
TA
1290 } else {
1291 int status;
5af85cb1
TA
1292 ssize_t n;
1293 char buf[4096];
1294 bool ret;
1295
1296 close(criuout[1]);
1297
aef3d51e
TA
1298 pid_t w = waitpid(pid, &status, 0);
1299 if (w == -1) {
1300 SYSERROR("waitpid");
5af85cb1 1301 close(criuout[0]);
aef3d51e
TA
1302 return false;
1303 }
1304
5af85cb1
TA
1305 n = read(criuout[0], buf, sizeof(buf));
1306 close(criuout[0]);
1307 if (n < 0) {
1308 SYSERROR("read");
1309 n = 0;
1310 }
1311 buf[n] = 0;
1312
aef3d51e
TA
1313 if (WIFEXITED(status)) {
1314 if (WEXITSTATUS(status)) {
9f1f54b0 1315 ERROR("dump failed with %d", WEXITSTATUS(status));
5af85cb1
TA
1316 ret = false;
1317 } else {
1318 ret = true;
aef3d51e 1319 }
aef3d51e 1320 } else if (WIFSIGNALED(status)) {
9f1f54b0 1321 ERROR("dump signaled with %d", WTERMSIG(status));
5af85cb1 1322 ret = false;
aef3d51e 1323 } else {
9f1f54b0 1324 ERROR("unknown dump exit %d", status);
5af85cb1 1325 ret = false;
aef3d51e 1326 }
5af85cb1
TA
1327
1328 if (!ret)
1329 ERROR("criu output: %s", buf);
1330 return ret;
aef3d51e 1331 }
5af85cb1
TA
1332fail:
1333 close(criuout[0]);
1334 close(criuout[1]);
1335 rmdir(opts->directory);
0e4adc1a 1336 free(criu_version);
5af85cb1 1337 return false;
aef3d51e
TA
1338}
1339
/*
 * Pre-dump entry point: hands @c and @opts to do_dump() with the "pre-dump"
 * action and reports its result unchanged.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool predumped;

	predumped = do_dump(c, "pre-dump", opts);

	return predumped;
}
1344
b2c3710f 1345bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
1346{
1347 char path[PATH_MAX];
1348 int ret;
1349
b2c3710f 1350 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
1351 if (ret < 0 || ret >= sizeof(path))
1352 return false;
1353
1354 if (access(path, F_OK) == 0) {
9f1f54b0 1355 ERROR("please use a fresh directory for the dump directory");
aef3d51e
TA
1356 return false;
1357 }
1358
b2c3710f 1359 return do_dump(c, "dump", opts);
aef3d51e
TA
1360}
1361
/*
 * Restore container @c from a checkpoint described by @opts.
 *
 * Forks a child that runs do_restore() and reports the restore status back
 * over a pipe as a wait()-style status word.
 *
 * Returns true when that status says the restore exited with 0. Returns
 * false when the caller is not root, when pipe or fork setup fails, when the
 * status cannot be read, or when it reports a failure.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status;
	ssize_t nread;
	int pipefd[2];
	char *criu_version = NULL;

	if (!criu_ok(c, &criu_version))
		return false;

	if (geteuid()) {
		ERROR("Must be root to restore");
		goto err_free;
	}

	if (pipe(pipefd)) {
		ERROR("failed to create pipe");
		goto err_free;
	}

	pid = fork();
	if (pid < 0) {
		SYSERROR("fork failed");
		close(pipefd[0]);
		close(pipefd[1]);
		goto err_free;
	}

	if (pid == 0) {
		close(pipefd[0]);
		/* this never returns */
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);

	/* The child works on its own copy; the parent no longer needs it. */
	free(criu_version);
	criu_version = NULL;

	nread = read(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (nread != (ssize_t)sizeof(status)) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	/* If the criu process was killed or exited nonzero, wait() for the
	 * handler, since the restore process died. Otherwise, we don't need to
	 * wait, since the child becomes the monitor process.
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;
	return true;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");
	return false;

err_free:
	free(criu_version);
	return false;
}