]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
Merge pull request #1072 from adrianreber/master
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
24#include <assert.h>
25#include <linux/limits.h>
26#include <sched.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/mount.h>
31#include <sys/types.h>
32#include <sys/wait.h>
33#include <unistd.h>
34
35#include "config.h"
36
4ec31c52 37#include "bdev/bdev.h"
e29fe1dd
TA
38#include "cgroup.h"
39#include "conf.h"
dc259399 40#include "commands.h"
e29fe1dd
TA
41#include "criu.h"
42#include "log.h"
43#include "lxc.h"
44#include "lxclock.h"
45#include "network.h"
46#include "utils.h"
47
73d46752
TA
/* Minimum released criu version accepted by criu_version_ok(). */
#define CRIU_VERSION "2.0"

/*
 * Alternative minimum for criu builds that report a "GitID: v<version>-<patch>"
 * line: both the version and the patchlevel must be at least these values.
 */
#define CRIU_GITID_VERSION "2.0"
#define CRIU_GITID_PATCHLEVEL 0

/* First criu version for which --skip-in-flight is passed when dumping. */
#define CRIU_IN_FLIGHT_SUPPORT "2.4"
54
e29fe1dd
TA
55lxc_log_define(lxc_criu, lxc);
56
73d46752
TA
/* Everything exec_criu() needs in order to build one criu command line. */
struct criu_opts {
	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means there is no console to hand to criu.
	 */
	char tty_id[32];

	/* restore: the file to write the init process' pid into */
	char *pidfile;

	/* restore: passed to criu as --cgroup-root */
	const char *cgroup_path;

	/* restore: fd passed to criu via --inherit-fd for the console */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
84
4b54788e
TA
85static int load_tty_major_minor(char *directory, char *output, int len)
86{
87 FILE *f;
88 char path[PATH_MAX];
89 int ret;
90
91 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
92 if (ret < 0 || ret >= sizeof(path)) {
93 ERROR("snprintf'd too many chacters: %d", ret);
94 return -1;
95 }
96
97 f = fopen(path, "r");
98 if (!f) {
99 /* This means we're coming from a liblxc which didn't export
100 * the tty info. In this case they had to have lxc.console =
101 * none, so there's no problem restoring.
102 */
103 if (errno == ENOENT)
104 return 0;
105
106 SYSERROR("couldn't open %s", path);
107 return -1;
108 }
109
110 if (!fgets(output, len, f)) {
111 fclose(f);
112 SYSERROR("couldn't read %s", path);
113 return -1;
114 }
115
116 fclose(f);
117 return 0;
118}
119
9451eeff 120static void exec_criu(struct criu_opts *opts)
e29fe1dd
TA
121{
122 char **argv, log[PATH_MAX];
19d1509c 123 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
124 int netnr = 0;
125 struct lxc_list *it;
126
a17fa3c0
NE
127 char buf[4096], tty_info[32];
128 size_t pos;
e9195050
TA
129 /* If we are currently in a cgroup /foo/bar, and the container is in a
130 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
131 * container has an open fd that points to one of the cgroup files
132 * (systemd always opens its "root" cgroup). So, let's escape to the
133 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
134 * see all cgroups.
135 */
7103fe6f 136 if (!cgroup_escape()) {
e9195050
TA
137 ERROR("failed to escape cgroups");
138 return;
139 }
140
e29fe1dd 141 /* The command line always looks like:
19d1509c 142 * criu $(action) --tcp-established --file-locks --link-remap \
0a5fc6df 143 * --manage-cgroups=full action-script foo.sh -D $(directory) \
e29fe1dd
TA
144 * -o $(directory)/$(action).log --ext-mount-map auto
145 * --enable-external-sharing --enable-external-masters
4b54788e 146 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
147 * +1 for final NULL */
148
aef3d51e 149 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
150 /* -t pid --freeze-cgroup /lxc/ct */
151 static_args += 4;
e29fe1dd 152
aef3d51e 153 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 154 if (opts->user->predump_dir)
aef3d51e
TA
155 static_args += 2;
156
74eb576c 157 /* --page-server --address <address> --port <port> */
b2c3710f 158 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
159 static_args += 5;
160
aef3d51e 161 /* --leave-running (only for final dump) */
b2c3710f 162 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 163 static_args++;
4b54788e
TA
164
165 /* --external tty[88,4] */
166 if (opts->tty_id[0])
167 static_args += 2;
19d1509c
TA
168
169 /* --force-irmap */
170 if (!opts->user->preserves_inodes)
171 static_args++;
e29fe1dd
TA
172 } else if (strcmp(opts->action, "restore") == 0) {
173 /* --root $(lxc_mount_point) --restore-detached
13389b29
TA
174 * --restore-sibling --pidfile $foo --cgroup-root $foo
175 * --lsm-profile apparmor:whatever
176 */
177 static_args += 10;
4b54788e
TA
178
179 tty_info[0] = 0;
b2c3710f 180 if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
4b54788e
TA
181 return;
182
183 /* --inherit-fd fd[%d]:tty[%s] */
184 if (tty_info[0])
185 static_args += 2;
e29fe1dd
TA
186 } else {
187 return;
188 }
189
b2c3710f 190 if (opts->user->verbose)
e29fe1dd
TA
191 static_args++;
192
b9ee6643
TA
193 if (opts->user->action_script)
194 static_args += 2;
195
b2c3710f 196 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd
TA
197 if (ret < 0 || ret >= PATH_MAX) {
198 ERROR("logfile name too long\n");
199 return;
200 }
201
202 argv = malloc(static_args * sizeof(*argv));
203 if (!argv)
204 return;
205
206 memset(argv, 0, static_args * sizeof(*argv));
207
208#define DECLARE_ARG(arg) \
209 do { \
210 if (arg == NULL) { \
211 ERROR("Got NULL argument for criu"); \
212 goto err; \
213 } \
214 argv[argc++] = strdup(arg); \
215 if (!argv[argc-1]) \
216 goto err; \
217 } while (0)
218
219 argv[argc++] = on_path("criu", NULL);
220 if (!argv[argc-1]) {
221 ERROR("Couldn't find criu binary\n");
222 goto err;
223 }
224
225 DECLARE_ARG(opts->action);
226 DECLARE_ARG("--tcp-established");
227 DECLARE_ARG("--file-locks");
228 DECLARE_ARG("--link-remap");
0a5fc6df 229 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
230 DECLARE_ARG("--ext-mount-map");
231 DECLARE_ARG("auto");
232 DECLARE_ARG("--enable-external-sharing");
233 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
234 DECLARE_ARG("--enable-fs");
235 DECLARE_ARG("hugetlbfs");
5b454329
TA
236 DECLARE_ARG("--enable-fs");
237 DECLARE_ARG("tracefs");
e29fe1dd 238 DECLARE_ARG("-D");
b2c3710f 239 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
240 DECLARE_ARG("-o");
241 DECLARE_ARG(log);
242
b2c3710f 243 if (opts->user->verbose)
e29fe1dd
TA
244 DECLARE_ARG("-vvvvvv");
245
b9ee6643
TA
246 if (opts->user->action_script) {
247 DECLARE_ARG("--action-script");
248 DECLARE_ARG(opts->user->action_script);
249 }
250
aef3d51e 251 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 252 char pid[32], *freezer_relative;
e29fe1dd
TA
253
254 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
255 goto err;
256
257 DECLARE_ARG("-t");
258 DECLARE_ARG(pid);
dc259399
TA
259
260 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
261 opts->c->config_path,
262 "freezer");
263 if (!freezer_relative) {
264 ERROR("failed getting freezer path");
265 goto err;
266 }
267
268 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
269 if (ret < 0 || ret >= sizeof(log))
270 goto err;
271
f1954503
AR
272 if (!opts->user->disable_skip_in_flight &&
273 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
274 DECLARE_ARG("--skip-in-flight");
275
dc259399
TA
276 DECLARE_ARG("--freeze-cgroup");
277 DECLARE_ARG(log);
278
4b54788e 279 if (opts->tty_id[0]) {
36d2096c
TA
280 DECLARE_ARG("--ext-mount-map");
281 DECLARE_ARG("/dev/console:console");
282
4b54788e
TA
283 DECLARE_ARG("--external");
284 DECLARE_ARG(opts->tty_id);
285 }
286
b2c3710f 287 if (opts->user->predump_dir) {
aef3d51e 288 DECLARE_ARG("--prev-images-dir");
b2c3710f 289 DECLARE_ARG(opts->user->predump_dir);
74eb576c 290 }
4c0c0319 291
b2c3710f 292 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
293 DECLARE_ARG("--page-server");
294 DECLARE_ARG("--address");
b2c3710f 295 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 296 DECLARE_ARG("--port");
b2c3710f 297 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 298 }
aef3d51e 299
19d1509c
TA
300 if (!opts->user->preserves_inodes)
301 DECLARE_ARG("--force-irmap");
302
aef3d51e 303 /* only for final dump */
b2c3710f 304 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
305 DECLARE_ARG("--leave-running");
306 } else if (strcmp(opts->action, "restore") == 0) {
307 void *m;
308 int additional;
13389b29 309 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
310
311 DECLARE_ARG("--root");
312 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
313 DECLARE_ARG("--restore-detached");
314 DECLARE_ARG("--restore-sibling");
315 DECLARE_ARG("--pidfile");
316 DECLARE_ARG(opts->pidfile);
317 DECLARE_ARG("--cgroup-root");
318 DECLARE_ARG(opts->cgroup_path);
319
4b54788e 320 if (tty_info[0]) {
97e4f1a9
TA
321 if (opts->console_fd < 0) {
322 ERROR("lxc.console configured on source host but not target");
323 goto err;
324 }
325
4b54788e
TA
326 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info);
327 if (ret < 0 || ret >= sizeof(buf))
328 goto err;
329
330 DECLARE_ARG("--inherit-fd");
331 DECLARE_ARG(buf);
332 }
333 if (opts->console_name) {
334 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
335 SYSERROR("sprintf'd too many bytes");
336 }
337 DECLARE_ARG("--ext-mount-map");
338 DECLARE_ARG(buf);
339 }
340
13389b29
TA
341 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
342
343 if (lxc_conf->lsm_aa_profile)
344 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
345 else
346 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
347
348 if (ret < 0 || ret >= sizeof(buf))
349 goto err;
350
351 DECLARE_ARG("--lsm-profile");
352 DECLARE_ARG(buf);
353 }
354
e29fe1dd
TA
355 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
356
fa071249
TA
357 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
358 if (!m)
359 goto err;
e29fe1dd
TA
360 argv = m;
361
362 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
363 char eth[128], *veth;
364 struct lxc_netdev *n = it->elem;
365
65b20221
TA
366 if (n->type != LXC_NET_VETH)
367 continue;
368
e29fe1dd
TA
369 if (n->name) {
370 if (strlen(n->name) >= sizeof(eth))
371 goto err;
372 strncpy(eth, n->name, sizeof(eth));
373 } else
374 sprintf(eth, "eth%d", netnr);
375
376 veth = n->priv.veth_attr.pair;
377
c1fd648d
TA
378 if (n->link)
379 ret = snprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, n->link);
380 else
381 ret = snprintf(buf, sizeof(buf), "%s=%s", eth, veth);
e29fe1dd
TA
382 if (ret < 0 || ret >= sizeof(buf))
383 goto err;
384
385 DECLARE_ARG("--veth-pair");
386 DECLARE_ARG(buf);
387 }
388
389 }
390
391 argv[argc] = NULL;
392
cf4b07a5 393 buf[0] = 0;
a17fa3c0 394 pos = 0;
72a30576 395
cf4b07a5 396 for (i = 0; argv[i]; i++) {
72a30576
NE
397 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
398 if (ret < 0 || ret >= sizeof(buf) - pos)
399 goto err;
400 else
401 pos += ret;
cf4b07a5
TA
402 }
403
404 INFO("execing: %s", buf);
405
e29fe1dd
TA
406#undef DECLARE_ARG
407 execv(argv[0], argv);
408err:
e29fe1dd
TA
409 for (i = 0; argv[i]; i++)
410 free(argv[i]);
411 free(argv);
412}
413
8ba5ced7
TA
414/*
415 * Check to see if the criu version is recent enough for all the features we
416 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
417 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
418 * things potentially before a version is released with a particular feature.
419 *
420 * The intent is that when criu development slows down, we can drop this, but
421 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
422 *
423 * Note: If version != NULL criu_version() stores the detected criu version in
424 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 425 */
5407e2ab 426static bool criu_version_ok(char **version)
8ba5ced7
TA
427{
428 int pipes[2];
429 pid_t pid;
430
431 if (pipe(pipes) < 0) {
432 SYSERROR("pipe() failed");
433 return false;
434 }
435
436 pid = fork();
437 if (pid < 0) {
438 SYSERROR("fork() failed");
439 return false;
440 }
441
442 if (pid == 0) {
443 char *args[] = { "criu", "--version", NULL };
755fa453 444 char *path;
8ba5ced7
TA
445 close(pipes[0]);
446
447 close(STDERR_FILENO);
448 if (dup2(pipes[1], STDOUT_FILENO) < 0)
449 exit(1);
450
755fa453 451 path = on_path("criu", NULL);
d9b32b09
SH
452 if (!path)
453 exit(1);
454
755fa453 455 execv(path, args);
8ba5ced7
TA
456 exit(1);
457 } else {
458 FILE *f;
5407e2ab 459 char *tmp;
8ba5ced7
TA
460 int patch;
461
462 close(pipes[1]);
463 if (wait_for_pid(pid) < 0) {
464 close(pipes[0]);
4eae4051 465 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
466 return false;
467 }
468
469 f = fdopen(pipes[0], "r");
470 if (!f) {
471 close(pipes[0]);
472 return false;
473 }
474
5407e2ab
CB
475 tmp = malloc(1024);
476 if (!tmp) {
477 fclose(f);
478 return false;
479 }
480
481 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
482 goto version_error;
483
484 if (fgetc(f) != '\n')
485 goto version_error;
486
5407e2ab 487 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
488 goto version_match;
489
5407e2ab 490 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
491 goto version_error;
492
493 if (fgetc(f) != '-')
494 goto version_error;
495
496 if (fscanf(f, "%d", &patch) != 1)
497 goto version_error;
498
5407e2ab 499 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
500 goto version_error;
501
502 if (patch < CRIU_GITID_PATCHLEVEL)
503 goto version_error;
504
505version_match:
3158ab5b 506 fclose(f);
5407e2ab
CB
507 if (!version)
508 free(tmp);
509 else
510 *version = tmp;
8ba5ced7
TA
511 return true;
512
513version_error:
3158ab5b 514 fclose(f);
5407e2ab 515 free(tmp);
8ba5ced7
TA
516 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore\n");
517 return false;
518 }
519}
520
e29fe1dd
TA
521/* Check and make sure the container has a configuration that we know CRIU can
522 * dump. */
f1954503 523static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
524{
525 struct lxc_list *it;
e29fe1dd 526
f1954503 527 if (!criu_version_ok(criu_version))
8ba5ced7
TA
528 return false;
529
e29fe1dd
TA
530 if (geteuid()) {
531 ERROR("Must be root to checkpoint\n");
532 return false;
533 }
534
535 /* We only know how to restore containers with veth networks. */
536 lxc_list_for_each(it, &c->lxc_conf->network) {
537 struct lxc_netdev *n = it->elem;
65b20221
TA
538 switch(n->type) {
539 case LXC_NET_VETH:
540 case LXC_NET_NONE:
541 case LXC_NET_EMPTY:
542 break;
543 default:
e29fe1dd
TA
544 ERROR("Found network that is not VETH or NONE\n");
545 return false;
546 }
547 }
548
e29fe1dd
TA
549 return true;
550}
551
e29fe1dd
TA
552static bool restore_net_info(struct lxc_container *c)
553{
554 struct lxc_list *it;
555 bool has_error = true;
556
557 if (container_mem_lock(c))
558 return false;
559
560 lxc_list_for_each(it, &c->lxc_conf->network) {
561 struct lxc_netdev *netdev = it->elem;
562 char template[IFNAMSIZ];
65b20221
TA
563
564 if (netdev->type != LXC_NET_VETH)
565 continue;
566
e29fe1dd
TA
567 snprintf(template, sizeof(template), "vethXXXXXX");
568
569 if (!netdev->priv.veth_attr.pair)
570 netdev->priv.veth_attr.pair = lxc_mkifname(template);
571
572 if (!netdev->priv.veth_attr.pair)
573 goto out_unlock;
574 }
575
576 has_error = false;
577
578out_unlock:
579 container_mem_unlock(c);
580 return !has_error;
581}
582
aef3d51e
TA
583// do_restore never returns, the calling process is used as the
584// monitor process. do_restore calls exit() if it fails.
f1954503 585void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd
TA
586{
587 pid_t pid;
588 char pidfile[L_tmpnam];
589 struct lxc_handler *handler;
3d9a5c85 590 int status, pipes[2] = {-1, -1};
e29fe1dd
TA
591
592 if (!tmpnam(pidfile))
593 goto out;
594
595 handler = lxc_init(c->name, c->lxc_conf, c->config_path);
596 if (!handler)
597 goto out;
598
599 if (!cgroup_init(handler)) {
600 ERROR("failed initing cgroups");
601 goto out_fini_handler;
602 }
603
604 if (!cgroup_create(handler)) {
605 ERROR("failed creating groups");
606 goto out_fini_handler;
607 }
608
609 if (!restore_net_info(c)) {
610 ERROR("failed restoring network info");
611 goto out_fini_handler;
612 }
613
614 resolve_clone_flags(handler);
615
3d9a5c85
TA
616 if (pipe(pipes) < 0) {
617 SYSERROR("pipe() failed");
618 goto out_fini_handler;
619 }
620
e29fe1dd
TA
621 pid = fork();
622 if (pid < 0)
623 goto out_fini_handler;
624
625 if (pid == 0) {
626 struct criu_opts os;
627 struct lxc_rootfs *rootfs;
4b54788e 628 int flags;
e29fe1dd 629
3d9a5c85
TA
630 close(status_pipe);
631 status_pipe = -1;
632
633 close(pipes[0]);
634 pipes[0] = -1;
635 if (dup2(pipes[1], STDERR_FILENO) < 0) {
636 SYSERROR("dup2 failed");
637 goto out_fini_handler;
638 }
639
640 if (dup2(pipes[1], STDOUT_FILENO) < 0) {
641 SYSERROR("dup2 failed");
642 goto out_fini_handler;
643 }
e29fe1dd
TA
644
645 if (unshare(CLONE_NEWNS))
646 goto out_fini_handler;
647
648 /* CRIU needs the lxc root bind mounted so that it is the root of some
649 * mount. */
650 rootfs = &c->lxc_conf->rootfs;
651
652 if (rootfs_is_blockdev(c->lxc_conf)) {
653 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
654 goto out_fini_handler;
655 } else {
656 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
657 goto out_fini_handler;
658
659 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
660 SYSERROR("remount / to private failed");
661 goto out_fini_handler;
662 }
663
664 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
665 rmdir(rootfs->mount);
666 goto out_fini_handler;
667 }
668 }
669
670 os.action = "restore";
b2c3710f 671 os.user = opts;
e29fe1dd
TA
672 os.c = c;
673 os.pidfile = pidfile;
e29fe1dd 674 os.cgroup_path = cgroup_canonical_path(handler);
4b54788e 675 os.console_fd = c->lxc_conf->console.slave;
f1954503 676 os.criu_version = criu_version;
4b54788e 677
97e4f1a9
TA
678 if (os.console_fd >= 0) {
679 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
680 * via --inherit-fd, so we don't want it to close.
681 */
682 flags = fcntl(os.console_fd, F_GETFD);
683 if (flags < 0) {
684 SYSERROR("F_GETFD failed: %d", os.console_fd);
685 goto out_fini_handler;
686 }
4b54788e 687
97e4f1a9 688 flags &= ~FD_CLOEXEC;
4b54788e 689
97e4f1a9
TA
690 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
691 SYSERROR("F_SETFD failed");
692 goto out_fini_handler;
693 }
4b54788e
TA
694 }
695 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
696
697 /* exec_criu() returning is an error */
7103fe6f 698 exec_criu(&os);
e29fe1dd
TA
699 umount(rootfs->mount);
700 rmdir(rootfs->mount);
701 goto out_fini_handler;
702 } else {
703 int ret;
704 char title[2048];
705
3d9a5c85
TA
706 close(pipes[1]);
707 pipes[1] = -1;
708
e29fe1dd
TA
709 pid_t w = waitpid(pid, &status, 0);
710 if (w == -1) {
711 SYSERROR("waitpid");
712 goto out_fini_handler;
713 }
714
3d9a5c85
TA
715 ret = write(status_pipe, &status, sizeof(status));
716 close(status_pipe);
717 status_pipe = -1;
e29fe1dd
TA
718
719 if (sizeof(status) != ret) {
720 SYSERROR("failed to write all of status");
721 goto out_fini_handler;
722 }
723
724 if (WIFEXITED(status)) {
725 if (WEXITSTATUS(status)) {
3d9a5c85
TA
726 char buf[4096];
727 int n;
728
729 n = read(pipes[0], buf, sizeof(buf));
730 if (n < 0) {
731 SYSERROR("failed reading from criu stderr");
732 goto out_fini_handler;
733 }
734
735 buf[n] = 0;
736
737 ERROR("criu process exited %d, output:\n%s\n", WEXITSTATUS(status), buf);
e29fe1dd
TA
738 goto out_fini_handler;
739 } else {
740 int ret;
741 FILE *f = fopen(pidfile, "r");
742 if (!f) {
743 SYSERROR("couldn't read restore's init pidfile %s\n", pidfile);
744 goto out_fini_handler;
745 }
746
747 ret = fscanf(f, "%d", (int*) &handler->pid);
748 fclose(f);
59c2d406
TA
749 if (unlink(pidfile) < 0 && errno != ENOENT)
750 SYSERROR("unlinking pidfile failed");
751
e29fe1dd
TA
752 if (ret != 1) {
753 ERROR("reading restore pid failed");
754 goto out_fini_handler;
755 }
756
f8a41688
TA
757 if (lxc_set_state(c->name, handler, RUNNING)) {
758 ERROR("error setting running state after restore");
e29fe1dd 759 goto out_fini_handler;
f8a41688 760 }
e29fe1dd
TA
761 }
762 } else {
763 ERROR("CRIU was killed with signal %d\n", WTERMSIG(status));
764 goto out_fini_handler;
765 }
766
3d9a5c85
TA
767 close(pipes[0]);
768
e29fe1dd
TA
769 /*
770 * See comment in lxcapi_start; we don't care if these
771 * fail because it's just a beauty thing. We just
772 * assign the return here to silence potential.
773 */
774 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
775 ret = setproctitle(title);
776
777 ret = lxc_poll(c->name, handler);
778 if (ret)
779 lxc_abort(c->name, handler);
780 lxc_fini(c->name, handler);
781 exit(ret);
782 }
783
784out_fini_handler:
3d9a5c85
TA
785 if (pipes[0] >= 0)
786 close(pipes[0]);
787 if (pipes[1] >= 0)
788 close(pipes[1]);
789
e29fe1dd 790 lxc_fini(c->name, handler);
59c2d406
TA
791 if (unlink(pidfile) < 0 && errno != ENOENT)
792 SYSERROR("unlinking pidfile failed");
e29fe1dd
TA
793
794out:
3d9a5c85 795 if (status_pipe >= 0) {
e29fe1dd 796 status = 1;
3d9a5c85 797 if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) {
e29fe1dd
TA
798 SYSERROR("writing status failed");
799 }
3d9a5c85 800 close(status_pipe);
e29fe1dd
TA
801 }
802
803 exit(1);
804}
aef3d51e 805
4b54788e
TA
806static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
807{
808 FILE *f;
809 char path[PATH_MAX];
810 int ret;
811 struct stat sb;
812
813 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
814 tty_id[0] = 0;
815 return 0;
816 }
817
818 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
819 if (ret < 0 || ret >= sizeof(path)) {
820 ERROR("snprintf'd too many chacters: %d", ret);
821 return -1;
822 }
823
824 ret = stat(path, &sb);
825 if (ret < 0) {
826 SYSERROR("stat of %s failed", path);
827 return -1;
828 }
829
830 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
831 if (ret < 0 || ret >= sizeof(path)) {
832 ERROR("snprintf'd too many characters: %d", ret);
833 return -1;
834 }
835
f03280a7
TA
836 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
837 (long long unsigned) sb.st_rdev,
838 (long long unsigned) sb.st_dev);
4b54788e
TA
839 if (ret < 0 || ret >= sizeof(path)) {
840 ERROR("snprintf'd too many characters: %d", ret);
841 return -1;
842 }
843
844 f = fopen(path, "w");
845 if (!f) {
846 SYSERROR("failed to open %s", path);
847 return -1;
848 }
849
850 ret = fprintf(f, "%s", tty_id);
851 fclose(f);
852 if (ret < 0)
853 SYSERROR("failed to write to %s", path);
854 return ret;
855}
856
aef3d51e 857/* do one of either predump or a regular dump */
b2c3710f 858static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e
TA
859{
860 pid_t pid;
f1954503 861 char *criu_version = NULL;
aef3d51e 862
f1954503 863 if (!criu_ok(c, &criu_version))
aef3d51e
TA
864 return false;
865
b2c3710f 866 if (mkdir_p(opts->directory, 0700) < 0)
aef3d51e
TA
867 return false;
868
869 pid = fork();
870 if (pid < 0) {
871 SYSERROR("fork failed");
872 return false;
873 }
874
875 if (pid == 0) {
876 struct criu_opts os;
877
878 os.action = mode;
b2c3710f 879 os.user = opts;
aef3d51e 880 os.c = c;
4b54788e 881 os.console_name = c->lxc_conf->console.path;
f1954503 882 os.criu_version = criu_version;
74eb576c 883
b2c3710f 884 if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
4b54788e 885 exit(1);
aef3d51e
TA
886
887 /* exec_criu() returning is an error */
7103fe6f 888 exec_criu(&os);
aef3d51e
TA
889 exit(1);
890 } else {
891 int status;
892 pid_t w = waitpid(pid, &status, 0);
893 if (w == -1) {
894 SYSERROR("waitpid");
895 return false;
896 }
897
898 if (WIFEXITED(status)) {
899 if (WEXITSTATUS(status)) {
900 ERROR("dump failed with %d\n", WEXITSTATUS(status));
901 return false;
902 }
903
904 return true;
905 } else if (WIFSIGNALED(status)) {
906 ERROR("dump signaled with %d\n", WTERMSIG(status));
907 return false;
908 } else {
909 ERROR("unknown dump exit %d\n", status);
910 return false;
911 }
912 }
913}
914
/* Run criu's "pre-dump" action on c; true if criu exited successfully. */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped = do_dump(c, "pre-dump", opts);

	return dumped;
}
919
b2c3710f 920bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
921{
922 char path[PATH_MAX];
923 int ret;
924
b2c3710f 925 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
926 if (ret < 0 || ret >= sizeof(path))
927 return false;
928
929 if (access(path, F_OK) == 0) {
930 ERROR("please use a fresh directory for the dump directory\n");
931 return false;
932 }
933
b2c3710f 934 return do_dump(c, "dump", opts);
aef3d51e
TA
935}
936
/*
 * Restore container c from the criu images described by opts.
 *
 * The restore itself runs in a forked child (do_restore()), which reports
 * criu's raw wait status back through a pipe. On success that child stays
 * alive as the container's monitor and is deliberately not waited for.
 *
 * Returns true if criu exited with status 0, false otherwise.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status, nread;
	int pipefd[2];
	bool ret = false;
	char *criu_version = NULL;

	if (!criu_ok(c, &criu_version))
		goto out;

	if (geteuid()) {
		ERROR("Must be root to restore\n");
		goto out;
	}

	if (pipe(pipefd)) {
		ERROR("failed to create pipe");
		goto out;
	}

	pid = fork();
	if (pid < 0) {
		close(pipefd[0]);
		close(pipefd[1]);
		goto out;
	}

	if (pid == 0) {
		close(pipefd[0]);
		// this never returns
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);

	nread = read(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (sizeof(status) != nread) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	// If the criu process was killed or exited nonzero, wait() for the
	// handler, since the restore process died. Otherwise, we don't need to
	// wait, since the child becomes the monitor process.
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;

	ret = true;
	goto out;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");

out:
	/* The child has its own copy after fork(); this one is ours to free. */
	free(criu_version);
	return ret;
}