]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
Merge pull request #1078 from brauner/2016-07-11/add_cgns
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
24#include <assert.h>
25#include <linux/limits.h>
26#include <sched.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/mount.h>
31#include <sys/types.h>
32#include <sys/wait.h>
33#include <unistd.h>
34
35#include "config.h"
36
4ec31c52 37#include "bdev/bdev.h"
e29fe1dd
TA
38#include "cgroup.h"
39#include "conf.h"
dc259399 40#include "commands.h"
e29fe1dd
TA
41#include "criu.h"
42#include "log.h"
43#include "lxc.h"
44#include "lxclock.h"
45#include "network.h"
46#include "utils.h"
47
73d46752
TA
48#define CRIU_VERSION "2.0"
49
50#define CRIU_GITID_VERSION "2.0"
51#define CRIU_GITID_PATCHLEVEL 0
52
e29fe1dd
TA
53lxc_log_define(lxc_criu, lxc);
54
73d46752
TA
/* Everything exec_criu() needs in order to build one criu command line. */
struct criu_opts {
	/* Which criu action to run; exec_criu() handles "dump", "pre-dump"
	 * and "restore".
	 */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container the action is applied to */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e.
	 * "tty[${rdev}:${dev}]"; an empty string means no console is passed.
	 */
	char tty_id[32];

	/* restore: the file to write the init process' pid into */
	char *pidfile;

	/* restore: value handed to criu's --cgroup-root */
	const char *cgroup_path;

	/* restore: fd that is passed to criu via --inherit-fd for the tty */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console = "none".
	 */
	char *console_name;
};
79
4b54788e
TA
/*
 * Read the first line of "<directory>/tty.info" into output.
 *
 * directory: directory that is searched for tty.info
 * output:    destination buffer; left untouched when tty.info does not exist
 * len:       size of output in bytes
 *
 * Returns 0 on success and also when tty.info does not exist, -1 on any
 * other failure.
 */
static int load_tty_major_minor(char *directory, char *output, int len)
{
	FILE *f;
	char path[PATH_MAX];
	int ret;

	ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("snprintf'd too many chacters: %d", ret);
		return -1;
	}

	f = fopen(path, "r");
	if (!f) {
		/* This means we're coming from a liblxc which didn't export
		 * the tty info. In this case they had to have lxc.console =
		 * none, so there's no problem restoring.
		 */
		if (errno == ENOENT)
			return 0;

		SYSERROR("couldn't open %s", path);
		return -1;
	}

	if (!fgets(output, len, f)) {
		/* Log before fclose(): closing the stream may overwrite the
		 * errno value left behind by the failed read.
		 */
		SYSERROR("couldn't read %s", path);
		fclose(f);
		return -1;
	}

	fclose(f);
	return 0;
}
114
9451eeff 115static void exec_criu(struct criu_opts *opts)
e29fe1dd
TA
116{
117 char **argv, log[PATH_MAX];
19d1509c 118 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
119 int netnr = 0;
120 struct lxc_list *it;
121
a17fa3c0
NE
122 char buf[4096], tty_info[32];
123 size_t pos;
e9195050
TA
124 /* If we are currently in a cgroup /foo/bar, and the container is in a
125 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
126 * container has an open fd that points to one of the cgroup files
127 * (systemd always opens its "root" cgroup). So, let's escape to the
128 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
129 * see all cgroups.
130 */
7103fe6f 131 if (!cgroup_escape()) {
e9195050
TA
132 ERROR("failed to escape cgroups");
133 return;
134 }
135
e29fe1dd 136 /* The command line always looks like:
19d1509c 137 * criu $(action) --tcp-established --file-locks --link-remap \
0a5fc6df 138 * --manage-cgroups=full action-script foo.sh -D $(directory) \
e29fe1dd
TA
139 * -o $(directory)/$(action).log --ext-mount-map auto
140 * --enable-external-sharing --enable-external-masters
4b54788e 141 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
142 * +1 for final NULL */
143
aef3d51e 144 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
145 /* -t pid --freeze-cgroup /lxc/ct */
146 static_args += 4;
e29fe1dd 147
aef3d51e 148 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 149 if (opts->user->predump_dir)
aef3d51e
TA
150 static_args += 2;
151
74eb576c 152 /* --page-server --address <address> --port <port> */
b2c3710f 153 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
154 static_args += 5;
155
aef3d51e 156 /* --leave-running (only for final dump) */
b2c3710f 157 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 158 static_args++;
4b54788e
TA
159
160 /* --external tty[88,4] */
161 if (opts->tty_id[0])
162 static_args += 2;
19d1509c
TA
163
164 /* --force-irmap */
165 if (!opts->user->preserves_inodes)
166 static_args++;
e29fe1dd
TA
167 } else if (strcmp(opts->action, "restore") == 0) {
168 /* --root $(lxc_mount_point) --restore-detached
13389b29
TA
169 * --restore-sibling --pidfile $foo --cgroup-root $foo
170 * --lsm-profile apparmor:whatever
171 */
172 static_args += 10;
4b54788e
TA
173
174 tty_info[0] = 0;
b2c3710f 175 if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
4b54788e
TA
176 return;
177
178 /* --inherit-fd fd[%d]:tty[%s] */
179 if (tty_info[0])
180 static_args += 2;
e29fe1dd
TA
181 } else {
182 return;
183 }
184
b2c3710f 185 if (opts->user->verbose)
e29fe1dd
TA
186 static_args++;
187
b9ee6643
TA
188 if (opts->user->action_script)
189 static_args += 2;
190
b2c3710f 191 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd
TA
192 if (ret < 0 || ret >= PATH_MAX) {
193 ERROR("logfile name too long\n");
194 return;
195 }
196
197 argv = malloc(static_args * sizeof(*argv));
198 if (!argv)
199 return;
200
201 memset(argv, 0, static_args * sizeof(*argv));
202
203#define DECLARE_ARG(arg) \
204 do { \
205 if (arg == NULL) { \
206 ERROR("Got NULL argument for criu"); \
207 goto err; \
208 } \
209 argv[argc++] = strdup(arg); \
210 if (!argv[argc-1]) \
211 goto err; \
212 } while (0)
213
214 argv[argc++] = on_path("criu", NULL);
215 if (!argv[argc-1]) {
216 ERROR("Couldn't find criu binary\n");
217 goto err;
218 }
219
220 DECLARE_ARG(opts->action);
221 DECLARE_ARG("--tcp-established");
222 DECLARE_ARG("--file-locks");
223 DECLARE_ARG("--link-remap");
0a5fc6df 224 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
225 DECLARE_ARG("--ext-mount-map");
226 DECLARE_ARG("auto");
227 DECLARE_ARG("--enable-external-sharing");
228 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
229 DECLARE_ARG("--enable-fs");
230 DECLARE_ARG("hugetlbfs");
5b454329
TA
231 DECLARE_ARG("--enable-fs");
232 DECLARE_ARG("tracefs");
e29fe1dd 233 DECLARE_ARG("-D");
b2c3710f 234 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
235 DECLARE_ARG("-o");
236 DECLARE_ARG(log);
237
b2c3710f 238 if (opts->user->verbose)
e29fe1dd
TA
239 DECLARE_ARG("-vvvvvv");
240
b9ee6643
TA
241 if (opts->user->action_script) {
242 DECLARE_ARG("--action-script");
243 DECLARE_ARG(opts->user->action_script);
244 }
245
aef3d51e 246 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 247 char pid[32], *freezer_relative;
e29fe1dd
TA
248
249 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
250 goto err;
251
252 DECLARE_ARG("-t");
253 DECLARE_ARG(pid);
dc259399
TA
254
255 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
256 opts->c->config_path,
257 "freezer");
258 if (!freezer_relative) {
259 ERROR("failed getting freezer path");
260 goto err;
261 }
262
263 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
264 if (ret < 0 || ret >= sizeof(log))
265 goto err;
266
267 DECLARE_ARG("--freeze-cgroup");
268 DECLARE_ARG(log);
269
4b54788e 270 if (opts->tty_id[0]) {
36d2096c
TA
271 DECLARE_ARG("--ext-mount-map");
272 DECLARE_ARG("/dev/console:console");
273
4b54788e
TA
274 DECLARE_ARG("--external");
275 DECLARE_ARG(opts->tty_id);
276 }
277
b2c3710f 278 if (opts->user->predump_dir) {
aef3d51e 279 DECLARE_ARG("--prev-images-dir");
b2c3710f 280 DECLARE_ARG(opts->user->predump_dir);
74eb576c 281 }
4c0c0319 282
b2c3710f 283 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
284 DECLARE_ARG("--page-server");
285 DECLARE_ARG("--address");
b2c3710f 286 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 287 DECLARE_ARG("--port");
b2c3710f 288 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 289 }
aef3d51e 290
19d1509c
TA
291 if (!opts->user->preserves_inodes)
292 DECLARE_ARG("--force-irmap");
293
aef3d51e 294 /* only for final dump */
b2c3710f 295 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
296 DECLARE_ARG("--leave-running");
297 } else if (strcmp(opts->action, "restore") == 0) {
298 void *m;
299 int additional;
13389b29 300 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
301
302 DECLARE_ARG("--root");
303 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
304 DECLARE_ARG("--restore-detached");
305 DECLARE_ARG("--restore-sibling");
306 DECLARE_ARG("--pidfile");
307 DECLARE_ARG(opts->pidfile);
308 DECLARE_ARG("--cgroup-root");
309 DECLARE_ARG(opts->cgroup_path);
310
4b54788e 311 if (tty_info[0]) {
97e4f1a9
TA
312 if (opts->console_fd < 0) {
313 ERROR("lxc.console configured on source host but not target");
314 goto err;
315 }
316
4b54788e
TA
317 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info);
318 if (ret < 0 || ret >= sizeof(buf))
319 goto err;
320
321 DECLARE_ARG("--inherit-fd");
322 DECLARE_ARG(buf);
323 }
324 if (opts->console_name) {
325 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
326 SYSERROR("sprintf'd too many bytes");
327 }
328 DECLARE_ARG("--ext-mount-map");
329 DECLARE_ARG(buf);
330 }
331
13389b29
TA
332 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
333
334 if (lxc_conf->lsm_aa_profile)
335 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
336 else
337 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
338
339 if (ret < 0 || ret >= sizeof(buf))
340 goto err;
341
342 DECLARE_ARG("--lsm-profile");
343 DECLARE_ARG(buf);
344 }
345
e29fe1dd
TA
346 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
347
fa071249
TA
348 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
349 if (!m)
350 goto err;
e29fe1dd
TA
351 argv = m;
352
353 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
354 char eth[128], *veth;
355 struct lxc_netdev *n = it->elem;
356
65b20221
TA
357 if (n->type != LXC_NET_VETH)
358 continue;
359
e29fe1dd
TA
360 if (n->name) {
361 if (strlen(n->name) >= sizeof(eth))
362 goto err;
363 strncpy(eth, n->name, sizeof(eth));
364 } else
365 sprintf(eth, "eth%d", netnr);
366
367 veth = n->priv.veth_attr.pair;
368
c1fd648d
TA
369 if (n->link)
370 ret = snprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, n->link);
371 else
372 ret = snprintf(buf, sizeof(buf), "%s=%s", eth, veth);
e29fe1dd
TA
373 if (ret < 0 || ret >= sizeof(buf))
374 goto err;
375
376 DECLARE_ARG("--veth-pair");
377 DECLARE_ARG(buf);
378 }
379
380 }
381
382 argv[argc] = NULL;
383
cf4b07a5 384 buf[0] = 0;
a17fa3c0 385 pos = 0;
72a30576 386
cf4b07a5 387 for (i = 0; argv[i]; i++) {
72a30576
NE
388 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
389 if (ret < 0 || ret >= sizeof(buf) - pos)
390 goto err;
391 else
392 pos += ret;
cf4b07a5
TA
393 }
394
395 INFO("execing: %s", buf);
396
e29fe1dd
TA
397#undef DECLARE_ARG
398 execv(argv[0], argv);
399err:
e29fe1dd
TA
400 for (i = 0; argv[i]; i++)
401 free(argv[i]);
402 free(argv);
403}
404
8ba5ced7
TA
405/*
406 * Check to see if the criu version is recent enough for all the features we
407 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
408 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
409 * things potentially before a version is released with a particular feature.
410 *
411 * The intent is that when criu development slows down, we can drop this, but
412 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
413 *
414 * Note: If version != NULL criu_version() stores the detected criu version in
415 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 416 */
5407e2ab 417static bool criu_version_ok(char **version)
8ba5ced7
TA
418{
419 int pipes[2];
420 pid_t pid;
421
422 if (pipe(pipes) < 0) {
423 SYSERROR("pipe() failed");
424 return false;
425 }
426
427 pid = fork();
428 if (pid < 0) {
429 SYSERROR("fork() failed");
430 return false;
431 }
432
433 if (pid == 0) {
434 char *args[] = { "criu", "--version", NULL };
755fa453 435 char *path;
8ba5ced7
TA
436 close(pipes[0]);
437
438 close(STDERR_FILENO);
439 if (dup2(pipes[1], STDOUT_FILENO) < 0)
440 exit(1);
441
755fa453 442 path = on_path("criu", NULL);
d9b32b09
SH
443 if (!path)
444 exit(1);
445
755fa453 446 execv(path, args);
8ba5ced7
TA
447 exit(1);
448 } else {
449 FILE *f;
5407e2ab 450 char *tmp;
8ba5ced7
TA
451 int patch;
452
453 close(pipes[1]);
454 if (wait_for_pid(pid) < 0) {
455 close(pipes[0]);
4eae4051 456 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
457 return false;
458 }
459
460 f = fdopen(pipes[0], "r");
461 if (!f) {
462 close(pipes[0]);
463 return false;
464 }
465
5407e2ab
CB
466 tmp = malloc(1024);
467 if (!tmp) {
468 fclose(f);
469 return false;
470 }
471
472 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
473 goto version_error;
474
475 if (fgetc(f) != '\n')
476 goto version_error;
477
5407e2ab 478 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
479 goto version_match;
480
5407e2ab 481 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
482 goto version_error;
483
484 if (fgetc(f) != '-')
485 goto version_error;
486
487 if (fscanf(f, "%d", &patch) != 1)
488 goto version_error;
489
5407e2ab 490 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
491 goto version_error;
492
493 if (patch < CRIU_GITID_PATCHLEVEL)
494 goto version_error;
495
496version_match:
3158ab5b 497 fclose(f);
5407e2ab
CB
498 if (!version)
499 free(tmp);
500 else
501 *version = tmp;
8ba5ced7
TA
502 return true;
503
504version_error:
3158ab5b 505 fclose(f);
5407e2ab 506 free(tmp);
8ba5ced7
TA
507 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore\n");
508 return false;
509 }
510}
511
e29fe1dd
TA
512/* Check and make sure the container has a configuration that we know CRIU can
513 * dump. */
73d46752 514static bool criu_ok(struct lxc_container *c)
e29fe1dd
TA
515{
516 struct lxc_list *it;
e29fe1dd 517
5407e2ab 518 if (!criu_version_ok(NULL))
8ba5ced7
TA
519 return false;
520
e29fe1dd
TA
521 if (geteuid()) {
522 ERROR("Must be root to checkpoint\n");
523 return false;
524 }
525
526 /* We only know how to restore containers with veth networks. */
527 lxc_list_for_each(it, &c->lxc_conf->network) {
528 struct lxc_netdev *n = it->elem;
65b20221
TA
529 switch(n->type) {
530 case LXC_NET_VETH:
531 case LXC_NET_NONE:
532 case LXC_NET_EMPTY:
533 break;
534 default:
e29fe1dd
TA
535 ERROR("Found network that is not VETH or NONE\n");
536 return false;
537 }
538 }
539
e29fe1dd
TA
540 return true;
541}
542
e29fe1dd
TA
543static bool restore_net_info(struct lxc_container *c)
544{
545 struct lxc_list *it;
546 bool has_error = true;
547
548 if (container_mem_lock(c))
549 return false;
550
551 lxc_list_for_each(it, &c->lxc_conf->network) {
552 struct lxc_netdev *netdev = it->elem;
553 char template[IFNAMSIZ];
65b20221
TA
554
555 if (netdev->type != LXC_NET_VETH)
556 continue;
557
e29fe1dd
TA
558 snprintf(template, sizeof(template), "vethXXXXXX");
559
560 if (!netdev->priv.veth_attr.pair)
561 netdev->priv.veth_attr.pair = lxc_mkifname(template);
562
563 if (!netdev->priv.veth_attr.pair)
564 goto out_unlock;
565 }
566
567 has_error = false;
568
569out_unlock:
570 container_mem_unlock(c);
571 return !has_error;
572}
573
aef3d51e
TA
574// do_restore never returns, the calling process is used as the
575// monitor process. do_restore calls exit() if it fails.
b2c3710f 576void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts)
e29fe1dd
TA
577{
578 pid_t pid;
579 char pidfile[L_tmpnam];
580 struct lxc_handler *handler;
3d9a5c85 581 int status, pipes[2] = {-1, -1};
e29fe1dd
TA
582
583 if (!tmpnam(pidfile))
584 goto out;
585
586 handler = lxc_init(c->name, c->lxc_conf, c->config_path);
587 if (!handler)
588 goto out;
589
590 if (!cgroup_init(handler)) {
591 ERROR("failed initing cgroups");
592 goto out_fini_handler;
593 }
594
595 if (!cgroup_create(handler)) {
596 ERROR("failed creating groups");
597 goto out_fini_handler;
598 }
599
600 if (!restore_net_info(c)) {
601 ERROR("failed restoring network info");
602 goto out_fini_handler;
603 }
604
605 resolve_clone_flags(handler);
606
3d9a5c85
TA
607 if (pipe(pipes) < 0) {
608 SYSERROR("pipe() failed");
609 goto out_fini_handler;
610 }
611
e29fe1dd
TA
612 pid = fork();
613 if (pid < 0)
614 goto out_fini_handler;
615
616 if (pid == 0) {
617 struct criu_opts os;
618 struct lxc_rootfs *rootfs;
4b54788e 619 int flags;
e29fe1dd 620
3d9a5c85
TA
621 close(status_pipe);
622 status_pipe = -1;
623
624 close(pipes[0]);
625 pipes[0] = -1;
626 if (dup2(pipes[1], STDERR_FILENO) < 0) {
627 SYSERROR("dup2 failed");
628 goto out_fini_handler;
629 }
630
631 if (dup2(pipes[1], STDOUT_FILENO) < 0) {
632 SYSERROR("dup2 failed");
633 goto out_fini_handler;
634 }
e29fe1dd
TA
635
636 if (unshare(CLONE_NEWNS))
637 goto out_fini_handler;
638
639 /* CRIU needs the lxc root bind mounted so that it is the root of some
640 * mount. */
641 rootfs = &c->lxc_conf->rootfs;
642
643 if (rootfs_is_blockdev(c->lxc_conf)) {
644 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
645 goto out_fini_handler;
646 } else {
647 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
648 goto out_fini_handler;
649
650 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
651 SYSERROR("remount / to private failed");
652 goto out_fini_handler;
653 }
654
655 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
656 rmdir(rootfs->mount);
657 goto out_fini_handler;
658 }
659 }
660
661 os.action = "restore";
b2c3710f 662 os.user = opts;
e29fe1dd
TA
663 os.c = c;
664 os.pidfile = pidfile;
e29fe1dd 665 os.cgroup_path = cgroup_canonical_path(handler);
4b54788e
TA
666 os.console_fd = c->lxc_conf->console.slave;
667
97e4f1a9
TA
668 if (os.console_fd >= 0) {
669 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
670 * via --inherit-fd, so we don't want it to close.
671 */
672 flags = fcntl(os.console_fd, F_GETFD);
673 if (flags < 0) {
674 SYSERROR("F_GETFD failed: %d", os.console_fd);
675 goto out_fini_handler;
676 }
4b54788e 677
97e4f1a9 678 flags &= ~FD_CLOEXEC;
4b54788e 679
97e4f1a9
TA
680 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
681 SYSERROR("F_SETFD failed");
682 goto out_fini_handler;
683 }
4b54788e
TA
684 }
685 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
686
687 /* exec_criu() returning is an error */
7103fe6f 688 exec_criu(&os);
e29fe1dd
TA
689 umount(rootfs->mount);
690 rmdir(rootfs->mount);
691 goto out_fini_handler;
692 } else {
693 int ret;
694 char title[2048];
695
3d9a5c85
TA
696 close(pipes[1]);
697 pipes[1] = -1;
698
e29fe1dd
TA
699 pid_t w = waitpid(pid, &status, 0);
700 if (w == -1) {
701 SYSERROR("waitpid");
702 goto out_fini_handler;
703 }
704
3d9a5c85
TA
705 ret = write(status_pipe, &status, sizeof(status));
706 close(status_pipe);
707 status_pipe = -1;
e29fe1dd
TA
708
709 if (sizeof(status) != ret) {
710 SYSERROR("failed to write all of status");
711 goto out_fini_handler;
712 }
713
714 if (WIFEXITED(status)) {
715 if (WEXITSTATUS(status)) {
3d9a5c85
TA
716 char buf[4096];
717 int n;
718
719 n = read(pipes[0], buf, sizeof(buf));
720 if (n < 0) {
721 SYSERROR("failed reading from criu stderr");
722 goto out_fini_handler;
723 }
724
725 buf[n] = 0;
726
727 ERROR("criu process exited %d, output:\n%s\n", WEXITSTATUS(status), buf);
e29fe1dd
TA
728 goto out_fini_handler;
729 } else {
730 int ret;
731 FILE *f = fopen(pidfile, "r");
732 if (!f) {
733 SYSERROR("couldn't read restore's init pidfile %s\n", pidfile);
734 goto out_fini_handler;
735 }
736
737 ret = fscanf(f, "%d", (int*) &handler->pid);
738 fclose(f);
59c2d406
TA
739 if (unlink(pidfile) < 0 && errno != ENOENT)
740 SYSERROR("unlinking pidfile failed");
741
e29fe1dd
TA
742 if (ret != 1) {
743 ERROR("reading restore pid failed");
744 goto out_fini_handler;
745 }
746
f8a41688
TA
747 if (lxc_set_state(c->name, handler, RUNNING)) {
748 ERROR("error setting running state after restore");
e29fe1dd 749 goto out_fini_handler;
f8a41688 750 }
e29fe1dd
TA
751 }
752 } else {
753 ERROR("CRIU was killed with signal %d\n", WTERMSIG(status));
754 goto out_fini_handler;
755 }
756
3d9a5c85
TA
757 close(pipes[0]);
758
e29fe1dd
TA
759 /*
760 * See comment in lxcapi_start; we don't care if these
761 * fail because it's just a beauty thing. We just
762 * assign the return here to silence potential.
763 */
764 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
765 ret = setproctitle(title);
766
767 ret = lxc_poll(c->name, handler);
768 if (ret)
769 lxc_abort(c->name, handler);
770 lxc_fini(c->name, handler);
771 exit(ret);
772 }
773
774out_fini_handler:
3d9a5c85
TA
775 if (pipes[0] >= 0)
776 close(pipes[0]);
777 if (pipes[1] >= 0)
778 close(pipes[1]);
779
e29fe1dd 780 lxc_fini(c->name, handler);
59c2d406
TA
781 if (unlink(pidfile) < 0 && errno != ENOENT)
782 SYSERROR("unlinking pidfile failed");
e29fe1dd
TA
783
784out:
3d9a5c85 785 if (status_pipe >= 0) {
e29fe1dd 786 status = 1;
3d9a5c85 787 if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) {
e29fe1dd
TA
788 SYSERROR("writing status failed");
789 }
3d9a5c85 790 close(status_pipe);
e29fe1dd
TA
791 }
792
793 exit(1);
794}
aef3d51e 795
4b54788e
TA
796static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
797{
798 FILE *f;
799 char path[PATH_MAX];
800 int ret;
801 struct stat sb;
802
803 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
804 tty_id[0] = 0;
805 return 0;
806 }
807
808 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
809 if (ret < 0 || ret >= sizeof(path)) {
810 ERROR("snprintf'd too many chacters: %d", ret);
811 return -1;
812 }
813
814 ret = stat(path, &sb);
815 if (ret < 0) {
816 SYSERROR("stat of %s failed", path);
817 return -1;
818 }
819
820 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
821 if (ret < 0 || ret >= sizeof(path)) {
822 ERROR("snprintf'd too many characters: %d", ret);
823 return -1;
824 }
825
f03280a7
TA
826 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
827 (long long unsigned) sb.st_rdev,
828 (long long unsigned) sb.st_dev);
4b54788e
TA
829 if (ret < 0 || ret >= sizeof(path)) {
830 ERROR("snprintf'd too many characters: %d", ret);
831 return -1;
832 }
833
834 f = fopen(path, "w");
835 if (!f) {
836 SYSERROR("failed to open %s", path);
837 return -1;
838 }
839
840 ret = fprintf(f, "%s", tty_id);
841 fclose(f);
842 if (ret < 0)
843 SYSERROR("failed to write to %s", path);
844 return ret;
845}
846
aef3d51e 847/* do one of either predump or a regular dump */
b2c3710f 848static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e
TA
849{
850 pid_t pid;
851
852 if (!criu_ok(c))
853 return false;
854
b2c3710f 855 if (mkdir_p(opts->directory, 0700) < 0)
aef3d51e
TA
856 return false;
857
858 pid = fork();
859 if (pid < 0) {
860 SYSERROR("fork failed");
861 return false;
862 }
863
864 if (pid == 0) {
865 struct criu_opts os;
866
867 os.action = mode;
b2c3710f 868 os.user = opts;
aef3d51e 869 os.c = c;
4b54788e 870 os.console_name = c->lxc_conf->console.path;
74eb576c 871
b2c3710f 872 if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
4b54788e 873 exit(1);
aef3d51e
TA
874
875 /* exec_criu() returning is an error */
7103fe6f 876 exec_criu(&os);
aef3d51e
TA
877 exit(1);
878 } else {
879 int status;
880 pid_t w = waitpid(pid, &status, 0);
881 if (w == -1) {
882 SYSERROR("waitpid");
883 return false;
884 }
885
886 if (WIFEXITED(status)) {
887 if (WEXITSTATUS(status)) {
888 ERROR("dump failed with %d\n", WEXITSTATUS(status));
889 return false;
890 }
891
892 return true;
893 } else if (WIFSIGNALED(status)) {
894 ERROR("dump signaled with %d\n", WTERMSIG(status));
895 return false;
896 } else {
897 ERROR("unknown dump exit %d\n", status);
898 return false;
899 }
900 }
901}
902
/* Run a criu "pre-dump" pass for container c; returns do_dump()'s result. */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped = do_dump(c, "pre-dump", opts);

	return dumped;
}
907
b2c3710f 908bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
909{
910 char path[PATH_MAX];
911 int ret;
912
b2c3710f 913 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
914 if (ret < 0 || ret >= sizeof(path))
915 return false;
916
917 if (access(path, F_OK) == 0) {
918 ERROR("please use a fresh directory for the dump directory\n");
919 return false;
920 }
921
b2c3710f 922 return do_dump(c, "dump", opts);
aef3d51e
TA
923}
924
/*
 * Restore container c from a checkpoint described by opts.
 *
 * A child is forked to run do_restore() and reports one int through a pipe.
 * Returns true if that value is the wait status of a process that exited
 * with 0; otherwise the child is waited for and false is returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	int status_fds[2];
	int status, nread;
	pid_t child;

	if (!criu_ok(c))
		return false;

	if (geteuid() != 0) {
		ERROR("Must be root to restore\n");
		return false;
	}

	if (pipe(status_fds) != 0) {
		ERROR("failed to create pipe");
		return false;
	}

	child = fork();
	if (child < 0) {
		close(status_fds[0]);
		close(status_fds[1]);
		return false;
	}

	if (child == 0) {
		close(status_fds[0]);
		// this never returns
		do_restore(c, status_fds[1], opts);
	}

	close(status_fds[1]);

	nread = read(status_fds[0], &status, sizeof(status));
	close(status_fds[0]);

	if (nread != sizeof(status)) {
		ERROR("reading status from pipe failed");
	} else if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
		// Success: no need to wait, since the child becomes the
		// monitor process.
		return true;
	}

	// The criu process was killed or exited nonzero (or we got no status
	// at all), so the restore process died: wait() for it.
	if (wait_for_pid(child))
		ERROR("restore process died");
	return false;
}