]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
tools: use correct exit code for lxc-stop
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
24#include <assert.h>
9b945f13 25#include <inttypes.h>
e29fe1dd
TA
26#include <linux/limits.h>
27#include <sched.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <sys/mount.h>
32#include <sys/types.h>
33#include <sys/wait.h>
34#include <unistd.h>
35
36#include "config.h"
37
d8e48992 38#include "bdev.h"
e29fe1dd
TA
39#include "cgroup.h"
40#include "conf.h"
dc259399 41#include "commands.h"
e29fe1dd
TA
42#include "criu.h"
43#include "log.h"
44#include "lxc.h"
45#include "lxclock.h"
46#include "network.h"
47#include "utils.h"
48
c33b0338 49#define CRIU_VERSION "2.0"
73d46752
TA
50
51#define CRIU_GITID_VERSION "2.0"
52#define CRIU_GITID_PATCHLEVEL 0
53
f1954503
AR
54#define CRIU_IN_FLIGHT_SUPPORT "2.4"
55
e29fe1dd
TA
56lxc_log_define(lxc_criu, lxc);
57
73d46752
TA
/* Everything exec_criu() needs in order to build a criu command line. */
struct criu_opts {
	/* The type of criu invocation: "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e.
	 * "tty[${rdev}:${dev}]". Empty if there is no console to export.
	 */
	char tty_id[32];

	/* restore: the handler whose cgroups are passed to criu as
	 * --cgroup-root
	 */
	struct lxc_handler *handler;

	/* restore: the console fd handed to criu via --inherit-fd, < 0 if
	 * there is none
	 */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
84
4b54788e
TA
85static int load_tty_major_minor(char *directory, char *output, int len)
86{
87 FILE *f;
88 char path[PATH_MAX];
89 int ret;
90
91 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
92 if (ret < 0 || ret >= sizeof(path)) {
93 ERROR("snprintf'd too many chacters: %d", ret);
94 return -1;
95 }
96
97 f = fopen(path, "r");
98 if (!f) {
99 /* This means we're coming from a liblxc which didn't export
100 * the tty info. In this case they had to have lxc.console =
101 * none, so there's no problem restoring.
102 */
103 if (errno == ENOENT)
104 return 0;
105
106 SYSERROR("couldn't open %s", path);
107 return -1;
108 }
109
110 if (!fgets(output, len, f)) {
111 fclose(f);
112 SYSERROR("couldn't read %s", path);
113 return -1;
114 }
115
116 fclose(f);
117 return 0;
118}
119
9451eeff 120static void exec_criu(struct criu_opts *opts)
e29fe1dd
TA
121{
122 char **argv, log[PATH_MAX];
19d1509c 123 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
124 int netnr = 0;
125 struct lxc_list *it;
126
a17fa3c0
NE
127 char buf[4096], tty_info[32];
128 size_t pos;
e9195050
TA
129 /* If we are currently in a cgroup /foo/bar, and the container is in a
130 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
131 * container has an open fd that points to one of the cgroup files
132 * (systemd always opens its "root" cgroup). So, let's escape to the
133 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
134 * see all cgroups.
135 */
7103fe6f 136 if (!cgroup_escape()) {
e9195050
TA
137 ERROR("failed to escape cgroups");
138 return;
139 }
140
e29fe1dd 141 /* The command line always looks like:
19d1509c 142 * criu $(action) --tcp-established --file-locks --link-remap \
5f178bc9 143 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
e29fe1dd
TA
144 * -o $(directory)/$(action).log --ext-mount-map auto
145 * --enable-external-sharing --enable-external-masters
4b54788e 146 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
147 * +1 for final NULL */
148
aef3d51e 149 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
150 /* -t pid --freeze-cgroup /lxc/ct */
151 static_args += 4;
e29fe1dd 152
aef3d51e 153 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 154 if (opts->user->predump_dir)
aef3d51e
TA
155 static_args += 2;
156
74eb576c 157 /* --page-server --address <address> --port <port> */
b2c3710f 158 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
159 static_args += 5;
160
aef3d51e 161 /* --leave-running (only for final dump) */
b2c3710f 162 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 163 static_args++;
4b54788e
TA
164
165 /* --external tty[88,4] */
166 if (opts->tty_id[0])
167 static_args += 2;
19d1509c
TA
168
169 /* --force-irmap */
170 if (!opts->user->preserves_inodes)
171 static_args++;
b2b7b0d2
TA
172
173 /* --ghost-limit 1024 */
174 if (opts->user->ghost_limit)
175 static_args += 2;
e29fe1dd
TA
176 } else if (strcmp(opts->action, "restore") == 0) {
177 /* --root $(lxc_mount_point) --restore-detached
0ab5703f 178 * --restore-sibling
13389b29
TA
179 * --lsm-profile apparmor:whatever
180 */
0ab5703f 181 static_args += 6;
4b54788e
TA
182
183 tty_info[0] = 0;
b2c3710f 184 if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
4b54788e
TA
185 return;
186
187 /* --inherit-fd fd[%d]:tty[%s] */
188 if (tty_info[0])
189 static_args += 2;
e29fe1dd
TA
190 } else {
191 return;
192 }
193
09e80d0c
TA
194 if (cgroup_num_hierarchies() > 0)
195 static_args += 2 * cgroup_num_hierarchies();
0ab5703f 196
b2c3710f 197 if (opts->user->verbose)
e29fe1dd
TA
198 static_args++;
199
b9ee6643
TA
200 if (opts->user->action_script)
201 static_args += 2;
202
b2c3710f 203 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd
TA
204 if (ret < 0 || ret >= PATH_MAX) {
205 ERROR("logfile name too long\n");
206 return;
207 }
208
209 argv = malloc(static_args * sizeof(*argv));
210 if (!argv)
211 return;
212
213 memset(argv, 0, static_args * sizeof(*argv));
214
215#define DECLARE_ARG(arg) \
216 do { \
217 if (arg == NULL) { \
218 ERROR("Got NULL argument for criu"); \
219 goto err; \
220 } \
221 argv[argc++] = strdup(arg); \
222 if (!argv[argc-1]) \
223 goto err; \
224 } while (0)
225
226 argv[argc++] = on_path("criu", NULL);
227 if (!argv[argc-1]) {
228 ERROR("Couldn't find criu binary\n");
229 goto err;
230 }
231
232 DECLARE_ARG(opts->action);
233 DECLARE_ARG("--tcp-established");
234 DECLARE_ARG("--file-locks");
235 DECLARE_ARG("--link-remap");
0a5fc6df 236 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
237 DECLARE_ARG("--ext-mount-map");
238 DECLARE_ARG("auto");
239 DECLARE_ARG("--enable-external-sharing");
240 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
241 DECLARE_ARG("--enable-fs");
242 DECLARE_ARG("hugetlbfs");
5b454329
TA
243 DECLARE_ARG("--enable-fs");
244 DECLARE_ARG("tracefs");
e29fe1dd 245 DECLARE_ARG("-D");
b2c3710f 246 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
247 DECLARE_ARG("-o");
248 DECLARE_ARG(log);
249
0ab5703f
TA
250 for (i = 0; i < cgroup_num_hierarchies(); i++) {
251 char **controllers = NULL, *fullname;
252 char *path;
253
254 if (!cgroup_get_hierarchies(i, &controllers)) {
255 ERROR("failed to get hierarchy %d", i);
256 goto err;
257 }
258
259 /* if we are in a dump, we have to ask the monitor process what
260 * the right cgroup is. if this is a restore, we can just use
261 * the handler the restore task created.
262 */
263 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
264 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
265 if (!path) {
266 ERROR("failed to get cgroup path for %s", controllers[0]);
267 goto err;
268 }
269 } else {
270 const char *p;
271
272 p = cgroup_get_cgroup(opts->handler, controllers[0]);
273 if (!p) {
274 ERROR("failed to get cgroup path for %s", controllers[0]);
275 goto err;
276 }
277
278 path = strdup(p);
279 if (!path) {
280 ERROR("strdup failed");
281 goto err;
282 }
283 }
284
c56a9652 285 if (!lxc_deslashify(&path)) {
0ab5703f
TA
286 ERROR("failed to deslashify %s", path);
287 free(path);
288 goto err;
289 }
290
291 fullname = lxc_string_join(",", (const char **) controllers, false);
292 if (!fullname) {
293 ERROR("failed to join controllers");
294 free(path);
295 goto err;
296 }
297
298 ret = sprintf(buf, "%s:%s", fullname, path);
299 free(path);
300 free(fullname);
301 if (ret < 0 || ret >= sizeof(buf)) {
302 ERROR("sprintf of cgroup root arg failed");
303 goto err;
304 }
305
306 DECLARE_ARG("--cgroup-root");
307 DECLARE_ARG(buf);
308 }
309
b2c3710f 310 if (opts->user->verbose)
e29fe1dd
TA
311 DECLARE_ARG("-vvvvvv");
312
b9ee6643
TA
313 if (opts->user->action_script) {
314 DECLARE_ARG("--action-script");
315 DECLARE_ARG(opts->user->action_script);
316 }
317
aef3d51e 318 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 319 char pid[32], *freezer_relative;
e29fe1dd
TA
320
321 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
322 goto err;
323
324 DECLARE_ARG("-t");
325 DECLARE_ARG(pid);
dc259399
TA
326
327 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
328 opts->c->config_path,
329 "freezer");
330 if (!freezer_relative) {
331 ERROR("failed getting freezer path");
332 goto err;
333 }
334
335 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
336 if (ret < 0 || ret >= sizeof(log))
337 goto err;
338
f1954503
AR
339 if (!opts->user->disable_skip_in_flight &&
340 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
341 DECLARE_ARG("--skip-in-flight");
342
dc259399
TA
343 DECLARE_ARG("--freeze-cgroup");
344 DECLARE_ARG(log);
345
4b54788e 346 if (opts->tty_id[0]) {
36d2096c
TA
347 DECLARE_ARG("--ext-mount-map");
348 DECLARE_ARG("/dev/console:console");
349
4b54788e
TA
350 DECLARE_ARG("--external");
351 DECLARE_ARG(opts->tty_id);
352 }
353
b2c3710f 354 if (opts->user->predump_dir) {
aef3d51e 355 DECLARE_ARG("--prev-images-dir");
b2c3710f 356 DECLARE_ARG(opts->user->predump_dir);
74eb576c 357 }
4c0c0319 358
b2c3710f 359 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
360 DECLARE_ARG("--page-server");
361 DECLARE_ARG("--address");
b2c3710f 362 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 363 DECLARE_ARG("--port");
b2c3710f 364 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 365 }
aef3d51e 366
19d1509c
TA
367 if (!opts->user->preserves_inodes)
368 DECLARE_ARG("--force-irmap");
369
b2b7b0d2
TA
370 if (opts->user->ghost_limit) {
371 char ghost_limit[32];
372
9b945f13 373 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
b2b7b0d2 374 if (ret < 0 || ret >= sizeof(ghost_limit)) {
9b945f13 375 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
b2b7b0d2
TA
376 goto err;
377 }
378
379 DECLARE_ARG("--ghost-limit");
380 DECLARE_ARG(ghost_limit);
381 }
382
aef3d51e 383 /* only for final dump */
b2c3710f 384 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
385 DECLARE_ARG("--leave-running");
386 } else if (strcmp(opts->action, "restore") == 0) {
387 void *m;
388 int additional;
13389b29 389 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
390
391 DECLARE_ARG("--root");
392 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
393 DECLARE_ARG("--restore-detached");
394 DECLARE_ARG("--restore-sibling");
e29fe1dd 395
4b54788e 396 if (tty_info[0]) {
97e4f1a9
TA
397 if (opts->console_fd < 0) {
398 ERROR("lxc.console configured on source host but not target");
399 goto err;
400 }
401
4b54788e
TA
402 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info);
403 if (ret < 0 || ret >= sizeof(buf))
404 goto err;
405
406 DECLARE_ARG("--inherit-fd");
407 DECLARE_ARG(buf);
408 }
409 if (opts->console_name) {
410 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
411 SYSERROR("sprintf'd too many bytes");
412 }
413 DECLARE_ARG("--ext-mount-map");
414 DECLARE_ARG(buf);
415 }
416
13389b29
TA
417 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
418
419 if (lxc_conf->lsm_aa_profile)
420 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
421 else
422 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
423
424 if (ret < 0 || ret >= sizeof(buf))
425 goto err;
426
427 DECLARE_ARG("--lsm-profile");
428 DECLARE_ARG(buf);
429 }
430
e29fe1dd
TA
431 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
432
fa071249
TA
433 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
434 if (!m)
435 goto err;
e29fe1dd
TA
436 argv = m;
437
438 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
439 char eth[128], *veth;
440 struct lxc_netdev *n = it->elem;
441
442 if (n->name) {
443 if (strlen(n->name) >= sizeof(eth))
444 goto err;
445 strncpy(eth, n->name, sizeof(eth));
796a109d
TA
446 } else {
447 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
448 if (ret < 0 || ret >= sizeof(eth))
449 goto err;
450 }
e29fe1dd 451
e2697330
TA
452 switch (n->type) {
453 case LXC_NET_VETH:
454 veth = n->priv.veth_attr.pair;
e29fe1dd 455
e2697330
TA
456 if (n->link)
457 ret = snprintf(buf, sizeof(buf), "veth[%s]:%s@%s", eth, veth, n->link);
458 else
459 ret = snprintf(buf, sizeof(buf), "veth[%s]:%s", eth, veth);
460 if (ret < 0 || ret >= sizeof(buf))
461 goto err;
462 break;
463 case LXC_NET_MACVLAN:
e2697330
TA
464 if (!n->link) {
465 ERROR("no host interface for macvlan %s\n", n->name);
466 goto err;
467 }
468
469 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
470 if (ret < 0 || ret >= sizeof(buf))
471 goto err;
472 break;
473 case LXC_NET_NONE:
474 case LXC_NET_EMPTY:
475 break;
476 default:
477 /* we have screened for this earlier... */
478 ERROR("unexpected network type %d\n", n->type);
e29fe1dd 479 goto err;
e2697330 480 }
e29fe1dd 481
0f90d613 482 DECLARE_ARG("--external");
e29fe1dd 483 DECLARE_ARG(buf);
2f3fbc6b 484 netnr++;
e29fe1dd
TA
485 }
486
487 }
488
489 argv[argc] = NULL;
490
cf4b07a5 491 buf[0] = 0;
a17fa3c0 492 pos = 0;
72a30576 493
cf4b07a5 494 for (i = 0; argv[i]; i++) {
72a30576
NE
495 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
496 if (ret < 0 || ret >= sizeof(buf) - pos)
497 goto err;
498 else
499 pos += ret;
cf4b07a5
TA
500 }
501
502 INFO("execing: %s", buf);
503
e29fe1dd
TA
504#undef DECLARE_ARG
505 execv(argv[0], argv);
506err:
e29fe1dd
TA
507 for (i = 0; argv[i]; i++)
508 free(argv[i]);
509 free(argv);
510}
511
8ba5ced7
TA
512/*
513 * Check to see if the criu version is recent enough for all the features we
514 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
515 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
516 * things potentially before a version is released with a particular feature.
517 *
518 * The intent is that when criu development slows down, we can drop this, but
519 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
520 *
521 * Note: If version != NULL, criu_version_ok() stores the detected criu version
522 * in *version. The memory is allocated here and must be freed by the caller.
8ba5ced7 523 */
5407e2ab 524static bool criu_version_ok(char **version)
8ba5ced7
TA
525{
526 int pipes[2];
527 pid_t pid;
528
529 if (pipe(pipes) < 0) {
530 SYSERROR("pipe() failed");
531 return false;
532 }
533
534 pid = fork();
535 if (pid < 0) {
536 SYSERROR("fork() failed");
537 return false;
538 }
539
540 if (pid == 0) {
541 char *args[] = { "criu", "--version", NULL };
755fa453 542 char *path;
8ba5ced7
TA
543 close(pipes[0]);
544
545 close(STDERR_FILENO);
546 if (dup2(pipes[1], STDOUT_FILENO) < 0)
547 exit(1);
548
755fa453 549 path = on_path("criu", NULL);
d9b32b09
SH
550 if (!path)
551 exit(1);
552
755fa453 553 execv(path, args);
8ba5ced7
TA
554 exit(1);
555 } else {
556 FILE *f;
5407e2ab 557 char *tmp;
8ba5ced7
TA
558 int patch;
559
560 close(pipes[1]);
561 if (wait_for_pid(pid) < 0) {
562 close(pipes[0]);
4eae4051 563 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
564 return false;
565 }
566
567 f = fdopen(pipes[0], "r");
568 if (!f) {
569 close(pipes[0]);
570 return false;
571 }
572
5407e2ab
CB
573 tmp = malloc(1024);
574 if (!tmp) {
575 fclose(f);
576 return false;
577 }
578
579 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
580 goto version_error;
581
582 if (fgetc(f) != '\n')
583 goto version_error;
584
5407e2ab 585 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
586 goto version_match;
587
5407e2ab 588 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
589 goto version_error;
590
591 if (fgetc(f) != '-')
592 goto version_error;
593
594 if (fscanf(f, "%d", &patch) != 1)
595 goto version_error;
596
5407e2ab 597 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
598 goto version_error;
599
600 if (patch < CRIU_GITID_PATCHLEVEL)
601 goto version_error;
602
603version_match:
3158ab5b 604 fclose(f);
5407e2ab
CB
605 if (!version)
606 free(tmp);
607 else
608 *version = tmp;
8ba5ced7
TA
609 return true;
610
611version_error:
3158ab5b 612 fclose(f);
5407e2ab 613 free(tmp);
8ba5ced7
TA
614 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore\n");
615 return false;
616 }
617}
618
e29fe1dd
TA
619/* Check and make sure the container has a configuration that we know CRIU can
620 * dump. */
f1954503 621static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
622{
623 struct lxc_list *it;
e29fe1dd 624
f1954503 625 if (!criu_version_ok(criu_version))
8ba5ced7
TA
626 return false;
627
e29fe1dd
TA
628 if (geteuid()) {
629 ERROR("Must be root to checkpoint\n");
630 return false;
631 }
632
633 /* We only know how to restore containers with veth networks. */
634 lxc_list_for_each(it, &c->lxc_conf->network) {
635 struct lxc_netdev *n = it->elem;
65b20221
TA
636 switch(n->type) {
637 case LXC_NET_VETH:
638 case LXC_NET_NONE:
639 case LXC_NET_EMPTY:
e2697330 640 case LXC_NET_MACVLAN:
65b20221
TA
641 break;
642 default:
e2697330 643 ERROR("Found un-dumpable network: %s (%s)\n", lxc_net_type_to_str(n->type), n->name);
e29fe1dd
TA
644 return false;
645 }
646 }
647
e29fe1dd
TA
648 return true;
649}
650
e29fe1dd
TA
651static bool restore_net_info(struct lxc_container *c)
652{
653 struct lxc_list *it;
654 bool has_error = true;
655
656 if (container_mem_lock(c))
657 return false;
658
659 lxc_list_for_each(it, &c->lxc_conf->network) {
660 struct lxc_netdev *netdev = it->elem;
661 char template[IFNAMSIZ];
65b20221
TA
662
663 if (netdev->type != LXC_NET_VETH)
664 continue;
665
e29fe1dd
TA
666 snprintf(template, sizeof(template), "vethXXXXXX");
667
668 if (!netdev->priv.veth_attr.pair)
669 netdev->priv.veth_attr.pair = lxc_mkifname(template);
670
671 if (!netdev->priv.veth_attr.pair)
672 goto out_unlock;
673 }
674
675 has_error = false;
676
677out_unlock:
678 container_mem_unlock(c);
679 return !has_error;
680}
681
aef3d51e
TA
682// do_restore never returns, the calling process is used as the
683// monitor process. do_restore calls exit() if it fails.
c33b0338 684static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd
TA
685{
686 pid_t pid;
e29fe1dd 687 struct lxc_handler *handler;
a7fb6043 688 int status, fd;
9b1e2e6e 689 int pipes[2] = {-1, -1};
e29fe1dd 690
a7fb6043
TA
691 /* Try to detach from the current controlling tty if it exists.
692 * Othwerise, lxc_init (via lxc_console) will attach the container's
693 * console output to the current tty, which is probably not what any
694 * library user wants, and if they do, they can just manually configure
695 * it :)
696 */
697 fd = open("/dev/tty", O_RDWR);
698 if (fd >= 0) {
699 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
700 SYSERROR("couldn't detach from tty");
701 close(fd);
702 }
703
e29fe1dd
TA
704 handler = lxc_init(c->name, c->lxc_conf, c->config_path);
705 if (!handler)
706 goto out;
707
708 if (!cgroup_init(handler)) {
709 ERROR("failed initing cgroups");
710 goto out_fini_handler;
711 }
712
713 if (!cgroup_create(handler)) {
714 ERROR("failed creating groups");
715 goto out_fini_handler;
716 }
717
718 if (!restore_net_info(c)) {
719 ERROR("failed restoring network info");
720 goto out_fini_handler;
721 }
722
723 resolve_clone_flags(handler);
724
3d9a5c85
TA
725 if (pipe(pipes) < 0) {
726 SYSERROR("pipe() failed");
727 goto out_fini_handler;
728 }
729
e29fe1dd
TA
730 pid = fork();
731 if (pid < 0)
732 goto out_fini_handler;
733
734 if (pid == 0) {
735 struct criu_opts os;
736 struct lxc_rootfs *rootfs;
4b54788e 737 int flags;
e29fe1dd 738
3d9a5c85
TA
739 close(status_pipe);
740 status_pipe = -1;
741
742 close(pipes[0]);
743 pipes[0] = -1;
744 if (dup2(pipes[1], STDERR_FILENO) < 0) {
745 SYSERROR("dup2 failed");
746 goto out_fini_handler;
747 }
748
749 if (dup2(pipes[1], STDOUT_FILENO) < 0) {
750 SYSERROR("dup2 failed");
751 goto out_fini_handler;
752 }
e29fe1dd
TA
753
754 if (unshare(CLONE_NEWNS))
755 goto out_fini_handler;
756
757 /* CRIU needs the lxc root bind mounted so that it is the root of some
758 * mount. */
759 rootfs = &c->lxc_conf->rootfs;
760
761 if (rootfs_is_blockdev(c->lxc_conf)) {
762 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
763 goto out_fini_handler;
764 } else {
765 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
766 goto out_fini_handler;
767
768 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
769 SYSERROR("remount / to private failed");
770 goto out_fini_handler;
771 }
772
773 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
774 rmdir(rootfs->mount);
775 goto out_fini_handler;
776 }
777 }
778
779 os.action = "restore";
b2c3710f 780 os.user = opts;
e29fe1dd 781 os.c = c;
4b54788e 782 os.console_fd = c->lxc_conf->console.slave;
f1954503 783 os.criu_version = criu_version;
0ab5703f 784 os.handler = handler;
4b54788e 785
97e4f1a9
TA
786 if (os.console_fd >= 0) {
787 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
788 * via --inherit-fd, so we don't want it to close.
789 */
790 flags = fcntl(os.console_fd, F_GETFD);
791 if (flags < 0) {
792 SYSERROR("F_GETFD failed: %d", os.console_fd);
793 goto out_fini_handler;
794 }
4b54788e 795
97e4f1a9 796 flags &= ~FD_CLOEXEC;
4b54788e 797
97e4f1a9
TA
798 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
799 SYSERROR("F_SETFD failed");
800 goto out_fini_handler;
801 }
4b54788e
TA
802 }
803 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
804
805 /* exec_criu() returning is an error */
7103fe6f 806 exec_criu(&os);
e29fe1dd
TA
807 umount(rootfs->mount);
808 rmdir(rootfs->mount);
809 goto out_fini_handler;
810 } else {
811 int ret;
812 char title[2048];
813
3d9a5c85
TA
814 close(pipes[1]);
815 pipes[1] = -1;
816
e29fe1dd
TA
817 pid_t w = waitpid(pid, &status, 0);
818 if (w == -1) {
819 SYSERROR("waitpid");
820 goto out_fini_handler;
821 }
822
e29fe1dd 823 if (WIFEXITED(status)) {
75d219f0
TA
824 char buf[4096];
825
e29fe1dd 826 if (WEXITSTATUS(status)) {
3d9a5c85
TA
827 int n;
828
829 n = read(pipes[0], buf, sizeof(buf));
830 if (n < 0) {
831 SYSERROR("failed reading from criu stderr");
832 goto out_fini_handler;
833 }
834
835 buf[n] = 0;
836
837 ERROR("criu process exited %d, output:\n%s\n", WEXITSTATUS(status), buf);
e29fe1dd
TA
838 goto out_fini_handler;
839 } else {
3eba9b49 840 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
75d219f0
TA
841 if (ret < 0 || ret >= sizeof(buf)) {
842 ERROR("snprintf'd too many characters: %d", ret);
843 goto out_fini_handler;
844 }
845
846 FILE *f = fopen(buf, "r");
e29fe1dd 847 if (!f) {
75d219f0 848 SYSERROR("couldn't read restore's children file %s\n", buf);
e29fe1dd
TA
849 goto out_fini_handler;
850 }
851
852 ret = fscanf(f, "%d", (int*) &handler->pid);
853 fclose(f);
854 if (ret != 1) {
855 ERROR("reading restore pid failed");
856 goto out_fini_handler;
857 }
858
f8a41688
TA
859 if (lxc_set_state(c->name, handler, RUNNING)) {
860 ERROR("error setting running state after restore");
e29fe1dd 861 goto out_fini_handler;
f8a41688 862 }
e29fe1dd
TA
863 }
864 } else {
865 ERROR("CRIU was killed with signal %d\n", WTERMSIG(status));
866 goto out_fini_handler;
867 }
868
3d9a5c85
TA
869 close(pipes[0]);
870
f3886023
TA
871 ret = write(status_pipe, &status, sizeof(status));
872 close(status_pipe);
873 status_pipe = -1;
874
875 if (sizeof(status) != ret) {
876 SYSERROR("failed to write all of status");
877 goto out_fini_handler;
878 }
879
e29fe1dd
TA
880 /*
881 * See comment in lxcapi_start; we don't care if these
882 * fail because it's just a beauty thing. We just
883 * assign the return here to silence potential.
884 */
885 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
886 ret = setproctitle(title);
887
888 ret = lxc_poll(c->name, handler);
889 if (ret)
890 lxc_abort(c->name, handler);
891 lxc_fini(c->name, handler);
892 exit(ret);
893 }
894
895out_fini_handler:
3d9a5c85
TA
896 if (pipes[0] >= 0)
897 close(pipes[0]);
898 if (pipes[1] >= 0)
899 close(pipes[1]);
900
e29fe1dd
TA
901 lxc_fini(c->name, handler);
902
903out:
3d9a5c85 904 if (status_pipe >= 0) {
f3886023
TA
905 /* ensure getting here was a failure, e.g. if we failed to
906 * parse the child pid or something, even after a successful
907 * restore
908 */
909 if (!status)
910 status = 1;
3d9a5c85 911 if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) {
e29fe1dd
TA
912 SYSERROR("writing status failed");
913 }
3d9a5c85 914 close(status_pipe);
e29fe1dd
TA
915 }
916
917 exit(1);
918}
aef3d51e 919
4b54788e
TA
920static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
921{
922 FILE *f;
923 char path[PATH_MAX];
924 int ret;
925 struct stat sb;
926
927 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
928 tty_id[0] = 0;
929 return 0;
930 }
931
932 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
933 if (ret < 0 || ret >= sizeof(path)) {
934 ERROR("snprintf'd too many chacters: %d", ret);
935 return -1;
936 }
937
938 ret = stat(path, &sb);
939 if (ret < 0) {
940 SYSERROR("stat of %s failed", path);
941 return -1;
942 }
943
944 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
945 if (ret < 0 || ret >= sizeof(path)) {
946 ERROR("snprintf'd too many characters: %d", ret);
947 return -1;
948 }
949
f03280a7
TA
950 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
951 (long long unsigned) sb.st_rdev,
952 (long long unsigned) sb.st_dev);
4b54788e
TA
953 if (ret < 0 || ret >= sizeof(path)) {
954 ERROR("snprintf'd too many characters: %d", ret);
955 return -1;
956 }
957
958 f = fopen(path, "w");
959 if (!f) {
960 SYSERROR("failed to open %s", path);
961 return -1;
962 }
963
964 ret = fprintf(f, "%s", tty_id);
965 fclose(f);
966 if (ret < 0)
967 SYSERROR("failed to write to %s", path);
968 return ret;
969}
970
aef3d51e 971/* do one of either predump or a regular dump */
b2c3710f 972static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e
TA
973{
974 pid_t pid;
f1954503 975 char *criu_version = NULL;
aef3d51e 976
f1954503 977 if (!criu_ok(c, &criu_version))
aef3d51e
TA
978 return false;
979
b2c3710f 980 if (mkdir_p(opts->directory, 0700) < 0)
aef3d51e
TA
981 return false;
982
983 pid = fork();
984 if (pid < 0) {
985 SYSERROR("fork failed");
986 return false;
987 }
988
989 if (pid == 0) {
990 struct criu_opts os;
0ab5703f
TA
991 struct lxc_handler h;
992
993 h.name = c->name;
994 if (!cgroup_init(&h)) {
995 ERROR("failed to cgroup_init()");
996 exit(1);
997 }
aef3d51e
TA
998
999 os.action = mode;
b2c3710f 1000 os.user = opts;
aef3d51e 1001 os.c = c;
4b54788e 1002 os.console_name = c->lxc_conf->console.path;
f1954503 1003 os.criu_version = criu_version;
74eb576c 1004
b2c3710f 1005 if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
4b54788e 1006 exit(1);
aef3d51e
TA
1007
1008 /* exec_criu() returning is an error */
7103fe6f 1009 exec_criu(&os);
aef3d51e
TA
1010 exit(1);
1011 } else {
1012 int status;
1013 pid_t w = waitpid(pid, &status, 0);
1014 if (w == -1) {
1015 SYSERROR("waitpid");
1016 return false;
1017 }
1018
1019 if (WIFEXITED(status)) {
1020 if (WEXITSTATUS(status)) {
1021 ERROR("dump failed with %d\n", WEXITSTATUS(status));
1022 return false;
1023 }
1024
1025 return true;
1026 } else if (WIFSIGNALED(status)) {
1027 ERROR("dump signaled with %d\n", WTERMSIG(status));
1028 return false;
1029 } else {
1030 ERROR("unknown dump exit %d\n", status);
1031 return false;
1032 }
1033 }
1034}
1035
/* Run a criu "pre-dump" of container c; returns whatever do_dump() reports. */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped;

	dumped = do_dump(c, "pre-dump", opts);
	return dumped;
}
1040
b2c3710f 1041bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
1042{
1043 char path[PATH_MAX];
1044 int ret;
1045
b2c3710f 1046 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
1047 if (ret < 0 || ret >= sizeof(path))
1048 return false;
1049
1050 if (access(path, F_OK) == 0) {
1051 ERROR("please use a fresh directory for the dump directory\n");
1052 return false;
1053 }
1054
b2c3710f 1055 return do_dump(c, "dump", opts);
aef3d51e
TA
1056}
1057
/*
 * Restore container c from the images described by opts.
 *
 * A child is forked to run do_restore(), which never returns. The child
 * reports criu's wait status back over a pipe.
 *
 * Returns true if a full status was read and it shows criu exited with 0. In
 * every other case the child is reaped with wait_for_pid() and false is
 * returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status, nread;
	int pipefd[2];
	bool ret = false;
	char *criu_version = NULL;

	/* criu_version may have been allocated even if criu_ok() fails, so
	 * every exit goes through out: to free it.
	 */
	if (!criu_ok(c, &criu_version))
		goto out;

	if (geteuid()) {
		ERROR("Must be root to restore\n");
		goto out;
	}

	if (pipe(pipefd)) {
		ERROR("failed to create pipe");
		goto out;
	}

	pid = fork();
	if (pid < 0) {
		SYSERROR("fork failed");
		close(pipefd[0]);
		close(pipefd[1]);
		goto out;
	}

	if (pid == 0) {
		close(pipefd[0]);
		// this never returns
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);

	nread = read(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (sizeof(status) != nread) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	// If the criu process was killed or exited nonzero, wait() for the
	// handler, since the restore process died. Otherwise, we don't need to
	// wait, since the child becomes the monitor process.
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;

	ret = true;
	goto out;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");
out:
	free(criu_version);
	return ret;
}