]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
Merge pull request #1756 from brauner/2017-08-10/further_lxc_2.1_preparations
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
9b945f13 24#include <inttypes.h>
e29fe1dd
TA
25#include <linux/limits.h>
26#include <sched.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/mount.h>
31#include <sys/types.h>
32#include <sys/wait.h>
33#include <unistd.h>
34
35#include "config.h"
36
e29fe1dd
TA
37#include "cgroup.h"
38#include "conf.h"
dc259399 39#include "commands.h"
e29fe1dd
TA
40#include "criu.h"
41#include "log.h"
42#include "lxc.h"
43#include "lxclock.h"
44#include "network.h"
28d832c4 45#include "storage.h"
e29fe1dd
TA
46#include "utils.h"
47
5f4e44a2
TA
48#if IS_BIONIC
49#include <../include/lxcmntent.h>
50#else
51#include <mntent.h>
52#endif
53
c33b0338 54#define CRIU_VERSION "2.0"
73d46752
TA
55
56#define CRIU_GITID_VERSION "2.0"
57#define CRIU_GITID_PATCHLEVEL 0
58
f1954503 59#define CRIU_IN_FLIGHT_SUPPORT "2.4"
46c8ffd5 60#define CRIU_EXTERNAL_NOT_VETH "2.8"
f1954503 61
e29fe1dd
TA
62lxc_log_define(lxc_criu, lxc);
63
/* Everything exec_criu() needs in order to build one criu command line. */
struct criu_opts {
	/* the thing to hook to stdout and stderr for logging */
	int pipefd;

	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means there is no console to hand to criu.
	 */
	char tty_id[32];

	/* restore: the handler of the container being restored; exec_criu()
	 * asks it for the cgroup paths.
	 */
	struct lxc_handler *handler;

	/* restore: the console fd passed to criu via --inherit-fd */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
93
4b54788e
TA
94static int load_tty_major_minor(char *directory, char *output, int len)
95{
96 FILE *f;
97 char path[PATH_MAX];
98 int ret;
99
100 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
101 if (ret < 0 || ret >= sizeof(path)) {
102 ERROR("snprintf'd too many chacters: %d", ret);
103 return -1;
104 }
105
106 f = fopen(path, "r");
107 if (!f) {
108 /* This means we're coming from a liblxc which didn't export
3aed4934
CB
109 * the tty info. In this case they had to have lxc.console.path
110 * = * none, so there's no problem restoring.
4b54788e
TA
111 */
112 if (errno == ENOENT)
113 return 0;
114
115 SYSERROR("couldn't open %s", path);
116 return -1;
117 }
118
119 if (!fgets(output, len, f)) {
120 fclose(f);
121 SYSERROR("couldn't read %s", path);
122 return -1;
123 }
124
125 fclose(f);
126 return 0;
127}
128
9451eeff 129static void exec_criu(struct criu_opts *opts)
e29fe1dd
TA
130{
131 char **argv, log[PATH_MAX];
19d1509c 132 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
133 int netnr = 0;
134 struct lxc_list *it;
5f4e44a2
TA
135 FILE *mnts;
136 struct mntent mntent;
e29fe1dd 137
a17fa3c0
NE
138 char buf[4096], tty_info[32];
139 size_t pos;
5af85cb1 140
e9195050
TA
141 /* If we are currently in a cgroup /foo/bar, and the container is in a
142 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
143 * container has an open fd that points to one of the cgroup files
144 * (systemd always opens its "root" cgroup). So, let's escape to the
145 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
146 * see all cgroups.
147 */
7103fe6f 148 if (!cgroup_escape()) {
e9195050
TA
149 ERROR("failed to escape cgroups");
150 return;
151 }
152
e29fe1dd 153 /* The command line always looks like:
19d1509c 154 * criu $(action) --tcp-established --file-locks --link-remap \
5f178bc9 155 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
e29fe1dd
TA
156 * -o $(directory)/$(action).log --ext-mount-map auto
157 * --enable-external-sharing --enable-external-masters
4b54788e 158 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
159 * +1 for final NULL */
160
aef3d51e 161 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
162 /* -t pid --freeze-cgroup /lxc/ct */
163 static_args += 4;
e29fe1dd 164
aef3d51e 165 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 166 if (opts->user->predump_dir)
aef3d51e
TA
167 static_args += 2;
168
74eb576c 169 /* --page-server --address <address> --port <port> */
b2c3710f 170 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
171 static_args += 5;
172
aef3d51e 173 /* --leave-running (only for final dump) */
b2c3710f 174 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 175 static_args++;
4b54788e
TA
176
177 /* --external tty[88,4] */
178 if (opts->tty_id[0])
179 static_args += 2;
19d1509c
TA
180
181 /* --force-irmap */
182 if (!opts->user->preserves_inodes)
183 static_args++;
b2b7b0d2
TA
184
185 /* --ghost-limit 1024 */
186 if (opts->user->ghost_limit)
187 static_args += 2;
e29fe1dd
TA
188 } else if (strcmp(opts->action, "restore") == 0) {
189 /* --root $(lxc_mount_point) --restore-detached
0ab5703f 190 * --restore-sibling
13389b29
TA
191 * --lsm-profile apparmor:whatever
192 */
0ab5703f 193 static_args += 6;
4b54788e
TA
194
195 tty_info[0] = 0;
b2c3710f 196 if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
4b54788e
TA
197 return;
198
199 /* --inherit-fd fd[%d]:tty[%s] */
200 if (tty_info[0])
201 static_args += 2;
e29fe1dd
TA
202 } else {
203 return;
204 }
205
09e80d0c
TA
206 if (cgroup_num_hierarchies() > 0)
207 static_args += 2 * cgroup_num_hierarchies();
0ab5703f 208
b2c3710f 209 if (opts->user->verbose)
e29fe1dd
TA
210 static_args++;
211
b9ee6643
TA
212 if (opts->user->action_script)
213 static_args += 2;
214
5f4e44a2
TA
215 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
216
b2c3710f 217 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd 218 if (ret < 0 || ret >= PATH_MAX) {
9f1f54b0 219 ERROR("logfile name too long");
e29fe1dd
TA
220 return;
221 }
222
223 argv = malloc(static_args * sizeof(*argv));
224 if (!argv)
225 return;
226
227 memset(argv, 0, static_args * sizeof(*argv));
228
229#define DECLARE_ARG(arg) \
230 do { \
231 if (arg == NULL) { \
232 ERROR("Got NULL argument for criu"); \
233 goto err; \
234 } \
235 argv[argc++] = strdup(arg); \
236 if (!argv[argc-1]) \
237 goto err; \
238 } while (0)
239
240 argv[argc++] = on_path("criu", NULL);
241 if (!argv[argc-1]) {
9f1f54b0 242 ERROR("Couldn't find criu binary");
e29fe1dd
TA
243 goto err;
244 }
245
246 DECLARE_ARG(opts->action);
247 DECLARE_ARG("--tcp-established");
248 DECLARE_ARG("--file-locks");
249 DECLARE_ARG("--link-remap");
0a5fc6df 250 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
251 DECLARE_ARG("--ext-mount-map");
252 DECLARE_ARG("auto");
253 DECLARE_ARG("--enable-external-sharing");
254 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
255 DECLARE_ARG("--enable-fs");
256 DECLARE_ARG("hugetlbfs");
5b454329
TA
257 DECLARE_ARG("--enable-fs");
258 DECLARE_ARG("tracefs");
e29fe1dd 259 DECLARE_ARG("-D");
b2c3710f 260 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
261 DECLARE_ARG("-o");
262 DECLARE_ARG(log);
263
0ab5703f
TA
264 for (i = 0; i < cgroup_num_hierarchies(); i++) {
265 char **controllers = NULL, *fullname;
31b204e4 266 char *path, *tmp;
0ab5703f
TA
267
268 if (!cgroup_get_hierarchies(i, &controllers)) {
269 ERROR("failed to get hierarchy %d", i);
270 goto err;
271 }
272
273 /* if we are in a dump, we have to ask the monitor process what
274 * the right cgroup is. if this is a restore, we can just use
275 * the handler the restore task created.
276 */
277 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
278 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
279 if (!path) {
280 ERROR("failed to get cgroup path for %s", controllers[0]);
281 goto err;
282 }
283 } else {
284 const char *p;
285
286 p = cgroup_get_cgroup(opts->handler, controllers[0]);
287 if (!p) {
288 ERROR("failed to get cgroup path for %s", controllers[0]);
289 goto err;
290 }
291
292 path = strdup(p);
293 if (!path) {
294 ERROR("strdup failed");
295 goto err;
296 }
297 }
298
31b204e4
CB
299 tmp = lxc_deslashify(path);
300 if (!tmp) {
301 ERROR("Failed to remove extraneous slashes from \"%s\"",
302 path);
0ab5703f
TA
303 free(path);
304 goto err;
305 }
31b204e4
CB
306 free(path);
307 path = tmp;
0ab5703f
TA
308
309 fullname = lxc_string_join(",", (const char **) controllers, false);
310 if (!fullname) {
311 ERROR("failed to join controllers");
312 free(path);
313 goto err;
314 }
315
316 ret = sprintf(buf, "%s:%s", fullname, path);
317 free(path);
318 free(fullname);
319 if (ret < 0 || ret >= sizeof(buf)) {
320 ERROR("sprintf of cgroup root arg failed");
321 goto err;
322 }
323
324 DECLARE_ARG("--cgroup-root");
325 DECLARE_ARG(buf);
326 }
327
b2c3710f 328 if (opts->user->verbose)
e29fe1dd
TA
329 DECLARE_ARG("-vvvvvv");
330
b9ee6643
TA
331 if (opts->user->action_script) {
332 DECLARE_ARG("--action-script");
333 DECLARE_ARG(opts->user->action_script);
334 }
335
5ef5c9a3 336 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list);
5f4e44a2
TA
337 if (!mnts)
338 goto err;
339
340 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
19d2422b 341 char *fmt, *key, *val, *mntdata;
5f4e44a2 342 char arg[2 * PATH_MAX + 2];
19d2422b
TA
343 unsigned long flags;
344
345 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
346 goto err;
347
348 free(mntdata);
349
350 /* only add --ext-mount-map for actual bind mounts */
351 if (!(flags & MS_BIND))
352 continue;
5f4e44a2
TA
353
354 if (strcmp(opts->action, "dump") == 0) {
355 fmt = "/%s:%s";
356 key = mntent.mnt_dir;
357 val = mntent.mnt_dir;
358 } else {
359 fmt = "%s:%s";
360 key = mntent.mnt_dir;
361 val = mntent.mnt_fsname;
362 }
363
364 ret = snprintf(arg, sizeof(arg), fmt, key, val);
365 if (ret < 0 || ret >= sizeof(arg)) {
366 fclose(mnts);
367 ERROR("snprintf failed");
368 goto err;
369 }
370
371 DECLARE_ARG("--ext-mount-map");
372 DECLARE_ARG(arg);
373 }
374 fclose(mnts);
375
aef3d51e 376 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 377 char pid[32], *freezer_relative;
e29fe1dd
TA
378
379 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
380 goto err;
381
382 DECLARE_ARG("-t");
383 DECLARE_ARG(pid);
dc259399
TA
384
385 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
386 opts->c->config_path,
387 "freezer");
388 if (!freezer_relative) {
389 ERROR("failed getting freezer path");
390 goto err;
391 }
392
393 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
394 if (ret < 0 || ret >= sizeof(log))
395 goto err;
396
f1954503
AR
397 if (!opts->user->disable_skip_in_flight &&
398 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
399 DECLARE_ARG("--skip-in-flight");
400
dc259399
TA
401 DECLARE_ARG("--freeze-cgroup");
402 DECLARE_ARG(log);
403
4b54788e 404 if (opts->tty_id[0]) {
36d2096c
TA
405 DECLARE_ARG("--ext-mount-map");
406 DECLARE_ARG("/dev/console:console");
407
4b54788e
TA
408 DECLARE_ARG("--external");
409 DECLARE_ARG(opts->tty_id);
410 }
411
b2c3710f 412 if (opts->user->predump_dir) {
aef3d51e 413 DECLARE_ARG("--prev-images-dir");
b2c3710f 414 DECLARE_ARG(opts->user->predump_dir);
9f99a33f 415 DECLARE_ARG("--track-mem");
74eb576c 416 }
4c0c0319 417
b2c3710f 418 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
419 DECLARE_ARG("--page-server");
420 DECLARE_ARG("--address");
b2c3710f 421 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 422 DECLARE_ARG("--port");
b2c3710f 423 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 424 }
aef3d51e 425
19d1509c
TA
426 if (!opts->user->preserves_inodes)
427 DECLARE_ARG("--force-irmap");
428
b2b7b0d2
TA
429 if (opts->user->ghost_limit) {
430 char ghost_limit[32];
431
9b945f13 432 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
b2b7b0d2 433 if (ret < 0 || ret >= sizeof(ghost_limit)) {
9b945f13 434 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
b2b7b0d2
TA
435 goto err;
436 }
437
438 DECLARE_ARG("--ghost-limit");
439 DECLARE_ARG(ghost_limit);
440 }
441
aef3d51e 442 /* only for final dump */
b2c3710f 443 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
444 DECLARE_ARG("--leave-running");
445 } else if (strcmp(opts->action, "restore") == 0) {
446 void *m;
447 int additional;
13389b29 448 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
449
450 DECLARE_ARG("--root");
451 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
452 DECLARE_ARG("--restore-detached");
453 DECLARE_ARG("--restore-sibling");
e29fe1dd 454
4b54788e 455 if (tty_info[0]) {
97e4f1a9 456 if (opts->console_fd < 0) {
3aed4934 457 ERROR("lxc.console.path configured on source host but not target");
97e4f1a9
TA
458 goto err;
459 }
460
4b54788e
TA
461 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info);
462 if (ret < 0 || ret >= sizeof(buf))
463 goto err;
464
465 DECLARE_ARG("--inherit-fd");
466 DECLARE_ARG(buf);
467 }
468 if (opts->console_name) {
469 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
470 SYSERROR("sprintf'd too many bytes");
471 }
472 DECLARE_ARG("--ext-mount-map");
473 DECLARE_ARG(buf);
474 }
475
13389b29
TA
476 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
477
478 if (lxc_conf->lsm_aa_profile)
479 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
480 else
481 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
482
483 if (ret < 0 || ret >= sizeof(buf))
484 goto err;
485
486 DECLARE_ARG("--lsm-profile");
487 DECLARE_ARG(buf);
488 }
489
e29fe1dd
TA
490 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
491
fa071249
TA
492 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
493 if (!m)
494 goto err;
e29fe1dd
TA
495 argv = m;
496
497 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
498 char eth[128], *veth;
46c8ffd5 499 char *fmt;
e29fe1dd 500 struct lxc_netdev *n = it->elem;
46c8ffd5
AR
501 bool external_not_veth;
502
503 if (strcmp(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
504 /* Since criu version 2.8 the usage of --veth-pair
505 * has been deprecated:
506 * git tag --contains f2037e6d3445fc400
507 * v2.8 */
508 external_not_veth = true;
509 } else {
510 external_not_veth = false;
511 }
e29fe1dd
TA
512
513 if (n->name) {
514 if (strlen(n->name) >= sizeof(eth))
515 goto err;
516 strncpy(eth, n->name, sizeof(eth));
796a109d
TA
517 } else {
518 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
519 if (ret < 0 || ret >= sizeof(eth))
520 goto err;
521 }
e29fe1dd 522
e2697330
TA
523 switch (n->type) {
524 case LXC_NET_VETH:
525 veth = n->priv.veth_attr.pair;
e29fe1dd 526
46c8ffd5
AR
527 if (n->link) {
528 if (external_not_veth)
529 fmt = "veth[%s]:%s@%s";
530 else
531 fmt = "%s=%s@%s";
532
533 ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
534 } else {
535 if (external_not_veth)
536 fmt = "veth[%s]:%s";
537 else
538 fmt = "%s=%s";
539
540 ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
541 }
e2697330
TA
542 if (ret < 0 || ret >= sizeof(buf))
543 goto err;
544 break;
545 case LXC_NET_MACVLAN:
e2697330 546 if (!n->link) {
9f1f54b0 547 ERROR("no host interface for macvlan %s", n->name);
e2697330
TA
548 goto err;
549 }
550
551 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
552 if (ret < 0 || ret >= sizeof(buf))
553 goto err;
554 break;
555 case LXC_NET_NONE:
556 case LXC_NET_EMPTY:
557 break;
558 default:
559 /* we have screened for this earlier... */
9f1f54b0 560 ERROR("unexpected network type %d", n->type);
e29fe1dd 561 goto err;
e2697330 562 }
e29fe1dd 563
46c8ffd5
AR
564 if (external_not_veth)
565 DECLARE_ARG("--external");
566 else
567 DECLARE_ARG("--veth-pair");
e29fe1dd 568 DECLARE_ARG(buf);
2f3fbc6b 569 netnr++;
e29fe1dd
TA
570 }
571
572 }
573
574 argv[argc] = NULL;
575
cf4b07a5 576 buf[0] = 0;
a17fa3c0 577 pos = 0;
72a30576 578
cf4b07a5 579 for (i = 0; argv[i]; i++) {
72a30576
NE
580 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
581 if (ret < 0 || ret >= sizeof(buf) - pos)
582 goto err;
583 else
584 pos += ret;
cf4b07a5
TA
585 }
586
587 INFO("execing: %s", buf);
588
5af85cb1
TA
589 /* before criu inits its log, it sometimes prints things to stdout/err;
590 * let's be sure we capture that.
591 */
592 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
593 SYSERROR("dup2 stdout failed");
594 goto err;
595 }
596
597 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
598 SYSERROR("dup2 stderr failed");
599 goto err;
600 }
601
602 close(opts->pipefd);
603
e29fe1dd
TA
604#undef DECLARE_ARG
605 execv(argv[0], argv);
606err:
e29fe1dd
TA
607 for (i = 0; argv[i]; i++)
608 free(argv[i]);
609 free(argv);
610}
611
8ba5ced7
TA
612/*
613 * Check to see if the criu version is recent enough for all the features we
614 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
615 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
616 * things potentially before a version is released with a particular feature.
617 *
618 * The intent is that when criu development slows down, we can drop this, but
619 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
620 *
621 * Note: If version != NULL criu_version() stores the detected criu version in
622 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 623 */
5407e2ab 624static bool criu_version_ok(char **version)
8ba5ced7
TA
625{
626 int pipes[2];
627 pid_t pid;
628
629 if (pipe(pipes) < 0) {
630 SYSERROR("pipe() failed");
631 return false;
632 }
633
634 pid = fork();
635 if (pid < 0) {
636 SYSERROR("fork() failed");
637 return false;
638 }
639
640 if (pid == 0) {
641 char *args[] = { "criu", "--version", NULL };
755fa453 642 char *path;
8ba5ced7
TA
643 close(pipes[0]);
644
645 close(STDERR_FILENO);
646 if (dup2(pipes[1], STDOUT_FILENO) < 0)
647 exit(1);
648
755fa453 649 path = on_path("criu", NULL);
d9b32b09
SH
650 if (!path)
651 exit(1);
652
755fa453 653 execv(path, args);
8ba5ced7
TA
654 exit(1);
655 } else {
656 FILE *f;
5407e2ab 657 char *tmp;
8ba5ced7
TA
658 int patch;
659
660 close(pipes[1]);
661 if (wait_for_pid(pid) < 0) {
662 close(pipes[0]);
4eae4051 663 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
664 return false;
665 }
666
667 f = fdopen(pipes[0], "r");
668 if (!f) {
669 close(pipes[0]);
670 return false;
671 }
672
5407e2ab
CB
673 tmp = malloc(1024);
674 if (!tmp) {
675 fclose(f);
676 return false;
677 }
678
679 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
680 goto version_error;
681
682 if (fgetc(f) != '\n')
683 goto version_error;
684
5407e2ab 685 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
686 goto version_match;
687
5407e2ab 688 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
689 goto version_error;
690
691 if (fgetc(f) != '-')
692 goto version_error;
693
694 if (fscanf(f, "%d", &patch) != 1)
695 goto version_error;
696
5407e2ab 697 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
698 goto version_error;
699
700 if (patch < CRIU_GITID_PATCHLEVEL)
701 goto version_error;
702
703version_match:
3158ab5b 704 fclose(f);
5407e2ab
CB
705 if (!version)
706 free(tmp);
707 else
708 *version = tmp;
8ba5ced7
TA
709 return true;
710
711version_error:
3158ab5b 712 fclose(f);
5407e2ab 713 free(tmp);
9f1f54b0 714 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
8ba5ced7
TA
715 return false;
716 }
717}
718
e29fe1dd
TA
719/* Check and make sure the container has a configuration that we know CRIU can
720 * dump. */
f1954503 721static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
722{
723 struct lxc_list *it;
e29fe1dd 724
f1954503 725 if (!criu_version_ok(criu_version))
8ba5ced7
TA
726 return false;
727
e29fe1dd 728 if (geteuid()) {
9f1f54b0 729 ERROR("Must be root to checkpoint");
e29fe1dd
TA
730 return false;
731 }
732
733 /* We only know how to restore containers with veth networks. */
734 lxc_list_for_each(it, &c->lxc_conf->network) {
735 struct lxc_netdev *n = it->elem;
65b20221
TA
736 switch(n->type) {
737 case LXC_NET_VETH:
738 case LXC_NET_NONE:
739 case LXC_NET_EMPTY:
e2697330 740 case LXC_NET_MACVLAN:
65b20221
TA
741 break;
742 default:
9f1f54b0 743 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
e29fe1dd
TA
744 return false;
745 }
746 }
747
e29fe1dd
TA
748 return true;
749}
750
e29fe1dd
TA
751static bool restore_net_info(struct lxc_container *c)
752{
753 struct lxc_list *it;
754 bool has_error = true;
755
756 if (container_mem_lock(c))
757 return false;
758
759 lxc_list_for_each(it, &c->lxc_conf->network) {
760 struct lxc_netdev *netdev = it->elem;
761 char template[IFNAMSIZ];
65b20221
TA
762
763 if (netdev->type != LXC_NET_VETH)
764 continue;
765
e29fe1dd
TA
766 snprintf(template, sizeof(template), "vethXXXXXX");
767
768 if (!netdev->priv.veth_attr.pair)
769 netdev->priv.veth_attr.pair = lxc_mkifname(template);
770
771 if (!netdev->priv.veth_attr.pair)
772 goto out_unlock;
773 }
774
775 has_error = false;
776
777out_unlock:
778 container_mem_unlock(c);
779 return !has_error;
780}
781
aef3d51e
TA
782// do_restore never returns, the calling process is used as the
783// monitor process. do_restore calls exit() if it fails.
c33b0338 784static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd
TA
785{
786 pid_t pid;
e29fe1dd 787 struct lxc_handler *handler;
a7fb6043 788 int status, fd;
9b1e2e6e 789 int pipes[2] = {-1, -1};
e29fe1dd 790
a7fb6043
TA
791 /* Try to detach from the current controlling tty if it exists.
792 * Othwerise, lxc_init (via lxc_console) will attach the container's
793 * console output to the current tty, which is probably not what any
794 * library user wants, and if they do, they can just manually configure
795 * it :)
796 */
797 fd = open("/dev/tty", O_RDWR);
798 if (fd >= 0) {
799 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
800 SYSERROR("couldn't detach from tty");
801 close(fd);
802 }
803
5e5576a4 804 handler = lxc_init_handler(c->name, c->lxc_conf, c->config_path, false);
e29fe1dd
TA
805 if (!handler)
806 goto out;
807
aa460476
CB
808 if (lxc_init(c->name, handler) < 0)
809 goto out;
810
e29fe1dd
TA
811 if (!cgroup_init(handler)) {
812 ERROR("failed initing cgroups");
813 goto out_fini_handler;
814 }
815
816 if (!cgroup_create(handler)) {
817 ERROR("failed creating groups");
818 goto out_fini_handler;
819 }
820
821 if (!restore_net_info(c)) {
822 ERROR("failed restoring network info");
823 goto out_fini_handler;
824 }
825
826 resolve_clone_flags(handler);
827
3d9a5c85
TA
828 if (pipe(pipes) < 0) {
829 SYSERROR("pipe() failed");
830 goto out_fini_handler;
831 }
832
e29fe1dd
TA
833 pid = fork();
834 if (pid < 0)
835 goto out_fini_handler;
836
837 if (pid == 0) {
838 struct criu_opts os;
839 struct lxc_rootfs *rootfs;
4b54788e 840 int flags;
e29fe1dd 841
3d9a5c85
TA
842 close(status_pipe);
843 status_pipe = -1;
844
845 close(pipes[0]);
846 pipes[0] = -1;
e29fe1dd
TA
847
848 if (unshare(CLONE_NEWNS))
849 goto out_fini_handler;
850
851 /* CRIU needs the lxc root bind mounted so that it is the root of some
852 * mount. */
853 rootfs = &c->lxc_conf->rootfs;
854
855 if (rootfs_is_blockdev(c->lxc_conf)) {
856 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
857 goto out_fini_handler;
858 } else {
859 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
860 goto out_fini_handler;
861
862 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
863 SYSERROR("remount / to private failed");
864 goto out_fini_handler;
865 }
866
867 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
868 rmdir(rootfs->mount);
869 goto out_fini_handler;
870 }
871 }
872
5af85cb1 873 os.pipefd = pipes[1];
e29fe1dd 874 os.action = "restore";
b2c3710f 875 os.user = opts;
e29fe1dd 876 os.c = c;
4b54788e 877 os.console_fd = c->lxc_conf->console.slave;
f1954503 878 os.criu_version = criu_version;
0ab5703f 879 os.handler = handler;
4b54788e 880
97e4f1a9
TA
881 if (os.console_fd >= 0) {
882 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
883 * via --inherit-fd, so we don't want it to close.
884 */
885 flags = fcntl(os.console_fd, F_GETFD);
886 if (flags < 0) {
887 SYSERROR("F_GETFD failed: %d", os.console_fd);
888 goto out_fini_handler;
889 }
4b54788e 890
97e4f1a9 891 flags &= ~FD_CLOEXEC;
4b54788e 892
97e4f1a9
TA
893 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
894 SYSERROR("F_SETFD failed");
895 goto out_fini_handler;
896 }
4b54788e
TA
897 }
898 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
899
900 /* exec_criu() returning is an error */
7103fe6f 901 exec_criu(&os);
e29fe1dd
TA
902 umount(rootfs->mount);
903 rmdir(rootfs->mount);
904 goto out_fini_handler;
905 } else {
906 int ret;
907 char title[2048];
908
3d9a5c85
TA
909 close(pipes[1]);
910 pipes[1] = -1;
911
e29fe1dd
TA
912 pid_t w = waitpid(pid, &status, 0);
913 if (w == -1) {
914 SYSERROR("waitpid");
915 goto out_fini_handler;
916 }
917
e29fe1dd 918 if (WIFEXITED(status)) {
75d219f0
TA
919 char buf[4096];
920
e29fe1dd 921 if (WEXITSTATUS(status)) {
3d9a5c85
TA
922 int n;
923
924 n = read(pipes[0], buf, sizeof(buf));
925 if (n < 0) {
926 SYSERROR("failed reading from criu stderr");
927 goto out_fini_handler;
928 }
929
2735dfae
TA
930 if (n == sizeof(buf))
931 n--;
3d9a5c85
TA
932 buf[n] = 0;
933
9f1f54b0 934 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
e29fe1dd
TA
935 goto out_fini_handler;
936 } else {
3eba9b49 937 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
75d219f0
TA
938 if (ret < 0 || ret >= sizeof(buf)) {
939 ERROR("snprintf'd too many characters: %d", ret);
940 goto out_fini_handler;
941 }
942
943 FILE *f = fopen(buf, "r");
e29fe1dd 944 if (!f) {
9f1f54b0 945 SYSERROR("couldn't read restore's children file %s", buf);
e29fe1dd
TA
946 goto out_fini_handler;
947 }
948
949 ret = fscanf(f, "%d", (int*) &handler->pid);
950 fclose(f);
951 if (ret != 1) {
952 ERROR("reading restore pid failed");
953 goto out_fini_handler;
954 }
955
f8a41688
TA
956 if (lxc_set_state(c->name, handler, RUNNING)) {
957 ERROR("error setting running state after restore");
e29fe1dd 958 goto out_fini_handler;
f8a41688 959 }
e29fe1dd
TA
960 }
961 } else {
9f1f54b0 962 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
e29fe1dd
TA
963 goto out_fini_handler;
964 }
965
3d9a5c85
TA
966 close(pipes[0]);
967
f3886023
TA
968 ret = write(status_pipe, &status, sizeof(status));
969 close(status_pipe);
970 status_pipe = -1;
971
972 if (sizeof(status) != ret) {
973 SYSERROR("failed to write all of status");
974 goto out_fini_handler;
975 }
976
e29fe1dd
TA
977 /*
978 * See comment in lxcapi_start; we don't care if these
979 * fail because it's just a beauty thing. We just
980 * assign the return here to silence potential.
981 */
982 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
983 ret = setproctitle(title);
984
985 ret = lxc_poll(c->name, handler);
986 if (ret)
987 lxc_abort(c->name, handler);
988 lxc_fini(c->name, handler);
989 exit(ret);
990 }
991
992out_fini_handler:
3d9a5c85
TA
993 if (pipes[0] >= 0)
994 close(pipes[0]);
995 if (pipes[1] >= 0)
996 close(pipes[1]);
997
e29fe1dd
TA
998 lxc_fini(c->name, handler);
999
1000out:
3d9a5c85 1001 if (status_pipe >= 0) {
f3886023
TA
1002 /* ensure getting here was a failure, e.g. if we failed to
1003 * parse the child pid or something, even after a successful
1004 * restore
1005 */
1006 if (!status)
1007 status = 1;
3d9a5c85 1008 if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) {
e29fe1dd
TA
1009 SYSERROR("writing status failed");
1010 }
3d9a5c85 1011 close(status_pipe);
e29fe1dd
TA
1012 }
1013
1014 exit(1);
1015}
aef3d51e 1016
4b54788e
TA
1017static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1018{
1019 FILE *f;
1020 char path[PATH_MAX];
1021 int ret;
1022 struct stat sb;
1023
1024 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1025 tty_id[0] = 0;
1026 return 0;
1027 }
1028
1029 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1030 if (ret < 0 || ret >= sizeof(path)) {
1031 ERROR("snprintf'd too many chacters: %d", ret);
1032 return -1;
1033 }
1034
1035 ret = stat(path, &sb);
1036 if (ret < 0) {
1037 SYSERROR("stat of %s failed", path);
1038 return -1;
1039 }
1040
1041 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1042 if (ret < 0 || ret >= sizeof(path)) {
1043 ERROR("snprintf'd too many characters: %d", ret);
1044 return -1;
1045 }
1046
f03280a7
TA
1047 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1048 (long long unsigned) sb.st_rdev,
1049 (long long unsigned) sb.st_dev);
4b54788e
TA
1050 if (ret < 0 || ret >= sizeof(path)) {
1051 ERROR("snprintf'd too many characters: %d", ret);
1052 return -1;
1053 }
1054
1055 f = fopen(path, "w");
1056 if (!f) {
1057 SYSERROR("failed to open %s", path);
1058 return -1;
1059 }
1060
1061 ret = fprintf(f, "%s", tty_id);
1062 fclose(f);
1063 if (ret < 0)
1064 SYSERROR("failed to write to %s", path);
1065 return ret;
1066}
1067
aef3d51e 1068/* do one of either predump or a regular dump */
b2c3710f 1069static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e
TA
1070{
1071 pid_t pid;
f1954503 1072 char *criu_version = NULL;
5af85cb1 1073 int criuout[2];
aef3d51e 1074
f1954503 1075 if (!criu_ok(c, &criu_version))
aef3d51e
TA
1076 return false;
1077
5af85cb1
TA
1078 if (pipe(criuout) < 0) {
1079 SYSERROR("pipe() failed");
aef3d51e 1080 return false;
5af85cb1
TA
1081 }
1082
1083 if (mkdir_p(opts->directory, 0700) < 0)
1084 goto fail;
aef3d51e
TA
1085
1086 pid = fork();
1087 if (pid < 0) {
1088 SYSERROR("fork failed");
5af85cb1 1089 goto fail;
aef3d51e
TA
1090 }
1091
1092 if (pid == 0) {
1093 struct criu_opts os;
0ab5703f
TA
1094 struct lxc_handler h;
1095
5af85cb1
TA
1096 close(criuout[0]);
1097
0ab5703f
TA
1098 h.name = c->name;
1099 if (!cgroup_init(&h)) {
1100 ERROR("failed to cgroup_init()");
1101 exit(1);
1102 }
aef3d51e 1103
5af85cb1 1104 os.pipefd = criuout[1];
aef3d51e 1105 os.action = mode;
b2c3710f 1106 os.user = opts;
aef3d51e 1107 os.c = c;
4b54788e 1108 os.console_name = c->lxc_conf->console.path;
f1954503 1109 os.criu_version = criu_version;
74eb576c 1110
b2c3710f 1111 if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
4b54788e 1112 exit(1);
aef3d51e
TA
1113
1114 /* exec_criu() returning is an error */
7103fe6f 1115 exec_criu(&os);
aef3d51e
TA
1116 exit(1);
1117 } else {
1118 int status;
5af85cb1
TA
1119 ssize_t n;
1120 char buf[4096];
1121 bool ret;
1122
1123 close(criuout[1]);
1124
aef3d51e
TA
1125 pid_t w = waitpid(pid, &status, 0);
1126 if (w == -1) {
1127 SYSERROR("waitpid");
5af85cb1 1128 close(criuout[0]);
aef3d51e
TA
1129 return false;
1130 }
1131
5af85cb1
TA
1132 n = read(criuout[0], buf, sizeof(buf));
1133 close(criuout[0]);
1134 if (n < 0) {
1135 SYSERROR("read");
1136 n = 0;
1137 }
1138 buf[n] = 0;
1139
aef3d51e
TA
1140 if (WIFEXITED(status)) {
1141 if (WEXITSTATUS(status)) {
9f1f54b0 1142 ERROR("dump failed with %d", WEXITSTATUS(status));
5af85cb1
TA
1143 ret = false;
1144 } else {
1145 ret = true;
aef3d51e 1146 }
aef3d51e 1147 } else if (WIFSIGNALED(status)) {
9f1f54b0 1148 ERROR("dump signaled with %d", WTERMSIG(status));
5af85cb1 1149 ret = false;
aef3d51e 1150 } else {
9f1f54b0 1151 ERROR("unknown dump exit %d", status);
5af85cb1 1152 ret = false;
aef3d51e 1153 }
5af85cb1
TA
1154
1155 if (!ret)
1156 ERROR("criu output: %s", buf);
1157 return ret;
aef3d51e 1158 }
5af85cb1
TA
1159fail:
1160 close(criuout[0]);
1161 close(criuout[1]);
1162 rmdir(opts->directory);
1163 return false;
aef3d51e
TA
1164}
1165
/* Run a criu "pre-dump" of container c; returns whatever do_dump() reports. */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped;

	dumped = do_dump(c, "pre-dump", opts);
	return dumped;
}
1170
b2c3710f 1171bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
1172{
1173 char path[PATH_MAX];
1174 int ret;
1175
b2c3710f 1176 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
1177 if (ret < 0 || ret >= sizeof(path))
1178 return false;
1179
1180 if (access(path, F_OK) == 0) {
9f1f54b0 1181 ERROR("please use a fresh directory for the dump directory");
aef3d51e
TA
1182 return false;
1183 }
1184
b2c3710f 1185 return do_dump(c, "dump", opts);
aef3d51e
TA
1186}
1187
/*
 * Restore container c from the images described by opts.
 *
 * Forks a child that runs do_restore() and becomes the container's monitor
 * process on success. The child reports the criu wait status through a pipe.
 *
 * Returns true when criu exited with status 0, false otherwise.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status;
	ssize_t nread;
	int pipefd[2];
	char *criu_version = NULL;

	if (!criu_ok(c, &criu_version)) {
		free(criu_version);
		return false;
	}

	if (geteuid()) {
		ERROR("Must be root to restore");
		free(criu_version);
		return false;
	}

	if (pipe(pipefd)) {
		ERROR("failed to create pipe");
		free(criu_version);
		return false;
	}

	pid = fork();
	if (pid < 0) {
		SYSERROR("fork failed");
		close(pipefd[0]);
		close(pipefd[1]);
		free(criu_version);
		return false;
	}

	if (pid == 0) {
		close(pipefd[0]);
		/* this never returns */
		do_restore(c, pipefd[1], opts, criu_version);
	}

	/* the child works on its own copy of the version string */
	free(criu_version);
	close(pipefd[1]);

	nread = read(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (nread != (ssize_t)sizeof(status)) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	/* If the criu process was killed or exited nonzero, wait() for the
	 * handler, since the restore process died. Otherwise, we don't need to
	 * wait, since the child becomes the monitor process.
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;
	return true;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");
	return false;
}