]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
configure: check for memfd_create()
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
24#include <assert.h>
9b945f13 25#include <inttypes.h>
e29fe1dd
TA
26#include <linux/limits.h>
27#include <sched.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <sys/mount.h>
32#include <sys/types.h>
33#include <sys/wait.h>
34#include <unistd.h>
35
36#include "config.h"
37
d8e48992 38#include "bdev.h"
e29fe1dd
TA
39#include "cgroup.h"
40#include "conf.h"
dc259399 41#include "commands.h"
e29fe1dd
TA
42#include "criu.h"
43#include "log.h"
44#include "lxc.h"
45#include "lxclock.h"
46#include "network.h"
47#include "utils.h"
48
5f4e44a2
TA
49#if IS_BIONIC
50#include <../include/lxcmntent.h>
51#else
52#include <mntent.h>
53#endif
54
/* Minimum criu release accepted by criu_version_ok(). */
#define CRIU_VERSION "2.0"

/* Alternative minimum for criu built from git: GitID tag and patch level. */
#define CRIU_GITID_VERSION "2.0"
#define CRIU_GITID_PATCHLEVEL 0

/* criu version from which exec_criu() passes --skip-in-flight on dump. */
#define CRIU_IN_FLIGHT_SUPPORT "2.4"
/* criu version from which exec_criu() uses --external instead of --veth-pair. */
#define CRIU_EXTERNAL_NOT_VETH "2.8"

lxc_log_define(lxc_criu, lxc);
64
/* Everything exec_criu() needs in order to build one criu command line. */
struct criu_opts {
	/* the thing to hook to stdout and stderr for logging */
	int pipefd;

	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means no tty is passed to criu.
	 */
	char tty_id[32];

	/* restore: the handler created by the restore task; exec_criu() uses
	 * it to look up the container's cgroup paths.
	 */
	struct lxc_handler *handler;

	/* restore: fd handed to criu via --inherit-fd; negative if there is
	 * no console on the target.
	 */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console = "none".
	 */
	char *console_name;

	/* The detected version of criu */
	char *criu_version;
};
94
4b54788e
TA
95static int load_tty_major_minor(char *directory, char *output, int len)
96{
97 FILE *f;
98 char path[PATH_MAX];
99 int ret;
100
101 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
102 if (ret < 0 || ret >= sizeof(path)) {
103 ERROR("snprintf'd too many chacters: %d", ret);
104 return -1;
105 }
106
107 f = fopen(path, "r");
108 if (!f) {
109 /* This means we're coming from a liblxc which didn't export
110 * the tty info. In this case they had to have lxc.console =
111 * none, so there's no problem restoring.
112 */
113 if (errno == ENOENT)
114 return 0;
115
116 SYSERROR("couldn't open %s", path);
117 return -1;
118 }
119
120 if (!fgets(output, len, f)) {
121 fclose(f);
122 SYSERROR("couldn't read %s", path);
123 return -1;
124 }
125
126 fclose(f);
127 return 0;
128}
129
9451eeff 130static void exec_criu(struct criu_opts *opts)
e29fe1dd
TA
131{
132 char **argv, log[PATH_MAX];
19d1509c 133 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
134 int netnr = 0;
135 struct lxc_list *it;
5f4e44a2
TA
136 FILE *mnts;
137 struct mntent mntent;
e29fe1dd 138
a17fa3c0
NE
139 char buf[4096], tty_info[32];
140 size_t pos;
5af85cb1 141
e9195050
TA
142 /* If we are currently in a cgroup /foo/bar, and the container is in a
143 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
144 * container has an open fd that points to one of the cgroup files
145 * (systemd always opens its "root" cgroup). So, let's escape to the
146 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
147 * see all cgroups.
148 */
7103fe6f 149 if (!cgroup_escape()) {
e9195050
TA
150 ERROR("failed to escape cgroups");
151 return;
152 }
153
e29fe1dd 154 /* The command line always looks like:
19d1509c 155 * criu $(action) --tcp-established --file-locks --link-remap \
5f178bc9 156 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
e29fe1dd
TA
157 * -o $(directory)/$(action).log --ext-mount-map auto
158 * --enable-external-sharing --enable-external-masters
4b54788e 159 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
160 * +1 for final NULL */
161
aef3d51e 162 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
163 /* -t pid --freeze-cgroup /lxc/ct */
164 static_args += 4;
e29fe1dd 165
aef3d51e 166 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 167 if (opts->user->predump_dir)
aef3d51e
TA
168 static_args += 2;
169
74eb576c 170 /* --page-server --address <address> --port <port> */
b2c3710f 171 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
172 static_args += 5;
173
aef3d51e 174 /* --leave-running (only for final dump) */
b2c3710f 175 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 176 static_args++;
4b54788e
TA
177
178 /* --external tty[88,4] */
179 if (opts->tty_id[0])
180 static_args += 2;
19d1509c
TA
181
182 /* --force-irmap */
183 if (!opts->user->preserves_inodes)
184 static_args++;
b2b7b0d2
TA
185
186 /* --ghost-limit 1024 */
187 if (opts->user->ghost_limit)
188 static_args += 2;
e29fe1dd
TA
189 } else if (strcmp(opts->action, "restore") == 0) {
190 /* --root $(lxc_mount_point) --restore-detached
0ab5703f 191 * --restore-sibling
13389b29
TA
192 * --lsm-profile apparmor:whatever
193 */
0ab5703f 194 static_args += 6;
4b54788e
TA
195
196 tty_info[0] = 0;
b2c3710f 197 if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
4b54788e
TA
198 return;
199
200 /* --inherit-fd fd[%d]:tty[%s] */
201 if (tty_info[0])
202 static_args += 2;
e29fe1dd
TA
203 } else {
204 return;
205 }
206
09e80d0c
TA
207 if (cgroup_num_hierarchies() > 0)
208 static_args += 2 * cgroup_num_hierarchies();
0ab5703f 209
b2c3710f 210 if (opts->user->verbose)
e29fe1dd
TA
211 static_args++;
212
b9ee6643
TA
213 if (opts->user->action_script)
214 static_args += 2;
215
5f4e44a2
TA
216 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
217
b2c3710f 218 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd 219 if (ret < 0 || ret >= PATH_MAX) {
9f1f54b0 220 ERROR("logfile name too long");
e29fe1dd
TA
221 return;
222 }
223
224 argv = malloc(static_args * sizeof(*argv));
225 if (!argv)
226 return;
227
228 memset(argv, 0, static_args * sizeof(*argv));
229
230#define DECLARE_ARG(arg) \
231 do { \
232 if (arg == NULL) { \
233 ERROR("Got NULL argument for criu"); \
234 goto err; \
235 } \
236 argv[argc++] = strdup(arg); \
237 if (!argv[argc-1]) \
238 goto err; \
239 } while (0)
240
241 argv[argc++] = on_path("criu", NULL);
242 if (!argv[argc-1]) {
9f1f54b0 243 ERROR("Couldn't find criu binary");
e29fe1dd
TA
244 goto err;
245 }
246
247 DECLARE_ARG(opts->action);
248 DECLARE_ARG("--tcp-established");
249 DECLARE_ARG("--file-locks");
250 DECLARE_ARG("--link-remap");
0a5fc6df 251 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
252 DECLARE_ARG("--ext-mount-map");
253 DECLARE_ARG("auto");
254 DECLARE_ARG("--enable-external-sharing");
255 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
256 DECLARE_ARG("--enable-fs");
257 DECLARE_ARG("hugetlbfs");
5b454329
TA
258 DECLARE_ARG("--enable-fs");
259 DECLARE_ARG("tracefs");
e29fe1dd 260 DECLARE_ARG("-D");
b2c3710f 261 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
262 DECLARE_ARG("-o");
263 DECLARE_ARG(log);
264
0ab5703f
TA
265 for (i = 0; i < cgroup_num_hierarchies(); i++) {
266 char **controllers = NULL, *fullname;
267 char *path;
268
269 if (!cgroup_get_hierarchies(i, &controllers)) {
270 ERROR("failed to get hierarchy %d", i);
271 goto err;
272 }
273
274 /* if we are in a dump, we have to ask the monitor process what
275 * the right cgroup is. if this is a restore, we can just use
276 * the handler the restore task created.
277 */
278 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
279 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
280 if (!path) {
281 ERROR("failed to get cgroup path for %s", controllers[0]);
282 goto err;
283 }
284 } else {
285 const char *p;
286
287 p = cgroup_get_cgroup(opts->handler, controllers[0]);
288 if (!p) {
289 ERROR("failed to get cgroup path for %s", controllers[0]);
290 goto err;
291 }
292
293 path = strdup(p);
294 if (!path) {
295 ERROR("strdup failed");
296 goto err;
297 }
298 }
299
c56a9652 300 if (!lxc_deslashify(&path)) {
0ab5703f
TA
301 ERROR("failed to deslashify %s", path);
302 free(path);
303 goto err;
304 }
305
306 fullname = lxc_string_join(",", (const char **) controllers, false);
307 if (!fullname) {
308 ERROR("failed to join controllers");
309 free(path);
310 goto err;
311 }
312
313 ret = sprintf(buf, "%s:%s", fullname, path);
314 free(path);
315 free(fullname);
316 if (ret < 0 || ret >= sizeof(buf)) {
317 ERROR("sprintf of cgroup root arg failed");
318 goto err;
319 }
320
321 DECLARE_ARG("--cgroup-root");
322 DECLARE_ARG(buf);
323 }
324
b2c3710f 325 if (opts->user->verbose)
e29fe1dd
TA
326 DECLARE_ARG("-vvvvvv");
327
b9ee6643
TA
328 if (opts->user->action_script) {
329 DECLARE_ARG("--action-script");
330 DECLARE_ARG(opts->user->action_script);
331 }
332
5f4e44a2
TA
333 mnts = write_mount_file(&opts->c->lxc_conf->mount_list);
334 if (!mnts)
335 goto err;
336
337 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
338 char *fmt, *key, *val;
339 char arg[2 * PATH_MAX + 2];
340
341 if (strcmp(opts->action, "dump") == 0) {
342 fmt = "/%s:%s";
343 key = mntent.mnt_dir;
344 val = mntent.mnt_dir;
345 } else {
346 fmt = "%s:%s";
347 key = mntent.mnt_dir;
348 val = mntent.mnt_fsname;
349 }
350
351 ret = snprintf(arg, sizeof(arg), fmt, key, val);
352 if (ret < 0 || ret >= sizeof(arg)) {
353 fclose(mnts);
354 ERROR("snprintf failed");
355 goto err;
356 }
357
358 DECLARE_ARG("--ext-mount-map");
359 DECLARE_ARG(arg);
360 }
361 fclose(mnts);
362
aef3d51e 363 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 364 char pid[32], *freezer_relative;
e29fe1dd
TA
365
366 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
367 goto err;
368
369 DECLARE_ARG("-t");
370 DECLARE_ARG(pid);
dc259399
TA
371
372 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
373 opts->c->config_path,
374 "freezer");
375 if (!freezer_relative) {
376 ERROR("failed getting freezer path");
377 goto err;
378 }
379
380 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
381 if (ret < 0 || ret >= sizeof(log))
382 goto err;
383
f1954503
AR
384 if (!opts->user->disable_skip_in_flight &&
385 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
386 DECLARE_ARG("--skip-in-flight");
387
dc259399
TA
388 DECLARE_ARG("--freeze-cgroup");
389 DECLARE_ARG(log);
390
4b54788e 391 if (opts->tty_id[0]) {
36d2096c
TA
392 DECLARE_ARG("--ext-mount-map");
393 DECLARE_ARG("/dev/console:console");
394
4b54788e
TA
395 DECLARE_ARG("--external");
396 DECLARE_ARG(opts->tty_id);
397 }
398
b2c3710f 399 if (opts->user->predump_dir) {
aef3d51e 400 DECLARE_ARG("--prev-images-dir");
b2c3710f 401 DECLARE_ARG(opts->user->predump_dir);
9f99a33f 402 DECLARE_ARG("--track-mem");
74eb576c 403 }
4c0c0319 404
b2c3710f 405 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
406 DECLARE_ARG("--page-server");
407 DECLARE_ARG("--address");
b2c3710f 408 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 409 DECLARE_ARG("--port");
b2c3710f 410 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 411 }
aef3d51e 412
19d1509c
TA
413 if (!opts->user->preserves_inodes)
414 DECLARE_ARG("--force-irmap");
415
b2b7b0d2
TA
416 if (opts->user->ghost_limit) {
417 char ghost_limit[32];
418
9b945f13 419 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
b2b7b0d2 420 if (ret < 0 || ret >= sizeof(ghost_limit)) {
9b945f13 421 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
b2b7b0d2
TA
422 goto err;
423 }
424
425 DECLARE_ARG("--ghost-limit");
426 DECLARE_ARG(ghost_limit);
427 }
428
aef3d51e 429 /* only for final dump */
b2c3710f 430 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
431 DECLARE_ARG("--leave-running");
432 } else if (strcmp(opts->action, "restore") == 0) {
433 void *m;
434 int additional;
13389b29 435 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
436
437 DECLARE_ARG("--root");
438 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
439 DECLARE_ARG("--restore-detached");
440 DECLARE_ARG("--restore-sibling");
e29fe1dd 441
4b54788e 442 if (tty_info[0]) {
97e4f1a9
TA
443 if (opts->console_fd < 0) {
444 ERROR("lxc.console configured on source host but not target");
445 goto err;
446 }
447
4b54788e
TA
448 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info);
449 if (ret < 0 || ret >= sizeof(buf))
450 goto err;
451
452 DECLARE_ARG("--inherit-fd");
453 DECLARE_ARG(buf);
454 }
455 if (opts->console_name) {
456 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
457 SYSERROR("sprintf'd too many bytes");
458 }
459 DECLARE_ARG("--ext-mount-map");
460 DECLARE_ARG(buf);
461 }
462
13389b29
TA
463 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
464
465 if (lxc_conf->lsm_aa_profile)
466 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
467 else
468 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
469
470 if (ret < 0 || ret >= sizeof(buf))
471 goto err;
472
473 DECLARE_ARG("--lsm-profile");
474 DECLARE_ARG(buf);
475 }
476
e29fe1dd
TA
477 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
478
fa071249
TA
479 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
480 if (!m)
481 goto err;
e29fe1dd
TA
482 argv = m;
483
484 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
485 char eth[128], *veth;
46c8ffd5 486 char *fmt;
e29fe1dd 487 struct lxc_netdev *n = it->elem;
46c8ffd5
AR
488 bool external_not_veth;
489
490 if (strcmp(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
491 /* Since criu version 2.8 the usage of --veth-pair
492 * has been deprecated:
493 * git tag --contains f2037e6d3445fc400
494 * v2.8 */
495 external_not_veth = true;
496 } else {
497 external_not_veth = false;
498 }
e29fe1dd
TA
499
500 if (n->name) {
501 if (strlen(n->name) >= sizeof(eth))
502 goto err;
503 strncpy(eth, n->name, sizeof(eth));
796a109d
TA
504 } else {
505 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
506 if (ret < 0 || ret >= sizeof(eth))
507 goto err;
508 }
e29fe1dd 509
e2697330
TA
510 switch (n->type) {
511 case LXC_NET_VETH:
512 veth = n->priv.veth_attr.pair;
e29fe1dd 513
46c8ffd5
AR
514 if (n->link) {
515 if (external_not_veth)
516 fmt = "veth[%s]:%s@%s";
517 else
518 fmt = "%s=%s@%s";
519
520 ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
521 } else {
522 if (external_not_veth)
523 fmt = "veth[%s]:%s";
524 else
525 fmt = "%s=%s";
526
527 ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
528 }
e2697330
TA
529 if (ret < 0 || ret >= sizeof(buf))
530 goto err;
531 break;
532 case LXC_NET_MACVLAN:
e2697330 533 if (!n->link) {
9f1f54b0 534 ERROR("no host interface for macvlan %s", n->name);
e2697330
TA
535 goto err;
536 }
537
538 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
539 if (ret < 0 || ret >= sizeof(buf))
540 goto err;
541 break;
542 case LXC_NET_NONE:
543 case LXC_NET_EMPTY:
544 break;
545 default:
546 /* we have screened for this earlier... */
9f1f54b0 547 ERROR("unexpected network type %d", n->type);
e29fe1dd 548 goto err;
e2697330 549 }
e29fe1dd 550
46c8ffd5
AR
551 if (external_not_veth)
552 DECLARE_ARG("--external");
553 else
554 DECLARE_ARG("--veth-pair");
e29fe1dd 555 DECLARE_ARG(buf);
2f3fbc6b 556 netnr++;
e29fe1dd
TA
557 }
558
559 }
560
561 argv[argc] = NULL;
562
cf4b07a5 563 buf[0] = 0;
a17fa3c0 564 pos = 0;
72a30576 565
cf4b07a5 566 for (i = 0; argv[i]; i++) {
72a30576
NE
567 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
568 if (ret < 0 || ret >= sizeof(buf) - pos)
569 goto err;
570 else
571 pos += ret;
cf4b07a5
TA
572 }
573
574 INFO("execing: %s", buf);
575
5af85cb1
TA
576 /* before criu inits its log, it sometimes prints things to stdout/err;
577 * let's be sure we capture that.
578 */
579 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
580 SYSERROR("dup2 stdout failed");
581 goto err;
582 }
583
584 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
585 SYSERROR("dup2 stderr failed");
586 goto err;
587 }
588
589 close(opts->pipefd);
590
e29fe1dd
TA
591#undef DECLARE_ARG
592 execv(argv[0], argv);
593err:
e29fe1dd
TA
594 for (i = 0; argv[i]; i++)
595 free(argv[i]);
596 free(argv);
597}
598
8ba5ced7
TA
599/*
600 * Check to see if the criu version is recent enough for all the features we
601 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
602 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
603 * things potentially before a version is released with a particular feature.
604 *
605 * The intent is that when criu development slows down, we can drop this, but
606 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
607 *
608 * Note: If version != NULL criu_version() stores the detected criu version in
609 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 610 */
5407e2ab 611static bool criu_version_ok(char **version)
8ba5ced7
TA
612{
613 int pipes[2];
614 pid_t pid;
615
616 if (pipe(pipes) < 0) {
617 SYSERROR("pipe() failed");
618 return false;
619 }
620
621 pid = fork();
622 if (pid < 0) {
623 SYSERROR("fork() failed");
624 return false;
625 }
626
627 if (pid == 0) {
628 char *args[] = { "criu", "--version", NULL };
755fa453 629 char *path;
8ba5ced7
TA
630 close(pipes[0]);
631
632 close(STDERR_FILENO);
633 if (dup2(pipes[1], STDOUT_FILENO) < 0)
634 exit(1);
635
755fa453 636 path = on_path("criu", NULL);
d9b32b09
SH
637 if (!path)
638 exit(1);
639
755fa453 640 execv(path, args);
8ba5ced7
TA
641 exit(1);
642 } else {
643 FILE *f;
5407e2ab 644 char *tmp;
8ba5ced7
TA
645 int patch;
646
647 close(pipes[1]);
648 if (wait_for_pid(pid) < 0) {
649 close(pipes[0]);
4eae4051 650 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
651 return false;
652 }
653
654 f = fdopen(pipes[0], "r");
655 if (!f) {
656 close(pipes[0]);
657 return false;
658 }
659
5407e2ab
CB
660 tmp = malloc(1024);
661 if (!tmp) {
662 fclose(f);
663 return false;
664 }
665
666 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
667 goto version_error;
668
669 if (fgetc(f) != '\n')
670 goto version_error;
671
5407e2ab 672 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
673 goto version_match;
674
5407e2ab 675 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
676 goto version_error;
677
678 if (fgetc(f) != '-')
679 goto version_error;
680
681 if (fscanf(f, "%d", &patch) != 1)
682 goto version_error;
683
5407e2ab 684 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
685 goto version_error;
686
687 if (patch < CRIU_GITID_PATCHLEVEL)
688 goto version_error;
689
690version_match:
3158ab5b 691 fclose(f);
5407e2ab
CB
692 if (!version)
693 free(tmp);
694 else
695 *version = tmp;
8ba5ced7
TA
696 return true;
697
698version_error:
3158ab5b 699 fclose(f);
5407e2ab 700 free(tmp);
9f1f54b0 701 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
8ba5ced7
TA
702 return false;
703 }
704}
705
e29fe1dd
TA
706/* Check and make sure the container has a configuration that we know CRIU can
707 * dump. */
f1954503 708static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
709{
710 struct lxc_list *it;
e29fe1dd 711
f1954503 712 if (!criu_version_ok(criu_version))
8ba5ced7
TA
713 return false;
714
e29fe1dd 715 if (geteuid()) {
9f1f54b0 716 ERROR("Must be root to checkpoint");
e29fe1dd
TA
717 return false;
718 }
719
720 /* We only know how to restore containers with veth networks. */
721 lxc_list_for_each(it, &c->lxc_conf->network) {
722 struct lxc_netdev *n = it->elem;
65b20221
TA
723 switch(n->type) {
724 case LXC_NET_VETH:
725 case LXC_NET_NONE:
726 case LXC_NET_EMPTY:
e2697330 727 case LXC_NET_MACVLAN:
65b20221
TA
728 break;
729 default:
9f1f54b0 730 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
e29fe1dd
TA
731 return false;
732 }
733 }
734
e29fe1dd
TA
735 return true;
736}
737
e29fe1dd
TA
738static bool restore_net_info(struct lxc_container *c)
739{
740 struct lxc_list *it;
741 bool has_error = true;
742
743 if (container_mem_lock(c))
744 return false;
745
746 lxc_list_for_each(it, &c->lxc_conf->network) {
747 struct lxc_netdev *netdev = it->elem;
748 char template[IFNAMSIZ];
65b20221
TA
749
750 if (netdev->type != LXC_NET_VETH)
751 continue;
752
e29fe1dd
TA
753 snprintf(template, sizeof(template), "vethXXXXXX");
754
755 if (!netdev->priv.veth_attr.pair)
756 netdev->priv.veth_attr.pair = lxc_mkifname(template);
757
758 if (!netdev->priv.veth_attr.pair)
759 goto out_unlock;
760 }
761
762 has_error = false;
763
764out_unlock:
765 container_mem_unlock(c);
766 return !has_error;
767}
768
aef3d51e
TA
769// do_restore never returns, the calling process is used as the
770// monitor process. do_restore calls exit() if it fails.
c33b0338 771static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd
TA
772{
773 pid_t pid;
e29fe1dd 774 struct lxc_handler *handler;
a7fb6043 775 int status, fd;
9b1e2e6e 776 int pipes[2] = {-1, -1};
e29fe1dd 777
a7fb6043
TA
778 /* Try to detach from the current controlling tty if it exists.
779 * Othwerise, lxc_init (via lxc_console) will attach the container's
780 * console output to the current tty, which is probably not what any
781 * library user wants, and if they do, they can just manually configure
782 * it :)
783 */
784 fd = open("/dev/tty", O_RDWR);
785 if (fd >= 0) {
786 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
787 SYSERROR("couldn't detach from tty");
788 close(fd);
789 }
790
e29fe1dd
TA
791 handler = lxc_init(c->name, c->lxc_conf, c->config_path);
792 if (!handler)
793 goto out;
794
795 if (!cgroup_init(handler)) {
796 ERROR("failed initing cgroups");
797 goto out_fini_handler;
798 }
799
800 if (!cgroup_create(handler)) {
801 ERROR("failed creating groups");
802 goto out_fini_handler;
803 }
804
805 if (!restore_net_info(c)) {
806 ERROR("failed restoring network info");
807 goto out_fini_handler;
808 }
809
810 resolve_clone_flags(handler);
811
3d9a5c85
TA
812 if (pipe(pipes) < 0) {
813 SYSERROR("pipe() failed");
814 goto out_fini_handler;
815 }
816
e29fe1dd
TA
817 pid = fork();
818 if (pid < 0)
819 goto out_fini_handler;
820
821 if (pid == 0) {
822 struct criu_opts os;
823 struct lxc_rootfs *rootfs;
4b54788e 824 int flags;
e29fe1dd 825
3d9a5c85
TA
826 close(status_pipe);
827 status_pipe = -1;
828
829 close(pipes[0]);
830 pipes[0] = -1;
e29fe1dd
TA
831
832 if (unshare(CLONE_NEWNS))
833 goto out_fini_handler;
834
835 /* CRIU needs the lxc root bind mounted so that it is the root of some
836 * mount. */
837 rootfs = &c->lxc_conf->rootfs;
838
839 if (rootfs_is_blockdev(c->lxc_conf)) {
840 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
841 goto out_fini_handler;
842 } else {
843 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
844 goto out_fini_handler;
845
846 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
847 SYSERROR("remount / to private failed");
848 goto out_fini_handler;
849 }
850
851 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
852 rmdir(rootfs->mount);
853 goto out_fini_handler;
854 }
855 }
856
5af85cb1 857 os.pipefd = pipes[1];
e29fe1dd 858 os.action = "restore";
b2c3710f 859 os.user = opts;
e29fe1dd 860 os.c = c;
4b54788e 861 os.console_fd = c->lxc_conf->console.slave;
f1954503 862 os.criu_version = criu_version;
0ab5703f 863 os.handler = handler;
4b54788e 864
97e4f1a9
TA
865 if (os.console_fd >= 0) {
866 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
867 * via --inherit-fd, so we don't want it to close.
868 */
869 flags = fcntl(os.console_fd, F_GETFD);
870 if (flags < 0) {
871 SYSERROR("F_GETFD failed: %d", os.console_fd);
872 goto out_fini_handler;
873 }
4b54788e 874
97e4f1a9 875 flags &= ~FD_CLOEXEC;
4b54788e 876
97e4f1a9
TA
877 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
878 SYSERROR("F_SETFD failed");
879 goto out_fini_handler;
880 }
4b54788e
TA
881 }
882 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
883
884 /* exec_criu() returning is an error */
7103fe6f 885 exec_criu(&os);
e29fe1dd
TA
886 umount(rootfs->mount);
887 rmdir(rootfs->mount);
888 goto out_fini_handler;
889 } else {
890 int ret;
891 char title[2048];
892
3d9a5c85
TA
893 close(pipes[1]);
894 pipes[1] = -1;
895
e29fe1dd
TA
896 pid_t w = waitpid(pid, &status, 0);
897 if (w == -1) {
898 SYSERROR("waitpid");
899 goto out_fini_handler;
900 }
901
e29fe1dd 902 if (WIFEXITED(status)) {
75d219f0
TA
903 char buf[4096];
904
e29fe1dd 905 if (WEXITSTATUS(status)) {
3d9a5c85
TA
906 int n;
907
908 n = read(pipes[0], buf, sizeof(buf));
909 if (n < 0) {
910 SYSERROR("failed reading from criu stderr");
911 goto out_fini_handler;
912 }
913
2735dfae
TA
914 if (n == sizeof(buf))
915 n--;
3d9a5c85
TA
916 buf[n] = 0;
917
9f1f54b0 918 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
e29fe1dd
TA
919 goto out_fini_handler;
920 } else {
3eba9b49 921 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
75d219f0
TA
922 if (ret < 0 || ret >= sizeof(buf)) {
923 ERROR("snprintf'd too many characters: %d", ret);
924 goto out_fini_handler;
925 }
926
927 FILE *f = fopen(buf, "r");
e29fe1dd 928 if (!f) {
9f1f54b0 929 SYSERROR("couldn't read restore's children file %s", buf);
e29fe1dd
TA
930 goto out_fini_handler;
931 }
932
933 ret = fscanf(f, "%d", (int*) &handler->pid);
934 fclose(f);
935 if (ret != 1) {
936 ERROR("reading restore pid failed");
937 goto out_fini_handler;
938 }
939
f8a41688
TA
940 if (lxc_set_state(c->name, handler, RUNNING)) {
941 ERROR("error setting running state after restore");
e29fe1dd 942 goto out_fini_handler;
f8a41688 943 }
e29fe1dd
TA
944 }
945 } else {
9f1f54b0 946 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
e29fe1dd
TA
947 goto out_fini_handler;
948 }
949
3d9a5c85
TA
950 close(pipes[0]);
951
f3886023
TA
952 ret = write(status_pipe, &status, sizeof(status));
953 close(status_pipe);
954 status_pipe = -1;
955
956 if (sizeof(status) != ret) {
957 SYSERROR("failed to write all of status");
958 goto out_fini_handler;
959 }
960
e29fe1dd
TA
961 /*
962 * See comment in lxcapi_start; we don't care if these
963 * fail because it's just a beauty thing. We just
964 * assign the return here to silence potential.
965 */
966 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
967 ret = setproctitle(title);
968
969 ret = lxc_poll(c->name, handler);
970 if (ret)
971 lxc_abort(c->name, handler);
972 lxc_fini(c->name, handler);
973 exit(ret);
974 }
975
976out_fini_handler:
3d9a5c85
TA
977 if (pipes[0] >= 0)
978 close(pipes[0]);
979 if (pipes[1] >= 0)
980 close(pipes[1]);
981
e29fe1dd
TA
982 lxc_fini(c->name, handler);
983
984out:
3d9a5c85 985 if (status_pipe >= 0) {
f3886023
TA
986 /* ensure getting here was a failure, e.g. if we failed to
987 * parse the child pid or something, even after a successful
988 * restore
989 */
990 if (!status)
991 status = 1;
3d9a5c85 992 if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) {
e29fe1dd
TA
993 SYSERROR("writing status failed");
994 }
3d9a5c85 995 close(status_pipe);
e29fe1dd
TA
996 }
997
998 exit(1);
999}
aef3d51e 1000
4b54788e
TA
1001static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1002{
1003 FILE *f;
1004 char path[PATH_MAX];
1005 int ret;
1006 struct stat sb;
1007
1008 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1009 tty_id[0] = 0;
1010 return 0;
1011 }
1012
1013 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1014 if (ret < 0 || ret >= sizeof(path)) {
1015 ERROR("snprintf'd too many chacters: %d", ret);
1016 return -1;
1017 }
1018
1019 ret = stat(path, &sb);
1020 if (ret < 0) {
1021 SYSERROR("stat of %s failed", path);
1022 return -1;
1023 }
1024
1025 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1026 if (ret < 0 || ret >= sizeof(path)) {
1027 ERROR("snprintf'd too many characters: %d", ret);
1028 return -1;
1029 }
1030
f03280a7
TA
1031 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1032 (long long unsigned) sb.st_rdev,
1033 (long long unsigned) sb.st_dev);
4b54788e
TA
1034 if (ret < 0 || ret >= sizeof(path)) {
1035 ERROR("snprintf'd too many characters: %d", ret);
1036 return -1;
1037 }
1038
1039 f = fopen(path, "w");
1040 if (!f) {
1041 SYSERROR("failed to open %s", path);
1042 return -1;
1043 }
1044
1045 ret = fprintf(f, "%s", tty_id);
1046 fclose(f);
1047 if (ret < 0)
1048 SYSERROR("failed to write to %s", path);
1049 return ret;
1050}
1051
aef3d51e 1052/* do one of either predump or a regular dump */
b2c3710f 1053static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e
TA
1054{
1055 pid_t pid;
f1954503 1056 char *criu_version = NULL;
5af85cb1 1057 int criuout[2];
aef3d51e 1058
f1954503 1059 if (!criu_ok(c, &criu_version))
aef3d51e
TA
1060 return false;
1061
5af85cb1
TA
1062 if (pipe(criuout) < 0) {
1063 SYSERROR("pipe() failed");
aef3d51e 1064 return false;
5af85cb1
TA
1065 }
1066
1067 if (mkdir_p(opts->directory, 0700) < 0)
1068 goto fail;
aef3d51e
TA
1069
1070 pid = fork();
1071 if (pid < 0) {
1072 SYSERROR("fork failed");
5af85cb1 1073 goto fail;
aef3d51e
TA
1074 }
1075
1076 if (pid == 0) {
1077 struct criu_opts os;
0ab5703f
TA
1078 struct lxc_handler h;
1079
5af85cb1
TA
1080 close(criuout[0]);
1081
0ab5703f
TA
1082 h.name = c->name;
1083 if (!cgroup_init(&h)) {
1084 ERROR("failed to cgroup_init()");
1085 exit(1);
1086 }
aef3d51e 1087
5af85cb1 1088 os.pipefd = criuout[1];
aef3d51e 1089 os.action = mode;
b2c3710f 1090 os.user = opts;
aef3d51e 1091 os.c = c;
4b54788e 1092 os.console_name = c->lxc_conf->console.path;
f1954503 1093 os.criu_version = criu_version;
74eb576c 1094
b2c3710f 1095 if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
4b54788e 1096 exit(1);
aef3d51e
TA
1097
1098 /* exec_criu() returning is an error */
7103fe6f 1099 exec_criu(&os);
aef3d51e
TA
1100 exit(1);
1101 } else {
1102 int status;
5af85cb1
TA
1103 ssize_t n;
1104 char buf[4096];
1105 bool ret;
1106
1107 close(criuout[1]);
1108
aef3d51e
TA
1109 pid_t w = waitpid(pid, &status, 0);
1110 if (w == -1) {
1111 SYSERROR("waitpid");
5af85cb1 1112 close(criuout[0]);
aef3d51e
TA
1113 return false;
1114 }
1115
5af85cb1
TA
1116 n = read(criuout[0], buf, sizeof(buf));
1117 close(criuout[0]);
1118 if (n < 0) {
1119 SYSERROR("read");
1120 n = 0;
1121 }
1122 buf[n] = 0;
1123
aef3d51e
TA
1124 if (WIFEXITED(status)) {
1125 if (WEXITSTATUS(status)) {
9f1f54b0 1126 ERROR("dump failed with %d", WEXITSTATUS(status));
5af85cb1
TA
1127 ret = false;
1128 } else {
1129 ret = true;
aef3d51e 1130 }
aef3d51e 1131 } else if (WIFSIGNALED(status)) {
9f1f54b0 1132 ERROR("dump signaled with %d", WTERMSIG(status));
5af85cb1 1133 ret = false;
aef3d51e 1134 } else {
9f1f54b0 1135 ERROR("unknown dump exit %d", status);
5af85cb1 1136 ret = false;
aef3d51e 1137 }
5af85cb1
TA
1138
1139 if (!ret)
1140 ERROR("criu output: %s", buf);
1141 return ret;
aef3d51e 1142 }
5af85cb1
TA
1143fail:
1144 close(criuout[0]);
1145 close(criuout[1]);
1146 rmdir(opts->directory);
1147 return false;
aef3d51e
TA
1148}
1149
/* Run a criu "pre-dump" of container c using the given migrate options.
 * Returns whatever do_dump() reports.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool dumped = do_dump(c, "pre-dump", opts);

	return dumped;
}
1154
b2c3710f 1155bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
1156{
1157 char path[PATH_MAX];
1158 int ret;
1159
b2c3710f 1160 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
1161 if (ret < 0 || ret >= sizeof(path))
1162 return false;
1163
1164 if (access(path, F_OK) == 0) {
9f1f54b0 1165 ERROR("please use a fresh directory for the dump directory");
aef3d51e
TA
1166 return false;
1167 }
1168
b2c3710f 1169 return do_dump(c, "dump", opts);
aef3d51e
TA
1170}
1171
/* Restore container c from the images described by opts.
 *
 * Forks a child that runs do_restore(); that child either becomes the
 * container's monitor process or exits. The child reports the outcome as a
 * single int over a pipe.
 *
 * Returns true if the child reported that criu exited with status 0, false
 * otherwise. The detected criu version string is released on every return
 * path of the calling process.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t pid;
	int status, nread;
	int pipefd[2];
	char *criu_version = NULL;
	bool ret = false;

	if (!criu_ok(c, &criu_version))
		goto out;

	if (geteuid()) {
		ERROR("Must be root to restore");
		goto out;
	}

	if (pipe(pipefd)) {
		ERROR("failed to create pipe");
		goto out;
	}

	pid = fork();
	if (pid < 0) {
		close(pipefd[0]);
		close(pipefd[1]);
		goto out;
	}

	if (pid == 0) {
		close(pipefd[0]);
		/* this never returns */
		do_restore(c, pipefd[1], opts, criu_version);
	}

	close(pipefd[1]);

	nread = read(pipefd[0], &status, sizeof(status));
	close(pipefd[0]);
	if (nread != (int)sizeof(status)) {
		ERROR("reading status from pipe failed");
		goto err_wait;
	}

	/* If the criu process was killed or exited nonzero, wait() for the
	 * handler, since the restore process died. Otherwise, we don't need to
	 * wait, since the child becomes the monitor process.
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status))
		goto err_wait;

	ret = true;
	goto out;

err_wait:
	if (wait_for_pid(pid))
		ERROR("restore process died");

out:
	/* the child has its own copy of the version string */
	free(criu_version);
	return ret;
}