]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/criu.c
start: fix cgroup namespace preservation
[mirror_lxc.git] / src / lxc / criu.c
CommitLineData
e29fe1dd
TA
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2014-2015 Canonical Ltd.
5 *
6 * Authors:
7 * Tycho Andersen <tycho.andersen@canonical.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23#define _GNU_SOURCE
9b945f13 24#include <inttypes.h>
e29fe1dd
TA
25#include <linux/limits.h>
26#include <sched.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/mount.h>
31#include <sys/types.h>
32#include <sys/wait.h>
33#include <unistd.h>
34
35#include "config.h"
36
e29fe1dd
TA
37#include "cgroup.h"
38#include "conf.h"
dc259399 39#include "commands.h"
e29fe1dd
TA
40#include "criu.h"
41#include "log.h"
42#include "lxc.h"
43#include "lxclock.h"
44#include "network.h"
28d832c4 45#include "storage.h"
e29fe1dd
TA
46#include "utils.h"
47
5f4e44a2
TA
48#if IS_BIONIC
49#include <../include/lxcmntent.h>
50#else
51#include <mntent.h>
52#endif
53
c33b0338 54#define CRIU_VERSION "2.0"
73d46752
TA
55
56#define CRIU_GITID_VERSION "2.0"
57#define CRIU_GITID_PATCHLEVEL 0
58
f1954503 59#define CRIU_IN_FLIGHT_SUPPORT "2.4"
46c8ffd5 60#define CRIU_EXTERNAL_NOT_VETH "2.8"
f1954503 61
e29fe1dd
TA
62lxc_log_define(lxc_criu, lxc);
63
/* Everything exec_criu() needs in order to build and run one criu command. */
struct criu_opts {
	/* write end of the pipe that criu's stdout and stderr are redirected to */
	int pipefd;

	/* which criu action to run: "dump", "pre-dump" or "restore" */
	char *action;

	/* caller-supplied migrate options that apply to this action */
	struct migrate_opts *user;

	/* the container being checkpointed or restored */
	struct lxc_container *c;

	/* dump: criu's id for /dev/console, i.e. "tty[${rdev}:${dev}]";
	 * empty string when there is no console to hand over
	 */
	char tty_id[32];

	/* restore: handler created by the restoring task; used to look up the
	 * container's cgroup paths
	 */
	struct lxc_handler *handler;

	/* restore: console fd passed to criu via --inherit-fd, < 0 if none */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console.path = "none".
	 */
	char *console_name;

	/* version string of the criu binary that was detected */
	char *criu_version;
};
93
4b54788e
TA
94static int load_tty_major_minor(char *directory, char *output, int len)
95{
96 FILE *f;
97 char path[PATH_MAX];
98 int ret;
99
100 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
101 if (ret < 0 || ret >= sizeof(path)) {
102 ERROR("snprintf'd too many chacters: %d", ret);
103 return -1;
104 }
105
106 f = fopen(path, "r");
107 if (!f) {
108 /* This means we're coming from a liblxc which didn't export
3aed4934
CB
109 * the tty info. In this case they had to have lxc.console.path
110 * = * none, so there's no problem restoring.
4b54788e
TA
111 */
112 if (errno == ENOENT)
113 return 0;
114
115 SYSERROR("couldn't open %s", path);
116 return -1;
117 }
118
119 if (!fgets(output, len, f)) {
120 fclose(f);
121 SYSERROR("couldn't read %s", path);
122 return -1;
123 }
124
125 fclose(f);
126 return 0;
127}
128
74ad3607
FB
/*
 * Compare two dotted version strings of the form "major[.minor[.patch]]"
 * numerically, component by component.
 *
 * Components that are not present count as 0, so "2.8" equals "2.8.0".
 *
 * Returns 1 if @v1 is newer than @v2, 0 if they are equal and -1 if @v1 is
 * older. -1 is also returned when either string is NULL or does not start
 * with a number, so callers testing ">= 0" treat unparsable versions as
 * "too old".
 */
static int cmp_version(const char *v1, const char *v2)
{
	int ret;
	size_t i;
	int oct_v1[3] = {0, 0, 0};
	int oct_v2[3] = {0, 0, 0};

	if (!v1 || !v2)
		return -1;

	ret = sscanf(v1, "%d.%d.%d", &oct_v1[0], &oct_v1[1], &oct_v1[2]);
	if (ret < 1)
		return -1;

	ret = sscanf(v2, "%d.%d.%d", &oct_v2[0], &oct_v2[1], &oct_v2[2]);
	if (ret < 1)
		return -1;

	/* major, then minor, then patch: first difference decides */
	for (i = 0; i < 3; i++) {
		if (oct_v1[i] > oct_v2[i])
			return 1;

		if (oct_v1[i] < oct_v2[i])
			return -1;
	}

	return 0;
}
169
9451eeff 170static void exec_criu(struct criu_opts *opts)
e29fe1dd
TA
171{
172 char **argv, log[PATH_MAX];
19d1509c 173 int static_args = 23, argc = 0, i, ret;
e29fe1dd
TA
174 int netnr = 0;
175 struct lxc_list *it;
5f4e44a2
TA
176 FILE *mnts;
177 struct mntent mntent;
e29fe1dd 178
a17fa3c0
NE
179 char buf[4096], tty_info[32];
180 size_t pos;
5af85cb1 181
e9195050
TA
182 /* If we are currently in a cgroup /foo/bar, and the container is in a
183 * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
184 * container has an open fd that points to one of the cgroup files
185 * (systemd always opens its "root" cgroup). So, let's escape to the
186 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
187 * see all cgroups.
188 */
7103fe6f 189 if (!cgroup_escape()) {
e9195050
TA
190 ERROR("failed to escape cgroups");
191 return;
192 }
193
e29fe1dd 194 /* The command line always looks like:
19d1509c 195 * criu $(action) --tcp-established --file-locks --link-remap \
5f178bc9 196 * --manage-cgroups=full --action-script foo.sh -D $(directory) \
e29fe1dd
TA
197 * -o $(directory)/$(action).log --ext-mount-map auto
198 * --enable-external-sharing --enable-external-masters
4b54788e 199 * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
e29fe1dd
TA
200 * +1 for final NULL */
201
aef3d51e 202 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399
TA
203 /* -t pid --freeze-cgroup /lxc/ct */
204 static_args += 4;
e29fe1dd 205
aef3d51e 206 /* --prev-images-dir <path-to-directory-A-relative-to-B> */
b2c3710f 207 if (opts->user->predump_dir)
aef3d51e
TA
208 static_args += 2;
209
74eb576c 210 /* --page-server --address <address> --port <port> */
b2c3710f 211 if (opts->user->pageserver_address && opts->user->pageserver_port)
74eb576c
NE
212 static_args += 5;
213
aef3d51e 214 /* --leave-running (only for final dump) */
b2c3710f 215 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd 216 static_args++;
4b54788e
TA
217
218 /* --external tty[88,4] */
219 if (opts->tty_id[0])
220 static_args += 2;
19d1509c
TA
221
222 /* --force-irmap */
223 if (!opts->user->preserves_inodes)
224 static_args++;
b2b7b0d2
TA
225
226 /* --ghost-limit 1024 */
227 if (opts->user->ghost_limit)
228 static_args += 2;
e29fe1dd
TA
229 } else if (strcmp(opts->action, "restore") == 0) {
230 /* --root $(lxc_mount_point) --restore-detached
0ab5703f 231 * --restore-sibling
13389b29
TA
232 * --lsm-profile apparmor:whatever
233 */
0ab5703f 234 static_args += 6;
4b54788e
TA
235
236 tty_info[0] = 0;
b2c3710f 237 if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
4b54788e
TA
238 return;
239
240 /* --inherit-fd fd[%d]:tty[%s] */
241 if (tty_info[0])
242 static_args += 2;
e29fe1dd
TA
243 } else {
244 return;
245 }
246
09e80d0c
TA
247 if (cgroup_num_hierarchies() > 0)
248 static_args += 2 * cgroup_num_hierarchies();
0ab5703f 249
b2c3710f 250 if (opts->user->verbose)
e29fe1dd
TA
251 static_args++;
252
b9ee6643
TA
253 if (opts->user->action_script)
254 static_args += 2;
255
5f4e44a2
TA
256 static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
257
b2c3710f 258 ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
e29fe1dd 259 if (ret < 0 || ret >= PATH_MAX) {
9f1f54b0 260 ERROR("logfile name too long");
e29fe1dd
TA
261 return;
262 }
263
264 argv = malloc(static_args * sizeof(*argv));
265 if (!argv)
266 return;
267
268 memset(argv, 0, static_args * sizeof(*argv));
269
270#define DECLARE_ARG(arg) \
271 do { \
272 if (arg == NULL) { \
273 ERROR("Got NULL argument for criu"); \
274 goto err; \
275 } \
276 argv[argc++] = strdup(arg); \
277 if (!argv[argc-1]) \
278 goto err; \
279 } while (0)
280
281 argv[argc++] = on_path("criu", NULL);
282 if (!argv[argc-1]) {
9f1f54b0 283 ERROR("Couldn't find criu binary");
e29fe1dd
TA
284 goto err;
285 }
286
287 DECLARE_ARG(opts->action);
288 DECLARE_ARG("--tcp-established");
289 DECLARE_ARG("--file-locks");
290 DECLARE_ARG("--link-remap");
0a5fc6df 291 DECLARE_ARG("--manage-cgroups=full");
e29fe1dd
TA
292 DECLARE_ARG("--ext-mount-map");
293 DECLARE_ARG("auto");
294 DECLARE_ARG("--enable-external-sharing");
295 DECLARE_ARG("--enable-external-masters");
dd62857a
TA
296 DECLARE_ARG("--enable-fs");
297 DECLARE_ARG("hugetlbfs");
5b454329
TA
298 DECLARE_ARG("--enable-fs");
299 DECLARE_ARG("tracefs");
e29fe1dd 300 DECLARE_ARG("-D");
b2c3710f 301 DECLARE_ARG(opts->user->directory);
e29fe1dd
TA
302 DECLARE_ARG("-o");
303 DECLARE_ARG(log);
304
0ab5703f
TA
305 for (i = 0; i < cgroup_num_hierarchies(); i++) {
306 char **controllers = NULL, *fullname;
31b204e4 307 char *path, *tmp;
0ab5703f
TA
308
309 if (!cgroup_get_hierarchies(i, &controllers)) {
310 ERROR("failed to get hierarchy %d", i);
311 goto err;
312 }
313
314 /* if we are in a dump, we have to ask the monitor process what
315 * the right cgroup is. if this is a restore, we can just use
316 * the handler the restore task created.
317 */
318 if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
319 path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
320 if (!path) {
321 ERROR("failed to get cgroup path for %s", controllers[0]);
322 goto err;
323 }
324 } else {
325 const char *p;
326
327 p = cgroup_get_cgroup(opts->handler, controllers[0]);
328 if (!p) {
329 ERROR("failed to get cgroup path for %s", controllers[0]);
330 goto err;
331 }
332
333 path = strdup(p);
334 if (!path) {
335 ERROR("strdup failed");
336 goto err;
337 }
338 }
339
31b204e4
CB
340 tmp = lxc_deslashify(path);
341 if (!tmp) {
342 ERROR("Failed to remove extraneous slashes from \"%s\"",
343 path);
0ab5703f
TA
344 free(path);
345 goto err;
346 }
31b204e4
CB
347 free(path);
348 path = tmp;
0ab5703f
TA
349
350 fullname = lxc_string_join(",", (const char **) controllers, false);
351 if (!fullname) {
352 ERROR("failed to join controllers");
353 free(path);
354 goto err;
355 }
356
357 ret = sprintf(buf, "%s:%s", fullname, path);
358 free(path);
359 free(fullname);
360 if (ret < 0 || ret >= sizeof(buf)) {
361 ERROR("sprintf of cgroup root arg failed");
362 goto err;
363 }
364
365 DECLARE_ARG("--cgroup-root");
366 DECLARE_ARG(buf);
367 }
368
b2c3710f 369 if (opts->user->verbose)
e29fe1dd
TA
370 DECLARE_ARG("-vvvvvv");
371
b9ee6643
TA
372 if (opts->user->action_script) {
373 DECLARE_ARG("--action-script");
374 DECLARE_ARG(opts->user->action_script);
375 }
376
5ef5c9a3 377 mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list);
5f4e44a2
TA
378 if (!mnts)
379 goto err;
380
381 while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
19d2422b 382 char *fmt, *key, *val, *mntdata;
5f4e44a2 383 char arg[2 * PATH_MAX + 2];
19d2422b
TA
384 unsigned long flags;
385
386 if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
387 goto err;
388
389 free(mntdata);
390
391 /* only add --ext-mount-map for actual bind mounts */
392 if (!(flags & MS_BIND))
393 continue;
5f4e44a2
TA
394
395 if (strcmp(opts->action, "dump") == 0) {
396 fmt = "/%s:%s";
397 key = mntent.mnt_dir;
398 val = mntent.mnt_dir;
399 } else {
400 fmt = "%s:%s";
401 key = mntent.mnt_dir;
402 val = mntent.mnt_fsname;
403 }
404
405 ret = snprintf(arg, sizeof(arg), fmt, key, val);
406 if (ret < 0 || ret >= sizeof(arg)) {
407 fclose(mnts);
408 ERROR("snprintf failed");
409 goto err;
410 }
411
412 DECLARE_ARG("--ext-mount-map");
413 DECLARE_ARG(arg);
414 }
415 fclose(mnts);
416
aef3d51e 417 if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
dc259399 418 char pid[32], *freezer_relative;
e29fe1dd
TA
419
420 if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0)
421 goto err;
422
423 DECLARE_ARG("-t");
424 DECLARE_ARG(pid);
dc259399
TA
425
426 freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
427 opts->c->config_path,
428 "freezer");
429 if (!freezer_relative) {
430 ERROR("failed getting freezer path");
431 goto err;
432 }
433
434 ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative);
435 if (ret < 0 || ret >= sizeof(log))
436 goto err;
437
f1954503
AR
438 if (!opts->user->disable_skip_in_flight &&
439 strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
440 DECLARE_ARG("--skip-in-flight");
441
dc259399
TA
442 DECLARE_ARG("--freeze-cgroup");
443 DECLARE_ARG(log);
444
4b54788e 445 if (opts->tty_id[0]) {
36d2096c
TA
446 DECLARE_ARG("--ext-mount-map");
447 DECLARE_ARG("/dev/console:console");
448
4b54788e
TA
449 DECLARE_ARG("--external");
450 DECLARE_ARG(opts->tty_id);
451 }
452
b2c3710f 453 if (opts->user->predump_dir) {
aef3d51e 454 DECLARE_ARG("--prev-images-dir");
b2c3710f 455 DECLARE_ARG(opts->user->predump_dir);
9f99a33f 456 DECLARE_ARG("--track-mem");
74eb576c 457 }
4c0c0319 458
b2c3710f 459 if (opts->user->pageserver_address && opts->user->pageserver_port) {
74eb576c
NE
460 DECLARE_ARG("--page-server");
461 DECLARE_ARG("--address");
b2c3710f 462 DECLARE_ARG(opts->user->pageserver_address);
74eb576c 463 DECLARE_ARG("--port");
b2c3710f 464 DECLARE_ARG(opts->user->pageserver_port);
74eb576c 465 }
aef3d51e 466
19d1509c
TA
467 if (!opts->user->preserves_inodes)
468 DECLARE_ARG("--force-irmap");
469
b2b7b0d2
TA
470 if (opts->user->ghost_limit) {
471 char ghost_limit[32];
472
9b945f13 473 ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
b2b7b0d2 474 if (ret < 0 || ret >= sizeof(ghost_limit)) {
9b945f13 475 ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
b2b7b0d2
TA
476 goto err;
477 }
478
479 DECLARE_ARG("--ghost-limit");
480 DECLARE_ARG(ghost_limit);
481 }
482
aef3d51e 483 /* only for final dump */
b2c3710f 484 if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
e29fe1dd
TA
485 DECLARE_ARG("--leave-running");
486 } else if (strcmp(opts->action, "restore") == 0) {
487 void *m;
488 int additional;
13389b29 489 struct lxc_conf *lxc_conf = opts->c->lxc_conf;
e29fe1dd
TA
490
491 DECLARE_ARG("--root");
492 DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
493 DECLARE_ARG("--restore-detached");
494 DECLARE_ARG("--restore-sibling");
e29fe1dd 495
4b54788e 496 if (tty_info[0]) {
97e4f1a9 497 if (opts->console_fd < 0) {
3aed4934 498 ERROR("lxc.console.path configured on source host but not target");
97e4f1a9
TA
499 goto err;
500 }
501
4b54788e
TA
502 ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info);
503 if (ret < 0 || ret >= sizeof(buf))
504 goto err;
505
506 DECLARE_ARG("--inherit-fd");
507 DECLARE_ARG(buf);
508 }
509 if (opts->console_name) {
510 if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) {
511 SYSERROR("sprintf'd too many bytes");
512 }
513 DECLARE_ARG("--ext-mount-map");
514 DECLARE_ARG(buf);
515 }
516
13389b29
TA
517 if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) {
518
519 if (lxc_conf->lsm_aa_profile)
520 ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile);
521 else
522 ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context);
523
524 if (ret < 0 || ret >= sizeof(buf))
525 goto err;
526
527 DECLARE_ARG("--lsm-profile");
528 DECLARE_ARG(buf);
529 }
530
e29fe1dd
TA
531 additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
532
fa071249
TA
533 m = realloc(argv, (argc + additional + 1) * sizeof(*argv));
534 if (!m)
535 goto err;
e29fe1dd
TA
536 argv = m;
537
538 lxc_list_for_each(it, &opts->c->lxc_conf->network) {
539 char eth[128], *veth;
46c8ffd5 540 char *fmt;
e29fe1dd 541 struct lxc_netdev *n = it->elem;
46c8ffd5
AR
542 bool external_not_veth;
543
74ad3607 544 if (cmp_version(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
46c8ffd5
AR
545 /* Since criu version 2.8 the usage of --veth-pair
546 * has been deprecated:
547 * git tag --contains f2037e6d3445fc400
548 * v2.8 */
549 external_not_veth = true;
550 } else {
551 external_not_veth = false;
552 }
e29fe1dd 553
42277b1c 554 if (n->name[0] != '\0') {
e29fe1dd
TA
555 if (strlen(n->name) >= sizeof(eth))
556 goto err;
557 strncpy(eth, n->name, sizeof(eth));
796a109d
TA
558 } else {
559 ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
560 if (ret < 0 || ret >= sizeof(eth))
561 goto err;
562 }
e29fe1dd 563
e2697330
TA
564 switch (n->type) {
565 case LXC_NET_VETH:
566 veth = n->priv.veth_attr.pair;
ea7f6b29
CB
567 if (veth[0] == '\0')
568 veth = n->priv.veth_attr.veth1;
e29fe1dd 569
de4855a8 570 if (n->link[0] != '\0') {
46c8ffd5
AR
571 if (external_not_veth)
572 fmt = "veth[%s]:%s@%s";
573 else
574 fmt = "%s=%s@%s";
575
576 ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
577 } else {
578 if (external_not_veth)
579 fmt = "veth[%s]:%s";
580 else
581 fmt = "%s=%s";
582
583 ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
584 }
e2697330
TA
585 if (ret < 0 || ret >= sizeof(buf))
586 goto err;
587 break;
588 case LXC_NET_MACVLAN:
de4855a8 589 if (n->link[0] == '\0') {
9f1f54b0 590 ERROR("no host interface for macvlan %s", n->name);
e2697330
TA
591 goto err;
592 }
593
594 ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
595 if (ret < 0 || ret >= sizeof(buf))
596 goto err;
597 break;
598 case LXC_NET_NONE:
599 case LXC_NET_EMPTY:
600 break;
601 default:
602 /* we have screened for this earlier... */
9f1f54b0 603 ERROR("unexpected network type %d", n->type);
e29fe1dd 604 goto err;
e2697330 605 }
e29fe1dd 606
46c8ffd5
AR
607 if (external_not_veth)
608 DECLARE_ARG("--external");
609 else
610 DECLARE_ARG("--veth-pair");
e29fe1dd 611 DECLARE_ARG(buf);
2f3fbc6b 612 netnr++;
e29fe1dd
TA
613 }
614
615 }
616
617 argv[argc] = NULL;
618
cf4b07a5 619 buf[0] = 0;
a17fa3c0 620 pos = 0;
72a30576 621
cf4b07a5 622 for (i = 0; argv[i]; i++) {
72a30576
NE
623 ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]);
624 if (ret < 0 || ret >= sizeof(buf) - pos)
625 goto err;
626 else
627 pos += ret;
cf4b07a5
TA
628 }
629
630 INFO("execing: %s", buf);
631
5af85cb1
TA
632 /* before criu inits its log, it sometimes prints things to stdout/err;
633 * let's be sure we capture that.
634 */
635 if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
636 SYSERROR("dup2 stdout failed");
637 goto err;
638 }
639
640 if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
641 SYSERROR("dup2 stderr failed");
642 goto err;
643 }
644
645 close(opts->pipefd);
646
e29fe1dd
TA
647#undef DECLARE_ARG
648 execv(argv[0], argv);
649err:
e29fe1dd
TA
650 for (i = 0; argv[i]; i++)
651 free(argv[i]);
652 free(argv);
653}
654
8ba5ced7
TA
655/*
656 * Check to see if the criu version is recent enough for all the features we
657 * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and
658 * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r
659 * things potentially before a version is released with a particular feature.
660 *
661 * The intent is that when criu development slows down, we can drop this, but
662 * for now we shouldn't attempt to c/r with versions that we know won't work.
5407e2ab
CB
663 *
664 * Note: If version != NULL criu_version() stores the detected criu version in
665 * version. Allocates memory for version which must be freed by caller.
8ba5ced7 666 */
5407e2ab 667static bool criu_version_ok(char **version)
8ba5ced7
TA
668{
669 int pipes[2];
670 pid_t pid;
671
672 if (pipe(pipes) < 0) {
673 SYSERROR("pipe() failed");
674 return false;
675 }
676
677 pid = fork();
678 if (pid < 0) {
679 SYSERROR("fork() failed");
680 return false;
681 }
682
683 if (pid == 0) {
684 char *args[] = { "criu", "--version", NULL };
755fa453 685 char *path;
8ba5ced7
TA
686 close(pipes[0]);
687
688 close(STDERR_FILENO);
689 if (dup2(pipes[1], STDOUT_FILENO) < 0)
690 exit(1);
691
755fa453 692 path = on_path("criu", NULL);
d9b32b09
SH
693 if (!path)
694 exit(1);
695
755fa453 696 execv(path, args);
8ba5ced7
TA
697 exit(1);
698 } else {
699 FILE *f;
5407e2ab 700 char *tmp;
8ba5ced7
TA
701 int patch;
702
703 close(pipes[1]);
704 if (wait_for_pid(pid) < 0) {
705 close(pipes[0]);
4eae4051 706 SYSERROR("execing criu failed, is it installed?");
8ba5ced7
TA
707 return false;
708 }
709
710 f = fdopen(pipes[0], "r");
711 if (!f) {
712 close(pipes[0]);
713 return false;
714 }
715
5407e2ab
CB
716 tmp = malloc(1024);
717 if (!tmp) {
718 fclose(f);
719 return false;
720 }
721
722 if (fscanf(f, "Version: %1023[^\n]s", tmp) != 1)
8ba5ced7
TA
723 goto version_error;
724
725 if (fgetc(f) != '\n')
726 goto version_error;
727
5407e2ab 728 if (strcmp(tmp, CRIU_VERSION) >= 0)
8ba5ced7
TA
729 goto version_match;
730
5407e2ab 731 if (fscanf(f, "GitID: v%1023[^-]s", tmp) != 1)
8ba5ced7
TA
732 goto version_error;
733
734 if (fgetc(f) != '-')
735 goto version_error;
736
737 if (fscanf(f, "%d", &patch) != 1)
738 goto version_error;
739
5407e2ab 740 if (strcmp(tmp, CRIU_GITID_VERSION) < 0)
8ba5ced7
TA
741 goto version_error;
742
743 if (patch < CRIU_GITID_PATCHLEVEL)
744 goto version_error;
745
746version_match:
3158ab5b 747 fclose(f);
5407e2ab
CB
748 if (!version)
749 free(tmp);
750 else
751 *version = tmp;
8ba5ced7
TA
752 return true;
753
754version_error:
3158ab5b 755 fclose(f);
5407e2ab 756 free(tmp);
9f1f54b0 757 ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
8ba5ced7
TA
758 return false;
759 }
760}
761
e29fe1dd
TA
762/* Check and make sure the container has a configuration that we know CRIU can
763 * dump. */
f1954503 764static bool criu_ok(struct lxc_container *c, char **criu_version)
e29fe1dd
TA
765{
766 struct lxc_list *it;
e29fe1dd 767
f1954503 768 if (!criu_version_ok(criu_version))
8ba5ced7
TA
769 return false;
770
e29fe1dd 771 if (geteuid()) {
9f1f54b0 772 ERROR("Must be root to checkpoint");
e29fe1dd
TA
773 return false;
774 }
775
776 /* We only know how to restore containers with veth networks. */
777 lxc_list_for_each(it, &c->lxc_conf->network) {
778 struct lxc_netdev *n = it->elem;
65b20221
TA
779 switch(n->type) {
780 case LXC_NET_VETH:
781 case LXC_NET_NONE:
782 case LXC_NET_EMPTY:
e2697330 783 case LXC_NET_MACVLAN:
65b20221
TA
784 break;
785 default:
9f1f54b0 786 ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
e29fe1dd
TA
787 return false;
788 }
789 }
790
e29fe1dd
TA
791 return true;
792}
793
e29fe1dd
TA
794static bool restore_net_info(struct lxc_container *c)
795{
796 struct lxc_list *it;
797 bool has_error = true;
798
799 if (container_mem_lock(c))
800 return false;
801
802 lxc_list_for_each(it, &c->lxc_conf->network) {
803 struct lxc_netdev *netdev = it->elem;
804 char template[IFNAMSIZ];
65b20221
TA
805
806 if (netdev->type != LXC_NET_VETH)
807 continue;
808
e29fe1dd
TA
809 snprintf(template, sizeof(template), "vethXXXXXX");
810
de4855a8
CB
811 if (netdev->priv.veth_attr.pair[0] == '\0' &&
812 netdev->priv.veth_attr.veth1[0] == '\0') {
966e9f1f 813 if (!lxc_mkifname(template))
de4855a8
CB
814 goto out_unlock;
815
966e9f1f 816 strcpy(netdev->priv.veth_attr.veth1, template);
de4855a8 817 }
e29fe1dd
TA
818 }
819
820 has_error = false;
821
822out_unlock:
823 container_mem_unlock(c);
824 return !has_error;
825}
826
1a0e70ac
CB
827/* do_restore never returns, the calling process is used as the monitor process.
828 * do_restore calls exit() if it fails.
829 */
c33b0338 830static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
e29fe1dd 831{
5af9369b 832 int fd, ret;
e29fe1dd 833 pid_t pid;
e29fe1dd 834 struct lxc_handler *handler;
113ebd57 835 int status = 0;
9b1e2e6e 836 int pipes[2] = {-1, -1};
e29fe1dd 837
a7fb6043
TA
838 /* Try to detach from the current controlling tty if it exists.
839 * Othwerise, lxc_init (via lxc_console) will attach the container's
840 * console output to the current tty, which is probably not what any
841 * library user wants, and if they do, they can just manually configure
842 * it :)
843 */
844 fd = open("/dev/tty", O_RDWR);
845 if (fd >= 0) {
846 if (ioctl(fd, TIOCNOTTY, NULL) < 0)
847 SYSERROR("couldn't detach from tty");
848 close(fd);
849 }
850
5e5576a4 851 handler = lxc_init_handler(c->name, c->lxc_conf, c->config_path, false);
e29fe1dd
TA
852 if (!handler)
853 goto out;
854
aa460476
CB
855 if (lxc_init(c->name, handler) < 0)
856 goto out;
857
e29fe1dd
TA
858 if (!cgroup_init(handler)) {
859 ERROR("failed initing cgroups");
860 goto out_fini_handler;
861 }
862
863 if (!cgroup_create(handler)) {
864 ERROR("failed creating groups");
865 goto out_fini_handler;
866 }
867
868 if (!restore_net_info(c)) {
869 ERROR("failed restoring network info");
870 goto out_fini_handler;
871 }
872
5af9369b
CB
873 ret = resolve_clone_flags(handler);
874 if (ret < 0) {
875 ERROR("%s - Unsupported clone flag specified", strerror(errno));
876 goto out_fini_handler;
877 }
e29fe1dd 878
3d9a5c85
TA
879 if (pipe(pipes) < 0) {
880 SYSERROR("pipe() failed");
881 goto out_fini_handler;
882 }
883
e29fe1dd
TA
884 pid = fork();
885 if (pid < 0)
886 goto out_fini_handler;
887
888 if (pid == 0) {
889 struct criu_opts os;
890 struct lxc_rootfs *rootfs;
4b54788e 891 int flags;
e29fe1dd 892
3d9a5c85
TA
893 close(status_pipe);
894 status_pipe = -1;
895
896 close(pipes[0]);
897 pipes[0] = -1;
e29fe1dd
TA
898
899 if (unshare(CLONE_NEWNS))
900 goto out_fini_handler;
901
902 /* CRIU needs the lxc root bind mounted so that it is the root of some
903 * mount. */
904 rootfs = &c->lxc_conf->rootfs;
905
906 if (rootfs_is_blockdev(c->lxc_conf)) {
907 if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
908 goto out_fini_handler;
909 } else {
910 if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
911 goto out_fini_handler;
912
913 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
914 SYSERROR("remount / to private failed");
915 goto out_fini_handler;
916 }
917
918 if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
919 rmdir(rootfs->mount);
920 goto out_fini_handler;
921 }
922 }
923
5af85cb1 924 os.pipefd = pipes[1];
e29fe1dd 925 os.action = "restore";
b2c3710f 926 os.user = opts;
e29fe1dd 927 os.c = c;
4b54788e 928 os.console_fd = c->lxc_conf->console.slave;
f1954503 929 os.criu_version = criu_version;
0ab5703f 930 os.handler = handler;
4b54788e 931
97e4f1a9
TA
932 if (os.console_fd >= 0) {
933 /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
934 * via --inherit-fd, so we don't want it to close.
935 */
936 flags = fcntl(os.console_fd, F_GETFD);
937 if (flags < 0) {
938 SYSERROR("F_GETFD failed: %d", os.console_fd);
939 goto out_fini_handler;
940 }
4b54788e 941
97e4f1a9 942 flags &= ~FD_CLOEXEC;
4b54788e 943
97e4f1a9
TA
944 if (fcntl(os.console_fd, F_SETFD, flags) < 0) {
945 SYSERROR("F_SETFD failed");
946 goto out_fini_handler;
947 }
4b54788e
TA
948 }
949 os.console_name = c->lxc_conf->console.name;
e29fe1dd
TA
950
951 /* exec_criu() returning is an error */
7103fe6f 952 exec_criu(&os);
e29fe1dd
TA
953 umount(rootfs->mount);
954 rmdir(rootfs->mount);
955 goto out_fini_handler;
956 } else {
957 int ret;
958 char title[2048];
959
3d9a5c85
TA
960 close(pipes[1]);
961 pipes[1] = -1;
962
e29fe1dd
TA
963 pid_t w = waitpid(pid, &status, 0);
964 if (w == -1) {
965 SYSERROR("waitpid");
966 goto out_fini_handler;
967 }
968
e29fe1dd 969 if (WIFEXITED(status)) {
75d219f0
TA
970 char buf[4096];
971
e29fe1dd 972 if (WEXITSTATUS(status)) {
3d9a5c85
TA
973 int n;
974
975 n = read(pipes[0], buf, sizeof(buf));
976 if (n < 0) {
977 SYSERROR("failed reading from criu stderr");
978 goto out_fini_handler;
979 }
980
2735dfae
TA
981 if (n == sizeof(buf))
982 n--;
3d9a5c85
TA
983 buf[n] = 0;
984
9f1f54b0 985 ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
e29fe1dd
TA
986 goto out_fini_handler;
987 } else {
3eba9b49 988 ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
75d219f0
TA
989 if (ret < 0 || ret >= sizeof(buf)) {
990 ERROR("snprintf'd too many characters: %d", ret);
991 goto out_fini_handler;
992 }
993
994 FILE *f = fopen(buf, "r");
e29fe1dd 995 if (!f) {
9f1f54b0 996 SYSERROR("couldn't read restore's children file %s", buf);
e29fe1dd
TA
997 goto out_fini_handler;
998 }
999
1000 ret = fscanf(f, "%d", (int*) &handler->pid);
1001 fclose(f);
1002 if (ret != 1) {
1003 ERROR("reading restore pid failed");
1004 goto out_fini_handler;
1005 }
1006
f8a41688
TA
1007 if (lxc_set_state(c->name, handler, RUNNING)) {
1008 ERROR("error setting running state after restore");
e29fe1dd 1009 goto out_fini_handler;
f8a41688 1010 }
e29fe1dd
TA
1011 }
1012 } else {
9f1f54b0 1013 ERROR("CRIU was killed with signal %d", WTERMSIG(status));
e29fe1dd
TA
1014 goto out_fini_handler;
1015 }
1016
3d9a5c85
TA
1017 close(pipes[0]);
1018
f3886023
TA
1019 ret = write(status_pipe, &status, sizeof(status));
1020 close(status_pipe);
1021 status_pipe = -1;
1022
1023 if (sizeof(status) != ret) {
1024 SYSERROR("failed to write all of status");
1025 goto out_fini_handler;
1026 }
1027
e29fe1dd
TA
1028 /*
1029 * See comment in lxcapi_start; we don't care if these
1030 * fail because it's just a beauty thing. We just
1031 * assign the return here to silence potential.
1032 */
1033 ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name);
223e30c1
CB
1034 if (ret < 0 || (size_t)ret >= sizeof(title))
1035 INFO("Setting truncated process name");
1036
e29fe1dd 1037 ret = setproctitle(title);
223e30c1
CB
1038 if (ret < 0)
1039 INFO("Failed to set process name");
e29fe1dd
TA
1040
1041 ret = lxc_poll(c->name, handler);
1042 if (ret)
1043 lxc_abort(c->name, handler);
1044 lxc_fini(c->name, handler);
1045 exit(ret);
1046 }
1047
1048out_fini_handler:
3d9a5c85
TA
1049 if (pipes[0] >= 0)
1050 close(pipes[0]);
1051 if (pipes[1] >= 0)
1052 close(pipes[1]);
1053
e29fe1dd
TA
1054 lxc_fini(c->name, handler);
1055
1056out:
3d9a5c85 1057 if (status_pipe >= 0) {
f3886023
TA
1058 /* ensure getting here was a failure, e.g. if we failed to
1059 * parse the child pid or something, even after a successful
1060 * restore
1061 */
1062 if (!status)
1063 status = 1;
113ebd57
CB
1064
1065 if (write(status_pipe, &status, sizeof(status)) != sizeof(status))
e29fe1dd 1066 SYSERROR("writing status failed");
3d9a5c85 1067 close(status_pipe);
e29fe1dd
TA
1068 }
1069
1070 exit(1);
1071}
aef3d51e 1072
4b54788e
TA
1073static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len)
1074{
1075 FILE *f;
1076 char path[PATH_MAX];
1077 int ret;
1078 struct stat sb;
1079
1080 if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) {
1081 tty_id[0] = 0;
1082 return 0;
1083 }
1084
1085 ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c));
1086 if (ret < 0 || ret >= sizeof(path)) {
1087 ERROR("snprintf'd too many chacters: %d", ret);
1088 return -1;
1089 }
1090
1091 ret = stat(path, &sb);
1092 if (ret < 0) {
1093 SYSERROR("stat of %s failed", path);
1094 return -1;
1095 }
1096
1097 ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
1098 if (ret < 0 || ret >= sizeof(path)) {
1099 ERROR("snprintf'd too many characters: %d", ret);
1100 return -1;
1101 }
1102
f03280a7
TA
1103 ret = snprintf(tty_id, len, "tty[%llx:%llx]",
1104 (long long unsigned) sb.st_rdev,
1105 (long long unsigned) sb.st_dev);
4b54788e
TA
1106 if (ret < 0 || ret >= sizeof(path)) {
1107 ERROR("snprintf'd too many characters: %d", ret);
1108 return -1;
1109 }
1110
1111 f = fopen(path, "w");
1112 if (!f) {
1113 SYSERROR("failed to open %s", path);
1114 return -1;
1115 }
1116
1117 ret = fprintf(f, "%s", tty_id);
1118 fclose(f);
1119 if (ret < 0)
1120 SYSERROR("failed to write to %s", path);
1121 return ret;
1122}
1123
aef3d51e 1124/* do one of either predump or a regular dump */
b2c3710f 1125static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
aef3d51e
TA
1126{
1127 pid_t pid;
f1954503 1128 char *criu_version = NULL;
5af85cb1 1129 int criuout[2];
aef3d51e 1130
f1954503 1131 if (!criu_ok(c, &criu_version))
aef3d51e
TA
1132 return false;
1133
5af85cb1
TA
1134 if (pipe(criuout) < 0) {
1135 SYSERROR("pipe() failed");
aef3d51e 1136 return false;
5af85cb1
TA
1137 }
1138
1139 if (mkdir_p(opts->directory, 0700) < 0)
1140 goto fail;
aef3d51e
TA
1141
1142 pid = fork();
1143 if (pid < 0) {
1144 SYSERROR("fork failed");
5af85cb1 1145 goto fail;
aef3d51e
TA
1146 }
1147
1148 if (pid == 0) {
1149 struct criu_opts os;
0ab5703f
TA
1150 struct lxc_handler h;
1151
5af85cb1
TA
1152 close(criuout[0]);
1153
0ab5703f
TA
1154 h.name = c->name;
1155 if (!cgroup_init(&h)) {
1156 ERROR("failed to cgroup_init()");
1157 exit(1);
1158 }
aef3d51e 1159
5af85cb1 1160 os.pipefd = criuout[1];
aef3d51e 1161 os.action = mode;
b2c3710f 1162 os.user = opts;
aef3d51e 1163 os.c = c;
4b54788e 1164 os.console_name = c->lxc_conf->console.path;
f1954503 1165 os.criu_version = criu_version;
74eb576c 1166
b2c3710f 1167 if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
4b54788e 1168 exit(1);
aef3d51e
TA
1169
1170 /* exec_criu() returning is an error */
7103fe6f 1171 exec_criu(&os);
aef3d51e
TA
1172 exit(1);
1173 } else {
1174 int status;
5af85cb1
TA
1175 ssize_t n;
1176 char buf[4096];
1177 bool ret;
1178
1179 close(criuout[1]);
1180
aef3d51e
TA
1181 pid_t w = waitpid(pid, &status, 0);
1182 if (w == -1) {
1183 SYSERROR("waitpid");
5af85cb1 1184 close(criuout[0]);
aef3d51e
TA
1185 return false;
1186 }
1187
5af85cb1
TA
1188 n = read(criuout[0], buf, sizeof(buf));
1189 close(criuout[0]);
1190 if (n < 0) {
1191 SYSERROR("read");
1192 n = 0;
1193 }
1194 buf[n] = 0;
1195
aef3d51e
TA
1196 if (WIFEXITED(status)) {
1197 if (WEXITSTATUS(status)) {
9f1f54b0 1198 ERROR("dump failed with %d", WEXITSTATUS(status));
5af85cb1
TA
1199 ret = false;
1200 } else {
1201 ret = true;
aef3d51e 1202 }
aef3d51e 1203 } else if (WIFSIGNALED(status)) {
9f1f54b0 1204 ERROR("dump signaled with %d", WTERMSIG(status));
5af85cb1 1205 ret = false;
aef3d51e 1206 } else {
9f1f54b0 1207 ERROR("unknown dump exit %d", status);
5af85cb1 1208 ret = false;
aef3d51e 1209 }
5af85cb1
TA
1210
1211 if (!ret)
1212 ERROR("criu output: %s", buf);
1213 return ret;
aef3d51e 1214 }
5af85cb1
TA
1215fail:
1216 close(criuout[0]);
1217 close(criuout[1]);
1218 rmdir(opts->directory);
1219 return false;
aef3d51e
TA
1220}
1221
/*
 * Run a criu "pre-dump" of container c using the given migration options.
 * Returns whatever do_dump() reports.
 */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	char *action = "pre-dump";

	return do_dump(c, action, opts);
}
1226
b2c3710f 1227bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts)
aef3d51e
TA
1228{
1229 char path[PATH_MAX];
1230 int ret;
1231
b2c3710f 1232 ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory);
aef3d51e
TA
1233 if (ret < 0 || ret >= sizeof(path))
1234 return false;
1235
1236 if (access(path, F_OK) == 0) {
9f1f54b0 1237 ERROR("please use a fresh directory for the dump directory");
aef3d51e
TA
1238 return false;
1239 }
1240
b2c3710f 1241 return do_dump(c, "dump", opts);
aef3d51e
TA
1242}
1243
/*
 * Restore container c from a checkpoint described by opts.
 *
 * Requires an effective uid of 0. A child is forked to run do_restore() with
 * the write end of a pipe; the parent reads an int status from the read end
 * and interprets it with the wait-status macros.
 *
 * Returns true when a complete status was read and it denotes a normal exit
 * with code 0. In every other case the child is reaped with wait_for_pid()
 * and false is returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	char *criu_version = NULL;
	int status_pipe[2];
	int restore_status, bytes;
	pid_t child;

	if (!criu_ok(c, &criu_version))
		return false;

	if (geteuid() != 0) {
		ERROR("Must be root to restore");
		return false;
	}

	if (pipe(status_pipe) != 0) {
		ERROR("failed to create pipe");
		return false;
	}

	child = fork();
	if (child < 0) {
		close(status_pipe[0]);
		close(status_pipe[1]);
		return false;
	}

	if (child == 0) {
		close(status_pipe[0]);
		/* this never returns */
		do_restore(c, status_pipe[1], opts, criu_version);
	}

	close(status_pipe[1]);

	bytes = read(status_pipe[0], &restore_status, sizeof(restore_status));
	close(status_pipe[0]);

	if (sizeof(restore_status) == bytes) {
		/* If the criu process was killed or exited nonzero, wait()
		 * for the handler, since the restore process died. Otherwise,
		 * we don't need to wait, since the child becomes the monitor
		 * process.
		 */
		if (WIFEXITED(restore_status) && WEXITSTATUS(restore_status) == 0)
			return true;
	} else {
		ERROR("reading status from pipe failed");
	}

	if (wait_for_pid(child))
		ERROR("restore process died");
	return false;
}