]>
Commit | Line | Data |
---|---|---|
e29fe1dd TA |
1 | /* |
2 | * lxc: linux Container library | |
3 | * | |
4 | * Copyright © 2014-2015 Canonical Ltd. | |
5 | * | |
6 | * Authors: | |
7 | * Tycho Andersen <tycho.andersen@canonical.com> | |
8 | * | |
9 | * This library is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License as published by the Free Software Foundation; either | |
12 | * version 2.1 of the License, or (at your option) any later version. | |
13 | * | |
14 | * This library is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | * Lesser General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU Lesser General Public | |
20 | * License along with this library; if not, write to the Free Software | |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | */ | |
23 | #define _GNU_SOURCE | |
24 | #include <assert.h> | |
25 | #include <linux/limits.h> | |
26 | #include <sched.h> | |
27 | #include <stdio.h> | |
28 | #include <stdlib.h> | |
29 | #include <string.h> | |
30 | #include <sys/mount.h> | |
31 | #include <sys/types.h> | |
32 | #include <sys/wait.h> | |
33 | #include <unistd.h> | |
34 | ||
35 | #include "config.h" | |
36 | ||
4ec31c52 | 37 | #include "bdev/bdev.h" |
e29fe1dd TA |
38 | #include "cgroup.h" |
39 | #include "conf.h" | |
dc259399 | 40 | #include "commands.h" |
e29fe1dd TA |
41 | #include "criu.h" |
42 | #include "log.h" | |
43 | #include "lxc.h" | |
44 | #include "lxclock.h" | |
45 | #include "network.h" | |
46 | #include "utils.h" | |
47 | ||
73d46752 TA |
48 | #define CRIU_VERSION "2.0" |
49 | ||
50 | #define CRIU_GITID_VERSION "2.0" | |
51 | #define CRIU_GITID_PATCHLEVEL 0 | |
52 | ||
e29fe1dd TA |
53 | lxc_log_define(lxc_criu, lxc); |
54 | ||
73d46752 TA |
/* Everything exec_criu() needs in order to build one criu command line. */
struct criu_opts {
	/* The type of criu invocation, one of "dump", "pre-dump" or "restore" */
	char *action;

	/* the user-provided migrate options relevant to this action */
	struct migrate_opts *user;

	/* The container to dump or restore */
	struct lxc_container *c;

	/* dump: the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]".
	 * An empty string means there is no console tty to export.
	 */
	char tty_id[32];

	/* restore: the file to write the init process' pid into */
	char *pidfile;

	/* restore: the value handed to criu as --cgroup-root */
	const char *cgroup_path;

	/* restore: the fd passed to criu via --inherit-fd for the console;
	 * negative when no console is configured on this host.
	 */
	int console_fd;

	/* The path that is bind mounted from /dev/console, if any. We don't
	 * want to use `--ext-mount-map auto`'s result here because the pts
	 * device may have a different path (e.g. if the pty number is
	 * different) on the target host. NULL if lxc.console = "none".
	 */
	char *console_name;
};
79 | ||
4b54788e TA |
/*
 * Read the first line of $directory/tty.info into output.
 *
 * directory: the checkpoint image directory.
 * output:    destination buffer; filled by fgets(), so at most len - 1
 *            characters plus a terminating NUL are stored.
 * len:       size of output in bytes.
 *
 * Returns 0 on success. A missing tty.info also returns 0 and leaves output
 * untouched, so callers should pre-set output[0] = 0 to detect that case.
 * Returns -1 on any other failure.
 */
static int load_tty_major_minor(char *directory, char *output, int len)
{
	FILE *f;
	char path[PATH_MAX];
	int ret;

	ret = snprintf(path, sizeof(path), "%s/tty.info", directory);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("snprintf'd too many characters: %d", ret);
		return -1;
	}

	f = fopen(path, "r");
	if (!f) {
		/* This means we're coming from a liblxc which didn't export
		 * the tty info. In this case they had to have lxc.console =
		 * none, so there's no problem restoring.
		 */
		if (errno == ENOENT)
			return 0;

		SYSERROR("couldn't open %s", path);
		return -1;
	}

	if (!fgets(output, len, f)) {
		/* Log before fclose(): fclose() may overwrite errno, which
		 * SYSERROR is presumably going to report.
		 */
		SYSERROR("couldn't read %s", path);
		fclose(f);
		return -1;
	}

	fclose(f);
	return 0;
}
114 | ||
9451eeff | 115 | static void exec_criu(struct criu_opts *opts) |
e29fe1dd TA |
116 | { |
117 | char **argv, log[PATH_MAX]; | |
4b54788e | 118 | int static_args = 24, argc = 0, i, ret; |
e29fe1dd TA |
119 | int netnr = 0; |
120 | struct lxc_list *it; | |
121 | ||
a17fa3c0 NE |
122 | char buf[4096], tty_info[32]; |
123 | size_t pos; | |
e9195050 TA |
124 | /* If we are currently in a cgroup /foo/bar, and the container is in a |
125 | * cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the | |
126 | * container has an open fd that points to one of the cgroup files | |
127 | * (systemd always opens its "root" cgroup). So, let's escape to the | |
128 | * /actual/ root cgroup so that lxcfs thinks criu has enough rights to | |
129 | * see all cgroups. | |
130 | */ | |
7103fe6f | 131 | if (!cgroup_escape()) { |
e9195050 TA |
132 | ERROR("failed to escape cgroups"); |
133 | return; | |
134 | } | |
135 | ||
e29fe1dd TA |
136 | /* The command line always looks like: |
137 | * criu $(action) --tcp-established --file-locks --link-remap --force-irmap \ | |
138 | * --manage-cgroups action-script foo.sh -D $(directory) \ | |
139 | * -o $(directory)/$(action).log --ext-mount-map auto | |
140 | * --enable-external-sharing --enable-external-masters | |
4b54788e | 141 | * --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n |
e29fe1dd TA |
142 | * +1 for final NULL */ |
143 | ||
aef3d51e | 144 | if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) { |
dc259399 TA |
145 | /* -t pid --freeze-cgroup /lxc/ct */ |
146 | static_args += 4; | |
e29fe1dd | 147 | |
aef3d51e | 148 | /* --prev-images-dir <path-to-directory-A-relative-to-B> */ |
b2c3710f | 149 | if (opts->user->predump_dir) |
aef3d51e TA |
150 | static_args += 2; |
151 | ||
74eb576c | 152 | /* --page-server --address <address> --port <port> */ |
b2c3710f | 153 | if (opts->user->pageserver_address && opts->user->pageserver_port) |
74eb576c NE |
154 | static_args += 5; |
155 | ||
aef3d51e | 156 | /* --leave-running (only for final dump) */ |
b2c3710f | 157 | if (strcmp(opts->action, "dump") == 0 && !opts->user->stop) |
e29fe1dd | 158 | static_args++; |
4b54788e TA |
159 | |
160 | /* --external tty[88,4] */ | |
161 | if (opts->tty_id[0]) | |
162 | static_args += 2; | |
e29fe1dd TA |
163 | } else if (strcmp(opts->action, "restore") == 0) { |
164 | /* --root $(lxc_mount_point) --restore-detached | |
13389b29 TA |
165 | * --restore-sibling --pidfile $foo --cgroup-root $foo |
166 | * --lsm-profile apparmor:whatever | |
167 | */ | |
168 | static_args += 10; | |
4b54788e TA |
169 | |
170 | tty_info[0] = 0; | |
b2c3710f | 171 | if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info))) |
4b54788e TA |
172 | return; |
173 | ||
174 | /* --inherit-fd fd[%d]:tty[%s] */ | |
175 | if (tty_info[0]) | |
176 | static_args += 2; | |
e29fe1dd TA |
177 | } else { |
178 | return; | |
179 | } | |
180 | ||
b2c3710f | 181 | if (opts->user->verbose) |
e29fe1dd TA |
182 | static_args++; |
183 | ||
b2c3710f | 184 | ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action); |
e29fe1dd TA |
185 | if (ret < 0 || ret >= PATH_MAX) { |
186 | ERROR("logfile name too long\n"); | |
187 | return; | |
188 | } | |
189 | ||
190 | argv = malloc(static_args * sizeof(*argv)); | |
191 | if (!argv) | |
192 | return; | |
193 | ||
194 | memset(argv, 0, static_args * sizeof(*argv)); | |
195 | ||
196 | #define DECLARE_ARG(arg) \ | |
197 | do { \ | |
198 | if (arg == NULL) { \ | |
199 | ERROR("Got NULL argument for criu"); \ | |
200 | goto err; \ | |
201 | } \ | |
202 | argv[argc++] = strdup(arg); \ | |
203 | if (!argv[argc-1]) \ | |
204 | goto err; \ | |
205 | } while (0) | |
206 | ||
207 | argv[argc++] = on_path("criu", NULL); | |
208 | if (!argv[argc-1]) { | |
209 | ERROR("Couldn't find criu binary\n"); | |
210 | goto err; | |
211 | } | |
212 | ||
213 | DECLARE_ARG(opts->action); | |
214 | DECLARE_ARG("--tcp-established"); | |
215 | DECLARE_ARG("--file-locks"); | |
216 | DECLARE_ARG("--link-remap"); | |
217 | DECLARE_ARG("--force-irmap"); | |
218 | DECLARE_ARG("--manage-cgroups"); | |
219 | DECLARE_ARG("--ext-mount-map"); | |
220 | DECLARE_ARG("auto"); | |
221 | DECLARE_ARG("--enable-external-sharing"); | |
222 | DECLARE_ARG("--enable-external-masters"); | |
dd62857a TA |
223 | DECLARE_ARG("--enable-fs"); |
224 | DECLARE_ARG("hugetlbfs"); | |
5b454329 TA |
225 | DECLARE_ARG("--enable-fs"); |
226 | DECLARE_ARG("tracefs"); | |
e29fe1dd | 227 | DECLARE_ARG("-D"); |
b2c3710f | 228 | DECLARE_ARG(opts->user->directory); |
e29fe1dd TA |
229 | DECLARE_ARG("-o"); |
230 | DECLARE_ARG(log); | |
231 | ||
b2c3710f | 232 | if (opts->user->verbose) |
e29fe1dd TA |
233 | DECLARE_ARG("-vvvvvv"); |
234 | ||
aef3d51e | 235 | if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) { |
dc259399 | 236 | char pid[32], *freezer_relative; |
e29fe1dd TA |
237 | |
238 | if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0) | |
239 | goto err; | |
240 | ||
241 | DECLARE_ARG("-t"); | |
242 | DECLARE_ARG(pid); | |
dc259399 TA |
243 | |
244 | freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name, | |
245 | opts->c->config_path, | |
246 | "freezer"); | |
247 | if (!freezer_relative) { | |
248 | ERROR("failed getting freezer path"); | |
249 | goto err; | |
250 | } | |
251 | ||
252 | ret = snprintf(log, sizeof(log), "/sys/fs/cgroup/freezer/%s", freezer_relative); | |
253 | if (ret < 0 || ret >= sizeof(log)) | |
254 | goto err; | |
255 | ||
256 | DECLARE_ARG("--freeze-cgroup"); | |
257 | DECLARE_ARG(log); | |
258 | ||
4b54788e | 259 | if (opts->tty_id[0]) { |
36d2096c TA |
260 | DECLARE_ARG("--ext-mount-map"); |
261 | DECLARE_ARG("/dev/console:console"); | |
262 | ||
4b54788e TA |
263 | DECLARE_ARG("--external"); |
264 | DECLARE_ARG(opts->tty_id); | |
265 | } | |
266 | ||
b2c3710f | 267 | if (opts->user->predump_dir) { |
aef3d51e | 268 | DECLARE_ARG("--prev-images-dir"); |
b2c3710f | 269 | DECLARE_ARG(opts->user->predump_dir); |
74eb576c | 270 | } |
4c0c0319 | 271 | |
b2c3710f | 272 | if (opts->user->pageserver_address && opts->user->pageserver_port) { |
74eb576c NE |
273 | DECLARE_ARG("--page-server"); |
274 | DECLARE_ARG("--address"); | |
b2c3710f | 275 | DECLARE_ARG(opts->user->pageserver_address); |
74eb576c | 276 | DECLARE_ARG("--port"); |
b2c3710f | 277 | DECLARE_ARG(opts->user->pageserver_port); |
74eb576c | 278 | } |
aef3d51e TA |
279 | |
280 | /* only for final dump */ | |
b2c3710f | 281 | if (strcmp(opts->action, "dump") == 0 && !opts->user->stop) |
e29fe1dd TA |
282 | DECLARE_ARG("--leave-running"); |
283 | } else if (strcmp(opts->action, "restore") == 0) { | |
284 | void *m; | |
285 | int additional; | |
13389b29 | 286 | struct lxc_conf *lxc_conf = opts->c->lxc_conf; |
e29fe1dd TA |
287 | |
288 | DECLARE_ARG("--root"); | |
289 | DECLARE_ARG(opts->c->lxc_conf->rootfs.mount); | |
290 | DECLARE_ARG("--restore-detached"); | |
291 | DECLARE_ARG("--restore-sibling"); | |
292 | DECLARE_ARG("--pidfile"); | |
293 | DECLARE_ARG(opts->pidfile); | |
294 | DECLARE_ARG("--cgroup-root"); | |
295 | DECLARE_ARG(opts->cgroup_path); | |
296 | ||
4b54788e | 297 | if (tty_info[0]) { |
97e4f1a9 TA |
298 | if (opts->console_fd < 0) { |
299 | ERROR("lxc.console configured on source host but not target"); | |
300 | goto err; | |
301 | } | |
302 | ||
4b54788e TA |
303 | ret = snprintf(buf, sizeof(buf), "fd[%d]:%s", opts->console_fd, tty_info); |
304 | if (ret < 0 || ret >= sizeof(buf)) | |
305 | goto err; | |
306 | ||
307 | DECLARE_ARG("--inherit-fd"); | |
308 | DECLARE_ARG(buf); | |
309 | } | |
310 | if (opts->console_name) { | |
311 | if (snprintf(buf, sizeof(buf), "console:%s", opts->console_name) < 0) { | |
312 | SYSERROR("sprintf'd too many bytes"); | |
313 | } | |
314 | DECLARE_ARG("--ext-mount-map"); | |
315 | DECLARE_ARG(buf); | |
316 | } | |
317 | ||
13389b29 TA |
318 | if (lxc_conf->lsm_aa_profile || lxc_conf->lsm_se_context) { |
319 | ||
320 | if (lxc_conf->lsm_aa_profile) | |
321 | ret = snprintf(buf, sizeof(buf), "apparmor:%s", lxc_conf->lsm_aa_profile); | |
322 | else | |
323 | ret = snprintf(buf, sizeof(buf), "selinux:%s", lxc_conf->lsm_se_context); | |
324 | ||
325 | if (ret < 0 || ret >= sizeof(buf)) | |
326 | goto err; | |
327 | ||
328 | DECLARE_ARG("--lsm-profile"); | |
329 | DECLARE_ARG(buf); | |
330 | } | |
331 | ||
e29fe1dd TA |
332 | additional = lxc_list_len(&opts->c->lxc_conf->network) * 2; |
333 | ||
fa071249 TA |
334 | m = realloc(argv, (argc + additional + 1) * sizeof(*argv)); |
335 | if (!m) | |
336 | goto err; | |
e29fe1dd TA |
337 | argv = m; |
338 | ||
339 | lxc_list_for_each(it, &opts->c->lxc_conf->network) { | |
340 | char eth[128], *veth; | |
341 | struct lxc_netdev *n = it->elem; | |
342 | ||
65b20221 TA |
343 | if (n->type != LXC_NET_VETH) |
344 | continue; | |
345 | ||
e29fe1dd TA |
346 | if (n->name) { |
347 | if (strlen(n->name) >= sizeof(eth)) | |
348 | goto err; | |
349 | strncpy(eth, n->name, sizeof(eth)); | |
350 | } else | |
351 | sprintf(eth, "eth%d", netnr); | |
352 | ||
353 | veth = n->priv.veth_attr.pair; | |
354 | ||
c1fd648d TA |
355 | if (n->link) |
356 | ret = snprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, n->link); | |
357 | else | |
358 | ret = snprintf(buf, sizeof(buf), "%s=%s", eth, veth); | |
e29fe1dd TA |
359 | if (ret < 0 || ret >= sizeof(buf)) |
360 | goto err; | |
361 | ||
362 | DECLARE_ARG("--veth-pair"); | |
363 | DECLARE_ARG(buf); | |
364 | } | |
365 | ||
366 | } | |
367 | ||
368 | argv[argc] = NULL; | |
369 | ||
cf4b07a5 | 370 | buf[0] = 0; |
a17fa3c0 | 371 | pos = 0; |
72a30576 | 372 | |
cf4b07a5 | 373 | for (i = 0; argv[i]; i++) { |
72a30576 NE |
374 | ret = snprintf(buf + pos, sizeof(buf) - pos, "%s ", argv[i]); |
375 | if (ret < 0 || ret >= sizeof(buf) - pos) | |
376 | goto err; | |
377 | else | |
378 | pos += ret; | |
cf4b07a5 TA |
379 | } |
380 | ||
381 | INFO("execing: %s", buf); | |
382 | ||
e29fe1dd TA |
383 | #undef DECLARE_ARG |
384 | execv(argv[0], argv); | |
385 | err: | |
e29fe1dd TA |
386 | for (i = 0; argv[i]; i++) |
387 | free(argv[i]); | |
388 | free(argv); | |
389 | } | |
390 | ||
8ba5ced7 TA |
391 | /* |
392 | * Check to see if the criu version is recent enough for all the features we | |
393 | * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and | |
394 | * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r | |
395 | * things potentially before a version is released with a particular feature. | |
396 | * | |
397 | * The intent is that when criu development slows down, we can drop this, but | |
398 | * for now we shouldn't attempt to c/r with versions that we know won't work. | |
399 | */ | |
400 | static bool criu_version_ok() | |
401 | { | |
402 | int pipes[2]; | |
403 | pid_t pid; | |
404 | ||
405 | if (pipe(pipes) < 0) { | |
406 | SYSERROR("pipe() failed"); | |
407 | return false; | |
408 | } | |
409 | ||
410 | pid = fork(); | |
411 | if (pid < 0) { | |
412 | SYSERROR("fork() failed"); | |
413 | return false; | |
414 | } | |
415 | ||
416 | if (pid == 0) { | |
417 | char *args[] = { "criu", "--version", NULL }; | |
755fa453 | 418 | char *path; |
8ba5ced7 TA |
419 | close(pipes[0]); |
420 | ||
421 | close(STDERR_FILENO); | |
422 | if (dup2(pipes[1], STDOUT_FILENO) < 0) | |
423 | exit(1); | |
424 | ||
755fa453 | 425 | path = on_path("criu", NULL); |
d9b32b09 SH |
426 | if (!path) |
427 | exit(1); | |
428 | ||
755fa453 | 429 | execv(path, args); |
8ba5ced7 TA |
430 | exit(1); |
431 | } else { | |
432 | FILE *f; | |
433 | char version[1024]; | |
434 | int patch; | |
435 | ||
436 | close(pipes[1]); | |
437 | if (wait_for_pid(pid) < 0) { | |
438 | close(pipes[0]); | |
4eae4051 | 439 | SYSERROR("execing criu failed, is it installed?"); |
8ba5ced7 TA |
440 | return false; |
441 | } | |
442 | ||
443 | f = fdopen(pipes[0], "r"); | |
444 | if (!f) { | |
445 | close(pipes[0]); | |
446 | return false; | |
447 | } | |
448 | ||
a90277df | 449 | if (fscanf(f, "Version: %1023[^\n]s", version) != 1) |
8ba5ced7 TA |
450 | goto version_error; |
451 | ||
452 | if (fgetc(f) != '\n') | |
453 | goto version_error; | |
454 | ||
455 | if (strcmp(version, CRIU_VERSION) >= 0) | |
456 | goto version_match; | |
457 | ||
a90277df | 458 | if (fscanf(f, "GitID: v%1023[^-]s", version) != 1) |
8ba5ced7 TA |
459 | goto version_error; |
460 | ||
461 | if (fgetc(f) != '-') | |
462 | goto version_error; | |
463 | ||
464 | if (fscanf(f, "%d", &patch) != 1) | |
465 | goto version_error; | |
466 | ||
467 | if (strcmp(version, CRIU_GITID_VERSION) < 0) | |
468 | goto version_error; | |
469 | ||
470 | if (patch < CRIU_GITID_PATCHLEVEL) | |
471 | goto version_error; | |
472 | ||
473 | version_match: | |
3158ab5b | 474 | fclose(f); |
8ba5ced7 TA |
475 | return true; |
476 | ||
477 | version_error: | |
3158ab5b | 478 | fclose(f); |
8ba5ced7 TA |
479 | ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore\n"); |
480 | return false; | |
481 | } | |
482 | } | |
483 | ||
e29fe1dd TA |
484 | /* Check and make sure the container has a configuration that we know CRIU can |
485 | * dump. */ | |
73d46752 | 486 | static bool criu_ok(struct lxc_container *c) |
e29fe1dd TA |
487 | { |
488 | struct lxc_list *it; | |
e29fe1dd | 489 | |
8ba5ced7 TA |
490 | if (!criu_version_ok()) |
491 | return false; | |
492 | ||
e29fe1dd TA |
493 | if (geteuid()) { |
494 | ERROR("Must be root to checkpoint\n"); | |
495 | return false; | |
496 | } | |
497 | ||
498 | /* We only know how to restore containers with veth networks. */ | |
499 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
500 | struct lxc_netdev *n = it->elem; | |
65b20221 TA |
501 | switch(n->type) { |
502 | case LXC_NET_VETH: | |
503 | case LXC_NET_NONE: | |
504 | case LXC_NET_EMPTY: | |
505 | break; | |
506 | default: | |
e29fe1dd TA |
507 | ERROR("Found network that is not VETH or NONE\n"); |
508 | return false; | |
509 | } | |
510 | } | |
511 | ||
e29fe1dd TA |
512 | return true; |
513 | } | |
514 | ||
e29fe1dd TA |
515 | static bool restore_net_info(struct lxc_container *c) |
516 | { | |
517 | struct lxc_list *it; | |
518 | bool has_error = true; | |
519 | ||
520 | if (container_mem_lock(c)) | |
521 | return false; | |
522 | ||
523 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
524 | struct lxc_netdev *netdev = it->elem; | |
525 | char template[IFNAMSIZ]; | |
65b20221 TA |
526 | |
527 | if (netdev->type != LXC_NET_VETH) | |
528 | continue; | |
529 | ||
e29fe1dd TA |
530 | snprintf(template, sizeof(template), "vethXXXXXX"); |
531 | ||
532 | if (!netdev->priv.veth_attr.pair) | |
533 | netdev->priv.veth_attr.pair = lxc_mkifname(template); | |
534 | ||
535 | if (!netdev->priv.veth_attr.pair) | |
536 | goto out_unlock; | |
537 | } | |
538 | ||
539 | has_error = false; | |
540 | ||
541 | out_unlock: | |
542 | container_mem_unlock(c); | |
543 | return !has_error; | |
544 | } | |
545 | ||
aef3d51e TA |
546 | // do_restore never returns, the calling process is used as the |
547 | // monitor process. do_restore calls exit() if it fails. | |
b2c3710f | 548 | void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts) |
e29fe1dd TA |
549 | { |
550 | pid_t pid; | |
551 | char pidfile[L_tmpnam]; | |
552 | struct lxc_handler *handler; | |
3d9a5c85 | 553 | int status, pipes[2] = {-1, -1}; |
e29fe1dd TA |
554 | |
555 | if (!tmpnam(pidfile)) | |
556 | goto out; | |
557 | ||
558 | handler = lxc_init(c->name, c->lxc_conf, c->config_path); | |
559 | if (!handler) | |
560 | goto out; | |
561 | ||
562 | if (!cgroup_init(handler)) { | |
563 | ERROR("failed initing cgroups"); | |
564 | goto out_fini_handler; | |
565 | } | |
566 | ||
567 | if (!cgroup_create(handler)) { | |
568 | ERROR("failed creating groups"); | |
569 | goto out_fini_handler; | |
570 | } | |
571 | ||
572 | if (!restore_net_info(c)) { | |
573 | ERROR("failed restoring network info"); | |
574 | goto out_fini_handler; | |
575 | } | |
576 | ||
577 | resolve_clone_flags(handler); | |
578 | ||
3d9a5c85 TA |
579 | if (pipe(pipes) < 0) { |
580 | SYSERROR("pipe() failed"); | |
581 | goto out_fini_handler; | |
582 | } | |
583 | ||
e29fe1dd TA |
584 | pid = fork(); |
585 | if (pid < 0) | |
586 | goto out_fini_handler; | |
587 | ||
588 | if (pid == 0) { | |
589 | struct criu_opts os; | |
590 | struct lxc_rootfs *rootfs; | |
4b54788e | 591 | int flags; |
e29fe1dd | 592 | |
3d9a5c85 TA |
593 | close(status_pipe); |
594 | status_pipe = -1; | |
595 | ||
596 | close(pipes[0]); | |
597 | pipes[0] = -1; | |
598 | if (dup2(pipes[1], STDERR_FILENO) < 0) { | |
599 | SYSERROR("dup2 failed"); | |
600 | goto out_fini_handler; | |
601 | } | |
602 | ||
603 | if (dup2(pipes[1], STDOUT_FILENO) < 0) { | |
604 | SYSERROR("dup2 failed"); | |
605 | goto out_fini_handler; | |
606 | } | |
e29fe1dd TA |
607 | |
608 | if (unshare(CLONE_NEWNS)) | |
609 | goto out_fini_handler; | |
610 | ||
611 | /* CRIU needs the lxc root bind mounted so that it is the root of some | |
612 | * mount. */ | |
613 | rootfs = &c->lxc_conf->rootfs; | |
614 | ||
615 | if (rootfs_is_blockdev(c->lxc_conf)) { | |
616 | if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0) | |
617 | goto out_fini_handler; | |
618 | } else { | |
619 | if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST) | |
620 | goto out_fini_handler; | |
621 | ||
622 | if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) { | |
623 | SYSERROR("remount / to private failed"); | |
624 | goto out_fini_handler; | |
625 | } | |
626 | ||
627 | if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) { | |
628 | rmdir(rootfs->mount); | |
629 | goto out_fini_handler; | |
630 | } | |
631 | } | |
632 | ||
633 | os.action = "restore"; | |
b2c3710f | 634 | os.user = opts; |
e29fe1dd TA |
635 | os.c = c; |
636 | os.pidfile = pidfile; | |
e29fe1dd | 637 | os.cgroup_path = cgroup_canonical_path(handler); |
4b54788e TA |
638 | os.console_fd = c->lxc_conf->console.slave; |
639 | ||
97e4f1a9 TA |
640 | if (os.console_fd >= 0) { |
641 | /* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu | |
642 | * via --inherit-fd, so we don't want it to close. | |
643 | */ | |
644 | flags = fcntl(os.console_fd, F_GETFD); | |
645 | if (flags < 0) { | |
646 | SYSERROR("F_GETFD failed: %d", os.console_fd); | |
647 | goto out_fini_handler; | |
648 | } | |
4b54788e | 649 | |
97e4f1a9 | 650 | flags &= ~FD_CLOEXEC; |
4b54788e | 651 | |
97e4f1a9 TA |
652 | if (fcntl(os.console_fd, F_SETFD, flags) < 0) { |
653 | SYSERROR("F_SETFD failed"); | |
654 | goto out_fini_handler; | |
655 | } | |
4b54788e TA |
656 | } |
657 | os.console_name = c->lxc_conf->console.name; | |
e29fe1dd TA |
658 | |
659 | /* exec_criu() returning is an error */ | |
7103fe6f | 660 | exec_criu(&os); |
e29fe1dd TA |
661 | umount(rootfs->mount); |
662 | rmdir(rootfs->mount); | |
663 | goto out_fini_handler; | |
664 | } else { | |
665 | int ret; | |
666 | char title[2048]; | |
667 | ||
3d9a5c85 TA |
668 | close(pipes[1]); |
669 | pipes[1] = -1; | |
670 | ||
e29fe1dd TA |
671 | pid_t w = waitpid(pid, &status, 0); |
672 | if (w == -1) { | |
673 | SYSERROR("waitpid"); | |
674 | goto out_fini_handler; | |
675 | } | |
676 | ||
3d9a5c85 TA |
677 | ret = write(status_pipe, &status, sizeof(status)); |
678 | close(status_pipe); | |
679 | status_pipe = -1; | |
e29fe1dd TA |
680 | |
681 | if (sizeof(status) != ret) { | |
682 | SYSERROR("failed to write all of status"); | |
683 | goto out_fini_handler; | |
684 | } | |
685 | ||
686 | if (WIFEXITED(status)) { | |
687 | if (WEXITSTATUS(status)) { | |
3d9a5c85 TA |
688 | char buf[4096]; |
689 | int n; | |
690 | ||
691 | n = read(pipes[0], buf, sizeof(buf)); | |
692 | if (n < 0) { | |
693 | SYSERROR("failed reading from criu stderr"); | |
694 | goto out_fini_handler; | |
695 | } | |
696 | ||
697 | buf[n] = 0; | |
698 | ||
699 | ERROR("criu process exited %d, output:\n%s\n", WEXITSTATUS(status), buf); | |
e29fe1dd TA |
700 | goto out_fini_handler; |
701 | } else { | |
702 | int ret; | |
703 | FILE *f = fopen(pidfile, "r"); | |
704 | if (!f) { | |
705 | SYSERROR("couldn't read restore's init pidfile %s\n", pidfile); | |
706 | goto out_fini_handler; | |
707 | } | |
708 | ||
709 | ret = fscanf(f, "%d", (int*) &handler->pid); | |
710 | fclose(f); | |
59c2d406 TA |
711 | if (unlink(pidfile) < 0 && errno != ENOENT) |
712 | SYSERROR("unlinking pidfile failed"); | |
713 | ||
e29fe1dd TA |
714 | if (ret != 1) { |
715 | ERROR("reading restore pid failed"); | |
716 | goto out_fini_handler; | |
717 | } | |
718 | ||
f8a41688 TA |
719 | if (lxc_set_state(c->name, handler, RUNNING)) { |
720 | ERROR("error setting running state after restore"); | |
e29fe1dd | 721 | goto out_fini_handler; |
f8a41688 | 722 | } |
e29fe1dd TA |
723 | } |
724 | } else { | |
725 | ERROR("CRIU was killed with signal %d\n", WTERMSIG(status)); | |
726 | goto out_fini_handler; | |
727 | } | |
728 | ||
3d9a5c85 TA |
729 | close(pipes[0]); |
730 | ||
e29fe1dd TA |
731 | /* |
732 | * See comment in lxcapi_start; we don't care if these | |
733 | * fail because it's just a beauty thing. We just | |
734 | * assign the return here to silence potential. | |
735 | */ | |
736 | ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name); | |
737 | ret = setproctitle(title); | |
738 | ||
739 | ret = lxc_poll(c->name, handler); | |
740 | if (ret) | |
741 | lxc_abort(c->name, handler); | |
742 | lxc_fini(c->name, handler); | |
743 | exit(ret); | |
744 | } | |
745 | ||
746 | out_fini_handler: | |
3d9a5c85 TA |
747 | if (pipes[0] >= 0) |
748 | close(pipes[0]); | |
749 | if (pipes[1] >= 0) | |
750 | close(pipes[1]); | |
751 | ||
e29fe1dd | 752 | lxc_fini(c->name, handler); |
59c2d406 TA |
753 | if (unlink(pidfile) < 0 && errno != ENOENT) |
754 | SYSERROR("unlinking pidfile failed"); | |
e29fe1dd TA |
755 | |
756 | out: | |
3d9a5c85 | 757 | if (status_pipe >= 0) { |
e29fe1dd | 758 | status = 1; |
3d9a5c85 | 759 | if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) { |
e29fe1dd TA |
760 | SYSERROR("writing status failed"); |
761 | } | |
3d9a5c85 | 762 | close(status_pipe); |
e29fe1dd TA |
763 | } |
764 | ||
765 | exit(1); | |
766 | } | |
aef3d51e | 767 | |
4b54788e TA |
768 | static int save_tty_major_minor(char *directory, struct lxc_container *c, char *tty_id, int len) |
769 | { | |
770 | FILE *f; | |
771 | char path[PATH_MAX]; | |
772 | int ret; | |
773 | struct stat sb; | |
774 | ||
775 | if (c->lxc_conf->console.path && !strcmp(c->lxc_conf->console.path, "none")) { | |
776 | tty_id[0] = 0; | |
777 | return 0; | |
778 | } | |
779 | ||
780 | ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/console", c->init_pid(c)); | |
781 | if (ret < 0 || ret >= sizeof(path)) { | |
782 | ERROR("snprintf'd too many chacters: %d", ret); | |
783 | return -1; | |
784 | } | |
785 | ||
786 | ret = stat(path, &sb); | |
787 | if (ret < 0) { | |
788 | SYSERROR("stat of %s failed", path); | |
789 | return -1; | |
790 | } | |
791 | ||
792 | ret = snprintf(path, sizeof(path), "%s/tty.info", directory); | |
793 | if (ret < 0 || ret >= sizeof(path)) { | |
794 | ERROR("snprintf'd too many characters: %d", ret); | |
795 | return -1; | |
796 | } | |
797 | ||
f03280a7 TA |
798 | ret = snprintf(tty_id, len, "tty[%llx:%llx]", |
799 | (long long unsigned) sb.st_rdev, | |
800 | (long long unsigned) sb.st_dev); | |
4b54788e TA |
801 | if (ret < 0 || ret >= sizeof(path)) { |
802 | ERROR("snprintf'd too many characters: %d", ret); | |
803 | return -1; | |
804 | } | |
805 | ||
806 | f = fopen(path, "w"); | |
807 | if (!f) { | |
808 | SYSERROR("failed to open %s", path); | |
809 | return -1; | |
810 | } | |
811 | ||
812 | ret = fprintf(f, "%s", tty_id); | |
813 | fclose(f); | |
814 | if (ret < 0) | |
815 | SYSERROR("failed to write to %s", path); | |
816 | return ret; | |
817 | } | |
818 | ||
aef3d51e | 819 | /* do one of either predump or a regular dump */ |
b2c3710f | 820 | static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts) |
aef3d51e TA |
821 | { |
822 | pid_t pid; | |
823 | ||
824 | if (!criu_ok(c)) | |
825 | return false; | |
826 | ||
b2c3710f | 827 | if (mkdir_p(opts->directory, 0700) < 0) |
aef3d51e TA |
828 | return false; |
829 | ||
830 | pid = fork(); | |
831 | if (pid < 0) { | |
832 | SYSERROR("fork failed"); | |
833 | return false; | |
834 | } | |
835 | ||
836 | if (pid == 0) { | |
837 | struct criu_opts os; | |
838 | ||
839 | os.action = mode; | |
b2c3710f | 840 | os.user = opts; |
aef3d51e | 841 | os.c = c; |
4b54788e | 842 | os.console_name = c->lxc_conf->console.path; |
74eb576c | 843 | |
b2c3710f | 844 | if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0) |
4b54788e | 845 | exit(1); |
aef3d51e TA |
846 | |
847 | /* exec_criu() returning is an error */ | |
7103fe6f | 848 | exec_criu(&os); |
aef3d51e TA |
849 | exit(1); |
850 | } else { | |
851 | int status; | |
852 | pid_t w = waitpid(pid, &status, 0); | |
853 | if (w == -1) { | |
854 | SYSERROR("waitpid"); | |
855 | return false; | |
856 | } | |
857 | ||
858 | if (WIFEXITED(status)) { | |
859 | if (WEXITSTATUS(status)) { | |
860 | ERROR("dump failed with %d\n", WEXITSTATUS(status)); | |
861 | return false; | |
862 | } | |
863 | ||
864 | return true; | |
865 | } else if (WIFSIGNALED(status)) { | |
866 | ERROR("dump signaled with %d\n", WTERMSIG(status)); | |
867 | return false; | |
868 | } else { | |
869 | ERROR("unknown dump exit %d\n", status); | |
870 | return false; | |
871 | } | |
872 | } | |
873 | } | |
874 | ||
/* Run a criu "pre-dump" of container c; returns what do_dump() returns. */
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
{
	bool ok;

	ok = do_dump(c, "pre-dump", opts);
	return ok;
}
879 | ||
b2c3710f | 880 | bool __criu_dump(struct lxc_container *c, struct migrate_opts *opts) |
aef3d51e TA |
881 | { |
882 | char path[PATH_MAX]; | |
883 | int ret; | |
884 | ||
b2c3710f | 885 | ret = snprintf(path, sizeof(path), "%s/inventory.img", opts->directory); |
aef3d51e TA |
886 | if (ret < 0 || ret >= sizeof(path)) |
887 | return false; | |
888 | ||
889 | if (access(path, F_OK) == 0) { | |
890 | ERROR("please use a fresh directory for the dump directory\n"); | |
891 | return false; | |
892 | } | |
893 | ||
b2c3710f | 894 | return do_dump(c, "dump", opts); |
aef3d51e TA |
895 | } |
896 | ||
/*
 * Restore container c from the images described by opts.
 *
 * A child is forked to run do_restore(), which never returns. The child
 * reports criu's wait status over a pipe. If that status is a clean exit
 * with code 0, this returns true without waiting for the child. Otherwise
 * the child is reaped and false is returned.
 */
bool __criu_restore(struct lxc_container *c, struct migrate_opts *opts)
{
	pid_t child;
	int status, nread;
	int fds[2];
	bool restored = false;

	if (!criu_ok(c))
		return false;

	if (geteuid() != 0) {
		ERROR("Must be root to restore\n");
		return false;
	}

	if (pipe(fds) != 0) {
		ERROR("failed to create pipe");
		return false;
	}

	child = fork();
	if (child < 0) {
		close(fds[0]);
		close(fds[1]);
		return false;
	}

	if (child == 0) {
		close(fds[0]);
		// this never returns
		do_restore(c, fds[1], opts);
	}

	close(fds[1]);

	nread = read(fds[0], &status, sizeof(status));
	close(fds[0]);

	// If the criu process was killed or exited nonzero, wait() for the
	// handler, since the restore process died. Otherwise, we don't need to
	// wait, since the child becomes the monitor process.
	if ((size_t)nread != sizeof(status))
		ERROR("reading status from pipe failed");
	else if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		restored = true;

	if (restored)
		return true;

	if (wait_for_pid(child))
		ERROR("restore process died");
	return false;
}