]>
Commit | Line | Data |
---|---|---|
e29fe1dd TA |
1 | /* |
2 | * lxc: linux Container library | |
3 | * | |
4 | * Copyright © 2014-2015 Canonical Ltd. | |
5 | * | |
6 | * Authors: | |
7 | * Tycho Andersen <tycho.andersen@canonical.com> | |
8 | * | |
9 | * This library is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License as published by the Free Software Foundation; either | |
12 | * version 2.1 of the License, or (at your option) any later version. | |
13 | * | |
14 | * This library is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | * Lesser General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU Lesser General Public | |
20 | * License along with this library; if not, write to the Free Software | |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | */ | |
23 | #define _GNU_SOURCE | |
24 | #include <assert.h> | |
25 | #include <linux/limits.h> | |
26 | #include <sched.h> | |
27 | #include <stdio.h> | |
28 | #include <stdlib.h> | |
29 | #include <string.h> | |
30 | #include <sys/mount.h> | |
31 | #include <sys/types.h> | |
32 | #include <sys/wait.h> | |
33 | #include <unistd.h> | |
34 | ||
35 | #include "config.h" | |
36 | ||
37 | #include "bdev.h" | |
38 | #include "cgroup.h" | |
39 | #include "conf.h" | |
40 | #include "criu.h" | |
41 | #include "log.h" | |
42 | #include "lxc.h" | |
43 | #include "lxclock.h" | |
44 | #include "network.h" | |
45 | #include "utils.h" | |
46 | ||
47 | lxc_log_define(lxc_criu, lxc); | |
48 | ||
49 | void exec_criu(struct criu_opts *opts) | |
50 | { | |
51 | char **argv, log[PATH_MAX]; | |
dd62857a | 52 | int static_args = 20, argc = 0, i, ret; |
e29fe1dd TA |
53 | int netnr = 0; |
54 | struct lxc_list *it; | |
55 | ||
56 | char buf[4096]; | |
57 | FILE *mnts = NULL; | |
58 | ||
59 | /* The command line always looks like: | |
60 | * criu $(action) --tcp-established --file-locks --link-remap --force-irmap \ | |
61 | * --manage-cgroups action-script foo.sh -D $(directory) \ | |
62 | * -o $(directory)/$(action).log --ext-mount-map auto | |
63 | * --enable-external-sharing --enable-external-masters | |
dd62857a | 64 | * --enable-fs hugetlbfs |
e29fe1dd TA |
65 | * +1 for final NULL */ |
66 | ||
67 | if (strcmp(opts->action, "dump") == 0) { | |
68 | /* -t pid */ | |
69 | static_args += 2; | |
70 | ||
71 | /* --leave-running */ | |
72 | if (!opts->stop) | |
73 | static_args++; | |
74 | } else if (strcmp(opts->action, "restore") == 0) { | |
75 | /* --root $(lxc_mount_point) --restore-detached | |
76 | * --restore-sibling --pidfile $foo --cgroup-root $foo */ | |
77 | static_args += 8; | |
78 | } else { | |
79 | return; | |
80 | } | |
81 | ||
82 | if (opts->verbose) | |
83 | static_args++; | |
84 | ||
85 | ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->directory, opts->action); | |
86 | if (ret < 0 || ret >= PATH_MAX) { | |
87 | ERROR("logfile name too long\n"); | |
88 | return; | |
89 | } | |
90 | ||
91 | argv = malloc(static_args * sizeof(*argv)); | |
92 | if (!argv) | |
93 | return; | |
94 | ||
95 | memset(argv, 0, static_args * sizeof(*argv)); | |
96 | ||
97 | #define DECLARE_ARG(arg) \ | |
98 | do { \ | |
99 | if (arg == NULL) { \ | |
100 | ERROR("Got NULL argument for criu"); \ | |
101 | goto err; \ | |
102 | } \ | |
103 | argv[argc++] = strdup(arg); \ | |
104 | if (!argv[argc-1]) \ | |
105 | goto err; \ | |
106 | } while (0) | |
107 | ||
108 | argv[argc++] = on_path("criu", NULL); | |
109 | if (!argv[argc-1]) { | |
110 | ERROR("Couldn't find criu binary\n"); | |
111 | goto err; | |
112 | } | |
113 | ||
114 | DECLARE_ARG(opts->action); | |
115 | DECLARE_ARG("--tcp-established"); | |
116 | DECLARE_ARG("--file-locks"); | |
117 | DECLARE_ARG("--link-remap"); | |
118 | DECLARE_ARG("--force-irmap"); | |
119 | DECLARE_ARG("--manage-cgroups"); | |
120 | DECLARE_ARG("--ext-mount-map"); | |
121 | DECLARE_ARG("auto"); | |
122 | DECLARE_ARG("--enable-external-sharing"); | |
123 | DECLARE_ARG("--enable-external-masters"); | |
dd62857a TA |
124 | DECLARE_ARG("--enable-fs"); |
125 | DECLARE_ARG("hugetlbfs"); | |
e29fe1dd TA |
126 | DECLARE_ARG("-D"); |
127 | DECLARE_ARG(opts->directory); | |
128 | DECLARE_ARG("-o"); | |
129 | DECLARE_ARG(log); | |
130 | ||
131 | if (opts->verbose) | |
132 | DECLARE_ARG("-vvvvvv"); | |
133 | ||
134 | if (strcmp(opts->action, "dump") == 0) { | |
135 | char pid[32]; | |
136 | ||
137 | if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0) | |
138 | goto err; | |
139 | ||
140 | DECLARE_ARG("-t"); | |
141 | DECLARE_ARG(pid); | |
142 | if (!opts->stop) | |
143 | DECLARE_ARG("--leave-running"); | |
144 | } else if (strcmp(opts->action, "restore") == 0) { | |
145 | void *m; | |
146 | int additional; | |
147 | ||
148 | DECLARE_ARG("--root"); | |
149 | DECLARE_ARG(opts->c->lxc_conf->rootfs.mount); | |
150 | DECLARE_ARG("--restore-detached"); | |
151 | DECLARE_ARG("--restore-sibling"); | |
152 | DECLARE_ARG("--pidfile"); | |
153 | DECLARE_ARG(opts->pidfile); | |
154 | DECLARE_ARG("--cgroup-root"); | |
155 | DECLARE_ARG(opts->cgroup_path); | |
156 | ||
157 | additional = lxc_list_len(&opts->c->lxc_conf->network) * 2; | |
158 | ||
159 | m = realloc(argv, (argc + additional + 1) * sizeof(*argv)); \ | |
160 | if (!m) \ | |
161 | goto err; \ | |
162 | argv = m; | |
163 | ||
164 | lxc_list_for_each(it, &opts->c->lxc_conf->network) { | |
165 | char eth[128], *veth; | |
166 | struct lxc_netdev *n = it->elem; | |
167 | ||
168 | if (n->name) { | |
169 | if (strlen(n->name) >= sizeof(eth)) | |
170 | goto err; | |
171 | strncpy(eth, n->name, sizeof(eth)); | |
172 | } else | |
173 | sprintf(eth, "eth%d", netnr); | |
174 | ||
175 | veth = n->priv.veth_attr.pair; | |
176 | ||
177 | ret = snprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, n->link); | |
178 | if (ret < 0 || ret >= sizeof(buf)) | |
179 | goto err; | |
180 | ||
181 | DECLARE_ARG("--veth-pair"); | |
182 | DECLARE_ARG(buf); | |
183 | } | |
184 | ||
185 | } | |
186 | ||
187 | argv[argc] = NULL; | |
188 | ||
189 | #undef DECLARE_ARG | |
190 | execv(argv[0], argv); | |
191 | err: | |
192 | if (mnts) | |
193 | fclose(mnts); | |
194 | for (i = 0; argv[i]; i++) | |
195 | free(argv[i]); | |
196 | free(argv); | |
197 | } | |
198 | ||
8ba5ced7 TA |
199 | /* |
200 | * Check to see if the criu version is recent enough for all the features we | |
201 | * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and | |
202 | * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r | |
203 | * things potentially before a version is released with a particular feature. | |
204 | * | |
205 | * The intent is that when criu development slows down, we can drop this, but | |
206 | * for now we shouldn't attempt to c/r with versions that we know won't work. | |
207 | */ | |
208 | static bool criu_version_ok() | |
209 | { | |
210 | int pipes[2]; | |
211 | pid_t pid; | |
212 | ||
213 | if (pipe(pipes) < 0) { | |
214 | SYSERROR("pipe() failed"); | |
215 | return false; | |
216 | } | |
217 | ||
218 | pid = fork(); | |
219 | if (pid < 0) { | |
220 | SYSERROR("fork() failed"); | |
221 | return false; | |
222 | } | |
223 | ||
224 | if (pid == 0) { | |
225 | char *args[] = { "criu", "--version", NULL }; | |
226 | close(pipes[0]); | |
227 | ||
228 | close(STDERR_FILENO); | |
229 | if (dup2(pipes[1], STDOUT_FILENO) < 0) | |
230 | exit(1); | |
231 | ||
232 | execv("/usr/local/sbin/criu", args); | |
233 | exit(1); | |
234 | } else { | |
235 | FILE *f; | |
236 | char version[1024]; | |
237 | int patch; | |
238 | ||
239 | close(pipes[1]); | |
240 | if (wait_for_pid(pid) < 0) { | |
241 | close(pipes[0]); | |
4eae4051 | 242 | SYSERROR("execing criu failed, is it installed?"); |
8ba5ced7 TA |
243 | return false; |
244 | } | |
245 | ||
246 | f = fdopen(pipes[0], "r"); | |
247 | if (!f) { | |
248 | close(pipes[0]); | |
249 | return false; | |
250 | } | |
251 | ||
252 | if (fscanf(f, "Version: %1024[^\n]s", version) != 1) | |
253 | goto version_error; | |
254 | ||
255 | if (fgetc(f) != '\n') | |
256 | goto version_error; | |
257 | ||
258 | if (strcmp(version, CRIU_VERSION) >= 0) | |
259 | goto version_match; | |
260 | ||
261 | if (fscanf(f, "GitID: v%1024[^-]s", version) != 1) | |
262 | goto version_error; | |
263 | ||
264 | if (fgetc(f) != '-') | |
265 | goto version_error; | |
266 | ||
267 | if (fscanf(f, "%d", &patch) != 1) | |
268 | goto version_error; | |
269 | ||
270 | if (strcmp(version, CRIU_GITID_VERSION) < 0) | |
271 | goto version_error; | |
272 | ||
273 | if (patch < CRIU_GITID_PATCHLEVEL) | |
274 | goto version_error; | |
275 | ||
276 | version_match: | |
277 | close(pipes[0]); | |
278 | return true; | |
279 | ||
280 | version_error: | |
281 | close(pipes[0]); | |
282 | ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore\n"); | |
283 | return false; | |
284 | } | |
285 | } | |
286 | ||
e29fe1dd TA |
287 | /* Check and make sure the container has a configuration that we know CRIU can |
288 | * dump. */ | |
289 | bool criu_ok(struct lxc_container *c) | |
290 | { | |
291 | struct lxc_list *it; | |
292 | bool found_deny_rule = false; | |
293 | ||
8ba5ced7 TA |
294 | if (!criu_version_ok()) |
295 | return false; | |
296 | ||
e29fe1dd TA |
297 | if (geteuid()) { |
298 | ERROR("Must be root to checkpoint\n"); | |
299 | return false; | |
300 | } | |
301 | ||
302 | /* We only know how to restore containers with veth networks. */ | |
303 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
304 | struct lxc_netdev *n = it->elem; | |
305 | if (n->type != LXC_NET_VETH && n->type != LXC_NET_NONE) { | |
306 | ERROR("Found network that is not VETH or NONE\n"); | |
307 | return false; | |
308 | } | |
309 | } | |
310 | ||
311 | // These requirements come from http://criu.org/LXC | |
312 | if (c->lxc_conf->console.path && | |
313 | strcmp(c->lxc_conf->console.path, "none") != 0) { | |
314 | ERROR("lxc.console must be none\n"); | |
315 | return false; | |
316 | } | |
317 | ||
318 | if (c->lxc_conf->tty != 0) { | |
319 | ERROR("lxc.tty must be 0\n"); | |
320 | return false; | |
321 | } | |
322 | ||
323 | lxc_list_for_each(it, &c->lxc_conf->cgroup) { | |
324 | struct lxc_cgroup *cg = it->elem; | |
325 | if (strcmp(cg->subsystem, "devices.deny") == 0 && | |
326 | strcmp(cg->value, "c 5:1 rwm") == 0) { | |
327 | ||
328 | found_deny_rule = true; | |
329 | break; | |
330 | } | |
331 | } | |
332 | ||
333 | if (!found_deny_rule) { | |
334 | ERROR("couldn't find devices.deny = c 5:1 rwm"); | |
335 | return false; | |
336 | } | |
337 | ||
338 | return true; | |
339 | } | |
340 | ||
341 | bool dump_net_info(struct lxc_container *c, char *directory) | |
342 | { | |
343 | int netnr; | |
344 | struct lxc_list *it; | |
345 | ||
346 | netnr = 0; | |
347 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
348 | char *veth = NULL, *bridge = NULL, veth_path[PATH_MAX], eth[128]; | |
349 | struct lxc_netdev *n = it->elem; | |
350 | bool has_error = true; | |
351 | int pret; | |
352 | ||
353 | pret = snprintf(veth_path, PATH_MAX, "lxc.network.%d.veth.pair", netnr); | |
354 | if (pret < 0 || pret >= PATH_MAX) | |
355 | goto out; | |
356 | ||
357 | veth = c->get_running_config_item(c, veth_path); | |
358 | if (!veth) { | |
359 | /* criu_ok() checks that all interfaces are | |
360 | * LXC_NET{VETH,NONE}, and VETHs should have this | |
361 | * config */ | |
362 | assert(n->type == LXC_NET_NONE); | |
363 | break; | |
364 | } | |
365 | ||
366 | bridge = c->get_running_config_item(c, veth_path); | |
367 | if (!bridge) | |
368 | goto out; | |
369 | ||
370 | pret = snprintf(veth_path, PATH_MAX, "%s/veth%d", directory, netnr); | |
371 | if (pret < 0 || pret >= PATH_MAX || print_to_file(veth_path, veth) < 0) | |
372 | goto out; | |
373 | ||
374 | if (n->name) { | |
375 | if (strlen(n->name) >= 128) | |
376 | goto out; | |
377 | strncpy(eth, n->name, 128); | |
378 | } else | |
379 | sprintf(eth, "eth%d", netnr); | |
380 | ||
381 | has_error = false; | |
382 | out: | |
383 | free(veth); | |
384 | free(bridge); | |
385 | if (has_error) | |
386 | return false; | |
387 | } | |
388 | ||
389 | return true; | |
390 | } | |
391 | ||
392 | static bool restore_net_info(struct lxc_container *c) | |
393 | { | |
394 | struct lxc_list *it; | |
395 | bool has_error = true; | |
396 | ||
397 | if (container_mem_lock(c)) | |
398 | return false; | |
399 | ||
400 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
401 | struct lxc_netdev *netdev = it->elem; | |
402 | char template[IFNAMSIZ]; | |
403 | snprintf(template, sizeof(template), "vethXXXXXX"); | |
404 | ||
405 | if (!netdev->priv.veth_attr.pair) | |
406 | netdev->priv.veth_attr.pair = lxc_mkifname(template); | |
407 | ||
408 | if (!netdev->priv.veth_attr.pair) | |
409 | goto out_unlock; | |
410 | } | |
411 | ||
412 | has_error = false; | |
413 | ||
414 | out_unlock: | |
415 | container_mem_unlock(c); | |
416 | return !has_error; | |
417 | } | |
418 | ||
419 | void do_restore(struct lxc_container *c, int pipe, char *directory, bool verbose) | |
420 | { | |
421 | pid_t pid; | |
422 | char pidfile[L_tmpnam]; | |
423 | struct lxc_handler *handler; | |
424 | int status; | |
425 | ||
426 | if (!tmpnam(pidfile)) | |
427 | goto out; | |
428 | ||
429 | handler = lxc_init(c->name, c->lxc_conf, c->config_path); | |
430 | if (!handler) | |
431 | goto out; | |
432 | ||
433 | if (!cgroup_init(handler)) { | |
434 | ERROR("failed initing cgroups"); | |
435 | goto out_fini_handler; | |
436 | } | |
437 | ||
438 | if (!cgroup_create(handler)) { | |
439 | ERROR("failed creating groups"); | |
440 | goto out_fini_handler; | |
441 | } | |
442 | ||
443 | if (!restore_net_info(c)) { | |
444 | ERROR("failed restoring network info"); | |
445 | goto out_fini_handler; | |
446 | } | |
447 | ||
448 | resolve_clone_flags(handler); | |
449 | ||
450 | pid = fork(); | |
451 | if (pid < 0) | |
452 | goto out_fini_handler; | |
453 | ||
454 | if (pid == 0) { | |
455 | struct criu_opts os; | |
456 | struct lxc_rootfs *rootfs; | |
457 | ||
458 | close(pipe); | |
459 | pipe = -1; | |
460 | ||
461 | if (unshare(CLONE_NEWNS)) | |
462 | goto out_fini_handler; | |
463 | ||
464 | /* CRIU needs the lxc root bind mounted so that it is the root of some | |
465 | * mount. */ | |
466 | rootfs = &c->lxc_conf->rootfs; | |
467 | ||
468 | if (rootfs_is_blockdev(c->lxc_conf)) { | |
469 | if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0) | |
470 | goto out_fini_handler; | |
471 | } else { | |
472 | if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST) | |
473 | goto out_fini_handler; | |
474 | ||
475 | if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) { | |
476 | SYSERROR("remount / to private failed"); | |
477 | goto out_fini_handler; | |
478 | } | |
479 | ||
480 | if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) { | |
481 | rmdir(rootfs->mount); | |
482 | goto out_fini_handler; | |
483 | } | |
484 | } | |
485 | ||
486 | os.action = "restore"; | |
487 | os.directory = directory; | |
488 | os.c = c; | |
489 | os.pidfile = pidfile; | |
490 | os.verbose = verbose; | |
491 | os.cgroup_path = cgroup_canonical_path(handler); | |
492 | ||
493 | /* exec_criu() returning is an error */ | |
494 | exec_criu(&os); | |
495 | umount(rootfs->mount); | |
496 | rmdir(rootfs->mount); | |
497 | goto out_fini_handler; | |
498 | } else { | |
499 | int ret; | |
500 | char title[2048]; | |
501 | ||
502 | pid_t w = waitpid(pid, &status, 0); | |
503 | if (w == -1) { | |
504 | SYSERROR("waitpid"); | |
505 | goto out_fini_handler; | |
506 | } | |
507 | ||
508 | ret = write(pipe, &status, sizeof(status)); | |
509 | close(pipe); | |
510 | pipe = -1; | |
511 | ||
512 | if (sizeof(status) != ret) { | |
513 | SYSERROR("failed to write all of status"); | |
514 | goto out_fini_handler; | |
515 | } | |
516 | ||
517 | if (WIFEXITED(status)) { | |
518 | if (WEXITSTATUS(status)) { | |
519 | goto out_fini_handler; | |
520 | } else { | |
521 | int ret; | |
522 | FILE *f = fopen(pidfile, "r"); | |
523 | if (!f) { | |
524 | SYSERROR("couldn't read restore's init pidfile %s\n", pidfile); | |
525 | goto out_fini_handler; | |
526 | } | |
527 | ||
528 | ret = fscanf(f, "%d", (int*) &handler->pid); | |
529 | fclose(f); | |
59c2d406 TA |
530 | if (unlink(pidfile) < 0 && errno != ENOENT) |
531 | SYSERROR("unlinking pidfile failed"); | |
532 | ||
e29fe1dd TA |
533 | if (ret != 1) { |
534 | ERROR("reading restore pid failed"); | |
535 | goto out_fini_handler; | |
536 | } | |
537 | ||
538 | if (lxc_set_state(c->name, handler, RUNNING)) | |
539 | goto out_fini_handler; | |
540 | } | |
541 | } else { | |
542 | ERROR("CRIU was killed with signal %d\n", WTERMSIG(status)); | |
543 | goto out_fini_handler; | |
544 | } | |
545 | ||
546 | /* | |
547 | * See comment in lxcapi_start; we don't care if these | |
548 | * fail because it's just a beauty thing. We just | |
549 | * assign the return here to silence potential. | |
550 | */ | |
551 | ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name); | |
552 | ret = setproctitle(title); | |
553 | ||
554 | ret = lxc_poll(c->name, handler); | |
555 | if (ret) | |
556 | lxc_abort(c->name, handler); | |
557 | lxc_fini(c->name, handler); | |
558 | exit(ret); | |
559 | } | |
560 | ||
561 | out_fini_handler: | |
562 | lxc_fini(c->name, handler); | |
59c2d406 TA |
563 | if (unlink(pidfile) < 0 && errno != ENOENT) |
564 | SYSERROR("unlinking pidfile failed"); | |
e29fe1dd TA |
565 | |
566 | out: | |
567 | if (pipe >= 0) { | |
568 | status = 1; | |
569 | if (write(pipe, &status, sizeof(status)) != sizeof(status)) { | |
570 | SYSERROR("writing status failed"); | |
571 | } | |
572 | close(pipe); | |
573 | } | |
574 | ||
575 | exit(1); | |
576 | } |