]>
Commit | Line | Data |
---|---|---|
e29fe1dd TA |
1 | /* |
2 | * lxc: linux Container library | |
3 | * | |
4 | * Copyright © 2014-2015 Canonical Ltd. | |
5 | * | |
6 | * Authors: | |
7 | * Tycho Andersen <tycho.andersen@canonical.com> | |
8 | * | |
9 | * This library is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License as published by the Free Software Foundation; either | |
12 | * version 2.1 of the License, or (at your option) any later version. | |
13 | * | |
14 | * This library is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | * Lesser General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU Lesser General Public | |
20 | * License along with this library; if not, write to the Free Software | |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | */ | |
23 | #define _GNU_SOURCE | |
24 | #include <assert.h> | |
25 | #include <linux/limits.h> | |
26 | #include <sched.h> | |
27 | #include <stdio.h> | |
28 | #include <stdlib.h> | |
29 | #include <string.h> | |
30 | #include <sys/mount.h> | |
31 | #include <sys/types.h> | |
32 | #include <sys/wait.h> | |
33 | #include <unistd.h> | |
34 | ||
35 | #include "config.h" | |
36 | ||
37 | #include "bdev.h" | |
38 | #include "cgroup.h" | |
39 | #include "conf.h" | |
40 | #include "criu.h" | |
41 | #include "log.h" | |
42 | #include "lxc.h" | |
43 | #include "lxclock.h" | |
44 | #include "network.h" | |
45 | #include "utils.h" | |
46 | ||
47 | lxc_log_define(lxc_criu, lxc); | |
48 | ||
49 | void exec_criu(struct criu_opts *opts) | |
50 | { | |
51 | char **argv, log[PATH_MAX]; | |
dd62857a | 52 | int static_args = 20, argc = 0, i, ret; |
e29fe1dd TA |
53 | int netnr = 0; |
54 | struct lxc_list *it; | |
55 | ||
56 | char buf[4096]; | |
57 | FILE *mnts = NULL; | |
58 | ||
59 | /* The command line always looks like: | |
60 | * criu $(action) --tcp-established --file-locks --link-remap --force-irmap \ | |
61 | * --manage-cgroups action-script foo.sh -D $(directory) \ | |
62 | * -o $(directory)/$(action).log --ext-mount-map auto | |
63 | * --enable-external-sharing --enable-external-masters | |
dd62857a | 64 | * --enable-fs hugetlbfs |
e29fe1dd TA |
65 | * +1 for final NULL */ |
66 | ||
67 | if (strcmp(opts->action, "dump") == 0) { | |
68 | /* -t pid */ | |
69 | static_args += 2; | |
70 | ||
71 | /* --leave-running */ | |
72 | if (!opts->stop) | |
73 | static_args++; | |
74 | } else if (strcmp(opts->action, "restore") == 0) { | |
75 | /* --root $(lxc_mount_point) --restore-detached | |
76 | * --restore-sibling --pidfile $foo --cgroup-root $foo */ | |
77 | static_args += 8; | |
78 | } else { | |
79 | return; | |
80 | } | |
81 | ||
82 | if (opts->verbose) | |
83 | static_args++; | |
84 | ||
85 | ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->directory, opts->action); | |
86 | if (ret < 0 || ret >= PATH_MAX) { | |
87 | ERROR("logfile name too long\n"); | |
88 | return; | |
89 | } | |
90 | ||
91 | argv = malloc(static_args * sizeof(*argv)); | |
92 | if (!argv) | |
93 | return; | |
94 | ||
95 | memset(argv, 0, static_args * sizeof(*argv)); | |
96 | ||
97 | #define DECLARE_ARG(arg) \ | |
98 | do { \ | |
99 | if (arg == NULL) { \ | |
100 | ERROR("Got NULL argument for criu"); \ | |
101 | goto err; \ | |
102 | } \ | |
103 | argv[argc++] = strdup(arg); \ | |
104 | if (!argv[argc-1]) \ | |
105 | goto err; \ | |
106 | } while (0) | |
107 | ||
108 | argv[argc++] = on_path("criu", NULL); | |
109 | if (!argv[argc-1]) { | |
110 | ERROR("Couldn't find criu binary\n"); | |
111 | goto err; | |
112 | } | |
113 | ||
114 | DECLARE_ARG(opts->action); | |
115 | DECLARE_ARG("--tcp-established"); | |
116 | DECLARE_ARG("--file-locks"); | |
117 | DECLARE_ARG("--link-remap"); | |
118 | DECLARE_ARG("--force-irmap"); | |
119 | DECLARE_ARG("--manage-cgroups"); | |
120 | DECLARE_ARG("--ext-mount-map"); | |
121 | DECLARE_ARG("auto"); | |
122 | DECLARE_ARG("--enable-external-sharing"); | |
123 | DECLARE_ARG("--enable-external-masters"); | |
dd62857a TA |
124 | DECLARE_ARG("--enable-fs"); |
125 | DECLARE_ARG("hugetlbfs"); | |
e29fe1dd TA |
126 | DECLARE_ARG("-D"); |
127 | DECLARE_ARG(opts->directory); | |
128 | DECLARE_ARG("-o"); | |
129 | DECLARE_ARG(log); | |
130 | ||
131 | if (opts->verbose) | |
132 | DECLARE_ARG("-vvvvvv"); | |
133 | ||
134 | if (strcmp(opts->action, "dump") == 0) { | |
135 | char pid[32]; | |
136 | ||
137 | if (sprintf(pid, "%d", opts->c->init_pid(opts->c)) < 0) | |
138 | goto err; | |
139 | ||
140 | DECLARE_ARG("-t"); | |
141 | DECLARE_ARG(pid); | |
142 | if (!opts->stop) | |
143 | DECLARE_ARG("--leave-running"); | |
144 | } else if (strcmp(opts->action, "restore") == 0) { | |
145 | void *m; | |
146 | int additional; | |
147 | ||
148 | DECLARE_ARG("--root"); | |
149 | DECLARE_ARG(opts->c->lxc_conf->rootfs.mount); | |
150 | DECLARE_ARG("--restore-detached"); | |
151 | DECLARE_ARG("--restore-sibling"); | |
152 | DECLARE_ARG("--pidfile"); | |
153 | DECLARE_ARG(opts->pidfile); | |
154 | DECLARE_ARG("--cgroup-root"); | |
155 | DECLARE_ARG(opts->cgroup_path); | |
156 | ||
157 | additional = lxc_list_len(&opts->c->lxc_conf->network) * 2; | |
158 | ||
159 | m = realloc(argv, (argc + additional + 1) * sizeof(*argv)); \ | |
160 | if (!m) \ | |
161 | goto err; \ | |
162 | argv = m; | |
163 | ||
164 | lxc_list_for_each(it, &opts->c->lxc_conf->network) { | |
165 | char eth[128], *veth; | |
166 | struct lxc_netdev *n = it->elem; | |
167 | ||
168 | if (n->name) { | |
169 | if (strlen(n->name) >= sizeof(eth)) | |
170 | goto err; | |
171 | strncpy(eth, n->name, sizeof(eth)); | |
172 | } else | |
173 | sprintf(eth, "eth%d", netnr); | |
174 | ||
175 | veth = n->priv.veth_attr.pair; | |
176 | ||
177 | ret = snprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, n->link); | |
178 | if (ret < 0 || ret >= sizeof(buf)) | |
179 | goto err; | |
180 | ||
181 | DECLARE_ARG("--veth-pair"); | |
182 | DECLARE_ARG(buf); | |
183 | } | |
184 | ||
185 | } | |
186 | ||
187 | argv[argc] = NULL; | |
188 | ||
189 | #undef DECLARE_ARG | |
190 | execv(argv[0], argv); | |
191 | err: | |
192 | if (mnts) | |
193 | fclose(mnts); | |
194 | for (i = 0; argv[i]; i++) | |
195 | free(argv[i]); | |
196 | free(argv); | |
197 | } | |
198 | ||
8ba5ced7 TA |
199 | /* |
200 | * Check to see if the criu version is recent enough for all the features we | |
201 | * use. This version allows either CRIU_VERSION or (CRIU_GITID_VERSION and | |
202 | * CRIU_GITID_PATCHLEVEL) to work, enabling users building from git to c/r | |
203 | * things potentially before a version is released with a particular feature. | |
204 | * | |
205 | * The intent is that when criu development slows down, we can drop this, but | |
206 | * for now we shouldn't attempt to c/r with versions that we know won't work. | |
207 | */ | |
208 | static bool criu_version_ok() | |
209 | { | |
210 | int pipes[2]; | |
211 | pid_t pid; | |
212 | ||
213 | if (pipe(pipes) < 0) { | |
214 | SYSERROR("pipe() failed"); | |
215 | return false; | |
216 | } | |
217 | ||
218 | pid = fork(); | |
219 | if (pid < 0) { | |
220 | SYSERROR("fork() failed"); | |
221 | return false; | |
222 | } | |
223 | ||
224 | if (pid == 0) { | |
225 | char *args[] = { "criu", "--version", NULL }; | |
755fa453 | 226 | char *path; |
8ba5ced7 TA |
227 | close(pipes[0]); |
228 | ||
229 | close(STDERR_FILENO); | |
230 | if (dup2(pipes[1], STDOUT_FILENO) < 0) | |
231 | exit(1); | |
232 | ||
755fa453 TA |
233 | path = on_path("criu", NULL); |
234 | execv(path, args); | |
8ba5ced7 TA |
235 | exit(1); |
236 | } else { | |
237 | FILE *f; | |
238 | char version[1024]; | |
239 | int patch; | |
240 | ||
241 | close(pipes[1]); | |
242 | if (wait_for_pid(pid) < 0) { | |
243 | close(pipes[0]); | |
4eae4051 | 244 | SYSERROR("execing criu failed, is it installed?"); |
8ba5ced7 TA |
245 | return false; |
246 | } | |
247 | ||
248 | f = fdopen(pipes[0], "r"); | |
249 | if (!f) { | |
250 | close(pipes[0]); | |
251 | return false; | |
252 | } | |
253 | ||
254 | if (fscanf(f, "Version: %1024[^\n]s", version) != 1) | |
255 | goto version_error; | |
256 | ||
257 | if (fgetc(f) != '\n') | |
258 | goto version_error; | |
259 | ||
260 | if (strcmp(version, CRIU_VERSION) >= 0) | |
261 | goto version_match; | |
262 | ||
263 | if (fscanf(f, "GitID: v%1024[^-]s", version) != 1) | |
264 | goto version_error; | |
265 | ||
266 | if (fgetc(f) != '-') | |
267 | goto version_error; | |
268 | ||
269 | if (fscanf(f, "%d", &patch) != 1) | |
270 | goto version_error; | |
271 | ||
272 | if (strcmp(version, CRIU_GITID_VERSION) < 0) | |
273 | goto version_error; | |
274 | ||
275 | if (patch < CRIU_GITID_PATCHLEVEL) | |
276 | goto version_error; | |
277 | ||
278 | version_match: | |
279 | close(pipes[0]); | |
280 | return true; | |
281 | ||
282 | version_error: | |
283 | close(pipes[0]); | |
284 | ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore\n"); | |
285 | return false; | |
286 | } | |
287 | } | |
288 | ||
e29fe1dd TA |
289 | /* Check and make sure the container has a configuration that we know CRIU can |
290 | * dump. */ | |
291 | bool criu_ok(struct lxc_container *c) | |
292 | { | |
293 | struct lxc_list *it; | |
294 | bool found_deny_rule = false; | |
295 | ||
8ba5ced7 TA |
296 | if (!criu_version_ok()) |
297 | return false; | |
298 | ||
e29fe1dd TA |
299 | if (geteuid()) { |
300 | ERROR("Must be root to checkpoint\n"); | |
301 | return false; | |
302 | } | |
303 | ||
304 | /* We only know how to restore containers with veth networks. */ | |
305 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
306 | struct lxc_netdev *n = it->elem; | |
307 | if (n->type != LXC_NET_VETH && n->type != LXC_NET_NONE) { | |
308 | ERROR("Found network that is not VETH or NONE\n"); | |
309 | return false; | |
310 | } | |
311 | } | |
312 | ||
313 | // These requirements come from http://criu.org/LXC | |
314 | if (c->lxc_conf->console.path && | |
315 | strcmp(c->lxc_conf->console.path, "none") != 0) { | |
316 | ERROR("lxc.console must be none\n"); | |
317 | return false; | |
318 | } | |
319 | ||
320 | if (c->lxc_conf->tty != 0) { | |
321 | ERROR("lxc.tty must be 0\n"); | |
322 | return false; | |
323 | } | |
324 | ||
325 | lxc_list_for_each(it, &c->lxc_conf->cgroup) { | |
326 | struct lxc_cgroup *cg = it->elem; | |
327 | if (strcmp(cg->subsystem, "devices.deny") == 0 && | |
328 | strcmp(cg->value, "c 5:1 rwm") == 0) { | |
329 | ||
330 | found_deny_rule = true; | |
331 | break; | |
332 | } | |
333 | } | |
334 | ||
335 | if (!found_deny_rule) { | |
336 | ERROR("couldn't find devices.deny = c 5:1 rwm"); | |
337 | return false; | |
338 | } | |
339 | ||
340 | return true; | |
341 | } | |
342 | ||
343 | bool dump_net_info(struct lxc_container *c, char *directory) | |
344 | { | |
345 | int netnr; | |
346 | struct lxc_list *it; | |
347 | ||
348 | netnr = 0; | |
349 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
350 | char *veth = NULL, *bridge = NULL, veth_path[PATH_MAX], eth[128]; | |
351 | struct lxc_netdev *n = it->elem; | |
352 | bool has_error = true; | |
353 | int pret; | |
354 | ||
355 | pret = snprintf(veth_path, PATH_MAX, "lxc.network.%d.veth.pair", netnr); | |
356 | if (pret < 0 || pret >= PATH_MAX) | |
357 | goto out; | |
358 | ||
359 | veth = c->get_running_config_item(c, veth_path); | |
360 | if (!veth) { | |
361 | /* criu_ok() checks that all interfaces are | |
362 | * LXC_NET{VETH,NONE}, and VETHs should have this | |
363 | * config */ | |
364 | assert(n->type == LXC_NET_NONE); | |
365 | break; | |
366 | } | |
367 | ||
368 | bridge = c->get_running_config_item(c, veth_path); | |
369 | if (!bridge) | |
370 | goto out; | |
371 | ||
372 | pret = snprintf(veth_path, PATH_MAX, "%s/veth%d", directory, netnr); | |
373 | if (pret < 0 || pret >= PATH_MAX || print_to_file(veth_path, veth) < 0) | |
374 | goto out; | |
375 | ||
376 | if (n->name) { | |
377 | if (strlen(n->name) >= 128) | |
378 | goto out; | |
379 | strncpy(eth, n->name, 128); | |
380 | } else | |
381 | sprintf(eth, "eth%d", netnr); | |
382 | ||
383 | has_error = false; | |
384 | out: | |
385 | free(veth); | |
386 | free(bridge); | |
387 | if (has_error) | |
388 | return false; | |
389 | } | |
390 | ||
391 | return true; | |
392 | } | |
393 | ||
394 | static bool restore_net_info(struct lxc_container *c) | |
395 | { | |
396 | struct lxc_list *it; | |
397 | bool has_error = true; | |
398 | ||
399 | if (container_mem_lock(c)) | |
400 | return false; | |
401 | ||
402 | lxc_list_for_each(it, &c->lxc_conf->network) { | |
403 | struct lxc_netdev *netdev = it->elem; | |
404 | char template[IFNAMSIZ]; | |
405 | snprintf(template, sizeof(template), "vethXXXXXX"); | |
406 | ||
407 | if (!netdev->priv.veth_attr.pair) | |
408 | netdev->priv.veth_attr.pair = lxc_mkifname(template); | |
409 | ||
410 | if (!netdev->priv.veth_attr.pair) | |
411 | goto out_unlock; | |
412 | } | |
413 | ||
414 | has_error = false; | |
415 | ||
416 | out_unlock: | |
417 | container_mem_unlock(c); | |
418 | return !has_error; | |
419 | } | |
420 | ||
421 | void do_restore(struct lxc_container *c, int pipe, char *directory, bool verbose) | |
422 | { | |
423 | pid_t pid; | |
424 | char pidfile[L_tmpnam]; | |
425 | struct lxc_handler *handler; | |
426 | int status; | |
427 | ||
428 | if (!tmpnam(pidfile)) | |
429 | goto out; | |
430 | ||
431 | handler = lxc_init(c->name, c->lxc_conf, c->config_path); | |
432 | if (!handler) | |
433 | goto out; | |
434 | ||
435 | if (!cgroup_init(handler)) { | |
436 | ERROR("failed initing cgroups"); | |
437 | goto out_fini_handler; | |
438 | } | |
439 | ||
440 | if (!cgroup_create(handler)) { | |
441 | ERROR("failed creating groups"); | |
442 | goto out_fini_handler; | |
443 | } | |
444 | ||
445 | if (!restore_net_info(c)) { | |
446 | ERROR("failed restoring network info"); | |
447 | goto out_fini_handler; | |
448 | } | |
449 | ||
450 | resolve_clone_flags(handler); | |
451 | ||
452 | pid = fork(); | |
453 | if (pid < 0) | |
454 | goto out_fini_handler; | |
455 | ||
456 | if (pid == 0) { | |
457 | struct criu_opts os; | |
458 | struct lxc_rootfs *rootfs; | |
459 | ||
460 | close(pipe); | |
461 | pipe = -1; | |
462 | ||
463 | if (unshare(CLONE_NEWNS)) | |
464 | goto out_fini_handler; | |
465 | ||
466 | /* CRIU needs the lxc root bind mounted so that it is the root of some | |
467 | * mount. */ | |
468 | rootfs = &c->lxc_conf->rootfs; | |
469 | ||
470 | if (rootfs_is_blockdev(c->lxc_conf)) { | |
471 | if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0) | |
472 | goto out_fini_handler; | |
473 | } else { | |
474 | if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST) | |
475 | goto out_fini_handler; | |
476 | ||
477 | if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) { | |
478 | SYSERROR("remount / to private failed"); | |
479 | goto out_fini_handler; | |
480 | } | |
481 | ||
482 | if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) { | |
483 | rmdir(rootfs->mount); | |
484 | goto out_fini_handler; | |
485 | } | |
486 | } | |
487 | ||
488 | os.action = "restore"; | |
489 | os.directory = directory; | |
490 | os.c = c; | |
491 | os.pidfile = pidfile; | |
492 | os.verbose = verbose; | |
493 | os.cgroup_path = cgroup_canonical_path(handler); | |
494 | ||
495 | /* exec_criu() returning is an error */ | |
496 | exec_criu(&os); | |
497 | umount(rootfs->mount); | |
498 | rmdir(rootfs->mount); | |
499 | goto out_fini_handler; | |
500 | } else { | |
501 | int ret; | |
502 | char title[2048]; | |
503 | ||
504 | pid_t w = waitpid(pid, &status, 0); | |
505 | if (w == -1) { | |
506 | SYSERROR("waitpid"); | |
507 | goto out_fini_handler; | |
508 | } | |
509 | ||
510 | ret = write(pipe, &status, sizeof(status)); | |
511 | close(pipe); | |
512 | pipe = -1; | |
513 | ||
514 | if (sizeof(status) != ret) { | |
515 | SYSERROR("failed to write all of status"); | |
516 | goto out_fini_handler; | |
517 | } | |
518 | ||
519 | if (WIFEXITED(status)) { | |
520 | if (WEXITSTATUS(status)) { | |
521 | goto out_fini_handler; | |
522 | } else { | |
523 | int ret; | |
524 | FILE *f = fopen(pidfile, "r"); | |
525 | if (!f) { | |
526 | SYSERROR("couldn't read restore's init pidfile %s\n", pidfile); | |
527 | goto out_fini_handler; | |
528 | } | |
529 | ||
530 | ret = fscanf(f, "%d", (int*) &handler->pid); | |
531 | fclose(f); | |
59c2d406 TA |
532 | if (unlink(pidfile) < 0 && errno != ENOENT) |
533 | SYSERROR("unlinking pidfile failed"); | |
534 | ||
e29fe1dd TA |
535 | if (ret != 1) { |
536 | ERROR("reading restore pid failed"); | |
537 | goto out_fini_handler; | |
538 | } | |
539 | ||
540 | if (lxc_set_state(c->name, handler, RUNNING)) | |
541 | goto out_fini_handler; | |
542 | } | |
543 | } else { | |
544 | ERROR("CRIU was killed with signal %d\n", WTERMSIG(status)); | |
545 | goto out_fini_handler; | |
546 | } | |
547 | ||
548 | /* | |
549 | * See comment in lxcapi_start; we don't care if these | |
550 | * fail because it's just a beauty thing. We just | |
551 | * assign the return here to silence potential. | |
552 | */ | |
553 | ret = snprintf(title, sizeof(title), "[lxc monitor] %s %s", c->config_path, c->name); | |
554 | ret = setproctitle(title); | |
555 | ||
556 | ret = lxc_poll(c->name, handler); | |
557 | if (ret) | |
558 | lxc_abort(c->name, handler); | |
559 | lxc_fini(c->name, handler); | |
560 | exit(ret); | |
561 | } | |
562 | ||
563 | out_fini_handler: | |
564 | lxc_fini(c->name, handler); | |
59c2d406 TA |
565 | if (unlink(pidfile) < 0 && errno != ENOENT) |
566 | SYSERROR("unlinking pidfile failed"); | |
e29fe1dd TA |
567 | |
568 | out: | |
569 | if (pipe >= 0) { | |
570 | status = 1; | |
571 | if (write(pipe, &status, sizeof(status)) != sizeof(status)) { | |
572 | SYSERROR("writing status failed"); | |
573 | } | |
574 | close(pipe); | |
575 | } | |
576 | ||
577 | exit(1); | |
578 | } |