* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define _GNU_SOURCE
-#include <assert.h>
+#include <inttypes.h>
#include <linux/limits.h>
#include <sched.h>
#include <stdio.h>
#include "config.h"
-#include "bdev/bdev.h"
+#include "bdev.h"
#include "cgroup.h"
#include "conf.h"
#include "commands.h"
#include "network.h"
#include "utils.h"
-#define CRIU_VERSION "2.0"
+#if IS_BIONIC
+#include <../include/lxcmntent.h>
+#else
+#include <mntent.h>
+#endif
+
+#define CRIU_VERSION "2.0"
#define CRIU_GITID_VERSION "2.0"
#define CRIU_GITID_PATCHLEVEL 0
+#define CRIU_IN_FLIGHT_SUPPORT "2.4"
+#define CRIU_EXTERNAL_NOT_VETH "2.8"
+
lxc_log_define(lxc_criu, lxc);
struct criu_opts {
+ /* the thing to hook to stdout and stderr for logging */
+ int pipefd;
+
/* The type of criu invocation, one of "dump" or "restore" */
char *action;
char tty_id[32]; /* the criu tty id for /dev/console, i.e. "tty[${rdev}:${dev}]" */
/* restore: the file to write the init process' pid into */
- char *pidfile;
- const char *cgroup_path;
+ struct lxc_handler *handler;
int console_fd;
/* The path that is bind mounted from /dev/console, if any. We don't
* want to use `--ext-mount-map auto`'s result here because the pts
* different) on the target host. NULL if lxc.console = "none".
*/
char *console_name;
+
+ /* The detected version of criu */
+ char *criu_version;
};
static int load_tty_major_minor(char *directory, char *output, int len)
int static_args = 23, argc = 0, i, ret;
int netnr = 0;
struct lxc_list *it;
+ FILE *mnts;
+ struct mntent mntent;
char buf[4096], tty_info[32];
size_t pos;
+
/* If we are currently in a cgroup /foo/bar, and the container is in a
* cgroup /lxc/foo, lxcfs will give us an ENOENT if some task in the
* container has an open fd that points to one of the cgroup files
/* The command line always looks like:
* criu $(action) --tcp-established --file-locks --link-remap \
- * --manage-cgroups=full action-script foo.sh -D $(directory) \
+ * --manage-cgroups=full --action-script foo.sh -D $(directory) \
* -o $(directory)/$(action).log --ext-mount-map auto
* --enable-external-sharing --enable-external-masters
* --enable-fs hugetlbfs --enable-fs tracefs --ext-mount-map console:/dev/pts/n
/* --force-irmap */
if (!opts->user->preserves_inodes)
static_args++;
+
+ /* --ghost-limit 1024 */
+ if (opts->user->ghost_limit)
+ static_args += 2;
} else if (strcmp(opts->action, "restore") == 0) {
/* --root $(lxc_mount_point) --restore-detached
- * --restore-sibling --pidfile $foo --cgroup-root $foo
+ * --restore-sibling
* --lsm-profile apparmor:whatever
*/
- static_args += 10;
+ static_args += 6;
tty_info[0] = 0;
if (load_tty_major_minor(opts->user->directory, tty_info, sizeof(tty_info)))
return;
}
+ if (cgroup_num_hierarchies() > 0)
+ static_args += 2 * cgroup_num_hierarchies();
+
if (opts->user->verbose)
static_args++;
if (opts->user->action_script)
static_args += 2;
+ static_args += 2 * lxc_list_len(&opts->c->lxc_conf->mount_list);
+
ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->user->directory, opts->action);
if (ret < 0 || ret >= PATH_MAX) {
- ERROR("logfile name too long\n");
+ ERROR("logfile name too long");
return;
}
argv[argc++] = on_path("criu", NULL);
if (!argv[argc-1]) {
- ERROR("Couldn't find criu binary\n");
+ ERROR("Couldn't find criu binary");
goto err;
}
DECLARE_ARG("-o");
DECLARE_ARG(log);
+ for (i = 0; i < cgroup_num_hierarchies(); i++) {
+ char **controllers = NULL, *fullname;
+ char *path;
+
+ if (!cgroup_get_hierarchies(i, &controllers)) {
+ ERROR("failed to get hierarchy %d", i);
+ goto err;
+ }
+
+ /* if we are in a dump, we have to ask the monitor process what
+ * the right cgroup is. if this is a restore, we can just use
+ * the handler the restore task created.
+ */
+ if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
+ path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
+ if (!path) {
+ ERROR("failed to get cgroup path for %s", controllers[0]);
+ goto err;
+ }
+ } else {
+ const char *p;
+
+ p = cgroup_get_cgroup(opts->handler, controllers[0]);
+ if (!p) {
+ ERROR("failed to get cgroup path for %s", controllers[0]);
+ goto err;
+ }
+
+ path = strdup(p);
+ if (!path) {
+ ERROR("strdup failed");
+ goto err;
+ }
+ }
+
+ if (!lxc_deslashify(&path)) {
+ ERROR("failed to deslashify %s", path);
+ free(path);
+ goto err;
+ }
+
+ fullname = lxc_string_join(",", (const char **) controllers, false);
+ if (!fullname) {
+ ERROR("failed to join controllers");
+ free(path);
+ goto err;
+ }
+
+ ret = sprintf(buf, "%s:%s", fullname, path);
+ free(path);
+ free(fullname);
+ if (ret < 0 || ret >= sizeof(buf)) {
+ ERROR("sprintf of cgroup root arg failed");
+ goto err;
+ }
+
+ DECLARE_ARG("--cgroup-root");
+ DECLARE_ARG(buf);
+ }
+
if (opts->user->verbose)
DECLARE_ARG("-vvvvvv");
DECLARE_ARG(opts->user->action_script);
}
+ mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list);
+ if (!mnts)
+ goto err;
+
+ while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
+ char *fmt, *key, *val, *mntdata;
+ char arg[2 * PATH_MAX + 2];
+ unsigned long flags;
+
+ if (parse_mntopts(mntent.mnt_opts, &flags, &mntdata) < 0)
+ goto err;
+
+ free(mntdata);
+
+ /* only add --ext-mount-map for actual bind mounts */
+ if (!(flags & MS_BIND))
+ continue;
+
+ if (strcmp(opts->action, "dump") == 0) {
+ fmt = "/%s:%s";
+ key = mntent.mnt_dir;
+ val = mntent.mnt_dir;
+ } else {
+ fmt = "%s:%s";
+ key = mntent.mnt_dir;
+ val = mntent.mnt_fsname;
+ }
+
+ ret = snprintf(arg, sizeof(arg), fmt, key, val);
+ if (ret < 0 || ret >= sizeof(arg)) {
+ fclose(mnts);
+ ERROR("snprintf failed");
+ goto err;
+ }
+
+ DECLARE_ARG("--ext-mount-map");
+ DECLARE_ARG(arg);
+ }
+ fclose(mnts);
+
if (strcmp(opts->action, "dump") == 0 || strcmp(opts->action, "pre-dump") == 0) {
char pid[32], *freezer_relative;
if (ret < 0 || ret >= sizeof(log))
goto err;
+ if (!opts->user->disable_skip_in_flight &&
+ strcmp(opts->criu_version, CRIU_IN_FLIGHT_SUPPORT) >= 0)
+ DECLARE_ARG("--skip-in-flight");
+
DECLARE_ARG("--freeze-cgroup");
DECLARE_ARG(log);
if (opts->user->predump_dir) {
DECLARE_ARG("--prev-images-dir");
DECLARE_ARG(opts->user->predump_dir);
+ DECLARE_ARG("--track-mem");
}
if (opts->user->pageserver_address && opts->user->pageserver_port) {
if (!opts->user->preserves_inodes)
DECLARE_ARG("--force-irmap");
+ if (opts->user->ghost_limit) {
+ char ghost_limit[32];
+
+ ret = sprintf(ghost_limit, "%"PRIu64, opts->user->ghost_limit);
+ if (ret < 0 || ret >= sizeof(ghost_limit)) {
+ ERROR("failed to print ghost limit %"PRIu64, opts->user->ghost_limit);
+ goto err;
+ }
+
+ DECLARE_ARG("--ghost-limit");
+ DECLARE_ARG(ghost_limit);
+ }
+
/* only for final dump */
if (strcmp(opts->action, "dump") == 0 && !opts->user->stop)
DECLARE_ARG("--leave-running");
DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
DECLARE_ARG("--restore-detached");
DECLARE_ARG("--restore-sibling");
- DECLARE_ARG("--pidfile");
- DECLARE_ARG(opts->pidfile);
- DECLARE_ARG("--cgroup-root");
- DECLARE_ARG(opts->cgroup_path);
if (tty_info[0]) {
if (opts->console_fd < 0) {
lxc_list_for_each(it, &opts->c->lxc_conf->network) {
char eth[128], *veth;
+ char *fmt;
struct lxc_netdev *n = it->elem;
-
- if (n->type != LXC_NET_VETH)
- continue;
+ bool external_not_veth;
+
+ if (strcmp(opts->criu_version, CRIU_EXTERNAL_NOT_VETH) >= 0) {
+ /* Since criu version 2.8 the usage of --veth-pair
+ * has been deprecated:
+ * git tag --contains f2037e6d3445fc400
+ * v2.8 */
+ external_not_veth = true;
+ } else {
+ external_not_veth = false;
+ }
if (n->name) {
if (strlen(n->name) >= sizeof(eth))
goto err;
strncpy(eth, n->name, sizeof(eth));
- } else
- sprintf(eth, "eth%d", netnr);
+ } else {
+ ret = snprintf(eth, sizeof(eth), "eth%d", netnr);
+ if (ret < 0 || ret >= sizeof(eth))
+ goto err;
+ }
- veth = n->priv.veth_attr.pair;
+ switch (n->type) {
+ case LXC_NET_VETH:
+ veth = n->priv.veth_attr.pair;
- if (n->link)
- ret = snprintf(buf, sizeof(buf), "%s=%s@%s", eth, veth, n->link);
- else
- ret = snprintf(buf, sizeof(buf), "%s=%s", eth, veth);
- if (ret < 0 || ret >= sizeof(buf))
+ if (n->link) {
+ if (external_not_veth)
+ fmt = "veth[%s]:%s@%s";
+ else
+ fmt = "%s=%s@%s";
+
+ ret = snprintf(buf, sizeof(buf), fmt, eth, veth, n->link);
+ } else {
+ if (external_not_veth)
+ fmt = "veth[%s]:%s";
+ else
+ fmt = "%s=%s";
+
+ ret = snprintf(buf, sizeof(buf), fmt, eth, veth);
+ }
+ if (ret < 0 || ret >= sizeof(buf))
+ goto err;
+ break;
+ case LXC_NET_MACVLAN:
+ if (!n->link) {
+ ERROR("no host interface for macvlan %s", n->name);
+ goto err;
+ }
+
+ ret = snprintf(buf, sizeof(buf), "macvlan[%s]:%s", eth, n->link);
+ if (ret < 0 || ret >= sizeof(buf))
+ goto err;
+ break;
+ case LXC_NET_NONE:
+ case LXC_NET_EMPTY:
+ break;
+ default:
+ /* we have screened for this earlier... */
+ ERROR("unexpected network type %d", n->type);
goto err;
+ }
- DECLARE_ARG("--veth-pair");
+ if (external_not_veth)
+ DECLARE_ARG("--external");
+ else
+ DECLARE_ARG("--veth-pair");
DECLARE_ARG(buf);
+ netnr++;
}
}
INFO("execing: %s", buf);
+ /* before criu inits its log, it sometimes prints things to stdout/err;
+ * let's be sure we capture that.
+ */
+ if (dup2(opts->pipefd, STDOUT_FILENO) < 0) {
+ SYSERROR("dup2 stdout failed");
+ goto err;
+ }
+
+ if (dup2(opts->pipefd, STDERR_FILENO) < 0) {
+ SYSERROR("dup2 stderr failed");
+ goto err;
+ }
+
+ close(opts->pipefd);
+
#undef DECLARE_ARG
execv(argv[0], argv);
err:
version_error:
fclose(f);
free(tmp);
- ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore\n");
+ ERROR("must have criu " CRIU_VERSION " or greater to checkpoint/restore");
return false;
}
}
/* Check and make sure the container has a configuration that we know CRIU can
* dump. */
-static bool criu_ok(struct lxc_container *c)
+static bool criu_ok(struct lxc_container *c, char **criu_version)
{
struct lxc_list *it;
- if (!criu_version_ok(NULL))
+ if (!criu_version_ok(criu_version))
return false;
if (geteuid()) {
- ERROR("Must be root to checkpoint\n");
+ ERROR("Must be root to checkpoint");
return false;
}
case LXC_NET_VETH:
case LXC_NET_NONE:
case LXC_NET_EMPTY:
+ case LXC_NET_MACVLAN:
break;
default:
- ERROR("Found network that is not VETH or NONE\n");
+ ERROR("Found un-dumpable network: %s (%s)", lxc_net_type_to_str(n->type), n->name);
return false;
}
}
// do_restore never returns, the calling process is used as the
// monitor process. do_restore calls exit() if it fails.
-void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts)
+static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_opts *opts, char *criu_version)
{
pid_t pid;
- char pidfile[L_tmpnam];
struct lxc_handler *handler;
- int status, pipes[2] = {-1, -1};
-
- if (!tmpnam(pidfile))
- goto out;
+ int status, fd;
+ int pipes[2] = {-1, -1};
+
+ /* Try to detach from the current controlling tty if it exists.
+ * Othwerise, lxc_init (via lxc_console) will attach the container's
+ * console output to the current tty, which is probably not what any
+ * library user wants, and if they do, they can just manually configure
+ * it :)
+ */
+ fd = open("/dev/tty", O_RDWR);
+ if (fd >= 0) {
+ if (ioctl(fd, TIOCNOTTY, NULL) < 0)
+ SYSERROR("couldn't detach from tty");
+ close(fd);
+ }
handler = lxc_init(c->name, c->lxc_conf, c->config_path);
if (!handler)
close(pipes[0]);
pipes[0] = -1;
- if (dup2(pipes[1], STDERR_FILENO) < 0) {
- SYSERROR("dup2 failed");
- goto out_fini_handler;
- }
-
- if (dup2(pipes[1], STDOUT_FILENO) < 0) {
- SYSERROR("dup2 failed");
- goto out_fini_handler;
- }
if (unshare(CLONE_NEWNS))
goto out_fini_handler;
}
}
+ os.pipefd = pipes[1];
os.action = "restore";
os.user = opts;
os.c = c;
- os.pidfile = pidfile;
- os.cgroup_path = cgroup_canonical_path(handler);
os.console_fd = c->lxc_conf->console.slave;
+ os.criu_version = criu_version;
+ os.handler = handler;
if (os.console_fd >= 0) {
/* Twiddle the FD_CLOEXEC bit. We want to pass this FD to criu
goto out_fini_handler;
}
- ret = write(status_pipe, &status, sizeof(status));
- close(status_pipe);
- status_pipe = -1;
-
- if (sizeof(status) != ret) {
- SYSERROR("failed to write all of status");
- goto out_fini_handler;
- }
-
if (WIFEXITED(status)) {
+ char buf[4096];
+
if (WEXITSTATUS(status)) {
- char buf[4096];
int n;
n = read(pipes[0], buf, sizeof(buf));
goto out_fini_handler;
}
+ if (n == sizeof(buf))
+ n--;
buf[n] = 0;
- ERROR("criu process exited %d, output:\n%s\n", WEXITSTATUS(status), buf);
+ ERROR("criu process exited %d, output:\n%s", WEXITSTATUS(status), buf);
goto out_fini_handler;
} else {
- int ret;
- FILE *f = fopen(pidfile, "r");
+ ret = snprintf(buf, sizeof(buf), "/proc/self/task/%lu/children", (unsigned long)syscall(__NR_gettid));
+ if (ret < 0 || ret >= sizeof(buf)) {
+ ERROR("snprintf'd too many characters: %d", ret);
+ goto out_fini_handler;
+ }
+
+ FILE *f = fopen(buf, "r");
if (!f) {
- SYSERROR("couldn't read restore's init pidfile %s\n", pidfile);
+ SYSERROR("couldn't read restore's children file %s", buf);
goto out_fini_handler;
}
ret = fscanf(f, "%d", (int*) &handler->pid);
fclose(f);
- if (unlink(pidfile) < 0 && errno != ENOENT)
- SYSERROR("unlinking pidfile failed");
-
if (ret != 1) {
ERROR("reading restore pid failed");
goto out_fini_handler;
}
}
} else {
- ERROR("CRIU was killed with signal %d\n", WTERMSIG(status));
+ ERROR("CRIU was killed with signal %d", WTERMSIG(status));
goto out_fini_handler;
}
close(pipes[0]);
+ ret = write(status_pipe, &status, sizeof(status));
+ close(status_pipe);
+ status_pipe = -1;
+
+ if (sizeof(status) != ret) {
+ SYSERROR("failed to write all of status");
+ goto out_fini_handler;
+ }
+
/*
* See comment in lxcapi_start; we don't care if these
* fail because it's just a beauty thing. We just
close(pipes[1]);
lxc_fini(c->name, handler);
- if (unlink(pidfile) < 0 && errno != ENOENT)
- SYSERROR("unlinking pidfile failed");
out:
if (status_pipe >= 0) {
- status = 1;
+ /* ensure getting here was a failure, e.g. if we failed to
+ * parse the child pid or something, even after a successful
+ * restore
+ */
+ if (!status)
+ status = 1;
if (write(status_pipe, &status, sizeof(status)) != sizeof(status)) {
SYSERROR("writing status failed");
}
static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *opts)
{
pid_t pid;
+ char *criu_version = NULL;
+ int criuout[2];
- if (!criu_ok(c))
+ if (!criu_ok(c, &criu_version))
return false;
- if (mkdir_p(opts->directory, 0700) < 0)
+ if (pipe(criuout) < 0) {
+ SYSERROR("pipe() failed");
return false;
+ }
+
+ if (mkdir_p(opts->directory, 0700) < 0)
+ goto fail;
pid = fork();
if (pid < 0) {
SYSERROR("fork failed");
- return false;
+ goto fail;
}
if (pid == 0) {
struct criu_opts os;
+ struct lxc_handler h;
+ close(criuout[0]);
+
+ h.name = c->name;
+ if (!cgroup_init(&h)) {
+ ERROR("failed to cgroup_init()");
+ exit(1);
+ }
+
+ os.pipefd = criuout[1];
os.action = mode;
os.user = opts;
os.c = c;
os.console_name = c->lxc_conf->console.path;
+ os.criu_version = criu_version;
if (save_tty_major_minor(opts->directory, c, os.tty_id, sizeof(os.tty_id)) < 0)
exit(1);
exit(1);
} else {
int status;
+ ssize_t n;
+ char buf[4096];
+ bool ret;
+
+ close(criuout[1]);
+
pid_t w = waitpid(pid, &status, 0);
if (w == -1) {
SYSERROR("waitpid");
+ close(criuout[0]);
return false;
}
+ n = read(criuout[0], buf, sizeof(buf));
+ close(criuout[0]);
+ if (n < 0) {
+ SYSERROR("read");
+ n = 0;
+ }
+ buf[n] = 0;
+
if (WIFEXITED(status)) {
if (WEXITSTATUS(status)) {
- ERROR("dump failed with %d\n", WEXITSTATUS(status));
- return false;
+ ERROR("dump failed with %d", WEXITSTATUS(status));
+ ret = false;
+ } else {
+ ret = true;
}
-
- return true;
} else if (WIFSIGNALED(status)) {
- ERROR("dump signaled with %d\n", WTERMSIG(status));
- return false;
+ ERROR("dump signaled with %d", WTERMSIG(status));
+ ret = false;
} else {
- ERROR("unknown dump exit %d\n", status);
- return false;
+ ERROR("unknown dump exit %d", status);
+ ret = false;
}
+
+ if (!ret)
+ ERROR("criu output: %s", buf);
+ return ret;
}
+fail:
+ close(criuout[0]);
+ close(criuout[1]);
+ rmdir(opts->directory);
+ return false;
}
bool __criu_pre_dump(struct lxc_container *c, struct migrate_opts *opts)
return false;
if (access(path, F_OK) == 0) {
- ERROR("please use a fresh directory for the dump directory\n");
+ ERROR("please use a fresh directory for the dump directory");
return false;
}
pid_t pid;
int status, nread;
int pipefd[2];
+ char *criu_version = NULL;
- if (!criu_ok(c))
+ if (!criu_ok(c, &criu_version))
return false;
if (geteuid()) {
- ERROR("Must be root to restore\n");
+ ERROR("Must be root to restore");
return false;
}
if (pid == 0) {
close(pipefd[0]);
// this never returns
- do_restore(c, pipefd[1], opts);
+ do_restore(c, pipefd[1], opts, criu_version);
}
close(pipefd[1]);