lxc_clone: pass non-stack allocated stack to clone

[mirror_lxc.git] / src / lxc / namespace.c
diff --git a/src/lxc/namespace.c b/src/lxc/namespace.c

index 6512685d0ec455519100343588774e1d938b9fca..bc14fb52c91aaba4c03c6fb4396fca301fcf2a17 100644 (file)
--- a/src/lxc/namespace.c
+++ b/src/lxc/namespace.c
@@ -4,7 +4,7 @@
   * (C) Copyright IBM Corp. 2007, 2009
   *
   * Authors:
- * Daniel Lezcano <dlezcano at fr.ibm.com>
+ * Daniel Lezcano <daniel.lezcano at free.fr>
   *
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
@@ -18,100 +18,168 @@
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
-#include <unistd.h>
-#include <alloca.h>
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
  #include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
  #include <signal.h>
-#include <syscall.h>
  #include <sys/param.h>
-#include <sys/types.h>
  #include <sys/stat.h>
-#include <fcntl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
  
-#include "namespace.h"
+#include "config.h"
  #include "log.h"
+#include "memory_utils.h"
+#include "namespace.h"
+#include "utils.h"
  
-#include "setns.h"
-
-lxc_log_define(lxc_namespace, lxc);
+lxc_log_define(namespace, lxc);
  
  struct clone_arg {
         int (*fn)(void *);
         void *arg;
  };
  
-int setns(int fd, int nstype)
-{
-#ifndef __NR_setns
-       errno = ENOSYS;
-       return -1;
-#else
-       return syscall(__NR_setns, fd, nstype);
-#endif
-}
-
  static int do_clone(void *arg)
  {
         struct clone_arg *clone_arg = arg;
         return clone_arg->fn(clone_arg->arg);
  }
  
-pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
+#define __LXC_STACK_SIZE (8 * 1024 * 1024)
+pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd)
  {
+       pid_t ret;
         struct clone_arg clone_arg = {
-               .fn = fn,
-               .arg = arg,
+           .fn = fn,
+           .arg = arg,
         };
+       void *stack;
  
-       long stack_size = sysconf(_SC_PAGESIZE);
-       void *stack = alloca(stack_size);
-       pid_t ret;
+       stack = malloc(__LXC_STACK_SIZE);
+       if (!stack) {
+               SYSERROR("Failed to allocate clone stack");
+               return -ENOMEM;
+       }
  
  #ifdef __ia64__
-       ret = __clone2(do_clone, stack,
-                      stack_size, flags | SIGCHLD, &clone_arg);
+       ret = __clone2(do_clone, stack, __LXC_STACK_SIZE, flags | SIGCHLD, &clone_arg, pidfd);
  #else
-       ret = clone(do_clone, stack  + stack_size, flags | SIGCHLD, &clone_arg);
+       ret = clone(do_clone, stack + __LXC_STACK_SIZE, flags | SIGCHLD, &clone_arg, pidfd);
  #endif
         if (ret < 0)
-               ERROR("failed to clone(0x%x): %s", flags, strerror(errno));
+               SYSERROR("Failed to clone (%#x)", flags);
  
         return ret;
  }
  
-int lxc_attach(pid_t pid)
+/* Leave the user namespace at the first position in the array of structs so
+ * that we always attach to it first when iterating over the struct and using
+ * setns() to switch namespaces. This especially affects lxc_attach(): Suppose
+ * you cloned a new user namespace and mount namespace as an unprivileged user
+ * on the host and want to setns() to the mount namespace. This requires you to
+ * attach to the user namespace first otherwise the kernel will fail this check:
+ *
+ *        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
+ *            !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
+ *            !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ *            return -EPERM;
+ *
+ *    in
+ *
+ *        linux/fs/namespace.c:mntns_install().
+ */
+const struct ns_info ns_info[LXC_NS_MAX] = {
+       [LXC_NS_USER]    = { "user",   CLONE_NEWUSER,   "CLONE_NEWUSER",   "LXC_USER_NS"    },
+       [LXC_NS_MNT]    =  { "mnt",    CLONE_NEWNS,     "CLONE_NEWNS",     "LXC_MNT_NS"     },
+       [LXC_NS_PID]    =  { "pid",    CLONE_NEWPID,    "CLONE_NEWPID",    "LXC_PID_NS"     },
+       [LXC_NS_UTS]    =  { "uts",    CLONE_NEWUTS,    "CLONE_NEWUTS",    "LXC_UTS_NS"     },
+       [LXC_NS_IPC]    =  { "ipc",    CLONE_NEWIPC,    "CLONE_NEWIPC",    "LXC_IPC_NS"     },
+       [LXC_NS_NET]    =  { "net",    CLONE_NEWNET,    "CLONE_NEWNET",    "LXC_NET_NS"     },
+       [LXC_NS_CGROUP] =  { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS"  }
+};
+
+int lxc_namespace_2_cloneflag(const char *namespace)
+{
+       int i;
+
+       for (i = 0; i < LXC_NS_MAX; i++)
+               if (!strcasecmp(ns_info[i].proc_name, namespace))
+                       return ns_info[i].clone_flag;
+
+       ERROR("Invalid namespace name \"%s\"", namespace);
+       return -EINVAL;
+}
+
+int lxc_namespace_2_ns_idx(const char *namespace)
  {
-       char path[MAXPATHLEN];
-       char *ns[] = { "pid", "mnt", "net", "ipc", "uts" };
-       const int size = sizeof(ns) / sizeof(char *);
-       int fd[size];
         int i;
  
-       sprintf(path, "/proc/%d/ns", pid);
-       if (access(path, X_OK)) {
-               ERROR("Does this kernel version support 'attach' ?");
+       for (i = 0; i < LXC_NS_MAX; i++)
+               if (!strcmp(ns_info[i].proc_name, namespace))
+                       return i;
+
+       ERROR("Invalid namespace name \"%s\"", namespace);
+       return -EINVAL;
+}
+
+extern int lxc_namespace_2_std_identifiers(char *namespaces)
+{
+       char **it;
+       char *del;
+
+       /* The identifiers for namespaces used with lxc-attach and lxc-unshare
+        * as given on the manpage do not align with the standard identifiers.
+        * This affects network, mount, and uts namespaces. The standard identifiers
+        * are: "mnt", "uts", and "net" whereas lxc-attach and lxc-unshare uses
+        * "MOUNT", "UTSNAME", and "NETWORK". So let's use some cheap memmove()s
+        * to replace them by their standard identifiers.
+        * Let's illustrate this with an example:
+        * Assume the string:
+        *
+        *      "IPC|MOUNT|PID"
+        *
+        * then we memmove()
+        *
+        *      dest: del + 1 == OUNT|PID
+        *      src:  del + 3 == NT|PID
+        */
+       if (!namespaces)
                 return -1;
-       }
  
-       for (i = 0; i < size; i++) {
-               sprintf(path, "/proc/%d/ns/%s", pid, ns[i]);
-               fd[i] = open(path, O_RDONLY);
-               if (fd[i] < 0) {
-                       SYSERROR("failed to open '%s'", path);
-                       return -1;
-               }
+       while ((del = strstr(namespaces, "MOUNT")))
+               memmove(del + 1, del + 3, strlen(del) - 2);
+
+       for (it = (char *[]){"NETWORK", "UTSNAME", NULL}; it && *it; it++)
+               while ((del = strstr(namespaces, *it)))
+                       memmove(del + 3, del + 7, strlen(del) - 6);
+
+       return 0;
+}
+
+int lxc_fill_namespace_flags(char *flaglist, int *flags)
+{
+       char *token;
+       int aflag;
+
+       if (!flaglist) {
+               ERROR("At least one namespace is needed.");
+               return -1;
         }
  
-       for (i = 0; i < size; i++) {
-               if (setns(fd[i], 0)) {
-                       SYSERROR("failed to set namespace '%s'", ns[i]);
+       lxc_iterate_parts(token, flaglist, "|") {
+               aflag = lxc_namespace_2_cloneflag(token);
+               if (aflag < 0)
                         return -1;
-               }
  
-               close(fd[i]);
+               *flags |= aflag;
         }
  
         return 0;