git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blobdiff - fs/shiftfs.c
btrfs: fix slab cache flags for free space tree bitmap
[mirror_ubuntu-hirsute-kernel.git] / fs / shiftfs.c
index f7cada126daaa62146fc018c3dbdacd0ca81eadd..4f1d949035572a21a7e7d1df2268f1f8245b7741 100644 (file)
@@ -1,5 +1,8 @@
+#include <linux/btrfs.h>
+#include <linux/capability.h>
 #include <linux/cred.h>
 #include <linux/mount.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/kernel.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
+#include <linux/security.h>
 #include <linux/seq_file.h>
 #include <linux/statfs.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/uidgid.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
+#include <linux/fiemap.h>
 
 struct shiftfs_super_info {
        struct vfsmount *mnt;
        struct user_namespace *userns;
+       /* creds of the process that created the super block */
+       const struct cred *creator_cred;
        bool mark;
+       unsigned int passthrough;
+       unsigned int passthrough_mark;
 };
 
-static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
-                                      struct dentry *dentry);
+static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
+                              umode_t mode, dev_t dev, struct dentry *dentry);
+
+#define SHIFTFS_PASSTHROUGH_NONE 0
+#define SHIFTFS_PASSTHROUGH_STAT 1
+#define SHIFTFS_PASSTHROUGH_IOCTL 2
+#define SHIFTFS_PASSTHROUGH_ALL                                                \
+       (SHIFTFS_PASSTHROUGH_STAT | SHIFTFS_PASSTHROUGH_IOCTL)
+
+static inline bool shiftfs_passthrough_ioctls(struct shiftfs_super_info *info)
+{
+       if (!(info->passthrough & SHIFTFS_PASSTHROUGH_IOCTL))
+               return false;
+
+       return true;
+}
+
+static inline bool shiftfs_passthrough_statfs(struct shiftfs_super_info *info)
+{
+       if (!(info->passthrough & SHIFTFS_PASSTHROUGH_STAT))
+               return false;
+
+       return true;
+}
 
 enum {
        OPT_MARK,
+       OPT_PASSTHROUGH,
        OPT_LAST,
 };
 
 /* global filesystem options */
 static const match_table_t tokens = {
        { OPT_MARK, "mark" },
+       { OPT_PASSTHROUGH, "passthrough=%u" },
        { OPT_LAST, NULL }
 };
 
-static const struct cred *shiftfs_get_up_creds(struct super_block *sb)
+static const struct cred *shiftfs_override_creds(const struct super_block *sb)
 {
-       struct shiftfs_super_info *ssi = sb->s_fs_info;
-       struct cred *cred = prepare_creds();
+       struct shiftfs_super_info *sbinfo = sb->s_fs_info;
 
-       if (!cred)
-               return NULL;
+       return override_creds(sbinfo->creator_cred);
+}
+
+static inline void shiftfs_revert_object_creds(const struct cred *oldcred,
+                                              struct cred *newcred)
+{
+       revert_creds(oldcred);
+       put_cred(newcred);
+}
 
-       cred->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, cred->fsuid));
-       cred->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, cred->fsgid));
-       put_user_ns(cred->user_ns);
-       cred->user_ns = get_user_ns(ssi->userns);
+static kuid_t shift_kuid(struct user_namespace *from, struct user_namespace *to,
+                        kuid_t kuid)
+{
+       uid_t uid = from_kuid(from, kuid);
+       return make_kuid(to, uid);
+}
 
-       return cred;
+static kgid_t shift_kgid(struct user_namespace *from, struct user_namespace *to,
+                        kgid_t kgid)
+{
+       gid_t gid = from_kgid(from, kgid);
+       return make_kgid(to, gid);
 }
 
-static const struct cred *shiftfs_new_creds(const struct cred **newcred,
-                                           struct super_block *sb)
+static int shiftfs_override_object_creds(const struct super_block *sb,
+                                        const struct cred **oldcred,
+                                        struct cred **newcred,
+                                        struct dentry *dentry, umode_t mode,
+                                        bool hardlink)
 {
-       const struct cred *cred = shiftfs_get_up_creds(sb);
+       struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+       kuid_t fsuid = current_fsuid();
+       kgid_t fsgid = current_fsgid();
 
-       *newcred = cred;
+       *oldcred = shiftfs_override_creds(sb);
 
-       if (cred)
-               cred = override_creds(cred);
-       else
-               printk(KERN_ERR "shiftfs: Credential override failed: no memory\n");
+       *newcred = prepare_creds();
+       if (!*newcred) {
+               revert_creds(*oldcred);
+               return -ENOMEM;
+       }
+
+       (*newcred)->fsuid = shift_kuid(sb->s_user_ns, sbinfo->userns, fsuid);
+       (*newcred)->fsgid = shift_kgid(sb->s_user_ns, sbinfo->userns, fsgid);
+
+       if (!hardlink) {
+               int err = security_dentry_create_files_as(dentry, mode,
+                                                         &dentry->d_name,
+                                                         *oldcred, *newcred);
+               if (err) {
+                       shiftfs_revert_object_creds(*oldcred, *newcred);
+                       return err;
+               }
+       }
+
+       put_cred(override_creds(*newcred));
+       return 0;
+}
+
+static void shiftfs_copyattr(struct inode *from, struct inode *to)
+{
+       struct user_namespace *from_ns = from->i_sb->s_user_ns;
+       struct user_namespace *to_ns = to->i_sb->s_user_ns;
+
+       to->i_uid = shift_kuid(from_ns, to_ns, from->i_uid);
+       to->i_gid = shift_kgid(from_ns, to_ns, from->i_gid);
+       to->i_mode = from->i_mode;
+       to->i_atime = from->i_atime;
+       to->i_mtime = from->i_mtime;
+       to->i_ctime = from->i_ctime;
+       i_size_write(to, i_size_read(from));
+}
+
+static void shiftfs_copyflags(struct inode *from, struct inode *to)
+{
+       unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME;
 
-       return cred;
+       inode_set_flags(to, from->i_flags & mask, mask);
 }
 
-static void shiftfs_old_creds(const struct cred *oldcred,
-                             const struct cred **newcred)
+static void shiftfs_file_accessed(struct file *file)
 {
-       if (!*newcred)
+       struct inode *upperi, *loweri;
+
+       if (file->f_flags & O_NOATIME)
                return;
 
-       revert_creds(oldcred);
-       put_cred(*newcred);
+       upperi = file_inode(file);
+       loweri = upperi->i_private;
+
+       if (!loweri)
+               return;
+
+       upperi->i_mtime = loweri->i_mtime;
+       upperi->i_ctime = loweri->i_ctime;
+
+       touch_atime(&file->f_path);
 }
 
-static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
+static int shiftfs_parse_mount_options(struct shiftfs_super_info *sbinfo,
+                                      char *options)
 {
        char *p;
        substring_t args[MAX_OPT_ARGS];
 
-       ssi->mark = false;
+       sbinfo->mark = false;
+       sbinfo->passthrough = 0;
 
        while ((p = strsep(&options, ",")) != NULL) {
-               int token;
+               int err, intarg, token;
 
                if (!*p)
                        continue;
@@ -91,121 +191,136 @@ static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
                token = match_token(p, tokens, args);
                switch (token) {
                case OPT_MARK:
-                       ssi->mark = true;
+                       sbinfo->mark = true;
+                       break;
+               case OPT_PASSTHROUGH:
+                       err = match_int(&args[0], &intarg);
+                       if (err)
+                               return err;
+
+                       if (intarg & ~SHIFTFS_PASSTHROUGH_ALL)
+                               return -EINVAL;
+
+                       sbinfo->passthrough = intarg;
                        break;
                default:
                        return -EINVAL;
                }
        }
+
        return 0;
 }
 
 static void shiftfs_d_release(struct dentry *dentry)
 {
-       struct dentry *real = dentry->d_fsdata;
+       struct dentry *lowerd = dentry->d_fsdata;
 
-       dput(real);
+       if (lowerd)
+               dput(lowerd);
 }
 
 static struct dentry *shiftfs_d_real(struct dentry *dentry,
                                     const struct inode *inode)
 {
-       struct dentry *real = dentry->d_fsdata;
+       struct dentry *lowerd = dentry->d_fsdata;
+
+       if (inode && d_inode(dentry) == inode)
+               return dentry;
 
-       if (unlikely(real->d_flags & DCACHE_OP_REAL))
-               return real->d_op->d_real(real, real->d_inode);
+       lowerd = d_real(lowerd, inode);
+       if (lowerd && (!inode || inode == d_inode(lowerd)))
+               return lowerd;
 
-       return real;
+       WARN(1, "shiftfs_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
+            inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
+       return dentry;
 }
 
 static int shiftfs_d_weak_revalidate(struct dentry *dentry, unsigned int flags)
 {
-       struct dentry *real = dentry->d_fsdata;
+       int err = 1;
+       struct dentry *lowerd = dentry->d_fsdata;
 
-       if (d_unhashed(real))
+       if (d_is_negative(lowerd) != d_is_negative(dentry))
                return 0;
 
-       if (!(real->d_flags & DCACHE_OP_WEAK_REVALIDATE))
-               return 1;
+       if ((lowerd->d_flags & DCACHE_OP_WEAK_REVALIDATE))
+               err = lowerd->d_op->d_weak_revalidate(lowerd, flags);
 
-       return real->d_op->d_weak_revalidate(real, flags);
+       if (d_really_is_positive(dentry)) {
+               struct inode *inode = d_inode(dentry);
+               struct inode *loweri = d_inode(lowerd);
+
+               shiftfs_copyattr(loweri, inode);
+       }
+
+       return err;
 }
 
 static int shiftfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
-       struct dentry *real = dentry->d_fsdata;
-       int ret;
+       int err = 1;
+       struct dentry *lowerd = dentry->d_fsdata;
 
-       if (d_unhashed(real))
+       if (d_unhashed(lowerd) ||
+           ((d_is_negative(lowerd) != d_is_negative(dentry))))
                return 0;
 
-       /*
-        * inode state of underlying changed from positive to negative
-        * or vice versa; force a lookup to update our view
-        */
-       if (d_is_negative(real) != d_is_negative(dentry))
-               return 0;
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
 
-       if (!(real->d_flags & DCACHE_OP_REVALIDATE))
-               return 1;
+       if ((lowerd->d_flags & DCACHE_OP_REVALIDATE))
+               err = lowerd->d_op->d_revalidate(lowerd, flags);
 
-       ret = real->d_op->d_revalidate(real, flags);
+       if (d_really_is_positive(dentry)) {
+               struct inode *inode = d_inode(dentry);
+               struct inode *loweri = d_inode(lowerd);
 
-       if (ret == 0 && !(flags & LOOKUP_RCU))
-               d_invalidate(real);
+               shiftfs_copyattr(loweri, inode);
+       }
 
-       return ret;
+       return err;
 }
 
 static const struct dentry_operations shiftfs_dentry_ops = {
-       .d_release      = shiftfs_d_release,
-       .d_real         = shiftfs_d_real,
-       .d_revalidate   = shiftfs_d_revalidate,
+       .d_release         = shiftfs_d_release,
+       .d_real            = shiftfs_d_real,
+       .d_revalidate      = shiftfs_d_revalidate,
        .d_weak_revalidate = shiftfs_d_weak_revalidate,
 };
 
-static int shiftfs_readlink(struct dentry *dentry, char __user *data,
-                           int flags)
-{
-       struct dentry *real = dentry->d_fsdata;
-       const struct inode_operations *iop = real->d_inode->i_op;
-
-       if (iop->readlink)
-               return iop->readlink(real, data, flags);
-
-       return -EINVAL;
-}
-
 static const char *shiftfs_get_link(struct dentry *dentry, struct inode *inode,
                                    struct delayed_call *done)
 {
-       if (dentry) {
-               struct dentry *real = dentry->d_fsdata;
-               struct inode *reali = real->d_inode;
-               const struct inode_operations *iop = reali->i_op;
-               const char *res = ERR_PTR(-EPERM);
-
-               if (iop->get_link)
-                       res = iop->get_link(real, reali, done);
+       const char *p;
+       const struct cred *oldcred;
+       struct dentry *lowerd;
 
-               return res;
-       } else {
-               /* RCU lookup not supported */
+       /* RCU lookup not supported */
+       if (!dentry)
                return ERR_PTR(-ECHILD);
-       }
+
+       lowerd = dentry->d_fsdata;
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       p = vfs_get_link(lowerd, done);
+       revert_creds(oldcred);
+
+       return p;
 }
 
 static int shiftfs_setxattr(struct dentry *dentry, struct inode *inode,
                            const char *name, const void *value,
                            size_t size, int flags)
 {
-       struct dentry *real = dentry->d_fsdata;
-       int err = -EOPNOTSUPP;
-       const struct cred *oldcred, *newcred;
+       struct dentry *lowerd = dentry->d_fsdata;
+       int err;
+       const struct cred *oldcred;
+
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       err = vfs_setxattr(lowerd, name, value, size, flags);
+       revert_creds(oldcred);
 
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-       err = vfs_setxattr(real, name, value, size, flags);
-       shiftfs_old_creds(oldcred, &newcred);
+       shiftfs_copyattr(lowerd->d_inode, inode);
 
        return err;
 }
@@ -214,13 +329,13 @@ static int shiftfs_xattr_get(const struct xattr_handler *handler,
                             struct dentry *dentry, struct inode *inode,
                             const char *name, void *value, size_t size)
 {
-       struct dentry *real = dentry->d_fsdata;
+       struct dentry *lowerd = dentry->d_fsdata;
        int err;
-       const struct cred *oldcred, *newcred;
+       const struct cred *oldcred;
 
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-       err = vfs_getxattr(real, name, value, size);
-       shiftfs_old_creds(oldcred, &newcred);
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       err = vfs_getxattr(lowerd, name, value, size);
+       revert_creds(oldcred);
 
        return err;
 }
@@ -228,26 +343,29 @@ static int shiftfs_xattr_get(const struct xattr_handler *handler,
 static ssize_t shiftfs_listxattr(struct dentry *dentry, char *list,
                                 size_t size)
 {
-       struct dentry *real = dentry->d_fsdata;
+       struct dentry *lowerd = dentry->d_fsdata;
        int err;
-       const struct cred *oldcred, *newcred;
+       const struct cred *oldcred;
 
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-       err = vfs_listxattr(real, list, size);
-       shiftfs_old_creds(oldcred, &newcred);
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       err = vfs_listxattr(lowerd, list, size);
+       revert_creds(oldcred);
 
        return err;
 }
 
 static int shiftfs_removexattr(struct dentry *dentry, const char *name)
 {
-       struct dentry *real = dentry->d_fsdata;
+       struct dentry *lowerd = dentry->d_fsdata;
        int err;
-       const struct cred *oldcred, *newcred;
+       const struct cred *oldcred;
+
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       err = vfs_removexattr(lowerd, name);
+       revert_creds(oldcred);
 
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-       err = vfs_removexattr(real, name);
-       shiftfs_old_creds(oldcred, &newcred);
+       /* update c/mtime */
+       shiftfs_copyattr(lowerd->d_inode, d_inode(dentry));
 
        return err;
 }
@@ -262,93 +380,157 @@ static int shiftfs_xattr_set(const struct xattr_handler *handler,
        return shiftfs_setxattr(dentry, inode, name, value, size, flags);
 }
 
-static void shiftfs_fill_inode(struct inode *inode, struct dentry *dentry)
+static int shiftfs_inode_test(struct inode *inode, void *data)
 {
-       struct inode *reali;
-
-       if (!dentry)
-               return;
-
-       reali = dentry->d_inode;
-
-       if (!reali->i_op->get_link)
-               inode->i_opflags |= IOP_NOFOLLOW;
+       return inode->i_private == data;
+}
 
-       inode->i_mapping = reali->i_mapping;
-       inode->i_private = dentry;
+static int shiftfs_inode_set(struct inode *inode, void *data)
+{
+       inode->i_private = data;
+       return 0;
 }
 
-static int shiftfs_make_object(struct inode *dir, struct dentry *dentry,
-                              umode_t mode, const char *symlink,
-                              struct dentry *hardlink, bool excl)
+static int shiftfs_create_object(struct inode *diri, struct dentry *dentry,
+                                umode_t mode, const char *symlink,
+                                struct dentry *hardlink, bool excl)
 {
-       struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
-       struct inode *reali = real->d_inode, *newi;
-       const struct inode_operations *iop = reali->i_op;
        int err;
-       const struct cred *oldcred, *newcred;
-       bool op_ok = false;
+       const struct cred *oldcred;
+       struct cred *newcred;
+       void *loweri_iop_ptr = NULL;
+       umode_t modei = mode;
+       struct super_block *dir_sb = diri->i_sb;
+       struct dentry *lowerd_new = dentry->d_fsdata;
+       struct inode *inode = NULL, *loweri_dir = diri->i_private;
+       const struct inode_operations *loweri_dir_iop = loweri_dir->i_op;
+       struct dentry *lowerd_link = NULL;
 
        if (hardlink) {
-               op_ok = iop->link;
+               loweri_iop_ptr = loweri_dir_iop->link;
        } else {
                switch (mode & S_IFMT) {
                case S_IFDIR:
-                       op_ok = iop->mkdir;
+                       loweri_iop_ptr = loweri_dir_iop->mkdir;
                        break;
                case S_IFREG:
-                       op_ok = iop->create;
+                       loweri_iop_ptr = loweri_dir_iop->create;
                        break;
                case S_IFLNK:
-                       op_ok = iop->symlink;
+                       loweri_iop_ptr = loweri_dir_iop->symlink;
+                       break;
+               case S_IFSOCK:
+                       /* fall through */
+               case S_IFIFO:
+                       loweri_iop_ptr = loweri_dir_iop->mknod;
+                       break;
                }
        }
-       if (!op_ok)
-               return -EINVAL;
+       if (!loweri_iop_ptr) {
+               err = -EINVAL;
+               goto out_iput;
+       }
 
+       inode_lock_nested(loweri_dir, I_MUTEX_PARENT);
 
-       newi = shiftfs_new_inode(dentry->d_sb, mode, NULL);
-       if (!newi)
-               return -ENOMEM;
+       if (!hardlink) {
+               inode = new_inode(dir_sb);
+               if (!inode) {
+                       err = -ENOMEM;
+                       goto out_iput;
+               }
 
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+               /*
+                * new_inode() will have added the new inode to the super
+                * block's list of inodes. Further below we will call
+                * inode_insert5(), which would perform the same operation
+                * again, thereby corrupting the list. To avoid this, raise
+                * I_CREATING in i_state, which will cause inode_insert5() to
+                * skip this step. I_CREATING will be cleared by
+                * d_instantiate_new() below.
+                */
+               spin_lock(&inode->i_lock);
+               inode->i_state |= I_CREATING;
+               spin_unlock(&inode->i_lock);
 
-       inode_lock_nested(reali, I_MUTEX_PARENT);
+               inode_init_owner(inode, diri, mode);
+               modei = inode->i_mode;
+       }
 
-       err = -EINVAL;          /* shut gcc up about uninit var */
-       if (hardlink) {
-               struct dentry *realhardlink = hardlink->d_fsdata;
+       err = shiftfs_override_object_creds(dentry->d_sb, &oldcred, &newcred,
+                                           dentry, modei, hardlink != NULL);
+       if (err)
+               goto out_iput;
 
-               err = vfs_link(realhardlink, reali, new, NULL);
+       if (hardlink) {
+               lowerd_link = hardlink->d_fsdata;
+               err = vfs_link(lowerd_link, loweri_dir, lowerd_new, NULL);
        } else {
-               switch (mode & S_IFMT) {
+               switch (modei & S_IFMT) {
                case S_IFDIR:
-                       err = vfs_mkdir(reali, new, mode);
+                       err = vfs_mkdir(loweri_dir, lowerd_new, modei);
                        break;
                case S_IFREG:
-                       err = vfs_create(reali, new, mode, excl);
+                       err = vfs_create(loweri_dir, lowerd_new, modei, excl);
                        break;
                case S_IFLNK:
-                       err = vfs_symlink(reali, new, symlink);
+                       err = vfs_symlink(loweri_dir, lowerd_new, symlink);
+                       break;
+               case S_IFSOCK:
+                       /* fall through */
+               case S_IFIFO:
+                       err = vfs_mknod(loweri_dir, lowerd_new, modei, 0);
+                       break;
+               default:
+                       err = -EINVAL;
+                       break;
                }
        }
 
-       shiftfs_old_creds(oldcred, &newcred);
+       shiftfs_revert_object_creds(oldcred, newcred);
 
+       if (!err && WARN_ON(!lowerd_new->d_inode))
+               err = -EIO;
        if (err)
-               goto out_dput;
+               goto out_iput;
+
+       if (hardlink) {
+               inode = d_inode(hardlink);
+               ihold(inode);
+
+               /* copy up times from lower inode */
+               shiftfs_copyattr(d_inode(lowerd_link), inode);
+               set_nlink(d_inode(hardlink), d_inode(lowerd_link)->i_nlink);
+               d_instantiate(dentry, inode);
+       } else {
+               struct inode *inode_tmp;
+               struct inode *loweri_new = d_inode(lowerd_new);
+
+               inode_tmp = inode_insert5(inode, (unsigned long)loweri_new,
+                                         shiftfs_inode_test, shiftfs_inode_set,
+                                         loweri_new);
+               if (unlikely(inode_tmp != inode)) {
+                       pr_err_ratelimited("shiftfs: newly created inode found in cache\n");
+                       iput(inode_tmp);
+                       err = -EINVAL;
+                       goto out_iput;
+               }
 
-       shiftfs_fill_inode(newi, new);
+               ihold(loweri_new);
+               shiftfs_fill_inode(inode, loweri_new->i_ino, loweri_new->i_mode,
+                                  0, lowerd_new);
+               d_instantiate_new(dentry, inode);
+       }
 
-       d_instantiate(dentry, newi);
+       shiftfs_copyattr(loweri_dir, diri);
+       if (loweri_iop_ptr == loweri_dir_iop->mkdir)
+               set_nlink(diri, loweri_dir->i_nlink);
 
-       new = NULL;
-       newi = NULL;
+       inode = NULL;
 
- out_dput:
-       dput(new);
-       iput(newi);
-       inode_unlock(reali);
+out_iput:
+       iput(inode);
+       inode_unlock(loweri_dir);
 
        return err;
 }
@@ -358,7 +540,7 @@ static int shiftfs_create(struct inode *dir, struct dentry *dentry,
 {
        mode |= S_IFREG;
 
-       return shiftfs_make_object(dir, dentry, mode, NULL, NULL, excl);
+       return shiftfs_create_object(dir, dentry, mode, NULL, NULL, excl);
 }
 
 static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
@@ -366,39 +548,59 @@ static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
 {
        mode |= S_IFDIR;
 
-       return shiftfs_make_object(dir, dentry, mode, NULL, NULL, false);
+       return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
 }
 
 static int shiftfs_link(struct dentry *hardlink, struct inode *dir,
                        struct dentry *dentry)
 {
-       return shiftfs_make_object(dir, dentry, 0, NULL, hardlink, false);
+       return shiftfs_create_object(dir, dentry, 0, NULL, hardlink, false);
+}
+
+static int shiftfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+                        dev_t rdev)
+{
+       if (!S_ISFIFO(mode) && !S_ISSOCK(mode))
+               return -EPERM;
+
+       return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
 }
 
 static int shiftfs_symlink(struct inode *dir, struct dentry *dentry,
                           const char *symlink)
 {
-       return shiftfs_make_object(dir, dentry, S_IFLNK, symlink, NULL, false);
+       return shiftfs_create_object(dir, dentry, S_IFLNK, symlink, NULL, false);
 }
 
 static int shiftfs_rm(struct inode *dir, struct dentry *dentry, bool rmdir)
 {
-       struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
-       struct inode *reali = real->d_inode;
+       struct dentry *lowerd = dentry->d_fsdata;
+       struct inode *loweri = dir->i_private;
+       struct inode *inode = d_inode(dentry);
        int err;
-       const struct cred *oldcred, *newcred;
-
-       inode_lock_nested(reali, I_MUTEX_PARENT);
-
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+       const struct cred *oldcred;
 
+       dget(lowerd);
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       inode_lock_nested(loweri, I_MUTEX_PARENT);
        if (rmdir)
-               err = vfs_rmdir(reali, new);
+               err = vfs_rmdir(loweri, lowerd);
        else
-               err = vfs_unlink(reali, new, NULL);
+               err = vfs_unlink(loweri, lowerd, NULL);
+       revert_creds(oldcred);
 
-       shiftfs_old_creds(oldcred, &newcred);
-       inode_unlock(reali);
+       if (!err) {
+               d_drop(dentry);
+
+               if (rmdir)
+                       clear_nlink(inode);
+               else
+                       drop_nlink(inode);
+       }
+       inode_unlock(loweri);
+
+       shiftfs_copyattr(loweri, dir);
+       dput(lowerd);
 
        return err;
 }
@@ -417,27 +619,30 @@ static int shiftfs_rename(struct inode *olddir, struct dentry *old,
                          struct inode *newdir, struct dentry *new,
                          unsigned int flags)
 {
-       struct dentry *rodd = olddir->i_private, *rndd = newdir->i_private,
-               *realold = old->d_fsdata,
-               *realnew = new->d_fsdata, *trap;
-       struct inode *realolddir = rodd->d_inode, *realnewdir = rndd->d_inode;
+       struct dentry *lowerd_dir_old = old->d_parent->d_fsdata,
+                     *lowerd_dir_new = new->d_parent->d_fsdata,
+                     *lowerd_old = old->d_fsdata, *lowerd_new = new->d_fsdata,
+                     *trapd;
+       struct inode *loweri_dir_old = lowerd_dir_old->d_inode,
+                    *loweri_dir_new = lowerd_dir_new->d_inode;
        int err = -EINVAL;
-       const struct cred *oldcred, *newcred;
+       const struct cred *oldcred;
 
-       trap = lock_rename(rndd, rodd);
+       trapd = lock_rename(lowerd_dir_new, lowerd_dir_old);
 
-       if (trap == realold || trap == realnew)
+       if (trapd == lowerd_old || trapd == lowerd_new)
                goto out_unlock;
 
-       oldcred = shiftfs_new_creds(&newcred, old->d_sb);
-
-       err = vfs_rename(realolddir, realold, realnewdir,
-                        realnew, NULL, flags);
+       oldcred = shiftfs_override_creds(old->d_sb);
+       err = vfs_rename(loweri_dir_old, lowerd_old, loweri_dir_new, lowerd_new,
+                        NULL, flags);
+       revert_creds(oldcred);
 
-       shiftfs_old_creds(oldcred, &newcred);
+       shiftfs_copyattr(loweri_dir_old, olddir);
+       shiftfs_copyattr(loweri_dir_new, newdir);
 
- out_unlock:
-       unlock_rename(rndd, rodd);
+out_unlock:
+       unlock_rename(lowerd_dir_new, lowerd_dir_old);
 
        return err;
 }
@@ -445,304 +650,1474 @@ static int shiftfs_rename(struct inode *olddir, struct dentry *old,
 static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
                                     unsigned int flags)
 {
-       struct dentry *real = dir->i_private, *new;
-       struct inode *reali = real->d_inode, *newi;
-       const struct cred *oldcred, *newcred;
-
-       inode_lock(reali);
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-       new = lookup_one_len(dentry->d_name.name, real, dentry->d_name.len);
-       shiftfs_old_creds(oldcred, &newcred);
-       inode_unlock(reali);
+       struct dentry *new;
+       struct inode *newi;
+       const struct cred *oldcred;
+       struct dentry *lowerd = dentry->d_parent->d_fsdata;
+       struct inode *inode = NULL, *loweri = lowerd->d_inode;
+
+       /*
+        * Resolve the name in the lower parent directory, with that
+        * directory's inode lock held and the creds from
+        * shiftfs_override_creds() in effect.
+        */
+       inode_lock(loweri);
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       new = lookup_one_len(dentry->d_name.name, lowerd, dentry->d_name.len);
+       revert_creds(oldcred);
+       inode_unlock(loweri);
 
        if (IS_ERR(new))
                return new;
 
+       /* the lower dentry is recorded even when it turns out negative */
        dentry->d_fsdata = new;
 
-       newi = NULL;
-       if (!new->d_inode)
+       newi = new->d_inode;
+       if (!newi)
                goto out;
 
-       newi = shiftfs_new_inode(dentry->d_sb, new->d_inode->i_mode, new);
-       if (!newi) {
+       /* shiftfs inodes are hashed by the address of their lower inode */
+       inode = iget5_locked(dentry->d_sb, (unsigned long)newi,
+                            shiftfs_inode_test, shiftfs_inode_set, newi);
+       if (!inode) {
+               /*
+                * NOTE(review): dentry->d_fsdata still points at "new" after
+                * this dput(); confirm nothing drops that reference again.
+                */
                dput(new);
                return ERR_PTR(-ENOMEM);
        }
+       if (inode->i_state & I_NEW) {
+               /*
+                * inode->i_private set by shiftfs_inode_set(), but we still
+                * need to take a reference
+                */
+               ihold(newi);
+               shiftfs_fill_inode(inode, newi->i_ino, newi->i_mode, 0, new);
+               unlock_new_inode(inode);
+       }
 
- out:
-       return d_splice_alias(newi, dentry);
+out:
+       /* inode is NULL for a negative lookup */
+       return d_splice_alias(inode, dentry);
 }
 
 static int shiftfs_permission(struct inode *inode, int mask)
 {
-       struct dentry *real = inode->i_private;
-       struct inode *reali = real->d_inode;
-       const struct inode_operations *iop = reali->i_op;
        int err;
-       const struct cred *oldcred, *newcred;
+       const struct cred *oldcred;
+       struct inode *loweri = inode->i_private;
 
-       if (mask & MAY_NOT_BLOCK)
+       /*
+        * Without a lower inode the check cannot be done: return -ECHILD,
+        * and warn if that happens outside a MAY_NOT_BLOCK lookup.
+        */
+       if (!loweri) {
+               WARN_ON(!(mask & MAY_NOT_BLOCK));
                return -ECHILD;
+       }
 
-       oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
-       if (iop->permission)
-               err = iop->permission(reali, mask);
-       else
-               err = generic_permission(reali, mask);
-       shiftfs_old_creds(oldcred, &newcred);
+       /* first check: the caller's own creds against the shiftfs inode */
+       err = generic_permission(inode, mask);
+       if (err)
+               return err;
+
+       /*
+        * second check: the creds from shiftfs_override_creds() against the
+        * lower inode
+        */
+       oldcred = shiftfs_override_creds(inode->i_sb);
+       err = inode_permission(loweri, mask);
+       revert_creds(oldcred);
 
        return err;
 }
 
-static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
+/*
+ * Forward ->fiemap to the lower inode using the creds from
+ * shiftfs_override_creds(). Returns -EOPNOTSUPP when the lower inode has no
+ * ->fiemap, otherwise whatever the lower ->fiemap returns.
+ */
+static int shiftfs_fiemap(struct inode *inode,
+                         struct fiemap_extent_info *fieinfo, u64 start,
+                         u64 len)
 {
-       struct dentry *real = dentry->d_fsdata;
-       struct inode *reali = real->d_inode;
-       const struct inode_operations *iop = reali->i_op;
-       struct iattr newattr = *attr;
-       const struct cred *oldcred, *newcred;
-       struct super_block *sb = dentry->d_sb;
        int err;
+       const struct cred *oldcred;
+       struct inode *loweri = inode->i_private;
 
-       newattr.ia_uid = KUIDT_INIT(from_kuid(sb->s_user_ns, attr->ia_uid));
-       newattr.ia_gid = KGIDT_INIT(from_kgid(sb->s_user_ns, attr->ia_gid));
+       if (!loweri->i_op->fiemap)
+               return -EOPNOTSUPP;
 
-       oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-       inode_lock(reali);
-       if (iop->setattr)
-               err = iop->setattr(real, &newattr);
-       else
-               err = simple_setattr(real, &newattr);
-       inode_unlock(reali);
-       shiftfs_old_creds(oldcred, &newcred);
+       oldcred = shiftfs_override_creds(inode->i_sb);
+       /*
+        * FIEMAP_FLAG_SYNC: write back the lower mapping first; the result
+        * of the writeback is not checked.
+        */
+       if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
+               filemap_write_and_wait(loweri->i_mapping);
+       err = loweri->i_op->fiemap(loweri, fieinfo, start, len);
+       revert_creds(oldcred);
+
+       return err;
+}
+
+/*
+ * Forward ->tmpfile to the lower inode stored in dir->i_private, passing the
+ * lower dentry from dentry->d_fsdata and running with the creds from
+ * shiftfs_override_creds().
+ *
+ * Returns -EOPNOTSUPP when the lower inode has no ->tmpfile, otherwise the
+ * lower ->tmpfile result.
+ */
+static int shiftfs_tmpfile(struct inode *dir, struct dentry *dentry,
+                          umode_t mode)
+{
+       struct inode *lower_dir = dir->i_private;
+       const struct cred *saved;
+       int ret = -EOPNOTSUPP;
+
+       if (lower_dir->i_op->tmpfile) {
+               saved = shiftfs_override_creds(dir->i_sb);
+               ret = lower_dir->i_op->tmpfile(lower_dir, dentry->d_fsdata,
+                                              mode);
+               revert_creds(saved);
+       }
+
+       return ret;
+}
+
+/*
+ * Apply an attribute change to the lower dentry.
+ *
+ * The request is validated against the shiftfs dentry with
+ * setattr_prepare(), then copied, its uid/gid translated with
+ * shift_kuid()/shift_kgid(), and applied to the lower dentry through
+ * notify_change() under the lower inode lock.
+ */
+static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+       struct dentry *lowerd = dentry->d_fsdata;
+       struct inode *loweri = lowerd->d_inode;
+       struct iattr newattr;
+       const struct cred *oldcred;
+       struct super_block *sb = dentry->d_sb;
+       struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+       int err;
 
+       err = setattr_prepare(dentry, attr);
        if (err)
                return err;
 
-       /* all OK, reflect the change on our inode */
-       setattr_copy(d_inode(dentry), attr);
-       return 0;
+       /* work on a copy so the caller's iattr is left untouched */
+       newattr = *attr;
+       newattr.ia_uid = shift_kuid(sb->s_user_ns, sbinfo->userns, attr->ia_uid);
+       newattr.ia_gid = shift_kgid(sb->s_user_ns, sbinfo->userns, attr->ia_gid);
+
+       /*
+        * mode change is for clearing setuid/setgid bits. Allow lower fs
+        * to interpret this in its own way.
+        */
+       if (newattr.ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+               newattr.ia_valid &= ~ATTR_MODE;
+
+       inode_lock(loweri);
+       oldcred = shiftfs_override_creds(dentry->d_sb);
+       err = notify_change(lowerd, &newattr, NULL);
+       revert_creds(oldcred);
+       inode_unlock(loweri);
+
+       /*
+        * Called on failure as well; presumably mirrors the lower inode's
+        * attributes onto the shiftfs inode.
+        */
+       shiftfs_copyattr(loweri, d_inode(dentry));
+
+       return err;
 }
 
 static int shiftfs_getattr(const struct path *path, struct kstat *stat,
                           u32 request_mask, unsigned int query_flags)
 {
        struct inode *inode = path->dentry->d_inode;
-       struct dentry *real = path->dentry->d_fsdata;
-       struct inode *reali = real->d_inode;
-       const struct inode_operations *iop = reali->i_op;
-       struct path newpath = { .mnt = path->dentry->d_sb->s_fs_info, .dentry = real };
-       int err = 0;
-
-       if (iop->getattr)
-               err = iop->getattr(&newpath, stat, request_mask, query_flags);
-       else
-               generic_fillattr(reali, stat);
+       struct dentry *lowerd = path->dentry->d_fsdata;
+       struct inode *loweri = lowerd->d_inode;
+       struct shiftfs_super_info *info = path->dentry->d_sb->s_fs_info;
+       /* lower dentry paired with the mount kept in the shiftfs super info */
+       struct path newpath = { .mnt = info->mnt, .dentry = lowerd };
+       struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
+       struct user_namespace *to_ns = inode->i_sb->s_user_ns;
+       const struct cred *oldcred;
+       int err;
+
+       /* stat the lower path with the creds from shiftfs_override_creds() */
+       oldcred = shiftfs_override_creds(inode->i_sb);
+       err = vfs_getattr(&newpath, stat, request_mask, query_flags);
+       revert_creds(oldcred);
 
        if (err)
                return err;
 
        /* transform the underlying id */
-       stat->uid = make_kuid(inode->i_sb->s_user_ns, __kuid_val(stat->uid));
-       stat->gid = make_kgid(inode->i_sb->s_user_ns, __kgid_val(stat->gid));
+       /* from the lower superblock's user ns to the shiftfs superblock's */
+       stat->uid = shift_kuid(from_ns, to_ns, stat->uid);
+       stat->gid = shift_kgid(from_ns, to_ns, stat->gid);
        return 0;
 }
 
-static const struct inode_operations shiftfs_inode_ops = {
-       .lookup         = shiftfs_lookup,
-       .getattr        = shiftfs_getattr,
-       .setattr        = shiftfs_setattr,
-       .permission     = shiftfs_permission,
-       .mkdir          = shiftfs_mkdir,
-       .symlink        = shiftfs_symlink,
-       .get_link       = shiftfs_get_link,
-       .readlink       = shiftfs_readlink,
-       .unlink         = shiftfs_unlink,
-       .rmdir          = shiftfs_rmdir,
-       .rename         = shiftfs_rename,
-       .link           = shiftfs_link,
-       .create         = shiftfs_create,
-       .mknod          = NULL, /* no special files currently */
-       .listxattr      = shiftfs_listxattr,
-};
+#ifdef CONFIG_SHIFT_FS_POSIX_ACL
+
+/*
+ * Rewrite, in place, the id of every ACL_USER and ACL_GROUP entry of @acl by
+ * passing it through shift_kuid()/shift_kgid() with @from and @to. Entries
+ * with any other tag are left alone.
+ *
+ * Returns 0 on success, or -EOVERFLOW as soon as a shifted id is not valid;
+ * entries handled before that point stay shifted.
+ */
+static int
+shift_acl_ids(struct user_namespace *from, struct user_namespace *to,
+             struct posix_acl *acl)
+{
+       struct posix_acl_entry *ent = acl->a_entries;
+       struct posix_acl_entry *last = ent + acl->a_count;
+
+       for (; ent < last; ent++) {
+               if (ent->e_tag == ACL_USER) {
+                       ent->e_uid = shift_kuid(from, to, ent->e_uid);
+                       if (!uid_valid(ent->e_uid))
+                               return -EOVERFLOW;
+               } else if (ent->e_tag == ACL_GROUP) {
+                       ent->e_gid = shift_kgid(from, to, ent->e_gid);
+                       if (!gid_valid(ent->e_gid))
+                               return -EOVERFLOW;
+               }
+       }
+
+       return 0;
+}
 
-static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
-                                      struct dentry *dentry)
+/*
+ * Shift, in place, the ids stored in a POSIX ACL xattr blob.
+ *
+ * @value/@size describe the blob: a posix_acl_xattr_header followed by
+ * posix_acl_xattr_entry records. The blob is left untouched when @value is
+ * NULL, when @size is smaller than the header, when the header version is
+ * not POSIX_ACL_XATTR_VERSION, or when posix_acl_xattr_count(@size) is
+ * negative or zero.
+ */
+static void
+shift_acl_xattr_ids(struct user_namespace *from, struct user_namespace *to,
+                   void *value, size_t size)
 {
-       struct inode *inode;
+       struct posix_acl_xattr_header *header = value;
+       /* the entries start right behind the header */
+       struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+       int count;
+       kuid_t kuid;
+       kgid_t kgid;
 
-       inode = new_inode(sb);
-       if (!inode)
+       if (!value)
+               return;
+       if (size < sizeof(struct posix_acl_xattr_header))
+               return;
+       if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+               return;
+
+       /* the entry count is derived from @size, not from the blob contents */
+       count = posix_acl_xattr_count(size);
+       if (count < 0)
+               return;
+       if (count == 0)
+               return;
+
+       for (end = entry + count; entry != end; entry++) {
+               switch(le16_to_cpu(entry->e_tag)) {
+               case ACL_USER:
+                       /* the on-disk id is read and written via init_user_ns */
+                       kuid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
+                       kuid = shift_kuid(from, to, kuid);
+                       entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, kuid));
+                       break;
+               case ACL_GROUP:
+                       kgid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
+                       kgid = shift_kgid(from, to, kgid);
+                       entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, kgid));
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+
+/*
+ * ->get_acl: fetch the ACL of the lower inode and return a private copy
+ * whose ids have been run through shift_acl_ids().
+ *
+ * Returns NULL when the lower inode is not IS_POSIXACL() or has no ACL,
+ * ERR_PTR(-ENOMEM) when the copy cannot be allocated, or the error from
+ * shift_acl_ids().
+ */
+static struct posix_acl *shiftfs_get_acl(struct inode *inode, int type)
+{
+       struct inode *loweri = inode->i_private;
+       const struct cred *oldcred;
+       struct posix_acl *lower_acl, *acl = NULL;
+       struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
+       struct user_namespace *to_ns = inode->i_sb->s_user_ns;
+       int size;
+       int err;
+
+       if (!IS_POSIXACL(loweri))
                return NULL;
 
-       /*
-        * our inode is completely vestigial.  All lookups, getattr
-        * and permission checks are done on the underlying inode, so
-        * what the user sees is entirely from the underlying inode.
-        */
-       mode &= S_IFMT;
+       oldcred = shiftfs_override_creds(inode->i_sb);
+       lower_acl = get_acl(loweri, type);
+       revert_creds(oldcred);
 
-       inode->i_ino = get_next_ino();
-       inode->i_mode = mode;
-       inode->i_flags |= S_NOATIME | S_NOCMTIME;
+       /*
+        * NOTE(review): an ERR_PTR from get_acl() skips this branch and is
+        * reported to the caller as NULL; confirm that is intended.
+        */
+       if (lower_acl && !IS_ERR(lower_acl)) {
+               /* XXX: export posix_acl_clone? */
+               size = sizeof(struct posix_acl) +
+                      lower_acl->a_count * sizeof(struct posix_acl_entry);
+               acl = kmemdup(lower_acl, size, GFP_KERNEL);
+               posix_acl_release(lower_acl);
+
+               if (!acl)
+                       return ERR_PTR(-ENOMEM);
 
-       inode->i_op = &shiftfs_inode_ops;
+               /* the byte copy also duplicated the refcount; start it at 1 */
+               refcount_set(&acl->a_refcount, 1);
 
-       shiftfs_fill_inode(inode, dentry);
+               err = shift_acl_ids(from_ns, to_ns, acl);
+               if (err) {
+                       kfree(acl);
+                       return ERR_PTR(err);
+               }
+       }
 
-       return inode;
+       return acl;
 }
 
-static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
+/*
+ * xattr ->get for the POSIX ACL handlers: read the ACL xattr named by
+ * @handler from the lower layer via shiftfs_xattr_get(), then shift the ids
+ * in the returned blob from the lower superblock's user namespace to the
+ * shiftfs superblock's.
+ *
+ * Returns the value returned by shiftfs_xattr_get(): a negative errno, or
+ * the length of the xattr value.
+ */
+static int
+shiftfs_posix_acl_xattr_get(const struct xattr_handler *handler,
+                          struct dentry *dentry, struct inode *inode,
+                          const char *name, void *buffer, size_t size)
 {
-       struct super_block *sb = dentry->d_sb;
-       struct shiftfs_super_info *ssi = sb->s_fs_info;
+       struct inode *loweri = inode->i_private;
+       int ret;
 
-       if (ssi->mark)
-               seq_show_option(m, "mark", NULL);
+       ret = shiftfs_xattr_get(NULL, dentry, inode, handler->name,
+                               buffer, size);
+       if (ret < 0)
+               return ret;
 
-       return 0;
+       /*
+        * Shift only the bytes that were actually returned. The entry count
+        * is derived from the length handed to shift_acl_xattr_ids(), so
+        * passing the buffer capacity would make an oversized buffer fail
+        * the count check and leave the ids unshifted. The min keeps the
+        * length within the buffer when the call was only a size query.
+        */
+       inode_lock(loweri);
+       shift_acl_xattr_ids(loweri->i_sb->s_user_ns, inode->i_sb->s_user_ns,
+                           buffer, min_t(size_t, ret, size));
+       inode_unlock(loweri);
+       return ret;
 }
 
-static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+/*
+ * xattr ->set for the POSIX ACL handlers.
+ *
+ * With a non-NULL @value the ids in the blob are shifted from the shiftfs
+ * superblock's user namespace to the lower one and the xattr is stored with
+ * shiftfs_setxattr(); with a NULL @value the xattr is removed.
+ *
+ * Returns -EOPNOTSUPP when the lower inode cannot take ACLs, -EACCES for a
+ * default ACL set on a non-directory (0 when it is being removed), -EPERM
+ * when inode_owner_or_capable() fails, otherwise the set/remove result.
+ */
+static int
+shiftfs_posix_acl_xattr_set(const struct xattr_handler *handler,
+                           struct dentry *dentry, struct inode *inode,
+                           const char *name, const void *value,
+                           size_t size, int flags)
 {
-       struct super_block *sb = dentry->d_sb;
-       struct shiftfs_super_info *ssi = sb->s_fs_info;
-       struct dentry *root = sb->s_root;
-       struct dentry *realroot = root->d_fsdata;
-       struct path realpath = { .mnt = ssi->mnt, .dentry = realroot };
+       struct inode *loweri = inode->i_private;
        int err;
 
-       err = vfs_statfs(&realpath, buf);
-       if (err)
-               return err;
-
-       buf->f_type = sb->s_magic;
-
-       return 0;
-}
+       if (!IS_POSIXACL(loweri) || !loweri->i_op->set_acl)
+               return -EOPNOTSUPP;
+       if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+               return value ? -EACCES : 0;
+       if (!inode_owner_or_capable(inode))
+               return -EPERM;
+
+       if (value) {
+               /*
+                * NOTE(review): the cast drops const and the caller's buffer
+                * is rewritten in place; confirm every caller tolerates that.
+                */
+               shift_acl_xattr_ids(inode->i_sb->s_user_ns,
+                                   loweri->i_sb->s_user_ns,
+                                   (void *)value, size);
+               err = shiftfs_setxattr(dentry, inode, handler->name, value,
+                                      size, flags);
+       } else {
+               err = shiftfs_removexattr(dentry, handler->name);
+       }
 
-static void shiftfs_put_super(struct super_block *sb)
-{
-       struct shiftfs_super_info *ssi = sb->s_fs_info;
+       /* only on success; presumably refreshes the shiftfs inode */
+       if (!err)
+               shiftfs_copyattr(loweri, inode);
 
-       mntput(ssi->mnt);
-       put_user_ns(ssi->userns);
-       kfree(ssi);
+       return err;
 }
 
-static const struct xattr_handler shiftfs_xattr_handler = {
-       .prefix = "",
-       .get    = shiftfs_xattr_get,
-       .set    = shiftfs_xattr_set,
+/*
+ * Handler for the XATTR_NAME_POSIX_ACL_ACCESS xattr; .flags carries
+ * ACL_TYPE_ACCESS, which the shared set callback compares against
+ * ACL_TYPE_DEFAULT.
+ */
+static const struct xattr_handler
+shiftfs_posix_acl_access_xattr_handler = {
+       .name = XATTR_NAME_POSIX_ACL_ACCESS,
+       .flags = ACL_TYPE_ACCESS,
+       .get = shiftfs_posix_acl_xattr_get,
+       .set = shiftfs_posix_acl_xattr_set,
 };
 
-const struct xattr_handler *shiftfs_xattr_handlers[] = {
-       &shiftfs_xattr_handler,
-       NULL
+/*
+ * Handler for the XATTR_NAME_POSIX_ACL_DEFAULT xattr; .flags carries
+ * ACL_TYPE_DEFAULT, which makes the shared set callback reject
+ * non-directories.
+ */
+static const struct xattr_handler
+shiftfs_posix_acl_default_xattr_handler = {
+       .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+       .flags = ACL_TYPE_DEFAULT,
+       .get = shiftfs_posix_acl_xattr_get,
+       .set = shiftfs_posix_acl_xattr_set,
 };
 
-static const struct super_operations shiftfs_super_ops = {
-       .put_super      = shiftfs_put_super,
-       .show_options   = shiftfs_show_options,
-       .statfs         = shiftfs_statfs,
-};
+#else /* !CONFIG_SHIFT_FS_POSIX_ACL */
 
-struct shiftfs_data {
-       void *data;
-       const char *path;
-};
+#define shiftfs_get_acl NULL
+
+#endif /* CONFIG_SHIFT_FS_POSIX_ACL */
+
+/* inode operations table named for directories */
+static const struct inode_operations shiftfs_dir_inode_operations = {
+       /* namespace operations */
+       .lookup         = shiftfs_lookup,
+       .create         = shiftfs_create,
+       .mknod          = shiftfs_mknod,
+       .mkdir          = shiftfs_mkdir,
+       .rmdir          = shiftfs_rmdir,
+       .symlink        = shiftfs_symlink,
+       .link           = shiftfs_link,
+       .unlink         = shiftfs_unlink,
+       .rename         = shiftfs_rename,
+       /* attributes, permissions, xattrs and ACLs */
+       .getattr        = shiftfs_getattr,
+       .setattr        = shiftfs_setattr,
+       .permission     = shiftfs_permission,
+       .listxattr      = shiftfs_listxattr,
+       .get_acl        = shiftfs_get_acl,
+};
+
+/*
+ * inode operations table named for regular files.
+ * NOTE(review): ->tmpfile is normally invoked on a directory inode; confirm
+ * this entry is reachable from here.
+ */
+static const struct inode_operations shiftfs_file_inode_operations = {
+       /* attributes, permissions, xattrs and ACLs */
+       .getattr        = shiftfs_getattr,
+       .setattr        = shiftfs_setattr,
+       .permission     = shiftfs_permission,
+       .listxattr      = shiftfs_listxattr,
+       .get_acl        = shiftfs_get_acl,
+       /* file-specific operations */
+       .fiemap         = shiftfs_fiemap,
+       .tmpfile        = shiftfs_tmpfile,
+};
+
+/* inode operations table named for special files: attribute and access ops */
+static const struct inode_operations shiftfs_special_inode_operations = {
+       .getattr        = shiftfs_getattr,
+       .setattr        = shiftfs_setattr,
+       .permission     = shiftfs_permission,
+       .listxattr      = shiftfs_listxattr,
+       .get_acl        = shiftfs_get_acl,
+};
+
+/* inode operations table named for symlinks; no ->permission or ->get_acl */
+static const struct inode_operations shiftfs_symlink_inode_operations = {
+       .get_link       = shiftfs_get_link,
+       .getattr        = shiftfs_getattr,
+       .setattr        = shiftfs_setattr,
+       .listxattr      = shiftfs_listxattr,
+};
+
+/*
+ * Open the lower file that backs @file.
+ *
+ * The lower path is built from the mount stored in the shiftfs super info
+ * and the lower dentry in d_fsdata of @file's dentry. The open runs under
+ * the creds from shiftfs_override_creds(), and the lower file is opened with
+ * the creator creds and the same f_flags as @file.
+ *
+ * Returns what open_with_fake_path() returns.
+ */
+static struct file *shiftfs_open_realfile(const struct file *file,
+                                         struct inode *realinode)
+{
+       struct super_block *sb = file_inode(file)->i_sb;
+       struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+       struct path lowerpath = {
+               .mnt = sbinfo->mnt,
+               .dentry = file->f_path.dentry->d_fsdata,
+       };
+       const struct cred *saved;
+       struct file *lowerfile;
+
+       saved = shiftfs_override_creds(sb);
+       lowerfile = open_with_fake_path(&lowerpath, file->f_flags, realinode,
+                                       sbinfo->creator_cred);
+       revert_creds(saved);
+
+       return lowerfile;
+}
+
+#define SHIFTFS_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)
+
+/*
+ * Bring the SHIFTFS_SETFL_MASK bits of @file->f_flags in line with @flags.
+ *
+ * Returns -EIO (with a warning) when @flags differs from the current flags
+ * outside that mask, -EPERM when O_APPEND would change on an IS_APPEND()
+ * inode, -EINVAL when O_DIRECT is requested but the mapping has no
+ * ->direct_IO, the ->check_flags error if there is one, and 0 once the flags
+ * have been updated under f_lock.
+ */
+static int shiftfs_change_flags(struct file *file, unsigned int flags)
+{
+       const struct address_space_operations *aops;
+       unsigned int changed = file->f_flags ^ flags;
+       int err;
+
+       /* if some flag changed that cannot be changed then something's amiss */
+       if (WARN_ON(changed & ~SHIFTFS_SETFL_MASK))
+               return -EIO;
+
+       flags &= SHIFTFS_SETFL_MASK;
+
+       if ((changed & O_APPEND) && IS_APPEND(file_inode(file)))
+               return -EPERM;
+
+       if (flags & O_DIRECT) {
+               aops = file->f_mapping->a_ops;
+               if (!aops || !aops->direct_IO)
+                       return -EINVAL;
+       }
+
+       if (file->f_op->check_flags) {
+               err = file->f_op->check_flags(flags);
+               if (err)
+                       return err;
+       }
+
+       spin_lock(&file->f_lock);
+       file->f_flags &= ~SHIFTFS_SETFL_MASK;
+       file->f_flags |= flags;
+       spin_unlock(&file->f_lock);
+
+       return 0;
+}
+
+/*
+ * ->open: open the lower file and stash it in file->private_data. The
+ * shiftfs file also takes over the lower file's f_mapping.
+ *
+ * Returns 0, or the error encoded in the pointer returned by
+ * shiftfs_open_realfile().
+ */
+static int shiftfs_open(struct inode *inode, struct file *file)
+{
+       struct file *lowerfile = shiftfs_open_realfile(file, inode->i_private);
+
+       if (!IS_ERR(lowerfile)) {
+               file->private_data = lowerfile;
+               /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO. */
+               file->f_mapping = lowerfile->f_mapping;
+               return 0;
+       }
+
+       return PTR_ERR(lowerfile);
+}
+
+/*
+ * ->open for directories: open the lower directory with dentry_open(),
+ * adding O_NOATIME to the flags and using the creator creds, and stash the
+ * result in file->private_data.
+ *
+ * Returns 0, or the error encoded in the pointer from dentry_open().
+ */
+static int shiftfs_dir_open(struct inode *inode, struct file *file)
+{
+       struct dentry *dentry = file->f_path.dentry;
+       struct shiftfs_super_info *sbinfo = inode->i_sb->s_fs_info;
+       struct path lowerpath = {
+               .mnt = sbinfo->mnt,
+               .dentry = dentry->d_fsdata,
+       };
+       const struct cred *saved;
+       struct file *lowerdir;
+
+       saved = shiftfs_override_creds(dentry->d_sb);
+       lowerdir = dentry_open(&lowerpath, file->f_flags | O_NOATIME,
+                              sbinfo->creator_cred);
+       revert_creds(saved);
+
+       if (IS_ERR(lowerdir))
+               return PTR_ERR(lowerdir);
+
+       file->private_data = lowerdir;
+       return 0;
+}
+
+/*
+ * ->release: drop the reference on the lower file kept in
+ * file->private_data, if there is one. Always returns 0.
+ */
+static int shiftfs_release(struct inode *inode, struct file *file)
+{
+       struct file *lowerfile = file->private_data;
+
+       if (!lowerfile)
+               return 0;
+
+       fput(lowerfile);
+       return 0;
+}
+
+/*
+ * ->release for directories: same work as shiftfs_release(), i.e. drop the
+ * lower file kept in file->private_data if there is one. Always returns 0.
+ */
+static int shiftfs_dir_release(struct inode *inode, struct file *file)
+{
+       if (file->private_data)
+               fput(file->private_data);
+
+       return 0;
+}
+
+/* ->llseek for directories: seek on the lower file in file->private_data. */
+static loff_t shiftfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+       return vfs_llseek(file->private_data, offset, whence);
+}
+
+/*
+ * ->llseek for regular files: seek on the shiftfs file itself, taking the
+ * size limit and the current size from the lower inode.
+ */
+static loff_t shiftfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+       struct inode *loweri = file_inode(file)->i_private;
+       loff_t maxbytes = loweri->i_sb->s_maxbytes;
+
+       return generic_file_llseek_size(file, offset, whence, maxbytes,
+                                       i_size_read(loweri));
+}
+
+/* XXX: Need to figure out what to do about atime updates, maybe other
+ * timestamps too ... ref. ovl_file_accessed() */
+
+/*
+ * Translate the IOCB_NOWAIT, IOCB_HIPRI, IOCB_DSYNC and IOCB_SYNC bits of
+ * iocb->ki_flags into the matching RWF_* flags. Other bits are ignored.
+ */
+static rwf_t shiftfs_iocb_to_rwf(struct kiocb *iocb)
+{
+       const int kflags = iocb->ki_flags;
+       rwf_t rwf = 0;
+
+       rwf |= (kflags & IOCB_NOWAIT) ? RWF_NOWAIT : 0;
+       rwf |= (kflags & IOCB_HIPRI) ? RWF_HIPRI : 0;
+       rwf |= (kflags & IOCB_DSYNC) ? RWF_DSYNC : 0;
+       rwf |= (kflags & IOCB_SYNC) ? RWF_SYNC : 0;
+
+       return rwf;
+}
+
+/*
+ * Fill @lowerfd with the lower file of a shiftfs @file.
+ *
+ * No reference is taken: lowerfd->flags is 0 and lowerfd->file is the
+ * pointer from file->private_data. If @file has flag bits that the lower
+ * file lacks, they are pushed down with shiftfs_change_flags().
+ *
+ * Returns -EINVAL when @file's ->open is neither shiftfs_open nor
+ * shiftfs_dir_open, the shiftfs_change_flags() result when flags had to be
+ * synced, and 0 otherwise.
+ */
+static int shiftfs_real_fdget(const struct file *file, struct fd *lowerfd)
+{
+       const struct file_operations *fops = file->f_op;
+       struct file *lowerfile;
+
+       if (fops->open != shiftfs_open && fops->open != shiftfs_dir_open)
+               return -EINVAL;
+
+       lowerfile = file->private_data;
+       lowerfd->flags = 0;
+       lowerfd->file = lowerfile;
+
+       /* Did the flags change since open? */
+       if (likely(!(file->f_flags & ~lowerfile->f_flags)))
+               return 0;
+
+       return shiftfs_change_flags(lowerfile, file->f_flags);
+}
+
+/*
+ * ->read_iter: read from the lower file at iocb->ki_pos under the creds from
+ * shiftfs_override_creds().
+ *
+ * Returns 0 for an empty iterator, the shiftfs_real_fdget() error if the
+ * lower file cannot be obtained, otherwise the vfs_iter_read() result.
+ */
+static ssize_t shiftfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *upper = iocb->ki_filp;
+       const struct cred *saved;
+       struct fd lower;
+       ssize_t nread;
+
+       if (iov_iter_count(iter) == 0)
+               return 0;
+
+       nread = shiftfs_real_fdget(upper, &lower);
+       if (nread != 0)
+               return nread;
+
+       saved = shiftfs_override_creds(upper->f_path.dentry->d_sb);
+       nread = vfs_iter_read(lower.file, iter, &iocb->ki_pos,
+                             shiftfs_iocb_to_rwf(iocb));
+       revert_creds(saved);
+
+       shiftfs_file_accessed(upper);
+       fdput(lower);
+
+       return nread;
+}
+
+/*
+ * ->write_iter: write to the lower file at iocb->ki_pos.
+ *
+ * The shiftfs inode lock is held for the whole operation. Returns 0 for an
+ * empty iterator, the file_remove_privs() or shiftfs_real_fdget() error if
+ * either fails, otherwise the vfs_iter_write() result.
+ */
+static ssize_t shiftfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct fd lowerfd;
+       const struct cred *oldcred;
+       ssize_t ret;
+
+       if (!iov_iter_count(iter))
+               return 0;
+
+       inode_lock(inode);
+       /* Update mode */
+       /* refreshed before file_remove_privs() looks at the shiftfs file */
+       shiftfs_copyattr(inode->i_private, inode);
+       ret = file_remove_privs(file);
+       if (ret)
+               goto out_unlock;
+
+       ret = shiftfs_real_fdget(file, &lowerfd);
+       if (ret)
+               goto out_unlock;
+
+       /* the lower write runs with the creds from shiftfs_override_creds() */
+       oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+       file_start_write(lowerfd.file);
+       ret = vfs_iter_write(lowerfd.file, iter, &iocb->ki_pos,
+                            shiftfs_iocb_to_rwf(iocb));
+       file_end_write(lowerfd.file);
+       revert_creds(oldcred);
+
+       /* Update size */
+       shiftfs_copyattr(inode->i_private, inode);
+
+       fdput(lowerfd);
+
+out_unlock:
+       inode_unlock(inode);
+       return ret;
+}
+
+/*
+ * ->fsync: sync the given range of the lower file under the creds from
+ * shiftfs_override_creds().
+ *
+ * Returns the shiftfs_real_fdget() error if the lower file cannot be
+ * obtained, otherwise the vfs_fsync_range() result.
+ */
+static int shiftfs_fsync(struct file *file, loff_t start, loff_t end,
+                        int datasync)
+{
+       const struct cred *saved;
+       struct fd lower;
+       int err;
+
+       err = shiftfs_real_fdget(file, &lower);
+       if (!err) {
+               saved = shiftfs_override_creds(file->f_path.dentry->d_sb);
+               err = vfs_fsync_range(lower.file, start, end, datasync);
+               revert_creds(saved);
+
+               fdput(lower);
+       }
+
+       return err;
+}
+
+/*
+ * ->mmap: hand the mapping over to the lower file.
+ *
+ * vma->vm_file is switched from the shiftfs file to the lower file before
+ * the lower ->mmap is called. Returns -ENODEV when the lower file has no
+ * ->mmap, -EIO (with a warning) when @vma does not belong to @file,
+ * otherwise the call_mmap() result.
+ */
+static int shiftfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct file *realfile = file->private_data;
+       const struct cred *oldcred;
+       int ret;
+
+       if (!realfile->f_op->mmap)
+               return -ENODEV;
+
+       if (WARN_ON(file != vma->vm_file))
+               return -EIO;
+
+       oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+       /* the vma gets its own reference on the lower file */
+       vma->vm_file = get_file(realfile);
+       ret = call_mmap(vma->vm_file, vma);
+       revert_creds(oldcred);
+
+       shiftfs_file_accessed(file);
+
+       if (ret) {
+               /*
+                * Drop refcount from new vm_file value and restore original
+                * vm_file value
+                */
+               vma->vm_file = file;
+               fput(realfile);
+       } else {
+               /* Drop refcount from previous vm_file value */
+               fput(file);
+       }
+
+       return ret;
+}
+
+/*
+ * ->fallocate: run vfs_fallocate() on the lower file under the creds from
+ * shiftfs_override_creds(), then call shiftfs_copyattr() from the lower
+ * inode to the shiftfs inode, whether or not the allocation succeeded.
+ *
+ * Returns the shiftfs_real_fdget() error if the lower file cannot be
+ * obtained, otherwise the vfs_fallocate() result.
+ */
+static long shiftfs_fallocate(struct file *file, int mode, loff_t offset,
+                             loff_t len)
+{
+       struct inode *upperi = file_inode(file);
+       const struct cred *saved;
+       struct fd lower;
+       int err;
+
+       err = shiftfs_real_fdget(file, &lower);
+       if (err)
+               return err;
+
+       saved = shiftfs_override_creds(file->f_path.dentry->d_sb);
+       err = vfs_fallocate(lower.file, mode, offset, len);
+       revert_creds(saved);
+
+       /* Update size */
+       shiftfs_copyattr(upperi->i_private, upperi);
+       fdput(lower);
+
+       return err;
+}
+
+/*
+ * ->fadvise: pass the advice on to the lower file under the creds from
+ * shiftfs_override_creds().
+ *
+ * Returns the shiftfs_real_fdget() error if the lower file cannot be
+ * obtained, otherwise the vfs_fadvise() result.
+ */
+static int shiftfs_fadvise(struct file *file, loff_t offset, loff_t len,
+                          int advice)
+{
+       const struct cred *saved;
+       struct fd lower;
+       int err;
+
+       err = shiftfs_real_fdget(file, &lower);
+       if (!err) {
+               saved = shiftfs_override_creds(file->f_path.dentry->d_sb);
+               err = vfs_fadvise(lower.file, offset, len, advice);
+               revert_creds(saved);
+
+               fdput(lower);
+       }
+
+       return err;
+}
+
+/*
+ * Install the credentials used while an ioctl is forwarded to the lower
+ * file.
+ *
+ * Two overrides are stacked: first the creds from shiftfs_override_creds(),
+ * then a copy made with prepare_creds() whose fsuid/fsgid are the caller's
+ * ids run through shift_kuid()/shift_kgid() and whose capability sets are
+ * all cleared.
+ *
+ * On success returns 0 with *oldcred set to the creds to restore and
+ * *newcred set to the prepared copy. Returns -ENOMEM, with the first
+ * override already reverted, when prepare_creds() fails.
+ */
+static int shiftfs_override_ioctl_creds(int cmd, const struct super_block *sb,
+                                       const struct cred **oldcred,
+                                       struct cred **newcred)
+{
+       struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+       /* sampled before any override, so these are the caller's own ids */
+       kuid_t fsuid = current_fsuid();
+       kgid_t fsgid = current_fsgid();
+
+       *oldcred = shiftfs_override_creds(sb);
+
+       *newcred = prepare_creds();
+       if (!*newcred) {
+               revert_creds(*oldcred);
+               return -ENOMEM;
+       }
+
+       (*newcred)->fsuid = shift_kuid(sb->s_user_ns, sbinfo->userns, fsuid);
+       (*newcred)->fsgid = shift_kgid(sb->s_user_ns, sbinfo->userns, fsgid);
+
+       /* clear all caps to prevent bypassing capable() checks */
+       cap_clear((*newcred)->cap_bset);
+       cap_clear((*newcred)->cap_effective);
+       cap_clear((*newcred)->cap_inheritable);
+       cap_clear((*newcred)->cap_permitted);
+
+       if (cmd == BTRFS_IOC_SNAP_DESTROY) {
+               kuid_t kuid_root = make_kuid(sb->s_user_ns, 0);
+               /*
+                * Allow the root user in the container to remove subvolumes
+                * from other users.
+                */
+               if (uid_valid(kuid_root) && uid_eq(fsuid, kuid_root))
+                       cap_raise((*newcred)->cap_effective, CAP_DAC_OVERRIDE);
+       }
+
+       /*
+        * Install the prepared creds and drop a reference on the creds they
+        * replace.
+        */
+       put_cred(override_creds(*newcred));
+       return 0;
+}
+
+/*
+ * Counterpart of shiftfs_override_ioctl_creds(): hands both cred pointers to
+ * shiftfs_revert_object_creds().
+ */
+static inline void shiftfs_revert_ioctl_creds(const struct cred *oldcred,
+                                             struct cred *newcred)
+{
+       shiftfs_revert_object_creds(oldcred, newcred);
+}
+
+/* True for BTRFS_IOC_SNAP_CREATE and BTRFS_IOC_SNAP_CREATE_V2 only. */
+static inline bool is_btrfs_snap_ioctl(int cmd)
+{
+       return cmd == BTRFS_IOC_SNAP_CREATE || cmd == BTRFS_IOC_SNAP_CREATE_V2;
+}
+
+/*
+ * Undo shiftfs_btrfs_ioctl_fd_replace() once a btrfs snapshot ioctl is done.
+ *
+ * For BTRFS_IOC_SNAP_CREATE the saved argument @v1 is copied back to @arg,
+ * for BTRFS_IOC_SNAP_CREATE_V2 the saved @v2. The temporary descriptor @fd
+ * is then closed and both saved buffers are freed, whether or not the copy
+ * worked. Nothing is done for any other @cmd.
+ *
+ * Returns 0 on success (or when there was nothing to do) and -EFAULT when
+ * the argument could not be copied back to user space.
+ */
+static int shiftfs_btrfs_ioctl_fd_restore(int cmd, int fd, void __user *arg,
+                                         struct btrfs_ioctl_vol_args *v1,
+                                         struct btrfs_ioctl_vol_args_v2 *v2)
+{
+       unsigned long uncopied;
+
+       if (!is_btrfs_snap_ioctl(cmd))
+               return 0;
+
+       if (cmd == BTRFS_IOC_SNAP_CREATE)
+               uncopied = copy_to_user(arg, v1, sizeof(*v1));
+       else
+               uncopied = copy_to_user(arg, v2, sizeof(*v2));
+
+       close_fd(fd);
+       kfree(v1);
+       kfree(v2);
+
+       /*
+        * copy_to_user() reports the number of bytes it could not copy, not
+        * an errno; callers pass this value on as the ioctl result.
+        */
+       return uncopied ? -EFAULT : 0;
+}
+
+/*
+ * Prepare a btrfs snapshot ioctl for forwarding to the lower filesystem.
+ *
+ * The ioctl argument at @arg contains a file descriptor that refers to a
+ * shiftfs file. A kernel copy of the argument is taken, a new descriptor
+ * for the corresponding lower file is installed, and the argument in user
+ * space is rewritten to carry that new descriptor. Nothing is done for any
+ * @cmd that is_btrfs_snap_ioctl() rejects.
+ *
+ * On success returns 0; for a snapshot ioctl *@newfd is the installed
+ * descriptor and *@b1 (BTRFS_IOC_SNAP_CREATE) or *@b2 (the V2 ioctl) is the
+ * saved argument, holding the original descriptor. The caller must hand all
+ * three to shiftfs_btrfs_ioctl_fd_restore().
+ *
+ * On failure returns a negative errno. The saved argument has been freed,
+ * any installed descriptor has been closed, and *@b1/*@b2 are NULL, so the
+ * caller has nothing to release.
+ */
+static int shiftfs_btrfs_ioctl_fd_replace(int cmd, void __user *arg,
+                                         struct btrfs_ioctl_vol_args **b1,
+                                         struct btrfs_ioctl_vol_args_v2 **b2,
+                                         int *newfd)
+{
+       int oldfd, ret;
+       struct fd src;
+       struct fd lfd = {};
+       struct btrfs_ioctl_vol_args *v1 = NULL;
+       struct btrfs_ioctl_vol_args_v2 *v2 = NULL;
+
+       /* nothing is published until every step has succeeded */
+       *b1 = NULL;
+       *b2 = NULL;
+
+       if (!is_btrfs_snap_ioctl(cmd))
+               return 0;
+
+       if (cmd == BTRFS_IOC_SNAP_CREATE) {
+               v1 = memdup_user(arg, sizeof(*v1));
+               if (IS_ERR(v1))
+                       return PTR_ERR(v1);
+               oldfd = v1->fd;
+       } else {
+               v2 = memdup_user(arg, sizeof(*v2));
+               if (IS_ERR(v2))
+                       return PTR_ERR(v2);
+               oldfd = v2->fd;
+       }
+
+       src = fdget(oldfd);
+       if (!src.file) {
+               ret = -EINVAL;
+               goto err_free;
+       }
+
+       ret = shiftfs_real_fdget(src.file, &lfd);
+       if (ret) {
+               fdput(src);
+               goto err_free;
+       }
+
+       /*
+        * shiftfs_real_fdget() does not take a reference to lfd.file, so
+        * take a reference here to offset the one which will be put by
+        * close_fd(), and make sure that reference is put on fdput(lfd).
+        */
+       get_file(lfd.file);
+       lfd.flags |= FDPUT_FPUT;
+       fdput(src);
+
+       *newfd = get_unused_fd_flags(lfd.file->f_flags);
+       if (*newfd < 0) {
+               fdput(lfd);
+               ret = *newfd;
+               goto err_free;
+       }
+
+       fd_install(*newfd, lfd.file);
+
+       /*
+        * Show the new descriptor to user space, but keep the original one
+        * in the saved copy so that it can be written back afterwards.
+        */
+       if (cmd == BTRFS_IOC_SNAP_CREATE) {
+               v1->fd = *newfd;
+               ret = copy_to_user(arg, v1, sizeof(*v1));
+               v1->fd = oldfd;
+       } else {
+               v2->fd = *newfd;
+               ret = copy_to_user(arg, v2, sizeof(*v2));
+               v2->fd = oldfd;
+       }
+
+       if (ret) {
+               /*
+                * The restore helper closes the new descriptor and frees
+                * v1/v2, so neither may be handed to the caller. Report a
+                * real errno: copy_to_user() only returns a byte count, and
+                * a positive value would not be treated as a failure.
+                */
+               shiftfs_btrfs_ioctl_fd_restore(cmd, *newfd, arg, v1, v2);
+               return -EFAULT;
+       }
+
+       *b1 = v1;
+       *b2 = v2;
+
+       return 0;
+
+err_free:
+       /* at most one of the two is non-NULL; kfree(NULL) is a no-op */
+       kfree(v1);
+       kfree(v2);
+
+       return ret;
+}
+
+/*
+ * Forward an ioctl to the lower file.
+ *
+ * For btrfs snapshot ioctls the fd embedded in the argument is first swapped
+ * for an fd of the lower file and swapped back once the lower ioctl has run.
+ * The lower ioctl is issued with the credentials prepared by
+ * shiftfs_override_ioctl_creds(); afterwards attributes and flags are copied
+ * from the lower inode to the shiftfs inode.
+ *
+ * Returns the result of the lower ioctl, or a negative errno if setup or the
+ * final restore of the userspace argument failed.
+ */
+static long shiftfs_real_ioctl(struct file *file, unsigned int cmd,
+                              unsigned long arg)
+{
+       struct fd lowerfd;
+       struct cred *newcred;
+       const struct cred *oldcred;
+       int newfd = -EBADF;
+       long err = 0, ret = 0;
+       void __user *argp = (void __user *)arg;
+       struct super_block *sb = file->f_path.dentry->d_sb;
+       struct btrfs_ioctl_vol_args *btrfs_v1 = NULL;
+       struct btrfs_ioctl_vol_args_v2 *btrfs_v2 = NULL;
+
+       ret = shiftfs_btrfs_ioctl_fd_replace(cmd, argp, &btrfs_v1, &btrfs_v2,
+                                            &newfd);
+       if (ret > 0) {
+               /*
+                * A positive value is a copy_to_user() residue: the replace
+                * step has already run the restore helper, which closed the
+                * fd and freed both buffers. Running it again below would
+                * free them twice, so stop here with a real errno.
+                */
+               return -EFAULT;
+       }
+       if (ret < 0) {
+               /*
+                * No fd was installed, so there is nothing to restore; only
+                * the copied argument may still be allocated. kfree(NULL) is
+                * a no-op.
+                */
+               kfree(btrfs_v1);
+               kfree(btrfs_v2);
+               return ret;
+       }
+
+       ret = shiftfs_real_fdget(file, &lowerfd);
+       if (ret)
+               goto out_restore;
+
+       ret = shiftfs_override_ioctl_creds(cmd, sb, &oldcred, &newcred);
+       if (ret)
+               goto out_fdput;
+
+       ret = vfs_ioctl(lowerfd.file, cmd, arg);
+
+       shiftfs_revert_ioctl_creds(oldcred, newcred);
+
+       shiftfs_copyattr(file_inode(lowerfd.file), file_inode(file));
+       shiftfs_copyflags(file_inode(lowerfd.file), file_inode(file));
+
+out_fdput:
+       fdput(lowerfd);
+
+out_restore:
+       err = shiftfs_btrfs_ioctl_fd_restore(cmd, newfd, argp,
+                                            btrfs_v1, btrfs_v2);
+       /* The restore helper may hand back a raw copy_to_user() residue. */
+       if (err > 0)
+               err = -EFAULT;
+       if (!ret)
+               ret = err;
+
+       return ret;
+}
+
+/*
+ * Tell whether a btrfs ioctl is on the list of commands that may be passed
+ * through to the lower filesystem.
+ *
+ * @flag: ioctl command number
+ * @arg:  ioctl argument; only read for BTRFS_IOC_SUBVOL_SETFLAGS, where it is
+ *        a user pointer to a u64 flag word
+ *
+ * BTRFS_IOC_SUBVOL_SETFLAGS is accepted only when the flag word can be read
+ * and contains no bit other than BTRFS_SUBVOL_RDONLY.
+ */
+static bool in_ioctl_whitelist(int flag, unsigned long arg)
+{
+       u64 subvol_flags = 0;
+
+       switch (flag) {
+       case BTRFS_IOC_FS_INFO:
+       case BTRFS_IOC_SNAP_CREATE:
+       case BTRFS_IOC_SNAP_CREATE_V2:
+       case BTRFS_IOC_SUBVOL_CREATE:
+       case BTRFS_IOC_SUBVOL_CREATE_V2:
+       case BTRFS_IOC_SUBVOL_GETFLAGS:
+       case BTRFS_IOC_SNAP_DESTROY:
+               return true;
+       case BTRFS_IOC_SUBVOL_SETFLAGS:
+               if (copy_from_user(&subvol_flags, (void __user *)arg,
+                                  sizeof(subvol_flags)))
+                       return false;
+
+               return !(subvol_flags & ~BTRFS_SUBVOL_RDONLY);
+       default:
+               return false;
+       }
+}
+
+/*
+ * unlocked_ioctl entry point.
+ *
+ * FS_IOC_GETVERSION, FS_IOC_GETFLAGS and FS_IOC_SETFLAGS are always forwarded
+ * to the lower file. Any other command is forwarded only if it is on the
+ * ioctl whitelist and the superblock has ioctl passthrough enabled; otherwise
+ * -ENOTTY is returned.
+ */
+static long shiftfs_ioctl(struct file *file, unsigned int cmd,
+                         unsigned long arg)
+{
+       if (cmd != FS_IOC_GETVERSION && cmd != FS_IOC_GETFLAGS &&
+           cmd != FS_IOC_SETFLAGS) {
+               if (!in_ioctl_whitelist(cmd, arg))
+                       return -ENOTTY;
+
+               if (!shiftfs_passthrough_ioctls(file->f_path.dentry->d_sb->s_fs_info))
+                       return -ENOTTY;
+       }
+
+       return shiftfs_real_ioctl(file, cmd, arg);
+}
+
+/*
+ * compat_ioctl entry point.
+ *
+ * FS_IOC32_GETVERSION, FS_IOC32_GETFLAGS and FS_IOC32_SETFLAGS are always
+ * forwarded to the lower file. Any other command is forwarded only if it is
+ * on the ioctl whitelist and the superblock has ioctl passthrough enabled;
+ * otherwise -ENOIOCTLCMD is returned.
+ */
+static long shiftfs_compat_ioctl(struct file *file, unsigned int cmd,
+                                unsigned long arg)
+{
+       if (cmd != FS_IOC32_GETVERSION && cmd != FS_IOC32_GETFLAGS &&
+           cmd != FS_IOC32_SETFLAGS) {
+               if (!in_ioctl_whitelist(cmd, arg))
+                       return -ENOIOCTLCMD;
+
+               if (!shiftfs_passthrough_ioctls(file->f_path.dentry->d_sb->s_fs_info))
+                       return -ENOIOCTLCMD;
+       }
+
+       return shiftfs_real_ioctl(file, cmd, arg);
+}
+
+enum shiftfs_copyop {          /* range operation shiftfs_copyfile() carries out */
+       SHIFTFS_COPY,           /* vfs_copy_file_range() */
+       SHIFTFS_CLONE,          /* vfs_clone_file_range() */
+       SHIFTFS_DEDUPE,         /* vfs_dedupe_file_range_one() */
+};
+
+/*
+ * Common backend for copy_file_range and remap_file_range.
+ *
+ * Resolves both shiftfs files to their lower files, runs the VFS helper
+ * selected by @op on them under the credentials installed by
+ * shiftfs_override_creds(), and then refreshes the attributes of the
+ * destination shiftfs inode from its lower inode.
+ *
+ * Returns whatever the VFS helper returned, or the error from resolving one
+ * of the lower files.
+ */
+static ssize_t shiftfs_copyfile(struct file *file_in, loff_t pos_in,
+                               struct file *file_out, loff_t pos_out, u64 len,
+                               unsigned int flags, enum shiftfs_copyop op)
+{
+       struct inode *upper_out = file_inode(file_out);
+       struct inode *lower_out = upper_out->i_private;
+       struct fd lower_src, lower_dst;
+       const struct cred *saved_cred;
+       ssize_t res;
+
+       res = shiftfs_real_fdget(file_out, &lower_dst);
+       if (res)
+               return res;
+
+       res = shiftfs_real_fdget(file_in, &lower_src);
+       if (res)
+               goto out_put_dst;
+
+       saved_cred = shiftfs_override_creds(upper_out->i_sb);
+       if (op == SHIFTFS_COPY)
+               res = vfs_copy_file_range(lower_src.file, pos_in,
+                                         lower_dst.file, pos_out, len, flags);
+       else if (op == SHIFTFS_CLONE)
+               res = vfs_clone_file_range(lower_src.file, pos_in,
+                                          lower_dst.file, pos_out, len, flags);
+       else if (op == SHIFTFS_DEDUPE)
+               res = vfs_dedupe_file_range_one(lower_src.file, pos_in,
+                                               lower_dst.file, pos_out, len,
+                                               flags);
+       revert_creds(saved_cred);
+
+       /* Pick up the size change on the destination. */
+       shiftfs_copyattr(lower_out, upper_out);
+
+       fdput(lower_src);
+
+out_put_dst:
+       fdput(lower_dst);
+
+       return res;
+}
+
+/* copy_file_range hook: shiftfs_copyfile() with the SHIFTFS_COPY operation. */
+static ssize_t shiftfs_copy_file_range(struct file *file_in, loff_t pos_in,
+                                      struct file *file_out, loff_t pos_out,
+                                      size_t len, unsigned int flags)
+{
+       const enum shiftfs_copyop op = SHIFTFS_COPY;
+
+       return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
+                               op);
+}
+
+/*
+ * remap_file_range hook.
+ *
+ * Only REMAP_FILE_DEDUP and REMAP_FILE_ADVISORY are accepted in @remap_flags;
+ * anything else yields -EINVAL. REMAP_FILE_DEDUP selects a dedupe, otherwise
+ * the range is cloned. The work is done by shiftfs_copyfile().
+ */
+static loff_t shiftfs_remap_file_range(struct file *file_in, loff_t pos_in,
+                                      struct file *file_out, loff_t pos_out,
+                                      loff_t len, unsigned int remap_flags)
+{
+       const unsigned int supported = REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY;
+       enum shiftfs_copyop op;
+
+       if (remap_flags & ~supported)
+               return -EINVAL;
+
+       op = (remap_flags & REMAP_FILE_DEDUP) ? SHIFTFS_DEDUPE : SHIFTFS_CLONE;
+
+       return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len,
+                               remap_flags, op);
+}
+
+/*
+ * iterate_shared hook: run iterate_dir() on the file stored in
+ * file->private_data, under the credentials installed by
+ * shiftfs_override_creds(), and return its result.
+ */
+static int shiftfs_iterate_shared(struct file *file, struct dir_context *ctx)
+{
+       struct file *lower_dir = file->private_data;
+       const struct cred *saved_cred;
+       int res;
+
+       saved_cred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+       res = iterate_dir(lower_dir, ctx);
+       revert_creds(saved_cred);
+
+       return res;
+}
+
+/* File operations installed on regular-file inodes by shiftfs_fill_inode(). */
+const struct file_operations shiftfs_file_operations = {
+       /* lifetime */
+       .open                   = shiftfs_open,
+       .release                = shiftfs_release,
+       /* data access */
+       .llseek                 = shiftfs_file_llseek,
+       .read_iter              = shiftfs_read_iter,
+       .write_iter             = shiftfs_write_iter,
+       .mmap                   = shiftfs_mmap,
+       .fsync                  = shiftfs_fsync,
+       /* space management and hints */
+       .fallocate              = shiftfs_fallocate,
+       .fadvise                = shiftfs_fadvise,
+       /* range copy / clone / dedupe */
+       .copy_file_range        = shiftfs_copy_file_range,
+       .remap_file_range       = shiftfs_remap_file_range,
+       /* ioctl forwarding */
+       .unlocked_ioctl         = shiftfs_ioctl,
+       .compat_ioctl           = shiftfs_compat_ioctl,
+};
+
+/* File operations installed on directory inodes by shiftfs_fill_inode(). */
+const struct file_operations shiftfs_dir_operations = {
+       /* lifetime */
+       .open                   = shiftfs_dir_open,
+       .release                = shiftfs_dir_release,
+       /* directory access */
+       .llseek                 = shiftfs_dir_llseek,
+       .read                   = generic_read_dir,
+       .iterate_shared         = shiftfs_iterate_shared,
+       .fsync                  = shiftfs_fsync,
+       /* ioctl forwarding */
+       .unlocked_ioctl         = shiftfs_ioctl,
+       .compat_ioctl           = shiftfs_compat_ioctl,
+};
+
+/*
+ * Address space operations for regular files. dentry_open() checks
+ * f_mapping->a_ops->direct_IO for O_DIRECT, so a no-op implementation is
+ * provided.
+ */
+static const struct address_space_operations shiftfs_aops = {
+       .direct_IO      = noop_direct_IO,
+};
+
+/*
+ * Initialise a new shiftfs inode.
+ *
+ * @inode:  inode to set up
+ * @ino:    inode number to assign
+ * @mode:   only the file-type bits (S_IFMT) are used
+ * @dev:    device number, used for special files only
+ * @dentry: lower dentry to mirror, or NULL to skip copying from a lower inode
+ *
+ * Selects the inode/file operations matching the file type. When @dentry is
+ * given, attributes, flags and the link count are taken from its inode, and
+ * IOP_NOFOLLOW is set if that inode has no ->get_link.
+ */
+static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
+                              umode_t mode, dev_t dev, struct dentry *dentry)
+{
+       const umode_t type = mode & S_IFMT;
+       struct inode *lower;
+
+       inode->i_ino = ino;
+       inode->i_flags |= S_NOCMTIME;
+       inode->i_mode = type;
+
+       if (type == S_IFDIR) {
+               inode->i_op = &shiftfs_dir_inode_operations;
+               inode->i_fop = &shiftfs_dir_operations;
+       } else if (type == S_IFLNK) {
+               inode->i_op = &shiftfs_symlink_inode_operations;
+       } else if (type == S_IFREG) {
+               inode->i_op = &shiftfs_file_inode_operations;
+               inode->i_fop = &shiftfs_file_operations;
+               inode->i_mapping->a_ops = &shiftfs_aops;
+       } else {
+               inode->i_op = &shiftfs_special_inode_operations;
+               init_special_inode(inode, type, dev);
+       }
+
+       if (!dentry)
+               return;
+
+       lower = dentry->d_inode;
+       if (!lower->i_op->get_link)
+               inode->i_opflags |= IOP_NOFOLLOW;
+
+       shiftfs_copyattr(lower, inode);
+       shiftfs_copyflags(lower, inode);
+       set_nlink(inode, lower->i_nlink);
+}
+
+/*
+ * show_options hook: emit "mark" when the superblock is a mark mount and
+ * "passthrough=<n>" when any passthrough bit is set. Always returns 0.
+ */
+static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
+{
+       struct shiftfs_super_info *sbinfo = dentry->d_sb->s_fs_info;
+       unsigned int passthrough = sbinfo->passthrough;
+
+       if (sbinfo->mark)
+               seq_show_option(m, "mark", NULL);
+
+       if (passthrough != 0)
+               seq_printf(m, ",passthrough=%u", passthrough);
+
+       return 0;
+}
+
+/*
+ * statfs hook: report the statistics of the lower filesystem, queried through
+ * the lower root dentry stored in the shiftfs root's d_fsdata. Unless statfs
+ * passthrough is enabled, f_type is replaced by the shiftfs magic.
+ */
+static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+       struct super_block *sb = dentry->d_sb;
+       struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+       struct path lower_root = {
+               .mnt = sbinfo->mnt,
+               .dentry = sb->s_root->d_fsdata,
+       };
+       int res;
+
+       res = vfs_statfs(&lower_root, buf);
+       if (res)
+               return res;
+
+       if (shiftfs_passthrough_statfs(sbinfo))
+               return 0;
+
+       buf->f_type = sb->s_magic;
+       return 0;
+}
+
+/*
+ * evict_inode hook: clear the shiftfs inode, then drop the reference on the
+ * lower inode kept in i_private, if there is one.
+ */
+static void shiftfs_evict_inode(struct inode *inode)
+{
+       struct inode *lower = inode->i_private;
+
+       clear_inode(inode);
+
+       if (!lower)
+               return;
+
+       iput(lower);
+}
+
+/*
+ * put_super hook: release the lower mount and the creator credentials held by
+ * the superblock info, then free the info itself.
+ */
+static void shiftfs_put_super(struct super_block *sb)
+{
+       struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+
+       if (!sbinfo)
+               return;
+
+       mntput(sbinfo->mnt);
+       put_cred(sbinfo->creator_cred);
+       kfree(sbinfo);
+}
+
+/* Generic xattr handler with an empty name prefix. */
+static const struct xattr_handler shiftfs_xattr_handler = {
+       .get    = shiftfs_xattr_get,
+       .set    = shiftfs_xattr_set,
+       .prefix = "",
+};
+
+const struct xattr_handler *shiftfs_xattr_handlers[] = {       /* installed as sb->s_xattr by shiftfs_fill_super() */
+#ifdef CONFIG_SHIFT_FS_POSIX_ACL
+       &shiftfs_posix_acl_access_xattr_handler,
+       &shiftfs_posix_acl_default_xattr_handler,
+#endif
+       &shiftfs_xattr_handler, /* empty prefix; NOTE(review): presumably has to stay after the ACL handlers - confirm */
+       NULL    /* end of list */
+};
+
+/* True when every bit set in @new_flags is also set in @old_flags. */
+static inline bool passthrough_is_subset(int old_flags, int new_flags)
+{
+       return (new_flags & old_flags) == new_flags;
+}
+
+/*
+ * Validate a change of superblock flags.
+ *
+ * SB_RDONLY, SB_NOSUID, SB_NODEV, SB_NOEXEC, SB_NOATIME and SB_NODIRATIME may
+ * not be dropped once set, and SB_POSIXACL may not be added if it was not set
+ * before. Returns 0 if the change is allowed and -EPERM otherwise.
+ */
+static int shiftfs_super_check_flags(unsigned long old_flags,
+                                    unsigned long new_flags)
+{
+       const unsigned long sticky = SB_RDONLY | SB_NOSUID | SB_NODEV |
+                                    SB_NOEXEC | SB_NOATIME | SB_NODIRATIME;
+
+       /* A restricting flag that is set now but missing in the request. */
+       if (old_flags & ~new_flags & sticky)
+               return -EPERM;
+
+       /* POSIX ACL support requested although it is not enabled now. */
+       if (~old_flags & new_flags & SB_POSIXACL)
+               return -EPERM;
+
+       return 0;
+}
+
+/*
+ * remount_fs hook.
+ *
+ * @sb:    superblock being remounted
+ * @flags: requested superblock flags, checked by shiftfs_super_check_flags()
+ * @data:  mount option data, parsed by shiftfs_parse_mount_options()
+ *
+ * Mark mounts cannot be remounted and the mark option cannot be turned on.
+ * The passthrough setting may be changed, but only to a subset of the
+ * passthrough bits recorded from the mark mount (passthrough_mark).
+ *
+ * Returns 0 on success, -EPERM if the request is not permitted, or the error
+ * from option parsing.
+ */
+static int shiftfs_remount(struct super_block *sb, int *flags, char *data)
+{
+       int err;
+       struct shiftfs_super_info new = {};
+       struct shiftfs_super_info *info = sb->s_fs_info;
+
+       err = shiftfs_parse_mount_options(&new, data);
+       if (err)
+               return err;
+
+       err = shiftfs_super_check_flags(sb->s_flags, *flags);
+       if (err)
+               return err;
+
+       /* Mark mount option cannot be changed. */
+       if (info->mark || (info->mark != new.mark))
+               return -EPERM;
+
+       if (info->passthrough != new.passthrough) {
+               /*
+                * Don't allow exceeding passthrough options of mark mount.
+                * The value to validate is the requested one; the current
+                * value was already checked when it was installed.
+                */
+               if (!passthrough_is_subset(info->passthrough_mark,
+                                          new.passthrough))
+                       return -EPERM;
+
+               info->passthrough = new.passthrough;
+       }
+
+       return 0;
+}
+
+/* Superblock operations installed by shiftfs_fill_super(). */
+static const struct super_operations shiftfs_super_ops = {
+       .statfs         = shiftfs_statfs,
+       .show_options   = shiftfs_show_options,
+       .remount_fs     = shiftfs_remount,
+       .evict_inode    = shiftfs_evict_inode,
+       .put_super      = shiftfs_put_super,
+};
+
+struct shiftfs_data {          /* raw_data consumed by shiftfs_fill_super() */
+       void *data;             /* mount option data, passed to shiftfs_parse_mount_options() */
+       const char *path;       /* path looked up with kern_path(); must be a directory */
+};
+
+/*
+ * Carry restrictions of the lower superblock over to @sb: any of SB_RDONLY,
+ * SB_NOSUID, SB_NODEV, SB_NOEXEC, SB_NOATIME and SB_NODIRATIME set in
+ * @lower_flags is set on @sb as well, and SB_POSIXACL is cleared on @sb when
+ * @lower_flags does not have it.
+ */
+static void shiftfs_super_force_flags(struct super_block *sb,
+                                     unsigned long lower_flags)
+{
+       const unsigned long inherited = SB_RDONLY | SB_NOSUID | SB_NODEV |
+                                       SB_NOEXEC | SB_NOATIME | SB_NODIRATIME;
+
+       sb->s_flags |= lower_flags & inherited;
+
+       if (lower_flags & SB_POSIXACL)
+               return;
+
+       sb->s_flags &= ~SB_POSIXACL;
+}
 
 static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
                              int silent)
 {
+       int err;
+       struct path path = {};
+       struct shiftfs_super_info *sbinfo_mp;
+       char *name = NULL;
+       struct inode *inode = NULL;
+       struct dentry *dentry = NULL;
        struct shiftfs_data *data = raw_data;
-       char *name = kstrdup(data->path, GFP_KERNEL);
-       int err = -ENOMEM;
-       struct shiftfs_super_info *ssi = NULL;
-       struct path path;
-       struct dentry *dentry;
+       struct shiftfs_super_info *sbinfo = NULL;
 
-       if (!name)
-               goto out;
+       if (!data->path)
+               return -EINVAL;
 
-       ssi = kzalloc(sizeof(*ssi), GFP_KERNEL);
-       if (!ssi)
-               goto out;
+       sb->s_fs_info = kzalloc(sizeof(*sbinfo), GFP_KERNEL); /* stored in sb right away; the error returns below do not free it here */
+       if (!sb->s_fs_info)
+               return -ENOMEM;
+       sbinfo = sb->s_fs_info;
 
-       err = -EPERM;
-       err = shiftfs_parse_options(ssi, data->data);
+       err = shiftfs_parse_mount_options(sbinfo, data->data);
        if (err)
-               goto out;
+               return err;
 
-       /* to mark a mount point, must be real root */
-       if (ssi->mark && !capable(CAP_SYS_ADMIN))
-               goto out;
+       /* to mount a mark, must be userns admin */
+       if (!sbinfo->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+               return -EPERM;
 
-       /* else to mount a mark, must be userns admin */
-       if (!ssi->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
-               goto out;
+       name = kstrdup(data->path, GFP_KERNEL);
+       if (!name)
+               return -ENOMEM;
 
        err = kern_path(name, LOOKUP_FOLLOW, &path);
        if (err)
-               goto out;
-
-       err = -EPERM;
+               goto out_free_name;
 
        if (!S_ISDIR(path.dentry->d_inode->i_mode)) {
                err = -ENOTDIR;
-               goto out_put;
+               goto out_put_path;
        }
 
-       sb->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
-       if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
-               printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
-               err = -EINVAL;
-               goto out_put;
-       }
+       sb->s_flags |= SB_POSIXACL; /* cleared again by shiftfs_super_force_flags() if the lower sb lacks it */
+
+       if (sbinfo->mark) {
+               struct cred *cred_tmp;
+               struct super_block *lower_sb = path.mnt->mnt_sb;
+
+               /* to mark a mount point, must be root wrt lower s_user_ns */
+               if (!ns_capable(lower_sb->s_user_ns, CAP_SYS_ADMIN)) {
+                       err = -EPERM;
+                       goto out_put_path;
+               }
 
-       if (ssi->mark) {
                /*
                 * this part is visible unshifted, so make sure no
                 * executables that could be used to give suid
                 * privileges
                 */
                sb->s_iflags = SB_I_NOEXEC;
-               ssi->mnt = path.mnt;
-               dentry = path.dentry;
-       } else {
-               struct shiftfs_super_info *mp_ssi;
+
+               shiftfs_super_force_flags(sb, lower_sb->s_flags);
 
                /*
-                * this leg executes if we're admin capable in
-                * the namespace, so be very careful
+                * Handle nesting of shiftfs mounts by referring this mark
+                * mount back to the original mark mount. This is more
+                * efficient and alleviates concerns about stack depth.
                 */
+               if (lower_sb->s_magic == SHIFTFS_MAGIC) {
+                       sbinfo_mp = lower_sb->s_fs_info;
+
+                       /* Doesn't make sense to mark a mark mount */
+                       if (sbinfo_mp->mark) {
+                               err = -EINVAL;
+                               goto out_put_path;
+                       }
+
+                       if (!passthrough_is_subset(sbinfo_mp->passthrough,
+                                                  sbinfo->passthrough)) {
+                               err = -EPERM;
+                               goto out_put_path;
+                       }
+
+                       sbinfo->mnt = mntget(sbinfo_mp->mnt);
+                       dentry = dget(path.dentry->d_fsdata);
+                       /*
+                        * Copy up the passthrough mount options from the
+                        * parent mark mountpoint.
+                        */
+                       sbinfo->passthrough_mark = sbinfo_mp->passthrough_mark;
+                       sbinfo->creator_cred = get_cred(sbinfo_mp->creator_cred);
+               } else {
+                       sbinfo->mnt = mntget(path.mnt);
+                       dentry = dget(path.dentry);
+                       /*
+                        * For a new mark passthrough_mark and passthrough
+                        * are identical.
+                        */
+                       sbinfo->passthrough_mark = sbinfo->passthrough;
+
+                       cred_tmp = prepare_creds();
+                       if (!cred_tmp) {
+                               err = -ENOMEM;
+                               goto out_put_path;
+                       }
+                       /* Don't override disk quota limits or use reserved space. */
+                       cap_lower(cred_tmp->cap_effective, CAP_SYS_RESOURCE);
+                       sbinfo->creator_cred = cred_tmp;
+               }
+       } else {
+               /*
+                * This leg executes if we're admin capable in the namespace,
+                * so be very careful.
+                */
+               err = -EPERM;
                if (path.dentry->d_sb->s_magic != SHIFTFS_MAGIC)
-                       goto out_put;
-               mp_ssi = path.dentry->d_sb->s_fs_info;
-               if (!mp_ssi->mark)
-                       goto out_put;
-               ssi->mnt = mntget(mp_ssi->mnt);
+                       goto out_put_path;
+
+               sbinfo_mp = path.dentry->d_sb->s_fs_info;
+               if (!sbinfo_mp->mark)
+                       goto out_put_path;
+
+               if (!passthrough_is_subset(sbinfo_mp->passthrough,
+                                          sbinfo->passthrough))
+                       goto out_put_path;
+
+               sbinfo->mnt = mntget(sbinfo_mp->mnt);
+               sbinfo->creator_cred = get_cred(sbinfo_mp->creator_cred);
                dentry = dget(path.dentry->d_fsdata);
-               path_put(&path);
+               /*
+                * Copy up passthrough settings from mark mountpoint so we can
+                * verify when the overlay wants to remount with different
+                * passthrough settings.
+                */
+               sbinfo->passthrough_mark = sbinfo_mp->passthrough;
+               shiftfs_super_force_flags(sb, path.mnt->mnt_sb->s_flags);
        }
-       ssi->userns = get_user_ns(dentry->d_sb->s_user_ns);
-       sb->s_fs_info = ssi;
+
+       sb->s_stack_depth = dentry->d_sb->s_stack_depth + 1;
+       if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+               printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
+               err = -EINVAL;
+               goto out_put_path;
+       }
+
+       inode = new_inode(sb);
+       if (!inode) {
+               err = -ENOMEM;
+               goto out_put_path;
+       }
+       shiftfs_fill_inode(inode, dentry->d_inode->i_ino, S_IFDIR, 0, dentry);
+
+       ihold(dentry->d_inode); /* paired with the iput() of i_private in shiftfs_evict_inode() */
+       inode->i_private = dentry->d_inode;
+
        sb->s_magic = SHIFTFS_MAGIC;
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_op = &shiftfs_super_ops;
        sb->s_xattr = shiftfs_xattr_handlers;
        sb->s_d_op = &shiftfs_dentry_ops;
-       sb->s_root = d_make_root(shiftfs_new_inode(sb, S_IFDIR, dentry));
+       sb->s_root = d_make_root(inode);
+       if (!sb->s_root) {
+               err = -ENOMEM;
+               goto out_put_path;
+       }
+
        sb->s_root->d_fsdata = dentry;
+       sbinfo->userns = get_user_ns(dentry->d_sb->s_user_ns);
+       shiftfs_copyattr(dentry->d_inode, sb->s_root->d_inode);
 
-       return 0;
+       dentry = NULL; /* reference now lives in s_root->d_fsdata; the dput() below gets NULL */
+       err = 0; /* success runs through the same cleanup labels */
 
- out_put:
+out_put_path:
        path_put(&path);
- out:
+
+out_free_name:
        kfree(name);
-       kfree(ssi);
+
+       dput(dentry);
+
        return err;
 }
 
@@ -774,7 +2149,9 @@ static void __exit shiftfs_exit(void)
 
 MODULE_ALIAS_FS("shiftfs");
 MODULE_AUTHOR("James Bottomley");
-MODULE_DESCRIPTION("uid/gid shifting bind filesystem");
+MODULE_AUTHOR("Seth Forshee <seth.forshee@canonical.com>");
+MODULE_AUTHOR("Christian Brauner <christian.brauner@ubuntu.com>");
+MODULE_DESCRIPTION("id shifting filesystem");
 MODULE_LICENSE("GPL v2");
 module_init(shiftfs_init)
 module_exit(shiftfs_exit)