cstyle: Resolve C style issues

[mirror_zfs.git] / module / zfs / zfs_ctldir.c
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c

index 0da0261006360d25246aa1133ff218c78ce5a19c..96520545a2804d12f187db0e5f9749917de10abb 100644 (file)
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -19,23 +19,30 @@
   * CDDL HEADER END
   */
  /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <rohan.puri15@gmail.com>
+ *   Brian Behlendorf <behlendorf1@llnl.gov>
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  /*
   * ZFS control directory (a.k.a. ".zfs")
   *
   * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' directory, but this may expand in the
- * future.  The elements are built using the GFS primitives, as the hierarchy
+ * Currently, only the 'snapshot' and 'shares' directories exist, but this may
+ * expand in the future.  The elements are built dynamically, as the hierarchy
   * does not actually exist on disk.
   *
   * For 'snapshot', we don't want to have all snapshots always mounted, because
   * this would take up a huge amount of space in /etc/mnttab.  We have three
   * types of objects:
   *
- *     ctldir ------> snapshotdir -------> snapshot
+ *     ctldir ------> snapshotdir -------> snapshot
   *                                             |
   *                                             |
   *                                             V
@@ -44,59 +51,86 @@
   * The 'snapshot' node contains just enough information to lookup '..' and act
   * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
   * perform an automount of the underlying filesystem and return the
- * corresponding vnode.
+ * corresponding inode.
   *
- * All mounts are handled automatically by the kernel, but unmounts are
- * (currently) handled from user land.  The main reason is that there is no
- * reliable way to auto-unmount the filesystem when it's "no longer in use".
- * When the user unmounts a filesystem, we call zfsctl_unmount(), which
- * unmounts any snapshots within the snapshot directory.
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure.  Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
   *
   * The '.zfs', '.zfs/snapshot', and all directories created under
- * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
- * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfs_sb_t as the head filesystem (what '.zfs' lives under).
   *
- * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
- * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
- * However, vnodes within these mounted on file systems have their v_vfsp
- * fields set to the head filesystem to make NFS happy (see
- * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
- * so that it cannot be freed until all snapshots have been unmounted.
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfs_sb_t.  However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfs_sb_t to make NFS happy.
   */
  
-#include <fs/fs_subr.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
  #include <sys/zfs_ctldir.h>
  #include <sys/zfs_ioctl.h>
  #include <sys/zfs_vfsops.h>
-#include <sys/vfs_opreg.h>
-#include <sys/gfs.h>
+#include <sys/zfs_vnops.h>
  #include <sys/stat.h>
  #include <sys/dmu.h>
+#include <sys/dsl_destroy.h>
  #include <sys/dsl_deleg.h>
  #include <sys/mount.h>
-#include <sys/sunddi.h>
-
+#include <sys/zpl.h>
  #include "zfs_namecheck.h"
  
-typedef struct zfsctl_node {
-       gfs_dir_t       zc_gfs_private;
-       uint64_t        zc_id;
-       timestruc_t     zc_cmtime;      /* ctime and mtime, always the same */
-} zfsctl_node_t;
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
  
-typedef struct zfsctl_snapdir {
-       zfsctl_node_t   sd_node;
-       kmutex_t        sd_lock;
-       avl_tree_t      sd_snaps;
-} zfsctl_snapdir_t;
+/*
+ * Dedicated task queue for unmounting snapshots.
+ */
+static taskq_t *zfs_expire_taskq;
  
-typedef struct {
-       char            *se_name;
-       vnode_t         *se_root;
-       avl_node_t      se_node;
-} zfs_snapentry_t;
+static zfs_snapentry_t *
+zfsctl_sep_alloc(void)
+{
+       return (kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP));
+}
  
-static int
+void
+zfsctl_sep_free(zfs_snapentry_t *sep)
+{
+       kmem_free(sep->se_name, MAXNAMELEN);
+       kmem_free(sep->se_path, PATH_MAX);
+       kmem_free(sep, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Attempt to expire an automounted snapshot; unmounts are attempted every
+ * 'zfs_expire_snapshot' seconds until they succeed.  The work request is
+ * responsible for rescheduling itself and freeing the zfs_expire_snapshot_t.
+ */
+static void
+zfsctl_expire_snapshot(void *data)
+{
+       zfs_snapentry_t *sep = (zfs_snapentry_t *)data;
+       zfs_sb_t *zsb = ITOZSB(sep->se_inode);
+       int error;
+
+       error = zfsctl_unmount_snapshot(zsb, sep->se_name, MNT_EXPIRE);
+       if (error == EBUSY)
+               sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
+                   zfsctl_expire_snapshot, sep, TQ_SLEEP,
+                   ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+}
+
+int
  snapentry_compare(const void *a, const void *b)
  {
         const zfs_snapentry_t *sa = a;
@@ -111,226 +145,193 @@ snapentry_compare(const void *a, const void *b)
                 return (0);
  }
  
-vnodeops_t *zfsctl_ops_root;
-vnodeops_t *zfsctl_ops_snapdir;
-vnodeops_t *zfsctl_ops_snapshot;
-
-static const fs_operation_def_t zfsctl_tops_root[];
-static const fs_operation_def_t zfsctl_tops_snapdir[];
-static const fs_operation_def_t zfsctl_tops_snapshot[];
-
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
-
-static gfs_opsvec_t zfsctl_opsvec[] = {
-       { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
-       { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
-       { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
-       { NULL }
-};
-
-/*
- * Root directory elements.  We have only a single static entry, 'snapshot'.
- */
-static gfs_dirent_t zfsctl_root_entries[] = {
-       { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
-       { NULL }
-};
-
-/* include . and .. in the calculation */
-#define        NROOT_ENTRIES   ((sizeof (zfsctl_root_entries) / \
-    sizeof (gfs_dirent_t)) + 1)
-
-
-/*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories.  This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
- */
-void
-zfsctl_init(void)
+boolean_t
+zfsctl_is_node(struct inode *ip)
  {
-       VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+       return (ITOZ(ip)->z_is_ctldir);
  }
  
-void
-zfsctl_fini(void)
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
  {
-       /*
-        * Remove vfsctl vnode ops
-        */
-       if (zfsctl_ops_root)
-               vn_freevnodeops(zfsctl_ops_root);
-       if (zfsctl_ops_snapdir)
-               vn_freevnodeops(zfsctl_ops_snapdir);
-       if (zfsctl_ops_snapshot)
-               vn_freevnodeops(zfsctl_ops_snapshot);
-
-       zfsctl_ops_root = NULL;
-       zfsctl_ops_snapdir = NULL;
-       zfsctl_ops_snapshot = NULL;
+       return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
  }
  
  /*
- * Return the inode number associated with the 'snapshot' directory.
+ * Allocate a new inode with the passed id and ops.
   */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
+static struct inode *
+zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
  {
-       ASSERT(index == 0);
-       return (ZFSCTL_INO_SNAPDIR);
+       struct timespec now = current_fs_time(zsb->z_sb);
+       struct inode *ip;
+       znode_t *zp;
+
+       ip = new_inode(zsb->z_sb);
+       if (ip == NULL)
+               return (NULL);
+
+       zp = ITOZ(ip);
+       ASSERT3P(zp->z_dirlocks, ==, NULL);
+       ASSERT3P(zp->z_acl_cached, ==, NULL);
+       ASSERT3P(zp->z_xattr_cached, ==, NULL);
+       zp->z_id = id;
+       zp->z_unlinked = 0;
+       zp->z_atime_dirty = 0;
+       zp->z_zn_prefetch = 0;
+       zp->z_moved = 0;
+       zp->z_sa_hdl = NULL;
+       zp->z_blksz = 0;
+       zp->z_seq = 0;
+       zp->z_mapcnt = 0;
+       zp->z_gen = 0;
+       zp->z_size = 0;
+       zp->z_atime[0] = 0;
+       zp->z_atime[1] = 0;
+       zp->z_links = 0;
+       zp->z_pflags = 0;
+       zp->z_uid = 0;
+       zp->z_gid = 0;
+       zp->z_mode = 0;
+       zp->z_sync_cnt = 0;
+       zp->z_is_zvol = B_FALSE;
+       zp->z_is_mapped = B_FALSE;
+       zp->z_is_ctldir = B_TRUE;
+       zp->z_is_sa = B_FALSE;
+       zp->z_is_stale = B_FALSE;
+       ip->i_ino = id;
+       ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO);
+       ip->i_uid = SUID_TO_KUID(0);
+       ip->i_gid = SGID_TO_KGID(0);
+       ip->i_blkbits = SPA_MINBLOCKSHIFT;
+       ip->i_atime = now;
+       ip->i_mtime = now;
+       ip->i_ctime = now;
+       ip->i_fop = fops;
+       ip->i_op = ops;
+
+       if (insert_inode_locked(ip)) {
+               unlock_new_inode(ip);
+               iput(ip);
+               return (NULL);
+       }
+
+       mutex_enter(&zsb->z_znodes_lock);
+       list_insert_tail(&zsb->z_all_znodes, zp);
+       zsb->z_nr_znodes++;
+       membar_producer();
+       mutex_exit(&zsb->z_znodes_lock);
+
+       unlock_new_inode(ip);
+
+       return (ip);
  }
  
  /*
- * Create the '.zfs' directory.  This directory is cached as part of the VFS
- * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1.  This reference
- * is removed when the ctldir is destroyed in the unmount.
+ * Look up the inode with the given id; it will be allocated if needed.
   */
-void
-zfsctl_create(zfsvfs_t *zfsvfs)
+static struct inode *
+zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
  {
-       vnode_t *vp, *rvp;
-       zfsctl_node_t *zcp;
-
-       ASSERT(zfsvfs->z_ctldir == NULL);
+       struct inode *ip = NULL;
  
-       vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
-           zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
-           zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
-       zcp = vp->v_data;
-       zcp->zc_id = ZFSCTL_INO_ROOT;
-
-       VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
-       ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
-       VN_RELE(rvp);
+       while (ip == NULL) {
+               ip = ilookup(zsb->z_sb, (unsigned long)id);
+               if (ip)
+                       break;
  
-       /*
-        * We're only faking the fact that we have a root of a filesystem for
-        * the sake of the GFS interfaces.  Undo the flag manipulation it did
-        * for us.
-        */
-       vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
+               /* May fail due to concurrent zfsctl_inode_alloc() */
+               ip = zfsctl_inode_alloc(zsb, id, fops, ops);
+       }
  
-       zfsvfs->z_ctldir = vp;
+       return (ip);
  }
  
  /*
- * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
+ * Free zfsctl inode specific structures, currently there are none.
   */
  void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
+zfsctl_inode_destroy(struct inode *ip)
  {
-       VN_RELE(zfsvfs->z_ctldir);
-       zfsvfs->z_ctldir = NULL;
  }
  
  /*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
+ * An inode is being evicted from the cache.
   */
-vnode_t *
-zfsctl_root(znode_t *zp)
+void
+zfsctl_inode_inactive(struct inode *ip)
  {
-       ASSERT(zfs_has_ctldir(zp));
-       VN_HOLD(zp->z_zfsvfs->z_ctldir);
-       return (zp->z_zfsvfs->z_ctldir);
+       if (zfsctl_is_snapdir(ip))
+               zfsctl_snapdir_inactive(ip);
  }
  
  /*
- * Common open routine.  Disallow any write access.
+ * Create the '.zfs' directory.  This directory is cached as part of the VFS
+ * structure.  This results in a hold on the zfs_sb_t.  The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1.  This reference
+ * is removed when the ctldir is destroyed in the unmount.  All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ *
+ * Because the dynamically created '.zfs' directory entries assume the use
+ * of 64-bit inode numbers this support must be disabled on 32-bit systems.
   */
-/* ARGSUSED */
-static int
-zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
+int
+zfsctl_create(zfs_sb_t *zsb)
  {
-       if (flags & FWRITE)
-               return (EACCES);
+#if defined(CONFIG_64BIT)
+       ASSERT(zsb->z_ctldir == NULL);
  
-       return (0);
-}
+       zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT,
+           &zpl_fops_root, &zpl_ops_root);
+       if (zsb->z_ctldir == NULL)
+               return (SET_ERROR(ENOENT));
  
-/*
- * Common close routine.  Nothing to do here.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
-    cred_t *cr, caller_context_t *ct)
-{
         return (0);
+#else
+       return (SET_ERROR(EOPNOTSUPP));
+#endif /* CONFIG_64BIT */
  }
  
  /*
- * Common access routine.  Disallow writes.
+ * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
   */
-/* ARGSUSED */
-static int
-zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
-    caller_context_t *ct)
+void
+zfsctl_destroy(zfs_sb_t *zsb)
  {
-       if (flags & V_ACE_MASK) {
-               if (mode & ACE_ALL_WRITE_PERMS)
-                       return (EACCES);
-       } else {
-               if (mode & VWRITE)
-                       return (EACCES);
-       }
-
-       return (0);
+       iput(zsb->z_ctldir);
+       zsb->z_ctldir = NULL;
  }
  
  /*
- * Common getattr function.  Fill in basic information.
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the inode and return it.
   */
-static void
-zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+struct inode *
+zfsctl_root(znode_t *zp)
  {
-       zfsctl_node_t   *zcp = vp->v_data;
-       timestruc_t     now;
-
-       vap->va_uid = 0;
-       vap->va_gid = 0;
-       vap->va_rdev = 0;
-       /*
-        * We are a purly virtual object, so we have no
-        * blocksize or allocated blocks.
-        */
-       vap->va_blksize = 0;
-       vap->va_nblocks = 0;
-       vap->va_seq = 0;
-       vap->va_fsid = vp->v_vfsp->vfs_dev;
-       vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
-           S_IROTH | S_IXOTH;
-       vap->va_type = VDIR;
-       /*
-        * We live in the now (for atime).
-        */
-       gethrestime(&now);
-       vap->va_atime = now;
-       vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+       ASSERT(zfs_has_ctldir(zp));
+       igrab(ZTOZSB(zp)->z_ctldir);
+       return (ZTOZSB(zp)->z_ctldir);
  }
  
  /*ARGSUSED*/
-static int
-zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+int
+zfsctl_fid(struct inode *ip, fid_t *fidp)
  {
-       zfsvfs_t        *zfsvfs = vp->v_vfsp->vfs_data;
-       zfsctl_node_t   *zcp = vp->v_data;
-       uint64_t        object = zcp->zc_id;
+       znode_t         *zp = ITOZ(ip);
+       zfs_sb_t        *zsb = ITOZSB(ip);
+       uint64_t        object = zp->z_id;
         zfid_short_t    *zfid;
         int             i;
  
-       ZFS_ENTER(zfsvfs);
+       ZFS_ENTER(zsb);
  
         if (fidp->fid_len < SHORT_FID_LEN) {
                 fidp->fid_len = SHORT_FID_LEN;
-               ZFS_EXIT(zfsvfs);
-               return (ENOSPC);
+               ZFS_EXIT(zsb);
+               return (SET_ERROR(ENOSPC));
         }
  
         zfid = (zfid_short_t *)fidp;
@@ -344,836 +345,683 @@ zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
         for (i = 0; i < sizeof (zfid->zf_gen); i++)
                 zfid->zf_gen[i] = 0;
  
-       ZFS_EXIT(zfsvfs);
+       ZFS_EXIT(zsb);
         return (0);
  }
  
-/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem.  We use the following scheme:
- *
- *     ENTRY                   ZFSCTL_INODE
- *     .zfs                    1
- *     .zfs/snapshot           2
- *     .zfs/snapshot/<snap>    objectid(snap)
- */
-
-#define        ZFSCTL_INO_SNAP(id)     (id)
-
-/*
- * Get root directory attributes.
- */
-/* ARGSUSED */
  static int
-zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
+zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
  {
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+       objset_t *os = ITOZSB(ip)->z_os;
  
-       ZFS_ENTER(zfsvfs);
-       vap->va_nodeid = ZFSCTL_INO_ROOT;
-       vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+       if (snapshot_namecheck(name, NULL, NULL) != 0)
+               return (SET_ERROR(EILSEQ));
  
-       zfsctl_common_getattr(vp, vap);
-       ZFS_EXIT(zfsvfs);
+       dmu_objset_name(os, zname);
+       if ((strlen(zname) + 1 + strlen(name)) >= len)
+               return (SET_ERROR(ENAMETOOLONG));
+
+       (void) strcat(zname, "@");
+       (void) strcat(zname, name);
  
         return (0);
  }
  
  /*
- * Special case the handling of "..".
+ * Gets the full dataset name that corresponds to the given snapshot name
+ * Example:
+ *     zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
   */
-/* ARGSUSED */
-int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
+static int
+zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
  {
-       zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-       int err;
+       char *path_buffer, *path_ptr;
+       int path_len, error = 0;
  
-       /*
-        * No extended attributes allowed under .zfs
-        */
-       if (flags & LOOKUP_XATTR)
-               return (EINVAL);
+       path_buffer = kmem_alloc(len, KM_SLEEP);
  
-       ZFS_ENTER(zfsvfs);
+       path_ptr = d_path(path, path_buffer, len);
+       if (IS_ERR(path_ptr)) {
+               error = -PTR_ERR(path_ptr);
+               goto out;
+       }
  
-       if (strcmp(nm, "..") == 0) {
-               err = VFS_ROOT(dvp->v_vfsp, vpp);
-       } else {
-               err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
-                   cr, ct, direntflags, realpnp);
+       path_len = path_buffer + len - 1 - path_ptr;
+       if (path_len > len) {
+               error = SET_ERROR(EFAULT);
+               goto out;
         }
  
-       ZFS_EXIT(zfsvfs);
+       memcpy(zpath, path_ptr, path_len);
+       zpath[path_len] = '\0';
+out:
+       kmem_free(path_buffer, len);
  
-       return (err);
+       return (error);
  }
  
-static int
-zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
-    caller_context_t *ct)
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
  {
-       /*
-        * We only care about ACL_ENABLED so that libsec can
-        * display ACL correctly and not default to POSIX draft.
-        */
-       if (cmd == _PC_ACL_ENABLED) {
-               *valp = _ACL_ACE_ENABLED;
-               return (0);
+       zfs_sb_t *zsb = ITOZSB(dip);
+       int error = 0;
+
+       ZFS_ENTER(zsb);
+
+       if (strcmp(name, "..") == 0) {
+               *ipp = dip->i_sb->s_root->d_inode;
+       } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+               *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIR,
+                   &zpl_fops_snapdir, &zpl_ops_snapdir);
+       } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+               *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SHARES,
+                   &zpl_fops_shares, &zpl_ops_shares);
+       } else {
+               *ipp = NULL;
         }
  
-       return (fs_pathconf(vp, cmd, valp, cr, ct));
-}
+       if (*ipp == NULL)
+               error = SET_ERROR(ENOENT);
  
-static const fs_operation_def_t zfsctl_tops_root[] = {
-       { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
-       { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
-       { VOPNAME_IOCTL,        { .error = fs_inval }                   },
-       { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_root_getattr }  },
-       { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
-       { VOPNAME_READDIR,      { .vop_readdir = gfs_vop_readdir }      },
-       { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_root_lookup }    },
-       { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
-       { VOPNAME_INACTIVE,     { .vop_inactive = gfs_vop_inactive }    },
-       { VOPNAME_PATHCONF,     { .vop_pathconf = zfsctl_pathconf }     },
-       { VOPNAME_FID,          { .vop_fid = zfsctl_common_fid  }       },
-       { NULL }
-};
+       ZFS_EXIT(zsb);
  
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
-       objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
-       if (snapshot_namecheck(name, NULL, NULL) != 0)
-               return (EILSEQ);
-       dmu_objset_name(os, zname);
-       if (strlen(zname) + 1 + strlen(name) >= len)
-               return (ENAMETOOLONG);
-       (void) strcat(zname, "@");
-       (void) strcat(zname, name);
-       return (0);
+       return (error);
  }
  
-static int
-zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
+/*
+ * Lookup entry point for the 'snapshot' directory.  Try to open the
+ * snapshot if it exists, creating the pseudo filesystem inode as necessary.
+ * Perform a mount of the associated dataset on top of the inode.
+ */
+/* ARGSUSED */
+int
+zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
  {
-       vnode_t *svp = sep->se_root;
+       zfs_sb_t *zsb = ITOZSB(dip);
+       uint64_t id;
         int error;
  
-       ASSERT(vn_ismntpt(svp));
-
-       /* this will be dropped by dounmount() */
-       if ((error = vn_vfswlock(svp)) != 0)
-               return (error);
+       ZFS_ENTER(zsb);
  
-       VN_HOLD(svp);
-       error = dounmount(vn_mountedvfs(svp), fflags, cr);
+       error = dmu_snapshot_lookup(zsb->z_os, name, &id);
         if (error) {
-               VN_RELE(svp);
+               ZFS_EXIT(zsb);
                 return (error);
         }
-       VFS_RELE(svp->v_vfsp);
-       /*
-        * We can't use VN_RELE(), as that will try to invoke
-        * zfsctl_snapdir_inactive(), which would cause us to destroy
-        * the sd_lock mutex held by our caller.
-        */
-       ASSERT(svp->v_count == 1);
-       gfs_vop_inactive(svp, cr, NULL);
  
-       kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-       kmem_free(sep, sizeof (zfs_snapentry_t));
+       *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id,
+           &simple_dir_operations, &simple_dir_inode_operations);
+       if (*ipp) {
+#ifdef HAVE_AUTOMOUNT
+               (*ipp)->i_flags |= S_AUTOMOUNT;
+#endif /* HAVE_AUTOMOUNT */
+       } else {
+               error = SET_ERROR(ENOENT);
+       }
  
-       return (0);
+       ZFS_EXIT(zsb);
+
+       return (error);
  }
  
  static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name)
  {
         avl_index_t where;
-       vfs_t *vfsp;
-       refstr_t *pathref;
-       char newpath[MAXNAMELEN];
-       char *tail;
  
-       ASSERT(MUTEX_HELD(&sdp->sd_lock));
+       ASSERT(MUTEX_HELD(&zsb->z_ctldir_lock));
         ASSERT(sep != NULL);
  
-       vfsp = vn_mountedvfs(sep->se_root);
-       ASSERT(vfsp != NULL);
-
-       vfs_lock_wait(vfsp);
-
         /*
          * Change the name in the AVL tree.
          */
-       avl_remove(&sdp->sd_snaps, sep);
-       kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-       sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-       (void) strcpy(sep->se_name, nm);
-       VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
-       avl_insert(&sdp->sd_snaps, sep, where);
-
-       /*
-        * Change the current mountpoint info:
-        *      - update the tail of the mntpoint path
-        *      - update the tail of the resource path
-        */
-       pathref = vfs_getmntpoint(vfsp);
-       (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-       VERIFY((tail = strrchr(newpath, '/')) != NULL);
-       *(tail+1) = '\0';
-       ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-       (void) strcat(newpath, nm);
-       refstr_rele(pathref);
-       vfs_setmntpoint(vfsp, newpath);
-
-       pathref = vfs_getresource(vfsp);
-       (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-       VERIFY((tail = strrchr(newpath, '@')) != NULL);
-       *(tail+1) = '\0';
-       ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-       (void) strcat(newpath, nm);
-       refstr_rele(pathref);
-       vfs_setresource(vfsp, newpath);
-
-       vfs_unlock(vfsp);
+       avl_remove(&zsb->z_ctldir_snaps, sep);
+       (void) strcpy(sep->se_name, name);
+       VERIFY(avl_find(&zsb->z_ctldir_snaps, sep, &where) == NULL);
+       avl_insert(&zsb->z_ctldir_snaps, sep, where);
  }
  
+/*
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name.  The rename is confined
+ * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
+ */
  /*ARGSUSED*/
-static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
-    cred_t *cr, caller_context_t *ct, int flags)
+int
+zfsctl_snapdir_rename(struct inode *sdip, char *snm,
+    struct inode *tdip, char *tnm, cred_t *cr, int flags)
  {
-       zfsctl_snapdir_t *sdp = sdvp->v_data;
+       zfs_sb_t *zsb = ITOZSB(sdip);
         zfs_snapentry_t search, *sep;
-       zfsvfs_t *zfsvfs;
         avl_index_t where;
-       char from[MAXNAMELEN], to[MAXNAMELEN];
-       char real[MAXNAMELEN];
-       int err;
+       char *to, *from, *real, *fsname;
+       int error;
+
+       ZFS_ENTER(zsb);
  
-       zfsvfs = sdvp->v_vfsp->vfs_data;
-       ZFS_ENTER(zfsvfs);
+       to = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+       from = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+       real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+       fsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
  
-       if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-               err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+       if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
+               error = dmu_snapshot_realname(zsb->z_os, snm, real,
                     MAXNAMELEN, NULL);
-               if (err == 0) {
+               if (error == 0) {
                         snm = real;
-               } else if (err != ENOTSUP) {
-                       ZFS_EXIT(zfsvfs);
-                       return (err);
+               } else if (error != ENOTSUP) {
+                       goto out;
                 }
         }
  
-       ZFS_EXIT(zfsvfs);
+       dmu_objset_name(zsb->z_os, fsname);
  
-       err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
-       if (!err)
-               err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
-       if (!err)
-               err = zfs_secpolicy_rename_perms(from, to, cr);
-       if (err)
-               return (err);
+       error = zfsctl_snapshot_zname(sdip, snm, MAXNAMELEN, from);
+       if (error == 0)
+               error = zfsctl_snapshot_zname(tdip, tnm, MAXNAMELEN, to);
+       if (error == 0)
+               error = zfs_secpolicy_rename_perms(from, to, cr);
+       if (error != 0)
+               goto out;
  
         /*
          * Cannot move snapshots out of the snapdir.
          */
-       if (sdvp != tdvp)
-               return (EINVAL);
+       if (sdip != tdip) {
+               error = SET_ERROR(EINVAL);
+               goto out;
+       }
+
+       /*
+        * No-op when names are identical.
+        */
+       if (strcmp(snm, tnm) == 0) {
+               error = 0;
+               goto out;
+       }
  
-       if (strcmp(snm, tnm) == 0)
-               return (0);
+       mutex_enter(&zsb->z_ctldir_lock);
  
-       mutex_enter(&sdp->sd_lock);
+       error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
+       if (error)
+               goto out_unlock;
  
         search.se_name = (char *)snm;
-       if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
-               mutex_exit(&sdp->sd_lock);
-               return (ENOENT);
-       }
+       sep = avl_find(&zsb->z_ctldir_snaps, &search, &where);
+       if (sep)
+               zfsctl_rename_snap(zsb, sep, tnm);
  
-       err = dmu_objset_rename(from, to, B_FALSE);
-       if (err == 0)
-               zfsctl_rename_snap(sdp, sep, tnm);
+out_unlock:
+       mutex_exit(&zsb->z_ctldir_lock);
+out:
+       kmem_free(from, MAXNAMELEN);
+       kmem_free(to, MAXNAMELEN);
+       kmem_free(real, MAXNAMELEN);
+       kmem_free(fsname, MAXNAMELEN);
  
-       mutex_exit(&sdp->sd_lock);
+       ZFS_EXIT(zsb);
  
-       return (err);
+       return (error);
  }
  
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
  /* ARGSUSED */
-static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
+int
+zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
  {
-       zfsctl_snapdir_t *sdp = dvp->v_data;
-       zfs_snapentry_t *sep;
-       zfs_snapentry_t search;
-       zfsvfs_t *zfsvfs;
-       char snapname[MAXNAMELEN];
-       char real[MAXNAMELEN];
-       int err;
+       zfs_sb_t *zsb = ITOZSB(dip);
+       char *snapname, *real;
+       int error;
  
-       zfsvfs = dvp->v_vfsp->vfs_data;
-       ZFS_ENTER(zfsvfs);
+       ZFS_ENTER(zsb);
  
-       if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+       snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+       real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
  
-               err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+       if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
+               error = dmu_snapshot_realname(zsb->z_os, name, real,
                     MAXNAMELEN, NULL);
-               if (err == 0) {
+               if (error == 0) {
                         name = real;
-               } else if (err != ENOTSUP) {
-                       ZFS_EXIT(zfsvfs);
-                       return (err);
+               } else if (error != ENOTSUP) {
+                       goto out;
                 }
         }
  
-       ZFS_EXIT(zfsvfs);
-
-       err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
-       if (!err)
-               err = zfs_secpolicy_destroy_perms(snapname, cr);
-       if (err)
-               return (err);
-
-       mutex_enter(&sdp->sd_lock);
+       error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname);
+       if (error == 0)
+               error = zfs_secpolicy_destroy_perms(snapname, cr);
+       if (error != 0)
+               goto out;
  
-       search.se_name = name;
-       sep = avl_find(&sdp->sd_snaps, &search, NULL);
-       if (sep) {
-               avl_remove(&sdp->sd_snaps, sep);
-               err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
-               if (err)
-                       avl_add(&sdp->sd_snaps, sep);
-               else
-                       err = dmu_objset_destroy(snapname);
-       } else {
-               err = ENOENT;
-       }
+       error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE);
+       if ((error == 0) || (error == ENOENT))
+               error = dsl_destroy_snapshot(snapname, B_FALSE);
+out:
+       kmem_free(snapname, MAXNAMELEN);
+       kmem_free(real, MAXNAMELEN);
  
-       mutex_exit(&sdp->sd_lock);
+       ZFS_EXIT(zsb);
  
-       return (err);
+       return (error);
  }
  
  /*
- * This creates a snapshot under '.zfs/snapshot'.
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
   */
  /* ARGSUSED */
-static int
-zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
-    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
+int
+zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
+       struct inode **ipp, cred_t *cr, int flags)
  {
-       zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-       char name[MAXNAMELEN];
-       int err;
-       static enum symfollow follow = NO_FOLLOW;
-       static enum uio_seg seg = UIO_SYSSPACE;
+       zfs_sb_t *zsb = ITOZSB(dip);
+       char *dsname;
+       int error;
  
-       if (snapshot_namecheck(dirname, NULL, NULL) != 0)
-               return (EILSEQ);
+       dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
  
-       dmu_objset_name(zfsvfs->z_os, name);
+       if (snapshot_namecheck(dirname, NULL, NULL) != 0) {
+               error = SET_ERROR(EILSEQ);
+               goto out;
+       }
  
-       *vpp = NULL;
+       dmu_objset_name(zsb->z_os, dsname);
  
-       err = zfs_secpolicy_snapshot_perms(name, cr);
-       if (err)
-               return (err);
+       error = zfs_secpolicy_snapshot_perms(dsname, cr);
+       if (error != 0)
+               goto out;
  
-       if (err == 0) {
-               err = dmu_objset_snapshot(name, dirname, B_FALSE);
-               if (err)
-                       return (err);
-               err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
+       if (error == 0) {
+               error = dmu_objset_snapshot_one(dsname, dirname);
+               if (error != 0)
+                       goto out;
+
+               error = zfsctl_snapdir_lookup(dip, dirname, ipp,
+                   0, cr, NULL, NULL);
         }
+out:
+       kmem_free(dsname, MAXNAMELEN);
  
-       return (err);
+       return (error);
  }
  
  /*
- * Lookup entry point for the 'snapshot' directory.  Try to open the
- * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
- * Perform a mount of the associated dataset on top of the vnode.
+ * When a .zfs/snapshot/<snapshot> inode is evicted it must be removed
+ * from the snapshot list.  This will normally happen as part of the auto
+ * unmount, however in the case of a manual snapshot unmount this will be
+ * the only notification we receive.
   */
-/* ARGSUSED */
-static int
-zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
+void
+zfsctl_snapdir_inactive(struct inode *ip)
  {
-       zfsctl_snapdir_t *sdp = dvp->v_data;
-       objset_t *snap;
-       char snapname[MAXNAMELEN];
-       char real[MAXNAMELEN];
-       char *mountpoint;
-       zfs_snapentry_t *sep, search;
-       struct mounta margs;
-       vfs_t *vfsp;
-       size_t mountpoint_len;
-       avl_index_t where;
-       zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-       int err;
-
-       /*
-        * No extended attributes allowed under .zfs
-        */
-       if (flags & LOOKUP_XATTR)
-               return (EINVAL);
+       zfs_sb_t *zsb = ITOZSB(ip);
+       zfs_snapentry_t *sep, *next;
  
-       ASSERT(dvp->v_type == VDIR);
+       mutex_enter(&zsb->z_ctldir_lock);
  
-       if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
-               return (0);
+       sep = avl_first(&zsb->z_ctldir_snaps);
+       while (sep != NULL) {
+               next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
  
-       /*
-        * If we get a recursive call, that means we got called
-        * from the domount() code while it was trying to look up the
-        * spec (which looks like a local path for zfs).  We need to
-        * add some flag to domount() to tell it not to do this lookup.
-        */
-       if (MUTEX_HELD(&sdp->sd_lock))
-               return (ENOENT);
+               if (sep->se_inode == ip) {
+                       avl_remove(&zsb->z_ctldir_snaps, sep);
+                       taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
+                       zfsctl_sep_free(sep);
+                       break;
+               }
+               sep = next;
+       }
  
-       ZFS_ENTER(zfsvfs);
+       mutex_exit(&zsb->z_ctldir_lock);
+}
  
-       if (flags & FIGNORECASE) {
-               boolean_t conflict = B_FALSE;
+/*
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed, it is just a
+ * best effort.  In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
+ */
+#define        SET_UNMOUNT_CMD \
+       "exec 0</dev/null " \
+       "     1>/dev/null " \
+       "     2>/dev/null; " \
+       "umount -t zfs -n %s'%s'"
  
-               err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
-                   MAXNAMELEN, &conflict);
-               if (err == 0) {
-                       nm = real;
-               } else if (err != ENOTSUP) {
-                       ZFS_EXIT(zfsvfs);
-                       return (err);
-               }
-               if (realpnp)
-                       (void) strlcpy(realpnp->pn_buf, nm,
-                           realpnp->pn_bufsize);
-               if (conflict && direntflags)
-                       *direntflags = ED_CASE_CONFLICT;
-       }
+static int
+__zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
+{
+       char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+       char *envp[] = { NULL };
+       int error;
  
-       mutex_enter(&sdp->sd_lock);
-       search.se_name = (char *)nm;
-       if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
-               *vpp = sep->se_root;
-               VN_HOLD(*vpp);
-               err = traverse(vpp);
-               if (err) {
-                       VN_RELE(*vpp);
-                       *vpp = NULL;
-               } else if (*vpp == sep->se_root) {
-                       /*
-                        * The snapshot was unmounted behind our backs,
-                        * try to remount it.
-                        */
-                       goto domount;
-               } else {
-                       /*
-                        * VROOT was set during the traverse call.  We need
-                        * to clear it since we're pretending to be part
-                        * of our parent's vfs.
-                        */
-                       (*vpp)->v_flag &= ~VROOT;
-               }
-               mutex_exit(&sdp->sd_lock);
-               ZFS_EXIT(zfsvfs);
-               return (err);
-       }
+       argv[2] = kmem_asprintf(SET_UNMOUNT_CMD,
+           flags & MNT_FORCE ? "-f " : "", sep->se_path);
+       error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+       strfree(argv[2]);
  
         /*
-        * The requested snapshot is not currently mounted, look it up.
+        * The umount system utility will return 256 on error.  We must
+        * assume this error is because the file system is busy so it is
+        * converted to the more sensible EBUSY.
          */
-       err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
-       if (err) {
-               mutex_exit(&sdp->sd_lock);
-               ZFS_EXIT(zfsvfs);
-               /*
-                * handle "ls *" or "?" in a graceful manner,
-                * forcing EILSEQ to ENOENT.
-                * Since shell ultimately passes "*" or "?" as name to lookup
-                */
-               return (err == EILSEQ ? ENOENT : err);
-       }
-       if (dmu_objset_open(snapname, DMU_OST_ZFS,
-           DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) {
-               mutex_exit(&sdp->sd_lock);
-               ZFS_EXIT(zfsvfs);
-               return (ENOENT);
-       }
-
-       sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
-       sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-       (void) strcpy(sep->se_name, nm);
-       *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
-       avl_insert(&sdp->sd_snaps, sep, where);
-
-       dmu_objset_close(snap);
-domount:
-       mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
-           strlen("/.zfs/snapshot/") + strlen(nm) + 1;
-       mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
-       (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
-           refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
-
-       margs.spec = snapname;
-       margs.dir = mountpoint;
-       margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
-       margs.fstype = "zfs";
-       margs.dataptr = NULL;
-       margs.datalen = 0;
-       margs.optptr = NULL;
-       margs.optlen = 0;
-
-       err = domount("zfs", &margs, *vpp, kcred, &vfsp);
-       kmem_free(mountpoint, mountpoint_len);
-
-       if (err == 0) {
-               /*
-                * Return the mounted root rather than the covered mount point.
-                * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
-                * the ZFS vnode mounted on top of the GFS node.  This ZFS
-                * vnode is the root the newly created vfsp.
-                */
-               VFS_RELE(vfsp);
-               err = traverse(vpp);
-       }
-
-       if (err == 0) {
-               /*
-                * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
-                *
-                * This is where we lie about our v_vfsp in order to
-                * make .zfs/snapshot/<snapname> accessible over NFS
-                * without requiring manual mounts of <snapname>.
-                */
-               ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
-               VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
-               (*vpp)->v_vfsp = zfsvfs->z_vfs;
-               (*vpp)->v_flag &= ~VROOT;
-       }
-       mutex_exit(&sdp->sd_lock);
-       ZFS_EXIT(zfsvfs);
+       if (error)
+               error = SET_ERROR(EBUSY);
  
         /*
-        * If we had an error, drop our hold on the vnode and
-        * zfsctl_snapshot_inactive() will clean up.
+        * This was the result of a manual unmount, cancel the delayed work
+        * to prevent zfsctl_expire_snapshot() from attempting an unmount.
          */
-       if (err) {
-               VN_RELE(*vpp);
-               *vpp = NULL;
-       }
-       return (err);
+       if ((error == 0) && !(flags & MNT_EXPIRE))
+               taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
+
+
+       return (error);
  }
  
-/* ARGSUSED */
-static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
-    offset_t *offp, offset_t *nextp, void *data, int flags)
+int
+zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags)
  {
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-       char snapname[MAXNAMELEN];
-       uint64_t id, cookie;
-       boolean_t case_conflict;
-       int error;
+       zfs_snapentry_t search;
+       zfs_snapentry_t *sep;
+       int error = 0;
  
-       ZFS_ENTER(zfsvfs);
+       mutex_enter(&zsb->z_ctldir_lock);
  
-       cookie = *offp;
-       error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
-           &cookie, &case_conflict);
-       if (error) {
-               ZFS_EXIT(zfsvfs);
-               if (error == ENOENT) {
-                       *eofp = 1;
-                       return (0);
-               }
-               return (error);
-       }
+       search.se_name = name;
+       sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
+       if (sep) {
+               avl_remove(&zsb->z_ctldir_snaps, sep);
+               mutex_exit(&zsb->z_ctldir_lock);
  
-       if (flags & V_RDDIR_ENTFLAGS) {
-               edirent_t *eodp = dp;
+               error = __zfsctl_unmount_snapshot(sep, flags);
  
-               (void) strcpy(eodp->ed_name, snapname);
-               eodp->ed_ino = ZFSCTL_INO_SNAP(id);
-               eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
+               mutex_enter(&zsb->z_ctldir_lock);
+               if (error == EBUSY)
+                       avl_add(&zsb->z_ctldir_snaps, sep);
+               else
+                       zfsctl_sep_free(sep);
         } else {
-               struct dirent64 *odp = dp;
-
-               (void) strcpy(odp->d_name, snapname);
-               odp->d_ino = ZFSCTL_INO_SNAP(id);
+               error = SET_ERROR(ENOENT);
         }
-       *nextp = cookie;
  
-       ZFS_EXIT(zfsvfs);
+       mutex_exit(&zsb->z_ctldir_lock);
+       ASSERT3S(error, >=, 0);
  
-       return (0);
+       return (error);
  }
  
  /*
- * pvp is the '.zfs' directory (zfsctl_node_t).
- * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
- *
- * This function is the callback to create a GFS vnode for '.zfs/snapshot'
- * when a lookup is performed on .zfs for "snapshot".
+ * Traverse all mounted snapshots and attempt to unmount them.  This
+ * is best effort, on failure EEXIST is returned and count will be set
+ * to the number of file snapshots which could not be unmounted.
   */
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
+int
+zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
  {
-       vnode_t *vp;
-       zfsctl_snapdir_t *sdp;
-
-       vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
-           zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
-           zfsctl_snapdir_readdir_cb, NULL);
-       sdp = vp->v_data;
-       sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
-       sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-       mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
-       avl_create(&sdp->sd_snaps, snapentry_compare,
-           sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
-       return (vp);
-}
+       zfs_snapentry_t *sep, *next;
+       int error = 0;
  
-/* ARGSUSED */
-static int
-zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
-{
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-       zfsctl_snapdir_t *sdp = vp->v_data;
+       *count = 0;
  
-       ZFS_ENTER(zfsvfs);
-       zfsctl_common_getattr(vp, vap);
-       vap->va_nodeid = gfs_file_inode(vp);
-       vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
-       ZFS_EXIT(zfsvfs);
+       ASSERT(zsb->z_ctldir != NULL);
+       mutex_enter(&zsb->z_ctldir_lock);
  
-       return (0);
-}
+       sep = avl_first(&zsb->z_ctldir_snaps);
+       while (sep != NULL) {
+               next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
+               avl_remove(&zsb->z_ctldir_snaps, sep);
+               mutex_exit(&zsb->z_ctldir_lock);
  
-/* ARGSUSED */
-static void
-zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
-       zfsctl_snapdir_t *sdp = vp->v_data;
-       void *private;
-
-       private = gfs_dir_inactive(vp);
-       if (private != NULL) {
-               ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
-               mutex_destroy(&sdp->sd_lock);
-               avl_destroy(&sdp->sd_snaps);
-               kmem_free(private, sizeof (zfsctl_snapdir_t));
-       }
-}
+               error = __zfsctl_unmount_snapshot(sep, flags);
  
-static const fs_operation_def_t zfsctl_tops_snapdir[] = {
-       { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
-       { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
-       { VOPNAME_IOCTL,        { .error = fs_inval }                   },
-       { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_snapdir_getattr } },
-       { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
-       { VOPNAME_RENAME,       { .vop_rename = zfsctl_snapdir_rename } },
-       { VOPNAME_RMDIR,        { .vop_rmdir = zfsctl_snapdir_remove }  },
-       { VOPNAME_MKDIR,        { .vop_mkdir = zfsctl_snapdir_mkdir }   },
-       { VOPNAME_READDIR,      { .vop_readdir = gfs_vop_readdir }      },
-       { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_snapdir_lookup } },
-       { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
-       { VOPNAME_INACTIVE,     { .vop_inactive = zfsctl_snapdir_inactive } },
-       { VOPNAME_FID,          { .vop_fid = zfsctl_common_fid }        },
-       { NULL }
-};
+               mutex_enter(&zsb->z_ctldir_lock);
+               if (error == EBUSY) {
+                       avl_add(&zsb->z_ctldir_snaps, sep);
+                       (*count)++;
+               } else {
+                       zfsctl_sep_free(sep);
+               }
  
-/*
- * pvp is the GFS vnode '.zfs/snapshot'.
- *
- * This creates a GFS node under '.zfs/snapshot' representing each
- * snapshot.  This newly created GFS node is what we mount snapshot
- * vfs_t's ontop of.
- */
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
-       vnode_t *vp;
-       zfsctl_node_t *zcp;
+               sep = next;
+       }
  
-       vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
-           zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
-       zcp = vp->v_data;
-       zcp->zc_id = objset;
-       VFS_HOLD(vp->v_vfsp);
+       mutex_exit(&zsb->z_ctldir_lock);
  
-       return (vp);
+       return ((*count > 0) ? EEXIST : 0);
  }
  
-static void
-zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+#define        MOUNT_BUSY 0x80         /* Mount failed due to EBUSY (from mntent.h) */
+
+#define        SET_MOUNT_CMD \
+       "exec 0</dev/null " \
+       "     1>/dev/null " \
+       "     2>/dev/null; " \
+       "mount -t zfs -n '%s' '%s'"
+
+int
+zfsctl_mount_snapshot(struct path *path, int flags)
  {
-       zfsctl_snapdir_t *sdp;
-       zfs_snapentry_t *sep, *next;
-       vnode_t *dvp;
+       struct dentry *dentry = path->dentry;
+       struct inode *ip = dentry->d_inode;
+       zfs_sb_t *zsb = ITOZSB(ip);
+       char *full_name, *full_path;
+       zfs_snapentry_t *sep;
+       zfs_snapentry_t search;
+       char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+       char *envp[] = { NULL };
+       int error;
  
-       VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
-       sdp = dvp->v_data;
+       ZFS_ENTER(zsb);
  
-       mutex_enter(&sdp->sd_lock);
+       full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
+       full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);
  
-       if (vp->v_count > 1) {
-               mutex_exit(&sdp->sd_lock);
-               return;
-       }
-       ASSERT(!vn_ismntpt(vp));
+       error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
+       if (error)
+               goto error;
  
-       sep = avl_first(&sdp->sd_snaps);
-       while (sep != NULL) {
-               next = AVL_NEXT(&sdp->sd_snaps, sep);
+       error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
+       if (error)
+               goto error;
  
-               if (sep->se_root == vp) {
-                       avl_remove(&sdp->sd_snaps, sep);
-                       kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-                       kmem_free(sep, sizeof (zfs_snapentry_t));
-                       break;
-               }
-               sep = next;
+       /*
+        * Attempt to mount the snapshot from user space.  Normally this
+        * would be done using the vfs_kern_mount() function, however that
+        * function is marked GPL-only and cannot be used.  On error we
+        * are careful to log the real error to the console and return EISDIR
+        * to safely abort the automount.  This should be very rare.
+        *
+        * If the user mode helper happens to return EBUSY, a concurrent
+        * mount is already in progress in which case the error is ignored.
+        * Take note that if the program was executed successfully the return
+        * value from call_usermodehelper() will be ((exitcode << 8) + signal).
+        */
+       argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
+       error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+       strfree(argv[2]);
+       if (error && !(error & MOUNT_BUSY << 8)) {
+               printk("ZFS: Unable to automount %s at %s: %d\n",
+                   full_name, full_path, error);
+               error = SET_ERROR(EISDIR);
+               goto error;
         }
-       ASSERT(sep != NULL);
  
-       mutex_exit(&sdp->sd_lock);
-       VN_RELE(dvp);
-       VFS_RELE(vp->v_vfsp);
+       error = 0;
+       mutex_enter(&zsb->z_ctldir_lock);
  
         /*
-        * Dispose of the vnode for the snapshot mount point.
-        * This is safe to do because once this entry has been removed
-        * from the AVL tree, it can't be found again, so cannot become
-        * "active".  If we lookup the same name again we will end up
-        * creating a new vnode.
+        * Ensure a previous entry does not exist, if it does safely remove
+        * it and cancel the outstanding expiration.  This can occur when a
+        * snapshot is manually unmounted and then an automount is triggered.
          */
-       gfs_vop_inactive(vp, cr, ct);
+       search.se_name = full_name;
+       sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
+       if (sep) {
+               avl_remove(&zsb->z_ctldir_snaps, sep);
+               taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
+               zfsctl_sep_free(sep);
+       }
+
+       sep = zfsctl_sep_alloc();
+       sep->se_name = full_name;
+       sep->se_path = full_path;
+       sep->se_inode = ip;
+       avl_add(&zsb->z_ctldir_snaps, sep);
+
+       sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
+           zfsctl_expire_snapshot, sep, TQ_SLEEP,
+           ddi_get_lbolt() + zfs_expire_snapshot * HZ);
+
+       mutex_exit(&zsb->z_ctldir_lock);
+error:
+       if (error) {
+               kmem_free(full_name, MAXNAMELEN);
+               kmem_free(full_path, PATH_MAX);
+       }
+
+       ZFS_EXIT(zsb);
+
+       return (error);
  }
  
+/*
+ * Check if this super block has a matching objset id.
+ */
+static int
+zfsctl_test_super(struct super_block *sb, void *objsetidp)
+{
+       zfs_sb_t *zsb = sb->s_fs_info;
+       uint64_t objsetid = *(uint64_t *)objsetidp;
+
+       return (dmu_objset_id(zsb->z_os) == objsetid);
+}
  
  /*
- * These VP's should never see the light of day.  They should always
- * be covered.
+ * Prevent a new super block from being allocated if an existing one
+ * could not be located.  We only want to perform a lookup operation.
   */
-static const fs_operation_def_t zfsctl_tops_snapshot[] = {
-       VOPNAME_INACTIVE, { .vop_inactive =  zfsctl_snapshot_inactive },
-       NULL, NULL
-};
+static int
+zfsctl_set_super(struct super_block *sb, void *objsetidp)
+{
+       return (-EEXIST);
+}
  
  int
-zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp)
  {
-       zfsvfs_t *zfsvfs = vfsp->vfs_data;
-       vnode_t *dvp, *vp;
-       zfsctl_snapdir_t *sdp;
-       zfsctl_node_t *zcp;
+       zfs_sb_t *zsb = sb->s_fs_info;
+       struct super_block *sbp;
         zfs_snapentry_t *sep;
+       uint64_t id;
         int error;
  
-       ASSERT(zfsvfs->z_ctldir != NULL);
-       error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-           NULL, 0, NULL, kcred, NULL, NULL, NULL);
-       if (error != 0)
-               return (error);
-       sdp = dvp->v_data;
+       ASSERT(zsb->z_ctldir != NULL);
+
+       mutex_enter(&zsb->z_ctldir_lock);
  
-       mutex_enter(&sdp->sd_lock);
-       sep = avl_first(&sdp->sd_snaps);
+       /*
+        * Verify that the snapshot is mounted.
+        */
+       sep = avl_first(&zsb->z_ctldir_snaps);
         while (sep != NULL) {
-               vp = sep->se_root;
-               zcp = vp->v_data;
-               if (zcp->zc_id == objsetid)
+               error = dmu_snapshot_lookup(zsb->z_os, sep->se_name, &id);
+               if (error)
+                       goto out;
+
+               if (id == objsetid)
                         break;
  
-               sep = AVL_NEXT(&sdp->sd_snaps, sep);
+               sep = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
         }
  
         if (sep != NULL) {
-               VN_HOLD(vp);
                 /*
-                * Return the mounted root rather than the covered mount point.
-                * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
-                * and returns the ZFS vnode mounted on top of the GFS node.
-                * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+                * Lookup the mounted root rather than the covered mount
+                * point.  This may fail if the snapshot has just been
+                * unmounted by an unrelated user space process.  This
+                * race cannot occur to an expired mount point because
+                * we hold the zsb->z_ctldir_lock to prevent the race.
                  */
-               error = traverse(&vp);
-               if (error == 0) {
-                       if (vp == sep->se_root)
-                               error = EINVAL;
-                       else
-                               *zfsvfsp = VTOZ(vp)->z_zfsvfs;
+               sbp = zpl_sget(&zpl_fs_type, zfsctl_test_super,
+                   zfsctl_set_super, 0, &id);
+               if (IS_ERR(sbp)) {
+                       error = -PTR_ERR(sbp);
+               } else {
+                       *zsbp = sbp->s_fs_info;
+                       deactivate_super(sbp);
                 }
-               mutex_exit(&sdp->sd_lock);
-               VN_RELE(vp);
         } else {
-               error = EINVAL;
-               mutex_exit(&sdp->sd_lock);
+               error = SET_ERROR(EINVAL);
         }
-
-       VN_RELE(dvp);
+out:
+       mutex_exit(&zsb->z_ctldir_lock);
+       ASSERT3S(error, >=, 0);
  
         return (error);
  }
  
-/*
- * Unmount any snapshots for the given filesystem.  This is called from
- * zfs_umount() - if we have a ctldir, then go through and unmount all the
- * snapshots.
- */
+/* ARGSUSED */
  int
-zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
  {
-       zfsvfs_t *zfsvfs = vfsp->vfs_data;
-       vnode_t *dvp;
-       zfsctl_snapdir_t *sdp;
-       zfs_snapentry_t *sep, *next;
+       zfs_sb_t *zsb = ITOZSB(dip);
+       struct inode *ip;
+       znode_t *dzp;
         int error;
  
-       ASSERT(zfsvfs->z_ctldir != NULL);
-       error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-           NULL, 0, NULL, cr, NULL, NULL, NULL);
-       if (error != 0)
-               return (error);
-       sdp = dvp->v_data;
-
-       mutex_enter(&sdp->sd_lock);
+       ZFS_ENTER(zsb);
  
-       sep = avl_first(&sdp->sd_snaps);
-       while (sep != NULL) {
-               next = AVL_NEXT(&sdp->sd_snaps, sep);
+       if (zsb->z_shares_dir == 0) {
+               ZFS_EXIT(zsb);
+               return (SET_ERROR(ENOTSUP));
+       }
  
-               /*
-                * If this snapshot is not mounted, then it must
-                * have just been unmounted by somebody else, and
-                * will be cleaned up by zfsctl_snapdir_inactive().
-                */
-               if (vn_ismntpt(sep->se_root)) {
-                       avl_remove(&sdp->sd_snaps, sep);
-                       error = zfsctl_unmount_snap(sep, fflags, cr);
-                       if (error) {
-                               avl_add(&sdp->sd_snaps, sep);
-                               break;
-                       }
-               }
-               sep = next;
+       error = zfs_zget(zsb, zsb->z_shares_dir, &dzp);
+       if (error) {
+               ZFS_EXIT(zsb);
+               return (error);
         }
  
-       mutex_exit(&sdp->sd_lock);
-       VN_RELE(dvp);
+       error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL);
+
+       iput(ZTOI(dzp));
+       ZFS_EXIT(zsb);
  
         return (error);
  }
+
+
+/*
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories.  Currently this is unused but available.
+ */
+void
+zfsctl_init(void)
+{
+       zfs_expire_taskq = taskq_create("z_unmount", 1, maxclsyspri,
+           1, 8, TASKQ_PREPOPULATE);
+}
+
+/*
+ * Cleanup the various pieces we needed for .zfs directories.  In particular
+ * ensure the expiry timer is canceled safely.
+ */
+void
+zfsctl_fini(void)
+{
+       taskq_destroy(zfs_expire_taskq);
+}
+
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");