Fix typo/etc in module/zfs/zfs_ctldir.c

[mirror_zfs.git] / module / zfs / zfs_ctldir.c
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c

index 51b12a1d50e4f766e8e97039e74f13d4d9508006..46e6e19b91d57871a589c40639169501bd6e9668 100644 (file)
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -19,22 +19,32 @@
   * CDDL HEADER END
   */
  /*
+ *
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <rohan.puri15@gmail.com>
+ *   Brian Behlendorf <behlendorf1@llnl.gov>
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2018 George Melikov. All Rights Reserved.
   */
  
  /*
   * ZFS control directory (a.k.a. ".zfs")
   *
   * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' directory, but this may expand in the
- * future.  The elements are built using the GFS primitives, as the hierarchy
+ * Currently, this is only the 'snapshot' and 'shares' directory, but this may
+ * expand in the future.  The elements are built dynamically, as the hierarchy
   * does not actually exist on disk.
   *
   * For 'snapshot', we don't want to have all snapshots always mounted, because
   * this would take up a huge amount of space in /etc/mnttab.  We have three
   * types of objects:
   *
- *     ctldir ------> snapshotdir -------> snapshot
+ *     ctldir ------> snapshotdir -------> snapshot
   *                                             |
   *                                             |
   *                                             V
@@ -43,66 +53,174 @@
   * The 'snapshot' node contains just enough information to lookup '..' and act
   * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
   * perform an automount of the underlying filesystem and return the
- * corresponding vnode.
+ * corresponding inode.
   *
- * All mounts are handled automatically by the kernel, but unmounts are
- * (currently) handled from user land.  The main reason is that there is no
- * reliable way to auto-unmount the filesystem when it's "no longer in use".
- * When the user unmounts a filesystem, we call zfsctl_unmount(), which
- * unmounts any snapshots within the snapshot directory.
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure.  Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
   *
   * The '.zfs', '.zfs/snapshot', and all directories created under
- * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
- * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfsvfs_t as the head filesystem (what '.zfs' lives under).
   *
- * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
- * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
- * However, vnodes within these mounted on file systems have their v_vfsp
- * fields set to the head filesystem to make NFS happy (see
- * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
- * so that it cannot be freed until all snapshots have been unmounted.
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfsvfs_t.  However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfsvfs_t to make NFS happy.
   */
  
-#ifdef HAVE_ZPL
-
-#include <fs/fs_subr.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
  #include <sys/zfs_ctldir.h>
  #include <sys/zfs_ioctl.h>
  #include <sys/zfs_vfsops.h>
-#include <sys/vfs_opreg.h>
-#include <sys/gfs.h>
+#include <sys/zfs_vnops.h>
  #include <sys/stat.h>
  #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_destroy.h>
  #include <sys/dsl_deleg.h>
-#include <sys/mount.h>
-#include <sys/sunddi.h>
-
+#include <sys/zpl.h>
  #include "zfs_namecheck.h"
  
-typedef struct zfsctl_node {
-       gfs_dir_t       zc_gfs_private;
-       uint64_t        zc_id;
-       timestruc_t     zc_cmtime;      /* ctime and mtime, always the same */
-} zfsctl_node_t;
+/*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots.  Every automounted snapshot maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ *   - be attached to both trees, and
+ *   - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid.  This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static krwlock_t zfs_snapshot_lock;
  
-typedef struct zfsctl_snapdir {
-       zfsctl_node_t   sd_node;
-       kmutex_t        sd_lock;
-       avl_tree_t      sd_snaps;
-} zfsctl_snapdir_t;
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+int zfs_admin_snapshot = 0;
  
  typedef struct {
-       char            *se_name;
-       vnode_t         *se_root;
-       avl_node_t      se_node;
+       char            *se_name;       /* full snapshot name */
+       char            *se_path;       /* full mount path */
+       spa_t           *se_spa;        /* pool spa */
+       uint64_t        se_objsetid;    /* snapshot objset id */
+       struct dentry   *se_root_dentry; /* snapshot root dentry */
+       taskqid_t       se_taskqid;     /* scheduled unmount taskqid */
+       avl_node_t      se_node_name;   /* zfs_snapshots_by_name link */
+       avl_node_t      se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+       zfs_refcount_t  se_refcount;    /* reference count */
  } zfs_snapentry_t;
  
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t being careful to make a copy of the
+ * snapshot name and provided mount point.  No reference is taken.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa,
+    uint64_t objsetid, struct dentry *root_dentry)
+{
+       zfs_snapentry_t *se;
+
+       se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+       se->se_name = strdup(full_name);
+       se->se_path = strdup(full_path);
+       se->se_spa = spa;
+       se->se_objsetid = objsetid;
+       se->se_root_dentry = root_dentry;
+       se->se_taskqid = TASKQID_INVALID;
+
+       zfs_refcount_create(&se->se_refcount);
+
+       return (se);
+}
+
+/*
+ * Free a zfs_snapentry_t; the caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
+{
+       zfs_refcount_destroy(&se->se_refcount);
+       strfree(se->se_name);
+       strfree(se->se_path);
+
+       kmem_free(se, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Hold a reference on the zfs_snapentry_t.
+ */
+static void
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
+{
+       zfs_refcount_add(&se->se_refcount, NULL);
+}
+
+/*
+ * Release a reference on the zfs_snapentry_t.  When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+       if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
+               zfsctl_snapshot_free(se);
+}
+
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+       ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+       zfs_refcount_add(&se->se_refcount, NULL);
+       avl_add(&zfs_snapshots_by_name, se);
+       avl_add(&zfs_snapshots_by_objsetid, se);
+}
+
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  Upon removal a reference is dropped,
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+       ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+       avl_remove(&zfs_snapshots_by_name, se);
+       avl_remove(&zfs_snapshots_by_objsetid, se);
+       zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name.
+ */
  static int
-snapentry_compare(const void *a, const void *b)
+snapentry_compare_by_name(const void *a, const void *b)
  {
-       const zfs_snapentry_t *sa = a;
-       const zfs_snapentry_t *sb = b;
-       int ret = strcmp(sa->se_name, sb->se_name);
+       const zfs_snapentry_t *se_a = a;
+       const zfs_snapentry_t *se_b = b;
+       int ret;
+
+       ret = strcmp(se_a->se_name, se_b->se_name);
  
         if (ret < 0)
                 return (-1);
@@ -112,1241 +230,1038 @@ snapentry_compare(const void *a, const void *b)
                 return (0);
  }
  
-vnodeops_t *zfsctl_ops_root;
-vnodeops_t *zfsctl_ops_snapdir;
-vnodeops_t *zfsctl_ops_snapshot;
-vnodeops_t *zfsctl_ops_shares;
-vnodeops_t *zfsctl_ops_shares_dir;
-
-static const fs_operation_def_t zfsctl_tops_root[];
-static const fs_operation_def_t zfsctl_tops_snapdir[];
-static const fs_operation_def_t zfsctl_tops_snapshot[];
-static const fs_operation_def_t zfsctl_tops_shares[];
-
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_mknode_shares(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
-
-static gfs_opsvec_t zfsctl_opsvec[] = {
-       { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
-       { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
-       { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
-       { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
-       { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
-       { NULL }
-};
-
  /*
- * Root directory elements.  We only have two entries
- * snapshot and shares.
+ * Snapshot objsetid comparison function for the zfs_snapshots_by_objsetid.
   */
-static gfs_dirent_t zfsctl_root_entries[] = {
-       { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
-       { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
-       { NULL }
-};
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+       const zfs_snapentry_t *se_a = a;
+       const zfs_snapentry_t *se_b = b;
  
-/* include . and .. in the calculation */
-#define        NROOT_ENTRIES   ((sizeof (zfsctl_root_entries) / \
-    sizeof (gfs_dirent_t)) + 1)
+       if (se_a->se_spa != se_b->se_spa)
+               return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
  
+       if (se_a->se_objsetid < se_b->se_objsetid)
+               return (-1);
+       else if (se_a->se_objsetid > se_b->se_objsetid)
+               return (1);
+       else
+               return (0);
+}
  
  /*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories.  This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name.  If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure.  The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele().  If the snapname is not found
+ * NULL will be returned.
   */
-void
-zfsctl_init(void)
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(char *snapname)
  {
-       VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
-}
+       zfs_snapentry_t *se, search;
  
-void
-zfsctl_fini(void)
-{
-       /*
-        * Remove vfsctl vnode ops
-        */
-       if (zfsctl_ops_root)
-               vn_freevnodeops(zfsctl_ops_root);
-       if (zfsctl_ops_snapdir)
-               vn_freevnodeops(zfsctl_ops_snapdir);
-       if (zfsctl_ops_snapshot)
-               vn_freevnodeops(zfsctl_ops_snapshot);
-       if (zfsctl_ops_shares)
-               vn_freevnodeops(zfsctl_ops_shares);
-       if (zfsctl_ops_shares_dir)
-               vn_freevnodeops(zfsctl_ops_shares_dir);
-
-       zfsctl_ops_root = NULL;
-       zfsctl_ops_snapdir = NULL;
-       zfsctl_ops_snapshot = NULL;
-       zfsctl_ops_shares = NULL;
-       zfsctl_ops_shares_dir = NULL;
-}
+       ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
  
-boolean_t
-zfsctl_is_node(vnode_t *vp)
-{
-       return (vn_matchops(vp, zfsctl_ops_root) ||
-           vn_matchops(vp, zfsctl_ops_snapdir) ||
-           vn_matchops(vp, zfsctl_ops_snapshot) ||
-           vn_matchops(vp, zfsctl_ops_shares) ||
-           vn_matchops(vp, zfsctl_ops_shares_dir));
+       search.se_name = snapname;
+       se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+       if (se)
+               zfs_refcount_add(&se->se_refcount, NULL);
  
+       return (se);
  }
  
  /*
- * Return the inode number associated with the 'snapshot' or
- * 'shares' directory.
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname.  In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
   */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
  {
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+       zfs_snapentry_t *se, search;
  
-       ASSERT(index <= 2);
+       ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
  
-       if (index == 0)
-               return (ZFSCTL_INO_SNAPDIR);
+       search.se_spa = spa;
+       search.se_objsetid = objsetid;
+       se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+       if (se)
+               zfs_refcount_add(&se->se_refcount, NULL);
  
-       return (zfsvfs->z_shares_dir);
+       return (se);
  }
  
  /*
- * Create the '.zfs' directory.  This directory is cached as part of the VFS
- * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1.  This reference
- * is removed when the ctldir is destroyed in the unmount.
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name.  The structure is
+ * removed, renamed, and added back to the new correct location in the tree.
   */
-void
-zfsctl_create(zfsvfs_t *zfsvfs)
+static int
+zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
  {
-       vnode_t *vp, *rvp;
-       zfsctl_node_t *zcp;
-       uint64_t crtime[2];
+       zfs_snapentry_t *se;
  
-       ASSERT(zfsvfs->z_ctldir == NULL);
-
-       vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
-           zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
-           zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
-       zcp = vp->v_data;
-       zcp->zc_id = ZFSCTL_INO_ROOT;
+       ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
  
-       VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
-       VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
-           &crtime, sizeof (crtime)));
-       ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
-       VN_RELE(rvp);
+       se = zfsctl_snapshot_find_by_name(old_snapname);
+       if (se == NULL)
+               return (SET_ERROR(ENOENT));
  
-       /*
-        * We're only faking the fact that we have a root of a filesystem for
-        * the sake of the GFS interfaces.  Undo the flag manipulation it did
-        * for us.
-        */
-       vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
+       zfsctl_snapshot_remove(se);
+       strfree(se->se_name);
+       se->se_name = strdup(new_snapname);
+       zfsctl_snapshot_add(se);
+       zfsctl_snapshot_rele(se);
  
-       zfsvfs->z_ctldir = vp;
+       return (0);
  }
  
  /*
- * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
+ * Delayed task responsible for unmounting an expired automounted snapshot.
   */
-void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
+static void
+snapentry_expire(void *data)
  {
-       VN_RELE(zfsvfs->z_ctldir);
-       zfsvfs->z_ctldir = NULL;
+       zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+       spa_t *spa = se->se_spa;
+       uint64_t objsetid = se->se_objsetid;
+
+       if (zfs_expire_snapshot <= 0) {
+               zfsctl_snapshot_rele(se);
+               return;
+       }
+
+       se->se_taskqid = TASKQID_INVALID;
+       (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+       zfsctl_snapshot_rele(se);
+
+       /*
+        * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+        * This can occur when the snapshot is busy.
+        */
+       rw_enter(&zfs_snapshot_lock, RW_READER);
+       if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+               zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+               zfsctl_snapshot_rele(se);
+       }
+       rw_exit(&zfs_snapshot_lock);
  }
  
  /*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
+ * Cancel an automatic unmount of a snapname.  This callback is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken
+ * during dispatch.
   */
-vnode_t *
-zfsctl_root(znode_t *zp)
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
  {
-       ASSERT(zfs_has_ctldir(zp));
-       VN_HOLD(zp->z_zfsvfs->z_ctldir);
-       return (zp->z_zfsvfs->z_ctldir);
+       if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
+               se->se_taskqid = TASKQID_INVALID;
+               zfsctl_snapshot_rele(se);
+       }
  }
  
  /*
- * Common open routine.  Disallow any write access.
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
   */
-/* ARGSUSED */
-static int
-zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
  {
-       if (flags & FWRITE)
-               return (EACCES);
+       ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
  
-       return (0);
+       if (delay <= 0)
+               return;
+
+       zfsctl_snapshot_hold(se);
+       se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
+           snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
  }
  
  /*
- * Common close routine.  Nothing to do here.
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now.  Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline.  A reference is taken when the task is dispatched
+ * and held until the outstanding task is handled or cancelled.
   */
-/* ARGSUSED */
-static int
-zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
-    cred_t *cr, caller_context_t *ct)
+int
+zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
  {
-       return (0);
+       zfs_snapentry_t *se;
+       int error = ENOENT;
+
+       rw_enter(&zfs_snapshot_lock, RW_READER);
+       if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+               zfsctl_snapshot_unmount_cancel(se);
+               zfsctl_snapshot_unmount_delay_impl(se, delay);
+               zfsctl_snapshot_rele(se);
+               error = 0;
+       }
+       rw_exit(&zfs_snapshot_lock);
+
+       return (error);
  }
  
  /*
- * Common access routine.  Disallow writes.
+ * Check if snapname is currently mounted.  Returns B_TRUE when mounted
+ * and B_FALSE when unmounted.
   */
-/* ARGSUSED */
-static int
-zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
-    caller_context_t *ct)
+static boolean_t
+zfsctl_snapshot_ismounted(char *snapname)
  {
-       if (flags & V_ACE_MASK) {
-               if (mode & ACE_ALL_WRITE_PERMS)
-                       return (EACCES);
-       } else {
-               if (mode & VWRITE)
-                       return (EACCES);
+       zfs_snapentry_t *se;
+       boolean_t ismounted = B_FALSE;
+
+       rw_enter(&zfs_snapshot_lock, RW_READER);
+       if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+               zfsctl_snapshot_rele(se);
+               ismounted = B_TRUE;
         }
+       rw_exit(&zfs_snapshot_lock);
  
-       return (0);
+       return (ismounted);
  }
  
  /*
- * Common getattr function.  Fill in basic information.
+ * Check if the given inode is a part of the virtual .zfs directory.
   */
-static void
-zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+boolean_t
+zfsctl_is_node(struct inode *ip)
  {
-       timestruc_t     now;
+       return (ITOZ(ip)->z_is_ctldir);
+}
  
-       vap->va_uid = 0;
-       vap->va_gid = 0;
-       vap->va_rdev = 0;
-       /*
-        * We are a purely virtual object, so we have no
-        * blocksize or allocated blocks.
-        */
-       vap->va_blksize = 0;
-       vap->va_nblocks = 0;
-       vap->va_seq = 0;
-       vap->va_fsid = vp->v_vfsp->vfs_dev;
-       vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
-           S_IROTH | S_IXOTH;
-       vap->va_type = VDIR;
-       /*
-        * We live in the now (for atime).
-        */
-       gethrestime(&now);
-       vap->va_atime = now;
+/*
+ * Check if the given inode is a .zfs/snapshot/snapname directory.
+ */
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
+{
+       return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
  }
  
-/*ARGSUSED*/
-static int
-zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+/*
+ * Allocate a new inode with the passed id and ops.
+ */
+static struct inode *
+zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
  {
-       zfsvfs_t        *zfsvfs = vp->v_vfsp->vfs_data;
-       zfsctl_node_t   *zcp = vp->v_data;
-       uint64_t        object = zcp->zc_id;
-       zfid_short_t    *zfid;
-       int             i;
+       inode_timespec_t now;
+       struct inode *ip;
+       znode_t *zp;
+
+       ip = new_inode(zfsvfs->z_sb);
+       if (ip == NULL)
+               return (NULL);
+
+       now = current_time(ip);
+       zp = ITOZ(ip);
+       ASSERT3P(zp->z_dirlocks, ==, NULL);
+       ASSERT3P(zp->z_acl_cached, ==, NULL);
+       ASSERT3P(zp->z_xattr_cached, ==, NULL);
+       zp->z_id = id;
+       zp->z_unlinked = 0;
+       zp->z_atime_dirty = 0;
+       zp->z_zn_prefetch = 0;
+       zp->z_moved = 0;
+       zp->z_sa_hdl = NULL;
+       zp->z_blksz = 0;
+       zp->z_seq = 0;
+       zp->z_mapcnt = 0;
+       zp->z_size = 0;
+       zp->z_pflags = 0;
+       zp->z_mode = 0;
+       zp->z_sync_cnt = 0;
+       zp->z_is_mapped = B_FALSE;
+       zp->z_is_ctldir = B_TRUE;
+       zp->z_is_sa = B_FALSE;
+       zp->z_is_stale = B_FALSE;
+       ip->i_generation = 0;
+       ip->i_ino = id;
+       ip->i_mode = (S_IFDIR | S_IRWXUGO);
+       ip->i_uid = SUID_TO_KUID(0);
+       ip->i_gid = SGID_TO_KGID(0);
+       ip->i_blkbits = SPA_MINBLOCKSHIFT;
+       ip->i_atime = now;
+       ip->i_mtime = now;
+       ip->i_ctime = now;
+       ip->i_fop = fops;
+       ip->i_op = ops;
+#if defined(IOP_XATTR)
+       ip->i_opflags &= ~IOP_XATTR;
+#endif
+
+       if (insert_inode_locked(ip)) {
+               unlock_new_inode(ip);
+               iput(ip);
+               return (NULL);
+       }
  
-       ZFS_ENTER(zfsvfs);
+       mutex_enter(&zfsvfs->z_znodes_lock);
+       list_insert_tail(&zfsvfs->z_all_znodes, zp);
+       zfsvfs->z_nr_znodes++;
+       membar_producer();
+       mutex_exit(&zfsvfs->z_znodes_lock);
  
-       if (fidp->fid_len < SHORT_FID_LEN) {
-               fidp->fid_len = SHORT_FID_LEN;
-               ZFS_EXIT(zfsvfs);
-               return (ENOSPC);
-       }
+       unlock_new_inode(ip);
  
-       zfid = (zfid_short_t *)fidp;
+       return (ip);
+}
  
-       zfid->zf_len = SHORT_FID_LEN;
+/*
+ * Lookup the inode with given id, it will be allocated if needed.
+ */
+static struct inode *
+zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
+{
+       struct inode *ip = NULL;
  
-       for (i = 0; i < sizeof (zfid->zf_object); i++)
-               zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+       while (ip == NULL) {
+               ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
+               if (ip)
+                       break;
  
-       /* .zfs znodes always have a generation number of 0 */
-       for (i = 0; i < sizeof (zfid->zf_gen); i++)
-               zfid->zf_gen[i] = 0;
+               /* May fail due to concurrent zfsctl_inode_alloc() */
+               ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
+       }
  
-       ZFS_EXIT(zfsvfs);
-       return (0);
+       return (ip);
  }
  
-
-/*ARGSUSED*/
-static int
-zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+/*
+ * Create the '.zfs' directory.  This directory is cached as part of the VFS
+ * structure.  This results in a hold on the zfsvfs_t.  The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1.  This reference
+ * is removed when the ctldir is destroyed in the unmount.  All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ *
+ * Because the dynamically created '.zfs' directory entries assume the use
+ * of 64-bit inode numbers this support must be disabled on 32-bit systems.
+ */
+int
+zfsctl_create(zfsvfs_t *zfsvfs)
  {
-       zfsvfs_t        *zfsvfs = vp->v_vfsp->vfs_data;
-       znode_t         *dzp;
-       int             error;
+       ASSERT(zfsvfs->z_ctldir == NULL);
  
-       ZFS_ENTER(zfsvfs);
+       zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
+           &zpl_fops_root, &zpl_ops_root);
+       if (zfsvfs->z_ctldir == NULL)
+               return (SET_ERROR(ENOENT));
  
-       if (zfsvfs->z_shares_dir == 0) {
-               ZFS_EXIT(zfsvfs);
-               return (ENOTSUP);
-       }
+       return (0);
+}
  
-       if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-               error = VOP_FID(ZTOV(dzp), fidp, ct);
-               VN_RELE(ZTOV(dzp));
+/*
+ * Destroy the '.zfs' directory or remove a snapshot from both snapshot trees.
+ * Only called when the filesystem is unmounted.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+       if (zfsvfs->z_issnap) {
+               zfs_snapentry_t *se;
+               spa_t *spa = zfsvfs->z_os->os_spa;
+               uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+
+               rw_enter(&zfs_snapshot_lock, RW_WRITER);
+               se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
+               if (se != NULL)
+                       zfsctl_snapshot_remove(se);
+               rw_exit(&zfs_snapshot_lock);
+               if (se != NULL) {
+                       zfsctl_snapshot_unmount_cancel(se);
+                       zfsctl_snapshot_rele(se);
+               }
+       } else if (zfsvfs->z_ctldir) {
+               iput(zfsvfs->z_ctldir);
+               zfsvfs->z_ctldir = NULL;
         }
-
-       ZFS_EXIT(zfsvfs);
-       return (error);
  }
+
  /*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem.  We use the following scheme:
- *
- *     ENTRY                   ZFSCTL_INODE
- *     .zfs                    1
- *     .zfs/snapshot           2
- *     .zfs/snapshot/<snap>    objectid(snap)
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the inode and return it.
   */
-
-#define        ZFSCTL_INO_SNAP(id)     (id)
+struct inode *
+zfsctl_root(znode_t *zp)
+{
+       ASSERT(zfs_has_ctldir(zp));
+       igrab(ZTOZSB(zp)->z_ctldir);
+       return (ZTOZSB(zp)->z_ctldir);
+}
  
  /*
- * Get root directory attributes.
+ * Generate a long fid to indicate a snapdir. We encode whether the snapdir is
+ * already mounted in the gen field. We do this because nfsd lookup will not
+ * trigger automount. Next time the nfsd does fh_to_dentry, we will notice
+ * this and do automount and return ESTALE to force nfsd to revalidate and
+ * follow mount.
   */
-/* ARGSUSED */
  static int
-zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
+zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
  {
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-       zfsctl_node_t *zcp = vp->v_data;
+       zfid_short_t *zfid = (zfid_short_t *)fidp;
+       zfid_long_t *zlfid = (zfid_long_t *)fidp;
+       uint32_t gen = 0;
+       uint64_t object;
+       uint64_t objsetid;
+       int i;
+       struct dentry *dentry;
+
+       if (fidp->fid_len < LONG_FID_LEN) {
+               fidp->fid_len = LONG_FID_LEN;
+               return (SET_ERROR(ENOSPC));
+       }
  
-       ZFS_ENTER(zfsvfs);
-       vap->va_nodeid = ZFSCTL_INO_ROOT;
-       vap->va_nlink = vap->va_size = NROOT_ENTRIES;
-       vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+       object = ip->i_ino;
+       objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
+       zfid->zf_len = LONG_FID_LEN;
  
-       zfsctl_common_getattr(vp, vap);
-       ZFS_EXIT(zfsvfs);
+       dentry = d_obtain_alias(igrab(ip));
+       if (!IS_ERR(dentry)) {
+               gen = !!d_mountpoint(dentry);
+               dput(dentry);
+       }
+
+       for (i = 0; i < sizeof (zfid->zf_object); i++)
+               zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+       for (i = 0; i < sizeof (zfid->zf_gen); i++)
+               zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+       for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+               zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+       for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+               zlfid->zf_setgen[i] = 0;
  
         return (0);
  }
  
  /*
- * Special case the handling of "..".
+ * Generate an appropriate fid for an entry in the .zfs directory.
   */
-/* ARGSUSED */
  int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
+zfsctl_fid(struct inode *ip, fid_t *fidp)
  {
-       zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-       int err;
-
-       /*
-        * No extended attributes allowed under .zfs
-        */
-       if (flags & LOOKUP_XATTR)
-               return (EINVAL);
+       znode_t         *zp = ITOZ(ip);
+       zfsvfs_t        *zfsvfs = ITOZSB(ip);
+       uint64_t        object = zp->z_id;
+       zfid_short_t    *zfid;
+       int             i;
  
         ZFS_ENTER(zfsvfs);
  
-       if (strcmp(nm, "..") == 0) {
-               err = VFS_ROOT(dvp->v_vfsp, vpp);
-       } else {
-               err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
-                   cr, ct, direntflags, realpnp);
+       if (zfsctl_is_snapdir(ip)) {
+               ZFS_EXIT(zfsvfs);
+               return (zfsctl_snapdir_fid(ip, fidp));
         }
  
-       ZFS_EXIT(zfsvfs);
+       if (fidp->fid_len < SHORT_FID_LEN) {
+               fidp->fid_len = SHORT_FID_LEN;
+               ZFS_EXIT(zfsvfs);
+               return (SET_ERROR(ENOSPC));
+       }
  
-       return (err);
-}
+       zfid = (zfid_short_t *)fidp;
  
-static int
-zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
-    caller_context_t *ct)
-{
-       /*
-        * We only care about ACL_ENABLED so that libsec can
-        * display ACL correctly and not default to POSIX draft.
-        */
-       if (cmd == _PC_ACL_ENABLED) {
-               *valp = _ACL_ACE_ENABLED;
-               return (0);
-       }
+       zfid->zf_len = SHORT_FID_LEN;
  
-       return (fs_pathconf(vp, cmd, valp, cr, ct));
-}
+       for (i = 0; i < sizeof (zfid->zf_object); i++)
+               zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
  
-static const fs_operation_def_t zfsctl_tops_root[] = {
-       { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
-       { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
-       { VOPNAME_IOCTL,        { .error = fs_inval }                   },
-       { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_root_getattr }  },
-       { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
-       { VOPNAME_READDIR,      { .vop_readdir = gfs_vop_readdir }      },
-       { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_root_lookup }    },
-       { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
-       { VOPNAME_INACTIVE,     { .vop_inactive = gfs_vop_inactive }    },
-       { VOPNAME_PATHCONF,     { .vop_pathconf = zfsctl_pathconf }     },
-       { VOPNAME_FID,          { .vop_fid = zfsctl_common_fid  }       },
-       { NULL }
-};
+       /* .zfs znodes always have a generation number of 0 */
+       for (i = 0; i < sizeof (zfid->zf_gen); i++)
+               zfid->zf_gen[i] = 0;
  
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
-       objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
-       if (snapshot_namecheck(name, NULL, NULL) != 0)
-               return (EILSEQ);
-       dmu_objset_name(os, zname);
-       if (strlen(zname) + 1 + strlen(name) >= len)
-               return (ENAMETOOLONG);
-       (void) strcat(zname, "@");
-       (void) strcat(zname, name);
+       ZFS_EXIT(zfsvfs);
         return (0);
  }
  
+/*
+ * Construct a full dataset name in full_name: "pool/dataset@snap_name"
+ */
  static int
-zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
+zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
+    char *full_name)
  {
-       vnode_t *svp = sep->se_root;
-       int error;
+       objset_t *os = zfsvfs->z_os;
  
-       ASSERT(vn_ismntpt(svp));
+       if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
+               return (SET_ERROR(EILSEQ));
  
-       /* this will be dropped by dounmount() */
-       if ((error = vn_vfswlock(svp)) != 0)
-               return (error);
+       dmu_objset_name(os, full_name);
+       if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
+               return (SET_ERROR(ENAMETOOLONG));
  
-       VN_HOLD(svp);
-       error = dounmount(vn_mountedvfs(svp), fflags, cr);
-       if (error) {
-               VN_RELE(svp);
-               return (error);
-       }
-
-       /*
-        * We can't use VN_RELE(), as that will try to invoke
-        * zfsctl_snapdir_inactive(), which would cause us to destroy
-        * the sd_lock mutex held by our caller.
-        */
-       ASSERT(svp->v_count == 1);
-       gfs_vop_inactive(svp, cr, NULL);
-
-       kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-       kmem_free(sep, sizeof (zfs_snapentry_t));
+       (void) strcat(full_name, "@");
+       (void) strcat(full_name, snap_name);
  
         return (0);
  }
  
-static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+/*
+ * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
+ */
+static int
+zfsctl_snapshot_path(struct path *path, int len, char *full_path)
  {
-       avl_index_t where;
-       vfs_t *vfsp;
-       refstr_t *pathref;
-       char newpath[MAXNAMELEN];
-       char *tail;
+       char *path_buffer, *path_ptr;
+       int path_len, error = 0;
  
-       ASSERT(MUTEX_HELD(&sdp->sd_lock));
-       ASSERT(sep != NULL);
+       path_buffer = kmem_alloc(len, KM_SLEEP);
  
-       vfsp = vn_mountedvfs(sep->se_root);
-       ASSERT(vfsp != NULL);
+       path_ptr = d_path(path, path_buffer, len);
+       if (IS_ERR(path_ptr)) {
+               error = -PTR_ERR(path_ptr);
+               goto out;
+       }
  
-       vfs_lock_wait(vfsp);
+       path_len = path_buffer + len - 1 - path_ptr;
+       if (path_len > len) {
+               error = SET_ERROR(EFAULT);
+               goto out;
+       }
  
-       /*
-        * Change the name in the AVL tree.
-        */
-       avl_remove(&sdp->sd_snaps, sep);
-       kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-       sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-       (void) strcpy(sep->se_name, nm);
-       VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
-       avl_insert(&sdp->sd_snaps, sep, where);
+       memcpy(full_path, path_ptr, path_len);
+       full_path[path_len] = '\0';
+out:
+       kmem_free(path_buffer, len);
  
-       /*
-        * Change the current mountpoint info:
-        *      - update the tail of the mntpoint path
-        *      - update the tail of the resource path
-        */
-       pathref = vfs_getmntpoint(vfsp);
-       (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-       VERIFY((tail = strrchr(newpath, '/')) != NULL);
-       *(tail+1) = '\0';
-       ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-       (void) strcat(newpath, nm);
-       refstr_rele(pathref);
-       vfs_setmntpoint(vfsp, newpath, 0);
-
-       pathref = vfs_getresource(vfsp);
-       (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-       VERIFY((tail = strrchr(newpath, '@')) != NULL);
-       *(tail+1) = '\0';
-       ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-       (void) strcat(newpath, nm);
-       refstr_rele(pathref);
-       vfs_setresource(vfsp, newpath, 0);
-
-       vfs_unlock(vfsp);
+       return (error);
  }
  
-/*ARGSUSED*/
+/*
+ * Returns the path of snapshot objsetid: "<mntpoint>/.zfs/snapshot/snap_name"
+ */
  static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
-    cred_t *cr, caller_context_t *ct, int flags)
+zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
+    int path_len, char *full_path)
  {
-       zfsctl_snapdir_t *sdp = sdvp->v_data;
-       zfs_snapentry_t search, *sep;
-       zfsvfs_t *zfsvfs;
-       avl_index_t where;
-       char from[MAXNAMELEN], to[MAXNAMELEN];
-       char real[MAXNAMELEN];
-       int err;
-
-       zfsvfs = sdvp->v_vfsp->vfs_data;
-       ZFS_ENTER(zfsvfs);
-
-       if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-               err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
-                   MAXNAMELEN, NULL);
-               if (err == 0) {
-                       snm = real;
-               } else if (err != ENOTSUP) {
-                       ZFS_EXIT(zfsvfs);
-                       return (err);
-               }
-       }
-
-       ZFS_EXIT(zfsvfs);
+       objset_t *os = zfsvfs->z_os;
+       fstrans_cookie_t cookie;
+       char *snapname;
+       boolean_t case_conflict;
+       uint64_t id, pos = 0;
+       int error = 0;
  
-       err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
-       if (!err)
-               err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
-       if (!err)
-               err = zfs_secpolicy_rename_perms(from, to, cr);
-       if (err)
-               return (err);
+       if (zfsvfs->z_vfs->vfs_mntpoint == NULL)
+               return (SET_ERROR(ENOENT));
  
-       /*
-        * Cannot move snapshots out of the snapdir.
-        */
-       if (sdvp != tdvp)
-               return (EINVAL);
+       cookie = spl_fstrans_mark();
+       snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
  
-       if (strcmp(snm, tnm) == 0)
-               return (0);
+       while (error == 0) {
+               dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+               error = dmu_snapshot_list_next(zfsvfs->z_os,
+                   ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
+                   &case_conflict);
+               dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+               if (error)
+                       goto out;
  
-       mutex_enter(&sdp->sd_lock);
-
-       search.se_name = (char *)snm;
-       if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
-               mutex_exit(&sdp->sd_lock);
-               return (ENOENT);
+               if (id == objsetid)
+                       break;
         }
  
-       err = dmu_objset_rename(from, to, B_FALSE);
-       if (err == 0)
-               zfsctl_rename_snap(sdp, sep, tnm);
+       snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
+           zfsvfs->z_vfs->vfs_mntpoint, snapname);
+out:
+       kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+       spl_fstrans_unmark(cookie);
  
-       mutex_exit(&sdp->sd_lock);
-
-       return (err);
+       return (error);
  }
  
-/* ARGSUSED */
-static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
+/*
+ * Special case the handling of "..".
+ */
+int
+zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
  {
-       zfsctl_snapdir_t *sdp = dvp->v_data;
-       zfs_snapentry_t *sep;
-       zfs_snapentry_t search;
-       zfsvfs_t *zfsvfs;
-       char snapname[MAXNAMELEN];
-       char real[MAXNAMELEN];
-       int err;
+       zfsvfs_t *zfsvfs = ITOZSB(dip);
+       int error = 0;
  
-       zfsvfs = dvp->v_vfsp->vfs_data;
         ZFS_ENTER(zfsvfs);
  
-       if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-
-               err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
-                   MAXNAMELEN, NULL);
-               if (err == 0) {
-                       name = real;
-               } else if (err != ENOTSUP) {
-                       ZFS_EXIT(zfsvfs);
-                       return (err);
-               }
-       }
-
-       ZFS_EXIT(zfsvfs);
-
-       err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
-       if (!err)
-               err = zfs_secpolicy_destroy_perms(snapname, cr);
-       if (err)
-               return (err);
-
-       mutex_enter(&sdp->sd_lock);
-
-       search.se_name = name;
-       sep = avl_find(&sdp->sd_snaps, &search, NULL);
-       if (sep) {
-               avl_remove(&sdp->sd_snaps, sep);
-               err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
-               if (err)
-                       avl_add(&sdp->sd_snaps, sep);
-               else
-                       err = dmu_objset_destroy(snapname, B_FALSE);
+       if (strcmp(name, "..") == 0) {
+               *ipp = dip->i_sb->s_root->d_inode;
+       } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+               *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
+                   &zpl_fops_snapdir, &zpl_ops_snapdir);
+       } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+               *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
+                   &zpl_fops_shares, &zpl_ops_shares);
         } else {
-               err = ENOENT;
+               *ipp = NULL;
         }
  
-       mutex_exit(&sdp->sd_lock);
+       if (*ipp == NULL)
+               error = SET_ERROR(ENOENT);
+
+       ZFS_EXIT(zfsvfs);
  
-       return (err);
+       return (error);
  }
  
  /*
- * This creates a snapshot under '.zfs/snapshot'.
+ * Lookup entry point for the 'snapshot' directory.  Try to open the
+ * snapshot if it exists, creating the pseudo filesystem inode as necessary.
   */
-/* ARGSUSED */
-static int
-zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
-    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
+int
+zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
  {
-       zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-       char name[MAXNAMELEN];
-       int err;
-       static enum symfollow follow = NO_FOLLOW;
-       static enum uio_seg seg = UIO_SYSSPACE;
-
-       if (snapshot_namecheck(dirname, NULL, NULL) != 0)
-               return (EILSEQ);
+       zfsvfs_t *zfsvfs = ITOZSB(dip);
+       uint64_t id;
+       int error;
  
-       dmu_objset_name(zfsvfs->z_os, name);
+       ZFS_ENTER(zfsvfs);
  
-       *vpp = NULL;
+       error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
+       if (error) {
+               ZFS_EXIT(zfsvfs);
+               return (error);
+       }
  
-       err = zfs_secpolicy_snapshot_perms(name, cr);
-       if (err)
-               return (err);
+       *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
+           &simple_dir_operations, &simple_dir_inode_operations);
+       if (*ipp == NULL)
+               error = SET_ERROR(ENOENT);
  
-       if (err == 0) {
-               err = dmu_objset_snapshot(name, dirname, NULL, NULL,
-                   B_FALSE, B_FALSE, -1);
-               if (err)
-                       return (err);
-               err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
-       }
+       ZFS_EXIT(zfsvfs);
  
-       return (err);
+       return (error);
  }
  
  /*
- * Lookup entry point for the 'snapshot' directory.  Try to open the
- * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
- * Perform a mount of the associated dataset on top of the vnode.
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name.  The rename is confined
+ * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
   */
-/* ARGSUSED */
-static int
-zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
+int
+zfsctl_snapdir_rename(struct inode *sdip, char *snm,
+    struct inode *tdip, char *tnm, cred_t *cr, int flags)
  {
-       zfsctl_snapdir_t *sdp = dvp->v_data;
-       objset_t *snap;
-       char snapname[MAXNAMELEN];
-       char real[MAXNAMELEN];
-       char *mountpoint;
-       zfs_snapentry_t *sep, search;
-       struct mounta margs;
-       vfs_t *vfsp;
-       size_t mountpoint_len;
-       avl_index_t where;
-       zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-       int err;
-
-       /*
-        * No extended attributes allowed under .zfs
-        */
-       if (flags & LOOKUP_XATTR)
-               return (EINVAL);
-
-       ASSERT(dvp->v_type == VDIR);
+       zfsvfs_t *zfsvfs = ITOZSB(sdip);
+       char *to, *from, *real, *fsname;
+       int error;
  
-       /*
-        * If we get a recursive call, that means we got called
-        * from the domount() code while it was trying to look up the
-        * spec (which looks like a local path for zfs).  We need to
-        * add some flag to domount() to tell it not to do this lookup.
-        */
-       if (MUTEX_HELD(&sdp->sd_lock))
-               return (ENOENT);
+       if (!zfs_admin_snapshot)
+               return (SET_ERROR(EACCES));
  
         ZFS_ENTER(zfsvfs);
  
-       if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
-               ZFS_EXIT(zfsvfs);
-               return (0);
-       }
-
-       if (flags & FIGNORECASE) {
-               boolean_t conflict = B_FALSE;
+       to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+       from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+       real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+       fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
  
-               err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
-                   MAXNAMELEN, &conflict);
-               if (err == 0) {
-                       nm = real;
-               } else if (err != ENOTSUP) {
-                       ZFS_EXIT(zfsvfs);
-                       return (err);
+       if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+               error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+                   ZFS_MAX_DATASET_NAME_LEN, NULL);
+               if (error == 0) {
+                       snm = real;
+               } else if (error != ENOTSUP) {
+                       goto out;
                 }
-               if (realpnp)
-                       (void) strlcpy(realpnp->pn_buf, nm,
-                           realpnp->pn_bufsize);
-               if (conflict && direntflags)
-                       *direntflags = ED_CASE_CONFLICT;
         }
  
-       mutex_enter(&sdp->sd_lock);
-       search.se_name = (char *)nm;
-       if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
-               *vpp = sep->se_root;
-               VN_HOLD(*vpp);
-               err = traverse(vpp);
-               if (err) {
-                       VN_RELE(*vpp);
-                       *vpp = NULL;
-               } else if (*vpp == sep->se_root) {
-                       /*
-                        * The snapshot was unmounted behind our backs,
-                        * try to remount it.
-                        */
-                       goto domount;
-               } else {
-                       /*
-                        * VROOT was set during the traverse call.  We need
-                        * to clear it since we're pretending to be part
-                        * of our parent's vfs.
-                        */
-                       (*vpp)->v_flag &= ~VROOT;
-               }
-               mutex_exit(&sdp->sd_lock);
-               ZFS_EXIT(zfsvfs);
-               return (err);
-       }
+       dmu_objset_name(zfsvfs->z_os, fsname);
+
+       error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
+           ZFS_MAX_DATASET_NAME_LEN, from);
+       if (error == 0)
+               error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
+                   ZFS_MAX_DATASET_NAME_LEN, to);
+       if (error == 0)
+               error = zfs_secpolicy_rename_perms(from, to, cr);
+       if (error != 0)
+               goto out;
  
         /*
-        * The requested snapshot is not currently mounted, look it up.
+        * Cannot move snapshots out of the snapdir.
          */
-       err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
-       if (err) {
-               mutex_exit(&sdp->sd_lock);
-               ZFS_EXIT(zfsvfs);
-               /*
-                * handle "ls *" or "?" in a graceful manner,
-                * forcing EILSEQ to ENOENT.
-                * Since shell ultimately passes "*" or "?" as name to lookup
-                */
-               return (err == EILSEQ ? ENOENT : err);
+       if (sdip != tdip) {
+               error = SET_ERROR(EINVAL);
+               goto out;
         }
-       if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
-               mutex_exit(&sdp->sd_lock);
-               ZFS_EXIT(zfsvfs);
-               return (ENOENT);
-       }
-
-       sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
-       sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-       (void) strcpy(sep->se_name, nm);
-       *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
-       avl_insert(&sdp->sd_snaps, sep, where);
-
-       dmu_objset_rele(snap, FTAG);
-domount:
-       mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
-           strlen("/.zfs/snapshot/") + strlen(nm) + 1;
-       mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
-       (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
-           refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
-
-       margs.spec = snapname;
-       margs.dir = mountpoint;
-       margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
-       margs.fstype = "zfs";
-       margs.dataptr = NULL;
-       margs.datalen = 0;
-       margs.optptr = NULL;
-       margs.optlen = 0;
-
-       err = domount("zfs", &margs, *vpp, kcred, &vfsp);
-       kmem_free(mountpoint, mountpoint_len);
-
-       if (err == 0) {
-               /*
-                * Return the mounted root rather than the covered mount point.
-                * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
-                * the ZFS vnode mounted on top of the GFS node.  This ZFS
-                * vnode is the root of the newly created vfsp.
-                */
-               VFS_RELE(vfsp);
-               err = traverse(vpp);
-       }
-
-       if (err == 0) {
-               /*
-                * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
-                *
-                * This is where we lie about our v_vfsp in order to
-                * make .zfs/snapshot/<snapname> accessible over NFS
-                * without requiring manual mounts of <snapname>.
-                */
-               ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
-               VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
-               (*vpp)->v_vfsp = zfsvfs->z_vfs;
-               (*vpp)->v_flag &= ~VROOT;
-       }
-       mutex_exit(&sdp->sd_lock);
-       ZFS_EXIT(zfsvfs);
  
         /*
-        * If we had an error, drop our hold on the vnode and
-        * zfsctl_snapshot_inactive() will clean up.
+        * No-op when names are identical.
          */
-       if (err) {
-               VN_RELE(*vpp);
-               *vpp = NULL;
+       if (strcmp(snm, tnm) == 0) {
+               error = 0;
+               goto out;
         }
-       return (err);
-}
  
-/* ARGSUSED */
-static int
-zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
-{
-       zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-       znode_t *dzp;
-       int error;
+       rw_enter(&zfs_snapshot_lock, RW_WRITER);
  
-       ZFS_ENTER(zfsvfs);
+       error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
+       if (error == 0)
+               (void) zfsctl_snapshot_rename(snm, tnm);
  
-       if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
-               ZFS_EXIT(zfsvfs);
-               return (0);
-       }
+       rw_exit(&zfs_snapshot_lock);
+out:
+       kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
+       kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
+       kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+       kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
  
-       if (zfsvfs->z_shares_dir == 0) {
-               ZFS_EXIT(zfsvfs);
-               return (ENOTSUP);
-       }
-       if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
-               error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp,
-                   flags, rdir, cr, ct, direntflags, realpnp);
-
-       VN_RELE(ZTOV(dzp));
         ZFS_EXIT(zfsvfs);
  
         return (error);
  }
  
-/* ARGSUSED */
-static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
-    offset_t *offp, offset_t *nextp, void *data, int flags)
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
+int
+zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
  {
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-       char snapname[MAXNAMELEN];
-       uint64_t id, cookie;
-       boolean_t case_conflict;
+       zfsvfs_t *zfsvfs = ITOZSB(dip);
+       char *snapname, *real;
         int error;
  
+       if (!zfs_admin_snapshot)
+               return (SET_ERROR(EACCES));
+
         ZFS_ENTER(zfsvfs);
  
-       cookie = *offp;
-       error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
-           &cookie, &case_conflict);
-       if (error) {
-               ZFS_EXIT(zfsvfs);
-               if (error == ENOENT) {
-                       *eofp = 1;
-                       return (0);
+       snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+       real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+       if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+               error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+                   ZFS_MAX_DATASET_NAME_LEN, NULL);
+               if (error == 0) {
+                       name = real;
+               } else if (error != ENOTSUP) {
+                       goto out;
                 }
-               return (error);
         }
  
-       if (flags & V_RDDIR_ENTFLAGS) {
-               edirent_t *eodp = dp;
-
-               (void) strcpy(eodp->ed_name, snapname);
-               eodp->ed_ino = ZFSCTL_INO_SNAP(id);
-               eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
-       } else {
-               struct dirent64 *odp = dp;
+       error = zfsctl_snapshot_name(ITOZSB(dip), name,
+           ZFS_MAX_DATASET_NAME_LEN, snapname);
+       if (error == 0)
+               error = zfs_secpolicy_destroy_perms(snapname, cr);
+       if (error != 0)
+               goto out;
  
-               (void) strcpy(odp->d_name, snapname);
-               odp->d_ino = ZFSCTL_INO_SNAP(id);
-       }
-       *nextp = cookie;
+       error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
+       if ((error == 0) || (error == ENOENT))
+               error = dsl_destroy_snapshot(snapname, B_FALSE);
+out:
+       kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+       kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
  
         ZFS_EXIT(zfsvfs);
  
-       return (0);
+       return (error);
  }
  
-/* ARGSUSED */
-static int
-zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
-    caller_context_t *ct, int flags)
+/*
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
+ */
+int
+zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
+    struct inode **ipp, cred_t *cr, int flags)
  {
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-       znode_t *dzp;
+       zfsvfs_t *zfsvfs = ITOZSB(dip);
+       char *dsname;
         int error;
  
-       ZFS_ENTER(zfsvfs);
+       if (!zfs_admin_snapshot)
+               return (SET_ERROR(EACCES));
  
-       if (zfsvfs->z_shares_dir == 0) {
-               ZFS_EXIT(zfsvfs);
-               return (ENOTSUP);
+       dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+       if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
+               error = SET_ERROR(EILSEQ);
+               goto out;
         }
-       if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-               error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags);
-               VN_RELE(ZTOV(dzp));
-       } else {
-               *eofp = 1;
-               error = ENOENT;
+
+       dmu_objset_name(zfsvfs->z_os, dsname);
+
+       error = zfs_secpolicy_snapshot_perms(dsname, cr);
+       if (error != 0)
+               goto out;
+
+       if (error == 0) {
+               error = dmu_objset_snapshot_one(dsname, dirname);
+               if (error != 0)
+                       goto out;
+
+               error = zfsctl_snapdir_lookup(dip, dirname, ipp,
+                   0, cr, NULL, NULL);
         }
+out:
+       kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
  
-       ZFS_EXIT(zfsvfs);
         return (error);
  }
  
  /*
- * pvp is the '.zfs' directory (zfsctl_node_t).
- * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
- *
- * This function is the callback to create a GFS vnode for '.zfs/snapshot'
- * when a lookup is performed on .zfs for "snapshot".
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed; it is just a
+ * best effort.  In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
   */
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
+int
+zfsctl_snapshot_unmount(char *snapname, int flags)
  {
-       vnode_t *vp;
-       zfsctl_snapdir_t *sdp;
-
-       vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
-           zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
-           zfsctl_snapdir_readdir_cb, NULL);
-       sdp = vp->v_data;
-       sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
-       sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-       mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
-       avl_create(&sdp->sd_snaps, snapentry_compare,
-           sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
-       return (vp);
-}
+       char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
+           NULL };
+       char *envp[] = { NULL };
+       zfs_snapentry_t *se;
+       int error;
  
-vnode_t *
-zfsctl_mknode_shares(vnode_t *pvp)
-{
-       vnode_t *vp;
-       zfsctl_node_t *sdp;
+       rw_enter(&zfs_snapshot_lock, RW_READER);
+       if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
+               rw_exit(&zfs_snapshot_lock);
+               return (SET_ERROR(ENOENT));
+       }
+       rw_exit(&zfs_snapshot_lock);
  
-       vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
-           zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
-           NULL, NULL);
-       sdp = vp->v_data;
-       sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-       return (vp);
+       if (flags & MNT_FORCE)
+               argv[4] = "-fn";
+       argv[5] = se->se_path;
+       dprintf("unmount; path=%s\n", se->se_path);
+       error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+       zfsctl_snapshot_rele(se);
  
+
+       /*
+        * The umount system utility will return 256 on error.  We must
+        * assume this error is because the file system is busy so it is
+        * converted to the more sensible EBUSY.
+        */
+       if (error)
+               error = SET_ERROR(EBUSY);
+
+       return (error);
  }
  
-/* ARGSUSED */
-static int
-zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
+#define        MOUNT_BUSY 0x80         /* Mount failed due to EBUSY (from mntent.h) */
+
+int
+zfsctl_snapshot_mount(struct path *path, int flags)
  {
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-       znode_t *dzp;
+       struct dentry *dentry = path->dentry;
+       struct inode *ip = dentry->d_inode;
+       zfsvfs_t *zfsvfs;
+       zfsvfs_t *snap_zfsvfs;
+       zfs_snapentry_t *se;
+       char *full_name, *full_path;
+       char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
+           NULL };
+       char *envp[] = { NULL };
         int error;
+       struct path spath;
+
+       if (ip == NULL)
+               return (SET_ERROR(EISDIR));
  
+       zfsvfs = ITOZSB(ip);
         ZFS_ENTER(zfsvfs);
-       if (zfsvfs->z_shares_dir == 0) {
-               ZFS_EXIT(zfsvfs);
-               return (ENOTSUP);
-       }
-       if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-               error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct);
-               VN_RELE(ZTOV(dzp));
-       }
-       ZFS_EXIT(zfsvfs);
-       return (error);
  
+       full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+       full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
  
-}
+       error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
+           ZFS_MAX_DATASET_NAME_LEN, full_name);
+       if (error)
+               goto error;
  
-/* ARGSUSED */
-static int
-zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
-{
-       zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-       zfsctl_snapdir_t *sdp = vp->v_data;
+       error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path);
+       if (error)
+               goto error;
  
-       ZFS_ENTER(zfsvfs);
-       zfsctl_common_getattr(vp, vap);
-       vap->va_nodeid = gfs_file_inode(vp);
-       vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
-       vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
-       ZFS_EXIT(zfsvfs);
+       /*
+        * Multiple concurrent automounts of a snapshot are never allowed.
+        * The snapshot may be manually mounted as many times as desired.
+        */
+       if (zfsctl_snapshot_ismounted(full_name)) {
+               error = 0;
+               goto error;
+       }
  
-       return (0);
-}
+       /*
+        * Attempt to mount the snapshot from user space.  Normally this
+        * would be done using the vfs_kern_mount() function, however that
+        * function is marked GPL-only and cannot be used.  On error we
+        * careful to log the real error to the console and return EISDIR
+        * to safely abort the automount.  This should be very rare.
+        *
+        * If the user mode helper happens to return EBUSY, a concurrent
+        * mount is already in progress in which case the error is ignored.
+        * Take note that if the program was executed successfully the return
+        * value from call_usermodehelper() will be ((exitcode << 8) + signal).
+        */
+       dprintf("mount; name=%s path=%s\n", full_name, full_path);
+       argv[5] = full_name;
+       argv[6] = full_path;
+       error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+       if (error) {
+               if (!(error & MOUNT_BUSY << 8)) {
+                       cmn_err(CE_WARN, "Unable to automount %s/%s: %d",
+                           full_path, full_name, error);
+                       error = SET_ERROR(EISDIR);
+               } else {
+                       /*
+                        * EBUSY, this could mean a concurrent mount, or the
+                        * snapshot has already been mounted at a completely
+                        * different place. We return 0 so VFS will retry. For
+                        * the latter case the VFS will retry several times
+                        * and return ELOOP, which is probably not a very good
+                        * behavior.
+                        */
+                       error = 0;
+               }
+               goto error;
+       }
  
-/* ARGSUSED */
-static void
-zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
-       zfsctl_snapdir_t *sdp = vp->v_data;
-       void *private;
-
-       private = gfs_dir_inactive(vp);
-       if (private != NULL) {
-               ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
-               mutex_destroy(&sdp->sd_lock);
-               avl_destroy(&sdp->sd_snaps);
-               kmem_free(private, sizeof (zfsctl_snapdir_t));
+       /*
+        * Follow down into the mounted snapshot and set MNT_SHRINKABLE
+        * to identify this as an automounted filesystem.
+        */
+       spath = *path;
+       path_get(&spath);
+       if (zpl_follow_down_one(&spath)) {
+               snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
+               snap_zfsvfs->z_parent = zfsvfs;
+               dentry = spath.dentry;
+               spath.mnt->mnt_flags |= MNT_SHRINKABLE;
+
+               rw_enter(&zfs_snapshot_lock, RW_WRITER);
+               se = zfsctl_snapshot_alloc(full_name, full_path,
+                   snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
+                   dentry);
+               zfsctl_snapshot_add(se);
+               zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+               rw_exit(&zfs_snapshot_lock);
         }
-}
+       path_put(&spath);
+error:
+       kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
+       kmem_free(full_path, MAXPATHLEN);
  
-static const fs_operation_def_t zfsctl_tops_snapdir[] = {
-       { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
-       { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
-       { VOPNAME_IOCTL,        { .error = fs_inval }                   },
-       { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_snapdir_getattr } },
-       { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
-       { VOPNAME_RENAME,       { .vop_rename = zfsctl_snapdir_rename } },
-       { VOPNAME_RMDIR,        { .vop_rmdir = zfsctl_snapdir_remove }  },
-       { VOPNAME_MKDIR,        { .vop_mkdir = zfsctl_snapdir_mkdir }   },
-       { VOPNAME_READDIR,      { .vop_readdir = gfs_vop_readdir }      },
-       { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_snapdir_lookup } },
-       { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
-       { VOPNAME_INACTIVE,     { .vop_inactive = zfsctl_snapdir_inactive } },
-       { VOPNAME_FID,          { .vop_fid = zfsctl_common_fid }        },
-       { NULL }
-};
-
-static const fs_operation_def_t zfsctl_tops_shares[] = {
-       { VOPNAME_OPEN,         { .vop_open = zfsctl_common_open }      },
-       { VOPNAME_CLOSE,        { .vop_close = zfsctl_common_close }    },
-       { VOPNAME_IOCTL,        { .error = fs_inval }                   },
-       { VOPNAME_GETATTR,      { .vop_getattr = zfsctl_shares_getattr } },
-       { VOPNAME_ACCESS,       { .vop_access = zfsctl_common_access }  },
-       { VOPNAME_READDIR,      { .vop_readdir = zfsctl_shares_readdir } },
-       { VOPNAME_LOOKUP,       { .vop_lookup = zfsctl_shares_lookup }  },
-       { VOPNAME_SEEK,         { .vop_seek = fs_seek }                 },
-       { VOPNAME_INACTIVE,     { .vop_inactive = gfs_vop_inactive } },
-       { VOPNAME_FID,          { .vop_fid = zfsctl_shares_fid } },
-       { NULL }
-};
+       ZFS_EXIT(zfsvfs);
+
+       return (error);
+}
  
  /*
- * pvp is the GFS vnode '.zfs/snapshot'.
- *
- * This creates a GFS node under '.zfs/snapshot' representing each
- * snapshot.  This newly created GFS node is what we mount snapshot
- * vfs_t's ontop of.
+ * Get the snapdir inode from fid
   */
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
+int
+zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
+    struct inode **ipp)
  {
-       vnode_t *vp;
-       zfsctl_node_t *zcp;
-
-       vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
-           zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
-       zcp = vp->v_data;
-       zcp->zc_id = objset;
-
-       return (vp);
-}
+       int error;
+       struct path path;
+       char *mnt;
+       struct dentry *dentry;
  
-static void
-zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
-       zfsctl_snapdir_t *sdp;
-       zfs_snapentry_t *sep, *next;
-       vnode_t *dvp;
+       mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
  
-       VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
-       sdp = dvp->v_data;
+       error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
+           MAXPATHLEN, mnt);
+       if (error)
+               goto out;
  
-       mutex_enter(&sdp->sd_lock);
+       /* Trigger automount */
+       error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+       if (error)
+               goto out;
  
-       if (vp->v_count > 1) {
-               mutex_exit(&sdp->sd_lock);
-               return;
+       path_put(&path);
+       /*
+        * Get the snapdir inode. Note, we don't want to use the above
+        * path because it contains the root of the snapshot rather
+        * than the snapdir.
+        */
+       *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
+       if (*ipp == NULL) {
+               error = SET_ERROR(ENOENT);
+               goto out;
         }
-       ASSERT(!vn_ismntpt(vp));
  
-       sep = avl_first(&sdp->sd_snaps);
-       while (sep != NULL) {
-               next = AVL_NEXT(&sdp->sd_snaps, sep);
-
-               if (sep->se_root == vp) {
-                       avl_remove(&sdp->sd_snaps, sep);
-                       kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-                       kmem_free(sep, sizeof (zfs_snapentry_t));
-                       break;
-               }
-               sep = next;
+       /* check gen, see zfsctl_snapdir_fid */
+       dentry = d_obtain_alias(igrab(*ipp));
+       if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) {
+               iput(*ipp);
+               *ipp = NULL;
+               error = SET_ERROR(ENOENT);
         }
-       ASSERT(sep != NULL);
-
-       mutex_exit(&sdp->sd_lock);
-       VN_RELE(dvp);
-
-       /*
-        * Dispose of the vnode for the snapshot mount point.
-        * This is safe to do because once this entry has been removed
-        * from the AVL tree, it can't be found again, so cannot become
-        * "active".  If we lookup the same name again we will end up
-        * creating a new vnode.
-        */
-       gfs_vop_inactive(vp, cr, ct);
+       if (!IS_ERR(dentry))
+               dput(dentry);
+out:
+       kmem_free(mnt, MAXPATHLEN);
+       return (error);
  }
  
-
-/*
- * These VP's should never see the light of day.  They should always
- * be covered.
- */
-static const fs_operation_def_t zfsctl_tops_snapshot[] = {
-       VOPNAME_INACTIVE, { .vop_inactive =  zfsctl_snapshot_inactive },
-       NULL, NULL
-};
-
  int
-zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
  {
-       zfsvfs_t *zfsvfs = vfsp->vfs_data;
-       vnode_t *dvp, *vp;
-       zfsctl_snapdir_t *sdp;
-       zfsctl_node_t *zcp;
-       zfs_snapentry_t *sep;
+       zfsvfs_t *zfsvfs = ITOZSB(dip);
+       struct inode *ip;
+       znode_t *dzp;
         int error;
  
-       ASSERT(zfsvfs->z_ctldir != NULL);
-       error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-           NULL, 0, NULL, kcred, NULL, NULL, NULL);
-       if (error != 0)
-               return (error);
-       sdp = dvp->v_data;
-
-       mutex_enter(&sdp->sd_lock);
-       sep = avl_first(&sdp->sd_snaps);
-       while (sep != NULL) {
-               vp = sep->se_root;
-               zcp = vp->v_data;
-               if (zcp->zc_id == objsetid)
-                       break;
+       ZFS_ENTER(zfsvfs);
  
-               sep = AVL_NEXT(&sdp->sd_snaps, sep);
+       if (zfsvfs->z_shares_dir == 0) {
+               ZFS_EXIT(zfsvfs);
+               return (SET_ERROR(ENOTSUP));
         }
  
-       if (sep != NULL) {
-               VN_HOLD(vp);
-               /*
-                * Return the mounted root rather than the covered mount point.
-                * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
-                * and returns the ZFS vnode mounted on top of the GFS node.
-                * This ZFS vnode is the root of the vfs for objset 'objsetid'.
-                */
-               error = traverse(&vp);
-               if (error == 0) {
-                       if (vp == sep->se_root)
-                               error = EINVAL;
-                       else
-                               *zfsvfsp = VTOZ(vp)->z_zfsvfs;
-               }
-               mutex_exit(&sdp->sd_lock);
-               VN_RELE(vp);
-       } else {
-               error = EINVAL;
-               mutex_exit(&sdp->sd_lock);
+       if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+               error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL);
+               iput(ZTOI(dzp));
         }
  
-       VN_RELE(dvp);
+       ZFS_EXIT(zfsvfs);
  
         return (error);
  }
  
  /*
- * Unmount any snapshots for the given filesystem.  This is called from
- * zfs_umount() - if we have a ctldir, then go through and unmount all the
- * snapshots.
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories.  Currently this is unused but available.
   */
-int
-zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+void
+zfsctl_init(void)
  {
-       zfsvfs_t *zfsvfs = vfsp->vfs_data;
-       vnode_t *dvp;
-       zfsctl_snapdir_t *sdp;
-       zfs_snapentry_t *sep, *next;
-       int error;
+       avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
+           sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+           se_node_name));
+       avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
+           sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+           se_node_objsetid));
+       rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
+}
  
-       ASSERT(zfsvfs->z_ctldir != NULL);
-       error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-           NULL, 0, NULL, cr, NULL, NULL, NULL);
-       if (error != 0)
-               return (error);
-       sdp = dvp->v_data;
-
-       mutex_enter(&sdp->sd_lock);
-
-       sep = avl_first(&sdp->sd_snaps);
-       while (sep != NULL) {
-               next = AVL_NEXT(&sdp->sd_snaps, sep);
-
-               /*
-                * If this snapshot is not mounted, then it must
-                * have just been unmounted by somebody else, and
-                * will be cleaned up by zfsctl_snapdir_inactive().
-                */
-               if (vn_ismntpt(sep->se_root)) {
-                       avl_remove(&sdp->sd_snaps, sep);
-                       error = zfsctl_unmount_snap(sep, fflags, cr);
-                       if (error) {
-                               avl_add(&sdp->sd_snaps, sep);
-                               break;
-                       }
-               }
-               sep = next;
-       }
+/*
+ * Cleanup the various pieces we needed for .zfs directories: destroy the
+ * by-name and by-objsetid snapshot AVL trees and zfs_snapshot_lock.
+ */
+void
+zfsctl_fini(void)
+{
+       avl_destroy(&zfs_snapshots_by_name);
+       avl_destroy(&zfs_snapshots_by_objsetid);
+       rw_destroy(&zfs_snapshot_lock);
+}
  
-       mutex_exit(&sdp->sd_lock);
-       VN_RELE(dvp);
+module_param(zfs_admin_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
  
-       return (error);
-}
-#endif /* HAVE_ZPL */
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");