Fix 'zfs rollback' on mounted file systems

author Brian Behlendorf <behlendorf1@llnl.gov>

Wed, 16 Jan 2013 00:41:09 +0000 (16:41 -0800)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Thu, 17 Jan 2013 17:51:20 +0000 (09:51 -0800)
author Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 16 Jan 2013 00:41:09 +0000 (16:41 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 17 Jan 2013 17:51:20 +0000 (09:51 -0800)
diff --git a/include/linux/vfs_compat.h b/include/linux/vfs_compat.h

index c4e1771ae59fc4b405636d5d8c987aa96e5d97e2..c9fa76ecef0eeb89c04ba247041a57c124e8c121 100644 (file)
--- a/include/linux/vfs_compat.h
+++ b/include/linux/vfs_compat.h
@@ -94,6 +94,14 @@ bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap)
  }
  #endif /* HAVE_BDI && !HAVE_BDI_SETUP_AND_REGISTER */
  
+/*
+ * 2.6.38 API change,
+ * LOOKUP_RCU flag introduced to distinguish rcu-walk from ref-walk cases.
+ */
+#ifndef LOOKUP_RCU
+#define LOOKUP_RCU      0x0
+#endif /* LOOKUP_RCU */
+
  /*
   * 3.2-rc1 API change,
   * Add set_nlink() if it is not exported by the Linux kernel.
diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h

index 4dd46710f72bf20a90d19990c67a4291bde95127..f685c1296401493c3b2765f5727a5ff97e9a957e 100644 (file)
--- a/include/sys/zfs_vfsops.h
+++ b/include/sys/zfs_vfsops.h
@@ -69,6 +69,7 @@ typedef struct zfs_sb {
         krwlock_t       z_teardown_inactive_lock;
         list_t          z_all_znodes;   /* all znodes in the fs */
         uint64_t        z_nr_znodes;    /* number of znodes in the fs */
+       unsigned long   z_rollback_time;/* last online rollback time */
         kmutex_t        z_znodes_lock;  /* lock for z_all_znodes */
         struct inode    *z_ctldir;      /* .zfs directory inode */
         avl_tree_t      z_ctldir_snaps; /* .zfs/snapshot entries */
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h

index 0b75d52955ae95b82b94231f5bcc7b8f729e47be..41233547be78e6446efcc5af837ccdbb90fa8f60 100644 (file)
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -216,6 +216,7 @@ typedef struct znode {
         boolean_t       z_is_zvol;      /* are we used by the zvol */
         boolean_t       z_is_mapped;    /* are we mmap'ed */
         boolean_t       z_is_ctldir;    /* are we .zfs entry */
+       boolean_t       z_is_stale;     /* are we stale due to rollback? */
         struct inode    z_inode;        /* generic vfs inode */
  } znode_t;
  
diff --git a/include/sys/zpl.h b/include/sys/zpl.h

index e34b323bdb18f4ca3218f7e92c762500cbe350b2..61a57ef297a5ba1b3bd7730b713484057e028573 100644 (file)
--- a/include/sys/zpl.h
+++ b/include/sys/zpl.h
@@ -28,18 +28,20 @@
  #include <sys/vfs.h>
  #include <linux/vfs_compat.h>
  #include <linux/xattr_compat.h>
+#include <linux/dcache_compat.h>
  #include <linux/exportfs.h>
  #include <linux/writeback.h>
  #include <linux/falloc.h>
  
  /* zpl_inode.c */
  extern void zpl_vap_init(vattr_t *vap, struct inode *dir,
-    struct dentry *dentry, zpl_umode_t mode, cred_t *cr);
+    zpl_umode_t mode, cred_t *cr);
  
  extern const struct inode_operations zpl_inode_operations;
  extern const struct inode_operations zpl_dir_inode_operations;
  extern const struct inode_operations zpl_symlink_inode_operations;
  extern const struct inode_operations zpl_special_inode_operations;
+extern dentry_operations_t zpl_dentry_operations;
  
  /* zpl_file.c */
  extern ssize_t zpl_read_common(struct inode *ip, const char *buf,
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c

index 622d865df7728d1cde9a603d07ab3ef6e3a1d21c..b3801d4949fcdb189171218a6a2be1b2178cbaa4 100644 (file)
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -197,6 +197,7 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
         zp->z_is_mapped = B_FALSE;
         zp->z_is_ctldir = B_TRUE;
         zp->z_is_sa = B_FALSE;
+       zp->z_is_stale = B_FALSE;
         ip->i_ino = id;
         ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO);
         ip->i_uid = 0;
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c

index fc5c2ba397f17c0c106edc02608f97dcb5750b6d..ac5c317ce8eb17e792fcbf0c285acf596c8dae76 100644 (file)
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -1032,7 +1032,7 @@ EXPORT_SYMBOL(zfs_sb_prune);
  #endif /* HAVE_SHRINK */
  
  /*
- * Teardown the zfs_sb_t::z_os.
+ * Teardown the zfs_sb_t.
   *
   * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
   * and 'z_teardown_inactive_lock' held.
@@ -1053,7 +1053,6 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
                  * for non-snapshots.
                  */
                 shrink_dcache_sb(zsb->z_parent->z_sb);
-               (void) spl_invalidate_inodes(zsb->z_parent->z_sb, 0);
         }
  
         /*
@@ -1079,25 +1078,26 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
         }
  
         /*
-        * At this point there are no vops active, and any new vops will
-        * fail with EIO since we have z_teardown_lock for writer (only
-        * relavent for forced unmount).
+        * At this point there are no VFS ops active, and any new VFS ops
+        * will fail with EIO since we have z_teardown_lock for writer (only
+        * relevant for forced unmount).
          *
          * Release all holds on dbufs.
          */
         mutex_enter(&zsb->z_znodes_lock);
         for (zp = list_head(&zsb->z_all_znodes); zp != NULL;
-           zp = list_next(&zsb->z_all_znodes, zp))
+           zp = list_next(&zsb->z_all_znodes, zp)) {
                 if (zp->z_sa_hdl) {
                         ASSERT(atomic_read(&ZTOI(zp)->i_count) > 0);
                         zfs_znode_dmu_fini(zp);
                 }
+       }
         mutex_exit(&zsb->z_znodes_lock);
  
         /*
-        * If we are unmounting, set the unmounted flag and let new vops
+        * If we are unmounting, set the unmounted flag and let new VFS ops
          * unblock.  zfs_inactive will have the unmounted behavior, and all
-        * other vops will fail with EIO.
+        * other VFS ops will fail with EIO.
          */
         if (unmounting) {
                 zsb->z_unmounted = B_TRUE;
@@ -1392,7 +1392,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
  EXPORT_SYMBOL(zfs_vget);
  
  /*
- * Block out VOPs and close zfs_sb_t::z_os
+ * Block out VFS ops and close zfs_sb_t
   *
   * Note, if successful, then we return with the 'z_teardown_lock' and
   * 'z_teardown_inactive_lock' write held.
@@ -1404,6 +1404,7 @@ zfs_suspend_fs(zfs_sb_t *zsb)
  
         if ((error = zfs_sb_teardown(zsb, B_FALSE)) != 0)
                 return (error);
+
         dmu_objset_disown(zsb->z_os, zsb);
  
         return (0);
@@ -1411,7 +1412,7 @@ zfs_suspend_fs(zfs_sb_t *zsb)
  EXPORT_SYMBOL(zfs_suspend_fs);
  
  /*
- * Reopen zfs_sb_t::z_os and release VOPs.
+ * Reopen zfs_sb_t and release VFS ops.
   */
  int
  zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
@@ -1440,30 +1441,37 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
                         goto bail;
  
                 VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0);
+               zsb->z_rollback_time = jiffies;
  
                 /*
-                * Attempt to re-establish all the active znodes with
-                * their dbufs.  If a zfs_rezget() fails, then we'll let
-                * any potential callers discover that via ZFS_ENTER_VERIFY_VP
-                * when they try to use their znode.
+                * Attempt to re-establish all the active inodes with their
+                * dbufs.  If a zfs_rezget() fails, then we unhash the inode
+                * and mark it stale.  This prevents a collision if a new
+                * inode/object is created which must use the same inode
+                * number.  The stale inode will be be released when the
+                * VFS prunes the dentry holding the remaining references
+                * on the stale inode.
                  */
                 mutex_enter(&zsb->z_znodes_lock);
                 for (zp = list_head(&zsb->z_all_znodes); zp;
                     zp = list_next(&zsb->z_all_znodes, zp)) {
-                       (void) zfs_rezget(zp);
+                       err2 = zfs_rezget(zp);
+                       if (err2) {
+                               remove_inode_hash(ZTOI(zp));
+                               zp->z_is_stale = B_TRUE;
+                       }
                 }
                 mutex_exit(&zsb->z_znodes_lock);
-
         }
  
  bail:
-       /* release the VOPs */
+       /* release the VFS ops */
         rw_exit(&zsb->z_teardown_inactive_lock);
         rrw_exit(&zsb->z_teardown_lock, FTAG);
  
         if (err) {
                 /*
-                * Since we couldn't reopen zfs_sb_t::z_os, force
+                * Since we couldn't reopen zfs_sb_t, force
                  * unmount this file system.
                  */
                 (void) zfs_umount(zsb->z_sb);
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c

index 8074f1d008c4c7d1d5505845955b2ccbb2210831..9bf26a734c40d328aaaba3109c273096b4aabdda 100644 (file)
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -274,8 +274,10 @@ zfs_inode_destroy(struct inode *ip)
                 zfsctl_inode_destroy(ip);
  
         mutex_enter(&zsb->z_znodes_lock);
-       list_remove(&zsb->z_all_znodes, zp);
-       zsb->z_nr_znodes--;
+       if (list_link_active(&zp->z_link_node)) {
+               list_remove(&zsb->z_all_znodes, zp);
+               zsb->z_nr_znodes--;
+       }
         mutex_exit(&zsb->z_znodes_lock);
  
         if (zp->z_acl_cached) {
@@ -348,7 +350,7 @@ zfs_inode_set_ops(zfs_sb_t *zsb, struct inode *ip)
  static znode_t *
  zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
      dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl,
-    struct dentry *dentry, struct inode *dip)
+    struct inode *dip)
  {
         znode_t *zp;
         struct inode *ip;
@@ -379,6 +381,7 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
         zp->z_is_zvol = B_FALSE;
         zp->z_is_mapped = B_FALSE;
         zp->z_is_ctldir = B_FALSE;
+       zp->z_is_stale = B_FALSE;
  
         zfs_znode_sa_init(zsb, zp, db, obj_type, hdl);
  
@@ -414,11 +417,15 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
         zfs_inode_update(zp);
         zfs_inode_set_ops(zsb, ip);
  
-       if (insert_inode_locked(ip))
-               goto error;
-
-       if (dentry)
-               d_instantiate(dentry, ip);
+       /*
+        * The only way insert_inode_locked() can fail is if the ip->i_ino
+        * number is already hashed for this super block.  This can never
+        * happen because the inode numbers map 1:1 with the object numbers.
+        *
+        * The one exception is rolling back a mounted file system, but in
+        * this case all the active inode are unhashed during the rollback.
+        */
+       VERIFY3S(insert_inode_locked(ip), ==, 0);
  
         mutex_enter(&zsb->z_znodes_lock);
         list_insert_tail(&zsb->z_all_znodes, zp);
@@ -720,9 +727,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
  
         if (!(flag & IS_ROOT_NODE)) {
                 *zpp = zfs_znode_alloc(zsb, db, 0, obj_type, obj, sa_hdl,
-                   vap->va_dentry, ZTOI(dzp));
-               ASSERT(*zpp != NULL);
-               ASSERT(dzp != NULL);
+                   ZTOI(dzp));
+               VERIFY(*zpp != NULL);
+               VERIFY(dzp != NULL);
         } else {
                 /*
                  * If we are creating the root node, the "parent" we
@@ -931,7 +938,7 @@ again:
          * bonus buffer.
          */
         zp = zfs_znode_alloc(zsb, db, doi.doi_data_block_size,
-           doi.doi_bonus_type, obj_num, NULL, NULL, NULL);
+           doi.doi_bonus_type, obj_num, NULL, NULL);
         if (zp == NULL) {
                 err = ENOENT;
         } else {
@@ -961,8 +968,20 @@ zfs_rezget(znode_t *zp)
                 zfs_acl_free(zp->z_acl_cached);
                 zp->z_acl_cached = NULL;
         }
-
         mutex_exit(&zp->z_acl_lock);
+
+       rw_enter(&zp->z_xattr_lock, RW_WRITER);
+       if (zp->z_xattr_cached) {
+               nvlist_free(zp->z_xattr_cached);
+               zp->z_xattr_cached = NULL;
+       }
+
+       if (zp->z_xattr_parent) {
+               iput(ZTOI(zp->z_xattr_parent));
+               zp->z_xattr_parent = NULL;
+       }
+       rw_exit(&zp->z_xattr_lock);
+
         ASSERT(zp->z_sa_hdl == NULL);
         err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db);
         if (err) {
@@ -1016,6 +1035,7 @@ zfs_rezget(znode_t *zp)
  
         zp->z_unlinked = (zp->z_links == 0);
         zp->z_blksz = doi.doi_data_block_size;
+       zfs_inode_update(zp);
  
         ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
  
diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c

index 2e5209f8c094645ea431f3d9e115a35dff4a2495..54bdbe409482218752c2ad55ae6573b617002551 100644 (file)
--- a/module/zfs/zpl_ctldir.c
+++ b/module/zfs/zpl_ctldir.c
@@ -366,7 +366,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode)
  
         crhold(cr);
         vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-       zpl_vap_init(vap, dip, dentry, mode | S_IFDIR, cr);
+       zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
  
         error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
         if (error == 0) {
diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c

index 6175c2e93f8e4c2fe168f86a651a6764b3f8fc57..15ee0f61059f792e749043a67a9bfe5a2849d12f 100644 (file)
--- a/module/zfs/zpl_inode.c
+++ b/module/zfs/zpl_inode.c
@@ -46,6 +46,10 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
         ASSERT3S(error, <=, 0);
         crfree(cr);
  
+       spin_lock(&dentry->d_lock);
+       dentry->d_time = jiffies;
+       spin_unlock(&dentry->d_lock);
+
         if (error) {
                 if (error == -ENOENT)
                         return d_splice_alias(NULL, dentry);
@@ -57,12 +61,10 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
  }
  
  void
-zpl_vap_init(vattr_t *vap, struct inode *dir, struct dentry *dentry,
-    zpl_umode_t mode, cred_t *cr)
+zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr)
  {
         vap->va_mask = ATTR_MODE;
         vap->va_mode = mode;
-       vap->va_dentry = dentry;
         vap->va_uid = crgetfsuid(cr);
  
         if (dir && dir->i_mode & S_ISGID) {
@@ -90,12 +92,14 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
  
         crhold(cr);
         vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-       zpl_vap_init(vap, dir, dentry, mode, cr);
+       zpl_vap_init(vap, dir, mode, cr);
  
         error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
         if (error == 0) {
                 error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
                 VERIFY3S(error, ==, 0);
+               d_instantiate(dentry, ip);
+               d_set_d_op(dentry, &zpl_dentry_operations);
         }
  
         kmem_free(vap, sizeof(vattr_t));
@@ -123,11 +127,15 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
  
         crhold(cr);
         vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-       zpl_vap_init(vap, dir, dentry, mode, cr);
+       zpl_vap_init(vap, dir, mode, cr);
         vap->va_rdev = rdev;
  
-       error = -zfs_create(dir, (char *)dentry->d_name.name,
-           vap, 0, mode, &ip, cr, 0, NULL);
+       error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
+       if (error == 0) {
+               d_instantiate(dentry, ip);
+               d_set_d_op(dentry, &zpl_dentry_operations);
+       }
+
         kmem_free(vap, sizeof(vattr_t));
         crfree(cr);
         ASSERT3S(error, <=, 0);
@@ -159,9 +167,14 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
  
         crhold(cr);
         vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-       zpl_vap_init(vap, dir, dentry, mode | S_IFDIR, cr);
+       zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
  
         error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL);
+       if (error == 0) {
+               d_instantiate(dentry, ip);
+               d_set_d_op(dentry, &zpl_dentry_operations);
+       }
+
         kmem_free(vap, sizeof(vattr_t));
         crfree(cr);
         ASSERT3S(error, <=, 0);
@@ -262,9 +275,14 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
  
         crhold(cr);
         vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-       zpl_vap_init(vap, dir, dentry, S_IFLNK | S_IRWXUGO, cr);
+       zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
  
         error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0);
+       if (error == 0) {
+               d_instantiate(dentry, ip);
+               d_set_d_op(dentry, &zpl_dentry_operations);
+       }
+
         kmem_free(vap, sizeof(vattr_t));
         crfree(cr);
         ASSERT3S(error, <=, 0);
@@ -334,6 +352,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
         }
  
         d_instantiate(dentry, ip);
+       d_set_d_op(dentry, &zpl_dentry_operations);
  out:
         crfree(cr);
         ASSERT3S(error, <=, 0);
@@ -378,6 +397,44 @@ zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len)
  }
  #endif /* HAVE_INODE_FALLOCATE */
  
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+       unsigned int flags = nd->flags;
+#else
+zpl_revalidate(struct dentry *dentry, unsigned int flags)
+{
+#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
+       zfs_sb_t *zsb = dentry->d_sb->s_fs_info;
+       int error;
+
+       if (flags & LOOKUP_RCU)
+               return (-ECHILD);
+
+       /*
+        * After a rollback negative dentries created before the rollback
+        * time must be invalidated.  Otherwise they can obscure files which
+        * are only present in the rolled back dataset.
+        */
+       if (dentry->d_inode == NULL) {
+               spin_lock(&dentry->d_lock);
+               error = time_before(dentry->d_time, zsb->z_rollback_time);
+               spin_unlock(&dentry->d_lock);
+
+               if (error)
+                       return (0);
+       }
+
+       /*
+        * The dentry may reference a stale inode if a mounted file system
+        * was rolled back to a point in time where the object didn't exist.
+        */
+       if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
+               return (0);
+
+       return (1);
+}
  
  const struct inode_operations zpl_inode_operations = {
         .create         = zpl_create,
@@ -440,3 +497,7 @@ const struct inode_operations zpl_special_inode_operations = {
         .removexattr    = generic_removexattr,
         .listxattr      = zpl_xattr_list,
  };
+
+dentry_operations_t zpl_dentry_operations = {
+       .d_revalidate   = zpl_revalidate,
+};
author	Brian Behlendorf <behlendorf1@llnl.gov>
	Wed, 16 Jan 2013 00:41:09 +0000 (16:41 -0800)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Thu, 17 Jan 2013 17:51:20 +0000 (09:51 -0800)
include/linux/vfs_compat.h		patch \| blob \| blame \| history
include/sys/zfs_vfsops.h		patch \| blob \| blame \| history
include/sys/zfs_znode.h		patch \| blob \| blame \| history
include/sys/zpl.h		patch \| blob \| blame \| history
module/zfs/zfs_ctldir.c		patch \| blob \| blame \| history
module/zfs/zfs_vfsops.c		patch \| blob \| blame \| history
module/zfs/zfs_znode.c		patch \| blob \| blame \| history
module/zfs/zpl_ctldir.c		patch \| blob \| blame \| history
module/zfs/zpl_inode.c		patch \| blob \| blame \| history