git.proxmox.com Git - mirror_ubuntu-focal-kernel.git/commitdiff
Merge tag 'iversion-v4.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 29 Jan 2018 21:33:53 +0000 (13:33 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 29 Jan 2018 21:33:53 +0000 (13:33 -0800)
Pull inode->i_version rework from Jeff Layton:
 "This pile of patches is a rework of the inode->i_version field. We
  have traditionally incremented that field on every inode data or
  metadata change. Typically this increment needs to be logged on disk
  even when nothing else has changed, which is rather expensive.

  It turns out though that none of the consumers of that field actually
  require this behavior. The only real requirement for all of them is
  that it be different iff the inode has changed since the last time the
  field was checked.

  Given that, we can optimize away most of the i_version increments and
  avoid dirtying inode metadata when the only change is to the i_version
  and no one is querying it. Queries of the i_version field are rather
  rare, so we can help write performance under many common workloads.

  This patch series converts existing accesses of the i_version field to
  a new API, and then converts all of the in-kernel filesystems to use
  it. The last patch in the series then converts the backend
  implementation to a scheme that optimizes away a large portion of the
  metadata updates when no one is looking at it.

  In my own testing this series significantly helps performance with
  small I/O sizes. I also got this email for Christmas this year from
  the kernel test robot (a 244% r/w bandwidth improvement with XFS over
  DAX, with 4k writes):

    https://lkml.org/lkml/2017/12/25/8

  A few of the earlier patches in this pile are also flowing to you via
  other trees (mm, integrity, and nfsd trees in particular)".

* tag 'iversion-v4.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux: (22 commits)
  fs: handle inode->i_version more efficiently
  btrfs: only dirty the inode in btrfs_update_time if something was changed
  xfs: avoid setting XFS_ILOG_CORE if i_version doesn't need incrementing
  fs: only set S_VERSION when updating times if necessary
  IMA: switch IMA over to new i_version API
  xfs: convert to new i_version API
  ufs: use new i_version API
  ocfs2: convert to new i_version API
  nfsd: convert to new i_version API
  nfs: convert to new i_version API
  ext4: convert to new i_version API
  ext2: convert to new i_version API
  exofs: switch to new i_version API
  btrfs: convert to new i_version API
  afs: convert to new i_version API
  affs: convert to new i_version API
  fat: convert to new i_version API
  fs: don't take the i_lock in inode_inc_iversion
  fs: new API for handling inode->i_version
  ntfs: remove i_version handling
  ...

1  2 
fs/afs/inode.c
fs/btrfs/delayed-inode.c
fs/btrfs/inode.c
fs/ext4/inode.c
fs/ext4/namei.c
fs/nfs/write.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.c

diff --combined fs/afs/inode.c
index 1e81864ef0b29bffcc10944c30bc57796815025d,dcd2e08d6cdb87c9ca0bad6398026ce4c8ee982e..c7f17c44c7ce88243b4df6016ad2865e2df1c594
@@@ -21,6 -21,7 +21,7 @@@
  #include <linux/sched.h>
  #include <linux/mount.h>
  #include <linux/namei.h>
+ #include <linux/iversion.h>
  #include "internal.h"
  
  static const struct inode_operations afs_symlink_inode_operations = {
@@@ -89,7 -90,7 +90,7 @@@ static int afs_inode_map_status(struct 
        inode->i_atime          = inode->i_mtime = inode->i_ctime;
        inode->i_blocks         = 0;
        inode->i_generation     = vnode->fid.unique;
-       inode->i_version        = vnode->status.data_version;
+       inode_set_iversion_raw(inode, vnode->status.data_version);
        inode->i_mapping->a_ops = &afs_fs_aops;
  
        read_sequnlock_excl(&vnode->cb_lock);
@@@ -218,7 -219,7 +219,7 @@@ struct inode *afs_iget_autocell(struct 
        inode->i_ctime.tv_nsec  = 0;
        inode->i_atime          = inode->i_mtime = inode->i_ctime;
        inode->i_blocks         = 0;
-       inode->i_version        = 0;
+       inode_set_iversion_raw(inode, 0);
        inode->i_generation     = 0;
  
        set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
@@@ -377,10 -378,6 +378,10 @@@ int afs_validate(struct afs_vnode *vnod
        }
  
        read_sequnlock_excl(&vnode->cb_lock);
 +
 +      if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 +              clear_nlink(&vnode->vfs_inode);
 +
        if (valid)
                goto valid;
  
diff --combined fs/btrfs/delayed-inode.c
index a6226cd6063c78dd0194c4c368698047da84b0a2,6a246ae2bcb2793b218472d07ebecb3c447dcc45..d4db406e2d3940f0b9c61778202720c7ffc29596
@@@ -18,6 -18,7 +18,7 @@@
   */
  
  #include <linux/slab.h>
+ #include <linux/iversion.h>
  #include "delayed-inode.h"
  #include "disk-io.h"
  #include "transaction.h"
@@@ -87,7 -88,6 +88,7 @@@ static struct btrfs_delayed_node *btrfs
  
        spin_lock(&root->inode_lock);
        node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
 +
        if (node) {
                if (btrfs_inode->delayed_node) {
                        refcount_inc(&node->refs);      /* can be accessed */
                        spin_unlock(&root->inode_lock);
                        return node;
                }
 -              btrfs_inode->delayed_node = node;
 -              /* can be accessed and cached in the inode */
 -              refcount_add(2, &node->refs);
 +
 +              /*
 +               * It's possible that we're racing into the middle of removing
 +               * this node from the radix tree.  In this case, the refcount
 +               * was zero and it should never go back to one.  Just return
 +               * NULL like it was never in the radix at all; our release
 +               * function is in the process of removing it.
 +               *
 +               * Some implementations of refcount_inc refuse to bump the
 +               * refcount once it has hit zero.  If we don't do this dance
 +               * here, refcount_inc() may decide to just WARN_ONCE() instead
 +               * of actually bumping the refcount.
 +               *
 +               * If this node is properly in the radix, we want to bump the
 +               * refcount twice, once for the inode and once for this get
 +               * operation.
 +               */
 +              if (refcount_inc_not_zero(&node->refs)) {
 +                      refcount_inc(&node->refs);
 +                      btrfs_inode->delayed_node = node;
 +              } else {
 +                      node = NULL;
 +              }
 +
                spin_unlock(&root->inode_lock);
                return node;
        }
@@@ -276,18 -255,17 +277,18 @@@ static void __btrfs_release_delayed_nod
        mutex_unlock(&delayed_node->mutex);
  
        if (refcount_dec_and_test(&delayed_node->refs)) {
 -              bool free = false;
                struct btrfs_root *root = delayed_node->root;
 +
                spin_lock(&root->inode_lock);
 -              if (refcount_read(&delayed_node->refs) == 0) {
 -                      radix_tree_delete(&root->delayed_nodes_tree,
 -                                        delayed_node->inode_id);
 -                      free = true;
 -              }
 +              /*
 +               * Once our refcount goes to zero, nobody is allowed to bump it
 +               * back up.  We can delete it now.
 +               */
 +              ASSERT(refcount_read(&delayed_node->refs) == 0);
 +              radix_tree_delete(&root->delayed_nodes_tree,
 +                                delayed_node->inode_id);
                spin_unlock(&root->inode_lock);
 -              if (free)
 -                      kmem_cache_free(delayed_node_cache, delayed_node);
 +              kmem_cache_free(delayed_node_cache, delayed_node);
        }
  }
  
@@@ -1633,18 -1611,28 +1634,18 @@@ void btrfs_readdir_put_delayed_items(st
  int btrfs_should_delete_dir_index(struct list_head *del_list,
                                  u64 index)
  {
 -      struct btrfs_delayed_item *curr, *next;
 -      int ret;
 -
 -      if (list_empty(del_list))
 -              return 0;
 +      struct btrfs_delayed_item *curr;
 +      int ret = 0;
  
 -      list_for_each_entry_safe(curr, next, del_list, readdir_list) {
 +      list_for_each_entry(curr, del_list, readdir_list) {
                if (curr->key.offset > index)
                        break;
 -
 -              list_del(&curr->readdir_list);
 -              ret = (curr->key.offset == index);
 -
 -              if (refcount_dec_and_test(&curr->refs))
 -                      kfree(curr);
 -
 -              if (ret)
 -                      return 1;
 -              else
 -                      continue;
 +              if (curr->key.offset == index) {
 +                      ret = 1;
 +                      break;
 +              }
        }
 -      return 0;
 +      return ret;
  }
  
  /*
@@@ -1713,7 -1701,8 +1714,8 @@@ static void fill_stack_inode_item(struc
        btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
        btrfs_set_stack_inode_generation(inode_item,
                                         BTRFS_I(inode)->generation);
-       btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
+       btrfs_set_stack_inode_sequence(inode_item,
+                                      inode_peek_iversion(inode));
        btrfs_set_stack_inode_transid(inode_item, trans->transid);
        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
        btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@@ -1767,7 -1756,8 +1769,8 @@@ int btrfs_fill_inode(struct inode *inod
        BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
          BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
  
-       inode->i_version = btrfs_stack_inode_sequence(inode_item);
+       inode_set_iversion_queried(inode,
+                                  btrfs_stack_inode_sequence(inode_item));
        inode->i_rdev = 0;
        *rdev = btrfs_stack_inode_rdev(inode_item);
        BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --combined fs/btrfs/inode.c
index cb1e2d201434e33c94a648a08670e664f6cca05d,76245323a7c8abed0db0280fe775fdb7694cb53e..734b37d9d459a41ab4043184a1c25bef7757aa17
@@@ -43,6 -43,7 +43,7 @@@
  #include <linux/posix_acl_xattr.h>
  #include <linux/uio.h>
  #include <linux/magic.h>
+ #include <linux/iversion.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@@ -3777,7 -3778,8 +3778,8 @@@ static int btrfs_read_locked_inode(stru
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
  
-       inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+       inode_set_iversion_queried(inode,
+                                  btrfs_inode_sequence(leaf, inode_item));
        inode->i_generation = BTRFS_I(inode)->generation;
        inode->i_rdev = 0;
        rdev = btrfs_inode_rdev(leaf, inode_item);
@@@ -3945,7 -3947,8 +3947,8 @@@ static void fill_inode_item(struct btrf
                                     &token);
        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
                                         &token);
-       btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+       btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
+                                      &token);
        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
@@@ -6104,19 -6107,20 +6107,20 @@@ static int btrfs_update_time(struct ino
                             int flags)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       bool dirty = flags & ~S_VERSION;
  
        if (btrfs_root_readonly(root))
                return -EROFS;
  
        if (flags & S_VERSION)
-               inode_inc_iversion(inode);
+               dirty |= inode_maybe_inc_iversion(inode, dirty);
        if (flags & S_CTIME)
                inode->i_ctime = *now;
        if (flags & S_MTIME)
                inode->i_mtime = *now;
        if (flags & S_ATIME)
                inode->i_atime = *now;
-       return btrfs_dirty_inode(inode);
+       return dirty ? btrfs_dirty_inode(inode) : 0;
  }
  
  /*
@@@ -8015,7 -8019,6 +8019,7 @@@ static blk_status_t dio_read_error(stru
        int segs;
        int ret;
        blk_status_t status;
 +      struct bio_vec bvec;
  
        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
  
        }
  
        segs = bio_segments(failed_bio);
 +      bio_get_first_bvec(failed_bio, &bvec);
        if (segs > 1 ||
 -          (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
 +          (bvec.bv_len > btrfs_inode_sectorsize(inode)))
                read_mode |= REQ_FAILFAST_DEV;
  
        isector = start - btrfs_io_bio(failed_bio)->logical;
@@@ -8076,7 -8078,7 +8080,7 @@@ static void btrfs_retry_endio_nocsum(st
        ASSERT(bio->bi_vcnt == 1);
        io_tree = &BTRFS_I(inode)->io_tree;
        failure_tree = &BTRFS_I(inode)->io_failure_tree;
 -      ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
 +      ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
  
        done->uptodate = 1;
        ASSERT(!bio_flagged(bio, BIO_CLONED));
@@@ -8166,7 -8168,7 +8170,7 @@@ static void btrfs_retry_endio(struct bi
        uptodate = 1;
  
        ASSERT(bio->bi_vcnt == 1);
 -      ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
 +      ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
  
        io_tree = &BTRFS_I(inode)->io_tree;
        failure_tree = &BTRFS_I(inode)->io_failure_tree;
diff --combined fs/ext4/inode.c
index 534a9130f62578931a24477f317c17b42c71ffc3,1b0d54b372f2345211201d8b515102d830a84240..0eff5b761c6e0687a91daf716b44c683f06a5b47
@@@ -39,6 -39,7 +39,7 @@@
  #include <linux/slab.h>
  #include <linux/bitops.h>
  #include <linux/iomap.h>
+ #include <linux/iversion.h>
  
  #include "ext4_jbd2.h"
  #include "xattr.h"
@@@ -149,15 -150,6 +150,15 @@@ static int ext4_meta_trans_blocks(struc
   */
  int ext4_inode_is_fast_symlink(struct inode *inode)
  {
 +      if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
 +              int ea_blocks = EXT4_I(inode)->i_file_acl ?
 +                              EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
 +
 +              if (ext4_has_inline_data(inode))
 +                      return 0;
 +
 +              return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
 +      }
        return S_ISLNK(inode->i_mode) && inode->i_size &&
               (inode->i_size < EXT4_N_BLOCKS * 4);
  }
@@@ -4882,12 -4874,14 +4883,14 @@@ struct inode *ext4_iget(struct super_bl
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
  
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
-               inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+               u64 ivers = le32_to_cpu(raw_inode->i_disk_version);
                if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-                               inode->i_version |=
+                               ivers |=
                    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
                }
+               inode_set_iversion_queried(inode, ivers);
        }
  
        ret = 0;
@@@ -5173,11 -5167,13 +5176,13 @@@ static int ext4_do_update_inode(handle_
        }
  
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
-               raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+               u64 ivers = inode_peek_iversion(inode);
+               raw_inode->i_disk_version = cpu_to_le32(ivers);
                if (ei->i_extra_isize) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                raw_inode->i_version_hi =
-                                       cpu_to_le32(inode->i_version >> 32);
+                                       cpu_to_le32(ivers >> 32);
                        raw_inode->i_extra_isize =
                                cpu_to_le16(ei->i_extra_isize);
                }
diff --combined fs/ext4/namei.c
index e750d68fbcb50c0447e13556905da8401f5f6b03,55f6e38de5baa3c9bacf7b5000118266ac071349..6660686e505a394818aa38cd4bb9fece1129fef4
@@@ -34,6 -34,7 +34,7 @@@
  #include <linux/quotaops.h>
  #include <linux/buffer_head.h>
  #include <linux/bio.h>
+ #include <linux/iversion.h>
  #include "ext4.h"
  #include "ext4_jbd2.h"
  
@@@ -1399,10 -1400,6 +1400,10 @@@ static struct buffer_head * ext4_find_e
                               "falling back\n"));
        }
        nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
 +      if (!nblocks) {
 +              ret = NULL;
 +              goto cleanup_and_exit;
 +      }
        start = EXT4_I(dir)->i_dir_start_lookup;
        if (start >= nblocks)
                start = 0;
@@@ -2959,7 -2956,7 +2960,7 @@@ static int ext4_rmdir(struct inode *dir
                             "empty directory '%.*s' has too many links (%u)",
                             dentry->d_name.len, dentry->d_name.name,
                             inode->i_nlink);
-       inode->i_version++;
+       inode_inc_iversion(inode);
        clear_nlink(inode);
        /* There's no need to set i_disksize: the fact that i_nlink is
         * zero will ensure that the right thing happens during any
@@@ -3365,7 -3362,7 +3366,7 @@@ static int ext4_setent(handle_t *handle
        ent->de->inode = cpu_to_le32(ino);
        if (ext4_has_feature_filetype(ent->dir->i_sb))
                ent->de->file_type = file_type;
-       ent->dir->i_version++;
+       inode_inc_iversion(ent->dir);
        ent->dir->i_ctime = ent->dir->i_mtime =
                current_time(ent->dir);
        ext4_mark_inode_dirty(handle, ent->dir);
diff --combined fs/nfs/write.c
index 4a379d7918f23e1130468c2f58bfea3623035116,f87cbe126fa054ae1aa3d05f192db44ebb45588c..12b2d477836b9725a34426520f65e3000a565a3a
@@@ -23,6 -23,7 +23,7 @@@
  #include <linux/export.h>
  #include <linux/freezer.h>
  #include <linux/wait.h>
+ #include <linux/iversion.h>
  
  #include <linux/uaccess.h>
  
@@@ -753,11 -754,8 +754,8 @@@ static void nfs_inode_add_request(struc
         */
        spin_lock(&mapping->private_lock);
        if (!nfs_have_writebacks(inode) &&
-           NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) {
-               spin_lock(&inode->i_lock);
-               inode->i_version++;
-               spin_unlock(&inode->i_lock);
-       }
+           NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+               inode_inc_iversion_raw(inode);
        if (likely(!PageSwapCache(req->wb_page))) {
                set_bit(PG_MAPPED, &req->wb_flags);
                SetPagePrivate(req->wb_page);
@@@ -1890,8 -1888,6 +1888,8 @@@ int nfs_commit_inode(struct inode *inod
        if (res)
                error = nfs_generic_commit_list(inode, &head, how, &cinfo);
        nfs_commit_end(cinfo.mds);
 +      if (res == 0)
 +              return res;
        if (error < 0)
                goto out_error;
        if (!may_wait)
diff --combined fs/xfs/xfs_icache.c
index 3861d61fb265f66a9d39723286d5adc9772cc15e,4c315adb05e63beab1d36b80122a07aad6fdb2fb..3bcb8fd2a826f317fc018721c942940629259b06
@@@ -37,6 -37,7 +37,7 @@@
  
  #include <linux/kthread.h>
  #include <linux/freezer.h>
+ #include <linux/iversion.h>
  
  /*
   * Allocate and initialise an xfs_inode.
@@@ -293,14 -294,14 +294,14 @@@ xfs_reinit_inode
        int             error;
        uint32_t        nlink = inode->i_nlink;
        uint32_t        generation = inode->i_generation;
-       uint64_t        version = inode->i_version;
+       uint64_t        version = inode_peek_iversion(inode);
        umode_t         mode = inode->i_mode;
  
        error = inode_init_always(mp->m_super, inode);
  
        set_nlink(inode, nlink);
        inode->i_generation = generation;
-       inode->i_version = version;
+       inode_set_iversion_queried(inode, version);
        inode->i_mode = mode;
        return error;
  }
@@@ -870,7 -871,7 +871,7 @@@ xfs_eofblocks_worker
   * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
   * (We'll just piggyback on the post-EOF prealloc space workqueue.)
   */
 -STATIC void
 +void
  xfs_queue_cowblocks(
        struct xfs_mount *mp)
  {
@@@ -1536,23 -1537,8 +1537,23 @@@ xfs_inode_free_quota_eofblocks
        return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
  }
  
 +static inline unsigned long
 +xfs_iflag_for_tag(
 +      int             tag)
 +{
 +      switch (tag) {
 +      case XFS_ICI_EOFBLOCKS_TAG:
 +              return XFS_IEOFBLOCKS;
 +      case XFS_ICI_COWBLOCKS_TAG:
 +              return XFS_ICOWBLOCKS;
 +      default:
 +              ASSERT(0);
 +              return 0;
 +      }
 +}
 +
  static void
 -__xfs_inode_set_eofblocks_tag(
 +__xfs_inode_set_blocks_tag(
        xfs_inode_t     *ip,
        void            (*execute)(struct xfs_mount *mp),
        void            (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
         * Don't bother locking the AG and looking up in the radix trees
         * if we already know that we have the tag set.
         */
 -      if (ip->i_flags & XFS_IEOFBLOCKS)
 +      if (ip->i_flags & xfs_iflag_for_tag(tag))
                return;
        spin_lock(&ip->i_flags_lock);
 -      ip->i_flags |= XFS_IEOFBLOCKS;
 +      ip->i_flags |= xfs_iflag_for_tag(tag);
        spin_unlock(&ip->i_flags_lock);
  
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
@@@ -1602,13 -1588,13 +1603,13 @@@ xfs_inode_set_eofblocks_tag
        xfs_inode_t     *ip)
  {
        trace_xfs_inode_set_eofblocks_tag(ip);
 -      return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
 +      return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
                        trace_xfs_perag_set_eofblocks,
                        XFS_ICI_EOFBLOCKS_TAG);
  }
  
  static void
 -__xfs_inode_clear_eofblocks_tag(
 +__xfs_inode_clear_blocks_tag(
        xfs_inode_t     *ip,
        void            (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
                                    int error, unsigned long caller_ip),
        struct xfs_perag *pag;
  
        spin_lock(&ip->i_flags_lock);
 -      ip->i_flags &= ~XFS_IEOFBLOCKS;
 +      ip->i_flags &= ~xfs_iflag_for_tag(tag);
        spin_unlock(&ip->i_flags_lock);
  
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
@@@ -1645,7 -1631,7 +1646,7 @@@ xfs_inode_clear_eofblocks_tag
        xfs_inode_t     *ip)
  {
        trace_xfs_inode_clear_eofblocks_tag(ip);
 -      return __xfs_inode_clear_eofblocks_tag(ip,
 +      return __xfs_inode_clear_blocks_tag(ip,
                        trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
  }
  
@@@ -1739,7 -1725,7 +1740,7 @@@ xfs_inode_set_cowblocks_tag
        xfs_inode_t     *ip)
  {
        trace_xfs_inode_set_cowblocks_tag(ip);
 -      return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
 +      return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
                        trace_xfs_perag_set_cowblocks,
                        XFS_ICI_COWBLOCKS_TAG);
  }
@@@ -1749,6 -1735,6 +1750,6 @@@ xfs_inode_clear_cowblocks_tag
        xfs_inode_t     *ip)
  {
        trace_xfs_inode_clear_cowblocks_tag(ip);
 -      return __xfs_inode_clear_eofblocks_tag(ip,
 +      return __xfs_inode_clear_blocks_tag(ip,
                        trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
  }
diff --combined fs/xfs/xfs_inode.c
index 6f95bdb408ced01b9471b931714d279003a22d92,dfc5e60d8af3344e6e7a0464fbb07fa4c3bd2dbd..9f424e0aef1f9c86423a70ce74a6bf22fd821b23
@@@ -16,6 -16,7 +16,7 @@@
   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  #include <linux/log2.h>
+ #include <linux/iversion.h>
  
  #include "xfs.h"
  #include "xfs_fs.h"
@@@ -749,6 -750,7 +750,6 @@@ xfs_ialloc
        xfs_nlink_t     nlink,
        dev_t           rdev,
        prid_t          prid,
 -      int             okalloc,
        xfs_buf_t       **ialloc_context,
        xfs_inode_t     **ipp)
  {
         * Call the space management code to pick
         * the on-disk inode to be allocated.
         */
 -      error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
 +      error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode,
                            ialloc_context, &ino);
        if (error)
                return error;
        ip->i_d.di_flags = 0;
  
        if (ip->i_d.di_version == 3) {
-               inode->i_version = 1;
+               inode_set_iversion(inode, 1);
                ip->i_d.di_flags2 = 0;
                ip->i_d.di_cowextsize = 0;
                ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
@@@ -956,6 -958,7 +957,6 @@@ xfs_dir_ialloc
        xfs_nlink_t     nlink,
        dev_t           rdev,
        prid_t          prid,           /* project id */
 -      int             okalloc,        /* ok to allocate new space */
        xfs_inode_t     **ipp,          /* pointer to inode; it will be
                                           locked. */
        int             *committed)
         * transaction commit so that no other process can steal
         * the inode(s) that we've just allocated.
         */
 -      code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 -                        &ialloc_context, &ip);
 +      code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
 +                      &ip);
  
        /*
         * Return an error if we were unable to allocate a new inode.
                 * this call should always succeed.
                 */
                code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 -                                okalloc, &ialloc_context, &ip);
 +                                &ialloc_context, &ip);
  
                /*
                 * If we get an error at this point, return to the caller
@@@ -1180,6 -1183,11 +1181,6 @@@ xfs_create
                xfs_flush_inodes(mp);
                error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        }
 -      if (error == -ENOSPC) {
 -              /* No space at all so try a "no-allocation" reservation */
 -              resblks = 0;
 -              error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
 -      }
        if (error)
                goto out_release_inode;
  
        if (error)
                goto out_trans_cancel;
  
 -      if (!resblks) {
 -              error = xfs_dir_canenter(tp, dp, name);
 -              if (error)
 -                      goto out_trans_cancel;
 -      }
 -
        /*
         * A newly created regular or special file just has one directory
         * entry pointing to them, but a directory also the "." entry
         * pointing to itself.
         */
 -      error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 -                             prid, resblks > 0, &ip, NULL);
 +      error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip,
 +                      NULL);
        if (error)
                goto out_trans_cancel;
  
@@@ -1327,6 -1341,11 +1328,6 @@@ xfs_create_tmpfile
        tres = &M_RES(mp)->tr_create_tmpfile;
  
        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
 -      if (error == -ENOSPC) {
 -              /* No space at all so try a "no-allocation" reservation */
 -              resblks = 0;
 -              error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
 -      }
        if (error)
                goto out_release_inode;
  
        if (error)
                goto out_trans_cancel;
  
 -      error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
 -                              prid, resblks > 0, &ip, NULL);
 +      error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL);
        if (error)
                goto out_trans_cancel;
  
@@@ -1487,24 -1507,6 +1488,24 @@@ xfs_link
        return error;
  }
  
 +/* Clear the reflink flag and the cowblocks tag if possible. */
 +static void
 +xfs_itruncate_clear_reflink_flags(
 +      struct xfs_inode        *ip)
 +{
 +      struct xfs_ifork        *dfork;
 +      struct xfs_ifork        *cfork;
 +
 +      if (!xfs_is_reflink_inode(ip))
 +              return;
 +      dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 +      cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 +      if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
 +              ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
 +      if (cfork->if_bytes == 0)
 +              xfs_inode_clear_cowblocks_tag(ip);
 +}
 +
  /*
   * Free up the underlying blocks past new_size.  The new size must be smaller
   * than the current size.  This routine can be used both for the attribute and
@@@ -1601,7 -1603,15 +1602,7 @@@ xfs_itruncate_extents
        if (error)
                goto out;
  
 -      /*
 -       * Clear the reflink flag if there are no data fork blocks and
 -       * there are no extents staged in the cow fork.
 -       */
 -      if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
 -              if (ip->i_d.di_nblocks == 0)
 -                      ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
 -              xfs_inode_clear_cowblocks_tag(ip);
 -      }
 +      xfs_itruncate_clear_reflink_flags(ip);
  
        /*
         * Always re-log the inode so that our permanent transaction can keep