git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge tag 'xfs-rmap-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 6 Aug 2016 13:50:36 +0000 (09:50 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 6 Aug 2016 13:50:36 +0000 (09:50 -0400)
Pull more xfs updates from Dave Chinner:
 "This is the second part of the XFS updates for this merge cycle, and
  contains the new reverse block mapping feature for XFS.

  Reverse mapping allows us to track the owner of a specific block on
  disk precisely.  It is implemented as a set of btrees (one per
  allocation group) that track the owners of allocated extents.
  Effectively it is a "used space tree" that is updated when we allocate
  or free extents.  i.e. it is coherent with the free space btrees we
  already maintain and never overlaps with them.

  This reverse mapping infrastructure is the building block of several
  upcoming features - reflink, copy-on-write data, dedupe, online
  metadata and data scrubbing, highly accurate bad sector/data loss
  reporting to users, and significantly improved reconstruction of
  damaged and corrupted filesystems.  There's a lot of new stuff coming
  along in the next couple of cycles, and it all builds on the rmap
  infrastructure.

  As such, it's a huge chunk of new code with new on-disk format
  features and internal infrastructure.  It warns at mount time as an
  experimental feature and that it may eat data (as we do with all new
  on-disk features until they stabilise).  We have not released
  userspace support for it yet - userspace support currently requires
  download from Darrick's xfsprogs repo and build from source, so the
  access to this feature is really developer/tester only at this point.
  Initial userspace support will be released at the same time as the
  kernel with this code in it is released.

  The new rmap enabled code regresses 3 xfstests - all are ENOSPC
  related corner cases, one of which Darrick posted a fix for a few
  hours ago.  The other two are fixed by infrastructure that is part of
  the upcoming reflink patchset.  This new ENOSPC infrastructure
  requires an on-disk format tweak to keep mount times in
  check - we need to keep an on-disk count of allocated rmapbt blocks so
  we don't have to scan the entire btrees at mount time to count them.

  This is currently being tested and will be part of the fixes sent in
  the next week or two so users will not be exposed to this change"

* tag 'xfs-rmap-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (52 commits)
  xfs: move (and rename) the deferred bmap-free tracepoints
  xfs: collapse single use static functions
  xfs: remove unnecessary parentheses from log redo item recovery functions
  xfs: remove the extents array from the rmap update done log item
  xfs: in btree_lshift, only allocate temporary cursor when needed
  xfs: remove unnecesary lshift/rshift key initialization
  xfs: remove the get*keys and update_keys btree ops pointers
  xfs: enable the rmap btree functionality
  xfs: don't update rmapbt when fixing agfl
  xfs: disable XFS_IOC_SWAPEXT when rmap btree is enabled
  xfs: add rmap btree block detection to log recovery
  xfs: add rmap btree geometry feature flag
  xfs: propagate bmap updates to rmapbt
  xfs: enable the xfs_defer mechanism to process rmaps to update
  xfs: log rmap intent items
  xfs: create rmap update intent log items
  xfs: add rmap btree insert and delete helpers
  xfs: convert unwritten status of reverse mappings
  xfs: remove an extent from the rmap btree
  xfs: add an extent to the rmap btree
  ...

64 files changed:
fs/xfs/Makefile
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc.h
fs/xfs/libxfs/xfs_alloc_btree.c
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_attr_leaf.c
fs/xfs/libxfs/xfs_attr_remote.c
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_bmap_btree.c
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_btree.h
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_da_btree.h
fs/xfs/libxfs/xfs_da_format.h
fs/xfs/libxfs/xfs_defer.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_defer.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_dir2.c
fs/xfs/libxfs/xfs_dir2.h
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_fs.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_ialloc.h
fs/xfs/libxfs/xfs_ialloc_btree.c
fs/xfs/libxfs/xfs_inode_buf.c
fs/xfs/libxfs/xfs_log_format.h
fs/xfs/libxfs/xfs_rmap.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_rmap.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_rmap_btree.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_rmap_btree.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_shared.h
fs/xfs/libxfs/xfs_trans_resv.c
fs/xfs/libxfs/xfs_trans_resv.h
fs/xfs/libxfs/xfs_types.h
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_bmap_util.h
fs/xfs/xfs_discard.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_error.h
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_extfree_item.h
fs/xfs/xfs_filestream.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_ondisk.h
fs/xfs/xfs_rmap_item.c [new file with mode: 0644]
fs/xfs/xfs_rmap_item.h [new file with mode: 0644]
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_stats.c
fs/xfs/xfs_stats.h
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_trace.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_extfree.c
fs/xfs/xfs_trans_rmap.c [new file with mode: 0644]

index 52c288514be1ff729a38f7b6d9a8e7232ea60b67..fc593c8694936e91af919e5dc59c2e69750c22fb 100644 (file)
@@ -39,6 +39,7 @@ xfs-y                         += $(addprefix libxfs/, \
                                   xfs_btree.o \
                                   xfs_da_btree.o \
                                   xfs_da_format.o \
+                                  xfs_defer.o \
                                   xfs_dir2.o \
                                   xfs_dir2_block.o \
                                   xfs_dir2_data.o \
@@ -51,6 +52,8 @@ xfs-y                         += $(addprefix libxfs/, \
                                   xfs_inode_fork.o \
                                   xfs_inode_buf.o \
                                   xfs_log_rlimit.o \
+                                  xfs_rmap.o \
+                                  xfs_rmap_btree.o \
                                   xfs_sb.o \
                                   xfs_symlink_remote.o \
                                   xfs_trans_resv.o \
@@ -100,11 +103,13 @@ xfs-y                             += xfs_log.o \
                                   xfs_extfree_item.o \
                                   xfs_icreate_item.o \
                                   xfs_inode_item.o \
+                                  xfs_rmap_item.o \
                                   xfs_log_recover.o \
                                   xfs_trans_ail.o \
                                   xfs_trans_buf.o \
                                   xfs_trans_extfree.o \
                                   xfs_trans_inode.o \
+                                  xfs_trans_rmap.o \
 
 # optional features
 xfs-$(CONFIG_XFS_QUOTA)                += xfs_dquot.o \
index 88c26b827a2dd0819464a82222d822bb3f8e9202..776ae2f325d1e4f534540206a30a138b01c1ea46 100644 (file)
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_rmap.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
@@ -49,6 +51,81 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
                xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
 
+xfs_extlen_t
+xfs_prealloc_blocks(
+       struct xfs_mount        *mp)
+{
+       if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+               return XFS_RMAP_BLOCK(mp) + 1;
+       if (xfs_sb_version_hasfinobt(&mp->m_sb))
+               return XFS_FIBT_BLOCK(mp) + 1;
+       return XFS_IBT_BLOCK(mp) + 1;
+}
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
+ * AGF buffer (PV 947395), we place constraints on the relationship among
+ * actual allocations for data blocks, freelist blocks, and potential file data
+ * bmap btree blocks. However, these restrictions may result in no actual space
+ * allocated for a delayed extent, for example, a data block in a certain AG is
+ * allocated but there is no additional block for the additional bmap btree
+ * block due to a split of the bmap btree of the file. The result of this may
+ * lead to an infinite loop when the file gets flushed to disk and all delayed
+ * extents need to be actually allocated. To get around this, we explicitly set
+ * aside a few blocks which will not be reserved in delayed allocation.
+ *
+ * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
+ * and 4 more to handle a potential split of the file's bmap btree.
+ *
+ * When rmap is enabled, we must also be able to handle two rmap btree inserts
+ * to record both the file data extent and a new bmbt block.  The bmbt block
+ * might not be in the same AG as the file data extent.  In the worst case
+ * the bmap btree splits multiple levels and all the new blocks come from
+ * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ */
+unsigned int
+xfs_alloc_set_aside(
+       struct xfs_mount        *mp)
+{
+       unsigned int            blocks;
+
+       blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
+       if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+               blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
+       return blocks;
+}
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size of the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *     - the AG superblock, AGF, AGI and AGFL
+ *     - the AGF (bno and cnt) and AGI btree root blocks, and optionally
+ *       the AGI free inode and rmap btree root blocks.
+ *     - blocks on the AGFL according to xfs_alloc_set_aside() limits
+ *     - the rmapbt root block
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+unsigned int
+xfs_alloc_ag_max_usable(
+       struct xfs_mount        *mp)
+{
+       unsigned int            blocks;
+
+       blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
+       blocks += XFS_ALLOC_AGFL_RESERVE;
+       blocks += 3;                    /* AGF, AGI btree root blocks */
+       if (xfs_sb_version_hasfinobt(&mp->m_sb))
+               blocks++;               /* finobt root block */
+       if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+               blocks++;               /* rmap root block */
+
+       return mp->m_sb.sb_agblocks - blocks;
+}
+
 /*
  * Lookup the record equal to [bno, len] in the btree given by cur.
  */
@@ -636,6 +713,14 @@ xfs_alloc_ag_vextent(
        ASSERT(!args->wasfromfl || !args->isfl);
        ASSERT(args->agbno % args->alignment == 0);
 
+       /* if not file data, insert new block into the reverse map btree */
+       if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+               error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
+                                      args->agbno, args->len, &args->oinfo);
+               if (error)
+                       return error;
+       }
+
        if (!args->wasfromfl) {
                error = xfs_alloc_update_counters(args->tp, args->pag,
                                                  args->agbp,
@@ -1577,14 +1662,15 @@ error0:
 /*
  * Free the extent starting at agno/bno for length.
  */
-STATIC int                     /* error */
+STATIC int
 xfs_free_ag_extent(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       xfs_buf_t       *agbp,  /* buffer for a.g. freelist header */
-       xfs_agnumber_t  agno,   /* allocation group number */
-       xfs_agblock_t   bno,    /* starting block number */
-       xfs_extlen_t    len,    /* length of extent */
-       int             isfl)   /* set if is freelist blocks - no sb acctg */
+       xfs_trans_t             *tp,
+       xfs_buf_t               *agbp,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       struct xfs_owner_info   *oinfo,
+       int                     isfl)
 {
        xfs_btree_cur_t *bno_cur;       /* cursor for by-block btree */
        xfs_btree_cur_t *cnt_cur;       /* cursor for by-size btree */
@@ -1601,12 +1687,19 @@ xfs_free_ag_extent(
        xfs_extlen_t    nlen;           /* new length of freespace */
        xfs_perag_t     *pag;           /* per allocation group data */
 
+       bno_cur = cnt_cur = NULL;
        mp = tp->t_mountp;
+
+       if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) {
+               error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
+               if (error)
+                       goto error0;
+       }
+
        /*
         * Allocate and initialize a cursor for the by-block btree.
         */
        bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
-       cnt_cur = NULL;
        /*
         * Look for a neighboring block on the left (lower block numbers)
         * that is contiguous with this space.
@@ -1875,6 +1968,11 @@ xfs_alloc_min_freelist(
        /* space needed by-size freespace btree */
        min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
                                       mp->m_ag_maxlevels);
+       /* space needed reverse mapping used space btree */
+       if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+               min_free += min_t(unsigned int,
+                                 pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
+                                 mp->m_rmap_maxlevels);
 
        return min_free;
 }
@@ -1992,21 +2090,34 @@ xfs_alloc_fix_freelist(
         * anything other than extra overhead when we need to put more blocks
         * back on the free list? Maybe we should only do this when space is
         * getting low or the AGFL is more than half full?
+        *
+        * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
+        * big; the NORMAP flag prevents AGFL expand/shrink operations from
+        * updating the rmapbt.  Both flags are used in xfs_repair while we're
+        * rebuilding the rmapbt, and neither are used by the kernel.  They're
+        * both required to ensure that rmaps are correctly recorded for the
+        * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
+        * repair/rmap.c in xfsprogs for details.
         */
-       while (pag->pagf_flcount > need) {
+       memset(&targs, 0, sizeof(targs));
+       if (flags & XFS_ALLOC_FLAG_NORMAP)
+               xfs_rmap_skip_owner_update(&targs.oinfo);
+       else
+               xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
+       while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
                struct xfs_buf  *bp;
 
                error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
                if (error)
                        goto out_agbp_relse;
-               error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+               error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
+                                          &targs.oinfo, 1);
                if (error)
                        goto out_agbp_relse;
                bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
                xfs_trans_binval(tp, bp);
        }
 
-       memset(&targs, 0, sizeof(targs));
        targs.tp = tp;
        targs.mp = mp;
        targs.agbp = agbp;
@@ -2271,6 +2382,10 @@ xfs_agf_verify(
            be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
                return false;
 
+       if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+           be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
+               return false;
+
        /*
         * during growfs operations, the perag is not fully initialised,
         * so we can't use it for any useful checking. growfs ensures we can't
@@ -2402,6 +2517,8 @@ xfs_alloc_read_agf(
                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
                pag->pagf_levels[XFS_BTNUM_CNTi] =
                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+               pag->pagf_levels[XFS_BTNUM_RMAPi] =
+                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
                spin_lock_init(&pag->pagb_lock);
                pag->pagb_count = 0;
                pag->pagb_tree = RB_ROOT;
@@ -2691,7 +2808,8 @@ int                               /* error */
 xfs_free_extent(
        struct xfs_trans        *tp,    /* transaction pointer */
        xfs_fsblock_t           bno,    /* starting block number of extent */
-       xfs_extlen_t            len)    /* length of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       struct xfs_owner_info   *oinfo) /* extent owner */
 {
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_buf          *agbp;
@@ -2701,6 +2819,11 @@ xfs_free_extent(
 
        ASSERT(len != 0);
 
+       if (XFS_TEST_ERROR(false, mp,
+                       XFS_ERRTAG_FREE_EXTENT,
+                       XFS_RANDOM_FREE_EXTENT))
+               return -EIO;
+
        error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
        if (error)
                return error;
@@ -2712,7 +2835,7 @@ xfs_free_extent(
                agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
                                err);
 
-       error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
+       error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
        if (error)
                goto err;
 
index cf268b2d0b6c2035aaad4b781d43e2cebeedcd54..6fe2d6b7cfe93e6ed87f999438877e28ebad3ef6 100644 (file)
@@ -54,41 +54,8 @@ typedef unsigned int xfs_alloctype_t;
  */
 #define        XFS_ALLOC_FLAG_TRYLOCK  0x00000001  /* use trylock for buffer locking */
 #define        XFS_ALLOC_FLAG_FREEING  0x00000002  /* indicate caller is freeing extents*/
-
-/*
- * In order to avoid ENOSPC-related deadlock caused by
- * out-of-order locking of AGF buffer (PV 947395), we place
- * constraints on the relationship among actual allocations for
- * data blocks, freelist blocks, and potential file data bmap
- * btree blocks. However, these restrictions may result in no
- * actual space allocated for a delayed extent, for example, a data
- * block in a certain AG is allocated but there is no additional
- * block for the additional bmap btree block due to a split of the
- * bmap btree of the file. The result of this may lead to an
- * infinite loop in xfssyncd when the file gets flushed to disk and
- * all delayed extents need to be actually allocated. To get around
- * this, we explicitly set aside a few blocks which will not be
- * reserved in delayed allocation. Considering the minimum number of
- * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
- * btree requires 1 fsb, so we set the number of set-aside blocks
- * to 4 + 4*agcount.
- */
-#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
-
-/*
- * When deciding how much space to allocate out of an AG, we limit the
- * allocation maximum size to the size the AG. However, we cannot use all the
- * blocks in the AG - some are permanently used by metadata. These
- * blocks are generally:
- *     - the AG superblock, AGF, AGI and AGFL
- *     - the AGF (bno and cnt) and AGI btree root blocks
- *     - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
- *
- * The AG headers are sector sized, so the amount of space they take up is
- * dependent on filesystem geometry. The others are all single blocks.
- */
-#define XFS_ALLOC_AG_MAX_USABLE(mp)    \
-       ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+#define        XFS_ALLOC_FLAG_NORMAP   0x00000004  /* don't modify the rmapbt */
+#define        XFS_ALLOC_FLAG_NOSHRINK 0x00000008  /* don't shrink the freelist */
 
 
 /*
@@ -123,6 +90,7 @@ typedef struct xfs_alloc_arg {
        char            isfl;           /* set if is freelist blocks - !acctg */
        char            userdata;       /* mask defining userdata treatment */
        xfs_fsblock_t   firstblock;     /* io first block allocated */
+       struct xfs_owner_info   oinfo;  /* owner of blocks being allocated */
 } xfs_alloc_arg_t;
 
 /*
@@ -132,6 +100,11 @@ typedef struct xfs_alloc_arg {
 #define XFS_ALLOC_INITIAL_USER_DATA    (1 << 1)/* special case start of file */
 #define XFS_ALLOC_USERDATA_ZERO                (1 << 2)/* zero extent on allocation */
 
+/* freespace limit calculations */
+#define XFS_ALLOC_AGFL_RESERVE 4
+unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
+unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
+
 xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
                struct xfs_perag *pag, xfs_extlen_t need);
 unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
@@ -208,9 +181,10 @@ xfs_alloc_vextent(
  */
 int                            /* error */
 xfs_free_extent(
-       struct xfs_trans *tp,   /* transaction pointer */
-       xfs_fsblock_t   bno,    /* starting block number of extent */
-       xfs_extlen_t    len);   /* length of extent */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_fsblock_t           bno,    /* starting block number of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       struct xfs_owner_info   *oinfo);/* extent owner */
 
 int                            /* error */
 xfs_alloc_lookup_ge(
@@ -232,4 +206,6 @@ int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
 int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
                struct xfs_buf **agbp);
 
+xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
+
 #endif /* __XFS_ALLOC_H__ */
index d9b42425291e37c6a4845c21dd0e1f61d8a76e86..5ba2dac5e67c492a1a9fe5047995899290e25220 100644 (file)
@@ -211,17 +211,6 @@ xfs_allocbt_init_key_from_rec(
        key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
 }
 
-STATIC void
-xfs_allocbt_init_rec_from_key(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       ASSERT(key->alloc.ar_startblock != 0);
-
-       rec->alloc.ar_startblock = key->alloc.ar_startblock;
-       rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
-}
-
 STATIC void
 xfs_allocbt_init_rec_from_cur(
        struct xfs_btree_cur    *cur,
@@ -406,7 +395,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
        .get_minrecs            = xfs_allocbt_get_minrecs,
        .get_maxrecs            = xfs_allocbt_get_maxrecs,
        .init_key_from_rec      = xfs_allocbt_init_key_from_rec,
-       .init_rec_from_key      = xfs_allocbt_init_rec_from_key,
        .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
        .key_diff               = xfs_allocbt_key_diff,
index 4e126f41a0aa97d1f73773ea8efb89ef7a52746c..af1ecb19121e9e8569c0ee907652405575d882c8 100644 (file)
@@ -23,6 +23,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_attr_sf.h"
@@ -203,7 +204,7 @@ xfs_attr_set(
 {
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_da_args      args;
-       struct xfs_bmap_free    flist;
+       struct xfs_defer_ops    dfops;
        struct xfs_trans_res    tres;
        xfs_fsblock_t           firstblock;
        int                     rsvd = (flags & ATTR_ROOT) != 0;
@@ -221,7 +222,7 @@ xfs_attr_set(
        args.value = value;
        args.valuelen = valuelen;
        args.firstblock = &firstblock;
-       args.flist = &flist;
+       args.dfops = &dfops;
        args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
        args.total = xfs_attr_calc_size(&args, &local);
 
@@ -316,13 +317,13 @@ xfs_attr_set(
                 * It won't fit in the shortform, transform to a leaf block.
                 * GROT: another possible req'mt for a double-split btree op.
                 */
-               xfs_bmap_init(args.flist, args.firstblock);
+               xfs_defer_init(args.dfops, args.firstblock);
                error = xfs_attr_shortform_to_leaf(&args);
                if (!error)
-                       error = xfs_bmap_finish(&args.trans, args.flist, dp);
+                       error = xfs_defer_finish(&args.trans, args.dfops, dp);
                if (error) {
                        args.trans = NULL;
-                       xfs_bmap_cancel(&flist);
+                       xfs_defer_cancel(&dfops);
                        goto out;
                }
 
@@ -382,7 +383,7 @@ xfs_attr_remove(
 {
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_da_args      args;
-       struct xfs_bmap_free    flist;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           firstblock;
        int                     error;
 
@@ -399,7 +400,7 @@ xfs_attr_remove(
                return error;
 
        args.firstblock = &firstblock;
-       args.flist = &flist;
+       args.dfops = &dfops;
 
        /*
         * we have no control over the attribute names that userspace passes us
@@ -584,13 +585,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                 * Commit that transaction so that the node_addname() call
                 * can manage its own transactions.
                 */
-               xfs_bmap_init(args->flist, args->firstblock);
+               xfs_defer_init(args->dfops, args->firstblock);
                error = xfs_attr3_leaf_to_node(args);
                if (!error)
-                       error = xfs_bmap_finish(&args->trans, args->flist, dp);
+                       error = xfs_defer_finish(&args->trans, args->dfops, dp);
                if (error) {
                        args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
+                       xfs_defer_cancel(args->dfops);
                        return error;
                }
 
@@ -674,15 +675,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                 * If the result is small enough, shrink it all into the inode.
                 */
                if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                       xfs_bmap_init(args->flist, args->firstblock);
+                       xfs_defer_init(args->dfops, args->firstblock);
                        error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
                        /* bp is gone due to xfs_da_shrink_inode */
                        if (!error)
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist, dp);
+                               error = xfs_defer_finish(&args->trans,
+                                                       args->dfops, dp);
                        if (error) {
                                args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
+                               xfs_defer_cancel(args->dfops);
                                return error;
                        }
                }
@@ -737,14 +738,14 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
         * If the result is small enough, shrink it all into the inode.
         */
        if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-               xfs_bmap_init(args->flist, args->firstblock);
+               xfs_defer_init(args->dfops, args->firstblock);
                error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
                /* bp is gone due to xfs_da_shrink_inode */
                if (!error)
-                       error = xfs_bmap_finish(&args->trans, args->flist, dp);
+                       error = xfs_defer_finish(&args->trans, args->dfops, dp);
                if (error) {
                        args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
+                       xfs_defer_cancel(args->dfops);
                        return error;
                }
        }
@@ -863,14 +864,14 @@ restart:
                         */
                        xfs_da_state_free(state);
                        state = NULL;
-                       xfs_bmap_init(args->flist, args->firstblock);
+                       xfs_defer_init(args->dfops, args->firstblock);
                        error = xfs_attr3_leaf_to_node(args);
                        if (!error)
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist, dp);
+                               error = xfs_defer_finish(&args->trans,
+                                                       args->dfops, dp);
                        if (error) {
                                args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
+                               xfs_defer_cancel(args->dfops);
                                goto out;
                        }
 
@@ -891,13 +892,13 @@ restart:
                 * in the index/blkno/rmtblkno/rmtblkcnt fields and
                 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
                 */
-               xfs_bmap_init(args->flist, args->firstblock);
+               xfs_defer_init(args->dfops, args->firstblock);
                error = xfs_da3_split(state);
                if (!error)
-                       error = xfs_bmap_finish(&args->trans, args->flist, dp);
+                       error = xfs_defer_finish(&args->trans, args->dfops, dp);
                if (error) {
                        args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
+                       xfs_defer_cancel(args->dfops);
                        goto out;
                }
        } else {
@@ -990,14 +991,14 @@ restart:
                 * Check to see if the tree needs to be collapsed.
                 */
                if (retval && (state->path.active > 1)) {
-                       xfs_bmap_init(args->flist, args->firstblock);
+                       xfs_defer_init(args->dfops, args->firstblock);
                        error = xfs_da3_join(state);
                        if (!error)
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist, dp);
+                               error = xfs_defer_finish(&args->trans,
+                                                       args->dfops, dp);
                        if (error) {
                                args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
+                               xfs_defer_cancel(args->dfops);
                                goto out;
                        }
                }
@@ -1113,13 +1114,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
         * Check to see if the tree needs to be collapsed.
         */
        if (retval && (state->path.active > 1)) {
-               xfs_bmap_init(args->flist, args->firstblock);
+               xfs_defer_init(args->dfops, args->firstblock);
                error = xfs_da3_join(state);
                if (!error)
-                       error = xfs_bmap_finish(&args->trans, args->flist, dp);
+                       error = xfs_defer_finish(&args->trans, args->dfops, dp);
                if (error) {
                        args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
+                       xfs_defer_cancel(args->dfops);
                        goto out;
                }
                /*
@@ -1146,15 +1147,15 @@ xfs_attr_node_removename(xfs_da_args_t *args)
                        goto out;
 
                if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                       xfs_bmap_init(args->flist, args->firstblock);
+                       xfs_defer_init(args->dfops, args->firstblock);
                        error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
                        /* bp is gone due to xfs_da_shrink_inode */
                        if (!error)
-                               error = xfs_bmap_finish(&args->trans,
-                                                       args->flist, dp);
+                               error = xfs_defer_finish(&args->trans,
+                                                       args->dfops, dp);
                        if (error) {
                                args->trans = NULL;
-                               xfs_bmap_cancel(args->flist);
+                               xfs_defer_cancel(args->dfops);
                                goto out;
                        }
                } else
index 01a5ecfedfcf162cc155d1214cd2746eec5a46b7..8ea91f3630938a63523602e5d14c2553a472c015 100644 (file)
@@ -792,7 +792,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
        nargs.dp = dp;
        nargs.geo = args->geo;
        nargs.firstblock = args->firstblock;
-       nargs.flist = args->flist;
+       nargs.dfops = args->dfops;
        nargs.total = args->total;
        nargs.whichfork = XFS_ATTR_FORK;
        nargs.trans = args->trans;
@@ -922,7 +922,7 @@ xfs_attr3_leaf_to_shortform(
        nargs.geo = args->geo;
        nargs.dp = dp;
        nargs.firstblock = args->firstblock;
-       nargs.flist = args->flist;
+       nargs.dfops = args->dfops;
        nargs.total = args->total;
        nargs.whichfork = XFS_ATTR_FORK;
        nargs.trans = args->trans;
index a572532a55cdc3bcd6f971ef3b84efd2f950183a..d52f525f5b2dffe74be16ccf1bae2494a02787d7 100644 (file)
@@ -24,6 +24,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
@@ -460,16 +461,16 @@ xfs_attr_rmtval_set(
                 * extent and then crash then the block may not contain the
                 * correct metadata after log recovery occurs.
                 */
-               xfs_bmap_init(args->flist, args->firstblock);
+               xfs_defer_init(args->dfops, args->firstblock);
                nmap = 1;
                error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
                                  blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
-                                 args->total, &map, &nmap, args->flist);
+                                 args->total, &map, &nmap, args->dfops);
                if (!error)
-                       error = xfs_bmap_finish(&args->trans, args->flist, dp);
+                       error = xfs_defer_finish(&args->trans, args->dfops, dp);
                if (error) {
                        args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
+                       xfs_defer_cancel(args->dfops);
                        return error;
                }
 
@@ -503,7 +504,7 @@ xfs_attr_rmtval_set(
 
                ASSERT(blkcnt > 0);
 
-               xfs_bmap_init(args->flist, args->firstblock);
+               xfs_defer_init(args->dfops, args->firstblock);
                nmap = 1;
                error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
                                       blkcnt, &map, &nmap,
@@ -603,16 +604,16 @@ xfs_attr_rmtval_remove(
        blkcnt = args->rmtblkcnt;
        done = 0;
        while (!done) {
-               xfs_bmap_init(args->flist, args->firstblock);
+               xfs_defer_init(args->dfops, args->firstblock);
                error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
                                    XFS_BMAPI_ATTRFORK, 1, args->firstblock,
-                                   args->flist, &done);
+                                   args->dfops, &done);
                if (!error)
-                       error = xfs_bmap_finish(&args->trans, args->flist,
+                       error = xfs_defer_finish(&args->trans, args->dfops,
                                                args->dp);
                if (error) {
                        args->trans = NULL;
-                       xfs_bmap_cancel(args->flist);
+                       xfs_defer_cancel(args->dfops);
                        return error;
                }
 
index 2f2c85cc81173dec19952d2a5ae4efe2f9377983..b060bca93402710fcea1fc2da4544e4d856329bf 100644 (file)
@@ -24,6 +24,7 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
@@ -45,6 +46,7 @@
 #include "xfs_symlink.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_filestream.h"
+#include "xfs_rmap.h"
 
 
 kmem_zone_t            *xfs_bmap_free_item_zone;
@@ -570,12 +572,13 @@ xfs_bmap_validate_ret(
  */
 void
 xfs_bmap_add_free(
-       struct xfs_mount        *mp,            /* mount point structure */
-       struct xfs_bmap_free    *flist,         /* list of extents */
-       xfs_fsblock_t           bno,            /* fs block number of extent */
-       xfs_filblks_t           len)            /* length of extent */
+       struct xfs_mount                *mp,
+       struct xfs_defer_ops            *dfops,
+       xfs_fsblock_t                   bno,
+       xfs_filblks_t                   len,
+       struct xfs_owner_info           *oinfo)
 {
-       struct xfs_bmap_free_item       *new;           /* new element */
+       struct xfs_extent_free_item     *new;           /* new element */
 #ifdef DEBUG
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
@@ -592,44 +595,17 @@ xfs_bmap_add_free(
        ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
 #endif
        ASSERT(xfs_bmap_free_item_zone != NULL);
-       new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
-       new->xbfi_startblock = bno;
-       new->xbfi_blockcount = (xfs_extlen_t)len;
-       list_add(&new->xbfi_list, &flist->xbf_flist);
-       flist->xbf_count++;
-}
-
-/*
- * Remove the entry "free" from the free item list.  Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-void
-xfs_bmap_del_free(
-       struct xfs_bmap_free            *flist, /* free item list header */
-       struct xfs_bmap_free_item       *free)  /* list item to be freed */
-{
-       list_del(&free->xbfi_list);
-       flist->xbf_count--;
-       kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
-       struct xfs_bmap_free            *flist) /* list of bmap_free_items */
-{
-       struct xfs_bmap_free_item       *free;  /* free list item */
 
-       if (flist->xbf_count == 0)
-               return;
-       while (!list_empty(&flist->xbf_flist)) {
-               free = list_first_entry(&flist->xbf_flist,
-                               struct xfs_bmap_free_item, xbfi_list);
-               xfs_bmap_del_free(flist, free);
-       }
-       ASSERT(flist->xbf_count == 0);
+       new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+       new->xefi_startblock = bno;
+       new->xefi_blockcount = (xfs_extlen_t)len;
+       if (oinfo)
+               new->xefi_oinfo = *oinfo;
+       else
+               xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+       trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0,
+                       XFS_FSB_TO_AGBNO(mp, bno), len);
+       xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
 }
 
 /*
@@ -659,6 +635,7 @@ xfs_bmap_btree_to_extents(
        xfs_mount_t             *mp;    /* mount point structure */
        __be64                  *pp;    /* ptr to block address */
        struct xfs_btree_block  *rblock;/* root btree block */
+       struct xfs_owner_info   oinfo;
 
        mp = ip->i_mount;
        ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -682,7 +659,8 @@ xfs_bmap_btree_to_extents(
        cblock = XFS_BUF_TO_BLOCK(cbp);
        if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
                return error;
-       xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
+       xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
+       xfs_bmap_add_free(mp, cur->bc_private.b.dfops, cbno, 1, &oinfo);
        ip->i_d.di_nblocks--;
        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
        xfs_trans_binval(tp, cbp);
@@ -705,7 +683,7 @@ xfs_bmap_extents_to_btree(
        xfs_trans_t             *tp,            /* transaction pointer */
        xfs_inode_t             *ip,            /* incore inode pointer */
        xfs_fsblock_t           *firstblock,    /* first-block-allocated */
-       xfs_bmap_free_t         *flist,         /* blocks freed in xaction */
+       struct xfs_defer_ops    *dfops,         /* blocks freed in xaction */
        xfs_btree_cur_t         **curp,         /* cursor returned to caller */
        int                     wasdel,         /* converting a delayed alloc */
        int                     *logflagsp,     /* inode logging flags */
@@ -754,7 +732,7 @@ xfs_bmap_extents_to_btree(
         */
        cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
        cur->bc_private.b.firstblock = *firstblock;
-       cur->bc_private.b.flist = flist;
+       cur->bc_private.b.dfops = dfops;
        cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
        /*
         * Convert to a btree with two levels, one record in root.
@@ -763,11 +741,12 @@ xfs_bmap_extents_to_btree(
        memset(&args, 0, sizeof(args));
        args.tp = tp;
        args.mp = mp;
+       xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork);
        args.firstblock = *firstblock;
        if (*firstblock == NULLFSBLOCK) {
                args.type = XFS_ALLOCTYPE_START_BNO;
                args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
-       } else if (flist->xbf_low) {
+       } else if (dfops->dop_low) {
                args.type = XFS_ALLOCTYPE_START_BNO;
                args.fsbno = *firstblock;
        } else {
@@ -788,7 +767,7 @@ xfs_bmap_extents_to_btree(
        ASSERT(args.fsbno != NULLFSBLOCK);
        ASSERT(*firstblock == NULLFSBLOCK ||
               args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
-              (flist->xbf_low &&
+              (dfops->dop_low &&
                args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
        *firstblock = cur->bc_private.b.firstblock = args.fsbno;
        cur->bc_private.b.allocated++;
@@ -909,6 +888,7 @@ xfs_bmap_local_to_extents(
        memset(&args, 0, sizeof(args));
        args.tp = tp;
        args.mp = ip->i_mount;
+       xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0);
        args.firstblock = *firstblock;
        /*
         * Allocate a block.  We know we need only one, since the
@@ -973,7 +953,7 @@ xfs_bmap_add_attrfork_btree(
        xfs_trans_t             *tp,            /* transaction pointer */
        xfs_inode_t             *ip,            /* incore inode pointer */
        xfs_fsblock_t           *firstblock,    /* first block allocated */
-       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+       struct xfs_defer_ops    *dfops,         /* blocks to free at commit */
        int                     *flags)         /* inode logging flags */
 {
        xfs_btree_cur_t         *cur;           /* btree cursor */
@@ -986,7 +966,7 @@ xfs_bmap_add_attrfork_btree(
                *flags |= XFS_ILOG_DBROOT;
        else {
                cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
-               cur->bc_private.b.flist = flist;
+               cur->bc_private.b.dfops = dfops;
                cur->bc_private.b.firstblock = *firstblock;
                if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
                        goto error0;
@@ -1016,7 +996,7 @@ xfs_bmap_add_attrfork_extents(
        xfs_trans_t             *tp,            /* transaction pointer */
        xfs_inode_t             *ip,            /* incore inode pointer */
        xfs_fsblock_t           *firstblock,    /* first block allocated */
-       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+       struct xfs_defer_ops    *dfops,         /* blocks to free at commit */
        int                     *flags)         /* inode logging flags */
 {
        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
@@ -1025,7 +1005,7 @@ xfs_bmap_add_attrfork_extents(
        if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
                return 0;
        cur = NULL;
-       error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+       error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops, &cur, 0,
                flags, XFS_DATA_FORK);
        if (cur) {
                cur->bc_private.b.allocated = 0;
@@ -1051,7 +1031,7 @@ xfs_bmap_add_attrfork_local(
        xfs_trans_t             *tp,            /* transaction pointer */
        xfs_inode_t             *ip,            /* incore inode pointer */
        xfs_fsblock_t           *firstblock,    /* first block allocated */
-       xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+       struct xfs_defer_ops    *dfops,         /* blocks to free at commit */
        int                     *flags)         /* inode logging flags */
 {
        xfs_da_args_t           dargs;          /* args for dir/attr code */
@@ -1064,7 +1044,7 @@ xfs_bmap_add_attrfork_local(
                dargs.geo = ip->i_mount->m_dir_geo;
                dargs.dp = ip;
                dargs.firstblock = firstblock;
-               dargs.flist = flist;
+               dargs.dfops = dfops;
                dargs.total = dargs.geo->fsbcount;
                dargs.whichfork = XFS_DATA_FORK;
                dargs.trans = tp;
@@ -1092,7 +1072,7 @@ xfs_bmap_add_attrfork(
        int                     rsvd)           /* xact may use reserved blks */
 {
        xfs_fsblock_t           firstblock;     /* 1st block/ag allocated */
-       xfs_bmap_free_t         flist;          /* freed extent records */
+       struct xfs_defer_ops    dfops;          /* freed extent records */
        xfs_mount_t             *mp;            /* mount structure */
        xfs_trans_t             *tp;            /* transaction pointer */
        int                     blks;           /* space reservation */
@@ -1158,18 +1138,18 @@ xfs_bmap_add_attrfork(
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
        ip->i_afp->if_flags = XFS_IFEXTENTS;
        logflags = 0;
-       xfs_bmap_init(&flist, &firstblock);
+       xfs_defer_init(&dfops, &firstblock);
        switch (ip->i_d.di_format) {
        case XFS_DINODE_FMT_LOCAL:
-               error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+               error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &dfops,
                        &logflags);
                break;
        case XFS_DINODE_FMT_EXTENTS:
                error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
-                       &flist, &logflags);
+                       &dfops, &logflags);
                break;
        case XFS_DINODE_FMT_BTREE:
-               error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+               error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &dfops,
                        &logflags);
                break;
        default:
@@ -1198,7 +1178,7 @@ xfs_bmap_add_attrfork(
                        xfs_log_sb(tp);
        }
 
-       error = xfs_bmap_finish(&tp, &flist, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error)
                goto bmap_cancel;
        error = xfs_trans_commit(tp);
@@ -1206,7 +1186,7 @@ xfs_bmap_add_attrfork(
        return error;
 
 bmap_cancel:
-       xfs_bmap_cancel(&flist);
+       xfs_defer_cancel(&dfops);
 trans_cancel:
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -2003,7 +1983,7 @@ xfs_bmap_add_extent_delay_real(
 
                if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                                       bma->firstblock, bma->flist,
+                                       bma->firstblock, bma->dfops,
                                        &bma->cur, 1, &tmp_rval, whichfork);
                        rval |= tmp_rval;
                        if (error)
@@ -2087,7 +2067,7 @@ xfs_bmap_add_extent_delay_real(
 
                if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                               bma->firstblock, bma->flist, &bma->cur, 1,
+                               bma->firstblock, bma->dfops, &bma->cur, 1,
                                &tmp_rval, whichfork);
                        rval |= tmp_rval;
                        if (error)
@@ -2156,7 +2136,7 @@ xfs_bmap_add_extent_delay_real(
 
                if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                                       bma->firstblock, bma->flist, &bma->cur,
+                                       bma->firstblock, bma->dfops, &bma->cur,
                                        1, &tmp_rval, whichfork);
                        rval |= tmp_rval;
                        if (error)
@@ -2199,13 +2179,18 @@ xfs_bmap_add_extent_delay_real(
                ASSERT(0);
        }
 
+       /* add reverse mapping */
+       error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+       if (error)
+               goto done;
+
        /* convert to a btree if necessary */
        if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(bma->cur == NULL);
                error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                               bma->firstblock, bma->flist, &bma->cur,
+                               bma->firstblock, bma->dfops, &bma->cur,
                                da_old > 0, &tmp_logflags, whichfork);
                bma->logflags |= tmp_logflags;
                if (error)
@@ -2247,7 +2232,7 @@ xfs_bmap_add_extent_unwritten_real(
        xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
-       xfs_bmap_free_t         *flist, /* list of extents to be freed */
+       struct xfs_defer_ops    *dfops, /* list of extents to be freed */
        int                     *logflagsp) /* inode logging flags */
 {
        xfs_btree_cur_t         *cur;   /* btree cursor */
@@ -2735,12 +2720,17 @@ xfs_bmap_add_extent_unwritten_real(
                ASSERT(0);
        }
 
+       /* update reverse mappings */
+       error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
+       if (error)
+               goto done;
+
        /* convert to a btree if necessary */
        if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(cur == NULL);
-               error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+               error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
                                0, &tmp_logflags, XFS_DATA_FORK);
                *logflagsp |= tmp_logflags;
                if (error)
@@ -3127,13 +3117,18 @@ xfs_bmap_add_extent_hole_real(
                break;
        }
 
+       /* add reverse mapping */
+       error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+       if (error)
+               goto done;
+
        /* convert to a btree if necessary */
        if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(bma->cur == NULL);
                error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-                               bma->firstblock, bma->flist, &bma->cur,
+                               bma->firstblock, bma->dfops, &bma->cur,
                                0, &tmp_logflags, whichfork);
                bma->logflags |= tmp_logflags;
                if (error)
@@ -3691,9 +3686,10 @@ xfs_bmap_btalloc(
        args.tp = ap->tp;
        args.mp = mp;
        args.fsbno = ap->blkno;
+       xfs_rmap_skip_owner_update(&args.oinfo);
 
        /* Trim the allocation back to the maximum an AG can fit. */
-       args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+       args.maxlen = MIN(ap->length, mp->m_ag_max_usable);
        args.firstblock = *ap->firstblock;
        blen = 0;
        if (nullfb) {
@@ -3708,7 +3704,7 @@ xfs_bmap_btalloc(
                        error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
                if (error)
                        return error;
-       } else if (ap->flist->xbf_low) {
+       } else if (ap->dfops->dop_low) {
                if (xfs_inode_is_filestream(ap->ip))
                        args.type = XFS_ALLOCTYPE_FIRST_AG;
                else
@@ -3741,7 +3737,7 @@ xfs_bmap_btalloc(
         * is >= the stripe unit and the allocation offset is
         * at the end of file.
         */
-       if (!ap->flist->xbf_low && ap->aeof) {
+       if (!ap->dfops->dop_low && ap->aeof) {
                if (!ap->offset) {
                        args.alignment = stripe_align;
                        atype = args.type;
@@ -3834,7 +3830,7 @@ xfs_bmap_btalloc(
                args.minleft = 0;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
-               ap->flist->xbf_low = 1;
+               ap->dfops->dop_low = true;
        }
        if (args.fsbno != NULLFSBLOCK) {
                /*
@@ -3844,7 +3840,7 @@ xfs_bmap_btalloc(
                ASSERT(*ap->firstblock == NULLFSBLOCK ||
                       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
                       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
-                      (ap->flist->xbf_low &&
+                      (ap->dfops->dop_low &&
                        XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
                        XFS_FSB_TO_AGNO(mp, args.fsbno)));
 
@@ -3852,7 +3848,7 @@ xfs_bmap_btalloc(
                if (*ap->firstblock == NULLFSBLOCK)
                        *ap->firstblock = args.fsbno;
                ASSERT(nullfb || fb_agno == args.agno ||
-                      (ap->flist->xbf_low && fb_agno < args.agno));
+                      (ap->dfops->dop_low && fb_agno < args.agno));
                ap->length = args.len;
                ap->ip->i_d.di_nblocks += args.len;
                xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
@@ -4319,7 +4315,7 @@ xfs_bmapi_allocate(
        if (error)
                return error;
 
-       if (bma->flist->xbf_low)
+       if (bma->dfops->dop_low)
                bma->minleft = 0;
        if (bma->cur)
                bma->cur->bc_private.b.firstblock = *bma->firstblock;
@@ -4328,7 +4324,7 @@ xfs_bmapi_allocate(
        if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
                bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
                bma->cur->bc_private.b.firstblock = *bma->firstblock;
-               bma->cur->bc_private.b.flist = bma->flist;
+               bma->cur->bc_private.b.dfops = bma->dfops;
        }
        /*
         * Bump the number of extents we've allocated
@@ -4409,7 +4405,7 @@ xfs_bmapi_convert_unwritten(
                bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
                                        bma->ip, whichfork);
                bma->cur->bc_private.b.firstblock = *bma->firstblock;
-               bma->cur->bc_private.b.flist = bma->flist;
+               bma->cur->bc_private.b.dfops = bma->dfops;
        }
        mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
                                ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
@@ -4426,7 +4422,7 @@ xfs_bmapi_convert_unwritten(
        }
 
        error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
-                       &bma->cur, mval, bma->firstblock, bma->flist,
+                       &bma->cur, mval, bma->firstblock, bma->dfops,
                        &tmp_logflags);
        /*
         * Log the inode core unconditionally in the unwritten extent conversion
@@ -4480,7 +4476,7 @@ xfs_bmapi_write(
        xfs_extlen_t            total,          /* total blocks needed */
        struct xfs_bmbt_irec    *mval,          /* output: map values */
        int                     *nmap,          /* i/o: mval size/count */
-       struct xfs_bmap_free    *flist)         /* i/o: list extents to free */
+       struct xfs_defer_ops    *dfops)         /* i/o: list extents to free */
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_ifork        *ifp;
@@ -4570,7 +4566,7 @@ xfs_bmapi_write(
        bma.ip = ip;
        bma.total = total;
        bma.userdata = 0;
-       bma.flist = flist;
+       bma.dfops = dfops;
        bma.firstblock = firstblock;
 
        while (bno < end && n < *nmap) {
@@ -4684,7 +4680,7 @@ error0:
                               XFS_FSB_TO_AGNO(mp, *firstblock) ==
                               XFS_FSB_TO_AGNO(mp,
                                       bma.cur->bc_private.b.firstblock) ||
-                              (flist->xbf_low &&
+                              (dfops->dop_low &&
                                XFS_FSB_TO_AGNO(mp, *firstblock) <
                                XFS_FSB_TO_AGNO(mp,
                                        bma.cur->bc_private.b.firstblock)));
@@ -4768,7 +4764,7 @@ xfs_bmap_del_extent(
        xfs_inode_t             *ip,    /* incore inode pointer */
        xfs_trans_t             *tp,    /* current transaction pointer */
        xfs_extnum_t            *idx,   /* extent number to update/delete */
-       xfs_bmap_free_t         *flist, /* list of extents to be freed */
+       struct xfs_defer_ops    *dfops, /* list of extents to be freed */
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *del,   /* data to remove from extents */
        int                     *logflagsp, /* inode logging flags */
@@ -4870,6 +4866,7 @@ xfs_bmap_del_extent(
                nblks = 0;
                do_fx = 0;
        }
+
        /*
         * Set flag value to use in switch statement.
         * Left-contig is 2, right-contig is 1.
@@ -5052,12 +5049,20 @@ xfs_bmap_del_extent(
                ++*idx;
                break;
        }
+
+       /* remove reverse mapping */
+       if (!delay) {
+               error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del);
+               if (error)
+                       goto done;
+       }
+
        /*
         * If we need to, add to list of extents to delete.
         */
        if (do_fx)
-               xfs_bmap_add_free(mp, flist, del->br_startblock,
-                       del->br_blockcount);
+               xfs_bmap_add_free(mp, dfops, del->br_startblock,
+                               del->br_blockcount, NULL);
        /*
         * Adjust inode # blocks in the file.
         */
@@ -5097,7 +5102,7 @@ xfs_bunmapi(
        xfs_extnum_t            nexts,          /* number of extents max */
        xfs_fsblock_t           *firstblock,    /* first allocated block
                                                   controls a.g. for allocs */
-       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+       struct xfs_defer_ops    *dfops,         /* i/o: list extents to free */
        int                     *done)          /* set if not done yet */
 {
        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
@@ -5170,7 +5175,7 @@ xfs_bunmapi(
                ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
                cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
                cur->bc_private.b.firstblock = *firstblock;
-               cur->bc_private.b.flist = flist;
+               cur->bc_private.b.dfops = dfops;
                cur->bc_private.b.flags = 0;
        } else
                cur = NULL;
@@ -5179,8 +5184,10 @@ xfs_bunmapi(
                /*
                 * Synchronize by locking the bitmap inode.
                 */
-               xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+               xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
                xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+               xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+               xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
        }
 
        extno = 0;
@@ -5262,7 +5269,7 @@ xfs_bunmapi(
                        }
                        del.br_state = XFS_EXT_UNWRITTEN;
                        error = xfs_bmap_add_extent_unwritten_real(tp, ip,
-                                       &lastx, &cur, &del, firstblock, flist,
+                                       &lastx, &cur, &del, firstblock, dfops,
                                        &logflags);
                        if (error)
                                goto error0;
@@ -5321,7 +5328,7 @@ xfs_bunmapi(
                                lastx--;
                                error = xfs_bmap_add_extent_unwritten_real(tp,
                                                ip, &lastx, &cur, &prev,
-                                               firstblock, flist, &logflags);
+                                               firstblock, dfops, &logflags);
                                if (error)
                                        goto error0;
                                goto nodelete;
@@ -5330,7 +5337,7 @@ xfs_bunmapi(
                                del.br_state = XFS_EXT_UNWRITTEN;
                                error = xfs_bmap_add_extent_unwritten_real(tp,
                                                ip, &lastx, &cur, &del,
-                                               firstblock, flist, &logflags);
+                                               firstblock, dfops, &logflags);
                                if (error)
                                        goto error0;
                                goto nodelete;
@@ -5388,7 +5395,7 @@ xfs_bunmapi(
                } else if (cur)
                        cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
 
-               error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+               error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
                                &tmp_logflags, whichfork);
                logflags |= tmp_logflags;
                if (error)
@@ -5422,7 +5429,7 @@ nodelete:
         */
        if (xfs_bmap_needs_btree(ip, whichfork)) {
                ASSERT(cur == NULL);
-               error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+               error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops,
                        &cur, 0, &tmp_logflags, whichfork);
                logflags |= tmp_logflags;
                if (error)
@@ -5589,7 +5596,8 @@ xfs_bmse_shift_one(
        struct xfs_bmbt_rec_host        *gotp,
        struct xfs_btree_cur            *cur,
        int                             *logflags,
-       enum shift_direction            direction)
+       enum shift_direction            direction,
+       struct xfs_defer_ops            *dfops)
 {
        struct xfs_ifork                *ifp;
        struct xfs_mount                *mp;
@@ -5637,9 +5645,13 @@ xfs_bmse_shift_one(
                /* check whether to merge the extent or shift it down */
                if (xfs_bmse_can_merge(&adj_irec, &got,
                                       offset_shift_fsb)) {
-                       return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
-                                             *current_ext, gotp, adj_irecp,
-                                             cur, logflags);
+                       error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+                                              *current_ext, gotp, adj_irecp,
+                                              cur, logflags);
+                       if (error)
+                               return error;
+                       adj_irec = got;
+                       goto update_rmap;
                }
        } else {
                startoff = got.br_startoff + offset_shift_fsb;
@@ -5676,9 +5688,10 @@ update_current_ext:
                (*current_ext)--;
        xfs_bmbt_set_startoff(gotp, startoff);
        *logflags |= XFS_ILOG_CORE;
+       adj_irec = got;
        if (!cur) {
                *logflags |= XFS_ILOG_DEXT;
-               return 0;
+               goto update_rmap;
        }
 
        error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
@@ -5688,8 +5701,18 @@ update_current_ext:
        XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 
        got.br_startoff = startoff;
-       return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
-                              got.br_blockcount, got.br_state);
+       error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+                       got.br_blockcount, got.br_state);
+       if (error)
+               return error;
+
+update_rmap:
+       /* update reverse mapping */
+       error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec);
+       if (error)
+               return error;
+       adj_irec.br_startoff = startoff;
+       return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec);
 }
 
 /*
@@ -5711,7 +5734,7 @@ xfs_bmap_shift_extents(
        int                     *done,
        xfs_fileoff_t           stop_fsb,
        xfs_fsblock_t           *firstblock,
-       struct xfs_bmap_free    *flist,
+       struct xfs_defer_ops    *dfops,
        enum shift_direction    direction,
        int                     num_exts)
 {
@@ -5756,7 +5779,7 @@ xfs_bmap_shift_extents(
        if (ifp->if_flags & XFS_IFBROOT) {
                cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
                cur->bc_private.b.firstblock = *firstblock;
-               cur->bc_private.b.flist = flist;
+               cur->bc_private.b.dfops = dfops;
                cur->bc_private.b.flags = 0;
        }
 
@@ -5817,7 +5840,7 @@ xfs_bmap_shift_extents(
        while (nexts++ < num_exts) {
                error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
                                           &current_ext, gotp, cur, &logflags,
-                                          direction);
+                                          direction, dfops);
                if (error)
                        goto del_cursor;
                /*
@@ -5865,7 +5888,7 @@ xfs_bmap_split_extent_at(
        struct xfs_inode        *ip,
        xfs_fileoff_t           split_fsb,
        xfs_fsblock_t           *firstfsb,
-       struct xfs_bmap_free    *free_list)
+       struct xfs_defer_ops    *dfops)
 {
        int                             whichfork = XFS_DATA_FORK;
        struct xfs_btree_cur            *cur = NULL;
@@ -5927,7 +5950,7 @@ xfs_bmap_split_extent_at(
        if (ifp->if_flags & XFS_IFBROOT) {
                cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
                cur->bc_private.b.firstblock = *firstfsb;
-               cur->bc_private.b.flist = free_list;
+               cur->bc_private.b.dfops = dfops;
                cur->bc_private.b.flags = 0;
                error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
                                got.br_startblock,
@@ -5980,7 +6003,7 @@ xfs_bmap_split_extent_at(
                int tmp_logflags; /* partial log flag return val */
 
                ASSERT(cur == NULL);
-               error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
+               error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, dfops,
                                &cur, 0, &tmp_logflags, whichfork);
                logflags |= tmp_logflags;
        }
@@ -6004,7 +6027,7 @@ xfs_bmap_split_extent(
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
-       struct xfs_bmap_free    free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           firstfsb;
        int                     error;
 
@@ -6016,21 +6039,21 @@ xfs_bmap_split_extent(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
-       xfs_bmap_init(&free_list, &firstfsb);
+       xfs_defer_init(&dfops, &firstfsb);
 
        error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
-                       &firstfsb, &free_list);
+                       &firstfsb, &dfops);
        if (error)
                goto out;
 
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error)
                goto out;
 
        return xfs_trans_commit(tp);
 
 out:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
        xfs_trans_cancel(tp);
        return error;
 }
index f1f3ae6c0a3f5d9b507002819afb4632eb1f95ba..254034f9694135c94ad5827c21608f51f183fe3e 100644 (file)
@@ -32,7 +32,7 @@ extern kmem_zone_t    *xfs_bmap_free_item_zone;
  */
 struct xfs_bmalloca {
        xfs_fsblock_t           *firstblock; /* i/o first block allocated */
-       struct xfs_bmap_free    *flist; /* bmap freelist */
+       struct xfs_defer_ops    *dfops; /* bmap freelist */
        struct xfs_trans        *tp;    /* transaction pointer */
        struct xfs_inode        *ip;    /* incore inode pointer */
        struct xfs_bmbt_irec    prev;   /* extent before the new one */
@@ -62,34 +62,14 @@ struct xfs_bmalloca {
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
-struct xfs_bmap_free_item
+struct xfs_extent_free_item
 {
-       xfs_fsblock_t           xbfi_startblock;/* starting fs block number */
-       xfs_extlen_t            xbfi_blockcount;/* number of blocks in extent */
-       struct list_head        xbfi_list;
+       xfs_fsblock_t           xefi_startblock;/* starting fs block number */
+       xfs_extlen_t            xefi_blockcount;/* number of blocks in extent */
+       struct list_head        xefi_list;
+       struct xfs_owner_info   xefi_oinfo;     /* extent owner */
 };
 
-/*
- * Header for free extent list.
- *
- * xbf_low is used by the allocator to activate the lowspace algorithm -
- * when free space is running low the extent allocator may choose to
- * allocate an extent from an AG without leaving sufficient space for
- * a btree split when inserting the new extent.  In this case the allocator
- * will enable the lowspace algorithm which is supposed to allow further
- * allocations (such as btree splits and newroots) to allocate from
- * sequential AGs.  In order to avoid locking AGs out of order the lowspace
- * algorithm will start searching for free space from AG 0.  If the correct
- * transaction reservations have been made then this algorithm will eventually
- * find all the space it needs.
- */
-typedef        struct xfs_bmap_free
-{
-       struct list_head        xbf_flist;      /* list of to-be-free extents */
-       int                     xbf_count;      /* count of items on list */
-       int                     xbf_low;        /* alloc in low mode */
-} xfs_bmap_free_t;
-
 #define        XFS_BMAP_MAX_NMAP       4
 
 /*
@@ -139,14 +119,6 @@ static inline int xfs_bmapi_aflag(int w)
 #define        DELAYSTARTBLOCK         ((xfs_fsblock_t)-1LL)
 #define        HOLESTARTBLOCK          ((xfs_fsblock_t)-2LL)
 
-static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
-{
-       INIT_LIST_HEAD(&flp->xbf_flist);
-       flp->xbf_count = 0;
-       flp->xbf_low = 0;
-       *fbp = NULLFSBLOCK;
-}
-
 /*
  * Flags for xfs_bmap_add_extent*.
  */
@@ -193,11 +165,9 @@ void       xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 
 int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void   xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
-                         xfs_fsblock_t bno, xfs_filblks_t len);
-void   xfs_bmap_cancel(struct xfs_bmap_free *flist);
-int    xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
-                       struct xfs_inode *ip);
+void   xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+                         xfs_fsblock_t bno, xfs_filblks_t len,
+                         struct xfs_owner_info *oinfo);
 void   xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
 int    xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
                xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -218,18 +188,18 @@ int       xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
                xfs_fileoff_t bno, xfs_filblks_t len, int flags,
                xfs_fsblock_t *firstblock, xfs_extlen_t total,
                struct xfs_bmbt_irec *mval, int *nmap,
-               struct xfs_bmap_free *flist);
+               struct xfs_defer_ops *dfops);
 int    xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
                xfs_fileoff_t bno, xfs_filblks_t len, int flags,
                xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
-               struct xfs_bmap_free *flist, int *done);
+               struct xfs_defer_ops *dfops, int *done);
 int    xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
                xfs_extnum_t num);
 uint   xfs_default_attroffset(struct xfs_inode *ip);
 int    xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
                xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
                int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
-               struct xfs_bmap_free *flist, enum shift_direction direction,
+               struct xfs_defer_ops *dfops, enum shift_direction direction,
                int num_exts);
 int    xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
 
index db0c71e470c9575d54d5fd2a268eda6c49f7ff3a..cd85274e810cd1457dd62dfa7abfb725138a35fc 100644 (file)
@@ -23,6 +23,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
@@ -34,6 +35,7 @@
 #include "xfs_quota.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_rmap.h"
 
 /*
  * Determine the extent state.
@@ -406,11 +408,11 @@ xfs_bmbt_dup_cursor(
                        cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 
        /*
-        * Copy the firstblock, flist, and flags values,
+        * Copy the firstblock, dfops, and flags values,
         * since init cursor doesn't get them.
         */
        new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
-       new->bc_private.b.flist = cur->bc_private.b.flist;
+       new->bc_private.b.dfops = cur->bc_private.b.dfops;
        new->bc_private.b.flags = cur->bc_private.b.flags;
 
        return new;
@@ -423,7 +425,7 @@ xfs_bmbt_update_cursor(
 {
        ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
               (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
-       ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+       ASSERT(dst->bc_private.b.dfops == src->bc_private.b.dfops);
 
        dst->bc_private.b.allocated += src->bc_private.b.allocated;
        dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
@@ -446,6 +448,8 @@ xfs_bmbt_alloc_block(
        args.mp = cur->bc_mp;
        args.fsbno = cur->bc_private.b.firstblock;
        args.firstblock = args.fsbno;
+       xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
+                       cur->bc_private.b.whichfork);
 
        if (args.fsbno == NULLFSBLOCK) {
                args.fsbno = be64_to_cpu(start->l);
@@ -462,7 +466,7 @@ xfs_bmbt_alloc_block(
                 * block allocation here and corrupt the filesystem.
                 */
                args.minleft = args.tp->t_blk_res;
-       } else if (cur->bc_private.b.flist->xbf_low) {
+       } else if (cur->bc_private.b.dfops->dop_low) {
                args.type = XFS_ALLOCTYPE_START_BNO;
        } else {
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -490,7 +494,7 @@ xfs_bmbt_alloc_block(
                error = xfs_alloc_vextent(&args);
                if (error)
                        goto error0;
-               cur->bc_private.b.flist->xbf_low = 1;
+               cur->bc_private.b.dfops->dop_low = true;
        }
        if (args.fsbno == NULLFSBLOCK) {
                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
@@ -525,8 +529,10 @@ xfs_bmbt_free_block(
        struct xfs_inode        *ip = cur->bc_private.b.ip;
        struct xfs_trans        *tp = cur->bc_tp;
        xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+       struct xfs_owner_info   oinfo;
 
-       xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
+       xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork);
+       xfs_bmap_add_free(mp, cur->bc_private.b.dfops, fsbno, 1, &oinfo);
        ip->i_d.di_nblocks--;
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -599,17 +605,6 @@ xfs_bmbt_init_key_from_rec(
                cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
 }
 
-STATIC void
-xfs_bmbt_init_rec_from_key(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       ASSERT(key->bmbt.br_startoff != 0);
-
-       xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
-                              0, 0, XFS_EXT_NORM);
-}
-
 STATIC void
 xfs_bmbt_init_rec_from_cur(
        struct xfs_btree_cur    *cur,
@@ -760,7 +755,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
        .get_minrecs            = xfs_bmbt_get_minrecs,
        .get_dmaxrecs           = xfs_bmbt_get_dmaxrecs,
        .init_key_from_rec      = xfs_bmbt_init_key_from_rec,
-       .init_rec_from_key      = xfs_bmbt_init_rec_from_key,
        .init_rec_from_cur      = xfs_bmbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_bmbt_init_ptr_from_cur,
        .key_diff               = xfs_bmbt_key_diff,
@@ -800,7 +794,7 @@ xfs_bmbt_init_cursor(
        cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
        cur->bc_private.b.ip = ip;
        cur->bc_private.b.firstblock = NULLFSBLOCK;
-       cur->bc_private.b.flist = NULL;
+       cur->bc_private.b.dfops = NULL;
        cur->bc_private.b.allocated = 0;
        cur->bc_private.b.flags = 0;
        cur->bc_private.b.whichfork = whichfork;
index 07eeb0b4ca74f1253eccbc3a2d0434278f0165ee..b5c213a051cde3f703227f10b73e367470a4df43 100644 (file)
@@ -23,6 +23,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
@@ -43,15 +44,14 @@ kmem_zone_t *xfs_btree_cur_zone;
  * Btree magic numbers.
  */
 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
-       { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+       { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
          XFS_FIBT_MAGIC },
-       { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+       { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
 };
 #define xfs_btree_magic(cur) \
        xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
 
-
 STATIC int                             /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
        struct xfs_btree_cur    *cur,   /* btree cursor */
@@ -428,6 +428,50 @@ xfs_btree_dup_cursor(
  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
  * inside the btree block is done using indices starting at one, not zero!
+ *
+ * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * overlapping intervals.  In such a tree, records are still sorted lowest to
+ * highest and indexed by the smallest key value that refers to the record.
+ * However, nodes are different: each pointer has two associated keys -- one
+ * indexing the lowest key available in the block(s) below (the same behavior
+ * as the key in a regular btree) and another indexing the highest key
+ * available in the block(s) below.  Because records are /not/ sorted by the
+ * highest key, all leaf block updates require us to compute the highest key
+ * that matches any record in the leaf and to recursively update the high keys
+ * in the nodes going further up in the tree, if necessary.  Nodes look like
+ * this:
+ *
+ *             +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ * Non-Leaf:   | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
+ *             +--------+-----+-----+-----+-----+-----+-------+-------+-----+
+ *
+ * To perform an interval query on an overlapped tree, perform the usual
+ * depth-first search and use the low and high keys to decide if we can skip
+ * that particular node.  If a leaf node is reached, return the records that
+ * intersect the interval.  Note that an interval query may return numerous
+ * entries.  For a non-overlapped tree, simply search for the record associated
+ * with the lowest key and iterate forward until a non-matching record is
+ * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
+ * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discusses this
+ * in more detail.
+ *
+ * Why do we care about overlapping intervals?  Let's say you have a bunch of
+ * reverse mapping records on a reflink filesystem:
+ *
+ * 1: +- file A startblock B offset C length D -----------+
+ * 2:      +- file E startblock F offset G length H --------------+
+ * 3:      +- file I startblock F offset J length K --+
+ * 4:                                                        +- file L... --+
+ *
+ * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
+ * we'd simply increment the length of record 1.  But how do we find the record
+ * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
+ * record 3 because the keys are ordered first by startblock.  An interval
+ * query would return records 1 and 2 because they both overlap (B+D-1), and
+ * from that we can pick out record 1 as the appropriate left neighbor.
+ *
+ * In the non-overlapped case you can do a LE lookup and decrement the cursor
+ * because a record's interval must end before the next record.
  */
 
 /*
@@ -478,6 +522,18 @@ xfs_btree_key_offset(
                (n - 1) * cur->bc_ops->key_len;
 }
 
+/*
+ * Calculate offset of the n-th high key in a btree block.
+ *
+ * n is a one-based index.  The result is xfs_btree_block_len(cur) plus
+ * (n - 1) whole key slots of bc_ops->key_len bytes, plus half a slot: the
+ * high key is taken to occupy the second half of the n-th key slot.
+ */
+STATIC size_t
+xfs_btree_high_key_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n)
+{
+       return xfs_btree_block_len(cur) +
+               (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
+}
+
 /*
  * Calculate offset of the n-th block pointer in a btree block.
  */
@@ -518,6 +574,19 @@ xfs_btree_key_addr(
                ((char *)block + xfs_btree_key_offset(cur, n));
 }
 
+/*
+ * Return a pointer to the n-th high key in the btree block.
+ *
+ * This is plain pointer arithmetic: the block's base address advanced by
+ * xfs_btree_high_key_offset(cur, n) bytes.  n is not range-checked here.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_high_key_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       return (union xfs_btree_key *)
+               ((char *)block + xfs_btree_high_key_offset(cur, n));
+}
+
 /*
  * Return a pointer to the n-th block pointer in the btree block.
  */
@@ -1144,6 +1213,9 @@ xfs_btree_set_refs(
        case XFS_BTNUM_BMAP:
                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
                break;
+       case XFS_BTNUM_RMAP:
+               xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
+               break;
        default:
                ASSERT(0);
        }
@@ -1879,32 +1951,214 @@ error0:
        return error;
 }
 
+/*
+ * Find the high key storage area from a regular key.
+ *
+ * Returns key advanced by half of bc_ops->key_len bytes.  Only meaningful
+ * for cursors with XFS_BTREE_OVERLAPPING set, which is asserted.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_high_key_from_key(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+       return (union xfs_btree_key *)((char *)key +
+                       (cur->bc_ops->key_len / 2));
+}
+
+/*
+ * Determine the low (and high if overlapped) keys of a leaf block.
+ *
+ * The low key is built from the first record with init_key_from_rec.  For
+ * an XFS_BTREE_OVERLAPPING cursor, every record's high key is also computed
+ * with init_high_key_from_rec, the largest one according to diff_two_keys is
+ * kept, and half a key_len of it is copied to the high key area that follows
+ * the low key in key.
+ *
+ * Record 1 is read unconditionally, so the block is assumed to hold at least
+ * one record.
+ */
+STATIC void
+xfs_btree_get_leaf_keys(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_key     *key)
+{
+       union xfs_btree_key     max_hkey;
+       union xfs_btree_key     hkey;
+       union xfs_btree_rec     *rec;
+       union xfs_btree_key     *high;
+       int                     n;
+
+       rec = xfs_btree_rec_addr(cur, 1, block);
+       cur->bc_ops->init_key_from_rec(key, rec);
+
+       if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+
+               cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
+               for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+                       rec = xfs_btree_rec_addr(cur, n, block);
+                       cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+                       if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
+                                       > 0)
+                               max_hkey = hkey;
+               }
+
+               high = xfs_btree_high_key_from_key(cur, key);
+               memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
+       }
+}
+
+/*
+ * Determine the low (and high if overlapped) keys of a node block.
+ *
+ * Without XFS_BTREE_OVERLAPPING, the block's first key is copied out whole
+ * (key_len bytes).  With it, only the first half of the first key is copied
+ * as the low key; the block's high keys are then scanned, and the largest
+ * one according to diff_two_keys is copied into the high key area of key.
+ * hkey and max_hkey point directly into the block, not at copies.
+ */
+STATIC void
+xfs_btree_get_node_keys(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_key     *key)
+{
+       union xfs_btree_key     *hkey;
+       union xfs_btree_key     *max_hkey;
+       union xfs_btree_key     *high;
+       int                     n;
+
+       if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+               memcpy(key, xfs_btree_key_addr(cur, 1, block),
+                               cur->bc_ops->key_len / 2);
+
+               max_hkey = xfs_btree_high_key_addr(cur, 1, block);
+               for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
+                       hkey = xfs_btree_high_key_addr(cur, n, block);
+                       if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
+                               max_hkey = hkey;
+               }
+
+               high = xfs_btree_high_key_from_key(cur, key);
+               memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
+       } else {
+               memcpy(key, xfs_btree_key_addr(cur, 1, block),
+                               cur->bc_ops->key_len);
+       }
+}
+
+/*
+ * Derive the keys for any btree block: level 0 blocks (per bb_level) are
+ * handled as leaves, everything else as nodes.
+ */
+STATIC void
+xfs_btree_get_keys(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_key     *key)
+{
+       if (be16_to_cpu(block->bb_level) == 0)
+               xfs_btree_get_leaf_keys(cur, block, key);
+       else
+               xfs_btree_get_node_keys(cur, block, key);
+}
+
 /*
- * Update keys at all levels from here to the root along the cursor's path.
+ * Decide if we need to update the parent keys of a btree block.  For
+ * a standard btree this is only necessary if we're updating the first
+ * record/key.  For an overlapping btree, we must always update the
+ * keys because the highest key can be in any of the records or keys
+ * in the block.
+ */
+static inline bool
+xfs_btree_needs_key_update(
+       struct xfs_btree_cur    *cur,
+       int                     ptr)    /* 1-based index of changed entry */
+{
+       return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+}
+
+/*
+ * Update the low and high parent keys of the given level, progressing
+ * towards the root.  If force_all is false, stop if the keys for a given
+ * level do not need updating.
+ *
+ * block and bp0 are the block (and its buffer) at the starting level; bp0
+ * is only passed to the tracepoint.  At each parent level the key pair at
+ * cur->bc_ptrs[level] is compared against the keys computed from the level
+ * below, overwritten and logged if it differs (or if force_all is set), and
+ * then the parent's own keys are recomputed for the next level up.
+ *
+ * Only valid for XFS_BTREE_OVERLAPPING cursors (asserted).  Returns 0,
+ * except that DEBUG builds return the error from xfs_btree_check_block.
  */
 STATIC int
-xfs_btree_updkey(
+__xfs_btree_updkeys(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       struct xfs_btree_block  *block,
+       struct xfs_buf          *bp0,
+       bool                    force_all)
+{
+       union xfs_btree_bigkey  key;    /* keys from current level */
+       union xfs_btree_key     *lkey;  /* low key, points into key */
+       union xfs_btree_key     *hkey;  /* high key, points into key */
+       union xfs_btree_key     *nlkey; /* keys from the next level up */
+       union xfs_btree_key     *nhkey; /* high key from the next level up */
+       struct xfs_buf          *bp;
+       int                     ptr;
+
+       ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+
+       /* Exit if there aren't any parent levels to update. */
+       if (level + 1 >= cur->bc_nlevels)
+               return 0;
+
+       trace_xfs_btree_updkeys(cur, level, bp0);
+
+       lkey = (union xfs_btree_key *)&key;
+       hkey = xfs_btree_high_key_from_key(cur, lkey);
+       xfs_btree_get_keys(cur, block, lkey);
+       for (level++; level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+               int             error;
+#endif
+               block = xfs_btree_get_block(cur, level, &bp);
+               trace_xfs_btree_updkeys(cur, level, bp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, block, level, bp);
+               if (error) {
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                       return error;
+               }
+#endif
+               ptr = cur->bc_ptrs[level];
+               nlkey = xfs_btree_key_addr(cur, ptr, block);
+               nhkey = xfs_btree_high_key_addr(cur, ptr, block);
+               if (!force_all &&
+                   !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
+                     cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
+                       break;  /* parent keys already match */
+               xfs_btree_copy_keys(cur, nlkey, lkey, 1);
+               xfs_btree_log_keys(cur, bp, ptr, ptr);
+               if (level + 1 >= cur->bc_nlevels)
+                       break;  /* no level above this one */
+               xfs_btree_get_node_keys(cur, block, lkey);
+       }
+
+       return 0;
+}
+
+/*
+ * Update all the keys from some level in cursor back to the root.
+ *
+ * Looks up the cursor's block at the given level and runs
+ * __xfs_btree_updkeys with force_all set, so every parent level is
+ * rewritten even where the keys already match.  __xfs_btree_updkeys asserts
+ * XFS_BTREE_OVERLAPPING, so this is for overlapping btrees only.
+ */
+STATIC int
+xfs_btree_updkeys_force(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       struct xfs_buf          *bp;
+       struct xfs_btree_block  *block;
+
+       block = xfs_btree_get_block(cur, level, &bp);
+       return __xfs_btree_updkeys(cur, level, block, bp, true);
+}
+
+/*
+ * Update the parent keys of the given level, progressing towards the root.
+ */
+STATIC int
+xfs_btree_update_keys(
        struct xfs_btree_cur    *cur,
-       union xfs_btree_key     *keyp,
        int                     level)
 {
        struct xfs_btree_block  *block;
        struct xfs_buf          *bp;
        union xfs_btree_key     *kp;
+       union xfs_btree_key     key;
        int                     ptr;
 
+       ASSERT(level >= 0);
+
+       block = xfs_btree_get_block(cur, level, &bp);
+       if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+               return __xfs_btree_updkeys(cur, level, block, bp, false);
+
        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
        XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
 
-       ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
-
        /*
         * Go up the tree from this level toward the root.
         * At each level, update the key value to the value input.
         * Stop when we reach a level where the cursor isn't pointing
         * at the first entry in the block.
         */
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+       xfs_btree_get_keys(cur, block, &key);
+       for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
 #ifdef DEBUG
                int             error;
 #endif
@@ -1918,7 +2172,7 @@ xfs_btree_updkey(
 #endif
                ptr = cur->bc_ptrs[level];
                kp = xfs_btree_key_addr(cur, ptr, block);
-               xfs_btree_copy_keys(cur, kp, keyp, 1);
+               xfs_btree_copy_keys(cur, kp, &key, 1);
                xfs_btree_log_keys(cur, bp, ptr, ptr);
        }
 
@@ -1970,12 +2224,9 @@ xfs_btree_update(
                                            ptr, LASTREC_UPDATE);
        }
 
-       /* Updating first rec in leaf. Pass new key value up to our parent. */
-       if (ptr == 1) {
-               union xfs_btree_key     key;
-
-               cur->bc_ops->init_key_from_rec(&key, rec);
-               error = xfs_btree_updkey(cur, &key, 1);
+       /* Pass new key value up to our parent. */
+       if (xfs_btree_needs_key_update(cur, ptr)) {
+               error = xfs_btree_update_keys(cur, 0);
                if (error)
                        goto error0;
        }
@@ -1998,18 +2249,19 @@ xfs_btree_lshift(
        int                     level,
        int                     *stat)          /* success/failure */
 {
-       union xfs_btree_key     key;            /* btree key */
        struct xfs_buf          *lbp;           /* left buffer pointer */
        struct xfs_btree_block  *left;          /* left btree block */
        int                     lrecs;          /* left record count */
        struct xfs_buf          *rbp;           /* right buffer pointer */
        struct xfs_btree_block  *right;         /* right btree block */
+       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
        int                     rrecs;          /* right record count */
        union xfs_btree_ptr     lptr;           /* left btree pointer */
        union xfs_btree_key     *rkp = NULL;    /* right btree key */
        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
        int                     error;          /* error return value */
+       int                     i;
 
        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
        XFS_BTREE_TRACE_ARGI(cur, level);
@@ -2139,18 +2391,33 @@ xfs_btree_lshift(
                        xfs_btree_rec_addr(cur, 2, right),
                        -1, rrecs);
                xfs_btree_log_recs(cur, rbp, 1, rrecs);
+       }
 
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               cur->bc_ops->init_key_from_rec(&key,
-                       xfs_btree_rec_addr(cur, 1, right));
-               rkp = &key;
+       /*
+        * Using a temporary cursor, update the parent key values of the
+        * block on the left.
+        */
+       if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+               error = xfs_btree_dup_cursor(cur, &tcur);
+               if (error)
+                       goto error0;
+               i = xfs_btree_firstrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
+
+               error = xfs_btree_decrement(tcur, level, &i);
+               if (error)
+                       goto error1;
+
+               /* Update the parent high keys of the left block, if needed. */
+               error = xfs_btree_update_keys(tcur, level);
+               if (error)
+                       goto error1;
+
+               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
        }
 
-       /* Update the parent key values of right. */
-       error = xfs_btree_updkey(cur, rkp, level + 1);
+       /* Update the parent keys of the right block. */
+       error = xfs_btree_update_keys(cur, level);
        if (error)
                goto error0;
 
@@ -2169,6 +2436,11 @@ out0:
 error0:
        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
        return error;
+
+error1:
+       XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+       return error;
 }
 
 /*
@@ -2181,7 +2453,6 @@ xfs_btree_rshift(
        int                     level,
        int                     *stat)          /* success/failure */
 {
-       union xfs_btree_key     key;            /* btree key */
        struct xfs_buf          *lbp;           /* left buffer pointer */
        struct xfs_btree_block  *left;          /* left btree block */
        struct xfs_buf          *rbp;           /* right buffer pointer */
@@ -2290,12 +2561,6 @@ xfs_btree_rshift(
                /* Now put the new data in, and log it. */
                xfs_btree_copy_recs(cur, rrp, lrp, 1);
                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
-
-               cur->bc_ops->init_key_from_rec(&key, rrp);
-               rkp = &key;
-
-               ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
-                       xfs_btree_rec_addr(cur, 2, right)));
        }
 
        /*
@@ -2315,13 +2580,21 @@ xfs_btree_rshift(
        if (error)
                goto error0;
        i = xfs_btree_lastrec(tcur, level);
-       XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+       XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
 
        error = xfs_btree_increment(tcur, level, &i);
        if (error)
                goto error1;
 
-       error = xfs_btree_updkey(tcur, rkp, level + 1);
+       /* Update the parent high keys of the left block, if needed. */
+       if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+               error = xfs_btree_update_keys(cur, level);
+               if (error)
+                       goto error1;
+       }
+
+       /* Update the parent keys of the right block. */
+       error = xfs_btree_update_keys(tcur, level);
        if (error)
                goto error1;
 
@@ -2422,6 +2695,11 @@ __xfs_btree_split(
 
        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
 
+       /* Adjust numrecs for the later get_*_keys() calls. */
+       lrecs -= rrecs;
+       xfs_btree_set_numrecs(left, lrecs);
+       xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
        /*
         * Copy btree block entries from the left block over to the
         * new block, the right. Update the right block and log the
@@ -2447,14 +2725,15 @@ __xfs_btree_split(
                }
 #endif
 
+               /* Copy the keys & pointers to the new block. */
                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
 
                xfs_btree_log_keys(cur, rbp, 1, rrecs);
                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
 
-               /* Grab the keys to the entries moved to the right block */
-               xfs_btree_copy_keys(cur, key, rkp, 1);
+               /* Stash the keys of the new block for later insertion. */
+               xfs_btree_get_node_keys(cur, right, key);
        } else {
                /* It's a leaf.  Move records.  */
                union xfs_btree_rec     *lrp;   /* left record pointer */
@@ -2463,27 +2742,23 @@ __xfs_btree_split(
                lrp = xfs_btree_rec_addr(cur, src_index, left);
                rrp = xfs_btree_rec_addr(cur, 1, right);
 
+               /* Copy records to the new block. */
                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
                xfs_btree_log_recs(cur, rbp, 1, rrecs);
 
-               cur->bc_ops->init_key_from_rec(key,
-                       xfs_btree_rec_addr(cur, 1, right));
+               /* Stash the keys of the new block for later insertion. */
+               xfs_btree_get_leaf_keys(cur, right, key);
        }
 
-
        /*
         * Find the left block number by looking in the buffer.
-        * Adjust numrecs, sibling pointers.
+        * Adjust sibling pointers.
         */
        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
 
-       lrecs -= rrecs;
-       xfs_btree_set_numrecs(left, lrecs);
-       xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
-
        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
 
@@ -2499,6 +2774,14 @@ __xfs_btree_split(
                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
        }
+
+       /* Update the parent high keys of the left block, if needed. */
+       if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+               error = xfs_btree_update_keys(cur, level);
+               if (error)
+                       goto error0;
+       }
+
        /*
         * If the cursor is really in the right block, move it there.
         * If it's just pointing past the last entry in left, then we'll
@@ -2802,6 +3085,7 @@ xfs_btree_new_root(
                bp = lbp;
                nptr = 2;
        }
+
        /* Fill in the new block's btree header and log it. */
        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
@@ -2810,19 +3094,24 @@ xfs_btree_new_root(
 
        /* Fill in the key data in the new root. */
        if (xfs_btree_get_level(left) > 0) {
-               xfs_btree_copy_keys(cur,
-                               xfs_btree_key_addr(cur, 1, new),
-                               xfs_btree_key_addr(cur, 1, left), 1);
-               xfs_btree_copy_keys(cur,
-                               xfs_btree_key_addr(cur, 2, new),
-                               xfs_btree_key_addr(cur, 1, right), 1);
+               /*
+                * Get the keys for the left block's keys and put them directly
+                * in the parent block.  Do the same for the right block.
+                */
+               xfs_btree_get_node_keys(cur, left,
+                               xfs_btree_key_addr(cur, 1, new));
+               xfs_btree_get_node_keys(cur, right,
+                               xfs_btree_key_addr(cur, 2, new));
        } else {
-               cur->bc_ops->init_key_from_rec(
-                               xfs_btree_key_addr(cur, 1, new),
-                               xfs_btree_rec_addr(cur, 1, left));
-               cur->bc_ops->init_key_from_rec(
-                               xfs_btree_key_addr(cur, 2, new),
-                               xfs_btree_rec_addr(cur, 1, right));
+               /*
+                * Get the keys for the left block's records and put them
+                * directly in the parent block.  Do the same for the right
+                * block.
+                */
+               xfs_btree_get_leaf_keys(cur, left,
+                       xfs_btree_key_addr(cur, 1, new));
+               xfs_btree_get_leaf_keys(cur, right,
+                       xfs_btree_key_addr(cur, 2, new));
        }
        xfs_btree_log_keys(cur, nbp, 1, 2);
 
@@ -2858,10 +3147,9 @@ xfs_btree_make_block_unfull(
        int                     *index, /* new tree index */
        union xfs_btree_ptr     *nptr,  /* new btree ptr */
        struct xfs_btree_cur    **ncur, /* new btree cursor */
-       union xfs_btree_rec     *nrec,  /* new record */
+       union xfs_btree_key     *key,   /* key of new block */
        int                     *stat)
 {
-       union xfs_btree_key     key;    /* new btree key value */
        int                     error = 0;
 
        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
@@ -2871,6 +3159,7 @@ xfs_btree_make_block_unfull(
                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
                        /* A root block that can be made bigger. */
                        xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+                       *stat = 1;
                } else {
                        /* A root block that needs replacing */
                        int     logflags = 0;
@@ -2906,13 +3195,12 @@ xfs_btree_make_block_unfull(
         * If this works we have to re-set our variables because we
         * could be in a different block now.
         */
-       error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+       error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
        if (error || *stat == 0)
                return error;
 
 
        *index = cur->bc_ptrs[level];
-       cur->bc_ops->init_rec_from_key(&key, nrec);
        return 0;
 }
 
@@ -2925,16 +3213,17 @@ xfs_btree_insrec(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        int                     level,  /* level to insert record at */
        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
-       union xfs_btree_rec     *recp,  /* i/o: record data inserted */
+       union xfs_btree_rec     *rec,   /* record to insert */
+       union xfs_btree_key     *key,   /* i/o: block key for ptrp */
        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
        int                     *stat)  /* success/failure */
 {
        struct xfs_btree_block  *block; /* btree block */
        struct xfs_buf          *bp;    /* buffer for block */
-       union xfs_btree_key     key;    /* btree key */
        union xfs_btree_ptr     nptr;   /* new block ptr */
        struct xfs_btree_cur    *ncur;  /* new btree cursor */
-       union xfs_btree_rec     nrec;   /* new record count */
+       union xfs_btree_bigkey  nkey;   /* new block key */
+       union xfs_btree_key     *lkey;
        int                     optr;   /* old key/record index */
        int                     ptr;    /* key/record index */
        int                     numrecs;/* number of records */
@@ -2942,11 +3231,13 @@ xfs_btree_insrec(
 #ifdef DEBUG
        int                     i;
 #endif
+       xfs_daddr_t             old_bn;
 
        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-       XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+       XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
 
        ncur = NULL;
+       lkey = (union xfs_btree_key *)&nkey;
 
        /*
         * If we have an external root pointer, and we've made it to the
@@ -2969,15 +3260,13 @@ xfs_btree_insrec(
                return 0;
        }
 
-       /* Make a key out of the record data to be inserted, and save it. */
-       cur->bc_ops->init_key_from_rec(&key, recp);
-
        optr = ptr;
 
        XFS_BTREE_STATS_INC(cur, insrec);
 
        /* Get pointers to the btree buffer and block. */
        block = xfs_btree_get_block(cur, level, &bp);
+       old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
        numrecs = xfs_btree_get_numrecs(block);
 
 #ifdef DEBUG
@@ -2988,10 +3277,10 @@ xfs_btree_insrec(
        /* Check that the new entry is being inserted in the right place. */
        if (ptr <= numrecs) {
                if (level == 0) {
-                       ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+                       ASSERT(cur->bc_ops->recs_inorder(cur, rec,
                                xfs_btree_rec_addr(cur, ptr, block)));
                } else {
-                       ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+                       ASSERT(cur->bc_ops->keys_inorder(cur, key,
                                xfs_btree_key_addr(cur, ptr, block)));
                }
        }
@@ -3004,7 +3293,7 @@ xfs_btree_insrec(
        xfs_btree_set_ptr_null(cur, &nptr);
        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
                error = xfs_btree_make_block_unfull(cur, level, numrecs,
-                                       &optr, &ptr, &nptr, &ncur, &nrec, stat);
+                                       &optr, &ptr, &nptr, &ncur, lkey, stat);
                if (error || *stat == 0)
                        goto error0;
        }
@@ -3054,7 +3343,7 @@ xfs_btree_insrec(
 #endif
 
                /* Now put the new data in, bump numrecs and log it. */
-               xfs_btree_copy_keys(cur, kp, &key, 1);
+               xfs_btree_copy_keys(cur, kp, key, 1);
                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
                numrecs++;
                xfs_btree_set_numrecs(block, numrecs);
@@ -3075,7 +3364,7 @@ xfs_btree_insrec(
                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
 
                /* Now put the new data in, bump numrecs and log it. */
-               xfs_btree_copy_recs(cur, rp, recp, 1);
+               xfs_btree_copy_recs(cur, rp, rec, 1);
                xfs_btree_set_numrecs(block, ++numrecs);
                xfs_btree_log_recs(cur, bp, ptr, numrecs);
 #ifdef DEBUG
@@ -3089,9 +3378,18 @@ xfs_btree_insrec(
        /* Log the new number of records in the btree header. */
        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
 
-       /* If we inserted at the start of a block, update the parents' keys. */
-       if (optr == 1) {
-               error = xfs_btree_updkey(cur, &key, level + 1);
+       /*
+        * If we just inserted into a new tree block, we have to
+        * recalculate nkey here because nkey is out of date.
+        *
+        * Otherwise we're just updating an existing block (having shoved
+        * some records into the new tree block), so use the regular key
+        * update mechanism.
+        */
+       if (bp && bp->b_bn != old_bn) {
+               xfs_btree_get_keys(cur, block, lkey);
+       } else if (xfs_btree_needs_key_update(cur, optr)) {
+               error = xfs_btree_update_keys(cur, level);
                if (error)
                        goto error0;
        }
@@ -3101,7 +3399,7 @@ xfs_btree_insrec(
         * we are at the far right edge of the tree, update it.
         */
        if (xfs_btree_is_lastrec(cur, block, level)) {
-               cur->bc_ops->update_lastrec(cur, block, recp,
+               cur->bc_ops->update_lastrec(cur, block, rec,
                                            ptr, LASTREC_INSREC);
        }
 
@@ -3111,7 +3409,7 @@ xfs_btree_insrec(
         */
        *ptrp = nptr;
        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
-               *recp = nrec;
+               xfs_btree_copy_keys(cur, key, lkey, 1);
                *curp = ncur;
        }
 
@@ -3142,14 +3440,20 @@ xfs_btree_insert(
        union xfs_btree_ptr     nptr;   /* new block number (split result) */
        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
+       union xfs_btree_bigkey  bkey;   /* key of block to insert */
+       union xfs_btree_key     *key;
        union xfs_btree_rec     rec;    /* record to insert */
 
        level = 0;
        ncur = NULL;
        pcur = cur;
+       key = (union xfs_btree_key *)&bkey;
 
        xfs_btree_set_ptr_null(cur, &nptr);
+
+       /* Make a key out of the record data to be inserted, and save it. */
        cur->bc_ops->init_rec_from_cur(cur, &rec);
+       cur->bc_ops->init_key_from_rec(key, &rec);
 
        /*
         * Loop going up the tree, starting at the leaf level.
@@ -3161,7 +3465,8 @@ xfs_btree_insert(
                 * Insert nrec/nptr into this level of the tree.
                 * Note if we fail, nptr will be null.
                 */
-               error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+               error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
+                               &ncur, &i);
                if (error) {
                        if (pcur != cur)
                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
@@ -3385,8 +3690,6 @@ xfs_btree_delrec(
        struct xfs_buf          *bp;            /* buffer for block */
        int                     error;          /* error return value */
        int                     i;              /* loop counter */
-       union xfs_btree_key     key;            /* storage for keyp */
-       union xfs_btree_key     *keyp = &key;   /* passed to the next level */
        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
        struct xfs_buf          *lbp;           /* left buffer pointer */
        struct xfs_btree_block  *left;          /* left btree block */
@@ -3457,13 +3760,6 @@ xfs_btree_delrec(
                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
                }
-
-               /*
-                * If it's the first record in the block, we'll need to pass a
-                * key up to the next level (updkey).
-                */
-               if (ptr == 1)
-                       keyp = xfs_btree_key_addr(cur, 1, block);
        } else {
                /* It's a leaf. operate on records */
                if (ptr < numrecs) {
@@ -3472,16 +3768,6 @@ xfs_btree_delrec(
                                -1, numrecs - ptr);
                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
                }
-
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               if (ptr == 1) {
-                       cur->bc_ops->init_key_from_rec(&key,
-                                       xfs_btree_rec_addr(cur, 1, block));
-                       keyp = &key;
-               }
        }
 
        /*
@@ -3548,8 +3834,8 @@ xfs_btree_delrec(
         * If we deleted the leftmost entry in the block, update the
         * key values above us in the tree.
         */
-       if (ptr == 1) {
-               error = xfs_btree_updkey(cur, keyp, level + 1);
+       if (xfs_btree_needs_key_update(cur, ptr)) {
+               error = xfs_btree_update_keys(cur, level);
                if (error)
                        goto error0;
        }
@@ -3878,6 +4164,16 @@ xfs_btree_delrec(
        if (level > 0)
                cur->bc_ptrs[level]--;
 
+       /*
+        * We combined blocks, so we have to update the parent keys if the
+        * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
+        * points to the old block so that the caller knows which record to
+        * delete.  Therefore, the caller must be savvy enough to call updkeys
+        * for us if we return stat == 2.  The other exit points from this
+        * function don't require deletions further up the tree, so they can
+        * call updkeys directly.
+        */
+
        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        /* Return value means the next level up has something to do. */
        *stat = 2;
@@ -3903,6 +4199,7 @@ xfs_btree_delete(
        int                     error;  /* error return value */
        int                     level;
        int                     i;
+       bool                    joined = false;
 
        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
 
@@ -3916,6 +4213,18 @@ xfs_btree_delete(
                error = xfs_btree_delrec(cur, level, &i);
                if (error)
                        goto error0;
+               if (i == 2)
+                       joined = true;
+       }
+
+       /*
+        * If we combined blocks as part of deleting the record, delrec won't
+        * have updated the parent high keys so we have to do that here.
+        */
+       if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+               error = xfs_btree_updkeys_force(cur, 0);
+               if (error)
+                       goto error0;
        }
 
        if (i == 0) {
@@ -3978,6 +4287,81 @@ xfs_btree_get_rec(
        return 0;
 }
 
+/* Run fn on one block, then load its right sibling; -ENOENT if none. */
+STATIC int
+xfs_btree_visit_block(
+       struct xfs_btree_cur            *cur,
+       int                             level,
+       xfs_btree_visit_blocks_fn       fn,
+       void                            *data)
+{
+       struct xfs_btree_block          *block;
+       struct xfs_buf                  *bp;
+       union xfs_btree_ptr             rptr;
+       int                             error;
+
+       /* do right sibling readahead */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+       block = xfs_btree_get_block(cur, level, &bp);
+
+       /* process the block; a nonzero return from fn is passed back as-is */
+       error = fn(cur, level, data);
+       if (error)
+               return error;
+
+       /* now read rh sibling block for next iteration, if there is one */
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &rptr))
+               return -ENOENT;
+
+       return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+
+/* Call fn on every block, top level first; -ENOENT from fn ends a level. */
+int
+xfs_btree_visit_blocks(
+       struct xfs_btree_cur            *cur,
+       xfs_btree_visit_blocks_fn       fn,
+       void                            *data)
+{
+       union xfs_btree_ptr             lptr;
+       int                             level;
+       struct xfs_btree_block          *block = NULL;
+       int                             error = 0;
+
+       cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+       /* for each level, highest level first */
+       for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+               /* grab the left hand block */
+               error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+               if (error)
+                       return error;
+
+               /* readahead the left most block for the next level down */
+               if (level > 0) {
+                       union xfs_btree_ptr     *ptr;
+
+                       ptr = xfs_btree_ptr_addr(cur, 1, block);
+                       xfs_btree_readahead_ptr(cur, ptr, 1);
+
+                       /* save for the next iteration of the loop */
+                       lptr = *ptr;
+               }
+
+               /* for each buffer in the level; -ENOENT ends the level */
+               do {
+                       error = xfs_btree_visit_block(cur, level, fn, data);
+               } while (!error);
+
+               if (error != -ENOENT)
+                       return error;
+       }
+
+       return 0;
+}
+
 /*
  * Change the owner of a btree.
  *
@@ -4002,26 +4386,27 @@ xfs_btree_get_rec(
  * just queue the modified buffer as delayed write buffer so the transaction
  * recovery completion writes the changes to disk.
  */
+struct xfs_btree_block_change_owner_info {
+       __uint64_t              new_owner;      /* value stored in bb_owner */
+       struct list_head        *buffer_list;   /* passed to delwri queue */
+};
+
 static int
 xfs_btree_block_change_owner(
        struct xfs_btree_cur    *cur,
        int                     level,
-       __uint64_t              new_owner,
-       struct list_head        *buffer_list)
+       void                    *data)
 {
+       struct xfs_btree_block_change_owner_info        *bbcoi = data;
        struct xfs_btree_block  *block;
        struct xfs_buf          *bp;
-       union xfs_btree_ptr     rptr;
-
-       /* do right sibling readahead */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
 
        /* modify the owner */
        block = xfs_btree_get_block(cur, level, &bp);
        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-               block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+               block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
        else
-               block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+               block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
 
        /*
         * If the block is a root block hosted in an inode, we might not have a
@@ -4035,19 +4420,14 @@ xfs_btree_block_change_owner(
                        xfs_trans_ordered_buf(cur->bc_tp, bp);
                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
                } else {
-                       xfs_buf_delwri_queue(bp, buffer_list);
+                       xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
                }
        } else {
                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
                ASSERT(level == cur->bc_nlevels - 1);
        }
 
-       /* now read rh sibling block for next iteration */
-       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
-       if (xfs_btree_ptr_is_null(cur, &rptr))
-               return -ENOENT;
-
-       return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+       return 0;
 }
 
 int
@@ -4056,43 +4436,13 @@ xfs_btree_change_owner(
        __uint64_t              new_owner,
        struct list_head        *buffer_list)
 {
-       union xfs_btree_ptr     lptr;
-       int                     level;
-       struct xfs_btree_block  *block = NULL;
-       int                     error = 0;
-
-       cur->bc_ops->init_ptr_from_cur(cur, &lptr);
-
-       /* for each level */
-       for (level = cur->bc_nlevels - 1; level >= 0; level--) {
-               /* grab the left hand block */
-               error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
-               if (error)
-                       return error;
-
-               /* readahead the left most block for the next level down */
-               if (level > 0) {
-                       union xfs_btree_ptr     *ptr;
-
-                       ptr = xfs_btree_ptr_addr(cur, 1, block);
-                       xfs_btree_readahead_ptr(cur, ptr, 1);
-
-                       /* save for the next iteration of the loop */
-                       lptr = *ptr;
-               }
-
-               /* for each buffer in the level */
-               do {
-                       error = xfs_btree_block_change_owner(cur, level,
-                                                            new_owner,
-                                                            buffer_list);
-               } while (!error);
+       struct xfs_btree_block_change_owner_info        bbcoi;
 
-               if (error != -ENOENT)
-                       return error;
-       }
+       bbcoi.new_owner = new_owner;
+       bbcoi.buffer_list = buffer_list;
 
-       return 0;
+       return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
+                       &bbcoi);
 }
 
 /**
@@ -4171,3 +4521,267 @@ xfs_btree_compute_maxlevels(
                maxblocks = (maxblocks + limits[1] - 1) / limits[1];
        return level;
 }
+
+/*
+ * Query a regular btree for all records overlapping [low_key, high_key].
+ * Start with a LE lookup at the cursor and pass each record to fn until a
+ * record's low key exceeds high_key.  Returns 0, an error, or fn's abort code.
+ */
+STATIC int
+xfs_btree_simple_query_range(
+       struct xfs_btree_cur            *cur,
+       union xfs_btree_key             *low_key,
+       union xfs_btree_key             *high_key,
+       xfs_btree_query_range_fn        fn,
+       void                            *priv)
+{
+       union xfs_btree_rec             *recp;
+       union xfs_btree_key             rec_key;
+       __int64_t                       diff;
+       int                             stat = 0;
+       bool                            firstrec = true;
+       int                             error;
+
+       ASSERT(cur->bc_ops->init_high_key_from_rec);
+       ASSERT(cur->bc_ops->diff_two_keys);
+
+       /*
+        * Find the leftmost record.  The btree cursor must be set
+        * to the low record used to generate low_key.
+        */
+       error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+       if (error)
+               goto out;
+
+       while (stat) {
+               /* Find the record. */
+               error = xfs_btree_get_rec(cur, &recp, &stat);
+               if (error || !stat)
+                       break;
+
+               /* Skip if high_key(rec) < low_key; only the LE hit can be. */
+               if (firstrec) {
+                       cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
+                       firstrec = false;
+                       diff = cur->bc_ops->diff_two_keys(cur, low_key,
+                                       &rec_key);
+                       if (diff > 0)
+                               goto advloop;
+               }
+
+               /* Stop if high_key < low_key(rec); must test the LOW key. */
+               cur->bc_ops->init_key_from_rec(&rec_key, recp);
+               diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
+               if (diff > 0)
+                       break;
+
+               /* Callback */
+               error = fn(cur, recp, priv);
+               if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+                       break;
+
+advloop:
+               /* Move on to the next record. */
+               error = xfs_btree_increment(cur, 0, &stat);
+               if (error)
+                       break;
+       }
+
+out:
+       return error;
+}
+
+/*
+ * Query an overlapped interval btree for all records overlapping a given
+ * interval.  This function roughly follows the algorithm given in
+ * "Interval Trees" of _Introduction to Algorithms_, which is section
+ * 14.3 in the 2nd and 3rd editions.
+ *
+ * The caller passes in the low and high keys of the query interval.
+ *
+ * For any leaf node, generate the high and low keys for the record.
+ * If the record keys overlap with the query low/high keys, pass the
+ * record to the function iterator.
+ *
+ * For any internal node, compare the low and high keys of each
+ * pointer against the query low/high keys.  If there's an overlap,
+ * follow the pointer.
+ *
+ * As an optimization, we stop scanning a block when we find a low key
+ * that is greater than the query's high key.
+ */
+STATIC int
+xfs_btree_overlapped_query_range(
+       struct xfs_btree_cur            *cur,
+       union xfs_btree_key             *low_key,
+       union xfs_btree_key             *high_key,
+       xfs_btree_query_range_fn        fn,
+       void                            *priv)
+{
+       union xfs_btree_ptr             ptr;
+       union xfs_btree_ptr             *pp;
+       union xfs_btree_key             rec_key;
+       union xfs_btree_key             rec_hkey;
+       union xfs_btree_key             *lkp;
+       union xfs_btree_key             *hkp;
+       union xfs_btree_rec             *recp;
+       struct xfs_btree_block          *block;
+       __int64_t                       ldiff;
+       __int64_t                       hdiff;
+       int                             level;
+       struct xfs_buf                  *bp;
+       int                             i;
+       int                             error;
+
+       /* Load the root of the btree. */
+       level = cur->bc_nlevels - 1;
+       cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+       error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
+       if (error)
+               return error;
+       xfs_btree_get_block(cur, level, &bp);
+       trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto out;
+#endif
+       cur->bc_ptrs[level] = 1;
+
+       while (level < cur->bc_nlevels) {
+               block = xfs_btree_get_block(cur, level, &bp);
+
+               /* End of node, pop back towards the root. */
+               if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+pop_up:
+                       if (level < cur->bc_nlevels - 1)
+                               cur->bc_ptrs[level + 1]++;
+                       level++;
+                       continue;
+               }
+
+               if (level == 0) {
+                       /* Handle a leaf node. */
+                       recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+
+                       cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
+                       ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
+                                       low_key);
+
+                       cur->bc_ops->init_key_from_rec(&rec_key, recp);
+                       hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
+                                       &rec_key);
+
+                       /*
+                        * If (record's high key >= query's low key) and
+                        *    (query's high key >= record's low key), then
+                        * this record overlaps the query range; callback.
+                        */
+                       if (ldiff >= 0 && hdiff >= 0) {
+                               error = fn(cur, recp, priv);
+                               if (error < 0 ||
+                                   error == XFS_BTREE_QUERY_RANGE_ABORT)
+                                       break;
+                       } else if (hdiff < 0) {
+                               /* Record is larger than high key; pop. */
+                               goto pop_up;
+                       }
+                       cur->bc_ptrs[level]++;
+                       continue;
+               }
+
+               /* Handle an internal node. */
+               lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+               hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
+               pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+
+               ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
+               hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
+
+               /*
+                * If (pointer's high key >= query's low key) and
+                *    (query's high key >= pointer's low key), then
+                * this record overlaps the query range; follow pointer.
+                */
+               if (ldiff >= 0 && hdiff >= 0) {
+                       level--;
+                       error = xfs_btree_lookup_get_block(cur, level, pp,
+                                       &block);
+                       if (error)
+                               goto out;
+                       xfs_btree_get_block(cur, level, &bp);
+                       trace_xfs_btree_overlapped_query_range(cur, level, bp);
+#ifdef DEBUG
+                       error = xfs_btree_check_block(cur, block, level, bp);
+                       if (error)
+                               goto out;
+#endif
+                       cur->bc_ptrs[level] = 1;
+                       continue;
+               } else if (hdiff < 0) {
+                       /* The low key is larger than the upper range; pop. */
+                       goto pop_up;
+               }
+               cur->bc_ptrs[level]++;
+       }
+
+out:
+       /*
+        * If we don't end this function with the cursor pointing at a record
+        * block, a subsequent non-error cursor deletion will not release
+        * node-level buffers, causing a buffer leak.  This is quite possible
+        * with a zero-results range query, so release the buffers if we
+        * failed to return any results.
+        */
+       if (cur->bc_bufs[0] == NULL) {
+               for (i = 0; i < cur->bc_nlevels; i++) {
+                       if (cur->bc_bufs[i]) {
+                               xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+                               cur->bc_bufs[i] = NULL;
+                               cur->bc_ptrs[i] = 0;
+                               cur->bc_ra[i] = 0;
+                       }
+               }
+       }
+
+       return error;
+}
+
+/*
+ * Query a btree for all records overlapping a given interval of keys.  The
+ * supplied function will be called with each record found; it must return one
+ * of the XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or a negative error
+ * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
+ * negative error code.
+ */
+int
+xfs_btree_query_range(
+       struct xfs_btree_cur            *cur,
+       union xfs_btree_irec            *low_rec,
+       union xfs_btree_irec            *high_rec,
+       xfs_btree_query_range_fn        fn,
+       void                            *priv)
+{
+       union xfs_btree_rec             rec;
+       union xfs_btree_key             low_key;
+       union xfs_btree_key             high_key;
+
+       /* Find the keys of both ends of the interval. */
+       cur->bc_rec = *high_rec;
+       cur->bc_ops->init_rec_from_cur(cur, &rec);
+       cur->bc_ops->init_key_from_rec(&high_key, &rec);
+
+       cur->bc_rec = *low_rec;
+       cur->bc_ops->init_rec_from_cur(cur, &rec);
+       cur->bc_ops->init_key_from_rec(&low_key, &rec);
+
+       /* Enforce low key <= high key. */
+       if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
+               return -EINVAL;
+
+       if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+               return xfs_btree_simple_query_range(cur, &low_key,
+                               &high_key, fn, priv);
+       return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
+                       fn, priv);
+}
index 785a996821591c89e9a74cea413789225c7ba890..04d0865e5e6dc0b3f5c30ac9f8a31f243f65baa2 100644 (file)
@@ -19,7 +19,7 @@
 #define        __XFS_BTREE_H__
 
 struct xfs_buf;
-struct xfs_bmap_free;
+struct xfs_defer_ops;
 struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
@@ -38,17 +38,37 @@ union xfs_btree_ptr {
 };
 
 union xfs_btree_key {
-       xfs_bmbt_key_t          bmbt;
-       xfs_bmdr_key_t          bmbr;   /* bmbt root block */
-       xfs_alloc_key_t         alloc;
-       xfs_inobt_key_t         inobt;
+       struct xfs_bmbt_key             bmbt;
+       xfs_bmdr_key_t                  bmbr;   /* bmbt root block */
+       xfs_alloc_key_t                 alloc;
+       struct xfs_inobt_key            inobt;
+       struct xfs_rmap_key             rmap;
+};
+
+/*
+ * In-core key that holds both low and high keys for overlapped btrees.
+ * The two keys are packed next to each other on disk, so do the same
+ * in memory.  Preserve the existing xfs_btree_key as a single key to
+ * avoid the mental model breakage that would happen if we passed a
+ * bigkey into a function that operates on a single key.
+ */
+union xfs_btree_bigkey {
+       struct xfs_bmbt_key             bmbt;
+       xfs_bmdr_key_t                  bmbr;   /* bmbt root block */
+       xfs_alloc_key_t                 alloc;
+       struct xfs_inobt_key            inobt;
+       struct {
+               struct xfs_rmap_key     rmap;
+               struct xfs_rmap_key     rmap_hi;
+       };
 };
 
 union xfs_btree_rec {
-       xfs_bmbt_rec_t          bmbt;
-       xfs_bmdr_rec_t          bmbr;   /* bmbt root block */
-       xfs_alloc_rec_t         alloc;
-       xfs_inobt_rec_t         inobt;
+       struct xfs_bmbt_rec             bmbt;
+       xfs_bmdr_rec_t                  bmbr;   /* bmbt root block */
+       struct xfs_alloc_rec            alloc;
+       struct xfs_inobt_rec            inobt;
+       struct xfs_rmap_rec             rmap;
 };
 
 /*
@@ -63,6 +83,7 @@ union xfs_btree_rec {
 #define        XFS_BTNUM_BMAP  ((xfs_btnum_t)XFS_BTNUM_BMAPi)
 #define        XFS_BTNUM_INO   ((xfs_btnum_t)XFS_BTNUM_INOi)
 #define        XFS_BTNUM_FINO  ((xfs_btnum_t)XFS_BTNUM_FINOi)
+#define        XFS_BTNUM_RMAP  ((xfs_btnum_t)XFS_BTNUM_RMAPi)
 
 /*
  * For logging record fields.
@@ -95,6 +116,7 @@ do {    \
        case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
        case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
        case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
+       case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
        case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;       \
        }       \
 } while (0)
@@ -115,11 +137,13 @@ do {    \
                __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
        case XFS_BTNUM_FINO:    \
                __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+       case XFS_BTNUM_RMAP:    \
+               __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
        case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
        }       \
 } while (0)
 
-#define        XFS_BTREE_MAXLEVELS     8       /* max of all btrees */
+#define        XFS_BTREE_MAXLEVELS     9       /* max of all btrees */
 
 struct xfs_btree_ops {
        /* size of the key and record structures */
@@ -158,17 +182,25 @@ struct xfs_btree_ops {
        /* init values of btree structures */
        void    (*init_key_from_rec)(union xfs_btree_key *key,
                                     union xfs_btree_rec *rec);
-       void    (*init_rec_from_key)(union xfs_btree_key *key,
-                                    union xfs_btree_rec *rec);
        void    (*init_rec_from_cur)(struct xfs_btree_cur *cur,
                                     union xfs_btree_rec *rec);
        void    (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
                                     union xfs_btree_ptr *ptr);
+       void    (*init_high_key_from_rec)(union xfs_btree_key *key,
+                                         union xfs_btree_rec *rec);
 
        /* difference between key value and cursor value */
        __int64_t (*key_diff)(struct xfs_btree_cur *cur,
                              union xfs_btree_key *key);
 
+       /*
+        * Difference between key1 and key2 -- positive if key1 > key2,
+        * negative if key1 < key2, and zero if equal.
+        */
+       __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
+                                  union xfs_btree_key *key1,
+                                  union xfs_btree_key *key2);
+
        const struct xfs_buf_ops        *buf_ops;
 
 #if defined(DEBUG) || defined(XFS_WARN)
@@ -192,6 +224,13 @@ struct xfs_btree_ops {
 #define LASTREC_DELREC 2
 
 
+union xfs_btree_irec {
+       struct xfs_alloc_rec_incore     a;
+       struct xfs_bmbt_irec            b;
+       struct xfs_inobt_rec_incore     i;
+       struct xfs_rmap_irec            r;
+};
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -202,11 +241,7 @@ typedef struct xfs_btree_cur
        struct xfs_mount        *bc_mp; /* file system mount struct */
        const struct xfs_btree_ops *bc_ops;
        uint                    bc_flags; /* btree features - below */
-       union {
-               xfs_alloc_rec_incore_t  a;
-               xfs_bmbt_irec_t         b;
-               xfs_inobt_rec_incore_t  i;
-       }               bc_rec;         /* current insert/search record value */
+       union xfs_btree_irec    bc_rec; /* current insert/search record value */
        struct xfs_buf  *bc_bufs[XFS_BTREE_MAXLEVELS];  /* buf ptr per level */
        int             bc_ptrs[XFS_BTREE_MAXLEVELS];   /* key/record # */
        __uint8_t       bc_ra[XFS_BTREE_MAXLEVELS];     /* readahead bits */
@@ -218,11 +253,12 @@ typedef struct xfs_btree_cur
        union {
                struct {                        /* needed for BNO, CNT, INO */
                        struct xfs_buf  *agbp;  /* agf/agi buffer pointer */
+                       struct xfs_defer_ops *dfops;    /* deferred updates */
                        xfs_agnumber_t  agno;   /* ag number */
                } a;
                struct {                        /* needed for BMAP */
                        struct xfs_inode *ip;   /* pointer to our inode */
-                       struct xfs_bmap_free *flist;    /* list to free after */
+                       struct xfs_defer_ops *dfops;    /* deferred updates */
                        xfs_fsblock_t   firstblock;     /* 1st blk allocated */
                        int             allocated;      /* count of alloced */
                        short           forksize;       /* fork's inode space */
@@ -238,6 +274,7 @@ typedef struct xfs_btree_cur
 #define XFS_BTREE_ROOT_IN_INODE                (1<<1)  /* root may be variable size */
 #define XFS_BTREE_LASTREC_UPDATE       (1<<2)  /* track last rec externally */
 #define XFS_BTREE_CRC_BLOCKS           (1<<3)  /* uses extended btree blocks */
+#define XFS_BTREE_OVERLAPPING          (1<<4)  /* overlapping intervals */
 
 
 #define        XFS_BTREE_NOERROR       0
@@ -477,4 +514,19 @@ bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
 uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
                                 unsigned long len);
 
+/* return codes */
+#define XFS_BTREE_QUERY_RANGE_CONTINUE 0       /* keep iterating */
+#define XFS_BTREE_QUERY_RANGE_ABORT    1       /* stop iterating */
+typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
+               union xfs_btree_rec *rec, void *priv);
+
+int xfs_btree_query_range(struct xfs_btree_cur *cur,
+               union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
+               xfs_btree_query_range_fn fn, void *priv);
+
+typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
+               void *data);
+int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
+               xfs_btree_visit_blocks_fn fn, void *data);
+
 #endif /* __XFS_BTREE_H__ */
index 0f1f165f404864dc31e4a43b56124610ab6b57cc..f2dc1a950c85c691aac4dd00d3c0c640fc8f1543 100644 (file)
@@ -2029,7 +2029,7 @@ xfs_da_grow_inode_int(
        error = xfs_bmapi_write(tp, dp, *bno, count,
                        xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
                        args->firstblock, args->total, &map, &nmap,
-                       args->flist);
+                       args->dfops);
        if (error)
                return error;
 
@@ -2052,7 +2052,7 @@ xfs_da_grow_inode_int(
                        error = xfs_bmapi_write(tp, dp, b, c,
                                        xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
                                        args->firstblock, args->total,
-                                       &mapp[mapi], &nmap, args->flist);
+                                       &mapp[mapi], &nmap, args->dfops);
                        if (error)
                                goto out_free_map;
                        if (nmap < 1)
@@ -2362,7 +2362,7 @@ xfs_da_shrink_inode(
                 */
                error = xfs_bunmapi(tp, dp, dead_blkno, count,
                                    xfs_bmapi_aflag(w), 0, args->firstblock,
-                                   args->flist, &done);
+                                   args->dfops, &done);
                if (error == -ENOSPC) {
                        if (w != XFS_DATA_FORK)
                                break;
index 6e153e399a7759ea5d73270c7e63645221f7026b..98c75cbe6ac2ec6063bf381cead3cea002a87dff 100644 (file)
@@ -19,7 +19,7 @@
 #ifndef __XFS_DA_BTREE_H__
 #define        __XFS_DA_BTREE_H__
 
-struct xfs_bmap_free;
+struct xfs_defer_ops;
 struct xfs_inode;
 struct xfs_trans;
 struct zone;
@@ -70,7 +70,7 @@ typedef struct xfs_da_args {
        xfs_ino_t       inumber;        /* input/output inode number */
        struct xfs_inode *dp;           /* directory inode to manipulate */
        xfs_fsblock_t   *firstblock;    /* ptr to firstblock for bmap calls */
-       struct xfs_bmap_free *flist;    /* ptr to freelist for bmap_finish */
+       struct xfs_defer_ops *dfops;    /* ptr to deferred ops for bmap calls */
        struct xfs_trans *trans;        /* current trans (changes over time) */
        xfs_extlen_t    total;          /* total blocks needed, for 1st bmap */
        int             whichfork;      /* data or attribute fork */
index 685f23b670568ea3df1f2944d70ac6e3ea2cc238..9a492a9e19bd0af9e52532ebb99ca7ec114b2f5d 100644 (file)
@@ -629,6 +629,7 @@ typedef struct xfs_attr_shortform {
        struct xfs_attr_sf_hdr {        /* constant-structure header block */
                __be16  totsize;        /* total bytes in shortform list */
                __u8    count;  /* count of active entries */
+               __u8    padding;
        } hdr;
        struct xfs_attr_sf_entry {
                __uint8_t namelen;      /* actual length of name (no NULL) */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
new file mode 100644 (file)
index 0000000..054a203
--- /dev/null
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+
+/*
+ * Deferred Operations in XFS
+ *
+ * Due to the way locking rules work in XFS, certain transactions (block
+ * mapping and unmapping, typically) have permanent reservations so that
+ * we can roll the transaction to adhere to AG locking order rules and
+ * to unlock buffers between metadata updates.  Prior to rmap/reflink,
+ * the mapping code had a mechanism to perform these deferrals for
+ * extents that were going to be freed; this code makes that facility
+ * more generic.
+ *
+ * When adding the reverse mapping and reflink features, it became
+ * necessary to perform complex remapping multi-transactions to comply
+ * with AG locking order rules, and to be able to spread a single
+ * refcount update operation (an operation on an n-block extent can
+ * update as many as n records!) among multiple transactions.  XFS can
+ * roll a transaction to facilitate this, but using this facility
+ * requires us to log "intent" items in case log recovery needs to
+ * redo the operation, and to log "done" items to indicate that redo
+ * is not necessary.
+ *
+ * Deferred work is tracked in xfs_defer_pending items.  Each pending
+ * item tracks one type of deferred work.  Incoming work items (which
+ * have not yet had an intent logged) are attached to a pending item
+ * on the dop_intake list, where they wait for the caller to finish
+ * the deferred operations.
+ *
+ * Finishing a set of deferred operations is an involved process.  To
+ * start, we define "rolling a deferred-op transaction" as follows:
+ *
+ * > For each xfs_defer_pending item on the dop_intake list,
+ *   - Sort the work items in AG order.  XFS locking
+ *     order rules require us to lock buffers in AG order.
+ *   - Create a log intent item for that type.
+ *   - Attach it to the pending item.
+ *   - Move the pending item from the dop_intake list to the
+ *     dop_pending list.
+ * > Roll the transaction.
+ *
+ * NOTE: To avoid exceeding the transaction reservation, we limit the
+ * number of items that we attach to a given xfs_defer_pending.
+ *
+ * The actual finishing process looks like this:
+ *
+ * > For each xfs_defer_pending in the dop_pending list,
+ *   - Roll the deferred-op transaction as above.
+ *   - Create a log done item for that type, and attach it to the
+ *     log intent item.
+ *   - For each work item attached to the log intent item,
+ *     * Perform the described action.
+ *     * Attach the work item to the log done item.
+ *
+ * The key here is that we must log an intent item for all pending
+ * work items every time we roll the transaction, and that we must log
+ * a done item as soon as the work is completed.  With this mechanism
+ * we can perform complex remapping operations, chaining intent items
+ * as needed.
+ *
+ * This is an example of remapping the extent (E, E+B) into file X at
+ * offset A and dealing with the extent (C, C+B) already being mapped
+ * there:
+ * +-------------------------------------------------+
+ * | Unmap file X startblock C offset A length B     | t0
+ * | Intent to reduce refcount for extent (C, B)     |
+ * | Intent to remove rmap (X, C, A, B)              |
+ * | Intent to free extent (D, 1) (bmbt block)       |
+ * | Intent to map (X, A, B) at startblock E         |
+ * +-------------------------------------------------+
+ * | Map file X startblock E offset A length B       | t1
+ * | Done mapping (X, E, A, B)                       |
+ * | Intent to increase refcount for extent (E, B)   |
+ * | Intent to add rmap (X, E, A, B)                 |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C, B)               | t2
+ * | Done reducing refcount for extent (C, B)        |
+ * | Increase refcount for extent (E, B)             |
+ * | Done increasing refcount for extent (E, B)      |
+ * | Intent to free extent (C, B)                    |
+ * | Intent to free extent (F, 1) (refcountbt block) |
+ * | Intent to remove rmap (F, 1, REFC)              |
+ * +-------------------------------------------------+
+ * | Remove rmap (X, C, A, B)                        | t3
+ * | Done removing rmap (X, C, A, B)                 |
+ * | Add rmap (X, E, A, B)                           |
+ * | Done adding rmap (X, E, A, B)                   |
+ * | Remove rmap (F, 1, REFC)                        |
+ * | Done removing rmap (F, 1, REFC)                 |
+ * +-------------------------------------------------+
+ * | Free extent (C, B)                              | t4
+ * | Done freeing extent (C, B)                      |
+ * | Free extent (D, 1)                              |
+ * | Done freeing extent (D, 1)                      |
+ * | Free extent (F, 1)                              |
+ * | Done freeing extent (F, 1)                      |
+ * +-------------------------------------------------+
+ *
+ * If we should crash before t2 commits, log recovery replays
+ * the following intent items:
+ *
+ * - Intent to reduce refcount for extent (C, B)
+ * - Intent to remove rmap (X, C, A, B)
+ * - Intent to free extent (D, 1) (bmbt block)
+ * - Intent to increase refcount for extent (E, B)
+ * - Intent to add rmap (X, E, A, B)
+ *
+ * In the process of recovering, it should also generate and take care
+ * of these intent items:
+ *
+ * - Intent to free extent (C, B)
+ * - Intent to free extent (F, 1) (refcountbt block)
+ * - Intent to remove rmap (F, 1, REFC)
+ */
+
+static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
+
+/*
+ * For each pending item in the intake list, log its intent item and the
+ * associated extents, then add the entire intake list to the end of
+ * the pending list.
+ */
+STATIC void
+xfs_defer_intake_work(
+       struct xfs_trans                *tp,
+       struct xfs_defer_ops            *dop)
+{
+       struct list_head                *li;
+       struct xfs_defer_pending        *dfp;
+
+       list_for_each_entry(dfp, &dop->dop_intake, dfp_list) {
+               trace_xfs_defer_intake_work(tp->t_mountp, dfp);
+               dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
+                               dfp->dfp_count);
+               list_sort(tp->t_mountp, &dfp->dfp_work,
+                               dfp->dfp_type->diff_items);
+               list_for_each(li, &dfp->dfp_work)
+                       dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
+       }
+
+       list_splice_tail_init(&dop->dop_intake, &dop->dop_pending);
+}
+
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+       struct xfs_trans                *tp,
+       struct xfs_defer_ops            *dop,
+       int                             error)
+{
+       struct xfs_defer_pending        *dfp;
+
+       trace_xfs_defer_trans_abort(tp->t_mountp, dop);
+       /*
+        * If the transaction was committed, drop the intent reference
+        * since we're bailing out of here. The other reference is
+        * dropped when the intent hits the AIL.  If the transaction
+        * was not committed, the intent is freed by the intent item
+        * unlock handler on abort.
+        */
+       if (!dop->dop_committed)
+               return;
+
+       /* Abort intent items. */
+       list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
+               trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+               if (dfp->dfp_committed)
+                       dfp->dfp_type->abort_intent(dfp->dfp_intent);
+       }
+
+       /* Shut down FS. */
+       xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ?
+                       SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR);
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+       struct xfs_trans                **tp,
+       struct xfs_defer_ops            *dop,
+       struct xfs_inode                *ip)
+{
+       int                             i;
+       int                             error;
+
+       /* Log all the joined inodes except the one we passed in. */
+       for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
+               if (dop->dop_inodes[i] == ip)
+                       continue;
+               xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
+       }
+
+       trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
+
+       /* Roll the transaction. */
+       error = xfs_trans_roll(tp, ip);
+       if (error) {
+               trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error);
+               xfs_defer_trans_abort(*tp, dop, error);
+               return error;
+       }
+       dop->dop_committed = true;
+
+       /* Rejoin the joined inodes except the one we passed in. */
+       for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
+               if (dop->dop_inodes[i] == ip)
+                       continue;
+               xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
+       }
+
+       return error;
+}
+
+/* Do we have any work items to finish? */
+bool
+xfs_defer_has_unfinished_work(
+       struct xfs_defer_ops            *dop)
+{
+       return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake);
+}
+
+/*
+ * Add this inode to the deferred op.  Each joined inode is relogged
+ * each time we roll the transaction, in addition to any inode passed
+ * to xfs_defer_finish().
+ */
+int
+xfs_defer_join(
+       struct xfs_defer_ops            *dop,
+       struct xfs_inode                *ip)
+{
+       int                             i;
+
+       for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) {
+               if (dop->dop_inodes[i] == ip)
+                       return 0;
+               else if (dop->dop_inodes[i] == NULL) {
+                       dop->dop_inodes[i] = ip;
+                       return 0;
+               }
+       }
+
+       return -EFSCORRUPTED;
+}
+
+/*
+ * Finish all the pending work.  This involves logging intent items for
+ * any work items that wandered in since the last transaction roll (if
+ * one has even happened), rolling the transaction, and finishing the
+ * work items in the first item on the logged-and-pending list.
+ *
+ * If an inode is provided, relog it to the new transaction.
+ */
+int
+xfs_defer_finish(
+       struct xfs_trans                **tp,
+       struct xfs_defer_ops            *dop,
+       struct xfs_inode                *ip)
+{
+       struct xfs_defer_pending        *dfp;
+       struct list_head                *li;
+       struct list_head                *n;
+       void                            *done_item = NULL;
+       void                            *state;
+       int                             error = 0;
+       void                            (*cleanup_fn)(struct xfs_trans *, void *, int);
+
+       ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+       trace_xfs_defer_finish((*tp)->t_mountp, dop);
+
+       /* Until we run out of pending work to finish... */
+       while (xfs_defer_has_unfinished_work(dop)) {
+               /* Log intents for work items sitting in the intake. */
+               xfs_defer_intake_work(*tp, dop);
+
+               /* Roll the transaction. */
+               error = xfs_defer_trans_roll(tp, dop, ip);
+               if (error)
+                       goto out;
+
+               /* Mark all pending intents as committed. */
+               list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
+                       if (dfp->dfp_committed)
+                               break;
+                       trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
+                       dfp->dfp_committed = true;
+               }
+
+               /* Log an intent-done item for the first pending item. */
+               dfp = list_first_entry(&dop->dop_pending,
+                               struct xfs_defer_pending, dfp_list);
+               trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
+               done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+                               dfp->dfp_count);
+               cleanup_fn = dfp->dfp_type->finish_cleanup;
+
+               /* Finish the work items. */
+               state = NULL;
+               list_for_each_safe(li, n, &dfp->dfp_work) {
+                       list_del(li);
+                       dfp->dfp_count--;
+                       error = dfp->dfp_type->finish_item(*tp, dop, li,
+                                       done_item, &state);
+                       if (error) {
+                               /*
+                                * Clean up after ourselves and jump out.
+                                * xfs_defer_cancel will take care of freeing
+                                * all these lists and stuff.
+                                */
+                               if (cleanup_fn)
+                                       cleanup_fn(*tp, state, error);
+                               xfs_defer_trans_abort(*tp, dop, error);
+                               goto out;
+                       }
+               }
+               /* Done with the dfp, free it. */
+               list_del(&dfp->dfp_list);
+               kmem_free(dfp);
+
+               if (cleanup_fn)
+                       cleanup_fn(*tp, state, error);
+       }
+
+out:
+       if (error)
+               trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
+       else
+               trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
+       return error;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_defer_cancel(
+       struct xfs_defer_ops            *dop)
+{
+       struct xfs_defer_pending        *dfp;
+       struct xfs_defer_pending        *pli;
+       struct list_head                *pwi;
+       struct list_head                *n;
+
+       trace_xfs_defer_cancel(NULL, dop);
+
+       /*
+        * Free the pending items.  Caller should already have arranged
+        * for the intent items to be released.
+        */
+       list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) {
+               trace_xfs_defer_intake_cancel(NULL, dfp);
+               list_del(&dfp->dfp_list);
+               list_for_each_safe(pwi, n, &dfp->dfp_work) {
+                       list_del(pwi);
+                       dfp->dfp_count--;
+                       dfp->dfp_type->cancel_item(pwi);
+               }
+               ASSERT(dfp->dfp_count == 0);
+               kmem_free(dfp);
+       }
+       list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) {
+               trace_xfs_defer_pending_cancel(NULL, dfp);
+               list_del(&dfp->dfp_list);
+               list_for_each_safe(pwi, n, &dfp->dfp_work) {
+                       list_del(pwi);
+                       dfp->dfp_count--;
+                       dfp->dfp_type->cancel_item(pwi);
+               }
+               ASSERT(dfp->dfp_count == 0);
+               kmem_free(dfp);
+       }
+}
+
+/* Add an item for later deferred processing. */
+void
+xfs_defer_add(
+       struct xfs_defer_ops            *dop,
+       enum xfs_defer_ops_type         type,
+       struct list_head                *li)
+{
+       struct xfs_defer_pending        *dfp = NULL;
+
+       /*
+        * Add the item to a pending item at the end of the intake list.
+        * If the last pending item has the same type, reuse it.  Else,
+        * create a new pending item at the end of the intake list.
+        */
+       if (!list_empty(&dop->dop_intake)) {
+               dfp = list_last_entry(&dop->dop_intake,
+                               struct xfs_defer_pending, dfp_list);
+               if (dfp->dfp_type->type != type ||
+                   (dfp->dfp_type->max_items &&
+                    dfp->dfp_count >= dfp->dfp_type->max_items))
+                       dfp = NULL;
+       }
+       if (!dfp) {
+               dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
+                               KM_SLEEP | KM_NOFS);
+               dfp->dfp_type = defer_op_types[type];
+               dfp->dfp_committed = false;
+               dfp->dfp_intent = NULL;
+               dfp->dfp_count = 0;
+               INIT_LIST_HEAD(&dfp->dfp_work);
+               list_add_tail(&dfp->dfp_list, &dop->dop_intake);
+       }
+
+       list_add_tail(li, &dfp->dfp_work);
+       dfp->dfp_count++;
+}
+
+/* Initialize a deferred operation list. */
+void
+xfs_defer_init_op_type(
+       const struct xfs_defer_op_type  *type)
+{
+       defer_op_types[type->type] = type;
+}
+
+/* Initialize a deferred operation. */
+void
+xfs_defer_init(
+       struct xfs_defer_ops            *dop,
+       xfs_fsblock_t                   *fbp)
+{
+       dop->dop_committed = false;
+       dop->dop_low = false;
+       memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
+       *fbp = NULLFSBLOCK;
+       INIT_LIST_HEAD(&dop->dop_intake);
+       INIT_LIST_HEAD(&dop->dop_pending);
+       trace_xfs_defer_init(NULL, dop);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
new file mode 100644 (file)
index 0000000..cc3981c
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_DEFER_H__
+#define        __XFS_DEFER_H__
+
+struct xfs_defer_op_type;
+
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.  Each pending item holds work items of a single dfp_type only;
+ * dfp_count tracks how many work items are queued on dfp_work.
+ */
+struct xfs_defer_pending {
+       const struct xfs_defer_op_type  *dfp_type;      /* function pointers */
+       struct list_head                dfp_list;       /* link in dop list */
+       bool                            dfp_committed;  /* committed trans? */
+       void                            *dfp_intent;    /* log intent item */
+       struct list_head                dfp_work;       /* work items */
+       unsigned int                    dfp_count;      /* # dfp_work items */
+};
+
+/* Types of deferred work; the value indexes the registered op type table. */
+enum xfs_defer_ops_type {
+       XFS_DEFER_OPS_TYPE_RMAP,
+       XFS_DEFER_OPS_TYPE_FREE,
+       XFS_DEFER_OPS_TYPE_MAX,
+};
+
+#define XFS_DEFER_OPS_NR_INODES        2       /* join up to two inodes */
+
+/*
+ * Header for deferred operation list.
+ *
+ * dop_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent.  In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs.  In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0.  If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+struct xfs_defer_ops {
+       bool                    dop_committed;  /* did any trans commit? */
+       bool                    dop_low;        /* alloc in low mode */
+       struct list_head        dop_intake;     /* unlogged pending work */
+       struct list_head        dop_pending;    /* logged pending work */
+
+       /* relog these inodes with each roll */
+       struct xfs_inode        *dop_inodes[XFS_DEFER_OPS_NR_INODES];
+};
+
+void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type,
+               struct list_head *h);
+int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop,
+               struct xfs_inode *ip);
+void xfs_defer_cancel(struct xfs_defer_ops *dop);
+void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp);
+bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop);
+int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip);
+
+/*
+ * Description of a deferred type.
+ *
+ * type selects the slot this description occupies in the op type table.
+ * max_items caps how many work items are batched into one pending item
+ * before a new pending item is started; zero means no cap.  The remaining
+ * members are the per-type callbacks.
+ */
+struct xfs_defer_op_type {
+       enum xfs_defer_ops_type type;
+       unsigned int            max_items;
+       void (*abort_intent)(void *);
+       void *(*create_done)(struct xfs_trans *, void *, unsigned int);
+       int (*finish_item)(struct xfs_trans *, struct xfs_defer_ops *,
+                       struct list_head *, void *, void **);
+       void (*finish_cleanup)(struct xfs_trans *, void *, int);
+       void (*cancel_item)(struct list_head *);
+       int (*diff_items)(void *, struct list_head *, struct list_head *);
+       void *(*create_intent)(struct xfs_trans *, uint);
+       void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+};
+
+void xfs_defer_init_op_type(const struct xfs_defer_op_type *type);
+
+#endif /* __XFS_DEFER_H__ */
index af0f9d171f8a012758d778a0bd105e51448e5cf3..20a96dd5af7eb6d4ebbaf07a1f6a1e4b10e7f981 100644 (file)
@@ -21,6 +21,7 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
@@ -259,7 +260,7 @@ xfs_dir_createname(
        struct xfs_name         *name,
        xfs_ino_t               inum,           /* new entry inode number */
        xfs_fsblock_t           *first,         /* bmap's firstblock */
-       xfs_bmap_free_t         *flist,         /* bmap's freeblock list */
+       struct xfs_defer_ops    *dfops,         /* bmap's freeblock list */
        xfs_extlen_t            total)          /* bmap's total block count */
 {
        struct xfs_da_args      *args;
@@ -286,7 +287,7 @@ xfs_dir_createname(
        args->inumber = inum;
        args->dp = dp;
        args->firstblock = first;
-       args->flist = flist;
+       args->dfops = dfops;
        args->total = total;
        args->whichfork = XFS_DATA_FORK;
        args->trans = tp;
@@ -436,7 +437,7 @@ xfs_dir_removename(
        struct xfs_name *name,
        xfs_ino_t       ino,
        xfs_fsblock_t   *first,         /* bmap's firstblock */
-       xfs_bmap_free_t *flist,         /* bmap's freeblock list */
+       struct xfs_defer_ops    *dfops,         /* bmap's freeblock list */
        xfs_extlen_t    total)          /* bmap's total block count */
 {
        struct xfs_da_args *args;
@@ -458,7 +459,7 @@ xfs_dir_removename(
        args->inumber = ino;
        args->dp = dp;
        args->firstblock = first;
-       args->flist = flist;
+       args->dfops = dfops;
        args->total = total;
        args->whichfork = XFS_DATA_FORK;
        args->trans = tp;
@@ -498,7 +499,7 @@ xfs_dir_replace(
        struct xfs_name *name,          /* name of entry to replace */
        xfs_ino_t       inum,           /* new inode number */
        xfs_fsblock_t   *first,         /* bmap's firstblock */
-       xfs_bmap_free_t *flist,         /* bmap's freeblock list */
+       struct xfs_defer_ops    *dfops,         /* bmap's freeblock list */
        xfs_extlen_t    total)          /* bmap's total block count */
 {
        struct xfs_da_args *args;
@@ -523,7 +524,7 @@ xfs_dir_replace(
        args->inumber = inum;
        args->dp = dp;
        args->firstblock = first;
-       args->flist = flist;
+       args->dfops = dfops;
        args->total = total;
        args->whichfork = XFS_DATA_FORK;
        args->trans = tp;
@@ -680,7 +681,7 @@ xfs_dir2_shrink_inode(
 
        /* Unmap the fsblock(s). */
        error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0,
-                           args->firstblock, args->flist, &done);
+                           args->firstblock, args->dfops, &done);
        if (error) {
                /*
                 * ENOSPC actually can happen if we're in a removename with no
index e55353651f5b8678c14b0c22cbe3da1faae29548..becc926c3e3d900db0a021dd091e46e828fa6f09 100644 (file)
@@ -18,7 +18,7 @@
 #ifndef __XFS_DIR2_H__
 #define __XFS_DIR2_H__
 
-struct xfs_bmap_free;
+struct xfs_defer_ops;
 struct xfs_da_args;
 struct xfs_inode;
 struct xfs_mount;
@@ -129,18 +129,18 @@ extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
 extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
                                struct xfs_name *name, xfs_ino_t inum,
                                xfs_fsblock_t *first,
-                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
+                               struct xfs_defer_ops *dfops, xfs_extlen_t tot);
 extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
                                struct xfs_name *name, xfs_ino_t *inum,
                                struct xfs_name *ci_name);
 extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
                                struct xfs_name *name, xfs_ino_t ino,
                                xfs_fsblock_t *first,
-                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
+                               struct xfs_defer_ops *dfops, xfs_extlen_t tot);
 extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
                                struct xfs_name *name, xfs_ino_t inum,
                                xfs_fsblock_t *first,
-                               struct xfs_bmap_free *flist, xfs_extlen_t tot);
+                               struct xfs_defer_ops *dfops, xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
                                struct xfs_name *name);
 
index adb204d40f2246c181051566e385463f077e15ca..f814d42c73b2fb7484dd76ad024f63891de155f0 100644 (file)
@@ -455,8 +455,10 @@ xfs_sb_has_compat_feature(
 }
 
 #define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)                /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)                /* reverse map btree */
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
-               (XFS_SB_FEAT_RO_COMPAT_FINOBT)
+               (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
+                XFS_SB_FEAT_RO_COMPAT_RMAPBT)
 #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN  ~XFS_SB_FEAT_RO_COMPAT_ALL
 static inline bool
 xfs_sb_has_ro_compat_feature(
@@ -538,6 +540,12 @@ static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
                (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
 }
 
+/* True if this is a v5 superblock with the rmapbt ro-compat feature bit set. */
+static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+               (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
+}
+
 /*
  * end of superblock version macros
  */
@@ -598,10 +606,10 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
 #define        XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
 
 /*
- * Btree number 0 is bno, 1 is cnt This value gives the size of the
+ * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
  * arrays below.
  */
-#define        XFS_BTNUM_AGF   ((int)XFS_BTNUM_CNTi + 1)
+#define        XFS_BTNUM_AGF   ((int)XFS_BTNUM_RMAPi + 1)
 
 /*
  * The second word of agf_levels in the first a.g. overlaps the EFS
@@ -618,12 +626,10 @@ typedef struct xfs_agf {
        __be32          agf_seqno;      /* sequence # starting from 0 */
        __be32          agf_length;     /* size in blocks of a.g. */
        /*
-        * Freespace information
+        * Freespace and rmap information
         */
        __be32          agf_roots[XFS_BTNUM_AGF];       /* root blocks */
-       __be32          agf_spare0;     /* spare field */
        __be32          agf_levels[XFS_BTNUM_AGF];      /* btree levels */
-       __be32          agf_spare1;     /* spare field */
 
        __be32          agf_flfirst;    /* first freelist block's index */
        __be32          agf_fllast;     /* last freelist block's index */
@@ -1308,17 +1314,118 @@ typedef __be32 xfs_inobt_ptr_t;
 #define        XFS_FIBT_BLOCK(mp)              ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
 
 /*
- * The first data block of an AG depends on whether the filesystem was formatted
- * with the finobt feature. If so, account for the finobt reserved root btree
- * block.
+ * Reverse mapping btree format definitions
+ *
+ * There is a btree for the reverse map per allocation group
+ */
+#define        XFS_RMAP_CRC_MAGIC      0x524d4233      /* 'RMB3' */
+
+/*
+ * Ownership info for an extent.  This is used to create reverse-mapping
+ * entries.
  */
-#define XFS_PREALLOC_BLOCKS(mp) \
+#define XFS_OWNER_INFO_ATTR_FORK       (1 << 0)
+#define XFS_OWNER_INFO_BMBT_BLOCK      (1 << 1)
+struct xfs_owner_info {
+       uint64_t                oi_owner;
+       xfs_fileoff_t           oi_offset;
+       unsigned int            oi_flags;
+};
+
+/*
+ * Special owner types.
+ *
+ * Seeing as we only support up to 8EB, we have the upper bit of the owner field
+ * to tell us we have a special owner value. We use these for static metadata
+ * allocated at mkfs/growfs time, as well as for freespace management metadata.
+ */
+#define XFS_RMAP_OWN_NULL      (-1ULL) /* No owner, for growfs */
+#define XFS_RMAP_OWN_UNKNOWN   (-2ULL) /* Unknown owner, for EFI recovery */
+#define XFS_RMAP_OWN_FS                (-3ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_LOG       (-4ULL) /* static fs metadata */
+#define XFS_RMAP_OWN_AG                (-5ULL) /* AG freespace btree blocks */
+#define XFS_RMAP_OWN_INOBT     (-6ULL) /* Inode btree blocks */
+#define XFS_RMAP_OWN_INODES    (-7ULL) /* Inode chunk */
+#define XFS_RMAP_OWN_MIN       (-8ULL) /* guard */
+
+#define XFS_RMAP_NON_INODE_OWNER(owner)        (!!((owner) & (1ULL << 63)))
+
+/*
+ * Data record structure
+ */
+struct xfs_rmap_rec {
+       __be32          rm_startblock;  /* extent start block */
+       __be32          rm_blockcount;  /* extent length */
+       __be64          rm_owner;       /* extent owner */
+       __be64          rm_offset;      /* offset within the owner */
+};
+
+/*
+ * rmap btree record
+ *  rm_offset:63 is the attribute fork flag
+ *  rm_offset:62 is the bmbt block flag
+ *  rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt)
+ *  rm_offset:54-60 aren't used and should be zero
+ *  rm_offset:0-53 is the block offset within the inode
+ */
+#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63)
+#define XFS_RMAP_OFF_BMBT_BLOCK        ((__uint64_t)1ULL << 62)
+#define XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61)
+
+#define XFS_RMAP_LEN_MAX       ((__uint32_t)~0U)
+#define XFS_RMAP_OFF_FLAGS     (XFS_RMAP_OFF_ATTR_FORK | \
+                                XFS_RMAP_OFF_BMBT_BLOCK | \
+                                XFS_RMAP_OFF_UNWRITTEN)
+#define XFS_RMAP_OFF_MASK      ((__uint64_t)0x3FFFFFFFFFFFFFULL)
+
+#define XFS_RMAP_OFF(off)              ((off) & XFS_RMAP_OFF_MASK)
+
+/* Test a single flag bit of an rm_offset value; each evaluates to 0 or 1. */
+#define XFS_RMAP_IS_BMBT_BLOCK(off)    (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK))
+#define XFS_RMAP_IS_ATTR_FORK(off)     (!!((off) & XFS_RMAP_OFF_ATTR_FORK))
+#define XFS_RMAP_IS_UNWRITTEN(off)     (!!((off) & XFS_RMAP_OFF_UNWRITTEN))
+
+#define RMAPBT_STARTBLOCK_BITLEN       32
+#define RMAPBT_BLOCKCOUNT_BITLEN       32
+#define RMAPBT_OWNER_BITLEN            64
+#define RMAPBT_ATTRFLAG_BITLEN         1
+#define RMAPBT_BMBTFLAG_BITLEN         1
+#define RMAPBT_EXNTFLAG_BITLEN         1
+#define RMAPBT_UNUSED_OFFSET_BITLEN    7
+#define RMAPBT_OFFSET_BITLEN           54
+
+#define XFS_RMAP_ATTR_FORK             (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK            (1 << 1)
+#define XFS_RMAP_UNWRITTEN             (1 << 2)
+#define XFS_RMAP_KEY_FLAGS             (XFS_RMAP_ATTR_FORK | \
+                                        XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS             (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+       xfs_agblock_t   rm_startblock;  /* extent start block */
+       xfs_extlen_t    rm_blockcount;  /* extent length */
+       __uint64_t      rm_owner;       /* extent owner */
+       __uint64_t      rm_offset;      /* offset within the owner */
+       unsigned int    rm_flags;       /* state flags */
+};
+
+/*
+ * Key structure
+ *
+ * We don't use the length for lookups
+ */
+struct xfs_rmap_key {
+       __be32          rm_startblock;  /* extent start block */
+       __be64          rm_owner;       /* extent owner */
+       __be64          rm_offset;      /* offset within the owner */
+} __attribute__((packed));
+
+/* btree pointer type */
+typedef __be32 xfs_rmap_ptr_t;
+
+/*
+ * Evaluates to the AG block immediately following XFS_FIBT_BLOCK(mp) when
+ * the superblock has the finobt feature, otherwise to the one immediately
+ * following XFS_IBT_BLOCK(mp).
+ * NOTE(review): presumably the fixed location of the rmap btree root -
+ * confirm against the mkfs/growfs callers.
+ */
+#define        XFS_RMAP_BLOCK(mp) \
        (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
         XFS_FIBT_BLOCK(mp) + 1 : \
         XFS_IBT_BLOCK(mp) + 1)
 
-
-
 /*
  * BMAP Btree format definitions
  *
index f5ec9c5ccae6ace2873c0f1c44db1816baf0847b..79455058b752588e7855afde61ad59ba4ce3186f 100644 (file)
@@ -206,6 +206,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_FTYPE      0x10000 /* inode directory types */
 #define XFS_FSOP_GEOM_FLAGS_FINOBT     0x20000 /* free inode btree */
 #define XFS_FSOP_GEOM_FLAGS_SPINODES   0x40000 /* sparse inode chunks  */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT     0x80000 /* Reverse mapping btree */
 
 /*
  * Minimum and maximum sizes need for growth checks.
index 4b1e408169a83de0c03825a8167bc8a678dd7124..51b4e0de1fdc424e13f039adf98ae2789a27ba74 100644 (file)
@@ -24,6 +24,7 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
@@ -39,6 +40,7 @@
 #include "xfs_icache.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_rmap.h"
 
 
 /*
@@ -614,6 +616,7 @@ xfs_ialloc_ag_alloc(
        args.tp = tp;
        args.mp = tp->t_mountp;
        args.fsbno = NULLFSBLOCK;
+       xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);
 
 #ifdef DEBUG
        /* randomly do sparse inode allocations */
@@ -1817,19 +1820,21 @@ xfs_difree_inode_chunk(
        struct xfs_mount                *mp,
        xfs_agnumber_t                  agno,
        struct xfs_inobt_rec_incore     *rec,
-       struct xfs_bmap_free            *flist)
+       struct xfs_defer_ops            *dfops)
 {
        xfs_agblock_t   sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
        int             startidx, endidx;
        int             nextbit;
        xfs_agblock_t   agbno;
        int             contigblk;
+       struct xfs_owner_info   oinfo;
        DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+       xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
 
        if (!xfs_inobt_issparse(rec->ir_holemask)) {
                /* not sparse, calculate extent info directly */
-               xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
-                                 mp->m_ialloc_blks);
+               xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, sagbno),
+                                 mp->m_ialloc_blks, &oinfo);
                return;
        }
 
@@ -1872,8 +1877,8 @@ xfs_difree_inode_chunk(
 
                ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
                ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
-               xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
-                                 contigblk);
+               xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, agbno),
+                                 contigblk, &oinfo);
 
                /* reset range to current bit and carry on... */
                startidx = endidx = nextbit;
@@ -1889,7 +1894,7 @@ xfs_difree_inobt(
        struct xfs_trans                *tp,
        struct xfs_buf                  *agbp,
        xfs_agino_t                     agino,
-       struct xfs_bmap_free            *flist,
+       struct xfs_defer_ops            *dfops,
        struct xfs_icluster             *xic,
        struct xfs_inobt_rec_incore     *orec)
 {
@@ -1976,7 +1981,7 @@ xfs_difree_inobt(
                        goto error0;
                }
 
-               xfs_difree_inode_chunk(mp, agno, &rec, flist);
+               xfs_difree_inode_chunk(mp, agno, &rec, dfops);
        } else {
                xic->deleted = 0;
 
@@ -2121,7 +2126,7 @@ int
 xfs_difree(
        struct xfs_trans        *tp,            /* transaction pointer */
        xfs_ino_t               inode,          /* inode to be freed */
-       struct xfs_bmap_free    *flist,         /* extents to free */
+       struct xfs_defer_ops    *dfops,         /* extents to free */
        struct xfs_icluster     *xic)   /* cluster info if deleted */
 {
        /* REFERENCED */
@@ -2173,7 +2178,7 @@ xfs_difree(
        /*
         * Fix up the inode allocation btree.
         */
-       error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
+       error = xfs_difree_inobt(mp, tp, agbp, agino, dfops, xic, &rec);
        if (error)
                goto error0;
 
index 6e450df2979bfc80a7983cff0dbd79124741aa19..0bb89669fc072fd6bdf5ee44ded2b345aaf5de0b 100644 (file)
@@ -95,7 +95,7 @@ int                                   /* error */
 xfs_difree(
        struct xfs_trans *tp,           /* transaction pointer */
        xfs_ino_t       inode,          /* inode to be freed */
-       struct xfs_bmap_free *flist,    /* extents to free */
+       struct xfs_defer_ops *dfops,    /* extents to free */
        struct xfs_icluster *ifree);    /* cluster info if deleted */
 
 /*
index 89c21d771e35edbc026eb7fe7cb373280774b162..31ca2208c03dfbd9ff7f1b384cac616d1e4b022b 100644 (file)
@@ -32,6 +32,7 @@
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_trans.h"
+#include "xfs_rmap.h"
 
 
 STATIC int
@@ -96,6 +97,7 @@ xfs_inobt_alloc_block(
        memset(&args, 0, sizeof(args));
        args.tp = cur->bc_tp;
        args.mp = cur->bc_mp;
+       xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT);
        args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
        args.minlen = 1;
        args.maxlen = 1;
@@ -125,8 +127,12 @@ xfs_inobt_free_block(
        struct xfs_btree_cur    *cur,
        struct xfs_buf          *bp)
 {
+       struct xfs_owner_info   oinfo;
+
+       xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
        return xfs_free_extent(cur->bc_tp,
-                       XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1);
+                       XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
+                       &oinfo);
 }
 
 STATIC int
@@ -145,14 +151,6 @@ xfs_inobt_init_key_from_rec(
        key->inobt.ir_startino = rec->inobt.ir_startino;
 }
 
-STATIC void
-xfs_inobt_init_rec_from_key(
-       union xfs_btree_key     *key,
-       union xfs_btree_rec     *rec)
-{
-       rec->inobt.ir_startino = key->inobt.ir_startino;
-}
-
 STATIC void
 xfs_inobt_init_rec_from_cur(
        struct xfs_btree_cur    *cur,
@@ -314,7 +312,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
        .get_minrecs            = xfs_inobt_get_minrecs,
        .get_maxrecs            = xfs_inobt_get_maxrecs,
        .init_key_from_rec      = xfs_inobt_init_key_from_rec,
-       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
        .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_inobt_init_ptr_from_cur,
        .key_diff               = xfs_inobt_key_diff,
@@ -336,7 +333,6 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
        .get_minrecs            = xfs_inobt_get_minrecs,
        .get_maxrecs            = xfs_inobt_get_maxrecs,
        .init_key_from_rec      = xfs_inobt_init_key_from_rec,
-       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
        .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_finobt_init_ptr_from_cur,
        .key_diff               = xfs_inobt_key_diff,
index 9d9559eb2835a33621e568392fab2c1074022da3..4b9769e23c834278eabe70ea429dacabd2d934f2 100644 (file)
@@ -22,6 +22,7 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_error.h"
 #include "xfs_cksum.h"
index e8f49c029ff05098ddc91eeeffe5ba7102eff77f..a6eed43fa7cd5d7898c7d15c8823bdd4874dec2b 100644 (file)
@@ -110,7 +110,9 @@ static inline uint xlog_get_cycle(char *ptr)
 #define XLOG_REG_TYPE_COMMIT           18
 #define XLOG_REG_TYPE_TRANSHDR         19
 #define XLOG_REG_TYPE_ICREATE          20
-#define XLOG_REG_TYPE_MAX              20
+#define XLOG_REG_TYPE_RUI_FORMAT       21
+#define XLOG_REG_TYPE_RUD_FORMAT       22
+#define XLOG_REG_TYPE_MAX              22
 
 /*
  * Flags to log operation header
@@ -227,6 +229,8 @@ typedef struct xfs_trans_header {
 #define        XFS_LI_DQUOT            0x123d
 #define        XFS_LI_QUOTAOFF         0x123e
 #define        XFS_LI_ICREATE          0x123f
+#define        XFS_LI_RUI              0x1240  /* rmap update intent */
+#define        XFS_LI_RUD              0x1241
 
 #define XFS_LI_TYPE_DESC \
        { XFS_LI_EFI,           "XFS_LI_EFI" }, \
@@ -236,7 +240,9 @@ typedef struct xfs_trans_header {
        { XFS_LI_BUF,           "XFS_LI_BUF" }, \
        { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
        { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }, \
-       { XFS_LI_ICREATE,       "XFS_LI_ICREATE" }
+       { XFS_LI_ICREATE,       "XFS_LI_ICREATE" }, \
+       { XFS_LI_RUI,           "XFS_LI_RUI" }, \
+       { XFS_LI_RUD,           "XFS_LI_RUD" }
 
 /*
  * Inode Log Item Format definitions.
@@ -603,6 +609,59 @@ typedef struct xfs_efd_log_format_64 {
        xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
 } xfs_efd_log_format_64_t;
 
+/*
+ * RUI/RUD (reverse mapping) log format definitions
+ */
+struct xfs_map_extent {
+       __uint64_t              me_owner;
+       __uint64_t              me_startblock;
+       __uint64_t              me_startoff;
+       __uint32_t              me_len;
+       __uint32_t              me_flags;
+};
+
+/* rmap me_flags: upper bits are flags, lower byte is type code */
+#define XFS_RMAP_EXTENT_MAP            1
+#define XFS_RMAP_EXTENT_UNMAP          3
+#define XFS_RMAP_EXTENT_CONVERT                5
+#define XFS_RMAP_EXTENT_ALLOC          7
+#define XFS_RMAP_EXTENT_FREE           8
+#define XFS_RMAP_EXTENT_TYPE_MASK      0xFF
+
+#define XFS_RMAP_EXTENT_ATTR_FORK      (1U << 31)
+#define XFS_RMAP_EXTENT_BMBT_BLOCK     (1U << 30)
+#define XFS_RMAP_EXTENT_UNWRITTEN      (1U << 29)
+
+#define XFS_RMAP_EXTENT_FLAGS          (XFS_RMAP_EXTENT_TYPE_MASK | \
+                                        XFS_RMAP_EXTENT_ATTR_FORK | \
+                                        XFS_RMAP_EXTENT_BMBT_BLOCK | \
+                                        XFS_RMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out an rui log item in the
+ * log.  The rui_extents field is a variable size array whose
+ * size is given by rui_nextents.
+ */
+struct xfs_rui_log_format {
+       __uint16_t              rui_type;       /* rui log item type */
+       __uint16_t              rui_size;       /* size of this item */
+       __uint32_t              rui_nextents;   /* # extents to rmap */
+       __uint64_t              rui_id;         /* rui identifier */
+       struct xfs_map_extent   rui_extents[1]; /* array of extents to rmap */
+};
+
+/*
+ * This is the structure used to lay out an rud log item in the
+ * log.  Unlike the rui it carries no extent array; it only records
+ * the id of the rui it corresponds to.
+ */
+struct xfs_rud_log_format {
+       __uint16_t              rud_type;       /* rud log item type */
+       __uint16_t              rud_size;       /* size of this item */
+       __uint32_t              __pad;
+       __uint64_t              rud_rui_id;     /* id of corresponding rui */
+};
+
 /*
  * Dquot Log format definitions.
  *
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
new file mode 100644 (file)
index 0000000..73d0540
--- /dev/null
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * Look up the first record less than or equal to
+ * [bno, len, owner, offset, flags] in the btree given by cur.
+ *
+ * The search key is stored in the cursor's in-core record before the
+ * cursor is handed to xfs_btree_lookup() with XFS_LOOKUP_LE.  The lookup's
+ * return value is passed back unchanged and *stat is filled in by the
+ * lookup; callers in this file treat *stat == 1 as "record found".
+ */
+int
+xfs_rmap_lookup_le(
+       struct xfs_btree_cur    *cur,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       uint64_t                owner,
+       uint64_t                offset,
+       unsigned int            flags,
+       int                     *stat)
+{
+       int                     error;
+
+       /* Load the search key into the cursor. */
+       cur->bc_rec.r.rm_flags = flags;
+       cur->bc_rec.r.rm_offset = offset;
+       cur->bc_rec.r.rm_owner = owner;
+       cur->bc_rec.r.rm_blockcount = len;
+       cur->bc_rec.r.rm_startblock = bno;
+
+       error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+       return error;
+}
+
+/*
+ * Look up the record exactly matching [bno, len, owner, offset, flags]
+ * in the btree given by cur.
+ *
+ * The search key is stored in the cursor's in-core record before the
+ * cursor is handed to xfs_btree_lookup() with XFS_LOOKUP_EQ.  The lookup's
+ * return value is passed back unchanged and *stat is filled in by the
+ * lookup; xfs_rmap_insert() expects *stat == 0 when no such record exists.
+ */
+int
+xfs_rmap_lookup_eq(
+       struct xfs_btree_cur    *cur,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       uint64_t                owner,
+       uint64_t                offset,
+       unsigned int            flags,
+       int                     *stat)
+{
+       int                     error;
+
+       /* Load the search key into the cursor. */
+       cur->bc_rec.r.rm_flags = flags;
+       cur->bc_rec.r.rm_offset = offset;
+       cur->bc_rec.r.rm_owner = owner;
+       cur->bc_rec.r.rm_blockcount = len;
+       cur->bc_rec.r.rm_startblock = bno;
+
+       error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+       return error;
+}
+
+/*
+ * Overwrite the record that cur currently points at with the contents of
+ * the in-core record irec.
+ *
+ * The in-core fields are converted to big-endian on-disk form; the on-disk
+ * offset word is produced by xfs_rmap_irec_offset_pack().  Returns 0 on
+ * success, otherwise the error from xfs_btree_update() (which is also
+ * reported through the update_error tracepoint).
+ */
+STATIC int
+xfs_rmap_update(
+       struct xfs_btree_cur    *cur,
+       struct xfs_rmap_irec    *irec)
+{
+       union xfs_btree_rec     disk_rec;
+       int                     error;
+
+       trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno,
+                       irec->rm_startblock, irec->rm_blockcount,
+                       irec->rm_owner, irec->rm_offset, irec->rm_flags);
+
+       /* Build the on-disk image of the record. */
+       disk_rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
+       disk_rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount);
+       disk_rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner);
+       disk_rec.rmap.rm_offset = cpu_to_be64(
+                       xfs_rmap_irec_offset_pack(irec));
+
+       error = xfs_btree_update(cur, &disk_rec);
+       if (!error)
+               return 0;
+
+       trace_xfs_rmap_update_error(cur->bc_mp,
+                       cur->bc_private.a.agno, error, _RET_IP_);
+       return error;
+}
+
+/*
+ * Insert the record [agbno, len, owner, offset, flags] into the rmap btree
+ * given by rcur.
+ *
+ * An exact-match lookup is done first and the record must not already be
+ * present; the insert itself must report that one record was added.  Either
+ * expectation failing is handled by XFS_WANT_CORRUPTED_GOTO.  Returns 0 on
+ * success or a nonzero error, which is also traced.
+ */
+int
+xfs_rmap_insert(
+       struct xfs_btree_cur    *rcur,
+       xfs_agblock_t           agbno,
+       xfs_extlen_t            len,
+       uint64_t                owner,
+       uint64_t                offset,
+       unsigned int            flags)
+{
+       int                     stat;
+       int                     error;
+
+       trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+                       len, owner, offset, flags);
+
+       /* The record must not be in the tree yet. */
+       error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags,
+                       &stat);
+       if (error)
+               goto out;
+       XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, stat == 0, out);
+
+       /* Describe the new record in the cursor and insert it. */
+       rcur->bc_rec.r.rm_flags = flags;
+       rcur->bc_rec.r.rm_offset = offset;
+       rcur->bc_rec.r.rm_owner = owner;
+       rcur->bc_rec.r.rm_blockcount = len;
+       rcur->bc_rec.r.rm_startblock = agbno;
+       error = xfs_btree_insert(rcur, &stat);
+       if (error)
+               goto out;
+       XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, stat == 1, out);
+out:
+       if (error)
+               trace_xfs_rmap_insert_error(rcur->bc_mp,
+                               rcur->bc_private.a.agno, error, _RET_IP_);
+       return error;
+}
+
+/*
+ * Convert the big-endian on-disk rmap record rec into the in-core record
+ * irec.  rm_flags starts out as zero and the on-disk offset word is then
+ * handed to xfs_rmap_irec_offset_unpack(), whose return value is returned.
+ */
+static int
+xfs_rmap_btrec_to_irec(
+       union xfs_btree_rec     *rec,
+       struct xfs_rmap_irec    *irec)
+{
+       uint64_t                disk_offset;
+
+       disk_offset = be64_to_cpu(rec->rmap.rm_offset);
+
+       irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
+       irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
+       irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
+       irec->rm_flags = 0;
+
+       return xfs_rmap_irec_offset_unpack(disk_offset, irec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ *
+ * Fetches the on-disk record under cur via xfs_btree_get_rec() and, when
+ * one is present (*stat nonzero), converts it into irec.  If the fetch
+ * fails its error is returned; if there is no record, 0 is returned and
+ * irec is left untouched.
+ */
+int
+xfs_rmap_get_rec(
+       struct xfs_btree_cur    *cur,
+       struct xfs_rmap_irec    *irec,
+       int                     *stat)
+{
+       union xfs_btree_rec     *disk_rec;
+       int                     error;
+
+       error = xfs_btree_get_rec(cur, &disk_rec, stat);
+       if (error)
+               return error;
+       if (*stat == 0)
+               return 0;
+
+       return xfs_rmap_btrec_to_irec(disk_rec, irec);
+}
+
+/*
+ * Find the extent in the rmap btree and remove it.
+ *
+ * The record we find should always be an exact match for the extent that we're
+ * looking for, since we insert them into the btree without modification.
+ *
+ * Special Case #1: when growing the filesystem, we "free" an extent when
+ * growing the last AG. This extent is new space and so it is not tracked as
+ * used space in the btree. The growfs code will pass in an owner of
+ * XFS_RMAP_OWN_NULL to indicate that it expected that there is no owner of this
+ * extent. We verify that - the extent lookup result in a record that does not
+ * overlap.
+ *
+ * Special Case #2: EFIs do not record the owner of the extent, so when
+ * recovering EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap
+ * btree to ignore the owner (i.e. wildcard match) so we don't trigger
+ * corruption checks during log recovery.
+ *
+ * cur is the rmap btree cursor to operate on, [bno, len] is the extent being
+ * removed, unwritten says whether the mapping is expected to carry the
+ * unwritten flag, and oinfo supplies the owner, offset and flags (unpacked
+ * with xfs_owner_info_unpack()).
+ *
+ * Depending on how [bno, len] sits inside the record that was found, the
+ * record is deleted (exact match), trimmed at its start or end, or split in
+ * two (removal from the middle).
+ *
+ * Returns 0 on success or a nonzero error, which is also reported through
+ * trace_xfs_rmap_unmap_error().
+ */
+STATIC int
+xfs_rmap_unmap(
+       struct xfs_btree_cur    *cur,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       bool                    unwritten,
+       struct xfs_owner_info   *oinfo)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       struct xfs_rmap_irec    ltrec;  /* record returned by the LE lookup */
+       uint64_t                ltoff;  /* ltrec.rm_offset as read from the tree */
+       int                     error = 0;
+       int                     i;
+       uint64_t                owner;
+       uint64_t                offset;
+       unsigned int            flags;
+       bool                    ignore_off;     /* offsets are neither checked nor adjusted */
+
+       xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+       ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+                       (flags & XFS_RMAP_BMBT_BLOCK);
+       if (unwritten)
+               flags |= XFS_RMAP_UNWRITTEN;
+       trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+                       unwritten, oinfo);
+
+       /*
+        * We should always have a left record because there's a static record
+        * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+        * will not ever be removed from the tree.
+        */
+       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+       if (error)
+               goto out_error;
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+
+       error = xfs_rmap_get_rec(cur, &ltrec, &i);
+       if (error)
+               goto out_error;
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+       trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+                       cur->bc_private.a.agno, ltrec.rm_startblock,
+                       ltrec.rm_blockcount, ltrec.rm_owner,
+                       ltrec.rm_offset, ltrec.rm_flags);
+       ltoff = ltrec.rm_offset;
+
+       /*
+        * For growfs, the incoming extent must be beyond the left record we
+        * just found as it is new space and won't be used by anyone. This is
+        * just a corruption check as we don't actually do anything with this
+        * extent.  Note that we need to use >= instead of > because it might
+        * be the case that the "left" extent goes all the way to EOFS.
+        */
+       if (owner == XFS_RMAP_OWN_NULL) {
+               XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock +
+                                               ltrec.rm_blockcount, out_error);
+               goto out_done;
+       }
+
+       /* Make sure the unwritten flag matches. */
+       XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+                       (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+
+       /* Make sure the extent we found covers the entire freeing range. */
+       XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
+               ltrec.rm_startblock + ltrec.rm_blockcount >=
+               bno + len, out_error);
+
+       /*
+        * Make sure the owner matches what we expect to find in the tree.
+        * Any non-inode owner passed in is accepted without comparison.
+        */
+       XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner ||
+                                   XFS_RMAP_NON_INODE_OWNER(owner), out_error);
+
+       /*
+        * Check the offset, if necessary: for inode owners, a bmbt block
+        * must match a bmbt record, and any other mapping must lie within
+        * the offset range covered by the record.
+        */
+       if (!XFS_RMAP_NON_INODE_OWNER(owner)) {
+               if (flags & XFS_RMAP_BMBT_BLOCK) {
+                       XFS_WANT_CORRUPTED_GOTO(mp,
+                                       ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK,
+                                       out_error);
+               } else {
+                       XFS_WANT_CORRUPTED_GOTO(mp,
+                                       ltrec.rm_offset <= offset, out_error);
+                       XFS_WANT_CORRUPTED_GOTO(mp,
+                                       ltoff + ltrec.rm_blockcount >= offset + len,
+                                       out_error);
+               }
+       }
+
+       if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+               /* exact match, simply remove the record from rmap tree */
+               trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+                               ltrec.rm_startblock, ltrec.rm_blockcount,
+                               ltrec.rm_owner, ltrec.rm_offset,
+                               ltrec.rm_flags);
+               error = xfs_btree_delete(cur, &i);
+               if (error)
+                       goto out_error;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+       } else if (ltrec.rm_startblock == bno) {
+               /*
+                * overlap left hand side of extent: move the start, trim the
+                * length and update the current record.
+                *
+                *       ltbno                ltlen
+                * Orig:    |oooooooooooooooooooo|
+                * Freeing: |fffffffff|
+                * Result:            |rrrrrrrrrr|
+                *         bno       len
+                *
+                * The record's offset moves forward with its start block
+                * unless offsets are being ignored.
+                */
+               ltrec.rm_startblock += len;
+               ltrec.rm_blockcount -= len;
+               if (!ignore_off)
+                       ltrec.rm_offset += len;
+               error = xfs_rmap_update(cur, &ltrec);
+               if (error)
+                       goto out_error;
+       } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+               /*
+                * overlap right hand side of extent: trim the length and update
+                * the current record.
+                *
+                *       ltbno                ltlen
+                * Orig:    |oooooooooooooooooooo|
+                * Freeing:            |fffffffff|
+                * Result:  |rrrrrrrrrr|
+                *                    bno       len
+                */
+               ltrec.rm_blockcount -= len;
+               error = xfs_rmap_update(cur, &ltrec);
+               if (error)
+                       goto out_error;
+       } else {
+
+               /*
+                * overlap middle of extent: trim the length of the existing
+                * record to the length of the new left-extent size, increment
+                * the insertion position so we can insert a new record
+                * containing the remaining right-extent space.
+                *
+                *       ltbno                ltlen
+                * Orig:    |oooooooooooooooooooo|
+                * Freeing:       |fffffffff|
+                * Result:  |rrrrr|         |rrrr|
+                *               bno       len
+                *
+                * The new right-hand record takes its owner from ltrec but
+                * its offset and flags from the caller's arguments.
+                *
+                * NOTE(review): unlike the other increment/insert/delete
+                * calls in this file, the stat value returned in i by
+                * xfs_btree_increment() and xfs_btree_insert() below is not
+                * checked.
+                */
+               xfs_extlen_t    orig_len = ltrec.rm_blockcount;
+
+               ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+               error = xfs_rmap_update(cur, &ltrec);
+               if (error)
+                       goto out_error;
+
+               error = xfs_btree_increment(cur, 0, &i);
+               if (error)
+                       goto out_error;
+
+               cur->bc_rec.r.rm_startblock = bno + len;
+               cur->bc_rec.r.rm_blockcount = orig_len - len -
+                                                    ltrec.rm_blockcount;
+               cur->bc_rec.r.rm_owner = ltrec.rm_owner;
+               if (ignore_off)
+                       cur->bc_rec.r.rm_offset = 0;
+               else
+                       cur->bc_rec.r.rm_offset = offset + len;
+               cur->bc_rec.r.rm_flags = flags;
+               trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+                               cur->bc_rec.r.rm_startblock,
+                               cur->bc_rec.r.rm_blockcount,
+                               cur->bc_rec.r.rm_owner,
+                               cur->bc_rec.r.rm_offset,
+                               cur->bc_rec.r.rm_flags);
+               error = xfs_btree_insert(cur, &i);
+               if (error)
+                       goto out_error;
+       }
+
+out_done:
+       trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+                       unwritten, oinfo);
+out_error:
+       if (error)
+               trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno,
+                               error, _RET_IP_);
+       return error;
+}
+
+/*
+ * Remove a reference to an extent in the rmap btree.
+ *
+ * Returns 0 without doing anything when the superblock does not have the
+ * rmap btree feature.  Otherwise a cursor is set up for the given AG,
+ * [bno, len] is unmapped as a written extent owned per oinfo, and the
+ * cursor is torn down in error or no-error mode to match the result,
+ * which is then returned.
+ */
+int
+xfs_rmap_free(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *agbp,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       struct xfs_owner_info   *oinfo)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_btree_cur    *rcur;
+       int                     error;
+
+       if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+               return 0;
+
+       rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+       error = xfs_rmap_unmap(rcur, bno, len, false, oinfo);
+
+       /* Release the cursor in the mode that matches the outcome. */
+       xfs_btree_del_cursor(rcur,
+                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       return error;
+}
+
+/*
+ * Decide whether the record irec may be merged with a mapping described by
+ * owner and flags.
+ *
+ * A mergeable rmap must not be owned by XFS_RMAP_OWN_NULL, must have the
+ * same owner, and must agree on the unwritten, attr_fork, and bmbt flags.
+ * The startblock and offset are checked separately by the callers.
+ */
+static bool
+xfs_rmap_is_mergeable(
+       struct xfs_rmap_irec    *irec,
+       uint64_t                owner,
+       unsigned int            flags)
+{
+       /* The flag bits that both sides have to agree on. */
+       const unsigned int      merge_mask = XFS_RMAP_UNWRITTEN |
+                                            XFS_RMAP_ATTR_FORK |
+                                            XFS_RMAP_BMBT_BLOCK;
+
+       if (irec->rm_owner == XFS_RMAP_OWN_NULL || irec->rm_owner != owner)
+               return false;
+
+       /* Any differing bit inside the mask blocks the merge. */
+       return ((flags ^ irec->rm_flags) & merge_mask) == 0;
+}
+
+/*
+ * When we allocate a new block, the first thing we do is add a reference to
+ * the extent in the rmap btree. This takes the form of a [agbno, length,
+ * owner, offset] record.  Flags are encoded in the high bits of the offset
+ * field.
+ *
+ * The new extent is merged into the left and/or right neighbouring record
+ * when that neighbour is physically adjacent, passes
+ * xfs_rmap_is_mergeable() and, unless offsets are being ignored, is also
+ * adjacent in the owner's offset space.  Otherwise a new record is inserted.
+ *
+ * cur is the rmap btree cursor to operate on, [bno, len] is the extent being
+ * added, unwritten selects the unwritten flag for the mapping, and oinfo
+ * supplies the owner, offset and flags (unpacked with
+ * xfs_owner_info_unpack()).
+ *
+ * Returns 0 on success or a nonzero error, which is also reported through
+ * trace_xfs_rmap_map_error().
+ */
+STATIC int
+xfs_rmap_map(
+       struct xfs_btree_cur    *cur,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       bool                    unwritten,
+       struct xfs_owner_info   *oinfo)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       struct xfs_rmap_irec    ltrec;
+       struct xfs_rmap_irec    gtrec;
+       int                     have_gt;
+       int                     have_lt;
+       int                     error = 0;
+       int                     i;
+       uint64_t                owner;
+       uint64_t                offset;
+       unsigned int            flags = 0;
+       bool                    ignore_off;
+
+       xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+       ASSERT(owner != 0);
+       /* Offsets are not compared for non-inode owners and bmbt blocks. */
+       ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
+                       (flags & XFS_RMAP_BMBT_BLOCK);
+       if (unwritten)
+               flags |= XFS_RMAP_UNWRITTEN;
+       trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+                       unwritten, oinfo);
+
+       /*
+        * For the initial lookup, look for an exact match or the left-adjacent
+        * record for our insertion point. This will also give us the record for
+        * start block contiguity tests.
+        */
+       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+                       &have_lt);
+       if (error)
+               goto out_error;
+       XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+
+       error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
+       if (error)
+               goto out_error;
+       XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+       trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+                       cur->bc_private.a.agno, ltrec.rm_startblock,
+                       ltrec.rm_blockcount, ltrec.rm_owner,
+                       ltrec.rm_offset, ltrec.rm_flags);
+
+       /* From here on have_lt means "left record is a merge candidate". */
+       if (!xfs_rmap_is_mergeable(&ltrec, owner, flags))
+               have_lt = 0;
+
+       XFS_WANT_CORRUPTED_GOTO(mp,
+               have_lt == 0 ||
+               ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error);
+
+       /*
+        * Increment the cursor to see if we have a right-adjacent record to our
+        * insertion point. This will give us the record for end block
+        * contiguity tests.
+        */
+       error = xfs_btree_increment(cur, 0, &have_gt);
+       if (error)
+               goto out_error;
+       if (have_gt) {
+               error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+               if (error)
+                       goto out_error;
+               XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+               XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock,
+                                       out_error);
+               trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+                       cur->bc_private.a.agno, gtrec.rm_startblock,
+                       gtrec.rm_blockcount, gtrec.rm_owner,
+                       gtrec.rm_offset, gtrec.rm_flags);
+               if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+                       have_gt = 0;
+       }
+
+       /*
+        * Note: cursor currently points one record to the right of ltrec, even
+        * if there is no record in the tree to the right.
+        */
+       if (have_lt &&
+           ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+           (ignore_off || ltrec.rm_offset + ltrec.rm_blockcount == offset)) {
+               /*
+                * left edge contiguous, merge into left record.
+                *
+                *       ltbno     ltlen
+                * orig:   |ooooooooo|
+                * adding:           |aaaaaaaaa|
+                * result: |rrrrrrrrrrrrrrrrrrr|
+                *                  bno       len
+                */
+               ltrec.rm_blockcount += len;
+
+               /*
+                * ltrec.rm_blockcount already includes len at this point, so
+                * the length of the fully merged record is simply left plus
+                * right.  The sum is done in 64 bits so that it cannot wrap
+                * where unsigned long is only 32 bits wide.
+                */
+               if (have_gt &&
+                   bno + len == gtrec.rm_startblock &&
+                   (ignore_off || offset + len == gtrec.rm_offset) &&
+                   (uint64_t)ltrec.rm_blockcount +
+                               gtrec.rm_blockcount <= XFS_RMAP_LEN_MAX) {
+                       /*
+                        * right edge also contiguous, delete right record
+                        * and merge into left record.
+                        *
+                        *       ltbno     ltlen    gtbno     gtlen
+                        * orig:   |ooooooooo|         |ooooooooo|
+                        * adding:           |aaaaaaaaa|
+                        * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+                        */
+                       ltrec.rm_blockcount += gtrec.rm_blockcount;
+                       trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+                                       gtrec.rm_startblock,
+                                       gtrec.rm_blockcount,
+                                       gtrec.rm_owner,
+                                       gtrec.rm_offset,
+                                       gtrec.rm_flags);
+                       error = xfs_btree_delete(cur, &i);
+                       if (error)
+                               goto out_error;
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+               }
+
+               /* point the cursor back to the left record and update */
+               error = xfs_btree_decrement(cur, 0, &have_gt);
+               if (error)
+                       goto out_error;
+               error = xfs_rmap_update(cur, &ltrec);
+               if (error)
+                       goto out_error;
+       } else if (have_gt &&
+                  bno + len == gtrec.rm_startblock &&
+                  (ignore_off || offset + len == gtrec.rm_offset)) {
+               /*
+                * right edge contiguous, merge into right record.
+                *
+                *                 gtbno     gtlen
+                * Orig:             |ooooooooo|
+                * adding: |aaaaaaaaa|
+                * Result: |rrrrrrrrrrrrrrrrrrr|
+                *        bno       len
+                */
+               gtrec.rm_startblock = bno;
+               gtrec.rm_blockcount += len;
+               if (!ignore_off)
+                       gtrec.rm_offset = offset;
+               error = xfs_rmap_update(cur, &gtrec);
+               if (error)
+                       goto out_error;
+       } else {
+               /*
+                * no contiguous edge with identical owner, insert
+                * new record at current cursor position.
+                */
+               cur->bc_rec.r.rm_startblock = bno;
+               cur->bc_rec.r.rm_blockcount = len;
+               cur->bc_rec.r.rm_owner = owner;
+               cur->bc_rec.r.rm_offset = offset;
+               cur->bc_rec.r.rm_flags = flags;
+               trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+                       owner, offset, flags);
+               error = xfs_btree_insert(cur, &i);
+               if (error)
+                       goto out_error;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+       }
+
+       trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+                       unwritten, oinfo);
+out_error:
+       if (error)
+               trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno,
+                               error, _RET_IP_);
+       return error;
+}
+
+/*
+ * Add a reference to an extent in the rmap btree.
+ *
+ * Returns 0 without doing anything when the superblock does not have the
+ * rmap btree feature.  Otherwise a cursor is set up for the given AG,
+ * [bno, len] is mapped as a written extent owned per oinfo, and the cursor
+ * is torn down in error or no-error mode to match the result, which is
+ * then returned.
+ */
+int
+xfs_rmap_alloc(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *agbp,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       struct xfs_owner_info   *oinfo)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_btree_cur    *rcur;
+       int                     error;
+
+       if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+               return 0;
+
+       rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+       error = xfs_rmap_map(rcur, bno, len, false, oinfo);
+
+       /* Release the cursor in the mode that matches the outcome. */
+       xfs_btree_del_cursor(rcur,
+                       error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       return error;
+}
+
+#define RMAP_LEFT_CONTIG       (1 << 0)        /* LEFT can merge with new */
+#define RMAP_RIGHT_CONTIG      (1 << 1)        /* RIGHT can merge with new */
+#define RMAP_LEFT_FILLING      (1 << 2)        /* new starts where PREV starts */
+#define RMAP_RIGHT_FILLING     (1 << 3)        /* new ends where PREV ends */
+#define RMAP_LEFT_VALID                (1 << 6)        /* a LEFT record was found */
+#define RMAP_RIGHT_VALID       (1 << 7)        /* a RIGHT record was found */
+
+#define LEFT           r[0]    /* left neighbor in xfs_rmap_convert's r[] */
+#define RIGHT          r[1]    /* right neighbor in xfs_rmap_convert's r[] */
+#define PREV           r[2]    /* record covering the range being converted */
+#define NEW            r[3]    /* scratch record for the updated mapping */
+
+/*
+ * Convert an unwritten extent to a real extent or vice versa.
+ * Does not handle overlapping extents.
+ */
+STATIC int
+xfs_rmap_convert(
+       struct xfs_btree_cur    *cur,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       bool                    unwritten,
+       struct xfs_owner_info   *oinfo)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       struct xfs_rmap_irec    r[4];   /* neighbor extent entries */
+                                       /* left is 0, right is 1, prev is 2 */
+                                       /* new is 3 */
+       uint64_t                owner;
+       uint64_t                offset;
+       uint64_t                new_endoff;
+       unsigned int            oldext;
+       unsigned int            newext;
+       unsigned int            flags = 0;
+       int                     i;
+       int                     state = 0;
+       int                     error;
+
+       xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+       ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+                       (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+       oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+       new_endoff = offset + len;
+       trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+                       unwritten, oinfo);
+
+       /*
+        * For the initial lookup, look for an exact match or the left-adjacent
+        * record for our insertion point. This will also give us the record for
+        * start block contiguity tests.
+        */
+       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+       if (error)
+               goto done;
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+       error = xfs_rmap_get_rec(cur, &PREV, &i);
+       if (error)
+               goto done;
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+       trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+                       cur->bc_private.a.agno, PREV.rm_startblock,
+                       PREV.rm_blockcount, PREV.rm_owner,
+                       PREV.rm_offset, PREV.rm_flags);
+
+       ASSERT(PREV.rm_offset <= offset);
+       ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+       ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+       newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+       /*
+        * Set flags determining what part of the previous oldext allocation
+        * extent is being replaced by a newext allocation.
+        */
+       if (PREV.rm_offset == offset)
+               state |= RMAP_LEFT_FILLING;
+       if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+               state |= RMAP_RIGHT_FILLING;
+
+       /*
+        * Decrement the cursor to see if we have a left-adjacent record to our
+        * insertion point. This will give us the record for end block
+        * contiguity tests.
+        */
+       error = xfs_btree_decrement(cur, 0, &i);
+       if (error)
+               goto done;
+       if (i) {
+               state |= RMAP_LEFT_VALID;
+               error = xfs_rmap_get_rec(cur, &LEFT, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               XFS_WANT_CORRUPTED_GOTO(mp,
+                               LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
+                               done);
+               trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+                               cur->bc_private.a.agno, LEFT.rm_startblock,
+                               LEFT.rm_blockcount, LEFT.rm_owner,
+                               LEFT.rm_offset, LEFT.rm_flags);
+               if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
+                   LEFT.rm_offset + LEFT.rm_blockcount == offset &&
+                   xfs_rmap_is_mergeable(&LEFT, owner, newext))
+                       state |= RMAP_LEFT_CONTIG;
+       }
+
+       /*
+        * Increment the cursor to see if we have a right-adjacent record to our
+        * insertion point. This will give us the record for end block
+        * contiguity tests.
+        */
+       error = xfs_btree_increment(cur, 0, &i);
+       if (error)
+               goto done;
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+       error = xfs_btree_increment(cur, 0, &i);
+       if (error)
+               goto done;
+       if (i) {
+               state |= RMAP_RIGHT_VALID;
+               error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
+                                       done);
+               trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+                               cur->bc_private.a.agno, RIGHT.rm_startblock,
+                               RIGHT.rm_blockcount, RIGHT.rm_owner,
+                               RIGHT.rm_offset, RIGHT.rm_flags);
+               if (bno + len == RIGHT.rm_startblock &&
+                   offset + len == RIGHT.rm_offset &&
+                   xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+                       state |= RMAP_RIGHT_CONTIG;
+       }
+
+       /* check that left + prev + right is not too long */
+       if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+                        RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+           (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+            RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+           (unsigned long)LEFT.rm_blockcount + len +
+            RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+               state &= ~RMAP_RIGHT_CONTIG;
+
+       trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+                       _RET_IP_);
+
+       /* reset the cursor back to PREV */
+       error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+       if (error)
+               goto done;
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+       /*
+        * Switch out based on the FILLING and CONTIG state bits.
+        */
+       switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+                        RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+       case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+            RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * The left and right neighbors are both contiguous with new.
+                */
+               error = xfs_btree_increment(cur, 0, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+                               RIGHT.rm_startblock, RIGHT.rm_blockcount,
+                               RIGHT.rm_owner, RIGHT.rm_offset,
+                               RIGHT.rm_flags);
+               error = xfs_btree_delete(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               error = xfs_btree_decrement(cur, 0, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+                               PREV.rm_startblock, PREV.rm_blockcount,
+                               PREV.rm_owner, PREV.rm_offset,
+                               PREV.rm_flags);
+               error = xfs_btree_delete(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               error = xfs_btree_decrement(cur, 0, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               NEW = LEFT;
+               NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               break;
+
+       case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * The left neighbor is contiguous, the right is not.
+                */
+               trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+                               PREV.rm_startblock, PREV.rm_blockcount,
+                               PREV.rm_owner, PREV.rm_offset,
+                               PREV.rm_flags);
+               error = xfs_btree_delete(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               error = xfs_btree_decrement(cur, 0, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               NEW = LEFT;
+               NEW.rm_blockcount += PREV.rm_blockcount;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               break;
+
+       case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * The right neighbor is contiguous, the left is not.
+                */
+               error = xfs_btree_increment(cur, 0, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+                               RIGHT.rm_startblock, RIGHT.rm_blockcount,
+                               RIGHT.rm_owner, RIGHT.rm_offset,
+                               RIGHT.rm_flags);
+               error = xfs_btree_delete(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               error = xfs_btree_decrement(cur, 0, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               NEW = PREV;
+               NEW.rm_blockcount = len + RIGHT.rm_blockcount;
+               NEW.rm_flags = newext;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               break;
+
+       case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+               /*
+                * Setting all of a previous oldext extent to newext.
+                * Neither the left nor right neighbors are contiguous with
+                * the new one.
+                */
+               NEW = PREV;
+               NEW.rm_flags = newext;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               break;
+
+       case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+               /*
+                * Setting the first part of a previous oldext extent to newext.
+                * The left neighbor is contiguous.
+                */
+               NEW = PREV;
+               NEW.rm_offset += len;
+               NEW.rm_startblock += len;
+               NEW.rm_blockcount -= len;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               error = xfs_btree_decrement(cur, 0, &i);
+               if (error)
+                       goto done;
+               NEW = LEFT;
+               NEW.rm_blockcount += len;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               break;
+
+       case RMAP_LEFT_FILLING:
+               /*
+                * Setting the first part of a previous oldext extent to newext.
+                * The left neighbor is not contiguous.
+                */
+               NEW = PREV;
+               NEW.rm_startblock += len;
+               NEW.rm_offset += len;
+               NEW.rm_blockcount -= len;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               NEW.rm_startblock = bno;
+               NEW.rm_owner = owner;
+               NEW.rm_offset = offset;
+               NEW.rm_blockcount = len;
+               NEW.rm_flags = newext;
+               cur->bc_rec.r = NEW;
+               trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+                               len, owner, offset, newext);
+               error = xfs_btree_insert(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               break;
+
+       case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+               /*
+                * Setting the last part of a previous oldext extent to newext.
+                * The right neighbor is contiguous with the new allocation.
+                */
+               NEW = PREV;
+               NEW.rm_blockcount -= len;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               error = xfs_btree_increment(cur, 0, &i);
+               if (error)
+                       goto done;
+               NEW = RIGHT;
+               NEW.rm_offset = offset;
+               NEW.rm_startblock = bno;
+               NEW.rm_blockcount += len;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               break;
+
+       case RMAP_RIGHT_FILLING:
+               /*
+                * Setting the last part of a previous oldext extent to newext.
+                * The right neighbor is not contiguous.
+                */
+               NEW = PREV;
+               NEW.rm_blockcount -= len;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+                               oldext, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+               NEW.rm_startblock = bno;
+               NEW.rm_owner = owner;
+               NEW.rm_offset = offset;
+               NEW.rm_blockcount = len;
+               NEW.rm_flags = newext;
+               cur->bc_rec.r = NEW;
+               trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+                               len, owner, offset, newext);
+               error = xfs_btree_insert(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               break;
+
+       case 0:
+               /*
+                * Setting the middle part of a previous oldext extent to
+                * newext.  Contiguity is impossible here.
+                * One extent becomes three extents.
+                */
+               /* new right extent - oldext */
+               NEW.rm_startblock = bno + len;
+               NEW.rm_owner = owner;
+               NEW.rm_offset = new_endoff;
+               NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+                               new_endoff;
+               NEW.rm_flags = PREV.rm_flags;
+               error = xfs_rmap_update(cur, &NEW);
+               if (error)
+                       goto done;
+               /* new left extent - oldext */
+               NEW = PREV;
+               NEW.rm_blockcount = offset - PREV.rm_offset;
+               cur->bc_rec.r = NEW;
+               trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+                               NEW.rm_startblock, NEW.rm_blockcount,
+                               NEW.rm_owner, NEW.rm_offset,
+                               NEW.rm_flags);
+               error = xfs_btree_insert(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               /*
+                * Reset the cursor to the position of the new extent
+                * we are about to insert as we can't trust it after
+                * the previous insert.
+                */
+               error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset,
+                               oldext, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+               /* new middle extent - newext */
+               cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
+               cur->bc_rec.r.rm_flags |= newext;
+               trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+                               owner, offset, newext);
+               error = xfs_btree_insert(cur, &i);
+               if (error)
+                       goto done;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+               break;
+
+       case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+       case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+       case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+       case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+       case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+       case RMAP_LEFT_CONTIG:
+       case RMAP_RIGHT_CONTIG:
+               /*
+                * These cases are all impossible.
+                */
+               ASSERT(0);
+       }
+
+       trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+                       unwritten, oinfo);
+done:
+       if (error)
+               trace_xfs_rmap_convert_error(cur->bc_mp,
+                               cur->bc_private.a.agno, error, _RET_IP_);
+       return error;
+}
+
+#undef NEW
+#undef LEFT
+#undef RIGHT
+#undef PREV
+
+struct xfs_rmap_query_range_info {
+       xfs_rmap_query_range_fn fn;             /* caller's per-record callback */
+       void                            *priv;  /* opaque argument handed to fn */
+};
+
+/* Convert a btree record to an xfs_rmap_irec and pass it to the query callback. */
+STATIC int
+xfs_rmap_query_range_helper(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       void                    *priv)  /* a struct xfs_rmap_query_range_info */
+{
+       struct xfs_rmap_query_range_info        *query = priv;
+       struct xfs_rmap_irec                    irec;
+       int                                     error;
+
+       error = xfs_rmap_btrec_to_irec(rec, &irec);
+       if (error)
+               return error;   /* conversion failed: the callback is not invoked */
+       return query->fn(cur, &irec, query->priv);      /* callback's result is returned as-is */
+}
+
+/* Call fn on each rmap that xfs_btree_query_range() visits between two keys. */
+int
+xfs_rmap_query_range(
+       struct xfs_btree_cur            *cur,
+       struct xfs_rmap_irec            *low_rec,
+       struct xfs_rmap_irec            *high_rec,
+       xfs_rmap_query_range_fn fn,
+       void                            *priv)
+{
+       union xfs_btree_irec            low_brec;
+       union xfs_btree_irec            high_brec;
+       struct xfs_rmap_query_range_info        query;
+
+       low_brec.r = *low_rec;          /* bounds are copied; the caller's records are not modified */
+       high_brec.r = *high_rec;
+       query.priv = priv;              /* bundle fn + priv so the helper can unwrap them */
+       query.fn = fn;
+       return xfs_btree_query_range(cur, &low_brec, &high_brec,
+                       xfs_rmap_query_range_helper, &query);   /* query lives on this stack frame */
+}
+
+/* Clean up after xfs_rmap_finish_one: delete the cursor; on error also release its buffer. */
+void
+xfs_rmap_finish_one_cleanup(
+       struct xfs_trans        *tp,
+       struct xfs_btree_cur    *rcur,
+       int                     error)
+{
+       struct xfs_buf          *agbp;
+
+       if (rcur == NULL)
+               return;         /* no cursor was ever set up: nothing to do */
+       agbp = rcur->bc_private.a.agbp; /* read before the cursor is deleted below */
+       xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       if (error)
+               xfs_trans_brelse(tp, agbp);     /* only the error path releases the buffer here */
+}
+
+/*
+ * Process one of the deferred rmap operations.  The btree cursor is passed
+ * back through *pcur and reused while successive calls stay in the same AG;
+ * this keeps our lock on the rmapbt between calls, saves time, and
+ * eliminates a buffer deadlock between the superblock and the AGF because
+ * we'll always grab them in the same order.
+ */
+int
+xfs_rmap_finish_one(
+       struct xfs_trans                *tp,
+       enum xfs_rmap_intent_type       type,
+       __uint64_t                      owner,
+       int                             whichfork,
+       xfs_fileoff_t                   startoff,
+       xfs_fsblock_t                   startblock,
+       xfs_filblks_t                   blockcount,
+       xfs_exntst_t                    state,
+       struct xfs_btree_cur            **pcur) /* in/out: cursor from the previous call, or NULL */
+{
+       struct xfs_mount                *mp = tp->t_mountp;
+       struct xfs_btree_cur            *rcur;
+       struct xfs_buf                  *agbp = NULL;
+       int                             error = 0;
+       xfs_agnumber_t                  agno;
+       struct xfs_owner_info           oinfo;
+       xfs_agblock_t                   bno;
+       bool                            unwritten;
+
+       agno = XFS_FSB_TO_AGNO(mp, startblock);
+       ASSERT(agno != NULLAGNUMBER);
+       bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+       trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork,
+                       startoff, blockcount, state);
+
+       if (XFS_TEST_ERROR(false, mp,
+                       XFS_ERRTAG_RMAP_FINISH_ONE,
+                       XFS_RANDOM_RMAP_FINISH_ONE))
+               return -EIO;    /* NOTE(review): looks like an error-injection hook - confirm */
+
+       /*
+        * If we haven't gotten a cursor or the cursor AG doesn't match
+        * the startblock, get one now.
+        */
+       rcur = *pcur;
+       if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+               xfs_rmap_finish_one_cleanup(tp, rcur, 0);       /* error == 0: cursor deleted, buffer not released */
+               rcur = NULL;
+               *pcur = NULL;   /* caller must not see the deleted cursor if we fail below */
+       }
+       if (rcur == NULL) {
+               /*
+                * Refresh the freelist before we start changing the
+                * rmapbt, because a shape change could cause us to
+                * allocate blocks.
+                */
+               error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+               if (error)
+                       return error;
+               if (!agbp)
+                       return -EFSCORRUPTED;   /* success reported but no buffer returned */
+
+               rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+               if (!rcur) {
+                       error = -ENOMEM;
+                       goto out_cur;
+               }
+       }
+       *pcur = rcur;   /* hand the cursor back for the next call / for cleanup */
+
+       xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff);
+       unwritten = state == XFS_EXT_UNWRITTEN;
+       bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock);        /* NOTE(review): recomputes bno; redundant if rcur->bc_mp == mp - confirm */
+
+       switch (type) {
+       case XFS_RMAP_ALLOC:
+       case XFS_RMAP_MAP:
+               error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
+               break;
+       case XFS_RMAP_FREE:
+       case XFS_RMAP_UNMAP:
+               error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
+                               &oinfo);
+               break;
+       case XFS_RMAP_CONVERT:
+               error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,     /* note the inverted flag */
+                               &oinfo);
+               break;
+       default:        /* any type without a case above, e.g. the *_SHARED intents */
+               ASSERT(0);
+               error = -EFSCORRUPTED;
+       }
+       return error;   /* on failure the cursor stays in *pcur for the caller to clean up */
+
+out_cur:
+       xfs_trans_brelse(tp, agbp);     /* cursor setup failed: drop the buffer obtained above */
+
+       return error;
+}
+
+/*
+ * Don't defer an rmap unless the superblock advertises the rmapbt feature.
+ */
+static bool
+xfs_rmap_update_is_needed(
+       struct xfs_mount        *mp)
+{
+       return xfs_sb_version_hasrmapbt(&mp->m_sb);     /* true only on rmapbt filesystems */
+}
+
+/*
+ * Record an rmap intent by queuing it on dfops; the list is kept sorted first
+ * by AG and then by increasing age.  This function always returns 0.
+ */
+static int
+__xfs_rmap_add(
+       struct xfs_mount                *mp,
+       struct xfs_defer_ops            *dfops,
+       enum xfs_rmap_intent_type       type,
+       __uint64_t                      owner,
+       int                             whichfork,
+       struct xfs_bmbt_irec            *bmap)
+{
+       struct xfs_rmap_intent  *ri;
+
+       trace_xfs_rmap_defer(mp, XFS_FSB_TO_AGNO(mp, bmap->br_startblock),
+                       type,
+                       XFS_FSB_TO_AGBNO(mp, bmap->br_startblock),
+                       owner, whichfork,
+                       bmap->br_startoff,
+                       bmap->br_blockcount,
+                       bmap->br_state);
+
+       ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS);    /* used unchecked; presumably KM_SLEEP cannot fail - confirm */
+       INIT_LIST_HEAD(&ri->ri_list);
+       ri->ri_type = type;
+       ri->ri_owner = owner;
+       ri->ri_whichfork = whichfork;
+       ri->ri_bmap = *bmap;    /* struct copy: the intent keeps no pointer to the caller's bmap */
+
+       xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);    /* ri is handed off here; it is not freed in this function */
+       return 0;
+}
+
+/* Map an extent into a file: queue an XFS_RMAP_MAP intent owned by ip->i_ino. */
+int
+xfs_rmap_map_extent(
+       struct xfs_mount        *mp,
+       struct xfs_defer_ops    *dfops,
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       struct xfs_bmbt_irec    *PREV)  /* the extent; called imap in the xfs_rmap.h prototype */
+{
+       if (!xfs_rmap_update_is_needed(mp))
+               return 0;       /* nothing is queued on filesystems without rmapbt */
+
+       return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino,
+                       whichfork, PREV);
+}
+
+/* Unmap an extent out of a file: queue an XFS_RMAP_UNMAP intent owned by ip->i_ino. */
+int
+xfs_rmap_unmap_extent(
+       struct xfs_mount        *mp,
+       struct xfs_defer_ops    *dfops,
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       struct xfs_bmbt_irec    *PREV)  /* the extent; called imap in the xfs_rmap.h prototype */
+{
+       if (!xfs_rmap_update_is_needed(mp))
+               return 0;       /* nothing is queued on filesystems without rmapbt */
+
+       return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino,
+                       whichfork, PREV);
+}
+
+/* Convert an extent from unwritten to real or vice versa via an XFS_RMAP_CONVERT intent. */
+int
+xfs_rmap_convert_extent(
+       struct xfs_mount        *mp,
+       struct xfs_defer_ops    *dfops,
+       struct xfs_inode        *ip,
+       int                     whichfork,      /* passed through unchanged to the intent */
+       struct xfs_bmbt_irec    *PREV)  /* the extent; called imap in the xfs_rmap.h prototype */
+{
+       if (!xfs_rmap_update_is_needed(mp))
+               return 0;       /* nothing is queued on filesystems without rmapbt */
+
+       return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino,
+                       whichfork, PREV);
+}
+
+/* Schedule the creation of an rmap for non-file data (queues an XFS_RMAP_ALLOC intent). */
+int
+xfs_rmap_alloc_extent(
+       struct xfs_mount        *mp,
+       struct xfs_defer_ops    *dfops,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       __uint64_t              owner)
+{
+       struct xfs_bmbt_irec    bmap;   /* synthetic extent record built only to carry the intent */
+
+       if (!xfs_rmap_update_is_needed(mp))
+               return 0;       /* nothing is queued on filesystems without rmapbt */
+
+       bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);     /* AG-relative block -> filesystem block */
+       bmap.br_blockcount = len;
+       bmap.br_startoff = 0;           /* non-file data: file offset is fixed at 0 */
+       bmap.br_state = XFS_EXT_NORM;
+
+       return __xfs_rmap_add(mp, dfops, XFS_RMAP_ALLOC, owner,
+                       XFS_DATA_FORK, &bmap);  /* bmap is copied by value, so a stack record is fine */
+}
+
+/* Schedule the deletion of an rmap for non-file data (queues an XFS_RMAP_FREE intent). */
+int
+xfs_rmap_free_extent(
+       struct xfs_mount        *mp,
+       struct xfs_defer_ops    *dfops,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       __uint64_t              owner)
+{
+       struct xfs_bmbt_irec    bmap;   /* synthetic extent record built only to carry the intent */
+
+       if (!xfs_rmap_update_is_needed(mp))
+               return 0;       /* nothing is queued on filesystems without rmapbt */
+
+       bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);     /* AG-relative block -> filesystem block */
+       bmap.br_blockcount = len;
+       bmap.br_startoff = 0;           /* non-file data: file offset is fixed at 0 */
+       bmap.br_state = XFS_EXT_NORM;
+
+       return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner,
+                       XFS_DATA_FORK, &bmap);  /* bmap is copied by value, so a stack record is fine */
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
new file mode 100644 (file)
index 0000000..71cf99a
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_RMAP_H__
+#define __XFS_RMAP_H__
+
+static inline void
+xfs_rmap_ag_owner(                     /* fill *oi with an owner only: offset and flags are zeroed */
+       struct xfs_owner_info   *oi,
+       uint64_t                owner)
+{
+       oi->oi_owner = owner;
+       oi->oi_offset = 0;
+       oi->oi_flags = 0;
+}
+
+static inline void
+xfs_rmap_ino_bmbt_owner(               /* fill *oi for inode ino, flagged as a bmbt block */
+       struct xfs_owner_info   *oi,
+       xfs_ino_t               ino,
+       int                     whichfork)
+{
+       oi->oi_owner = ino;
+       oi->oi_offset = 0;              /* no file offset is recorded for this owner type */
+       oi->oi_flags = XFS_OWNER_INFO_BMBT_BLOCK;
+       if (whichfork == XFS_ATTR_FORK)
+               oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;       /* any other fork value adds no flag */
+}
+
+static inline void
+xfs_rmap_ino_owner(                    /* fill *oi for inode ino at the given file offset */
+       struct xfs_owner_info   *oi,
+       xfs_ino_t               ino,
+       int                     whichfork,
+       xfs_fileoff_t           offset)
+{
+       oi->oi_owner = ino;
+       oi->oi_offset = offset;
+       oi->oi_flags = 0;
+       if (whichfork == XFS_ATTR_FORK)
+               oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;       /* any other fork value adds no flag */
+}
+
+static inline void
+xfs_rmap_skip_owner_update(            /* mark *oi with the XFS_RMAP_OWN_UNKNOWN owner */
+       struct xfs_owner_info   *oi)
+{
+       oi->oi_owner = XFS_RMAP_OWN_UNKNOWN;    /* only oi_owner is written; oi_offset and oi_flags are left untouched */
+}
+
+/* Reverse mapping functions. */
+
+struct xfs_buf;
+
+static inline __u64
+xfs_rmap_irec_offset_pack(             /* combine rm_offset and rm_flags into one 64-bit value */
+       const struct xfs_rmap_irec      *irec)
+{
+       __u64                   x;
+
+       x = XFS_RMAP_OFF(irec->rm_offset);      /* offset bits first; flag bits are OR-ed in below */
+       if (irec->rm_flags & XFS_RMAP_ATTR_FORK)
+               x |= XFS_RMAP_OFF_ATTR_FORK;
+       if (irec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+               x |= XFS_RMAP_OFF_BMBT_BLOCK;
+       if (irec->rm_flags & XFS_RMAP_UNWRITTEN)
+               x |= XFS_RMAP_OFF_UNWRITTEN;
+       return x;       /* rm_flags bits other than the three tested above are not encoded */
+}
+
+static inline int
+xfs_rmap_irec_offset_unpack(           /* split a packed offset into irec->rm_offset / rm_flags */
+       __u64                   offset,
+       struct xfs_rmap_irec    *irec)
+{
+       if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
+               return -EFSCORRUPTED;   /* bits outside mask + flags are set; *irec is left unmodified */
+       irec->rm_offset = XFS_RMAP_OFF(offset);
+       if (offset & XFS_RMAP_OFF_ATTR_FORK)
+               irec->rm_flags |= XFS_RMAP_ATTR_FORK;   /* flags are OR-ed in: rm_flags is never cleared here, */
+       if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
+               irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;  /* so the caller must initialise irec->rm_flags first */
+       if (offset & XFS_RMAP_OFF_UNWRITTEN)
+               irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+       return 0;
+}
+
+static inline void
+xfs_owner_info_unpack(                 /* export *oinfo as owner, offset and XFS_RMAP_* flags */
+       struct xfs_owner_info   *oinfo,
+       uint64_t                *owner,
+       uint64_t                *offset,
+       unsigned int            *flags)
+{
+       unsigned int            r = 0;  /* XFS_RMAP_* flag bits accumulated here */
+
+       *owner = oinfo->oi_owner;
+       *offset = oinfo->oi_offset;     /* copied verbatim (no XFS_RMAP_OFF() applied here) */
+       if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+               r |= XFS_RMAP_ATTR_FORK;
+       if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+               r |= XFS_RMAP_BMBT_BLOCK;
+       *flags = r;     /* always written, so the caller need not pre-initialise *flags */
+}
+
+static inline void
+xfs_owner_info_pack(                   /* build *oinfo from owner, offset and XFS_RMAP_* flags */
+       struct xfs_owner_info   *oinfo,
+       uint64_t                owner,
+       uint64_t                offset,
+       unsigned int            flags)
+{
+       oinfo->oi_owner = owner;
+       oinfo->oi_offset = XFS_RMAP_OFF(offset);        /* offset is passed through XFS_RMAP_OFF() before storing */
+       oinfo->oi_flags = 0;            /* start clean; only the two flags below are translated */
+       if (flags & XFS_RMAP_ATTR_FORK)
+               oinfo->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+       if (flags & XFS_RMAP_BMBT_BLOCK)
+               oinfo->oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+}
+
+int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
+                  xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+                  struct xfs_owner_info *oinfo);
+int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
+                 xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+                 struct xfs_owner_info *oinfo);
+
+int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+               xfs_extlen_t len, uint64_t owner, uint64_t offset,
+               unsigned int flags, int *stat);
+int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+               xfs_extlen_t len, uint64_t owner, uint64_t offset,
+               unsigned int flags, int *stat);
+int xfs_rmap_insert(struct xfs_btree_cur *rcur, xfs_agblock_t agbno,
+               xfs_extlen_t len, uint64_t owner, uint64_t offset,
+               unsigned int flags);
+int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec,
+               int *stat);
+
+typedef int (*xfs_rmap_query_range_fn)(
+       struct xfs_btree_cur    *cur,
+       struct xfs_rmap_irec    *rec,
+       void                    *priv);
+
+int xfs_rmap_query_range(struct xfs_btree_cur *cur,
+               struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
+               xfs_rmap_query_range_fn fn, void *priv);
+
+enum xfs_rmap_intent_type {
+       XFS_RMAP_MAP,                   /* xfs_rmap_finish_one: xfs_rmap_map() */
+       XFS_RMAP_MAP_SHARED,            /* no case in xfs_rmap_finish_one: hits its default branch */
+       XFS_RMAP_UNMAP,                 /* xfs_rmap_finish_one: xfs_rmap_unmap() */
+       XFS_RMAP_UNMAP_SHARED,          /* no case in xfs_rmap_finish_one: hits its default branch */
+       XFS_RMAP_CONVERT,               /* xfs_rmap_finish_one: xfs_rmap_convert() */
+       XFS_RMAP_CONVERT_SHARED,        /* no case in xfs_rmap_finish_one: hits its default branch */
+       XFS_RMAP_ALLOC,                 /* handled together with XFS_RMAP_MAP */
+       XFS_RMAP_FREE,                  /* handled together with XFS_RMAP_UNMAP */
+};
+
+struct xfs_rmap_intent {
+       struct list_head                        ri_list;        /* linkage passed to xfs_defer_add() */
+       enum xfs_rmap_intent_type               ri_type;        /* which rmap operation to perform */
+       __uint64_t                              ri_owner;       /* inode number, or the owner given for non-file data */
+       int                                     ri_whichfork;   /* fork the extent belongs to */
+       struct xfs_bmbt_irec                    ri_bmap;        /* by-value copy of the extent being changed */
+};
+
+/* functions for updating the rmapbt based on bmbt map/unmap operations */
+int xfs_rmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+               struct xfs_inode *ip, int whichfork,
+               struct xfs_bmbt_irec *imap);
+int xfs_rmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+               struct xfs_inode *ip, int whichfork,
+               struct xfs_bmbt_irec *imap);
+int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+               struct xfs_inode *ip, int whichfork,
+               struct xfs_bmbt_irec *imap);
+int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+               xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+               __uint64_t owner);
+int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+               xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+               __uint64_t owner);
+
+void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
+               struct xfs_btree_cur *rcur, int error);
+int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
+               __uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+               xfs_fsblock_t startblock, xfs_filblks_t blockcount,
+               xfs_exntst_t state, struct xfs_btree_cur **pcur);
+
+#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
new file mode 100644 (file)
index 0000000..bc1faeb
--- /dev/null
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+
+/*
+ * Reverse map btree.
+ *
+ * This is a per-ag tree used to track the owner(s) of a given extent. With
+ * reflink it is possible for there to be multiple owners, which is a departure
+ * from classic XFS. Owner records for data extents are inserted when the
+ * extent is mapped and removed when an extent is unmapped.  Owner records for
+ * all other block types (i.e. metadata) are inserted when an extent is
+ * allocated and removed when an extent is freed. There can only be one owner
+ * of a metadata extent, usually an inode or some other metadata structure like
+ * an AG btree.
+ *
+ * The rmap btree is part of the free space management, so blocks for the tree
+ * are sourced from the agfl. Hence we need transaction reservation support for
+ * this tree so that the freelist is always large enough. This also impacts on
+ * the minimum space we need to leave free in the AG.
+ *
+ * The tree is ordered by [ag block, owner, offset]. This is a large key size,
+ * but it is the only way to enforce unique keys when a block can be owned by
+ * multiple files at any offset. There's no need to order/search by extent
+ * size for online updating/management of the tree. It is intended that most
+ * reverse lookups will be to find the owner(s) of a particular block, or to
+ * try to recover tree and file data from corrupt primary metadata.
+ */
+
+/*
+ * Build a second cursor for the same rmap btree, reusing the mount,
+ * transaction, AGF buffer and AG number recorded in @cur.
+ */
+static struct xfs_btree_cur *
+xfs_rmapbt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       xfs_agnumber_t          agno = cur->bc_private.a.agno;
+
+       return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, agbp, agno);
+}
+
+/*
+ * Install @ptr as the root of this btree in the AGF and adjust the recorded
+ * tree height by @inc, both in the AGF buffer and in the in-core perag
+ * structure.  The modified AGF fields (roots and levels) are then logged in
+ * the cursor's transaction.
+ */
+STATIC void
+xfs_rmapbt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     inc)
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
+       int                     btnum = cur->bc_btnum;
+       struct xfs_perag        *pag = xfs_perag_get(cur->bc_mp, seqno);
+
+       ASSERT(ptr->s != 0);
+
+       /* ptr->s is stored as-is; the level count is adjusted in place. */
+       agf->agf_roots[btnum] = ptr->s;
+       be32_add_cpu(&agf->agf_levels[btnum], inc);
+       /* Keep the perag's copy of the height in step with the AGF. */
+       pag->pagf_levels[btnum] += inc;
+       /* Pairs with the xfs_perag_get() in the declarations above. */
+       xfs_perag_put(pag);
+
+       xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+/*
+ * Allocate one block for the rmap btree from the AG free list.
+ *
+ * On success *stat is set to 1 and @new receives the big-endian AG block
+ * number.  If the free list hands back NULLAGBLOCK, *stat is set to 0 and
+ * the function still returns 0.  A non-zero return value is the error from
+ * xfs_alloc_get_freelist(); *stat and @new are not written in that case.
+ * @start is not used by this implementation.
+ */
+STATIC int
+xfs_rmapbt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     *stat)
+{
+       int                     error;
+       xfs_agblock_t           bno;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       /* Allocate the new block from the freelist. If we can't, give up.  */
+       error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+                                      &bno, 1);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+               return error;
+       }
+
+       /* The tracepoint fires before the NULLAGBLOCK check below. */
+       trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+                       bno, 1);
+       if (bno == NULLAGBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
+                       false);
+
+       /* Record one more AG btree block in the transaction. */
+       xfs_trans_agbtree_delta(cur->bc_tp, 1);
+       new->s = cpu_to_be32(bno);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+}
+
+/*
+ * Return the btree block held in @bp to the AG free list.
+ *
+ * After the block is put back, it is inserted into the busy extent list
+ * with XFS_EXTENT_BUSY_SKIP_DISCARD and the transaction's AG btree block
+ * delta is reduced by one.  Returns the error from xfs_alloc_put_freelist(),
+ * in which case neither of those follow-up steps runs, or 0.
+ */
+STATIC int
+xfs_rmapbt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agblock_t           bno;
+       int                     error;
+
+       /* Translate the buffer's disk address into an AG block number. */
+       bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+       trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+                       bno, 1);
+       error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+       if (error)
+               return error;
+
+       /* The AG number here is read from the AGF rather than the cursor. */
+       xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+                             XFS_EXTENT_BUSY_SKIP_DISCARD);
+       xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+       return 0;
+}
+
+/*
+ * Minimum number of records in a block at @level.  Slot 0 of m_rmap_mnr is
+ * used for level 0 and slot 1 for every other level.
+ */
+STATIC int
+xfs_rmapbt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+
+       if (level == 0)
+               return mp->m_rmap_mnr[0];
+       return mp->m_rmap_mnr[1];
+}
+
+/*
+ * Maximum number of records in a block at @level.  Slot 0 of m_rmap_mxr is
+ * used for level 0 and slot 1 for every other level.
+ */
+STATIC int
+xfs_rmapbt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+
+       if (level == 0)
+               return mp->m_rmap_mxr[0];
+       return mp->m_rmap_mxr[1];
+}
+
+/*
+ * Fill in the (low) key for a record by copying the record's start block,
+ * owner and offset fields unchanged.
+ */
+STATIC void
+xfs_rmapbt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       struct xfs_rmap_key     *kp = &key->rmap;
+       struct xfs_rmap_rec     *rp = &rec->rmap;
+
+       kp->rm_startblock = rp->rm_startblock;
+       kp->rm_owner = rp->rm_owner;
+       kp->rm_offset = rp->rm_offset;
+}
+
+/*
+ * Compute the high key of a reverse mapping record: the largest key that
+ * still falls inside the record.  The start block is always advanced by
+ * blockcount - 1.  The offset is advanced by the same amount, with the bits
+ * outside XFS_RMAP_OFF_MASK preserved, unless the owner is a non-inode
+ * owner or the record describes a bmbt block, in which case the offset is
+ * copied through untouched.
+ */
+STATIC void
+xfs_rmapbt_init_high_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       struct xfs_rmap_rec     *rp = &rec->rmap;
+       struct xfs_rmap_key     *hp = &key->rmap;
+       __uint64_t              packed;
+       int                     extra;
+
+       extra = be32_to_cpu(rp->rm_blockcount) - 1;
+
+       hp->rm_startblock = rp->rm_startblock;
+       be32_add_cpu(&hp->rm_startblock, extra);
+       hp->rm_owner = rp->rm_owner;
+       hp->rm_offset = rp->rm_offset;
+
+       /* Only file fork mappings have an offset worth extending. */
+       if (!XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rp->rm_owner)) &&
+           !XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rp->rm_offset))) {
+               packed = be64_to_cpu(hp->rm_offset);
+               packed = (packed & ~XFS_RMAP_OFF_MASK) |
+                        (XFS_RMAP_OFF(packed) + extra);
+               hp->rm_offset = cpu_to_be64(packed);
+       }
+}
+
+/*
+ * Convert the in-core record held in the cursor into its big-endian btree
+ * form.  The offset field is produced by xfs_rmap_irec_offset_pack().
+ */
+STATIC void
+xfs_rmapbt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       struct xfs_rmap_irec    *irec = &cur->bc_rec.r;
+
+       rec->rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
+       rec->rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount);
+       rec->rmap.rm_owner = cpu_to_be64(irec->rm_owner);
+       rec->rmap.rm_offset = cpu_to_be64(xfs_rmap_irec_offset_pack(irec));
+}
+
+/*
+ * Set @ptr to the root block of this cursor's btree as recorded in the AGF.
+ * The value is copied without byte swapping.
+ */
+STATIC void
+xfs_rmapbt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+       int                     btnum = cur->bc_btnum;
+
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+       ASSERT(agf->agf_roots[btnum] != 0);
+
+       ptr->s = agf->agf_roots[btnum];
+}
+
+/*
+ * Compare an on-disk key against the in-core record held in the cursor.
+ *
+ * Start blocks are compared first and their signed difference is returned
+ * when they differ.  Otherwise owner and then offset (with only the
+ * XFS_RMAP_OFF bits of the key considered) decide the result, yielding 1
+ * when the key is larger, -1 when it is smaller, and 0 when all three match.
+ */
+STATIC __int64_t
+xfs_rmapbt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       struct xfs_rmap_irec    *irec = &cur->bc_rec.r;
+       struct xfs_rmap_key     *kp = &key->rmap;
+       __u64                   kval, rval;
+       __int64_t               delta;
+
+       delta = (__int64_t)be32_to_cpu(kp->rm_startblock) -
+                       irec->rm_startblock;
+       if (delta != 0)
+               return delta;
+
+       kval = be64_to_cpu(kp->rm_owner);
+       rval = irec->rm_owner;
+       if (kval != rval)
+               return kval > rval ? 1 : -1;
+
+       kval = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
+       rval = irec->rm_offset;
+       if (kval != rval)
+               return kval > rval ? 1 : -1;
+
+       return 0;
+}
+
+/*
+ * Compare two on-disk keys.
+ *
+ * The signed difference of the start blocks is returned when they differ.
+ * Ties are broken by owner, then by the XFS_RMAP_OFF bits of the offset;
+ * those comparisons yield 1 if @k1 is larger, -1 if it is smaller, and 0 if
+ * the keys are equal in all three fields.
+ */
+STATIC __int64_t
+xfs_rmapbt_diff_two_keys(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       struct xfs_rmap_key     *left = &k1->rmap;
+       struct xfs_rmap_key     *right = &k2->rmap;
+       __int64_t               delta;
+       __u64                   lval, rval;
+
+       delta = (__int64_t)be32_to_cpu(left->rm_startblock) -
+                          be32_to_cpu(right->rm_startblock);
+       if (delta != 0)
+               return delta;
+
+       lval = be64_to_cpu(left->rm_owner);
+       rval = be64_to_cpu(right->rm_owner);
+       if (lval != rval)
+               return lval > rval ? 1 : -1;
+
+       lval = XFS_RMAP_OFF(be64_to_cpu(left->rm_offset));
+       rval = XFS_RMAP_OFF(be64_to_cpu(right->rm_offset));
+       if (lval != rval)
+               return lval > rval ? 1 : -1;
+
+       return 0;
+}
+
+/*
+ * Structural sanity checks for an rmap btree block buffer.
+ *
+ * Returns false if the magic number is wrong, the superblock does not have
+ * the rmapbt feature, the v5 short-form header check fails, or the block
+ * level is out of range; otherwise returns the result of
+ * xfs_btree_sblock_verify() using the per-level maximum record count.
+ */
+static bool
+xfs_rmapbt_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_perag        *pag = bp->b_pag;
+       unsigned int            level;
+
+       /*
+        * magic number and level verification
+        *
+        * During growfs operations, we can't verify the exact level or owner as
+        * the perag is not fully initialised and hence not attached to the
+        * buffer.  In this case, check against the maximum tree depth.
+        *
+        * Similarly, during log recovery we will have a perag structure
+        * attached, but the agf information will not yet have been initialised
+        * from the on disk AGF. Again, we can only check against maximum limits
+        * in this case.
+        */
+       if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+               return false;
+
+       if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+               return false;
+       if (!xfs_btree_sblock_v5hdr_verify(bp))
+               return false;
+
+       level = be16_to_cpu(block->bb_level);
+       /* Prefer the exact tree height when the perag has AGF data. */
+       if (pag && pag->pagf_init) {
+               if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+                       return false;
+       } else if (level >= mp->m_rmap_maxlevels)
+               return false;
+
+       return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+}
+
+/*
+ * Read-side buffer verifier.  A CRC failure marks the buffer with
+ * -EFSBADCRC; if the CRC is good but the structural checks fail, the buffer
+ * is marked with -EFSCORRUPTED.  Any error present on the buffer afterwards
+ * is traced and reported through xfs_verifier_error().
+ */
+static void
+xfs_rmapbt_read_verify(
+       struct xfs_buf  *bp)
+{
+       /* The structural check is only attempted when the CRC is good. */
+       if (!xfs_btree_sblock_verify_crc(bp))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+       else if (!xfs_rmapbt_verify(bp))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+       if (bp->b_error) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               xfs_verifier_error(bp);
+       }
+}
+
+/*
+ * Write-side buffer verifier.  A block that passes the structural checks
+ * gets its CRC recalculated; one that fails is traced, marked with
+ * -EFSCORRUPTED and reported, and its CRC is left alone.
+ */
+static void
+xfs_rmapbt_write_verify(
+       struct xfs_buf  *bp)
+{
+       if (xfs_rmapbt_verify(bp)) {
+               xfs_btree_sblock_calc_crc(bp);
+               return;
+       }
+
+       trace_xfs_btree_corrupt(bp, _RET_IP_);
+       xfs_buf_ioerror(bp, -EFSCORRUPTED);
+       xfs_verifier_error(bp);
+}
+
+/* Buffer verifier table for rmap btree blocks: read and write hooks. */
+const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
+       .name                   = "xfs_rmapbt",
+       .verify_read            = xfs_rmapbt_read_verify,
+       .verify_write           = xfs_rmapbt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Debug check: return 1 if @k1 sorts at or before @k2, else 0.  Keys are
+ * ordered by start block, then owner, then the XFS_RMAP_OFF bits of the
+ * offset; two keys that are equal in all three fields count as in order.
+ */
+STATIC int
+xfs_rmapbt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       __uint32_t              bno1;
+       __uint32_t              bno2;
+       __uint64_t              v1;
+       __uint64_t              v2;
+
+       bno1 = be32_to_cpu(k1->rmap.rm_startblock);
+       bno2 = be32_to_cpu(k2->rmap.rm_startblock);
+       if (bno1 != bno2)
+               return bno1 < bno2;
+
+       v1 = be64_to_cpu(k1->rmap.rm_owner);
+       v2 = be64_to_cpu(k2->rmap.rm_owner);
+       if (v1 != v2)
+               return v1 < v2;
+
+       v1 = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
+       v2 = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+       return v1 <= v2;
+}
+
+/*
+ * Debug check: return 1 if record @r1 sorts at or before @r2, else 0.
+ * Records are ordered by start block, then owner, then the XFS_RMAP_OFF
+ * bits of the offset; records equal in all three fields count as in order.
+ */
+STATIC int
+xfs_rmapbt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       __uint32_t              bno1;
+       __uint32_t              bno2;
+       __uint64_t              v1;
+       __uint64_t              v2;
+
+       bno1 = be32_to_cpu(r1->rmap.rm_startblock);
+       bno2 = be32_to_cpu(r2->rmap.rm_startblock);
+       if (bno1 != bno2)
+               return bno1 < bno2;
+
+       v1 = be64_to_cpu(r1->rmap.rm_owner);
+       v2 = be64_to_cpu(r2->rmap.rm_owner);
+       if (v1 != v2)
+               return v1 < v2;
+
+       v1 = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
+       v2 = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+       return v1 <= v2;
+}
+#endif /* DEBUG || XFS_WARN */
+
+/*
+ * Operations vector for the rmap btree.  key_len covers two struct
+ * xfs_rmap_key entries because each key slot carries a low key followed by
+ * a high key.  The ordering checks are only wired up on DEBUG or XFS_WARN
+ * builds.
+ */
+static const struct xfs_btree_ops xfs_rmapbt_ops = {
+       .rec_len                = sizeof(struct xfs_rmap_rec),
+       .key_len                = 2 * sizeof(struct xfs_rmap_key),
+
+       .dup_cursor             = xfs_rmapbt_dup_cursor,
+       .set_root               = xfs_rmapbt_set_root,
+       .alloc_block            = xfs_rmapbt_alloc_block,
+       .free_block             = xfs_rmapbt_free_block,
+       .get_minrecs            = xfs_rmapbt_get_minrecs,
+       .get_maxrecs            = xfs_rmapbt_get_maxrecs,
+       .init_key_from_rec      = xfs_rmapbt_init_key_from_rec,
+       .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+       .init_rec_from_cur      = xfs_rmapbt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_rmapbt_init_ptr_from_cur,
+       .key_diff               = xfs_rmapbt_key_diff,
+       .buf_ops                = &xfs_rmapbt_buf_ops,
+       .diff_two_keys          = xfs_rmapbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+       .keys_inorder           = xfs_rmapbt_keys_inorder,
+       .recs_inorder           = xfs_rmapbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate and set up a new reverse mapping btree cursor for the AG whose
+ * AGF buffer is @agbp.  The cursor is zero-allocated from
+ * xfs_btree_cur_zone and its tree height is taken from the AGF.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_buf          *agbp,
+       xfs_agnumber_t          agno)
+{
+       struct xfs_btree_cur    *cur;
+       struct xfs_agf          *agf;
+
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+
+       /* State common to every btree cursor. */
+       cur->bc_mp = mp;
+       cur->bc_tp = tp;
+       cur->bc_ops = &xfs_rmapbt_ops;
+       cur->bc_btnum = XFS_BTNUM_RMAP;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
+       /* Overlapping btree; 2 keys per pointer. */
+       cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
+
+       /* The current number of levels is read from the AGF buffer. */
+       agf = XFS_BUF_TO_AGF(agbp);
+       cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+
+       /* Per-AG private state. */
+       cur->bc_private.a.agbp = agbp;
+       cur->bc_private.a.agno = agno;
+
+       return cur;
+}
+
+/*
+ * Calculate how many entries fit in an rmap btree block of @blocklen bytes
+ * once the block header (XFS_RMAP_BLOCK_LEN) is subtracted.  A leaf entry is
+ * one record; a non-leaf entry is two keys plus one pointer.  @mp is not
+ * used.
+ */
+int
+xfs_rmapbt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       int                     payload = blocklen - XFS_RMAP_BLOCK_LEN;
+
+       if (!leaf)
+               return payload / (2 * sizeof(struct xfs_rmap_key) +
+                                 sizeof(xfs_rmap_ptr_t));
+
+       return payload / sizeof(struct xfs_rmap_rec);
+}
+
+/*
+ * Compute the maximum height of an rmap btree and store it in
+ * mp->m_rmap_maxlevels.  The calculation is delegated to
+ * xfs_btree_compute_maxlevels(), fed with the rmap minimum-records table and
+ * the number of blocks in an AG (sb_agblocks).
+ */
+void
+xfs_rmapbt_compute_maxlevels(
+       struct xfs_mount                *mp)
+{
+       mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+                       mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
new file mode 100644 (file)
index 0000000..e73a553
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_RMAP_BTREE_H__
+#define __XFS_RMAP_BTREE_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/* rmaps only exist on crc enabled filesystems */
+#define XFS_RMAP_BLOCK_LEN     XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ *
+ * All of them take a 1-based @index and compute an address relative to the
+ * end of the block header (XFS_RMAP_BLOCK_LEN):
+ *
+ *  - records are packed back to back, one struct xfs_rmap_rec per slot;
+ *  - each key slot is two struct xfs_rmap_key wide; XFS_RMAP_KEY_ADDR points
+ *    at the first of the pair and XFS_RMAP_HIGH_KEY_ADDR at the second;
+ *  - pointers start after @maxrecs key slots.
+ */
+#define XFS_RMAP_REC_ADDR(block, index) \
+       ((struct xfs_rmap_rec *) \
+               ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+                (((index) - 1) * sizeof(struct xfs_rmap_rec))))
+
+#define XFS_RMAP_KEY_ADDR(block, index) \
+       ((struct xfs_rmap_key *) \
+               ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+                ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_HIGH_KEY_ADDR(block, index) \
+       ((struct xfs_rmap_key *) \
+               ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+                sizeof(struct xfs_rmap_key) + \
+                ((index) - 1) * 2 * sizeof(struct xfs_rmap_key)))
+
+#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \
+       ((xfs_rmap_ptr_t *) \
+               ((char *)(block) + XFS_RMAP_BLOCK_LEN + \
+                (maxrecs) * 2 * sizeof(struct xfs_rmap_key) + \
+                ((index) - 1) * sizeof(xfs_rmap_ptr_t)))
+
+struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
+                               struct xfs_trans *tp, struct xfs_buf *bp,
+                               xfs_agnumber_t agno);
+int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
+extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
+
+#endif /* __XFS_RMAP_BTREE_H__ */
index 12ca86778e023e4261998660f39b482927a16c02..0e3d4f5ec33c6f945b30b5ad47c9f39c46139ac0 100644 (file)
@@ -24,6 +24,7 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
@@ -36,6 +37,7 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_log.h"
+#include "xfs_rmap_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -729,6 +731,11 @@ xfs_sb_mount_common(
        mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
        mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
 
+       mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
+       mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
+
        mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
        mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
                                        sbp->sb_inopblock);
@@ -738,6 +745,8 @@ xfs_sb_mount_common(
                mp->m_ialloc_min_blks = sbp->sb_spino_align;
        else
                mp->m_ialloc_min_blks = mp->m_ialloc_blks;
+       mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+       mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
 }
 
 /*
index 16002b5ec4eb82c2988fc6f559f0e3ed995ec1e9..0c5b30bd884cdce801780290935a5deb0c7d9de2 100644 (file)
@@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops;
 extern const struct xfs_buf_ops xfs_agf_buf_ops;
 extern const struct xfs_buf_ops xfs_agfl_buf_ops;
 extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
 extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
@@ -116,6 +117,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
 #define        XFS_INO_BTREE_REF       3
 #define        XFS_ALLOC_BTREE_REF     2
 #define        XFS_BMAP_BTREE_REF      2
+#define        XFS_RMAP_BTREE_REF      2
 #define        XFS_DIR_BTREE_REF       2
 #define        XFS_INO_REF             2
 #define        XFS_ATTR_BTREE_REF      1
index 68cb1e7bf2bb1d38e34398ab6dff35be5bd81c18..301ef2f4dbd6258f8981aa2d91bed524a8ce9110 100644 (file)
@@ -63,6 +63,30 @@ xfs_calc_buf_res(
        return nbufs * (size + xfs_buf_log_overhead());
 }
 
+/*
+ * Per-extent log reservation, in blocks, for the btree changes involved in
+ * freeing or allocating an extent.  Two trees (bnobt + cntbt) are always
+ * modified; when the rmapbt feature is enabled a third tree (rmapbt) is
+ * modified as well.  Each tree contributes
+ *
+ *     (2 blocks/level * max depth) - 1
+ *
+ * blocks per operation, and the total is scaled by @num_ops.
+ *
+ * Keep in mind that max depth is calculated separately for each type of tree.
+ */
+static uint
+xfs_allocfree_log_count(
+       struct xfs_mount *mp,
+       uint            num_ops)
+{
+       uint            blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
+
+       if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+               return blocks;
+
+       return blocks + num_ops * (2 * mp->m_rmap_maxlevels - 1);
+}
+
 /*
  * Logging inodes is really tricksy. They are logged in memory format,
  * which means that what we write into the log doesn't directly translate into
@@ -126,7 +150,7 @@ xfs_calc_inode_res(
  */
 STATIC uint
 xfs_calc_finobt_res(
-       struct xfs_mount        *mp,
+       struct xfs_mount        *mp,
        int                     alloc,
        int                     modify)
 {
@@ -137,7 +161,7 @@ xfs_calc_finobt_res(
 
        res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
        if (alloc)
-               res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 
+               res += xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                        XFS_FSB_TO_B(mp, 1));
        if (modify)
                res += (uint)XFS_FSB_TO_B(mp, 1);
@@ -153,9 +177,9 @@ xfs_calc_finobt_res(
  * item logged to try to account for the overhead of the transaction mechanism.
  *
  * Note:  Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
+ * groups into which they could free extents in the xfs_defer_finish() call.
  * This is because the number in the worst case is quite high and quite
- * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * unusual.  In order to fix this we need to change xfs_defer_finish() to free
  * extents in only a single AG at a time.  This will require changes to the
  * EFI code as well, however, so that the EFI for the extents not freed is
  * logged again in each transaction.  See SGI PV #261917.
@@ -188,10 +212,10 @@ xfs_calc_write_reservation(
                     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
                                      XFS_FSB_TO_B(mp, 1)) +
                     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
                                      XFS_FSB_TO_B(mp, 1))),
                    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
                                      XFS_FSB_TO_B(mp, 1))));
 }
 
@@ -217,10 +241,10 @@ xfs_calc_itruncate_reservation(
                     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
                                      XFS_FSB_TO_B(mp, 1))),
                    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
                                      XFS_FSB_TO_B(mp, 1)) +
                    xfs_calc_buf_res(5, 0) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                   xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                     XFS_FSB_TO_B(mp, 1)) +
                    xfs_calc_buf_res(2 + mp->m_ialloc_blks +
                                     mp->m_in_maxlevels, 0)));
@@ -247,7 +271,7 @@ xfs_calc_rename_reservation(
                     xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
                                      XFS_FSB_TO_B(mp, 1))),
                    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
                                      XFS_FSB_TO_B(mp, 1))));
 }
 
@@ -286,7 +310,7 @@ xfs_calc_link_reservation(
                     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
                                      XFS_FSB_TO_B(mp, 1))),
                    (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                      XFS_FSB_TO_B(mp, 1))));
 }
 
@@ -324,7 +348,7 @@ xfs_calc_remove_reservation(
                     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
                                      XFS_FSB_TO_B(mp, 1))),
                    (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
                                      XFS_FSB_TO_B(mp, 1))));
 }
 
@@ -371,7 +395,7 @@ xfs_calc_create_resv_alloc(
                mp->m_sb.sb_sectsize +
                xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
                xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                 XFS_FSB_TO_B(mp, 1));
 }
 
@@ -399,7 +423,7 @@ xfs_calc_icreate_resv_alloc(
        return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
                mp->m_sb.sb_sectsize +
                xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                 XFS_FSB_TO_B(mp, 1)) +
                xfs_calc_finobt_res(mp, 0, 0);
 }
@@ -483,7 +507,7 @@ xfs_calc_ifree_reservation(
                xfs_calc_buf_res(1, 0) +
                xfs_calc_buf_res(2 + mp->m_ialloc_blks +
                                 mp->m_in_maxlevels, 0) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                 XFS_FSB_TO_B(mp, 1)) +
                xfs_calc_finobt_res(mp, 0, 1);
 }
@@ -513,7 +537,7 @@ xfs_calc_growdata_reservation(
        struct xfs_mount        *mp)
 {
        return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                 XFS_FSB_TO_B(mp, 1));
 }
 
@@ -535,7 +559,7 @@ xfs_calc_growrtalloc_reservation(
                xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
                                 XFS_FSB_TO_B(mp, 1)) +
                xfs_calc_inode_res(mp, 1) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                 XFS_FSB_TO_B(mp, 1));
 }
 
@@ -611,7 +635,7 @@ xfs_calc_addafork_reservation(
                xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
                xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
                                 XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+               xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
                                 XFS_FSB_TO_B(mp, 1));
 }
 
@@ -634,7 +658,7 @@ xfs_calc_attrinval_reservation(
                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
                                     XFS_FSB_TO_B(mp, 1))),
                   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                   xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
                                     XFS_FSB_TO_B(mp, 1))));
 }
 
@@ -701,7 +725,7 @@ xfs_calc_attrrm_reservation(
                                        XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
                     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
                    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                    xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
                                      XFS_FSB_TO_B(mp, 1))));
 }
 
index 797815012c0e31fe711132b3c65aecd5591f7e38..0eb46ed6d404da7d3076e8338f56289ae7f83151 100644 (file)
@@ -67,16 +67,6 @@ struct xfs_trans_resv {
 /* shorthand way of accessing reservation structure */
 #define M_RES(mp)      (&(mp)->m_resv)
 
-/*
- * Per-extent log reservation for the allocation btree changes
- * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
- */
-#define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
-       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
-#define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
-       ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
-
 /*
  * Per-directory log reservation for any directory change.
  * dir blocks: (1 btree block per level + data block + free block) * dblock size
index b79dc66b2ecd4afb89f924cb046ac5b37f1ff8d8..3d503647f26b6924ecbe9b702d076bf29220830a 100644 (file)
@@ -108,8 +108,8 @@ typedef enum {
 } xfs_lookup_t;
 
 typedef enum {
-       XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
-       XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+       XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
+       XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 struct xfs_name {
index cd4a850564f2a7d7f48d75c4c6b59f0f2994932e..4ece4f2ffc7271ef7249b4e410f5e48391aa0383 100644 (file)
@@ -25,6 +25,7 @@
 #include "xfs_bit.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_trans.h"
@@ -40,6 +41,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
+#include "xfs_rmap_btree.h"
 
 /* Kernel only BMAP related definitions and functions */
 
@@ -79,95 +81,6 @@ xfs_zero_extent(
                GFP_NOFS, true);
 }
 
-/* Sort bmap items by AG. */
-static int
-xfs_bmap_free_list_cmp(
-       void                    *priv,
-       struct list_head        *a,
-       struct list_head        *b)
-{
-       struct xfs_mount        *mp = priv;
-       struct xfs_bmap_free_item       *ra;
-       struct xfs_bmap_free_item       *rb;
-
-       ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
-       rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
-       return  XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
-               XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
-}
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller.  Frees all the extents that need freeing, which must be done
- * last due to locking considerations.  We never free any extents in
- * the first transaction.
- *
- * If an inode *ip is provided, rejoin it to the transaction if
- * the transaction was committed.
- */
-int                                            /* error */
-xfs_bmap_finish(
-       struct xfs_trans                **tp,   /* transaction pointer addr */
-       struct xfs_bmap_free            *flist, /* i/o: list extents to free */
-       struct xfs_inode                *ip)
-{
-       struct xfs_efd_log_item         *efd;   /* extent free data */
-       struct xfs_efi_log_item         *efi;   /* extent free intention */
-       int                             error;  /* error return value */
-       int                             committed;/* xact committed or not */
-       struct xfs_bmap_free_item       *free;  /* free extent item */
-
-       ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
-       if (flist->xbf_count == 0)
-               return 0;
-
-       list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
-
-       efi = xfs_trans_get_efi(*tp, flist->xbf_count);
-       list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
-               xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
-                       free->xbfi_blockcount);
-
-       error = __xfs_trans_roll(tp, ip, &committed);
-       if (error) {
-               /*
-                * If the transaction was committed, drop the EFD reference
-                * since we're bailing out of here. The other reference is
-                * dropped when the EFI hits the AIL.
-                *
-                * If the transaction was not committed, the EFI is freed by the
-                * EFI item unlock handler on abort. Also, we have a new
-                * transaction so we should return committed=1 even though we're
-                * returning an error.
-                */
-               if (committed) {
-                       xfs_efi_release(efi);
-                       xfs_force_shutdown((*tp)->t_mountp,
-                                          SHUTDOWN_META_IO_ERROR);
-               }
-               return error;
-       }
-
-       /*
-        * Get an EFD and free each extent in the list, logging to the EFD in
-        * the process. The remaining bmap free list is cleaned up by the caller
-        * on error.
-        */
-       efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
-       while (!list_empty(&flist->xbf_flist)) {
-               free = list_first_entry(&flist->xbf_flist,
-                               struct xfs_bmap_free_item, xbfi_list);
-               error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
-                                             free->xbfi_blockcount);
-               if (error)
-                       return error;
-
-               xfs_bmap_del_free(flist, free);
-       }
-
-       return 0;
-}
-
 int
 xfs_bmap_rtalloc(
        struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
@@ -214,9 +127,9 @@ xfs_bmap_rtalloc(
        /*
         * Lock out modifications to both the RT bitmap and summary inodes
         */
-       xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+       xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
        xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-       xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+       xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
        xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
 
        /*
@@ -773,7 +686,7 @@ xfs_bmap_punch_delalloc_range(
                xfs_bmbt_irec_t imap;
                int             nimaps = 1;
                xfs_fsblock_t   firstblock;
-               xfs_bmap_free_t flist;
+               struct xfs_defer_ops dfops;
 
                /*
                 * Map the range first and check that it is a delalloc extent
@@ -804,18 +717,18 @@ xfs_bmap_punch_delalloc_range(
                WARN_ON(imap.br_blockcount == 0);
 
                /*
-                * Note: while we initialise the firstblock/flist pair, they
+                * Note: while we initialise the firstblock/dfops pair, they
                 * should never be used because blocks should never be
                 * allocated or freed for a delalloc extent and hence we need
                 * don't cancel or finish them after the xfs_bunmapi() call.
                 */
-               xfs_bmap_init(&flist, &firstblock);
+               xfs_defer_init(&dfops, &firstblock);
                error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
-                                       &flist, &done);
+                                       &dfops, &done);
                if (error)
                        break;
 
-               ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
+               ASSERT(!xfs_defer_has_unfinished_work(&dfops));
 next_block:
                start_fsb++;
                remaining--;
@@ -972,7 +885,7 @@ xfs_alloc_file_space(
        int                     rt;
        xfs_trans_t             *tp;
        xfs_bmbt_irec_t         imaps[1], *imapp;
-       xfs_bmap_free_t         free_list;
+       struct xfs_defer_ops    dfops;
        uint                    qblocks, resblks, resrtextents;
        int                     error;
 
@@ -1063,17 +976,17 @@ xfs_alloc_file_space(
 
                xfs_trans_ijoin(tp, ip, 0);
 
-               xfs_bmap_init(&free_list, &firstfsb);
+               xfs_defer_init(&dfops, &firstfsb);
                error = xfs_bmapi_write(tp, ip, startoffset_fsb,
                                        allocatesize_fsb, alloc_type, &firstfsb,
-                                       resblks, imapp, &nimaps, &free_list);
+                                       resblks, imapp, &nimaps, &dfops);
                if (error)
                        goto error0;
 
                /*
                 * Complete the transaction
                 */
-               error = xfs_bmap_finish(&tp, &free_list, NULL);
+               error = xfs_defer_finish(&tp, &dfops, NULL);
                if (error)
                        goto error0;
 
@@ -1096,7 +1009,7 @@ xfs_alloc_file_space(
        return error;
 
 error0:        /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
 
 error1:        /* Just cancel transaction */
@@ -1114,7 +1027,7 @@ xfs_unmap_extent(
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
-       struct xfs_bmap_free    free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           firstfsb;
        uint                    resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
        int                     error;
@@ -1133,13 +1046,13 @@ xfs_unmap_extent(
 
        xfs_trans_ijoin(tp, ip, 0);
 
-       xfs_bmap_init(&free_list, &firstfsb);
+       xfs_defer_init(&dfops, &firstfsb);
        error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
-                       &free_list, done);
+                       &dfops, done);
        if (error)
                goto out_bmap_cancel;
 
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, ip);
        if (error)
                goto out_bmap_cancel;
 
@@ -1149,7 +1062,7 @@ out_unlock:
        return error;
 
 out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
 out_trans_cancel:
        xfs_trans_cancel(tp);
        goto out_unlock;
@@ -1338,7 +1251,7 @@ xfs_shift_file_space(
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        int                     error;
-       struct xfs_bmap_free    free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        xfs_fileoff_t           stop_fsb;
        xfs_fileoff_t           next_fsb;
@@ -1416,19 +1329,19 @@ xfs_shift_file_space(
 
                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
-               xfs_bmap_init(&free_list, &first_block);
+               xfs_defer_init(&dfops, &first_block);
 
                /*
                 * We are using the write transaction in which max 2 bmbt
                 * updates are allowed
                 */
                error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
-                               &done, stop_fsb, &first_block, &free_list,
+                               &done, stop_fsb, &first_block, &dfops,
                                direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
                if (error)
                        goto out_bmap_cancel;
 
-               error = xfs_bmap_finish(&tp, &free_list, NULL);
+               error = xfs_defer_finish(&tp, &dfops, NULL);
                if (error)
                        goto out_bmap_cancel;
 
@@ -1438,7 +1351,7 @@ xfs_shift_file_space(
        return error;
 
 out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
 out_trans_cancel:
        xfs_trans_cancel(tp);
        return error;
@@ -1622,6 +1535,10 @@ xfs_swap_extents(
        __uint64_t      tmp;
        int             lock_flags;
 
+       /* XXX: we can't do this with rmap, will fix later */
+       if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+               return -EOPNOTSUPP;
+
        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
        if (!tempifp) {
                error = -ENOMEM;
index f20071432ca6222e0d103ac8d4a54380e90565e0..68a621a8e0c0700adbd6369e3a0a588c43ebf114 100644 (file)
@@ -21,7 +21,7 @@
 /* Kernel only BMAP related definitions and functions */
 
 struct xfs_bmbt_irec;
-struct xfs_bmap_free_item;
+struct xfs_extent_free_item;
 struct xfs_ifork;
 struct xfs_inode;
 struct xfs_mount;
@@ -40,8 +40,6 @@ int   xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
                xfs_bmap_format_t formatter, void *arg);
 
 /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
-void   xfs_bmap_del_free(struct xfs_bmap_free *flist,
-                         struct xfs_bmap_free_item *free);
 int    xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
                               struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
                               int rt, int eof, int delay, int convert,
index 272c3f8b6f7d0f11a0564e40b00b0568a28683dd..4ff499aa7338f6b7ea098955710938de67791dd4 100644 (file)
@@ -179,7 +179,7 @@ xfs_ioc_trim(
         * matter as trimming blocks is an advisory interface.
         */
        if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
-           range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+           range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
            range.len < mp->m_sb.sb_blocksize)
                return -EINVAL;
 
index ccb0811963b27d2b112b23df77337ab9b2c54c2c..7a30b8f11db7a26f8a82ded531e8a5170ea03ad5 100644 (file)
@@ -23,6 +23,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
@@ -307,7 +308,7 @@ xfs_qm_dqalloc(
        xfs_buf_t       **O_bpp)
 {
        xfs_fsblock_t   firstblock;
-       xfs_bmap_free_t flist;
+       struct xfs_defer_ops dfops;
        xfs_bmbt_irec_t map;
        int             nmaps, error;
        xfs_buf_t       *bp;
@@ -320,7 +321,7 @@ xfs_qm_dqalloc(
        /*
         * Initialize the bmap freelist prior to calling bmapi code.
         */
-       xfs_bmap_init(&flist, &firstblock);
+       xfs_defer_init(&dfops, &firstblock);
        xfs_ilock(quotip, XFS_ILOCK_EXCL);
        /*
         * Return if this type of quotas is turned off while we didn't
@@ -336,7 +337,7 @@ xfs_qm_dqalloc(
        error = xfs_bmapi_write(tp, quotip, offset_fsb,
                                XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
                                &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
-                               &map, &nmaps, &flist);
+                               &map, &nmaps, &dfops);
        if (error)
                goto error0;
        ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -368,7 +369,7 @@ xfs_qm_dqalloc(
                              dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
 
        /*
-        * xfs_bmap_finish() may commit the current transaction and
+        * xfs_defer_finish() may commit the current transaction and
         * start a second transaction if the freelist is not empty.
         *
         * Since we still want to modify this buffer, we need to
@@ -382,7 +383,7 @@ xfs_qm_dqalloc(
 
        xfs_trans_bhold(tp, bp);
 
-       error = xfs_bmap_finish(tpp, &flist, NULL);
+       error = xfs_defer_finish(tpp, &dfops, NULL);
        if (error)
                goto error1;
 
@@ -398,7 +399,7 @@ xfs_qm_dqalloc(
        return 0;
 
 error1:
-       xfs_bmap_cancel(&flist);
+       xfs_defer_cancel(&dfops);
 error0:
        xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 
index 2e4f67f688560b39b9a58aa54a4168607f495918..3d224702fbc0c4f6469d1a475ffbbc70beadaead 100644 (file)
@@ -90,7 +90,9 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 #define XFS_ERRTAG_STRATCMPL_IOERR                     19
 #define XFS_ERRTAG_DIOWRITE_IOERR                      20
 #define XFS_ERRTAG_BMAPIFORMAT                         21
-#define XFS_ERRTAG_MAX                                 22
+#define XFS_ERRTAG_FREE_EXTENT                         22
+#define XFS_ERRTAG_RMAP_FINISH_ONE                     23
+#define XFS_ERRTAG_MAX                                 24
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -117,6 +119,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 #define XFS_RANDOM_STRATCMPL_IOERR                     (XFS_RANDOM_DEFAULT/10)
 #define XFS_RANDOM_DIOWRITE_IOERR                      (XFS_RANDOM_DEFAULT/10)
 #define        XFS_RANDOM_BMAPIFORMAT                          XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_FREE_EXTENT                         1
+#define XFS_RANDOM_RMAP_FINISH_ONE                     1
 
 #ifdef DEBUG
 extern int xfs_error_test_active;
index ab779460ecbf3ab8d4c89fecdde1b4a48e66b746..d7bc14906af87f14ef570a968ead42100445c8bd 100644 (file)
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_bit.h"
 #include "xfs_mount.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_log.h"
+#include "xfs_btree.h"
+#include "xfs_rmap.h"
 
 
 kmem_zone_t    *xfs_efi_zone;
@@ -486,3 +489,69 @@ xfs_efd_init(
 
        return efdp;
 }
+
+/*
+ * Process an extent free intent item that was recovered from
+ * the log.  We need to free the extents that it describes.
+ */
+int
+xfs_efi_recover(
+       struct xfs_mount        *mp,
+       struct xfs_efi_log_item *efip)
+{
+       struct xfs_efd_log_item *efdp;
+       struct xfs_trans        *tp;
+       int                     i;
+       int                     error = 0;
+       xfs_extent_t            *extp;
+       xfs_fsblock_t           startblock_fsb;
+       struct xfs_owner_info   oinfo;
+
+       ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+
+       /*
+        * First check the validity of the extents described by the
+        * EFI.  If any are bad, then assume that all are bad and
+        * just toss the EFI.
+        */
+       for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+               extp = &efip->efi_format.efi_extents[i];
+               startblock_fsb = XFS_BB_TO_FSB(mp,
+                                  XFS_FSB_TO_DADDR(mp, extp->ext_start));
+               if (startblock_fsb == 0 ||
+                   extp->ext_len == 0 ||
+                   startblock_fsb >= mp->m_sb.sb_dblocks ||
+                   extp->ext_len >= mp->m_sb.sb_agblocks) {
+                       /*
+                        * This will pull the EFI from the AIL and
+                        * free the memory associated with it.
+                        */
+                       set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+                       xfs_efi_release(efip);
+                       return -EIO;
+               }
+       }
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+       if (error)
+               return error;
+       efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+
+       xfs_rmap_skip_owner_update(&oinfo);
+       for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+               extp = &efip->efi_format.efi_extents[i];
+               error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
+                                             extp->ext_len, &oinfo);
+               if (error)
+                       goto abort_error;
+
+       }
+
+       set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+       error = xfs_trans_commit(tp);
+       return error;
+
+abort_error:
+       xfs_trans_cancel(tp);
+       return error;
+}
index 8fa8651705e1dc33bb84f1234b411aca9c73ef76..a32c794a86b7b48761aac60fe598a6f8876d63f1 100644 (file)
@@ -98,4 +98,7 @@ int                   xfs_efi_copy_format(xfs_log_iovec_t *buf,
 void                   xfs_efi_item_free(xfs_efi_log_item_t *);
 void                   xfs_efi_release(struct xfs_efi_log_item *);
 
+int                    xfs_efi_recover(struct xfs_mount *mp,
+                                       struct xfs_efi_log_item *efip);
+
 #endif /* __XFS_EXTFREE_ITEM_H__ */
index a51353a1f87f1a5e78064c0598f42397ece8f767..4a33a3304369109f2864bf97362962865f56c930 100644 (file)
@@ -22,6 +22,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
@@ -385,7 +386,7 @@ xfs_filestream_new_ag(
        }
 
        flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
-               (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
+               (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0);
 
        err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
 
index 7191c3878b4a774e26be7a0d6a40a750edfa6538..0f96847b90e1175d2c6d0f497fe278e67f95e141 100644 (file)
@@ -23,6 +23,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
@@ -32,6 +33,7 @@
 #include "xfs_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
+#include "xfs_rmap_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_fsops.h"
 #include "xfs_itable.h"
@@ -40,6 +42,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_filestream.h"
+#include "xfs_rmap.h"
 
 /*
  * File system operations
@@ -103,7 +106,9 @@ xfs_fs_geometry(
                        (xfs_sb_version_hasfinobt(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
                        (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
-                               XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
+                               XFS_FSOP_GEOM_FLAGS_SPINODES : 0) |
+                       (xfs_sb_version_hasrmapbt(&mp->m_sb) ?
+                               XFS_FSOP_GEOM_FLAGS_RMAPBT : 0);
                geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
                                mp->m_sb.sb_logsectsize : BBSIZE;
                geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -239,10 +244,16 @@ xfs_growfs_data_private(
                agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
                agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
                agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+               if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+                       agf->agf_roots[XFS_BTNUM_RMAPi] =
+                                               cpu_to_be32(XFS_RMAP_BLOCK(mp));
+                       agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+               }
+
                agf->agf_flfirst = cpu_to_be32(1);
                agf->agf_fllast = 0;
                agf->agf_flcount = 0;
-               tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
+               tmpsize = agsize - mp->m_ag_prealloc_blocks;
                agf->agf_freeblks = cpu_to_be32(tmpsize);
                agf->agf_longest = cpu_to_be32(tmpsize);
                if (xfs_sb_version_hascrc(&mp->m_sb))
@@ -339,7 +350,7 @@ xfs_growfs_data_private(
                                                agno, 0);
 
                arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-               arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+               arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
 
@@ -368,7 +379,7 @@ xfs_growfs_data_private(
                                                agno, 0);
 
                arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-               arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+               arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
                nfree += be32_to_cpu(arec->ar_blockcount);
@@ -378,6 +389,72 @@ xfs_growfs_data_private(
                if (error)
                        goto error0;
 
+               /* RMAP btree root block */
+               if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+                       struct xfs_rmap_rec     *rrec;
+                       struct xfs_btree_block  *block;
+
+                       bp = xfs_growfs_get_hdr_buf(mp,
+                               XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)),
+                               BTOBB(mp->m_sb.sb_blocksize), 0,
+                               &xfs_rmapbt_buf_ops);
+                       if (!bp) {
+                               error = -ENOMEM;
+                               goto error0;
+                       }
+
+                       xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0,
+                                               agno, XFS_BTREE_CRC_BLOCKS);
+                       block = XFS_BUF_TO_BLOCK(bp);
+
+
+                       /*
+                        * Mark the AG header regions as static metadata.  The
+                        * BNO btree block is the first block after the headers,
+                        * so its location defines the size of the region the
+                        * static metadata consumes.
+                        *
+                        * Note: unlike mkfs, we never have to account for log
+                        * space when growing the data regions
+                        */
+                       rrec = XFS_RMAP_REC_ADDR(block, 1);
+                       rrec->rm_startblock = 0;
+                       rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+                       rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+                       rrec->rm_offset = 0;
+                       be16_add_cpu(&block->bb_numrecs, 1);
+
+                       /* account freespace btree root blocks */
+                       rrec = XFS_RMAP_REC_ADDR(block, 2);
+                       rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+                       rrec->rm_blockcount = cpu_to_be32(2);
+                       rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+                       rrec->rm_offset = 0;
+                       be16_add_cpu(&block->bb_numrecs, 1);
+
+                       /* account inode btree root blocks */
+                       rrec = XFS_RMAP_REC_ADDR(block, 3);
+                       rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+                       rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+                                                       XFS_IBT_BLOCK(mp));
+                       rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+                       rrec->rm_offset = 0;
+                       be16_add_cpu(&block->bb_numrecs, 1);
+
+                       /* account for rmap btree root */
+                       rrec = XFS_RMAP_REC_ADDR(block, 4);
+                       rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+                       rrec->rm_blockcount = cpu_to_be32(1);
+                       rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+                       rrec->rm_offset = 0;
+                       be16_add_cpu(&block->bb_numrecs, 1);
+
+                       error = xfs_bwrite(bp);
+                       xfs_buf_relse(bp);
+                       if (error)
+                               goto error0;
+               }
+
                /*
                 * INO btree root block
                 */
@@ -435,6 +512,8 @@ xfs_growfs_data_private(
         * There are new blocks in the old last a.g.
         */
        if (new) {
+               struct xfs_owner_info   oinfo;
+
                /*
                 * Change the agi length.
                 */
@@ -462,14 +541,20 @@ xfs_growfs_data_private(
                       be32_to_cpu(agi->agi_length));
 
                xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+
                /*
                 * Free the new space.
+                *
+                * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+                * the new space doesn't actually exist in the rmap btree.
                 */
-               error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
-                       be32_to_cpu(agf->agf_length) - new), new);
-               if (error) {
+               xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
+               error = xfs_free_extent(tp,
+                               XFS_AGB_TO_FSB(mp, agno,
+                                       be32_to_cpu(agf->agf_length) - new),
+                               new, &oinfo);
+               if (error)
                        goto error0;
-               }
        }
 
        /*
@@ -501,6 +586,7 @@ xfs_growfs_data_private(
        } else
                mp->m_maxicount = 0;
        xfs_set_low_space_thresholds(mp);
+       mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
        /* update secondary superblocks. */
        for (agno = 1; agno < nagcount; agno++) {
@@ -638,7 +724,7 @@ xfs_fs_counts(
        cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
        cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
        cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
-                                                       XFS_ALLOC_SET_ASIDE(mp);
+                                               mp->m_alloc_set_aside;
 
        spin_lock(&mp->m_sb_lock);
        cnt->freertx = mp->m_sb.sb_frextents;
@@ -726,7 +812,7 @@ xfs_reserve_blocks(
        error = -ENOSPC;
        do {
                free = percpu_counter_sum(&mp->m_fdblocks) -
-                                                       XFS_ALLOC_SET_ASIDE(mp);
+                                               mp->m_alloc_set_aside;
                if (!free)
                        break;
 
index 8825bcfd314c1d228b83a065fa901ce6a1be7128..e08eaea6327b5c4752264c7a54996b75fd2e2447 100644 (file)
@@ -25,6 +25,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
@@ -1122,7 +1123,7 @@ xfs_create(
        struct xfs_inode        *ip = NULL;
        struct xfs_trans        *tp = NULL;
        int                     error;
-       xfs_bmap_free_t         free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        bool                    unlock_dp_on_error = false;
        prid_t                  prid;
@@ -1182,7 +1183,7 @@ xfs_create(
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;
 
-       xfs_bmap_init(&free_list, &first_block);
+       xfs_defer_init(&dfops, &first_block);
 
        /*
         * Reserve disk quota and the inode.
@@ -1219,7 +1220,7 @@ xfs_create(
        unlock_dp_on_error = false;
 
        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
-                                       &first_block, &free_list, resblks ?
+                                       &first_block, &dfops, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != -ENOSPC);
@@ -1253,7 +1254,7 @@ xfs_create(
         */
        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
 
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error)
                goto out_bmap_cancel;
 
@@ -1269,7 +1270,7 @@ xfs_create(
        return 0;
 
  out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
  out_trans_cancel:
        xfs_trans_cancel(tp);
  out_release_inode:
@@ -1401,7 +1402,7 @@ xfs_link(
        xfs_mount_t             *mp = tdp->i_mount;
        xfs_trans_t             *tp;
        int                     error;
-       xfs_bmap_free_t         free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        int                     resblks;
 
@@ -1452,7 +1453,7 @@ xfs_link(
                        goto error_return;
        }
 
-       xfs_bmap_init(&free_list, &first_block);
+       xfs_defer_init(&dfops, &first_block);
 
        /*
         * Handle initial link state of O_TMPFILE inode
@@ -1464,7 +1465,7 @@ xfs_link(
        }
 
        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
-                                       &first_block, &free_list, resblks);
+                                       &first_block, &dfops, resblks);
        if (error)
                goto error_return;
        xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1482,9 +1483,9 @@ xfs_link(
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
                xfs_trans_set_sync(tp);
 
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error) {
-               xfs_bmap_cancel(&free_list);
+               xfs_defer_cancel(&dfops);
                goto error_return;
        }
 
@@ -1526,7 +1527,7 @@ xfs_itruncate_extents(
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp = *tpp;
-       xfs_bmap_free_t         free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        xfs_fileoff_t           first_unmap_block;
        xfs_fileoff_t           last_block;
@@ -1562,12 +1563,12 @@ xfs_itruncate_extents(
        ASSERT(first_unmap_block < last_block);
        unmap_len = last_block - first_unmap_block + 1;
        while (!done) {
-               xfs_bmap_init(&free_list, &first_block);
+               xfs_defer_init(&dfops, &first_block);
                error = xfs_bunmapi(tp, ip,
                                    first_unmap_block, unmap_len,
                                    xfs_bmapi_aflag(whichfork),
                                    XFS_ITRUNC_MAX_EXTENTS,
-                                   &first_block, &free_list,
+                                   &first_block, &dfops,
                                    &done);
                if (error)
                        goto out_bmap_cancel;
@@ -1576,7 +1577,7 @@ xfs_itruncate_extents(
                 * Duplicate the transaction that has the permanent
                 * reservation and commit the old transaction.
                 */
-               error = xfs_bmap_finish(&tp, &free_list, ip);
+               error = xfs_defer_finish(&tp, &dfops, ip);
                if (error)
                        goto out_bmap_cancel;
 
@@ -1602,7 +1603,7 @@ out_bmap_cancel:
         * the transaction can be properly aborted.  We just need to make sure
         * we're not holding any resources that we were not when we came in.
         */
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
        goto out;
 }
 
@@ -1743,7 +1744,7 @@ STATIC int
 xfs_inactive_ifree(
        struct xfs_inode *ip)
 {
-       xfs_bmap_free_t         free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
@@ -1780,8 +1781,8 @@ xfs_inactive_ifree(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
 
-       xfs_bmap_init(&free_list, &first_block);
-       error = xfs_ifree(tp, ip, &free_list);
+       xfs_defer_init(&dfops, &first_block);
+       error = xfs_ifree(tp, ip, &dfops);
        if (error) {
                /*
                 * If we fail to free the inode, shut down.  The cancel
@@ -1807,11 +1808,11 @@ xfs_inactive_ifree(
         * Just ignore errors at this point.  There is nothing we can do except
         * to try to keep going. Make sure it's not a silent error.
         */
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error) {
-               xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+               xfs_notice(mp, "%s: xfs_defer_finish returned error %d",
                        __func__, error);
-               xfs_bmap_cancel(&free_list);
+               xfs_defer_cancel(&dfops);
        }
        error = xfs_trans_commit(tp);
        if (error)
@@ -2367,7 +2368,7 @@ int
 xfs_ifree(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
-       xfs_bmap_free_t *flist)
+       struct xfs_defer_ops    *dfops)
 {
        int                     error;
        struct xfs_icluster     xic = { 0 };
@@ -2386,7 +2387,7 @@ xfs_ifree(
        if (error)
                return error;
 
-       error = xfs_difree(tp, ip->i_ino, flist, &xic);
+       error = xfs_difree(tp, ip->i_ino, dfops, &xic);
        if (error)
                return error;
 
@@ -2474,7 +2475,7 @@ xfs_iunpin_wait(
  * directory entry.
  *
  * This is still safe from a transactional point of view - it is not until we
- * get to xfs_bmap_finish() that we have the possibility of multiple
+ * get to xfs_defer_finish() that we have the possibility of multiple
  * transactions in this operation. Hence as long as we remove the directory
  * entry and drop the link count in the first transaction of the remove
  * operation, there are no transactional constraints on the ordering here.
@@ -2489,7 +2490,7 @@ xfs_remove(
        xfs_trans_t             *tp = NULL;
        int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
        int                     error = 0;
-       xfs_bmap_free_t         free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        uint                    resblks;
 
@@ -2571,9 +2572,9 @@ xfs_remove(
        if (error)
                goto out_trans_cancel;
 
-       xfs_bmap_init(&free_list, &first_block);
+       xfs_defer_init(&dfops, &first_block);
        error = xfs_dir_removename(tp, dp, name, ip->i_ino,
-                                       &first_block, &free_list, resblks);
+                                       &first_block, &dfops, resblks);
        if (error) {
                ASSERT(error != -ENOENT);
                goto out_bmap_cancel;
@@ -2587,7 +2588,7 @@ xfs_remove(
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
                xfs_trans_set_sync(tp);
 
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error)
                goto out_bmap_cancel;
 
@@ -2601,7 +2602,7 @@ xfs_remove(
        return 0;
 
  out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
  out_trans_cancel:
        xfs_trans_cancel(tp);
  std_return:
@@ -2662,7 +2663,7 @@ xfs_sort_for_rename(
 static int
 xfs_finish_rename(
        struct xfs_trans        *tp,
-       struct xfs_bmap_free    *free_list)
+       struct xfs_defer_ops    *dfops)
 {
        int                     error;
 
@@ -2673,9 +2674,9 @@ xfs_finish_rename(
        if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
                xfs_trans_set_sync(tp);
 
-       error = xfs_bmap_finish(&tp, free_list, NULL);
+       error = xfs_defer_finish(&tp, dfops, NULL);
        if (error) {
-               xfs_bmap_cancel(free_list);
+               xfs_defer_cancel(dfops);
                xfs_trans_cancel(tp);
                return error;
        }
@@ -2697,7 +2698,7 @@ xfs_cross_rename(
        struct xfs_inode        *dp2,
        struct xfs_name         *name2,
        struct xfs_inode        *ip2,
-       struct xfs_bmap_free    *free_list,
+       struct xfs_defer_ops    *dfops,
        xfs_fsblock_t           *first_block,
        int                     spaceres)
 {
@@ -2709,14 +2710,14 @@ xfs_cross_rename(
        /* Swap inode number for dirent in first parent */
        error = xfs_dir_replace(tp, dp1, name1,
                                ip2->i_ino,
-                               first_block, free_list, spaceres);
+                               first_block, dfops, spaceres);
        if (error)
                goto out_trans_abort;
 
        /* Swap inode number for dirent in second parent */
        error = xfs_dir_replace(tp, dp2, name2,
                                ip1->i_ino,
-                               first_block, free_list, spaceres);
+                               first_block, dfops, spaceres);
        if (error)
                goto out_trans_abort;
 
@@ -2731,7 +2732,7 @@ xfs_cross_rename(
                if (S_ISDIR(VFS_I(ip2)->i_mode)) {
                        error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
                                                dp1->i_ino, first_block,
-                                               free_list, spaceres);
+                                               dfops, spaceres);
                        if (error)
                                goto out_trans_abort;
 
@@ -2758,7 +2759,7 @@ xfs_cross_rename(
                if (S_ISDIR(VFS_I(ip1)->i_mode)) {
                        error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
                                                dp2->i_ino, first_block,
-                                               free_list, spaceres);
+                                               dfops, spaceres);
                        if (error)
                                goto out_trans_abort;
 
@@ -2797,10 +2798,10 @@ xfs_cross_rename(
        }
        xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
-       return xfs_finish_rename(tp, free_list);
+       return xfs_finish_rename(tp, dfops);
 
 out_trans_abort:
-       xfs_bmap_cancel(free_list);
+       xfs_defer_cancel(dfops);
        xfs_trans_cancel(tp);
        return error;
 }
@@ -2855,7 +2856,7 @@ xfs_rename(
 {
        struct xfs_mount        *mp = src_dp->i_mount;
        struct xfs_trans        *tp;
-       struct xfs_bmap_free    free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        struct xfs_inode        *wip = NULL;            /* whiteout inode */
        struct xfs_inode        *inodes[__XFS_SORT_INODES];
@@ -2944,13 +2945,13 @@ xfs_rename(
                goto out_trans_cancel;
        }
 
-       xfs_bmap_init(&free_list, &first_block);
+       xfs_defer_init(&dfops, &first_block);
 
        /* RENAME_EXCHANGE is unique from here on. */
        if (flags & RENAME_EXCHANGE)
                return xfs_cross_rename(tp, src_dp, src_name, src_ip,
                                        target_dp, target_name, target_ip,
-                                       &free_list, &first_block, spaceres);
+                                       &dfops, &first_block, spaceres);
 
        /*
         * Set up the target.
@@ -2972,7 +2973,7 @@ xfs_rename(
                 */
                error = xfs_dir_createname(tp, target_dp, target_name,
                                                src_ip->i_ino, &first_block,
-                                               &free_list, spaceres);
+                                               &dfops, spaceres);
                if (error)
                        goto out_bmap_cancel;
 
@@ -3012,7 +3013,7 @@ xfs_rename(
                 */
                error = xfs_dir_replace(tp, target_dp, target_name,
                                        src_ip->i_ino,
-                                       &first_block, &free_list, spaceres);
+                                       &first_block, &dfops, spaceres);
                if (error)
                        goto out_bmap_cancel;
 
@@ -3047,7 +3048,7 @@ xfs_rename(
                 */
                error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
                                        target_dp->i_ino,
-                                       &first_block, &free_list, spaceres);
+                                       &first_block, &dfops, spaceres);
                ASSERT(error != -EEXIST);
                if (error)
                        goto out_bmap_cancel;
@@ -3086,10 +3087,10 @@ xfs_rename(
         */
        if (wip) {
                error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
-                                       &first_block, &free_list, spaceres);
+                                       &first_block, &dfops, spaceres);
        } else
                error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
-                                          &first_block, &free_list, spaceres);
+                                          &first_block, &dfops, spaceres);
        if (error)
                goto out_bmap_cancel;
 
@@ -3124,13 +3125,13 @@ xfs_rename(
        if (new_parent)
                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
 
-       error = xfs_finish_rename(tp, &free_list);
+       error = xfs_finish_rename(tp, &dfops);
        if (wip)
                IRELE(wip);
        return error;
 
 out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
 out_trans_cancel:
        xfs_trans_cancel(tp);
 out_release_wip:
index 8eb78ec4a6e227d6034848edaf2a8fce77c2ca7e..e1a411e08f00f6b6e0815b6f7af3a58aed746db2 100644 (file)
@@ -27,7 +27,7 @@
 struct xfs_dinode;
 struct xfs_inode;
 struct xfs_buf;
-struct xfs_bmap_free;
+struct xfs_defer_ops;
 struct xfs_bmbt_irec;
 struct xfs_inode_log_item;
 struct xfs_mount;
@@ -398,7 +398,7 @@ uint                xfs_ilock_attr_map_shared(struct xfs_inode *);
 
 uint           xfs_ip2xflags(struct xfs_inode *);
 int            xfs_ifree(struct xfs_trans *, xfs_inode_t *,
-                          struct xfs_bmap_free *);
+                          struct xfs_defer_ops *);
 int            xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
                                      int, xfs_fsize_t);
 void           xfs_iext_realloc(xfs_inode_t *, int, int);
index 9a7c87809d3b331bce018873f06ebae2ecc700ec..cf46658392ceadebc09b35ccf4dd5f7b4db8ddb5 100644 (file)
@@ -387,6 +387,7 @@ xfs_attrlist_by_handle(
 {
        int                     error = -ENOMEM;
        attrlist_cursor_kern_t  *cursor;
+       struct xfs_fsop_attrlist_handlereq __user       *p = arg;
        xfs_fsop_attrlist_handlereq_t al_hreq;
        struct dentry           *dentry;
        char                    *kbuf;
@@ -419,6 +420,11 @@ xfs_attrlist_by_handle(
        if (error)
                goto out_kfree;
 
+       if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+               error = -EFAULT;
+               goto out_kfree;
+       }
+
        if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
                error = -EFAULT;
 
index 620fc91204443c62466283c1e92553656cde35f1..2114d53df433134a35084635b5238e2a775c6f0f 100644 (file)
@@ -23,6 +23,7 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_bmap_btree.h"
@@ -128,7 +129,7 @@ xfs_iomap_write_direct(
        int             quota_flag;
        int             rt;
        xfs_trans_t     *tp;
-       xfs_bmap_free_t free_list;
+       struct xfs_defer_ops dfops;
        uint            qblocks, resblks, resrtextents;
        int             error;
        int             lockmode;
@@ -231,18 +232,18 @@ xfs_iomap_write_direct(
         * From this point onwards we overwrite the imap pointer that the
         * caller gave to us.
         */
-       xfs_bmap_init(&free_list, &firstfsb);
+       xfs_defer_init(&dfops, &firstfsb);
        nimaps = 1;
        error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
                                bmapi_flags, &firstfsb, resblks, imap,
-                               &nimaps, &free_list);
+                               &nimaps, &dfops);
        if (error)
                goto out_bmap_cancel;
 
        /*
         * Complete the transaction
         */
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error)
                goto out_bmap_cancel;
 
@@ -266,7 +267,7 @@ out_unlock:
        return error;
 
 out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
 out_trans_cancel:
        xfs_trans_cancel(tp);
@@ -685,7 +686,7 @@ xfs_iomap_write_allocate(
        xfs_fileoff_t   offset_fsb, last_block;
        xfs_fileoff_t   end_fsb, map_start_fsb;
        xfs_fsblock_t   first_block;
-       xfs_bmap_free_t free_list;
+       struct xfs_defer_ops    dfops;
        xfs_filblks_t   count_fsb;
        xfs_trans_t     *tp;
        int             nimaps;
@@ -727,7 +728,7 @@ xfs_iomap_write_allocate(
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        xfs_trans_ijoin(tp, ip, 0);
 
-                       xfs_bmap_init(&free_list, &first_block);
+                       xfs_defer_init(&dfops, &first_block);
 
                        /*
                         * it is possible that the extents have changed since
@@ -783,11 +784,11 @@ xfs_iomap_write_allocate(
                        error = xfs_bmapi_write(tp, ip, map_start_fsb,
                                                count_fsb, 0, &first_block,
                                                nres, imap, &nimaps,
-                                               &free_list);
+                                               &dfops);
                        if (error)
                                goto trans_cancel;
 
-                       error = xfs_bmap_finish(&tp, &free_list, NULL);
+                       error = xfs_defer_finish(&tp, &dfops, NULL);
                        if (error)
                                goto trans_cancel;
 
@@ -821,7 +822,7 @@ xfs_iomap_write_allocate(
        }
 
 trans_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
        xfs_trans_cancel(tp);
 error0:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -842,7 +843,7 @@ xfs_iomap_write_unwritten(
        int             nimaps;
        xfs_trans_t     *tp;
        xfs_bmbt_irec_t imap;
-       xfs_bmap_free_t free_list;
+       struct xfs_defer_ops dfops;
        xfs_fsize_t     i_size;
        uint            resblks;
        int             error;
@@ -886,11 +887,11 @@ xfs_iomap_write_unwritten(
                /*
                 * Modify the unwritten extent state of the buffer.
                 */
-               xfs_bmap_init(&free_list, &firstfsb);
+               xfs_defer_init(&dfops, &firstfsb);
                nimaps = 1;
                error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
                                        XFS_BMAPI_CONVERT, &firstfsb, resblks,
-                                       &imap, &nimaps, &free_list);
+                                       &imap, &nimaps, &dfops);
                if (error)
                        goto error_on_bmapi_transaction;
 
@@ -909,7 +910,7 @@ xfs_iomap_write_unwritten(
                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                }
 
-               error = xfs_bmap_finish(&tp, &free_list, NULL);
+               error = xfs_defer_finish(&tp, &dfops, NULL);
                if (error)
                        goto error_on_bmapi_transaction;
 
@@ -936,7 +937,7 @@ xfs_iomap_write_unwritten(
        return 0;
 
 error_on_bmapi_transaction:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
index 83599784384686c2cb306bd2c2843422d5a2966c..e8638fd2c0c3a046c9ede4a2e4891c8b12110d92 100644 (file)
@@ -43,6 +43,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_error.h"
 #include "xfs_dir2.h"
+#include "xfs_rmap_item.h"
 
 #define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
 
@@ -1911,6 +1912,8 @@ xlog_recover_reorder_trans(
                case XFS_LI_QUOTAOFF:
                case XFS_LI_EFD:
                case XFS_LI_EFI:
+               case XFS_LI_RUI:
+               case XFS_LI_RUD:
                        trace_xfs_log_recover_item_reorder_tail(log,
                                                        trans, item, pass);
                        list_move_tail(&item->ri_list, &inode_list);
@@ -2228,6 +2231,7 @@ xlog_recover_get_buf_lsn(
        case XFS_ABTC_CRC_MAGIC:
        case XFS_ABTB_MAGIC:
        case XFS_ABTC_MAGIC:
+       case XFS_RMAP_CRC_MAGIC:
        case XFS_IBT_CRC_MAGIC:
        case XFS_IBT_MAGIC: {
                struct xfs_btree_block *btb = blk;
@@ -2396,6 +2400,9 @@ xlog_recover_validate_buf_type(
                case XFS_BMAP_MAGIC:
                        bp->b_ops = &xfs_bmbt_buf_ops;
                        break;
+               case XFS_RMAP_CRC_MAGIC:
+                       bp->b_ops = &xfs_rmapbt_buf_ops;
+                       break;
                default:
                        xfs_warn(mp, "Bad btree block magic!");
                        ASSERT(0);
@@ -3414,6 +3421,99 @@ xlog_recover_efd_pass2(
        return 0;
 }
 
+/*
+ * Pass 2 recovery of a logged RUI (rmap update intent): allocate an
+ * in-core RUI sized by the logged rui_nextents, copy the rui format from
+ * item->ri_buf[0] into it, and add it to the AIL with the given LSN.
+ * Returns 0, or the xfs_rui_copy_format() error after xfs_rui_item_free().
+ * NOTE(review): rui_nextents is used before any validation visible here.
+ */
+STATIC int
+xlog_recover_rui_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       lsn)
+{
+       int                             error;
+       struct xfs_mount                *mp = log->l_mp;
+       struct xfs_rui_log_item         *ruip;
+       struct xfs_rui_log_format       *rui_formatp;
+
+       rui_formatp = item->ri_buf[0].i_addr;
+
+       ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+       error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
+       if (error) {
+               xfs_rui_item_free(ruip);
+               return error;
+       }
+       atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
+
+       spin_lock(&log->l_ailp->xa_lock);
+       /*
+        * The RUI has two references. One for the RUD and one for RUI to ensure
+        * it makes it into the AIL. Insert the RUI into the AIL directly and
+        * drop the RUI reference. Note that xfs_trans_ail_update() drops the
+        * AIL lock, which is why there is no spin_unlock() in this function.
+        */
+       xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
+       xfs_rui_release(ruip);
+       return 0;
+}
+
+
+/*
+ * Pass 2 recovery of an RUD (the done item for an RUI) found in a committed
+ * transaction in the log.  Its purpose is to cancel the corresponding RUI if
+ * it is still in the AIL: walk the AIL for the RUI whose rui_id equals
+ * rud_rui_id and drop the RUD reference on it, which removes the RUI from the
+ * AIL and frees it.  Always returns 0, even when no matching RUI is found.
+ */
+STATIC int
+xlog_recover_rud_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_rud_log_format       *rud_formatp;
+       struct xfs_rui_log_item         *ruip = NULL;
+       struct xfs_log_item             *lip;
+       __uint64_t                      rui_id;
+       struct xfs_ail_cursor           cur;
+       struct xfs_ail                  *ailp = log->l_ailp;
+
+       rud_formatp = item->ri_buf[0].i_addr;
+       ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
+       rui_id = rud_formatp->rud_rui_id;
+
+       /*
+        * Search the AIL for the RUI with the id in the RUD format structure.
+        * NOTE(review): the cast below assumes rui_item is at offset 0; confirm.
+        */
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+       while (lip != NULL) {
+               if (lip->li_type == XFS_LI_RUI) {
+                       ruip = (struct xfs_rui_log_item *)lip;
+                       if (ruip->rui_format.rui_id == rui_id) {
+                               /*
+                                * Drop the RUD reference to the RUI. This
+                                * removes the RUI from the AIL and frees it.
+                                */
+                               spin_unlock(&ailp->xa_lock);
+                               xfs_rui_release(ruip);
+                               spin_lock(&ailp->xa_lock);
+                               break;  /* stop at the first match */
+                       }
+               }
+               lip = xfs_trans_ail_cursor_next(ailp, &cur);
+       }
+
+       xfs_trans_ail_cursor_done(&cur);
+       spin_unlock(&ailp->xa_lock);
+
+       return 0;
+}
+
 /*
  * This routine is called when an inode create format structure is found in a
  * committed transaction in the log.  Its purpose is to initialise the inodes
@@ -3639,6 +3739,8 @@ xlog_recover_ra_pass2(
        case XFS_LI_EFI:
        case XFS_LI_EFD:
        case XFS_LI_QUOTAOFF:
+       case XFS_LI_RUI:
+       case XFS_LI_RUD:
        default:
                break;
        }
@@ -3662,6 +3764,8 @@ xlog_recover_commit_pass1(
        case XFS_LI_EFD:
        case XFS_LI_DQUOT:
        case XFS_LI_ICREATE:
+       case XFS_LI_RUI:
+       case XFS_LI_RUD:
                /* nothing to do in pass 1 */
                return 0;
        default:
@@ -3692,6 +3796,10 @@ xlog_recover_commit_pass2(
                return xlog_recover_efi_pass2(log, item, trans->r_lsn);
        case XFS_LI_EFD:
                return xlog_recover_efd_pass2(log, item);
+       case XFS_LI_RUI:
+               return xlog_recover_rui_pass2(log, item, trans->r_lsn);
+       case XFS_LI_RUD:
+               return xlog_recover_rud_pass2(log, item);
        case XFS_LI_DQUOT:
                return xlog_recover_dquot_pass2(log, buffer_list, item,
                                                trans->r_lsn);
@@ -4164,126 +4272,156 @@ xlog_recover_process_data(
        return 0;
 }
 
-/*
- * Process an extent free intent item that was recovered from
- * the log.  We need to free the extents that it describes.
- */
+/* Recover the EFI unless already recovered.  Enter/exit with xa_lock held. */
 STATIC int
 xlog_recover_process_efi(
-       xfs_mount_t             *mp,
-       xfs_efi_log_item_t      *efip)
+       struct xfs_mount                *mp,
+       struct xfs_ail                  *ailp,
+       struct xfs_log_item             *lip)
 {
-       xfs_efd_log_item_t      *efdp;
-       xfs_trans_t             *tp;
-       int                     i;
-       int                     error = 0;
-       xfs_extent_t            *extp;
-       xfs_fsblock_t           startblock_fsb;
-
-       ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+       struct xfs_efi_log_item         *efip;
+       int                             error;
 
        /*
-        * First check the validity of the extents described by the
-        * EFI.  If any are bad, then assume that all are bad and
-        * just toss the EFI.
+        * Skip EFIs that we've already processed.
         */
-       for (i = 0; i < efip->efi_format.efi_nextents; i++) {
-               extp = &(efip->efi_format.efi_extents[i]);
-               startblock_fsb = XFS_BB_TO_FSB(mp,
-                                  XFS_FSB_TO_DADDR(mp, extp->ext_start));
-               if ((startblock_fsb == 0) ||
-                   (extp->ext_len == 0) ||
-                   (startblock_fsb >= mp->m_sb.sb_dblocks) ||
-                   (extp->ext_len >= mp->m_sb.sb_agblocks)) {
-                       /*
-                        * This will pull the EFI from the AIL and
-                        * free the memory associated with it.
-                        */
-                       set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
-                       xfs_efi_release(efip);
-                       return -EIO;
-               }
-       }
+       efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+       if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
+               return 0;
 
-       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
-       if (error)
-               return error;
-       efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+       spin_unlock(&ailp->xa_lock);    /* recovery runs without xa_lock */
+       error = xfs_efi_recover(mp, efip);
+       spin_lock(&ailp->xa_lock);      /* retaken before returning */
 
-       for (i = 0; i < efip->efi_format.efi_nextents; i++) {
-               extp = &(efip->efi_format.efi_extents[i]);
-               error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
-                                             extp->ext_len);
-               if (error)
-                       goto abort_error;
+       return error;
+}
 
-       }
+/* Cancel path: release the EFI.  Entered and exited with xa_lock held. */
+STATIC void
+xlog_recover_cancel_efi(
+       struct xfs_mount                *mp,    /* unused here */
+       struct xfs_ail                  *ailp,
+       struct xfs_log_item             *lip)
+{
+       struct xfs_efi_log_item         *efip;
 
-       set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
-       error = xfs_trans_commit(tp);
-       return error;
+       efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+
+       spin_unlock(&ailp->xa_lock);    /* release runs without xa_lock */
+       xfs_efi_release(efip);
+       spin_lock(&ailp->xa_lock);      /* retaken before returning */
+}
+
+/* Recover the RUI unless already recovered.  Enter/exit with xa_lock held. */
+STATIC int
+xlog_recover_process_rui(
+       struct xfs_mount                *mp,
+       struct xfs_ail                  *ailp,
+       struct xfs_log_item             *lip)
+{
+       struct xfs_rui_log_item         *ruip;
+       int                             error;
+
+       /*
+        * Skip RUIs already flagged XFS_RUI_RECOVERED; the lock stays held.
+        */
+       ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
+       if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
+               return 0;
+
+       spin_unlock(&ailp->xa_lock);    /* recovery runs without xa_lock */
+       error = xfs_rui_recover(mp, ruip);
+       spin_lock(&ailp->xa_lock);      /* retaken before returning */
 
-abort_error:
-       xfs_trans_cancel(tp);
        return error;
 }
 
+/* Cancel path: release the RUI.  Entered and exited with xa_lock held. */
+STATIC void
+xlog_recover_cancel_rui(
+       struct xfs_mount                *mp,    /* unused here */
+       struct xfs_ail                  *ailp,
+       struct xfs_log_item             *lip)
+{
+       struct xfs_rui_log_item         *ruip;
+
+       ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
+
+       spin_unlock(&ailp->xa_lock);    /* release runs without xa_lock */
+       xfs_rui_release(ruip);
+       spin_lock(&ailp->xa_lock);      /* retaken before returning */
+}
+
+/* Return true iff li_type is an intent type (XFS_LI_EFI or XFS_LI_RUI). */
+static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
+{
+       switch (lip->li_type) {
+       case XFS_LI_EFI:
+       case XFS_LI_RUI:
+               return true;
+       default:        /* every other log item type */
+               return false;
+       }
+}
+
 /*
- * When this is called, all of the EFIs which did not have
- * corresponding EFDs should be in the AIL.  What we do now
- * is free the extents associated with each one.
+ * When this is called, all of the log intent items which did not have
+ * corresponding log done items should be in the AIL.  What we do now
+ * is update the data structures associated with each one.
  *
- * Since we process the EFIs in normal transactions, they
- * will be removed at some point after the commit.  This prevents
- * us from just walking down the list processing each one.
- * We'll use a flag in the EFI to skip those that we've already
- * processed and use the AIL iteration mechanism's generation
- * count to try to speed this up at least a bit.
+ * Since we process the log intent items in normal transactions, they
+ * will be removed at some point after the commit.  This prevents us
+ * from just walking down the list processing each one.  We'll use a
+ * flag in the intent item to skip those that we've already processed
+ * and use the AIL iteration mechanism's generation count to try to
+ * speed this up at least a bit.
  *
- * When we start, we know that the EFIs are the only things in
- * the AIL.  As we process them, however, other items are added
- * to the AIL.  Since everything added to the AIL must come after
- * everything already in the AIL, we stop processing as soon as
- * we see something other than an EFI in the AIL.
+ * When we start, we know that the intents are the only things in the
+ * AIL.  As we process them, however, other items are added to the
+ * AIL.
  */
 STATIC int
-xlog_recover_process_efis(
+xlog_recover_process_intents(
        struct xlog             *log)
 {
        struct xfs_log_item     *lip;
-       struct xfs_efi_log_item *efip;
        int                     error = 0;
        struct xfs_ail_cursor   cur;
        struct xfs_ail          *ailp;
+       xfs_lsn_t               last_lsn;
 
        ailp = log->l_ailp;
        spin_lock(&ailp->xa_lock);
        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+       last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
        while (lip != NULL) {
                /*
-                * We're done when we see something other than an EFI.
-                * There should be no EFIs left in the AIL now.
+                * We're done when we see something other than an intent.
+                * There should be no intents left in the AIL now.
                 */
-               if (lip->li_type != XFS_LI_EFI) {
+               if (!xlog_item_is_intent(lip)) {
 #ifdef DEBUG
                        for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
-                               ASSERT(lip->li_type != XFS_LI_EFI);
+                               ASSERT(!xlog_item_is_intent(lip));
 #endif
                        break;
                }
 
                /*
-                * Skip EFIs that we've already processed.
+                * We should never see a redo item with a LSN higher than
+                * the last transaction we found in the log at the start
+                * of recovery.
                 */
-               efip = container_of(lip, struct xfs_efi_log_item, efi_item);
-               if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
-                       lip = xfs_trans_ail_cursor_next(ailp, &cur);
-                       continue;
-               }
+               ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
 
-               spin_unlock(&ailp->xa_lock);
-               error = xlog_recover_process_efi(log->l_mp, efip);
-               spin_lock(&ailp->xa_lock);
+               switch (lip->li_type) {
+               case XFS_LI_EFI:
+                       error = xlog_recover_process_efi(log->l_mp, ailp, lip);
+                       break;
+               case XFS_LI_RUI:
+                       error = xlog_recover_process_rui(log->l_mp, ailp, lip);
+                       break;
+               }
                if (error)
                        goto out;
                lip = xfs_trans_ail_cursor_next(ailp, &cur);
@@ -4295,15 +4433,14 @@ out:
 }
 
 /*
- * A cancel occurs when the mount has failed and we're bailing out. Release all
- * pending EFIs so they don't pin the AIL.
+ * A cancel occurs when the mount has failed and we're bailing out.
+ * Release all pending log intent items so they don't pin the AIL.
  */
 STATIC int
-xlog_recover_cancel_efis(
+xlog_recover_cancel_intents(
        struct xlog             *log)
 {
        struct xfs_log_item     *lip;
-       struct xfs_efi_log_item *efip;
        int                     error = 0;
        struct xfs_ail_cursor   cur;
        struct xfs_ail          *ailp;
@@ -4313,22 +4450,25 @@ xlog_recover_cancel_efis(
        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
        while (lip != NULL) {
                /*
-                * We're done when we see something other than an EFI.
-                * There should be no EFIs left in the AIL now.
+                * We're done when we see something other than an intent.
+                * There should be no intents left in the AIL now.
                 */
-               if (lip->li_type != XFS_LI_EFI) {
+               if (!xlog_item_is_intent(lip)) {
 #ifdef DEBUG
                        for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
-                               ASSERT(lip->li_type != XFS_LI_EFI);
+                               ASSERT(!xlog_item_is_intent(lip));
 #endif
                        break;
                }
 
-               efip = container_of(lip, struct xfs_efi_log_item, efi_item);
-
-               spin_unlock(&ailp->xa_lock);
-               xfs_efi_release(efip);
-               spin_lock(&ailp->xa_lock);
+               switch (lip->li_type) {
+               case XFS_LI_EFI:
+                       xlog_recover_cancel_efi(log->l_mp, ailp, lip);
+                       break;
+               case XFS_LI_RUI:
+                       xlog_recover_cancel_rui(log->l_mp, ailp, lip);
+                       break;
+               }
 
                lip = xfs_trans_ail_cursor_next(ailp, &cur);
        }
@@ -5023,6 +5163,7 @@ xlog_do_recover(
                xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
                return error;
        }
+       mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
        xlog_recover_check_summary(log);
 
@@ -5139,16 +5280,17 @@ xlog_recover_finish(
         */
        if (log->l_flags & XLOG_RECOVERY_NEEDED) {
                int     error;
-               error = xlog_recover_process_efis(log);
+               error = xlog_recover_process_intents(log);
                if (error) {
-                       xfs_alert(log->l_mp, "Failed to recover EFIs");
+                       xfs_alert(log->l_mp, "Failed to recover intents");
                        return error;
                }
+
                /*
-                * Sync the log to get all the EFIs out of the AIL.
+                * Sync the log to get all the intents out of the AIL.
                 * This isn't absolutely necessary, but it helps in
                 * case the unlink transactions would have problems
-                * pushing the EFIs out of the way.
+                * pushing the intents out of the way.
                 */
                xfs_log_force(log->l_mp, XFS_LOG_SYNC);
 
@@ -5173,7 +5315,7 @@ xlog_recover_cancel(
        int             error = 0;
 
        if (log->l_flags & XLOG_RECOVERY_NEEDED)
-               error = xlog_recover_cancel_efis(log);
+               error = xlog_recover_cancel_intents(log);
 
        return error;
 }
index 970c19ba2f560f62bc077d6797c6713848dcd5ec..faeead671f9ff02af6ca9c797a1480611ed4c95b 100644 (file)
@@ -24,6 +24,7 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
@@ -41,6 +42,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_sysfs.h"
+#include "xfs_rmap_btree.h"
 
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -230,6 +232,8 @@ xfs_initialize_perag(
 
        if (maxagi)
                *maxagi = index;
+
+       mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
        return 0;
 
 out_unwind:
@@ -679,6 +683,7 @@ xfs_mountfs(
        xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
        xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
        xfs_ialloc_compute_maxlevels(mp);
+       xfs_rmapbt_compute_maxlevels(mp);
 
        xfs_set_maxicount(mp);
 
@@ -1216,7 +1221,7 @@ xfs_mod_fdblocks(
                batch = XFS_FDBLOCKS_BATCH;
 
        __percpu_counter_add(&mp->m_fdblocks, delta, batch);
-       if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp),
+       if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
                                     XFS_FDBLOCKS_BATCH) >= 0) {
                /* we had space! */
                return 0;
index c1b798c7212618462ee2130814eac8544c691bf0..b36676cde10302acd3937b5308f11a6c7c42db03 100644 (file)
@@ -116,9 +116,15 @@ typedef struct xfs_mount {
        uint                    m_bmap_dmnr[2]; /* min bmap btree records */
        uint                    m_inobt_mxr[2]; /* max inobt btree records */
        uint                    m_inobt_mnr[2]; /* min inobt btree records */
+       uint                    m_rmap_mxr[2];  /* max rmap btree records */
+       uint                    m_rmap_mnr[2];  /* min rmap btree records */
        uint                    m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
        uint                    m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
        uint                    m_in_maxlevels; /* max inobt btree levels. */
+       uint                    m_rmap_maxlevels; /* max rmap btree levels */
+       xfs_extlen_t            m_ag_prealloc_blocks; /* reserved ag blocks */
+       uint                    m_alloc_set_aside; /* space we can't use */
+       uint                    m_ag_max_usable; /* max space per AG */
        struct radix_tree_root  m_perag_tree;   /* per-ag accounting info */
        spinlock_t              m_perag_lock;   /* lock for m_perag_tree */
        struct mutex            m_growlock;     /* growfs mutex */
index 0cc8d8f74356759917f778d02d1e9a79888dd1f6..69e2986a377619876ac3757ed6d255db9433b91a 100644 (file)
@@ -49,11 +49,14 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr,          56);
        XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key,             4);
        XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec,             16);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key,              20);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec,              24);
        XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp,             8);
        XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t,                  8);
        XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t,                  4);
        XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t,                  8);
        XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t,                  4);
+       XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t,                   4);
 
        /* dir/attr trees */
        XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr,        80);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
new file mode 100644 (file)
index 0000000..2500f28
--- /dev/null
@@ -0,0 +1,536 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+
+
+kmem_zone_t    *xfs_rui_zone;
+kmem_zone_t    *xfs_rud_zone;
+
+/* Map a generic log item back to the RUI that embeds it as rui_item. */
+static inline struct xfs_rui_log_item *
+RUI_ITEM(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_rui_log_item *ruip;
+
+       ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
+       return ruip;
+}
+
+/*
+ * Free an RUI.  Items holding more than XFS_RUI_MAX_FAST_EXTENTS extents are
+ * released with kmem_free(); all others are handed back to xfs_rui_zone.
+ */
+void
+xfs_rui_item_free(
+       struct xfs_rui_log_item *ruip)
+{
+       if (ruip->rui_format.rui_nextents <= XFS_RUI_MAX_FAST_EXTENTS) {
+               kmem_zone_free(xfs_rui_zone, ruip);
+               return;
+       }
+
+       kmem_free(ruip);
+}
+
+/*
+ * This returns the number of bytes needed to log the given rui item: the
+ * rui_log_format structure plus one xfs_map_extent for every extent beyond
+ * the first.
+ */
+static inline int
+xfs_rui_item_sizeof(
+       struct xfs_rui_log_item *ruip)
+{
+       return sizeof(struct xfs_rui_log_format) +
+                       (ruip->rui_format.rui_nextents - 1) *
+                       sizeof(struct xfs_map_extent);
+}
+
+/*
+ * Account for the log space an RUI needs: one iovec holding
+ * xfs_rui_item_sizeof() bytes.  Both counters are accumulated into.
+ */
+STATIC void
+xfs_rui_item_size(
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
+{
+       struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
+       (*nvecs)++;
+       *nbytes += xfs_rui_item_sizeof(ruip);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given rui log item. We use only 1 iovec, built from the
+ * rui_log_format structure embedded in the rui item after its
+ * type and size fields are stamped. It is at this point that we
+ * assert that all of the extent slots in the rui item have been filled.
+ */
+STATIC void
+xfs_rui_item_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_vec      *lv)
+{
+       struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+       struct xfs_log_iovec    *vecp = NULL;
+
+       ASSERT(atomic_read(&ruip->rui_next_extent) ==
+                       ruip->rui_format.rui_nextents);
+
+       ruip->rui_format.rui_type = XFS_LI_RUI;
+       ruip->rui_format.rui_size = 1;
+
+       xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
+                       xfs_rui_item_sizeof(ruip));
+}
+
+/*
+ * Pinning has no meaning for an rui item, so this op is an empty stub.
+ */
+STATIC void
+xfs_rui_item_pin(
+       struct xfs_log_item     *lip)
+{
+}
+
+/*
+ * The unpin operation is the last place an RUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the RUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the RUI to either construct
+ * and commit the RUD or drop the RUD's reference in the event of error. Simply
+ * drop the log's RUI reference now that the log is done with it; the remove
+ * argument is ignored.
+ */
+STATIC void
+xfs_rui_item_unpin(
+       struct xfs_log_item     *lip,
+       int                     remove)
+{
+       struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
+       xfs_rui_release(ruip);
+}
+
+/*
+ * RUI items have no locking or pushing.  However, since RUIs are pulled from
+ * the AIL when their corresponding RUDs are committed to disk, their situation
+ * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log.  This should help in getting the RUI out of
+ * the AIL.  Neither argument is examined.
+ */
+STATIC uint
+xfs_rui_item_push(
+       struct xfs_log_item     *lip,
+       struct list_head        *buffer_list)
+{
+       return XFS_ITEM_PINNED;
+}
+
+/*
+ * Unlock op for an RUI.  Nothing needs doing unless the item was aborted
+ * (XFS_LI_ABORTED set in li_flags); in that case no RUD is going to be
+ * constructed, so the RUI is freed here directly.
+ */
+STATIC void
+xfs_rui_item_unlock(
+       struct xfs_log_item     *lip)
+{
+       if (!(lip->li_flags & XFS_LI_ABORTED))
+               return;
+
+       xfs_rui_item_free(RUI_ITEM(lip));
+}
+
+/*
+ * The RUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged, i.e. the lsn argument, unchanged.
+ */
+STATIC xfs_lsn_t
+xfs_rui_item_committed(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+       return lsn;
+}
+
+/*
+ * The RUI dependency tracking op is intentionally empty.  It can't do more
+ * because it doesn't know where the extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the operation
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_rui_item_committing(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all rui log items.  xfs_rui_init()
+ * attaches it to every new item through xfs_log_item_init().
+ */
+static const struct xfs_item_ops xfs_rui_item_ops = {
+       .iop_size       = xfs_rui_item_size,
+       .iop_format     = xfs_rui_item_format,
+       .iop_pin        = xfs_rui_item_pin,
+       .iop_unpin      = xfs_rui_item_unpin,
+       .iop_unlock     = xfs_rui_item_unlock,
+       .iop_committed  = xfs_rui_item_committed,
+       .iop_push       = xfs_rui_item_push,
+       .iop_committing = xfs_rui_item_committing,
+};
+
+/*
+ * Allocate and initialize an rui item with room for the given number of
+ * extents.  Up to XFS_RUI_MAX_FAST_EXTENTS extents are served from
+ * xfs_rui_zone; larger items are sized by hand and come from kmem_zalloc().
+ * The new item starts out zeroed, with a reference count of two, no extent
+ * slots filled, and its own address recorded as the rui_id.
+ */
+struct xfs_rui_log_item *
+xfs_rui_init(
+       struct xfs_mount                *mp,
+       uint                            nextents)
+
+{
+       struct xfs_rui_log_item         *ruip;
+
+       ASSERT(nextents > 0);
+       if (nextents <= XFS_RUI_MAX_FAST_EXTENTS) {
+               ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
+       } else {
+               uint                    size;
+
+               /* The item already embeds one extent; add the rest. */
+               size = (uint)(sizeof(struct xfs_rui_log_item) +
+                       ((nextents - 1) * sizeof(struct xfs_map_extent)));
+               ruip = kmem_zalloc(size, KM_SLEEP);
+       }
+
+       xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
+       ruip->rui_format.rui_nextents = nextents;
+       ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
+       atomic_set(&ruip->rui_next_extent, 0);
+       atomic_set(&ruip->rui_refcount, 2);
+
+       return ruip;
+}
+
+/*
+ * Copy an RUI format buffer from the given buf, and into the destination
+ * RUI format structure.  The RUI/RUD items were designed not to need any
+ * special alignment handling.
+ *
+ * The buffer comes from the log, so it is validated before anything is
+ * copied: it must hold the fixed format structure (which embeds the first
+ * extent) followed by a whole number of extra xfs_map_extent records, and
+ * the resulting extent count must equal rui_nextents.  The count is derived
+ * from the buffer length by division, so a zero or very large rui_nextents
+ * cannot wrap the expected length into a value that happens to match.
+ *
+ * Returns 0 on success, or -EFSCORRUPTED if the length and extent count
+ * disagree; nothing is copied in that case.
+ */
+int
+xfs_rui_copy_format(
+       struct xfs_log_iovec            *buf,
+       struct xfs_rui_log_format       *dst_rui_fmt)
+{
+       struct xfs_rui_log_format       *src_rui_fmt = buf->i_addr;
+       size_t                          fmt_size;
+       size_t                          ext_size;
+       size_t                          len = buf->i_len;
+       size_t                          extra;
+
+       fmt_size = sizeof(struct xfs_rui_log_format);
+       ext_size = sizeof(struct xfs_map_extent);
+
+       /* Too short to hold even a single-extent RUI. */
+       if (len < fmt_size)
+               return -EFSCORRUPTED;
+
+       /* Bytes beyond the fixed structure must be whole extent records. */
+       extra = len - fmt_size;
+       if (extra % ext_size != 0)
+               return -EFSCORRUPTED;
+
+       /* The format structure itself accounts for the first extent. */
+       if (extra / ext_size + 1 != src_rui_fmt->rui_nextents)
+               return -EFSCORRUPTED;
+
+       memcpy(dst_rui_fmt, src_rui_fmt, len);
+       return 0;
+}
+
+/*
+ * Drop one reference to the RUI; the caller that drops the last one pulls
+ * the item from the AIL and frees it.
+ *
+ * The reference count exists because freeing the RUI requires removing it
+ * from the AIL if it has already been placed there, yet the RUI may not have
+ * reached the AIL when RUD processing calls in here, due to the ordering of
+ * committed vs unpin operations in bulk insert operations.
+ */
+void
+xfs_rui_release(
+       struct xfs_rui_log_item *ruip)
+{
+       /* Not the last reference: leave the item alone. */
+       if (!atomic_dec_and_test(&ruip->rui_refcount))
+               return;
+
+       xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
+       xfs_rui_item_free(ruip);
+}
+
+/* Map a generic log item back to the RUD that embeds it as rud_item. */
+static inline struct xfs_rud_log_item *
+RUD_ITEM(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_rud_log_item *rudp;
+
+       rudp = container_of(lip, struct xfs_rud_log_item, rud_item);
+       return rudp;
+}
+
+/*
+ * Account for the log space an RUD needs: one iovec holding a fixed-size
+ * xfs_rud_log_format.  Both counters are accumulated into.
+ */
+STATIC void
+xfs_rud_item_size(
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
+{
+       *nbytes += sizeof(struct xfs_rud_log_format);
+       (*nvecs)++;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given rud log item. We use only 1 iovec, built from the
+ * rud_log_format structure embedded in the rud item after its
+ * type and size fields are stamped. The logged length is always
+ * sizeof(struct xfs_rud_log_format); nothing is asserted here.
+ */
+STATIC void
+xfs_rud_item_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_vec      *lv)
+{
+       struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+       struct xfs_log_iovec    *vecp = NULL;
+
+       rudp->rud_format.rud_type = XFS_LI_RUD;
+       rudp->rud_format.rud_size = 1;
+
+       xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format,
+                       sizeof(struct xfs_rud_log_format));
+}
+
+/*
+ * Pinning has no meaning for an rud item, so this op is an empty stub.
+ */
+STATIC void
+xfs_rud_item_pin(
+       struct xfs_log_item     *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an rud item, unpinning does
+ * not either.  Both arguments are ignored.
+ */
+STATIC void
+xfs_rud_item_unpin(
+       struct xfs_log_item     *lip,
+       int                     remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on an rud item.  It is simply stuck
+ * waiting for the log to be flushed to disk, so always report
+ * XFS_ITEM_PINNED.  Neither argument is examined.
+ */
+STATIC uint
+xfs_rud_item_push(
+       struct xfs_log_item     *lip,
+       struct list_head        *buffer_list)
+{
+       return XFS_ITEM_PINNED;
+}
+
+/*
+ * Unlock op for an RUD.  Nothing needs doing unless the item was aborted
+ * (XFS_LI_ABORTED set in li_flags); in that case drop our reference to the
+ * RUI and return the RUD to xfs_rud_zone.
+ */
+STATIC void
+xfs_rud_item_unlock(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_rud_log_item *rudp;
+
+       if (!(lip->li_flags & XFS_LI_ABORTED))
+               return;
+
+       rudp = RUD_ITEM(lip);
+       xfs_rui_release(rudp->rud_ruip);
+       kmem_zone_free(xfs_rud_zone, rudp);
+}
+
+/*
+ * When the rud item is committed to disk, all we need to do is delete our
+ * reference to our partner rui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.  The lsn argument is not used.
+ */
+STATIC xfs_lsn_t
+xfs_rud_item_committed(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+       struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+
+       /*
+        * Drop the RUI reference regardless of whether the RUD has been
+        * aborted. Once the RUD transaction is constructed, it is the sole
+        * responsibility of the RUD to release the RUI (even if the RUI is
+        * aborted due to log I/O error).  The RUI is released before the RUD
+        * is freed because rud_ruip lives inside the RUD.
+        */
+       xfs_rui_release(rudp->rud_ruip);
+       kmem_zone_free(xfs_rud_zone, rudp);
+
+       return (xfs_lsn_t)-1;
+}
+
+/*
+ * The RUD dependency tracking op is intentionally empty.  It can't do more
+ * because it doesn't know where the extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the operation
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_rud_item_committing(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all rud log items.  xfs_rud_init()
+ * attaches it to every new item through xfs_log_item_init().
+ */
+static const struct xfs_item_ops xfs_rud_item_ops = {
+       .iop_size       = xfs_rud_item_size,
+       .iop_format     = xfs_rud_item_format,
+       .iop_pin        = xfs_rud_item_pin,
+       .iop_unpin      = xfs_rud_item_unpin,
+       .iop_unlock     = xfs_rud_item_unlock,
+       .iop_committed  = xfs_rud_item_committed,
+       .iop_push       = xfs_rud_item_push,
+       .iop_committing = xfs_rud_item_committing,
+};
+
+/*
+ * Allocate and initialize an rud item paired with the given rui item.  The
+ * RUD comes zeroed from xfs_rud_zone, keeps a pointer to the RUI, and copies
+ * the RUI's rui_id into its own format structure.
+ */
+struct xfs_rud_log_item *
+xfs_rud_init(
+       struct xfs_mount                *mp,
+       struct xfs_rui_log_item         *ruip)
+
+{
+       struct xfs_rud_log_item *rudp;
+
+       rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
+       xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops);
+       rudp->rud_ruip = ruip;
+       rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+       return rudp;
+}
+
+/*
+ * Process an rmap update intent item that was recovered from the log.
+ * We need to update the rmapbt.
+ *
+ * Every extent in the RUI is validated first; if any looks bad the RUI is
+ * marked recovered, released, and -EIO is returned.  Otherwise all of the
+ * updates are replayed in one transaction, together with an RUD obtained
+ * from xfs_trans_get_rud(), and the result of the commit is returned.  If a
+ * replay step fails the transaction is cancelled and that error is returned;
+ * the RUI is not marked recovered on that path.
+ */
+int
+xfs_rui_recover(
+       struct xfs_mount                *mp,
+       struct xfs_rui_log_item         *ruip)
+{
+       int                             i;
+       int                             error = 0;
+       struct xfs_map_extent           *rmap;
+       xfs_fsblock_t                   startblock_fsb;
+       bool                            op_ok;
+       struct xfs_rud_log_item         *rudp;
+       enum xfs_rmap_intent_type       type;
+       int                             whichfork;
+       xfs_exntst_t                    state;
+       struct xfs_trans                *tp;
+       struct xfs_btree_cur            *rcur = NULL;
+
+       ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags));
+
+       /*
+        * First check the validity of the extents described by the
+        * RUI.  If any are bad, then assume that all are bad and
+        * just toss the RUI.  An extent is bad if its type is unknown,
+        * its start block is zero or past sb_dblocks, its length is zero
+        * or not below sb_agblocks, or it carries flag bits outside
+        * XFS_RMAP_EXTENT_FLAGS.
+        */
+       for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
+               rmap = &ruip->rui_format.rui_extents[i];
+               startblock_fsb = XFS_BB_TO_FSB(mp,
+                                  XFS_FSB_TO_DADDR(mp, rmap->me_startblock));
+               switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+               case XFS_RMAP_EXTENT_MAP:
+               case XFS_RMAP_EXTENT_UNMAP:
+               case XFS_RMAP_EXTENT_CONVERT:
+               case XFS_RMAP_EXTENT_ALLOC:
+               case XFS_RMAP_EXTENT_FREE:
+                       op_ok = true;
+                       break;
+               default:
+                       op_ok = false;
+                       break;
+               }
+               if (!op_ok || startblock_fsb == 0 ||
+                   rmap->me_len == 0 ||
+                   startblock_fsb >= mp->m_sb.sb_dblocks ||
+                   rmap->me_len >= mp->m_sb.sb_agblocks ||
+                   (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) {
+                       /*
+                        * This will pull the RUI from the AIL and
+                        * free the memory associated with it, provided
+                        * this drops the last reference.
+                        */
+                       set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
+                       xfs_rui_release(ruip);
+                       return -EIO;
+               }
+       }
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+       if (error)
+               return error;
+       rudp = xfs_trans_get_rud(tp, ruip);
+
+       for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
+               rmap = &ruip->rui_format.rui_extents[i];
+               state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
+                               XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+               whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
+                               XFS_ATTR_FORK : XFS_DATA_FORK;
+               switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+               case XFS_RMAP_EXTENT_MAP:
+                       type = XFS_RMAP_MAP;
+                       break;
+               case XFS_RMAP_EXTENT_UNMAP:
+                       type = XFS_RMAP_UNMAP;
+                       break;
+               case XFS_RMAP_EXTENT_CONVERT:
+                       type = XFS_RMAP_CONVERT;
+                       break;
+               case XFS_RMAP_EXTENT_ALLOC:
+                       type = XFS_RMAP_ALLOC;
+                       break;
+               case XFS_RMAP_EXTENT_FREE:
+                       type = XFS_RMAP_FREE;
+                       break;
+               default:
+                       error = -EFSCORRUPTED;
+                       goto abort_error;
+               }
+               error = xfs_trans_log_finish_rmap_update(tp, rudp, type,
+                               rmap->me_owner, whichfork,
+                               rmap->me_startoff, rmap->me_startblock,
+                               rmap->me_len, state, &rcur);
+               if (error)
+                       goto abort_error;
+
+       }
+
+       xfs_rmap_finish_one_cleanup(tp, rcur, error);
+       set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
+       error = xfs_trans_commit(tp);
+       return error;
+
+abort_error:
+       xfs_rmap_finish_one_cleanup(tp, rcur, error);
+       xfs_trans_cancel(tp);
+       return error;
+}
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
new file mode 100644 (file)
index 0000000..aefcc3a
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef        __XFS_RMAP_ITEM_H__
+#define        __XFS_RMAP_ITEM_H__
+
+/*
+ * There are (currently) five rmap btree redo item types: map, unmap,
+ * convert, alloc, and free.  The log items that carry them are abbreviated
+ * RUI (rmap update intent) and RUD (rmap update done).  The redo item type
+ * is encoded in the flags field of each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated rmapbt updates.  Typically, the first
+ * transaction will record a bmbt update, followed by some number of
+ * transactions containing rmapbt updates, and finally transactions with any
+ * bnobt/cntbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * (rmapbt/bnobt/cntbt) metadata updates in the non-first transaction.
+ */
+
+/* kernel only RUI/RUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define        XFS_RUI_MAX_FAST_EXTENTS        16
+
+/*
+ * Define RUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define        XFS_RUI_RECOVERED               1
+
+/*
+ * This is the "rmap update intent" log item.  It is used to log the fact that
+ * some reverse mappings need to change.  It is used in conjunction with the
+ * "rmap update done" log item described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_rui_log_item {
+       struct xfs_log_item             rui_item;       /* generic log item */
+       atomic_t                        rui_refcount;   /* freed at zero */
+       atomic_t                        rui_next_extent; /* slots filled */
+       unsigned long                   rui_flags;      /* misc flags */
+       struct xfs_rui_log_format       rui_format;     /* logged format */
+};
+
+/*
+ * This is the "rmap update done" log item.  It is used to log the fact that
+ * some rmapbt updates mentioned in an earlier rui item have been performed.
+ */
+struct xfs_rud_log_item {
+       struct xfs_log_item             rud_item;       /* generic log item */
+       struct xfs_rui_log_item         *rud_ruip;      /* partner RUI */
+       struct xfs_rud_log_format       rud_format;     /* logged format */
+};
+
+extern struct kmem_zone        *xfs_rui_zone;
+extern struct kmem_zone        *xfs_rud_zone;
+
+struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint);
+struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *,
+               struct xfs_rui_log_item *);
+int xfs_rui_copy_format(struct xfs_log_iovec *buf,
+               struct xfs_rui_log_format *dst_rui_fmt);
+void xfs_rui_item_free(struct xfs_rui_log_item *);
+void xfs_rui_release(struct xfs_rui_log_item *);
+int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip);
+
+#endif /* __XFS_RMAP_ITEM_H__ */
index 3938b37d1043bb6fd98879fa4783b6bbec8cfef6..802bcc326d9fbe37fab8b412ce69b37390d94ba0 100644 (file)
@@ -23,6 +23,7 @@
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
@@ -769,7 +770,7 @@ xfs_growfs_rt_alloc(
        xfs_daddr_t             d;              /* disk block address */
        int                     error;          /* error return value */
        xfs_fsblock_t           firstblock;/* first block allocated in xaction */
-       struct xfs_bmap_free    flist;          /* list of freed blocks */
+       struct xfs_defer_ops    dfops;          /* list of freed blocks */
        xfs_fsblock_t           fsbno;          /* filesystem block for bno */
        struct xfs_bmbt_irec    map;            /* block map output */
        int                     nmap;           /* number of block maps */
@@ -794,14 +795,14 @@ xfs_growfs_rt_alloc(
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
-               xfs_bmap_init(&flist, &firstblock);
+               xfs_defer_init(&dfops, &firstblock);
                /*
                 * Allocate blocks to the bitmap file.
                 */
                nmap = 1;
                error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
                                        XFS_BMAPI_METADATA, &firstblock,
-                                       resblks, &map, &nmap, &flist);
+                                       resblks, &map, &nmap, &dfops);
                if (!error && nmap < 1)
                        error = -ENOSPC;
                if (error)
@@ -809,7 +810,7 @@ xfs_growfs_rt_alloc(
                /*
                 * Free any blocks freed up in the transaction, then commit.
                 */
-               error = xfs_bmap_finish(&tp, &flist, NULL);
+               error = xfs_defer_finish(&tp, &dfops, NULL);
                if (error)
                        goto out_bmap_cancel;
                error = xfs_trans_commit(tp);
@@ -862,7 +863,7 @@ xfs_growfs_rt_alloc(
        return 0;
 
 out_bmap_cancel:
-       xfs_bmap_cancel(&flist);
+       xfs_defer_cancel(&dfops);
 out_trans_cancel:
        xfs_trans_cancel(tp);
        return error;
index d266e835ecc3eb22f92a7400aa67718501a20fff..6e812fe0fd43cc04b4f879c053c296f6cc1a5092 100644 (file)
@@ -61,6 +61,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
                { "bmbt2",              XFSSTAT_END_BMBT_V2             },
                { "ibt2",               XFSSTAT_END_IBT_V2              },
                { "fibt2",              XFSSTAT_END_FIBT_V2             },
+               { "rmapbt",             XFSSTAT_END_RMAP_V2             },
                /* we print both series of quota information together */
                { "qm",                 XFSSTAT_END_QM                  },
        };
index 483b0eff198836ca1516cf56bdffc5f283af8dad..657865f51e78332dae8ff7892e93213c07f71e6e 100644 (file)
@@ -197,7 +197,23 @@ struct xfsstats {
        __uint32_t              xs_fibt_2_alloc;
        __uint32_t              xs_fibt_2_free;
        __uint32_t              xs_fibt_2_moves;
-#define XFSSTAT_END_XQMSTAT            (XFSSTAT_END_FIBT_V2+6)
+#define XFSSTAT_END_RMAP_V2            (XFSSTAT_END_FIBT_V2+15)
+       __uint32_t              xs_rmap_2_lookup;
+       __uint32_t              xs_rmap_2_compare;
+       __uint32_t              xs_rmap_2_insrec;
+       __uint32_t              xs_rmap_2_delrec;
+       __uint32_t              xs_rmap_2_newroot;
+       __uint32_t              xs_rmap_2_killroot;
+       __uint32_t              xs_rmap_2_increment;
+       __uint32_t              xs_rmap_2_decrement;
+       __uint32_t              xs_rmap_2_lshift;
+       __uint32_t              xs_rmap_2_rshift;
+       __uint32_t              xs_rmap_2_split;
+       __uint32_t              xs_rmap_2_join;
+       __uint32_t              xs_rmap_2_alloc;
+       __uint32_t              xs_rmap_2_free;
+       __uint32_t              xs_rmap_2_moves;
+#define XFSSTAT_END_XQMSTAT            (XFSSTAT_END_RMAP_V2+6)
        __uint32_t              xs_qm_dqreclaims;
        __uint32_t              xs_qm_dqreclaim_misses;
        __uint32_t              xs_qm_dquot_dups;
index 0303f1005f884e0314539c94bf2a36349bc31fab..24ef83ef04de2be5c8fd1e77ab62fa3b59106376 100644 (file)
@@ -46,6 +46,7 @@
 #include "xfs_quota.h"
 #include "xfs_sysfs.h"
 #include "xfs_ondisk.h"
+#include "xfs_rmap_item.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -1075,7 +1076,7 @@ xfs_fs_statfs(
        statp->f_blocks = sbp->sb_dblocks - lsize;
        spin_unlock(&mp->m_sb_lock);
 
-       statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+       statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
        statp->f_bavail = statp->f_bfree;
 
        fakeinos = statp->f_bfree << sbp->sb_inopblog;
@@ -1573,6 +1574,10 @@ xfs_fs_fill_super(
                }
        }
 
+       if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+               xfs_alert(mp,
+       "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
+
        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
@@ -1697,7 +1702,7 @@ xfs_init_zones(void)
                goto out_free_ioend_bioset;
 
        xfs_bmap_free_item_zone = kmem_zone_init(
-                       sizeof(struct xfs_bmap_free_item),
+                       sizeof(struct xfs_extent_free_item),
                        "xfs_bmap_free_item");
        if (!xfs_bmap_free_item_zone)
                goto out_destroy_log_ticket_zone;
@@ -1765,8 +1770,24 @@ xfs_init_zones(void)
        if (!xfs_icreate_zone)
                goto out_destroy_ili_zone;
 
+       xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item),
+                       "xfs_rud_item");
+       if (!xfs_rud_zone)
+               goto out_destroy_icreate_zone;
+
+       xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) +
+                       ((XFS_RUI_MAX_FAST_EXTENTS - 1) *
+                               sizeof(struct xfs_map_extent))),
+                       "xfs_rui_item");
+       if (!xfs_rui_zone)
+               goto out_destroy_rud_zone;
+
        return 0;
 
+ out_destroy_rud_zone:
+       kmem_zone_destroy(xfs_rud_zone);
+ out_destroy_icreate_zone:
+       kmem_zone_destroy(xfs_icreate_zone);
  out_destroy_ili_zone:
        kmem_zone_destroy(xfs_ili_zone);
  out_destroy_inode_zone:
@@ -1805,6 +1826,8 @@ xfs_destroy_zones(void)
         * destroy caches.
         */
        rcu_barrier();
+       kmem_zone_destroy(xfs_rui_zone);
+       kmem_zone_destroy(xfs_rud_zone);
        kmem_zone_destroy(xfs_icreate_zone);
        kmem_zone_destroy(xfs_ili_zone);
        kmem_zone_destroy(xfs_inode_zone);
@@ -1854,6 +1877,9 @@ init_xfs_fs(void)
        printk(KERN_INFO XFS_VERSION_STRING " with "
                         XFS_BUILD_OPTIONS " enabled\n");
 
+       xfs_extent_free_init_defer_op();
+       xfs_rmap_update_init_defer_op();
+
        xfs_dir_startup();
 
        error = xfs_init_zones();
index 08a46c6181fdb698bf6b6deed28e21fa6c01ce7d..58142aeeeea69d2b191354911c0ebc07fd12f3fc 100644 (file)
@@ -26,6 +26,7 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
+#include "xfs_defer.h"
 #include "xfs_dir2.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
@@ -172,7 +173,7 @@ xfs_symlink(
        struct xfs_inode        *ip = NULL;
        int                     error = 0;
        int                     pathlen;
-       struct xfs_bmap_free    free_list;
+       struct xfs_defer_ops    dfops;
        xfs_fsblock_t           first_block;
        bool                    unlock_dp_on_error = false;
        xfs_fileoff_t           first_fsb;
@@ -269,7 +270,7 @@ xfs_symlink(
         * Initialize the bmap freelist prior to calling either
         * bmapi or the directory create code.
         */
-       xfs_bmap_init(&free_list, &first_block);
+       xfs_defer_init(&dfops, &first_block);
 
        /*
         * Allocate an inode for the symlink.
@@ -313,7 +314,7 @@ xfs_symlink(
 
                error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
                                  XFS_BMAPI_METADATA, &first_block, resblks,
-                                 mval, &nmaps, &free_list);
+                                 mval, &nmaps, &dfops);
                if (error)
                        goto out_bmap_cancel;
 
@@ -361,7 +362,7 @@ xfs_symlink(
         * Create the directory entry for the symlink.
         */
        error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
-                                       &first_block, &free_list, resblks);
+                                       &first_block, &dfops, resblks);
        if (error)
                goto out_bmap_cancel;
        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -376,7 +377,7 @@ xfs_symlink(
                xfs_trans_set_sync(tp);
        }
 
-       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       error = xfs_defer_finish(&tp, &dfops, NULL);
        if (error)
                goto out_bmap_cancel;
 
@@ -392,7 +393,7 @@ xfs_symlink(
        return 0;
 
 out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
 out_trans_cancel:
        xfs_trans_cancel(tp);
 out_release_inode:
@@ -426,7 +427,7 @@ xfs_inactive_symlink_rmt(
        int             done;
        int             error;
        xfs_fsblock_t   first_block;
-       xfs_bmap_free_t free_list;
+       struct xfs_defer_ops    dfops;
        int             i;
        xfs_mount_t     *mp;
        xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
@@ -465,7 +466,7 @@ xfs_inactive_symlink_rmt(
         * Find the block(s) so we can inval and unmap them.
         */
        done = 0;
-       xfs_bmap_init(&free_list, &first_block);
+       xfs_defer_init(&dfops, &first_block);
        nmaps = ARRAY_SIZE(mval);
        error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
                                mval, &nmaps, 0);
@@ -485,17 +486,17 @@ xfs_inactive_symlink_rmt(
                xfs_trans_binval(tp, bp);
        }
        /*
-        * Unmap the dead block(s) to the free_list.
+        * Unmap the dead block(s) to the dfops.
         */
        error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps,
-                           &first_block, &free_list, &done);
+                           &first_block, &dfops, &done);
        if (error)
                goto error_bmap_cancel;
        ASSERT(done);
        /*
         * Commit the first transaction.  This logs the EFI and the inode.
         */
-       error = xfs_bmap_finish(&tp, &free_list, ip);
+       error = xfs_defer_finish(&tp, &dfops, ip);
        if (error)
                goto error_bmap_cancel;
        /*
@@ -525,7 +526,7 @@ xfs_inactive_symlink_rmt(
        return 0;
 
 error_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
+       xfs_defer_cancel(&dfops);
 error_trans_cancel:
        xfs_trans_cancel(tp);
 error_unlock:
index 13a029806805fe680a919923841c636dcc64934a..7f17ae6d709a1013f277ab608eb98f64298573f2 100644 (file)
@@ -22,7 +22,9 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
+#include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_da_btree.h"
index 145169093fe0cfd8c04dba93bb701061f35bff63..551b7e26980c51886d4bd8edb19c45824fc45198 100644 (file)
@@ -38,6 +38,7 @@ struct xlog_recover_item;
 struct xfs_buf_log_format;
 struct xfs_inode_log_format;
 struct xfs_bmbt_irec;
+struct xfs_btree_cur;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
        TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -2185,6 +2186,379 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
 DEFINE_DISCARD_EVENT(xfs_discard_exclude);
 DEFINE_DISCARD_EVENT(xfs_discard_busy);
 
+/*
+ * btree cursor events
+ *
+ * Records the device, the cursor's bc_btnum, the level passed by the caller
+ * alongside the cursor's bc_nlevels, the cursor's bc_ptrs[] entry for that
+ * level, and the b_bn of the buffer (-1 when no buffer is passed in).
+ */
+DECLARE_EVENT_CLASS(xfs_btree_cur_class,
+       TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
+       TP_ARGS(cur, level, bp),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_btnum_t, btnum)
+               __field(int, level)
+               __field(int, nlevels)
+               __field(int, ptr)
+               __field(xfs_daddr_t, daddr)
+       ),
+       TP_fast_assign(
+               __entry->dev = cur->bc_mp->m_super->s_dev;
+               __entry->btnum = cur->bc_btnum;
+               __entry->level = level;
+               __entry->nlevels = cur->bc_nlevels;
+               __entry->ptr = cur->bc_ptrs[level];
+               __entry->daddr = bp ? bp->b_bn : -1;
+       ),
+       TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->btnum,
+                 __entry->level,
+                 __entry->nlevels,
+                 __entry->ptr,
+                 (unsigned long long)__entry->daddr)
+)
+
+#define DEFINE_BTREE_CUR_EVENT(name) \
+DEFINE_EVENT(xfs_btree_cur_class, name, \
+       TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \
+       TP_ARGS(cur, level, bp))
+DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
+DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
+
+/* deferred ops */
+struct xfs_defer_pending;
+struct xfs_defer_intake;
+struct xfs_defer_ops;
+
+/*
+ * Event class for tracepoints that take a deferred-ops structure.
+ *
+ * Records the device (0 when @mp is NULL), the address of @dop, and the
+ * dop_committed / dop_low fields of @dop.  @dop itself is dereferenced
+ * unconditionally, so callers must pass a valid pointer.
+ *
+ * The format string carries no trailing newline: the trace output already
+ * terminates each event line.
+ */
+DECLARE_EVENT_CLASS(xfs_defer_class,
+       TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop),
+       TP_ARGS(mp, dop),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(void *, dop)
+               __field(bool, committed)
+               __field(bool, low)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp ? mp->m_super->s_dev : 0;
+               __entry->dop = dop;
+               __entry->committed = dop->dop_committed;
+               __entry->low = dop->dop_low;
+       ),
+       TP_printk("dev %d:%d ops %p committed %d low %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->dop,
+                 __entry->committed,
+                 __entry->low)
+)
+#define DEFINE_DEFER_EVENT(name) \
+DEFINE_EVENT(xfs_defer_class, name, \
+       TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \
+       TP_ARGS(mp, dop))
+
+/*
+ * Event class for deferred-ops tracepoints that also report an error code.
+ *
+ * Records the device (0 when @mp is NULL), the address of @dop, the
+ * dop_committed / dop_low fields of @dop, and the @error value passed in.
+ *
+ * The format string carries no trailing newline: the trace output already
+ * terminates each event line.
+ */
+DECLARE_EVENT_CLASS(xfs_defer_error_class,
+       TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error),
+       TP_ARGS(mp, dop, error),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(void *, dop)
+               __field(bool, committed)
+               __field(bool, low)
+               __field(int, error)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp ? mp->m_super->s_dev : 0;
+               __entry->dop = dop;
+               __entry->committed = dop->dop_committed;
+               __entry->low = dop->dop_low;
+               __entry->error = error;
+       ),
+       TP_printk("dev %d:%d ops %p committed %d low %d err %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->dop,
+                 __entry->committed,
+                 __entry->low,
+                 __entry->error)
+)
+#define DEFINE_DEFER_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_defer_error_class, name, \
+       TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), \
+       TP_ARGS(mp, dop, error))
+
+/*
+ * Event class for tracepoints that take a pending deferred-work item.
+ *
+ * Records the device (0 when @mp is NULL) and, from @dfp, the type value of
+ * its dfp_type, the dfp_intent pointer, dfp_committed and dfp_count.
+ *
+ * The format string carries no trailing newline: the trace output already
+ * terminates each event line.
+ */
+DECLARE_EVENT_CLASS(xfs_defer_pending_class,
+       TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp),
+       TP_ARGS(mp, dfp),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(int, type)
+               __field(void *, intent)
+               __field(bool, committed)
+               __field(int, nr)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp ? mp->m_super->s_dev : 0;
+               __entry->type = dfp->dfp_type->type;
+               __entry->intent = dfp->dfp_intent;
+               __entry->committed = dfp->dfp_committed;
+               __entry->nr = dfp->dfp_count;
+       ),
+       TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->type,
+                 __entry->intent,
+                 __entry->committed,
+                 __entry->nr)
+)
+#define DEFINE_DEFER_PENDING_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_class, name, \
+       TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
+       TP_ARGS(mp, dfp))
+
+/*
+ * Event class for deferred work described by a physical extent.
+ *
+ * Records the device, the AG number, an integer op type, the AG block number
+ * and the extent length exactly as passed by the caller.  Note that the
+ * printed order is op, agno, agbno, len, which differs from the argument
+ * order (agno, type, agbno, len).
+ */
+DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                int type, xfs_agblock_t agbno, xfs_extlen_t len),
+       TP_ARGS(mp, agno, type, agbno, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(int, type)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->type = type;
+               __entry->agbno = agbno;
+               __entry->len = len;
+       ),
+       TP_printk("dev %d:%d op %d agno %u agbno %u len %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->type,
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len)
+);
+#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                int type, \
+                xfs_agblock_t bno, \
+                xfs_extlen_t len), \
+       TP_ARGS(mp, agno, type, bno, len))
+
+/*
+ * Event class for deferred work described by a file mapping.
+ *
+ * Records the device, AG number, op code, AG block number, inode number,
+ * fork selector, file offset, length and extent state as passed by the
+ * caller.  The inode number is printed under the label "owner", and the fork
+ * is printed as "attr" when whichfork equals XFS_ATTR_FORK and as "data" for
+ * every other value.
+ */
+DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                int op,
+                xfs_agblock_t agbno,
+                xfs_ino_t ino,
+                int whichfork,
+                xfs_fileoff_t offset,
+                xfs_filblks_t len,
+                xfs_exntst_t state),
+       TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_ino_t, ino)
+               __field(xfs_agblock_t, agbno)
+               __field(int, whichfork)
+               __field(xfs_fileoff_t, l_loff)
+               __field(xfs_filblks_t, l_len)
+               __field(xfs_exntst_t, l_state)
+               __field(int, op)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->ino = ino;
+               __entry->agbno = agbno;
+               __entry->whichfork = whichfork;
+               __entry->l_loff = offset;
+               __entry->l_len = len;
+               __entry->l_state = state;
+               __entry->op = op;
+       ),
+       TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->op,
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->ino,
+                 __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data",
+                 __entry->l_loff,
+                 __entry->l_len,
+                 __entry->l_state)
+);
+#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_map_extent_deferred_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                int op, \
+                xfs_agblock_t agbno, \
+                xfs_ino_t ino, \
+                int whichfork, \
+                xfs_fileoff_t offset, \
+                xfs_filblks_t len, \
+                xfs_exntst_t state), \
+       TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
+
+DEFINE_DEFER_EVENT(xfs_defer_init);
+DEFINE_DEFER_EVENT(xfs_defer_cancel);
+DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
+DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
+DEFINE_DEFER_EVENT(xfs_defer_finish);
+DEFINE_DEFER_EVENT(xfs_defer_finish_done);
+
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error);
+
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+
+#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
+
+/*
+ * rmap tracepoints
+ *
+ * Records the device, AG number, AG block number and length passed by the
+ * caller, plus the oi_owner, oi_offset and oi_flags fields of @oinfo.  When
+ * @unwritten is true, XFS_RMAP_UNWRITTEN is OR-ed into the recorded flags.
+ * @oinfo is dereferenced unconditionally.
+ */
+DECLARE_EVENT_CLASS(xfs_rmap_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
+                struct xfs_owner_info *oinfo),
+       TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+               __field(uint64_t, owner)
+               __field(uint64_t, offset)
+               __field(unsigned long, flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agbno = agbno;
+               __entry->len = len;
+               __entry->owner = oinfo->oi_owner;
+               __entry->offset = oinfo->oi_offset;
+               __entry->flags = oinfo->oi_flags;
+               if (unwritten)
+                       __entry->flags |= XFS_RMAP_UNWRITTEN;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len,
+                 __entry->owner,
+                 __entry->offset,
+                 __entry->flags)
+);
+#define DEFINE_RMAP_EVENT(name) \
+DEFINE_EVENT(xfs_rmap_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
+                struct xfs_owner_info *oinfo), \
+       TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
+
+/*
+ * simple AG-based error/%ip tracepoint class
+ *
+ * Records the device, the AG number, the error value and the caller's
+ * instruction pointer; the latter is printed with %ps.
+ */
+DECLARE_EVENT_CLASS(xfs_ag_error_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+                unsigned long caller_ip),
+       TP_ARGS(mp, agno, error, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(int, error)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->error = error;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d agno %u error %d caller %ps",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->error,
+                 (char *)__entry->caller_ip)
+);
+
+#define DEFINE_AG_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_ag_error_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
+                unsigned long caller_ip), \
+       TP_ARGS(mp, agno, error, caller_ip))
+
+DEFINE_RMAP_EVENT(xfs_rmap_unmap);
+DEFINE_RMAP_EVENT(xfs_rmap_unmap_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error);
+DEFINE_RMAP_EVENT(xfs_rmap_map);
+DEFINE_RMAP_EVENT(xfs_rmap_map_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error);
+DEFINE_RMAP_EVENT(xfs_rmap_convert);
+DEFINE_RMAP_EVENT(xfs_rmap_convert_done);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state);
+
+/*
+ * Event class for tracepoints that take the individual fields of an rmap
+ * record rather than an owner-info structure: device, AG number, AG block
+ * number, length, owner, offset and flags are recorded exactly as passed.
+ */
+DECLARE_EVENT_CLASS(xfs_rmapbt_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agblock_t agbno, xfs_extlen_t len,
+                uint64_t owner, uint64_t offset, unsigned int flags),
+       TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+               __field(uint64_t, owner)
+               __field(uint64_t, offset)
+               __field(unsigned int, flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agbno = agbno;
+               __entry->len = len;
+               __entry->owner = owner;
+               __entry->offset = offset;
+               __entry->flags = flags;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len,
+                 __entry->owner,
+                 __entry->offset,
+                 __entry->flags)
+);
+#define DEFINE_RMAPBT_EVENT(name) \
+DEFINE_EVENT(xfs_rmapbt_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                xfs_agblock_t agbno, xfs_extlen_t len, \
+                uint64_t owner, uint64_t offset, unsigned int flags), \
+       TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+
+#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
+DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
+DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
+
+DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
+DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);
+DEFINE_RMAPBT_EVENT(xfs_rmap_update);
+DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
+DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
+DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
index 9b2b9fa89331c10d57818218ccd8fc985a3325f6..e2bf86aad33dfaef40dd877ef12e32a7715cbabc 100644 (file)
@@ -33,6 +33,9 @@ struct xfs_trans;
 struct xfs_trans_res;
 struct xfs_dquot_acct;
 struct xfs_busy_extent;
+struct xfs_rud_log_item;
+struct xfs_rui_log_item;
+struct xfs_btree_cur;
 
 typedef struct xfs_log_item {
        struct list_head                li_ail;         /* AIL pointers */
@@ -210,17 +213,14 @@ void              xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
 void           xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
 void           xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
 void           xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
-struct xfs_efi_log_item        *xfs_trans_get_efi(xfs_trans_t *, uint);
-void           xfs_trans_log_efi_extent(xfs_trans_t *,
-                                        struct xfs_efi_log_item *,
-                                        xfs_fsblock_t,
-                                        xfs_extlen_t);
-struct xfs_efd_log_item        *xfs_trans_get_efd(xfs_trans_t *,
+
+void           xfs_extent_free_init_defer_op(void);
+struct xfs_efd_log_item        *xfs_trans_get_efd(struct xfs_trans *,
                                  struct xfs_efi_log_item *,
                                  uint);
 int            xfs_trans_free_extent(struct xfs_trans *,
                                      struct xfs_efd_log_item *, xfs_fsblock_t,
-                                     xfs_extlen_t);
+                                     xfs_extlen_t, struct xfs_owner_info *);
 int            xfs_trans_commit(struct xfs_trans *);
 int            __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
 int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
@@ -236,4 +236,16 @@ void               xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
 extern kmem_zone_t     *xfs_trans_zone;
 extern kmem_zone_t     *xfs_log_item_desc_zone;
 
+/* rmap updates */
+enum xfs_rmap_intent_type;
+
+void xfs_rmap_update_init_defer_op(void);
+struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp,
+               struct xfs_rui_log_item *ruip);
+int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
+               struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type,
+               __uint64_t owner, int whichfork, xfs_fileoff_t startoff,
+               xfs_fsblock_t startblock, xfs_filblks_t blockcount,
+               xfs_exntst_t state, struct xfs_btree_cur **pcur);
+
 #endif /* __XFS_TRANS_H__ */
index a96ae540eb629c86e15c004dc66eb60fbb6be90e..459ddec137a48a2aec19d57a739e0cdbef44ae70 100644 (file)
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_bit.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
-
-/*
- * This routine is called to allocate an "extent free intention"
- * log item that will hold nextents worth of extents.  The
- * caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-xfs_efi_log_item_t *
-xfs_trans_get_efi(xfs_trans_t  *tp,
-                 uint          nextents)
-{
-       xfs_efi_log_item_t      *efip;
-
-       ASSERT(tp != NULL);
-       ASSERT(nextents > 0);
-
-       efip = xfs_efi_init(tp->t_mountp, nextents);
-       ASSERT(efip != NULL);
-
-       /*
-        * Get a log_item_desc to point at the new item.
-        */
-       xfs_trans_add_item(tp, &efip->efi_item);
-       return efip;
-}
-
-/*
- * This routine is called to indicate that the described
- * extent is to be logged as needing to be freed.  It should
- * be called once for each extent to be freed.
- */
-void
-xfs_trans_log_efi_extent(xfs_trans_t           *tp,
-                        xfs_efi_log_item_t     *efip,
-                        xfs_fsblock_t          start_block,
-                        xfs_extlen_t           ext_len)
-{
-       uint                    next_extent;
-       xfs_extent_t            *extp;
-
-       tp->t_flags |= XFS_TRANS_DIRTY;
-       efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-
-       /*
-        * atomic_inc_return gives us the value after the increment;
-        * we want to use it as an array index so we need to subtract 1 from
-        * it.
-        */
-       next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
-       ASSERT(next_extent < efip->efi_format.efi_nextents);
-       extp = &(efip->efi_format.efi_extents[next_extent]);
-       extp->ext_start = start_block;
-       extp->ext_len = ext_len;
-}
-
+#include "xfs_bmap.h"
+#include "xfs_trace.h"
 
 /*
  * This routine is called to allocate an "extent free done"
@@ -88,12 +37,12 @@ xfs_trans_log_efi_extent(xfs_trans_t                *tp,
  * caller must use all nextents extents, because we are not
  * flexible about this at all.
  */
-xfs_efd_log_item_t *
-xfs_trans_get_efd(xfs_trans_t          *tp,
-                 xfs_efi_log_item_t    *efip,
-                 uint                  nextents)
+struct xfs_efd_log_item *
+xfs_trans_get_efd(struct xfs_trans             *tp,
+                 struct xfs_efi_log_item       *efip,
+                 uint                          nextents)
 {
-       xfs_efd_log_item_t      *efdp;
+       struct xfs_efd_log_item                 *efdp;
 
        ASSERT(tp != NULL);
        ASSERT(nextents > 0);
@@ -118,13 +67,19 @@ xfs_trans_free_extent(
        struct xfs_trans        *tp,
        struct xfs_efd_log_item *efdp,
        xfs_fsblock_t           start_block,
-       xfs_extlen_t            ext_len)
+       xfs_extlen_t            ext_len,
+       struct xfs_owner_info   *oinfo)
 {
+       struct xfs_mount        *mp = tp->t_mountp;
        uint                    next_extent;
+       xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, start_block);
+       xfs_agblock_t           agbno = XFS_FSB_TO_AGBNO(mp, start_block);
        struct xfs_extent       *extp;
        int                     error;
 
-       error = xfs_free_extent(tp, start_block, ext_len);
+       trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+
+       error = xfs_free_extent(tp, start_block, ext_len, oinfo);
 
        /*
         * Mark the transaction dirty, even on error. This ensures the
@@ -145,3 +100,139 @@ xfs_trans_free_extent(
 
        return error;
 }
+
+/*
+ * Sort extent free items by AG.
+ *
+ * Comparison callback installed as .diff_items for the extent-free deferred
+ * op type.  @priv is the xfs_mount; @a and @b are the xefi_list links of two
+ * struct xfs_extent_free_item work items.
+ *
+ * Returns a negative value if @a's start block lies in a lower-numbered AG
+ * than @b's, a positive value if it lies in a higher-numbered AG, and zero
+ * if both fall in the same AG.
+ *
+ * The AG numbers are compared explicitly instead of being subtracted: an
+ * AG-number difference squeezed into an int return value can end up with
+ * the wrong sign when the two AG numbers are far enough apart.
+ */
+static int
+xfs_extent_free_diff_items(
+       void                            *priv,
+       struct list_head                *a,
+       struct list_head                *b)
+{
+       struct xfs_mount                *mp = priv;
+       struct xfs_extent_free_item     *ra;
+       struct xfs_extent_free_item     *rb;
+       xfs_agnumber_t                  agno_a;
+       xfs_agnumber_t                  agno_b;
+
+       ra = container_of(a, struct xfs_extent_free_item, xefi_list);
+       rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+       agno_a = XFS_FSB_TO_AGNO(mp, ra->xefi_startblock);
+       agno_b = XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+
+       if (agno_a < agno_b)
+               return -1;
+       if (agno_a > agno_b)
+               return 1;
+       return 0;
+}
+
+/*
+ * Create an EFI for this transaction.
+ *
+ * Initialises an EFI via xfs_efi_init() with @count, adds its log item to
+ * @tp, and hands the EFI back as an opaque pointer.
+ */
+STATIC void *
+xfs_extent_free_create_intent(
+       struct xfs_trans                *tp,
+       unsigned int                    count)
+{
+       struct xfs_efi_log_item         *new_efi;
+
+       ASSERT(tp != NULL);
+       ASSERT(count > 0);
+
+       new_efi = xfs_efi_init(tp->t_mountp, count);
+       ASSERT(new_efi != NULL);
+
+       /* Get a log_item_desc to point at the new item. */
+       xfs_trans_add_item(tp, &new_efi->efi_item);
+       return new_efi;
+}
+
+/*
+ * Log a free extent to the intent item.
+ *
+ * @intent is the EFI being filled in and @item is the xefi_list link of the
+ * extent free work item whose start block and length get copied into it.
+ */
+STATIC void
+xfs_extent_free_log_item(
+       struct xfs_trans                *tp,
+       void                            *intent,
+       struct list_head                *item)
+{
+       struct xfs_efi_log_item         *efip = intent;
+       struct xfs_extent_free_item     *xefi =
+               container_of(item, struct xfs_extent_free_item, xefi_list);
+       struct xfs_extent               *slot;
+       uint                            idx;
+
+       /* Flag both the transaction and the EFI's descriptor as dirty. */
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+       /*
+        * Claim the next slot in the EFI.  atomic_inc_return() yields the
+        * counter value after the increment, so the array index is one
+        * less than what it returns.
+        */
+       idx = atomic_inc_return(&efip->efi_next_extent) - 1;
+       ASSERT(idx < efip->efi_format.efi_nextents);
+
+       slot = &efip->efi_format.efi_extents[idx];
+       slot->ext_start = xefi->xefi_startblock;
+       slot->ext_len = xefi->xefi_blockcount;
+}
+
+/*
+ * Get an EFD for the EFI passed in as @intent so that the free extents can
+ * be processed; thin wrapper around xfs_trans_get_efd().
+ */
+STATIC void *
+xfs_extent_free_create_done(
+       struct xfs_trans                *tp,
+       void                            *intent,
+       unsigned int                    count)
+{
+       struct xfs_efi_log_item         *efip = intent;
+
+       return xfs_trans_get_efd(tp, efip, count);
+}
+
+/*
+ * Process a free extent.
+ *
+ * Frees the extent described by the work item behind @item, using
+ * @done_item as the EFD, then releases the work item.  Returns whatever
+ * xfs_trans_free_extent() returned.
+ */
+STATIC int
+xfs_extent_free_finish_item(
+       struct xfs_trans                *tp,
+       struct xfs_defer_ops            *dop,
+       struct list_head                *item,
+       void                            *done_item,
+       void                            **state)
+{
+       struct xfs_extent_free_item     *xefi =
+               container_of(item, struct xfs_extent_free_item, xefi_list);
+       int                             ret;
+
+       ret = xfs_trans_free_extent(tp, done_item, xefi->xefi_startblock,
+                       xefi->xefi_blockcount, &xefi->xefi_oinfo);
+
+       /* The work item is released whether or not the free succeeded. */
+       kmem_free(xefi);
+       return ret;
+}
+
+/* Abort a pending EFI by releasing it through xfs_efi_release(). */
+STATIC void
+xfs_extent_free_abort_intent(
+       void                            *intent)
+{
+       struct xfs_efi_log_item         *efip = intent;
+
+       xfs_efi_release(efip);
+}
+
+/*
+ * Cancel a free extent: release the work item that embeds @item without
+ * doing anything else with it.
+ */
+STATIC void
+xfs_extent_free_cancel_item(
+       struct list_head                *item)
+{
+       kmem_free(container_of(item, struct xfs_extent_free_item, xefi_list));
+}
+
+/* Deferred-op callback table for extent frees (EFI/EFD handling). */
+static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+       .type           = XFS_DEFER_OPS_TYPE_FREE,
+       .max_items      = XFS_EFI_MAX_FAST_EXTENTS,
+
+       /* work item ordering */
+       .diff_items     = xfs_extent_free_diff_items,
+
+       /* intent (EFI) handling */
+       .create_intent  = xfs_extent_free_create_intent,
+       .log_item       = xfs_extent_free_log_item,
+       .abort_intent   = xfs_extent_free_abort_intent,
+
+       /* done (EFD) handling and work item disposal */
+       .create_done    = xfs_extent_free_create_done,
+       .finish_item    = xfs_extent_free_finish_item,
+       .cancel_item    = xfs_extent_free_cancel_item,
+};
+
+/* Register the extent-free callback table with xfs_defer_init_op_type(). */
+void
+xfs_extent_free_init_defer_op(void)
+{
+       const struct xfs_defer_op_type  *optype = &xfs_extent_free_defer_type;
+
+       xfs_defer_init_op_type(optype);
+}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
new file mode 100644 (file)
index 0000000..5a50ef8
--- /dev/null
@@ -0,0 +1,271 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_rmap_item.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+
+/*
+ * Set the map extent flags for this reverse mapping.
+ *
+ * me_flags is rebuilt from scratch: one operation bit chosen by @type, plus
+ * the attr-fork bit when @whichfork is XFS_ATTR_FORK and the unwritten bit
+ * when @state is XFS_EXT_UNWRITTEN.  An unrecognised @type trips an ASSERT.
+ */
+static void
+xfs_trans_set_rmap_flags(
+       struct xfs_map_extent           *rmap,
+       enum xfs_rmap_intent_type       type,
+       int                             whichfork,
+       xfs_exntst_t                    state)
+{
+       rmap->me_flags = 0;
+
+       /* Operation bit. */
+       switch (type) {
+       case XFS_RMAP_MAP:
+               rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
+               break;
+       case XFS_RMAP_UNMAP:
+               rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+               break;
+       case XFS_RMAP_CONVERT:
+               rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+               break;
+       case XFS_RMAP_ALLOC:
+               rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+               break;
+       case XFS_RMAP_FREE:
+               rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
+               break;
+       default:
+               ASSERT(0);
+               break;
+       }
+
+       /* Modifier bits. */
+       if (whichfork == XFS_ATTR_FORK)
+               rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+       if (state == XFS_EXT_UNWRITTEN)
+               rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+}
+
+/*
+ * Initialise an RUD for @ruip via xfs_rud_init(), add its log item to
+ * transaction @tp, and return it.
+ */
+struct xfs_rud_log_item *
+xfs_trans_get_rud(
+       struct xfs_trans                *tp,
+       struct xfs_rui_log_item         *ruip)
+{
+       struct xfs_rud_log_item         *new_rud;
+
+       new_rud = xfs_rud_init(tp->t_mountp, ruip);
+       xfs_trans_add_item(tp, &new_rud->rud_item);
+
+       return new_rud;
+}
+
+/*
+ * Finish one rmap update and log it to the RUD.
+ *
+ * The update itself is delegated to xfs_rmap_finish_one(); its return value
+ * is passed straight back to the caller.  Both the transaction and the RUD
+ * are marked dirty whether the update succeeded or failed, in support of
+ * the RUI/RUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_rmap_update(
+       struct xfs_trans                *tp,
+       struct xfs_rud_log_item         *rudp,
+       enum xfs_rmap_intent_type       type,
+       __uint64_t                      owner,
+       int                             whichfork,
+       xfs_fileoff_t                   startoff,
+       xfs_fsblock_t                   startblock,
+       xfs_filblks_t                   blockcount,
+       xfs_exntst_t                    state,
+       struct xfs_btree_cur            **pcur)
+{
+       int                             ret = xfs_rmap_finish_one(tp, type,
+                                               owner, whichfork, startoff,
+                                               startblock, blockcount, state,
+                                               pcur);
+
+       /*
+        * Dirty the transaction even when the update failed.  That way the
+        * transaction gets aborted, which:
+        *
+        * 1.) releases the RUI and frees the RUD
+        * 2.) shuts down the filesystem
+        */
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+       return ret;
+}
+
+/*
+ * Sort rmap intents by AG.
+ *
+ * Comparison callback installed as .diff_items for the rmap-update deferred
+ * op type.  @priv is the xfs_mount; @a and @b are the ri_list links of two
+ * struct xfs_rmap_intent work items.
+ *
+ * Returns a negative value if @a's start block lies in a lower-numbered AG
+ * than @b's, a positive value if it lies in a higher-numbered AG, and zero
+ * if both fall in the same AG.
+ *
+ * The AG numbers are compared explicitly instead of being subtracted: an
+ * AG-number difference squeezed into an int return value can end up with
+ * the wrong sign when the two AG numbers are far enough apart.
+ */
+static int
+xfs_rmap_update_diff_items(
+       void                            *priv,
+       struct list_head                *a,
+       struct list_head                *b)
+{
+       struct xfs_mount                *mp = priv;
+       struct xfs_rmap_intent          *ra;
+       struct xfs_rmap_intent          *rb;
+       xfs_agnumber_t                  agno_a;
+       xfs_agnumber_t                  agno_b;
+
+       ra = container_of(a, struct xfs_rmap_intent, ri_list);
+       rb = container_of(b, struct xfs_rmap_intent, ri_list);
+       agno_a = XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock);
+       agno_b = XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+
+       if (agno_a < agno_b)
+               return -1;
+       if (agno_a > agno_b)
+               return 1;
+       return 0;
+}
+
+/*
+ * Create an RUI for this transaction.
+ *
+ * Initialises an RUI via xfs_rui_init() with @count, adds its log item to
+ * @tp, and hands the RUI back as an opaque pointer.
+ */
+STATIC void *
+xfs_rmap_update_create_intent(
+       struct xfs_trans                *tp,
+       unsigned int                    count)
+{
+       struct xfs_rui_log_item         *new_rui;
+
+       ASSERT(tp != NULL);
+       ASSERT(count > 0);
+
+       new_rui = xfs_rui_init(tp->t_mountp, count);
+       ASSERT(new_rui != NULL);
+
+       /* Get a log_item_desc to point at the new item. */
+       xfs_trans_add_item(tp, &new_rui->rui_item);
+       return new_rui;
+}
+
+/*
+ * Log an rmap update in the intent item.
+ *
+ * @intent is the RUI being filled in and @item is the ri_list link of the
+ * rmap intent whose owner, mapping and flags get copied into it.
+ */
+STATIC void
+xfs_rmap_update_log_item(
+       struct xfs_trans                *tp,
+       void                            *intent,
+       struct list_head                *item)
+{
+       struct xfs_rui_log_item         *ruip = intent;
+       struct xfs_rmap_intent          *ri =
+               container_of(item, struct xfs_rmap_intent, ri_list);
+       struct xfs_map_extent           *slot;
+       uint                            idx;
+
+       /* Flag both the transaction and the RUI's descriptor as dirty. */
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+       /*
+        * Claim the next slot in the RUI.  atomic_inc_return() yields the
+        * counter value after the increment, so the array index is one
+        * less than what it returns.
+        */
+       idx = atomic_inc_return(&ruip->rui_next_extent) - 1;
+       ASSERT(idx < ruip->rui_format.rui_nextents);
+
+       slot = &ruip->rui_format.rui_extents[idx];
+       slot->me_owner = ri->ri_owner;
+       slot->me_startblock = ri->ri_bmap.br_startblock;
+       slot->me_startoff = ri->ri_bmap.br_startoff;
+       slot->me_len = ri->ri_bmap.br_blockcount;
+       xfs_trans_set_rmap_flags(slot, ri->ri_type, ri->ri_whichfork,
+                       ri->ri_bmap.br_state);
+}
+
+/*
+ * Get an RUD for the RUI passed in as @intent so that the deferred rmap
+ * updates can be processed.  @count is not used here.
+ */
+STATIC void *
+xfs_rmap_update_create_done(
+       struct xfs_trans                *tp,
+       void                            *intent,
+       unsigned int                    count)
+{
+       struct xfs_rui_log_item         *ruip = intent;
+
+       return xfs_trans_get_rud(tp, ruip);
+}
+
+/*
+ * Process a deferred rmap update.
+ *
+ * Applies the rmap intent behind @item, using @done_item as the RUD and
+ * @state as the slot holding the btree cursor pointer, then releases the
+ * intent.  Returns whatever xfs_trans_log_finish_rmap_update() returned.
+ */
+STATIC int
+xfs_rmap_update_finish_item(
+       struct xfs_trans                *tp,
+       struct xfs_defer_ops            *dop,
+       struct list_head                *item,
+       void                            *done_item,
+       void                            **state)
+{
+       struct xfs_rmap_intent          *ri =
+               container_of(item, struct xfs_rmap_intent, ri_list);
+       struct xfs_btree_cur            **pcur = (struct xfs_btree_cur **)state;
+       int                             ret;
+
+       ret = xfs_trans_log_finish_rmap_update(tp, done_item, ri->ri_type,
+                       ri->ri_owner, ri->ri_whichfork,
+                       ri->ri_bmap.br_startoff, ri->ri_bmap.br_startblock,
+                       ri->ri_bmap.br_blockcount, ri->ri_bmap.br_state,
+                       pcur);
+
+       /* The intent is released whether or not the update succeeded. */
+       kmem_free(ri);
+       return ret;
+}
+
+/*
+ * Clean up after processing deferred rmaps: pass @state (the btree cursor)
+ * and @error on to xfs_rmap_finish_one_cleanup().
+ */
+STATIC void
+xfs_rmap_update_finish_cleanup(
+       struct xfs_trans        *tp,
+       void                    *state,
+       int                     error)
+{
+       xfs_rmap_finish_one_cleanup(tp, state, error);
+}
+
+/* Abort a pending RUI by releasing it through xfs_rui_release(). */
+STATIC void
+xfs_rmap_update_abort_intent(
+       void                            *intent)
+{
+       struct xfs_rui_log_item         *ruip = intent;
+
+       xfs_rui_release(ruip);
+}
+
+/*
+ * Cancel a deferred rmap update: release the rmap intent that embeds @item
+ * without doing anything else with it.
+ */
+STATIC void
+xfs_rmap_update_cancel_item(
+       struct list_head                *item)
+{
+       kmem_free(container_of(item, struct xfs_rmap_intent, ri_list));
+}
+
+/* Deferred-op callback table for rmap updates (RUI/RUD handling). */
+static const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+       .type           = XFS_DEFER_OPS_TYPE_RMAP,
+       .max_items      = XFS_RUI_MAX_FAST_EXTENTS,
+
+       /* work item ordering */
+       .diff_items     = xfs_rmap_update_diff_items,
+
+       /* intent (RUI) handling */
+       .create_intent  = xfs_rmap_update_create_intent,
+       .log_item       = xfs_rmap_update_log_item,
+       .abort_intent   = xfs_rmap_update_abort_intent,
+
+       /* done (RUD) handling and work item disposal */
+       .create_done    = xfs_rmap_update_create_done,
+       .finish_item    = xfs_rmap_update_finish_item,
+       .finish_cleanup = xfs_rmap_update_finish_cleanup,
+       .cancel_item    = xfs_rmap_update_cancel_item,
+};
+
+/* Register the rmap-update callback table with xfs_defer_init_op_type(). */
+void
+xfs_rmap_update_init_defer_op(void)
+{
+       const struct xfs_defer_op_type  *optype = &xfs_rmap_update_defer_type;
+
+       xfs_defer_init_op_type(optype);
+}