Merge branch 'xfs-misc-fixes-4.6-4' into for-next

[mirror_ubuntu-bionic-kernel.git] / fs / xfs / xfs_aops.c
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c

index eed0bfc4443c293c9db36e32534a1142a801613b..7ba680e572fce603734721d998779b23b4005635 100644 (file)
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,21 @@
  #include <linux/pagevec.h>
  #include <linux/writeback.h>
  
+/* flags for direct write completions */
+#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
+#define XFS_DIO_FLAG_APPEND    (1 << 1)
+
+/*
+ * structure owned by writepages passed to individual writepage calls
+ */
+struct xfs_writepage_ctx {
+       struct xfs_bmbt_irec    imap;
+       bool                    imap_valid;
+       unsigned int            io_type;
+       struct xfs_ioend        *ioend;
+       sector_t                last_block;
+};
+
  void
  xfs_count_page_state(
         struct page             *page,
@@ -214,10 +229,12 @@ xfs_end_io(
         struct xfs_inode *ip = XFS_I(ioend->io_inode);
         int             error = 0;
  
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+       /*
+        * Set an error if the mount has shut down and proceed with end I/O
+        * processing so it can perform whatever cleanups are necessary.
+        */
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                 ioend->io_error = -EIO;
-               goto done;
-       }
  
         /*
          * For unwritten extents we need to issue transactions to convert a
@@ -265,7 +282,7 @@ xfs_alloc_ioend(
          */
         atomic_set(&ioend->io_remaining, 1);
         ioend->io_error = 0;
-       ioend->io_list = NULL;
+       INIT_LIST_HEAD(&ioend->io_list);
         ioend->io_type = type;
         ioend->io_inode = inode;
         ioend->io_buffer_head = NULL;
@@ -283,8 +300,7 @@ xfs_map_blocks(
         struct inode            *inode,
         loff_t                  offset,
         struct xfs_bmbt_irec    *imap,
-       int                     type,
-       int                     nonblocking)
+       int                     type)
  {
         struct xfs_inode        *ip = XFS_I(inode);
         struct xfs_mount        *mp = ip->i_mount;
@@ -300,12 +316,7 @@ xfs_map_blocks(
         if (type == XFS_IO_UNWRITTEN)
                 bmapi_flags |= XFS_BMAPI_IGSTATE;
  
-       if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-               if (nonblocking)
-                       return -EAGAIN;
-               xfs_ilock(ip, XFS_ILOCK_SHARED);
-       }
-
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
                (ip->i_df.if_flags & XFS_IFEXTENTS));
         ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -341,7 +352,7 @@ xfs_map_blocks(
         return 0;
  }
  
-STATIC int
+STATIC bool
  xfs_imap_valid(
         struct inode            *inode,
         struct xfs_bmbt_irec    *imap,
@@ -414,8 +425,7 @@ xfs_start_buffer_writeback(
  STATIC void
  xfs_start_page_writeback(
         struct page             *page,
-       int                     clear_dirty,
-       int                     buffers)
+       int                     clear_dirty)
  {
         ASSERT(PageLocked(page));
         ASSERT(!PageWriteback(page));
@@ -434,10 +444,6 @@ xfs_start_page_writeback(
                 set_page_writeback_keepwrite(page);
  
         unlock_page(page);
-
-       /* If no buffers on the page are to be written, finish it here */
-       if (!buffers)
-               end_page_writeback(page);
  }
  
  static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
  }
  
  /*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
   *
   * If @fail is non-zero, it means that we have a situation where some part of
   * the submission process has failed after we have marked paged for writeback
   * and unlocked them. In this situation, we need to fail the ioend chain rather
   * than submit it to IO. This typically only happens on a filesystem shutdown.
   */
-STATIC void
+STATIC int
  xfs_submit_ioend(
         struct writeback_control *wbc,
         xfs_ioend_t             *ioend,
-       int                     fail)
+       int                     status)
  {
-       xfs_ioend_t             *head = ioend;
-       xfs_ioend_t             *next;
         struct buffer_head      *bh;
         struct bio              *bio;
         sector_t                lastblock = 0;
  
-       /* Pass 1 - start writeback */
-       do {
-               next = ioend->io_list;
-               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
-                       xfs_start_buffer_writeback(bh);
-       } while ((ioend = next) != NULL);
+       /* Reserve log space if we might write beyond the on-disk inode size. */
+       if (!status &&
+            ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+               status = xfs_setfilesize_trans_alloc(ioend);
+       /*
+        * If we are failing the IO now, just mark the ioend with an
+        * error and finish it. This will run IO completion immediately
+        * as there is only one reference to the ioend at this point in
+        * time.
+        */
+       if (status) {
+               ioend->io_error = status;
+               xfs_finish_ioend(ioend);
+               return status;
+       }
  
-       /* Pass 2 - submit I/O */
-       ioend = head;
-       do {
-               next = ioend->io_list;
-               bio = NULL;
+       bio = NULL;
+       for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
  
-               /*
-                * If we are failing the IO now, just mark the ioend with an
-                * error and finish it. This will run IO completion immediately
-                * as there is only one reference to the ioend at this point in
-                * time.
-                */
-               if (fail) {
-                       ioend->io_error = fail;
-                       xfs_finish_ioend(ioend);
-                       continue;
+               if (!bio) {
+retry:
+                       bio = xfs_alloc_ioend_bio(bh);
+               } else if (bh->b_blocknr != lastblock + 1) {
+                       xfs_submit_ioend_bio(wbc, ioend, bio);
+                       goto retry;
                 }
  
-               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
-                       if (!bio) {
- retry:
-                               bio = xfs_alloc_ioend_bio(bh);
-                       } else if (bh->b_blocknr != lastblock + 1) {
-                               xfs_submit_ioend_bio(wbc, ioend, bio);
-                               goto retry;
-                       }
-
-                       if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
-                               xfs_submit_ioend_bio(wbc, ioend, bio);
-                               goto retry;
-                       }
-
-                       lastblock = bh->b_blocknr;
-               }
-               if (bio)
+               if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
                         xfs_submit_ioend_bio(wbc, ioend, bio);
-               xfs_finish_ioend(ioend);
-       } while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too.  Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
-       xfs_ioend_t             *ioend)
-{
-       xfs_ioend_t             *next;
-       struct buffer_head      *bh, *next_bh;
-
-       do {
-               next = ioend->io_list;
-               bh = ioend->io_buffer_head;
-               do {
-                       next_bh = bh->b_private;
-                       clear_buffer_async_write(bh);
-                       /*
-                        * The unwritten flag is cleared when added to the
-                        * ioend. We're not submitting for I/O so mark the
-                        * buffer unwritten again for next time around.
-                        */
-                       if (ioend->io_type == XFS_IO_UNWRITTEN)
-                               set_buffer_unwritten(bh);
-                       unlock_buffer(bh);
-               } while ((bh = next_bh) != NULL);
+                       goto retry;
+               }
  
-               mempool_free(ioend, xfs_ioend_pool);
-       } while ((ioend = next) != NULL);
+               lastblock = bh->b_blocknr;
+       }
+       if (bio)
+               xfs_submit_ioend_bio(wbc, ioend, bio);
+       xfs_finish_ioend(ioend);
+       return 0;
  }
  
  /*
   * Test to see if we've been building up a completion structure for
   * earlier buffers -- if so, we try to append to this ioend if we
   * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
+ * Queue the ioend we finished off on @iolist so that the caller can submit
+ * it once it has finished processing the dirty page.
   */
  STATIC void
  xfs_add_to_ioend(
         struct inode            *inode,
         struct buffer_head      *bh,
         xfs_off_t               offset,
-       unsigned int            type,
-       xfs_ioend_t             **result,
-       int                     need_ioend)
+       struct xfs_writepage_ctx *wpc,
+       struct list_head        *iolist)
  {
-       xfs_ioend_t             *ioend = *result;
-
-       if (!ioend || need_ioend || type != ioend->io_type) {
-               xfs_ioend_t     *previous = *result;
-
-               ioend = xfs_alloc_ioend(inode, type);
-               ioend->io_offset = offset;
-               ioend->io_buffer_head = bh;
-               ioend->io_buffer_tail = bh;
-               if (previous)
-                       previous->io_list = ioend;
-               *result = ioend;
+       if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+           bh->b_blocknr != wpc->last_block + 1 ||
+           offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
+               struct xfs_ioend        *new;
+
+               if (wpc->ioend)
+                       list_add(&wpc->ioend->io_list, iolist);
+
+               new = xfs_alloc_ioend(inode, wpc->io_type);
+               new->io_offset = offset;
+               new->io_buffer_head = bh;
+               new->io_buffer_tail = bh;
+               wpc->ioend = new;
         } else {
-               ioend->io_buffer_tail->b_private = bh;
-               ioend->io_buffer_tail = bh;
+               wpc->ioend->io_buffer_tail->b_private = bh;
+               wpc->ioend->io_buffer_tail = bh;
         }
  
         bh->b_private = NULL;
-       ioend->io_size += bh->b_size;
+       wpc->ioend->io_size += bh->b_size;
+       wpc->last_block = bh->b_blocknr;
+       xfs_start_buffer_writeback(bh);
  }
  
  STATIC void
@@ -678,183 +632,6 @@ xfs_check_page_type(
         return false;
  }
  
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
-       struct inode            *inode,
-       struct page             *page,
-       loff_t                  tindex,
-       struct xfs_bmbt_irec    *imap,
-       xfs_ioend_t             **ioendp,
-       struct writeback_control *wbc)
-{
-       struct buffer_head      *bh, *head;
-       xfs_off_t               end_offset;
-       unsigned long           p_offset;
-       unsigned int            type;
-       int                     len, page_dirty;
-       int                     count = 0, done = 0, uptodate = 1;
-       xfs_off_t               offset = page_offset(page);
-
-       if (page->index != tindex)
-               goto fail;
-       if (!trylock_page(page))
-               goto fail;
-       if (PageWriteback(page))
-               goto fail_unlock_page;
-       if (page->mapping != inode->i_mapping)
-               goto fail_unlock_page;
-       if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
-               goto fail_unlock_page;
-
-       /*
-        * page_dirty is initially a count of buffers on the page before
-        * EOF and is decremented as we move each into a cleanable state.
-        *
-        * Derivation:
-        *
-        * End offset is the highest offset that this page should represent.
-        * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-        * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-        * hence give us the correct page_dirty count. On any other page,
-        * it will be zero and in that case we need page_dirty to be the
-        * count of buffers on the page.
-        */
-       end_offset = min_t(unsigned long long,
-                       (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
-                       i_size_read(inode));
-
-       /*
-        * If the current map does not span the entire page we are about to try
-        * to write, then give up. The only way we can write a page that spans
-        * multiple mappings in a single writeback iteration is via the
-        * xfs_vm_writepage() function. Data integrity writeback requires the
-        * entire page to be written in a single attempt, otherwise the part of
-        * the page we don't write here doesn't get written as part of the data
-        * integrity sync.
-        *
-        * For normal writeback, we also don't attempt to write partial pages
-        * here as it simply means that write_cache_pages() will see it under
-        * writeback and ignore the page until some point in the future, at
-        * which time this will be the only page in the file that needs
-        * writeback.  Hence for more optimal IO patterns, we should always
-        * avoid partial page writeback due to multiple mappings on a page here.
-        */
-       if (!xfs_imap_valid(inode, imap, end_offset))
-               goto fail_unlock_page;
-
-       len = 1 << inode->i_blkbits;
-       p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-                                       PAGE_CACHE_SIZE);
-       p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-       page_dirty = p_offset / len;
-
-       /*
-        * The moment we find a buffer that doesn't match our current type
-        * specification or can't be written, abort the loop and start
-        * writeback. As per the above xfs_imap_valid() check, only
-        * xfs_vm_writepage() can handle partial page writeback fully - we are
-        * limited here to the buffers that are contiguous with the current
-        * ioend, and hence a buffer we can't write breaks that contiguity and
-        * we have to defer the rest of the IO to xfs_vm_writepage().
-        */
-       bh = head = page_buffers(page);
-       do {
-               if (offset >= end_offset)
-                       break;
-               if (!buffer_uptodate(bh))
-                       uptodate = 0;
-               if (!(PageUptodate(page) || buffer_uptodate(bh))) {
-                       done = 1;
-                       break;
-               }
-
-               if (buffer_unwritten(bh) || buffer_delay(bh) ||
-                   buffer_mapped(bh)) {
-                       if (buffer_unwritten(bh))
-                               type = XFS_IO_UNWRITTEN;
-                       else if (buffer_delay(bh))
-                               type = XFS_IO_DELALLOC;
-                       else
-                               type = XFS_IO_OVERWRITE;
-
-                       /*
-                        * imap should always be valid because of the above
-                        * partial page end_offset check on the imap.
-                        */
-                       ASSERT(xfs_imap_valid(inode, imap, offset));
-
-                       lock_buffer(bh);
-                       if (type != XFS_IO_OVERWRITE)
-                               xfs_map_at_offset(inode, bh, imap, offset);
-                       xfs_add_to_ioend(inode, bh, offset, type,
-                                        ioendp, done);
-
-                       page_dirty--;
-                       count++;
-               } else {
-                       done = 1;
-                       break;
-               }
-       } while (offset += len, (bh = bh->b_this_page) != head);
-
-       if (uptodate && bh == head)
-               SetPageUptodate(page);
-
-       if (count) {
-               if (--wbc->nr_to_write <= 0 &&
-                   wbc->sync_mode == WB_SYNC_NONE)
-                       done = 1;
-       }
-       xfs_start_page_writeback(page, !page_dirty, count);
-
-       return done;
- fail_unlock_page:
-       unlock_page(page);
- fail:
-       return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
-       struct inode            *inode,
-       pgoff_t                 tindex,
-       struct xfs_bmbt_irec    *imap,
-       xfs_ioend_t             **ioendp,
-       struct writeback_control *wbc,
-       pgoff_t                 tlast)
-{
-       struct pagevec          pvec;
-       int                     done = 0, i;
-
-       pagevec_init(&pvec, 0);
-       while (!done && tindex <= tlast) {
-               unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-               if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-                       break;
-
-               for (i = 0; i < pagevec_count(&pvec); i++) {
-                       done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-                                       imap, ioendp, wbc);
-                       if (done)
-                               break;
-               }
-
-               pagevec_release(&pvec);
-               cond_resched();
-       }
-}
-
  STATIC void
  xfs_vm_invalidatepage(
         struct page             *page,
@@ -931,6 +708,164 @@ out_invalidate:
         return;
  }
  
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding buffers to is cached on the writepage context, and if the new buffer
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected.  While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+static int
+xfs_writepage_map(
+       struct xfs_writepage_ctx *wpc,
+       struct writeback_control *wbc,
+       struct inode            *inode,
+       struct page             *page,
+       loff_t                  offset,
+       __uint64_t              end_offset)
+{
+       LIST_HEAD(submit_list);
+       struct xfs_ioend        *ioend, *next;
+       struct buffer_head      *bh, *head;
+       ssize_t                 len = 1 << inode->i_blkbits;
+       int                     error = 0;
+       int                     count = 0;
+       int                     uptodate = 1;
+
+       bh = head = page_buffers(page);
+       offset = page_offset(page);
+       do {
+               if (offset >= end_offset)
+                       break;
+               if (!buffer_uptodate(bh))
+                       uptodate = 0;
+
+               /*
+                * set_page_dirty dirties all buffers in a page, independent
+                * of their state.  The dirty state however is entirely
+                * meaningless for holes (!mapped && uptodate), so skip
+                * buffers covering holes here.
+                */
+               if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+                       wpc->imap_valid = false;
+                       continue;
+               }
+
+               if (buffer_unwritten(bh)) {
+                       if (wpc->io_type != XFS_IO_UNWRITTEN) {
+                               wpc->io_type = XFS_IO_UNWRITTEN;
+                               wpc->imap_valid = false;
+                       }
+               } else if (buffer_delay(bh)) {
+                       if (wpc->io_type != XFS_IO_DELALLOC) {
+                               wpc->io_type = XFS_IO_DELALLOC;
+                               wpc->imap_valid = false;
+                       }
+               } else if (buffer_uptodate(bh)) {
+                       if (wpc->io_type != XFS_IO_OVERWRITE) {
+                               wpc->io_type = XFS_IO_OVERWRITE;
+                               wpc->imap_valid = false;
+                       }
+               } else {
+                       if (PageUptodate(page))
+                               ASSERT(buffer_mapped(bh));
+                       /*
+                        * This buffer is not uptodate and will not be
+                        * written to disk.  Ensure that we will put any
+                        * subsequent writeable buffers into a new
+                        * ioend.
+                        */
+                       wpc->imap_valid = false;
+                       continue;
+               }
+
+               if (wpc->imap_valid)
+                       wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+                                                        offset);
+               if (!wpc->imap_valid) {
+                       error = xfs_map_blocks(inode, offset, &wpc->imap,
+                                            wpc->io_type);
+                       if (error)
+                               goto out;
+                       wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+                                                        offset);
+               }
+               if (wpc->imap_valid) {
+                       lock_buffer(bh);
+                       if (wpc->io_type != XFS_IO_OVERWRITE)
+                               xfs_map_at_offset(inode, bh, &wpc->imap, offset);
+                       xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+                       count++;
+               }
+
+       } while (offset += len, ((bh = bh->b_this_page) != head));
+
+       if (uptodate && bh == head)
+               SetPageUptodate(page);
+
+       ASSERT(wpc->ioend || list_empty(&submit_list));
+
+out:
+       /*
+        * On error, we have to fail the ioend here because we have locked
+        * buffers in the ioend. If we don't do this, we'll deadlock
+        * invalidating the page as that tries to lock the buffers on the page.
+        * Also, because we may have set pages under writeback, we have to make
+        * sure we run IO completion to mark the error state of the IO
+        * appropriately, so we can't cancel the ioend directly here. That means
+        * we have to mark this page as under writeback if we included any
+        * buffers from it in the ioend chain so that completion treats it
+        * correctly.
+        *
+        * If we didn't include the page in the ioend, then on error we can
+        * simply discard and unlock it as there are no other users of the page
+        * or its buffers right now. The caller will still need to trigger
+        * submission of outstanding ioends on the writepage context so they are
+        * treated correctly on error.
+        */
+       if (count) {
+               xfs_start_page_writeback(page, !error);
+
+               /*
+                * Preserve the original error if there was one, otherwise catch
+                * submission errors here and propagate into subsequent ioend
+                * submissions.
+                */
+               list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+                       int error2;
+
+                       list_del_init(&ioend->io_list);
+                       error2 = xfs_submit_ioend(wbc, ioend, error);
+                       if (error2 && !error)
+                               error = error2;
+               }
+       } else if (error) {
+               xfs_aops_discard_page(page);
+               ClearPageUptodate(page);
+               unlock_page(page);
+       } else {
+               /*
+                * We can end up here with no error and nothing to write if we
+                * race with a partial page truncate on a sub-page block sized
+                * filesystem. In that case we need to mark the page clean.
+                */
+               xfs_start_page_writeback(page, 1);
+               end_page_writeback(page);
+       }
+
+       mapping_set_error(page->mapping, error);
+       return error;
+}
+
  /*
   * Write out a dirty page.
   *
@@ -940,22 +875,16 @@ out_invalidate:
   * For any other dirty buffer heads on the page we should flush them.
   */
  STATIC int
-xfs_vm_writepage(
+xfs_do_writepage(
         struct page             *page,
-       struct writeback_control *wbc)
+       struct writeback_control *wbc,
+       void                    *data)
  {
+       struct xfs_writepage_ctx *wpc = data;
         struct inode            *inode = page->mapping->host;
-       struct buffer_head      *bh, *head;
-       struct xfs_bmbt_irec    imap;
-       xfs_ioend_t             *ioend = NULL, *iohead = NULL;
         loff_t                  offset;
-       unsigned int            type;
         __uint64_t              end_offset;
-       pgoff_t                 end_index, last_index;
-       ssize_t                 len;
-       int                     err, imap_valid = 0, uptodate = 1;
-       int                     count = 0;
-       int                     nonblocking = 0;
+       pgoff_t                 end_index;
  
         trace_xfs_writepage(inode, page, 0, 0);
  
@@ -982,12 +911,9 @@ xfs_vm_writepage(
         if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
                 goto redirty;
  
-       /* Is this page beyond the end of the file? */
-       offset = i_size_read(inode);
-       end_index = offset >> PAGE_CACHE_SHIFT;
-       last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
-
         /*
+        * Is this page beyond the end of the file?
+        *
          * The page index is less than the end_index, adjust the end_offset
          * to the highest offset that this page should represent.
          * -----------------------------------------------------
@@ -998,6 +924,8 @@ xfs_vm_writepage(
          * |     desired writeback range    |      see else    |
          * ---------------------------------^------------------|
          */
+       offset = i_size_read(inode);
+       end_index = offset >> PAGE_CACHE_SHIFT;
         if (page->index < end_index)
                 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
         else {
@@ -1049,152 +977,7 @@ xfs_vm_writepage(
                 end_offset = offset;
         }
  
-       len = 1 << inode->i_blkbits;
-
-       bh = head = page_buffers(page);
-       offset = page_offset(page);
-       type = XFS_IO_OVERWRITE;
-
-       if (wbc->sync_mode == WB_SYNC_NONE)
-               nonblocking = 1;
-
-       do {
-               int new_ioend = 0;
-
-               if (offset >= end_offset)
-                       break;
-               if (!buffer_uptodate(bh))
-                       uptodate = 0;
-
-               /*
-                * set_page_dirty dirties all buffers in a page, independent
-                * of their state.  The dirty state however is entirely
-                * meaningless for holes (!mapped && uptodate), so skip
-                * buffers covering holes here.
-                */
-               if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-                       imap_valid = 0;
-                       continue;
-               }
-
-               if (buffer_unwritten(bh)) {
-                       if (type != XFS_IO_UNWRITTEN) {
-                               type = XFS_IO_UNWRITTEN;
-                               imap_valid = 0;
-                       }
-               } else if (buffer_delay(bh)) {
-                       if (type != XFS_IO_DELALLOC) {
-                               type = XFS_IO_DELALLOC;
-                               imap_valid = 0;
-                       }
-               } else if (buffer_uptodate(bh)) {
-                       if (type != XFS_IO_OVERWRITE) {
-                               type = XFS_IO_OVERWRITE;
-                               imap_valid = 0;
-                       }
-               } else {
-                       if (PageUptodate(page))
-                               ASSERT(buffer_mapped(bh));
-                       /*
-                        * This buffer is not uptodate and will not be
-                        * written to disk.  Ensure that we will put any
-                        * subsequent writeable buffers into a new
-                        * ioend.
-                        */
-                       imap_valid = 0;
-                       continue;
-               }
-
-               if (imap_valid)
-                       imap_valid = xfs_imap_valid(inode, &imap, offset);
-               if (!imap_valid) {
-                       /*
-                        * If we didn't have a valid mapping then we need to
-                        * put the new mapping into a separate ioend structure.
-                        * This ensures non-contiguous extents always have
-                        * separate ioends, which is particularly important
-                        * for unwritten extent conversion at I/O completion
-                        * time.
-                        */
-                       new_ioend = 1;
-                       err = xfs_map_blocks(inode, offset, &imap, type,
-                                            nonblocking);
-                       if (err)
-                               goto error;
-                       imap_valid = xfs_imap_valid(inode, &imap, offset);
-               }
-               if (imap_valid) {
-                       lock_buffer(bh);
-                       if (type != XFS_IO_OVERWRITE)
-                               xfs_map_at_offset(inode, bh, &imap, offset);
-                       xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-                                        new_ioend);
-                       count++;
-               }
-
-               if (!iohead)
-                       iohead = ioend;
-
-       } while (offset += len, ((bh = bh->b_this_page) != head));
-
-       if (uptodate && bh == head)
-               SetPageUptodate(page);
-
-       xfs_start_page_writeback(page, 1, count);
-
-       /* if there is no IO to be submitted for this page, we are done */
-       if (!ioend)
-               return 0;
-
-       ASSERT(iohead);
-
-       /*
-        * Any errors from this point onwards need tobe reported through the IO
-        * completion path as we have marked the initial page as under writeback
-        * and unlocked it.
-        */
-       if (imap_valid) {
-               xfs_off_t               end_index;
-
-               end_index = imap.br_startoff + imap.br_blockcount;
-
-               /* to bytes */
-               end_index <<= inode->i_blkbits;
-
-               /* to pages */
-               end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
-
-               /* check against file size */
-               if (end_index > last_index)
-                       end_index = last_index;
-
-               xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-                                 wbc, end_index);
-       }
-
-
-       /*
-        * Reserve log space if we might write beyond the on-disk inode size.
-        */
-       err = 0;
-       if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
-               err = xfs_setfilesize_trans_alloc(ioend);
-
-       xfs_submit_ioend(wbc, iohead, err);
-
-       return 0;
-
-error:
-       if (iohead)
-               xfs_cancel_ioend(iohead);
-
-       if (err == -EAGAIN)
-               goto redirty;
-
-       xfs_aops_discard_page(page);
-       ClearPageUptodate(page);
-       unlock_page(page);
-       return err;
+       return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
  
  redirty:
         redirty_page_for_writepage(wbc, page);
@@ -1202,13 +985,37 @@ redirty:
         return 0;
  }
  
+STATIC int
+xfs_vm_writepage(
+       struct page             *page,
+       struct writeback_control *wbc)
+{
+       struct xfs_writepage_ctx wpc = {
+               .io_type = XFS_IO_INVALID,
+       };
+       int                     ret;
+
+       ret = xfs_do_writepage(page, wbc, &wpc);
+       if (wpc.ioend)
+               ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+       return ret;
+}
+
  STATIC int
  xfs_vm_writepages(
         struct address_space    *mapping,
         struct writeback_control *wbc)
  {
+       struct xfs_writepage_ctx wpc = {
+               .io_type = XFS_IO_INVALID,
+       };
+       int                     ret;
+
         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-       return generic_writepages(mapping, wbc);
+       ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
+       if (wpc.ioend)
+               ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+       return ret;
  }
  
  /*
@@ -1238,27 +1045,8 @@ xfs_vm_releasepage(
  }
  
  /*
- * When we map a DIO buffer, we may need to attach an ioend that describes the
- * type of write IO we are doing. This passes to the completion function the
- * operations it needs to perform. If the mapping is for an overwrite wholly
- * within the EOF then we don't need an ioend and so we don't allocate one.
- * This avoids the unnecessary overhead of allocating and freeing ioends for
- * workloads that don't require transactions on IO completion.
- *
- * If we get multiple mappings in a single IO, we might be mapping different
- * types. But because the direct IO can only have a single private pointer, we
- * need to ensure that:
- *
- * a) i) the ioend spans the entire region of unwritten mappings; or
- *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
- * b) if it contains unwritten extents, it is *permanently* marked as such
- *
- * We could do this by chaining ioends like buffered IO does, but we only
- * actually get one IO completion callback from the direct IO, and that spans
- * the entire IO regardless of how many mappings and IOs are needed to complete
- * the DIO. There is only going to be one reference to the ioend and its life
- * cycle is constrained by the DIO completion code. hence we don't need
- * reference counting here.
+ * When we map a DIO buffer, we may need to pass flags to
+ * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
   *
   * Note that for DIO, an IO to the highest supported file block offset (i.e.
   * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1266,68 +1054,26 @@ xfs_vm_releasepage(
   * extending the file size. We won't know for sure until IO completion is run
   * and the actual max write offset is communicated to the IO completion
   * routine.
- *
- * For DAX page faults, we are preparing to never see unwritten extents here,
- * nor should we ever extend the inode size. Hence we will soon have nothing to
- * do here for this case, ensuring we don't have to provide an IO completion
- * callback to free an ioend that we don't actually need for a fault into the
- * page at offset (2^63 - 1FSB) bytes.
   */
-
  static void
  xfs_map_direct(
         struct inode            *inode,
         struct buffer_head      *bh_result,
         struct xfs_bmbt_irec    *imap,
-       xfs_off_t               offset,
-       bool                    dax_fault)
+       xfs_off_t               offset)
  {
-       struct xfs_ioend        *ioend;
+       uintptr_t               *flags = (uintptr_t *)&bh_result->b_private;
         xfs_off_t               size = bh_result->b_size;
-       int                     type;
-
-       if (ISUNWRITTEN(imap))
-               type = XFS_IO_UNWRITTEN;
-       else
-               type = XFS_IO_OVERWRITE;
-
-       trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
  
-       if (dax_fault) {
-               ASSERT(type == XFS_IO_OVERWRITE);
-               trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
-                                           imap);
-               return;
-       }
+       trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
+               ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
  
-       if (bh_result->b_private) {
-               ioend = bh_result->b_private;
-               ASSERT(ioend->io_size > 0);
-               ASSERT(offset >= ioend->io_offset);
-               if (offset + size > ioend->io_offset + ioend->io_size)
-                       ioend->io_size = offset - ioend->io_offset + size;
-
-               if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
-                       ioend->io_type = XFS_IO_UNWRITTEN;
-
-               trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
-                                             ioend->io_size, ioend->io_type,
-                                             imap);
-       } else if (type == XFS_IO_UNWRITTEN ||
-                  offset + size > i_size_read(inode) ||
-                  offset + size < 0) {
-               ioend = xfs_alloc_ioend(inode, type);
-               ioend->io_offset = offset;
-               ioend->io_size = size;
-
-               bh_result->b_private = ioend;
+       if (ISUNWRITTEN(imap)) {
+               *flags |= XFS_DIO_FLAG_UNWRITTEN;
+               set_buffer_defer_completion(bh_result);
+       } else if (offset + size > i_size_read(inode) || offset + size < 0) {
+               *flags |= XFS_DIO_FLAG_APPEND;
                 set_buffer_defer_completion(bh_result);
-
-               trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
-                                          imap);
-       } else {
-               trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
-                                           imap);
         }
  }
  
@@ -1498,9 +1244,12 @@ __xfs_get_blocks(
                 if (ISUNWRITTEN(&imap))
                         set_buffer_unwritten(bh_result);
                 /* direct IO needs special help */
-               if (create && direct)
-                       xfs_map_direct(inode, bh_result, &imap, offset,
-                                      dax_fault);
+               if (create && direct) {
+                       if (dax_fault)
+                               ASSERT(!ISUNWRITTEN(&imap));
+                       else
+                               xfs_map_direct(inode, bh_result, &imap, offset);
+               }
         }
  
         /*
@@ -1570,42 +1319,50 @@ xfs_get_blocks_dax_fault(
         return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
  }
  
-static void
-__xfs_end_io_direct_write(
-       struct inode            *inode,
-       struct xfs_ioend        *ioend,
+/*
+ * Complete a direct I/O write request.
+ *
+ * xfs_map_direct passes us some flags in the private data to tell us what to
+ * do.  If no flags are set, then the write IO is an overwrite wholly within
+ * the existing allocated file size and so there is nothing for us to do.
+ *
+ * Note that in this case the completion can be called in interrupt context,
+ * whereas if we have flags set we will always be called in task context
+ * (i.e. from a workqueue).
+ */
+STATIC int
+xfs_end_io_direct_write(
+       struct kiocb            *iocb,
         loff_t                  offset,
-       ssize_t                 size)
+       ssize_t                 size,
+       void                    *private)
  {
-       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       uintptr_t               flags = (uintptr_t)private;
+       int                     error = 0;
  
-       if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
-               goto out_end_io;
+       trace_xfs_end_io_direct_write(ip, offset, size);
  
-       /*
-        * dio completion end_io functions are only called on writes if more
-        * than 0 bytes was written.
-        */
-       ASSERT(size > 0);
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
  
-       /*
-        * The ioend only maps whole blocks, while the IO may be sector aligned.
-        * Hence the ioend offset/size may not match the IO offset/size exactly.
-        * Because we don't map overwrites within EOF into the ioend, the offset
-        * may not match, but only if the endio spans EOF.  Either way, write
-        * the IO sizes into the ioend so that completion processing does the
-        * right thing.
-        */
-       ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
-       ioend->io_size = size;
-       ioend->io_offset = offset;
+       if (size <= 0)
+               return size;
  
         /*
-        * The ioend tells us whether we are doing unwritten extent conversion
+        * The flags tell us whether we are doing unwritten extent conversions
          * or an append transaction that updates the on-disk file size. These
          * cases are the only cases where we should *potentially* be needing
          * to update the VFS inode size.
-        *
+        */
+       if (flags == 0) {
+               ASSERT(offset + size <= i_size_read(inode));
+               return 0;
+       }
+
+       /*
          * We need to update the in-core inode size here so that we don't end up
          * with the on-disk inode size being outside the in-core inode size. We
          * have no other method of updating EOF for AIO, so always do it here
@@ -1616,91 +1373,56 @@ __xfs_end_io_direct_write(
          * here can result in EOF moving backwards and Bad Things Happen when
          * that occurs.
          */
-       spin_lock(&XFS_I(inode)->i_flags_lock);
+       spin_lock(&ip->i_flags_lock);
         if (offset + size > i_size_read(inode))
                 i_size_write(inode, offset + size);
-       spin_unlock(&XFS_I(inode)->i_flags_lock);
+       spin_unlock(&ip->i_flags_lock);
  
-       /*
-        * If we are doing an append IO that needs to update the EOF on disk,
-        * do the transaction reserve now so we can use common end io
-        * processing. Stashing the error (if there is one) in the ioend will
-        * result in the ioend processing passing on the error if it is
-        * possible as we can't return it from here.
-        */
-       if (ioend->io_type == XFS_IO_OVERWRITE)
-               ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+       if (flags & XFS_DIO_FLAG_UNWRITTEN) {
+               trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
  
-out_end_io:
-       xfs_end_io(&ioend->io_work);
-       return;
-}
+               error = xfs_iomap_write_unwritten(ip, offset, size);
+       } else if (flags & XFS_DIO_FLAG_APPEND) {
+               struct xfs_trans *tp;
  
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-       struct kiocb            *iocb,
-       loff_t                  offset,
-       ssize_t                 size,
-       void                    *private)
-{
-       struct inode            *inode = file_inode(iocb->ki_filp);
-       struct xfs_ioend        *ioend = private;
+               trace_xfs_end_io_direct_write_append(ip, offset, size);
  
-       trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
-                                    ioend ? ioend->io_type : 0, NULL);
-
-       if (!ioend) {
-               ASSERT(offset + size <= i_size_read(inode));
-               return;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+               if (error) {
+                       xfs_trans_cancel(tp);
+                       return error;
+               }
+               error = xfs_setfilesize(ip, tp, offset, size);
         }
  
-       __xfs_end_io_direct_write(inode, ioend, offset, size);
+       return error;
  }
  
-static inline ssize_t
-xfs_vm_do_dio(
-       struct inode            *inode,
+STATIC ssize_t
+xfs_vm_direct_IO(
         struct kiocb            *iocb,
         struct iov_iter         *iter,
-       loff_t                  offset,
-       void                    (*endio)(struct kiocb   *iocb,
-                                        loff_t         offset,
-                                        ssize_t        size,
-                                        void           *private),
-       int                     flags)
+       loff_t                  offset)
  {
+       struct inode            *inode = iocb->ki_filp->f_mapping->host;
+       dio_iodone_t            *endio = NULL;
+       int                     flags = 0;
         struct block_device     *bdev;
  
-       if (IS_DAX(inode))
+       if (iov_iter_rw(iter) == WRITE) {
+               endio = xfs_end_io_direct_write;
+               flags = DIO_ASYNC_EXTEND;
+       }
+
+       if (IS_DAX(inode)) {
                 return dax_do_io(iocb, inode, iter, offset,
                                  xfs_get_blocks_direct, endio, 0);
+       }
  
         bdev = xfs_find_bdev_for_inode(inode);
         return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-                                    xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-STATIC ssize_t
-xfs_vm_direct_IO(
-       struct kiocb            *iocb,
-       struct iov_iter         *iter,
-       loff_t                  offset)
-{
-       struct inode            *inode = iocb->ki_filp->f_mapping->host;
-
-       if (iov_iter_rw(iter) == WRITE)
-               return xfs_vm_do_dio(inode, iocb, iter, offset,
-                                    xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
-       return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
+                       xfs_get_blocks_direct, endio, NULL, flags);
  }
  
  /*
@@ -1784,15 +1506,23 @@ xfs_vm_write_failed(
                 if (block_start >= to)
                         break;
  
-               if (!buffer_delay(bh))
+               /*
+                * Process delalloc and unwritten buffers beyond EOF. We can
+                * encounter unwritten buffers in the event that a file has
+                * post-EOF unwritten extents and an extending write happens to
+                * fail (e.g., an unaligned write that also involves a delalloc
+                * to the same page).
+                */
+               if (!buffer_delay(bh) && !buffer_unwritten(bh))
                         continue;
  
                 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
                     block_offset < i_size_read(inode))
                         continue;
  
-               xfs_vm_kill_delalloc_range(inode, block_offset,
-                                          block_offset + bh->b_size);
+               if (buffer_delay(bh))
+                       xfs_vm_kill_delalloc_range(inode, block_offset,
+                                                  block_offset + bh->b_size);
  
                 /*
                  * This buffer does not contain data anymore. make sure anyone
@@ -1803,6 +1533,7 @@ xfs_vm_write_failed(
                 clear_buffer_mapped(bh);
                 clear_buffer_new(bh);
                 clear_buffer_dirty(bh);
+               clear_buffer_unwritten(bh);
         }
  
  }