fs: Fix page cache inconsistency when mixing buffered and AIO DIO

author Lukas Czerner <lczerner@redhat.com>

Thu, 21 Sep 2017 14:16:29 +0000 (08:16 -0600)

committer Jens Axboe <axboe@kernel.dk>

Mon, 25 Sep 2017 14:56:05 +0000 (08:56 -0600)
author Lukas Czerner <lczerner@redhat.com>
Thu, 21 Sep 2017 14:16:29 +0000 (08:16 -0600)
committer Jens Axboe <axboe@kernel.dk>
Mon, 25 Sep 2017 14:56:05 +0000 (08:56 -0600)
diff --git a/fs/direct-io.c b/fs/direct-io.c

index 5fa2211e49aee2186546d8db7a70c5295c7591f0..62cf812ed0e5803ac9148a9ea8a27f00efca7f2a 100644 (file)
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
  {
         loff_t offset = dio->iocb->ki_pos;
         ssize_t transferred = 0;
+       int err;
  
         /*
          * AIO submission can race with bio completion to get here while
@@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
         if (ret == 0)
                 ret = transferred;
  
+       /*
+        * Try again to invalidate clean pages which might have been cached by
+        * non-direct readahead, or faulted in by get_user_pages() if the source
+        * of the write was an mmap'ed region of the file we're writing.  Either
+        * one is a pretty crazy thing to do, so we don't support it 100%.  If
+        * this invalidation fails, tough, the write still worked...
+        */
+       if (ret > 0 && dio->op == REQ_OP_WRITE &&
+           dio->inode->i_mapping->nrpages) {
+               err = invalidate_inode_pages2_range(dio->inode->i_mapping,
+                                       offset >> PAGE_SHIFT,
+                                       (offset + ret - 1) >> PAGE_SHIFT);
+               WARN_ON_ONCE(err);
+       }
+
         if (dio->end_io) {
-               int err;
  
                 // XXX: ki_pos??
                 err = dio->end_io(dio->iocb, offset, ret, dio->private);
@@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
         struct dio *dio = bio->bi_private;
         unsigned long remaining;
         unsigned long flags;
+       bool defer_completion = false;
  
         /* cleanup the bio */
         dio_bio_complete(dio, bio);
@@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
         spin_unlock_irqrestore(&dio->bio_lock, flags);
  
         if (remaining == 0) {
-               if (dio->result && dio->defer_completion) {
+               /*
+                * Defer completion when defer_completion is set or
+                * when the inode has pages mapped and this is AIO write.
+                * We need to invalidate those pages because there is a
+                * chance they contain stale data in the case buffered IO
+                * went in between AIO submission and completion into the
+                * same region.
+                */
+               if (dio->result)
+                       defer_completion = dio->defer_completion ||
+                                          (dio->op == REQ_OP_WRITE &&
+                                           dio->inode->i_mapping->nrpages);
+               if (defer_completion) {
                         INIT_WORK(&dio->complete_work, dio_aio_complete_work);
                         queue_work(dio->inode->i_sb->s_dio_done_wq,
                                    &dio->complete_work);
@@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
          * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
          * so that we can call ->fsync.
          */
-       if (dio->is_async && iov_iter_rw(iter) == WRITE &&
-           ((iocb->ki_filp->f_flags & O_DSYNC) ||
-            IS_SYNC(iocb->ki_filp->f_mapping->host))) {
-               retval = dio_set_defer_completion(dio);
+       if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+               retval = 0;
+               if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+                   IS_SYNC(iocb->ki_filp->f_mapping->host))
+                       retval = dio_set_defer_completion(dio);
+               else if (!dio->inode->i_sb->s_dio_done_wq) {
+                       /*
+                        * In case of AIO write racing with buffered read we
+                        * need to defer completion. We can't decide this now,
+                        * however the workqueue needs to be initialized here.
+                        */
+                       retval = sb_init_dio_done_wq(dio->inode->i_sb);
+               }
                 if (retval) {
                         /*
                          * We grab i_mutex only for reads so we don't have
diff --git a/fs/iomap.c b/fs/iomap.c

index 269b24a01f3218ff9cdb9c1ef5254f3e5a64c3b1..8194d30bdca08e9cfa244e3a1df1df219ae1b806 100644 (file)
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -713,8 +713,24 @@ struct iomap_dio {
  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
  {
         struct kiocb *iocb = dio->iocb;
+       struct inode *inode = file_inode(iocb->ki_filp);
         ssize_t ret;
  
+       /*
+        * Try again to invalidate clean pages which might have been cached by
+        * non-direct readahead, or faulted in by get_user_pages() if the source
+        * of the write was an mmap'ed region of the file we're writing.  Either
+        * one is a pretty crazy thing to do, so we don't support it 100%.  If
+        * this invalidation fails, tough, the write still worked...
+        */
+       if (!dio->error &&
+           (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+               ret = invalidate_inode_pages2_range(inode->i_mapping,
+                               iocb->ki_pos >> PAGE_SHIFT,
+                               (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
+               WARN_ON_ONCE(ret);
+       }
+
         if (dio->end_io) {
                 ret = dio->end_io(iocb,
                                 dio->error ? dio->error : dio->size,
@@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
  
         ret = iomap_dio_complete(dio);
  
-       /*
-        * Try again to invalidate clean pages which might have been cached by
-        * non-direct readahead, or faulted in by get_user_pages() if the source
-        * of the write was an mmap'ed region of the file we're writing.  Either
-        * one is a pretty crazy thing to do, so we don't support it 100%.  If
-        * this invalidation fails, tough, the write still worked...
-        */
-       if (iov_iter_rw(iter) == WRITE) {
-               int err = invalidate_inode_pages2_range(mapping,
-                               start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-               WARN_ON_ONCE(err);
-       }
-
         return ret;
  
  out_free_dio:
diff --git a/mm/filemap.c b/mm/filemap.c

index 870971e209670c99a335bc05903f7f12326b9882..db250d0e05655a2e4b72e4f7a7834964a641c86d 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2926,9 +2926,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
          * we're writing.  Either one is a pretty crazy thing to do,
          * so we don't support it 100%.  If this invalidation
          * fails, tough, the write still worked...
+        *
+        * Most of the time we do not need this since dio_complete() will do
+        * the invalidation for us. However there are some file systems that
+        * do not end up with dio_complete() being called, so let's not break
+        * them by removing it completely
          */
-       invalidate_inode_pages2_range(mapping,
-                               pos >> PAGE_SHIFT, end);
+       if (mapping->nrpages)
+               invalidate_inode_pages2_range(mapping,
+                                       pos >> PAGE_SHIFT, end);
  
         if (written > 0) {
                 pos += written;
author	Lukas Czerner <lczerner@redhat.com>
	Thu, 21 Sep 2017 14:16:29 +0000 (08:16 -0600)
committer	Jens Axboe <axboe@kernel.dk>
	Mon, 25 Sep 2017 14:56:05 +0000 (08:56 -0600)
fs/direct-io.c		patch | blob | blame | history
fs/iomap.c		patch | blob | blame | history
mm/filemap.c		patch | blob | blame | history