btrfs: Get rid of the confusing btrfs_file_extent_inline_len

[mirror_ubuntu-bionic-kernel.git] / fs / btrfs / file.c
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index f80254d82f409bedc91bbef14364726beeea174c..e489b879d22654c9dfcf854a4922f2f118a3f1ed 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -477,6 +477,47 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
         }
  }
  
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+                                        const u64 start,
+                                        const u64 len,
+                                        struct extent_state **cached_state)
+{ /* Mark every hole in [start, start + len - 1] with EXTENT_DELALLOC_NEW. */
+       u64 search_start = start;
+       const u64 end = start + len - 1;        /* inclusive last offset */
+       /* Walk the range one extent map at a time. */
+       while (search_start < end) {
+               const u64 search_len = end - search_start + 1;
+               struct extent_map *em;
+               u64 em_len;
+               int ret = 0;
+
+               em = btrfs_get_extent(inode, NULL, 0, search_start,
+                                     search_len, 0);
+               if (IS_ERR(em))
+                       return PTR_ERR(em);     /* propagate the lookup error */
+               /* Only holes are tagged; any other extent map is skipped. */
+               if (em->block_start != EXTENT_MAP_HOLE)
+                       goto next;
+               /* Trim the hole to the part inside the remaining range. */
+               em_len = em->len;
+               if (em->start < search_start)
+                       em_len -= search_start - em->start;
+               if (em_len > search_len)
+                       em_len = search_len;
+
+               ret = set_extent_bit(&inode->io_tree, search_start,
+                                    search_start + em_len - 1,
+                                    EXTENT_DELALLOC_NEW,
+                                    NULL, cached_state, GFP_NOFS);
+next:
+               search_start = extent_map_end(em);      /* resume after this map */
+               free_extent_map(em);
+               if (ret)        /* checked only after em has been released */
+                       return ret;
+       }
+       return 0;       /* lookup/set_extent_bit() errors were returned above */
+}
+
  /*
   * after copy_from_user, pages need to be dirtied and we need to make
   * sure holes are created between the current EOF and the start of
@@ -497,14 +538,43 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
         u64 end_of_last_block;
         u64 end_pos = pos + write_bytes;
         loff_t isize = i_size_read(inode);
+       unsigned int extra_bits = 0;
  
         start_pos = pos & ~((u64) fs_info->sectorsize - 1);
         num_bytes = round_up(write_bytes + pos - start_pos,
                              fs_info->sectorsize);
  
         end_of_last_block = start_pos + num_bytes - 1;
+
+       /*
+        * The pages may already be dirty; clear out the old accounting so we
+        * can set things up properly.
+        */
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+                        EXTENT_DIRTY | EXTENT_DELALLOC |
+                        EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached,
+                        GFP_NOFS);
+
+       if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
+               if (start_pos >= isize &&
+                   !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
+                       /*
+                        * There can't be any extents following eof in this case
+                        * so just set the delalloc new bit for the range
+                        * directly.
+                        */
+                       extra_bits |= EXTENT_DELALLOC_NEW;
+               } else {
+                       err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
+                                                           start_pos,
+                                                           num_bytes, cached);
+                       if (err)
+                               return err;
+               }
+       }
+
         err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-                                       cached, 0);
+                                       extra_bits, cached, 0);
         if (err)
                 return err;
  
@@ -784,8 +854,7 @@ next_slot:
                                 btrfs_file_extent_num_bytes(leaf, fi);
                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                         extent_end = key.offset +
-                               btrfs_file_extent_inline_len(leaf,
-                                                    path->slots[0], fi);
+                               btrfs_file_extent_ram_bytes(leaf, fi);
                 } else {
                         /* can't happen */
                         BUG();
@@ -1404,47 +1473,6 @@ fail:
  
  }
  
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
-                                        const u64 start,
-                                        const u64 len,
-                                        struct extent_state **cached_state)
-{
-       u64 search_start = start;
-       const u64 end = start + len - 1;
-
-       while (search_start < end) {
-               const u64 search_len = end - search_start + 1;
-               struct extent_map *em;
-               u64 em_len;
-               int ret = 0;
-
-               em = btrfs_get_extent(inode, NULL, 0, search_start,
-                                     search_len, 0);
-               if (IS_ERR(em))
-                       return PTR_ERR(em);
-
-               if (em->block_start != EXTENT_MAP_HOLE)
-                       goto next;
-
-               em_len = em->len;
-               if (em->start < search_start)
-                       em_len -= search_start - em->start;
-               if (em_len > search_len)
-                       em_len = search_len;
-
-               ret = set_extent_bit(&inode->io_tree, search_start,
-                                    search_start + em_len - 1,
-                                    EXTENT_DELALLOC_NEW,
-                                    NULL, cached_state, GFP_NOFS);
-next:
-               search_start = extent_map_end(em);
-               free_extent_map(em);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
  /*
   * This function locks the extent and properly waits for data=ordered extents
   * to finish before allowing the pages to be modified if need.
@@ -1473,10 +1501,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
                 + round_up(pos + write_bytes - start_pos,
                            fs_info->sectorsize) - 1;
  
-       if (start_pos < inode->vfs_inode.i_size ||
-           (inode->flags & BTRFS_INODE_PREALLOC)) {
+       if (start_pos < inode->vfs_inode.i_size) {
                 struct btrfs_ordered_extent *ordered;
-               unsigned int clear_bits;
  
                 lock_extent_bits(&inode->io_tree, start_pos, last_pos,
                                 cached_state);
@@ -1498,27 +1524,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
                 }
                 if (ordered)
                         btrfs_put_ordered_extent(ordered);
-               ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
-                                                   last_pos - start_pos + 1,
-                                                   cached_state);
-               clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
-                       EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
-               if (ret)
-                       clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
-               clear_extent_bit(&inode->io_tree, start_pos,
-                                last_pos, clear_bits,
-                                (clear_bits & EXTENT_LOCKED) ? 1 : 0,
-                                0, cached_state, GFP_NOFS);
-               if (ret)
-                       return ret;
+
                 *lockstart = start_pos;
                 *lockend = last_pos;
                 ret = 1;
         }
  
+       /*
+        * It's possible the pages are dirty right now, but we don't want
+        * to clean them yet because copy_from_user may catch a page fault
+        * and we might have to fall back to one page at a time.  If that
+        * happens, we'll unlock these pages and we'd have a window where
+        * reclaim could sneak in and drop the once-dirty page on the floor
+        * without writing it.
+        *
+        * We have the pages locked and the extent range locked, so there's
+        * no way someone can start IO on any dirty pages in this range.
+        *
+        * We'll call btrfs_dirty_pages() later on, and that will flip around
+        * delalloc bits and dirty the pages as required.
+        */
         for (i = 0; i < num_pages; i++) {
-               if (clear_page_dirty_for_io(pages[i]))
-                       account_page_redirty(pages[i]);
                 set_page_extent_mapped(pages[i]);
                 WARN_ON(!PageLocked(pages[i]));
         }
@@ -1580,7 +1606,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct page **pages = NULL;
-       struct extent_state *cached_state = NULL;
         struct extent_changeset *data_reserved = NULL;
         u64 release_bytes = 0;
         u64 lockstart;
@@ -1601,6 +1626,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
  
         while (iov_iter_count(i) > 0) {
                 size_t offset = pos & (PAGE_SIZE - 1);
+               struct extent_state *cached_state = NULL;
                 size_t sector_offset;
                 size_t write_bytes = min(iov_iter_count(i),
                                          nrptrs * (size_t)PAGE_SIZE -
@@ -1625,6 +1651,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                         break;
                 }
  
+               only_release_metadata = false;
                 sector_offset = pos & (fs_info->sectorsize - 1);
                 reserve_bytes = round_up(write_bytes + sector_offset,
                                 fs_info->sectorsize);
@@ -1747,10 +1774,21 @@ again:
                 if (copied > 0)
                         ret = btrfs_dirty_pages(inode, pages, dirty_pages,
                                                 pos, copied, NULL);
+
+               /*
+                * If we have not locked the extent range, because the range's
+                * start offset is >= i_size, we might still have a non-NULL
+                * cached extent state, acquired while marking the extent range
+                * as delalloc through btrfs_dirty_pages(). Therefore free any
+                * possible cached extent state to avoid a memory leak.
+                */
                 if (extents_locked)
                         unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                              lockstart, lockend, &cached_state,
                                              GFP_NOFS);
+               else
+                       free_extent_state(cached_state);
+
                 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
                 if (ret) {
                         btrfs_drop_pages(pages, num_pages);
@@ -1770,7 +1808,6 @@ again:
                         set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
                                        lockend, EXTENT_NORESERVE, NULL,
                                        NULL, GFP_NOFS);
-                       only_release_metadata = false;
                 }
  
                 btrfs_drop_pages(pages, num_pages);
@@ -1874,7 +1911,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
         bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
         ssize_t err;
         loff_t pos;
-       size_t count = iov_iter_count(from);
+       size_t count;
         loff_t oldsize;
         int clean_page = 0;
  
@@ -1882,9 +1919,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
             (iocb->ki_flags & IOCB_NOWAIT))
                 return -EOPNOTSUPP;
  
-       if (!inode_trylock(inode)) {
-               if (iocb->ki_flags & IOCB_NOWAIT)
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+               if (!inode_trylock(inode))
                         return -EAGAIN;
+       } else {
                 inode_lock(inode);
         }
  
@@ -1895,6 +1933,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
         }
  
         pos = iocb->ki_pos;
+       count = iov_iter_count(from);
         if (iocb->ki_flags & IOCB_NOWAIT) {
                 /*
                  * We will allocate space in case nodatacow is not set,
@@ -2010,10 +2049,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
  static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
  {
         int ret;
+       struct blk_plug plug;
  
+       /*
+        * This is only called in fsync, which would do synchronous writes, so
+        * a plug can merge adjacent IOs as much as possible.  Especially with
+        * multiple disks using a RAID profile, a large IO can be split into
+        * several segments of stripe length (currently 64K).
+        */
+       blk_start_plug(&plug);
         atomic_inc(&BTRFS_I(inode)->sync_writers);
         ret = btrfs_fdatawrite_range(inode, start, end);
         atomic_dec(&BTRFS_I(inode)->sync_writers);
+       blk_finish_plug(&plug);
  
         return ret;
  }
@@ -2041,6 +2089,18 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         bool full_sync = false;
         u64 len;
  
+       /*
+        * If the inode needs a full sync, make sure we use a full range to
+        * avoid log tree corruption, due to hole detection racing with ordered
+        * extent completion for adjacent ranges, and assertion failures during
+        * hole detection.
+        */
+       if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                    &BTRFS_I(inode)->runtime_flags)) {
+               start = 0;
+               end = LLONG_MAX;
+       }
+
         /*
          * The range length can be represented by u64, we have to do the typecasts
          * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
@@ -2048,6 +2108,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         len = (u64)end - (u64)start + 1;
         trace_btrfs_sync_file(file, datasync);
  
+       btrfs_init_log_ctx(&ctx, inode);
+
+       /*
+        * Before we acquired the inode's lock, someone may have dirtied more
+        * pages in the target range. We need to make sure that writeback for
+        * any such pages does not start while we are logging the inode, because
+        * if it does, any of the following might happen when we are not doing a
+        * full inode sync:
+        *
+        * 1) We log an extent after its writeback finishes but before its
+        *    checksums are added to the csum tree, leading to -EIO errors
+        *    when attempting to read the extent after a log replay.
+        *
+        * 2) We can end up logging an extent before its writeback finishes.
+        *    Therefore after the log replay we will have a file extent item
+        *    pointing to an unwritten extent (and no data checksums as well).
+        *
+        * So trigger writeback for any eventual new dirty pages and then we
+        * wait for all ordered extents to complete below.
+        */
+       ret = start_ordered_ops(inode, start, end);
+       if (ret) {
+               /* The inode lock is only taken further down; don't unlock. */
+               goto out;
+       }
+
         /*
          * We write the dirty pages in the range and wait until they complete
          * out of the ->i_mutex. If so, we can flush the dirty pages by
@@ -2059,6 +2145,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 goto out;
  
         inode_lock(inode);
+
+       /*
+        * We take the dio_sem here because the tree log stuff can race with
+        * lockless dio writes and get an extent map logged for an extent we
+        * never waited on.  We need it this high up for lockdep reasons.
+        */
+       down_write(&BTRFS_I(inode)->dio_sem);
+
         atomic_inc(&root->log_batch);
         full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                              &BTRFS_I(inode)->runtime_flags);
@@ -2110,6 +2204,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 ret = start_ordered_ops(inode, start, end);
         }
         if (ret) {
+               up_write(&BTRFS_I(inode)->dio_sem);
                 inode_unlock(inode);
                 goto out;
         }
@@ -2165,6 +2260,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                  * checked called fsync.
                  */
                 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+               up_write(&BTRFS_I(inode)->dio_sem);
                 inode_unlock(inode);
                 goto out;
         }
@@ -2189,13 +2285,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         trans = btrfs_start_transaction(root, 0);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
+               up_write(&BTRFS_I(inode)->dio_sem);
                 inode_unlock(inode);
                 goto out;
         }
         trans->sync = true;
  
-       btrfs_init_log_ctx(&ctx, inode);
-
         ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
         if (ret < 0) {
                 /* Fallthrough and commit/free transaction. */
@@ -2212,6 +2307,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
          * file again, but that will end up using the synchronization
          * inside btrfs_sync_log to keep things safe.
          */
+       up_write(&BTRFS_I(inode)->dio_sem);
         inode_unlock(inode);
  
         /*
@@ -2253,6 +2349,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 ret = btrfs_end_transaction(trans);
         }
  out:
+       ASSERT(list_empty(&ctx.list));
         err = file_check_and_advance_wb_err(file);
         if (!ret)
                 ret = err;
@@ -2742,6 +2839,11 @@ out_only_mutex:
                  * for detecting, at fsync time, if the inode isn't yet in the
                  * log tree or it's there but not up to date.
                  */
+               struct timespec now = current_time(inode);
+
+               inode_inc_iversion(inode);
+               inode->i_mtime = now;
+               inode->i_ctime = now;
                 trans = btrfs_start_transaction(root, 1);
                 if (IS_ERR(trans)) {
                         err = PTR_ERR(trans);
@@ -2934,6 +3036,7 @@ static long btrfs_fallocate(struct file *file, int mode,
                         ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
                                         cur_offset, last_byte - cur_offset);
                         if (ret < 0) {
+                               cur_offset = last_byte;
                                 free_extent_map(em);
                                 break;
                         }
@@ -3004,7 +3107,7 @@ out:
         /* Let go of our reservation. */
         if (ret != 0)
                 btrfs_free_reserved_data_space(inode, data_reserved,
-                               alloc_start, alloc_end - cur_offset);
+                               cur_offset, alloc_end - cur_offset);
         extent_changeset_free(data_reserved);
         return ret;
  }