]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - fs/btrfs/file.c
Btrfs: fix enospc problems with delalloc
[mirror_ubuntu-bionic-kernel.git] / fs / btrfs / file.c
index e621ea54a3fd64bcf3dae29e4f96e887b6eff95c..6e56a468d1f51f0f595b5227cd8d9d11958f0d32 100644 (file)
 #include "locking.h"
 #include "compat.h"
 
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ */
+struct inode_defrag {
+       struct rb_node rb_node;
+       /* objectid */
+       u64 ino;
+       /*
+        * transid where the defrag was added, we search for
+        * extents newer than this
+        */
+       u64 transid;
+
+       /* root objectid */
+       u64 root;
+
+       /* last offset we were able to defrag */
+       u64 last_offset;
+
+       /* if we've wrapped around back to zero once already */
+       int cycled;
+};
+
+/* pop a record for an inode into the defrag tree.  The lock
+ * must be held already
+ *
+ * If you're inserting a record for an older transid than an
+ * existing record, the transid already in the tree is lowered
+ *
+ * If an existing record is found the defrag item you
+ * pass in is freed
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+                                   struct inode_defrag *defrag)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct inode_defrag *entry;
+       struct rb_node **p;
+       struct rb_node *parent = NULL;
+
+       p = &root->fs_info->defrag_inodes.rb_node;
+       while (*p) {
+               parent = *p;
+               entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+               if (defrag->ino < entry->ino)
+                       p = &parent->rb_left;
+               else if (defrag->ino > entry->ino)
+                       p = &parent->rb_right;
+               else {
+                       /* if we're reinserting an entry for
+                        * an old defrag run, make sure to
+                        * lower the transid of our existing record
+                        */
+                       if (defrag->transid < entry->transid)
+                               entry->transid = defrag->transid;
+                       if (defrag->last_offset > entry->last_offset)
+                               entry->last_offset = defrag->last_offset;
+                       goto exists;
+               }
+       }
+       BTRFS_I(inode)->in_defrag = 1;
+       rb_link_node(&defrag->rb_node, parent, p);
+       rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+       return 0;
+
+exists:
+       kfree(defrag);
+       return 0;
+
+}
+
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+                          struct inode *inode)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct inode_defrag *defrag;
+       int ret = 0;
+       u64 transid;
+
+       if (!btrfs_test_opt(root, AUTO_DEFRAG))
+               return 0;
+
+       if (btrfs_fs_closing(root->fs_info))
+               return 0;
+
+       if (BTRFS_I(inode)->in_defrag)
+               return 0;
+
+       if (trans)
+               transid = trans->transid;
+       else
+               transid = BTRFS_I(inode)->root->last_trans;
+
+       defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+       if (!defrag)
+               return -ENOMEM;
+
+       defrag->ino = btrfs_ino(inode);
+       defrag->transid = transid;
+       defrag->root = root->root_key.objectid;
+
+       spin_lock(&root->fs_info->defrag_inodes_lock);
+       if (!BTRFS_I(inode)->in_defrag)
+               ret = __btrfs_add_inode_defrag(inode, defrag);
+       spin_unlock(&root->fs_info->defrag_inodes_lock);
+       return ret;
+}
+
+/*
+ * must be called with the defrag_inodes lock held
+ */
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+                                            struct rb_node **next)
+{
+       struct inode_defrag *entry = NULL;
+       struct rb_node *p;
+       struct rb_node *parent = NULL;
+
+       p = info->defrag_inodes.rb_node;
+       while (p) {
+               parent = p;
+               entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+               if (ino < entry->ino)
+                       p = parent->rb_left;
+               else if (ino > entry->ino)
+                       p = parent->rb_right;
+               else
+                       return entry;
+       }
+
+       if (next) {
+               while (parent && ino > entry->ino) {
+                       parent = rb_next(parent);
+                       entry = rb_entry(parent, struct inode_defrag, rb_node);
+               }
+               *next = parent;
+       }
+       return NULL;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+       struct inode_defrag *defrag;
+       struct btrfs_root *inode_root;
+       struct inode *inode;
+       struct rb_node *n;
+       struct btrfs_key key;
+       struct btrfs_ioctl_defrag_range_args range;
+       u64 first_ino = 0;
+       int num_defrag;
+       int defrag_batch = 1024;
+
+       memset(&range, 0, sizeof(range));
+       range.len = (u64)-1;
+
+       atomic_inc(&fs_info->defrag_running);
+       spin_lock(&fs_info->defrag_inodes_lock);
+       while(1) {
+               n = NULL;
+
+               /* find an inode to defrag */
+               defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+               if (!defrag) {
+                       if (n)
+                               defrag = rb_entry(n, struct inode_defrag, rb_node);
+                       else if (first_ino) {
+                               first_ino = 0;
+                               continue;
+                       } else {
+                               break;
+                       }
+               }
+
+               /* remove it from the rbtree */
+               first_ino = defrag->ino + 1;
+               rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
+
+               if (btrfs_fs_closing(fs_info))
+                       goto next_free;
+
+               spin_unlock(&fs_info->defrag_inodes_lock);
+
+               /* get the inode */
+               key.objectid = defrag->root;
+               btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+               key.offset = (u64)-1;
+               inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+               if (IS_ERR(inode_root))
+                       goto next;
+
+               key.objectid = defrag->ino;
+               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+               key.offset = 0;
+
+               inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+               if (IS_ERR(inode))
+                       goto next;
+
+               /* do a chunk of defrag */
+               BTRFS_I(inode)->in_defrag = 0;
+               range.start = defrag->last_offset;
+               num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+                                              defrag_batch);
+               /*
+                * if we filled the whole defrag batch, there
+                * must be more work to do.  Queue this defrag
+                * again
+                */
+               if (num_defrag == defrag_batch) {
+                       defrag->last_offset = range.start;
+                       __btrfs_add_inode_defrag(inode, defrag);
+                       /*
+                        * we don't want to kfree defrag, we added it back to
+                        * the rbtree
+                        */
+                       defrag = NULL;
+               } else if (defrag->last_offset && !defrag->cycled) {
+                       /*
+                        * we didn't fill our defrag batch, but
+                        * we didn't start at zero.  Make sure we loop
+                        * around to the start of the file.
+                        */
+                       defrag->last_offset = 0;
+                       defrag->cycled = 1;
+                       __btrfs_add_inode_defrag(inode, defrag);
+                       defrag = NULL;
+               }
+
+               iput(inode);
+next:
+               spin_lock(&fs_info->defrag_inodes_lock);
+next_free:
+               kfree(defrag);
+       }
+       spin_unlock(&fs_info->defrag_inodes_lock);
+
+       atomic_dec(&fs_info->defrag_running);
+
+       /*
+        * during unmount, we use the transaction_wait queue to
+        * wait for the defragger to stop
+        */
+       wake_up(&fs_info->transaction_wait);
+       return 0;
+}
 
 /* simple helper to fault in pages and copy.  This should go away
  * and be replaced with calls into generic code.
@@ -104,7 +361,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
        size_t i;
        for (i = 0; i < num_pages; i++) {
@@ -127,16 +384,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
  * this also makes the decision about creating an inline extent vs
  * doing real data extents, marking pages dirty and delalloc as required.
  */
-static noinline int dirty_and_release_pages(struct btrfs_root *root,
-                                           struct file *file,
-                                           struct page **pages,
-                                           size_t num_pages,
-                                           loff_t pos,
-                                           size_t write_bytes)
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+                     struct page **pages, size_t num_pages,
+                     loff_t pos, size_t write_bytes,
+                     struct extent_state **cached)
 {
        int err = 0;
        int i;
-       struct inode *inode = fdentry(file)->d_inode;
        u64 num_bytes;
        u64 start_pos;
        u64 end_of_last_block;
@@ -149,7 +403,7 @@ static noinline int dirty_and_release_pages(struct btrfs_root *root,
 
        end_of_last_block = start_pos + num_bytes - 1;
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-                                       NULL);
+                                       cached);
        if (err)
                return err;
 
@@ -194,9 +448,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
        }
        while (1) {
                if (!split)
-                       split = alloc_extent_map(GFP_NOFS);
+                       split = alloc_extent_map();
                if (!split2)
-                       split2 = alloc_extent_map(GFP_NOFS);
+                       split2 = alloc_extent_map();
                BUG_ON(!split || !split2);
 
                write_lock(&em_tree->lock);
@@ -301,6 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key new_key;
+       u64 ino = btrfs_ino(inode);
        u64 search_start = start;
        u64 disk_bytenr = 0;
        u64 num_bytes = 0;
@@ -321,14 +576,14 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 
        while (1) {
                recow = 0;
-               ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+               ret = btrfs_lookup_file_extent(trans, root, path, ino,
                                               search_start, -1);
                if (ret < 0)
                        break;
                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
-                       if (key.objectid == inode->i_ino &&
+                       if (key.objectid == ino &&
                            key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
@@ -349,7 +604,7 @@ next_slot:
                }
 
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               if (key.objectid > inode->i_ino ||
+               if (key.objectid > ino ||
                    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
                        break;
 
@@ -379,7 +634,7 @@ next_slot:
 
                search_start = max(key.offset, start);
                if (recow) {
-                       btrfs_release_path(root, path);
+                       btrfs_release_path(path);
                        continue;
                }
 
@@ -396,7 +651,7 @@ next_slot:
                        ret = btrfs_duplicate_item(trans, root, path,
                                                   &new_key);
                        if (ret == -EAGAIN) {
-                               btrfs_release_path(root, path);
+                               btrfs_release_path(path);
                                continue;
                        }
                        if (ret < 0)
@@ -519,7 +774,7 @@ next_slot:
                        del_nr = 0;
                        del_slot = 0;
 
-                       btrfs_release_path(root, path);
+                       btrfs_release_path(path);
                        continue;
                }
 
@@ -595,6 +850,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
        int del_slot = 0;
        int recow;
        int ret;
+       u64 ino = btrfs_ino(inode);
 
        btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
@@ -603,7 +859,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 again:
        recow = 0;
        split = start;
-       key.objectid = inode->i_ino;
+       key.objectid = ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = split;
 
@@ -615,8 +871,7 @@ again:
 
        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-       BUG_ON(key.objectid != inode->i_ino ||
-              key.type != BTRFS_EXTENT_DATA_KEY);
+       BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        BUG_ON(btrfs_file_extent_type(leaf, fi) !=
@@ -633,7 +888,7 @@ again:
                other_start = 0;
                other_end = start;
                if (extent_mergeable(leaf, path->slots[0] - 1,
-                                    inode->i_ino, bytenr, orig_offset,
+                                    ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        new_key.offset = end;
                        btrfs_set_item_key_safe(trans, root, path, &new_key);
@@ -656,7 +911,7 @@ again:
                other_start = end;
                other_end = 0;
                if (extent_mergeable(leaf, path->slots[0] + 1,
-                                    inode->i_ino, bytenr, orig_offset,
+                                    ino, bytenr, orig_offset,
                                     &other_start, &other_end)) {
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
@@ -684,7 +939,7 @@ again:
                new_key.offset = split;
                ret = btrfs_duplicate_item(trans, root, path, &new_key);
                if (ret == -EAGAIN) {
-                       btrfs_release_path(root, path);
+                       btrfs_release_path(path);
                        goto again;
                }
                BUG_ON(ret < 0);
@@ -705,7 +960,7 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
-                                          inode->i_ino, orig_offset);
+                                          ino, orig_offset);
                BUG_ON(ret);
 
                if (split == start) {
@@ -721,10 +976,10 @@ again:
        other_start = end;
        other_end = 0;
        if (extent_mergeable(leaf, path->slots[0] + 1,
-                            inode->i_ino, bytenr, orig_offset,
+                            ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
-                       btrfs_release_path(root, path);
+                       btrfs_release_path(path);
                        goto again;
                }
                extent_end = other_end;
@@ -732,16 +987,16 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       inode->i_ino, orig_offset);
+                                       ino, orig_offset);
                BUG_ON(ret);
        }
        other_start = 0;
        other_end = start;
        if (extent_mergeable(leaf, path->slots[0] - 1,
-                            inode->i_ino, bytenr, orig_offset,
+                            ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
                if (recow) {
-                       btrfs_release_path(root, path);
+                       btrfs_release_path(path);
                        goto again;
                }
                key.offset = other_start;
@@ -749,7 +1004,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       inode->i_ino, orig_offset);
+                                       ino, orig_offset);
                BUG_ON(ret);
        }
        if (del_nr == 0) {
@@ -826,7 +1081,8 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 
 again:
        for (i = 0; i < num_pages; i++) {
-               pages[i] = grab_cache_page(inode->i_mapping, index + i);
+               pages[i] = find_or_create_page(inode->i_mapping, index + i,
+                                              GFP_NOFS);
                if (!pages[i]) {
                        faili = i - 1;
                        err = -ENOMEM;
@@ -983,18 +1239,20 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                 * managed to copy.
                 */
                if (num_pages > dirty_pages) {
-                       if (copied > 0)
-                               atomic_inc(
-                                       &BTRFS_I(inode)->outstanding_extents);
+                       if (copied > 0) {
+                               spin_lock(&BTRFS_I(inode)->lock);
+                               BTRFS_I(inode)->outstanding_extents++;
+                               spin_unlock(&BTRFS_I(inode)->lock);
+                       }
                        btrfs_delalloc_release_space(inode,
                                        (num_pages - dirty_pages) <<
                                        PAGE_CACHE_SHIFT);
                }
 
                if (copied > 0) {
-                       ret = dirty_and_release_pages(root, file, pages,
-                                                     dirty_pages, pos,
-                                                     copied);
+                       ret = btrfs_dirty_pages(root, inode, pages,
+                                               dirty_pages, pos, copied,
+                                               NULL);
                        if (ret) {
                                btrfs_delalloc_release_space(inode,
                                        dirty_pages << PAGE_CACHE_SHIFT);
@@ -1225,14 +1483,12 @@ int btrfs_sync_file(struct file *file, int datasync)
         * the current transaction, we can bail out now without any
         * syncing
         */
-       mutex_lock(&root->fs_info->trans_mutex);
+       smp_mb();
        if (BTRFS_I(inode)->last_trans <=
            root->fs_info->last_trans_committed) {
                BTRFS_I(inode)->last_trans = 0;
-               mutex_unlock(&root->fs_info->trans_mutex);
                goto out;
        }
-       mutex_unlock(&root->fs_info->trans_mutex);
 
        /*
         * ok we haven't committed the transaction yet, lets do a commit
@@ -1378,7 +1634,7 @@ static long btrfs_fallocate(struct file *file, int mode,
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
-               BUG_ON(IS_ERR(em) || !em);
+               BUG_ON(IS_ERR_OR_NULL(em));
                last_byte = min(extent_map_end(em), alloc_end);
                last_byte = (last_byte + mask) & ~mask;
                if (em->block_start == EXTENT_MAP_HOLE ||