]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/commitdiff
Merge branch 'for-chris' of git://github.com/sensille/linux into integration
authorChris Mason <chris.mason@oracle.com>
Sun, 6 Nov 2011 08:05:08 +0000 (03:05 -0500)
committerChris Mason <chris.mason@oracle.com>
Sun, 6 Nov 2011 08:05:08 +0000 (03:05 -0500)
Conflicts:
fs/btrfs/ctree.h

Signed-off-by: Chris Mason <chris.mason@oracle.com>
1  2 
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/scrub.c
fs/btrfs/volumes.c

diff --combined fs/btrfs/ctree.h
index 6bb34fc1ff226eeba1f0f791867020ce9c9a36ae,370af767440d664133c07d6667a94635a3449412..b9ba59ff9292559330745b8afb41eefacfbf1615
@@@ -30,7 -30,6 +30,7 @@@
  #include <linux/kobject.h>
  #include <trace/events/btrfs.h>
  #include <asm/kmap_types.h>
 +#include <linux/pagemap.h>
  #include "extent_io.h"
  #include "extent_map.h"
  #include "async-thread.h"
@@@ -360,47 -359,6 +360,47 @@@ struct btrfs_header 
  #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
  #define BTRFS_LABEL_SIZE 256
  
 +/*
 + * just in case we somehow lose the roots and are not able to mount,
 + * we store an array of the roots from previous transactions
 + * in the super.
 + *
 + * The array has BTRFS_NUM_BACKUP_ROOTS slots.  For each tree covered
 + * (tree, chunk, extent, fs, dev and csum root) a slot keeps the root
 + * block's start, its header generation and its header level, plus the
 + * super block's total_bytes, bytes_used and num_devices at the time the
 + * slot was written.
 + */
 +#define BTRFS_NUM_BACKUP_ROOTS 4
 +struct btrfs_root_backup {
 +      __le64 tree_root;
 +      __le64 tree_root_gen;
 +
 +      __le64 chunk_root;
 +      __le64 chunk_root_gen;
 +
 +      __le64 extent_root;
 +      __le64 extent_root_gen;
 +
 +      __le64 fs_root;
 +      __le64 fs_root_gen;
 +
 +      __le64 dev_root;
 +      __le64 dev_root_gen;
 +
 +      __le64 csum_root;
 +      __le64 csum_root_gen;
 +
 +      __le64 total_bytes;
 +      __le64 bytes_used;
 +      __le64 num_devices;
 +      /* reserved for future use */
 +      __le64 unsed_64[4];
 +
 +      u8 tree_root_level;
 +      u8 chunk_root_level;
 +      u8 extent_root_level;
 +      u8 fs_root_level;
 +      u8 dev_root_level;
 +      u8 csum_root_level;
 +      /* reserved for future use, and padding to align the struct */
 +      u8 unused_8[10];
 +} __attribute__ ((__packed__));
 +
  /*
   * the super block basically lists the main trees of the FS
   * it currently lacks any block count etc etc
@@@ -447,7 -405,6 +447,7 @@@ struct btrfs_super_block 
        /* future expansion */
        __le64 reserved[31];
        u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 +      struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
  } __attribute__ ((__packed__));
  
  /*
@@@ -815,8 -772,14 +815,8 @@@ struct btrfs_space_info 
  struct btrfs_block_rsv {
        u64 size;
        u64 reserved;
 -      u64 freed[2];
        struct btrfs_space_info *space_info;
 -      struct list_head list;
        spinlock_t lock;
 -      atomic_t usage;
 -      unsigned int priority:8;
 -      unsigned int durable:1;
 -      unsigned int refill_used:1;
        unsigned int full:1;
  };
  
@@@ -877,10 -840,10 +877,10 @@@ struct btrfs_block_group_cache 
        spinlock_t lock;
        u64 pinned;
        u64 reserved;
 -      u64 reserved_pinned;
        u64 bytes_super;
        u64 flags;
        u64 sectorsize;
 +      u64 cache_generation;
        unsigned int ro:1;
        unsigned int dirty:1;
        unsigned int iref:1;
@@@ -936,10 -899,6 +936,10 @@@ struct btrfs_fs_info 
        spinlock_t block_group_cache_lock;
        struct rb_root block_group_cache_tree;
  
 +      /* keep track of unallocated space */
 +      spinlock_t free_chunk_lock;
 +      u64 free_chunk_space;
 +
        struct extent_io_tree freed_extents[2];
        struct extent_io_tree *pinned_extents;
  
        struct btrfs_block_rsv trans_block_rsv;
        /* block reservation for chunk tree */
        struct btrfs_block_rsv chunk_block_rsv;
 +      /* block reservation for delayed operations */
 +      struct btrfs_block_rsv delayed_block_rsv;
  
        struct btrfs_block_rsv empty_block_rsv;
  
 -      /* list of block reservations that cross multiple transactions */
 -      struct list_head durable_block_rsv_list;
 -
 -      struct mutex durable_block_rsv_mutex;
 -
        u64 generation;
        u64 last_trans_committed;
  
        wait_queue_head_t transaction_blocked_wait;
        wait_queue_head_t async_submit_wait;
  
 -      struct btrfs_super_block super_copy;
 -      struct btrfs_super_block super_for_commit;
 +      struct btrfs_super_block *super_copy;
 +      struct btrfs_super_block *super_for_commit;
        struct block_device *__bdev;
        struct super_block *sb;
        struct inode *btree_inode;
        struct btrfs_workers endio_freespace_worker;
        struct btrfs_workers submit_workers;
        struct btrfs_workers caching_workers;
+       struct btrfs_workers readahead_workers;
  
        /*
         * fixup workers take dirty pages that didn't properly go through
  
        struct btrfs_delayed_root *delayed_root;
  
+       /* readahead tree */
+       spinlock_t reada_lock;
+       struct radix_tree_root reada_tree;
++
 +      /* next backup root to be overwritten */
 +      int backup_root_index;
  };
  
  /*
@@@ -1404,7 -1368,6 +1409,7 @@@ struct btrfs_ioctl_defrag_range_args 
  #define BTRFS_MOUNT_ENOSPC_DEBUG       (1 << 15)
  #define BTRFS_MOUNT_AUTO_DEFRAG               (1 << 16)
  #define BTRFS_MOUNT_INODE_MAP_CACHE   (1 << 17)
 +#define BTRFS_MOUNT_RECOVERY          (1 << 18)
  
  #define btrfs_clear_opt(o, opt)               ((o) &= ~BTRFS_MOUNT_##opt)
  #define btrfs_set_opt(o, opt)         ((o) |= BTRFS_MOUNT_##opt)
@@@ -2020,55 -1983,6 +2025,55 @@@ static inline bool btrfs_root_readonly(
        return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
  }
  
 +/* struct btrfs_root_backup */
 +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
 +                 tree_root, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
 +                 tree_root_gen, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
 +                 tree_root_level, 8);
 +
 +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
 +                 chunk_root, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
 +                 chunk_root_gen, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
 +                 chunk_root_level, 8);
 +
 +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
 +                 extent_root, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
 +                 extent_root_gen, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
 +                 extent_root_level, 8);
 +
 +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
 +                 fs_root, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
 +                 fs_root_gen, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
 +                 fs_root_level, 8);
 +
 +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
 +                 dev_root, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
 +                 dev_root_gen, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
 +                 dev_root_level, 8);
 +
 +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
 +                 csum_root, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
 +                 csum_root_gen, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
 +                 csum_root_level, 8);
 +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
 +                 total_bytes, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
 +                 bytes_used, 64);
 +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
 +                 num_devices, 64);
 +
  /* struct btrfs_super_block */
  
  BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@@ -2220,11 -2134,6 +2225,11 @@@ static inline bool btrfs_mixed_space_in
                (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
  }
  
 +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 +{
 +      return mapping_gfp_mask(mapping) & ~__GFP_FS;   /* the mapping's gfp mask with __GFP_FS cleared */
 +}
 +
  /* extent-tree.c */
  static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
                                                 unsigned num_items)
                3 * num_items;
  }
  
 +/*
 + * Doing a truncate won't result in new nodes or leaves, just what we need for
 + * COW.
 + *
 + * Returns the number of bytes to reserve for num_items items: one leaf
 + * (root->leafsize) plus BTRFS_MAX_LEVEL - 1 nodes (root->nodesize each)
 + * per item.
 + */
 +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 +                                               unsigned num_items)
 +{
 +      return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
 +              num_items;
 +}
 +
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
@@@ -2253,9 -2151,6 +2258,9 @@@ int btrfs_lookup_extent_info(struct btr
                             u64 num_bytes, u64 *refs, u64 *flags);
  int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num, int reserved);
 +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
 +                                  struct btrfs_root *root,
 +                                  u64 bytenr, u64 num_bytes);
  int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          u64 objectid, u64 offset, u64 bytenr);
@@@ -2306,8 -2201,8 +2311,8 @@@ int btrfs_free_extent(struct btrfs_tran
                      u64 root_objectid, u64 owner, u64 offset);
  
  int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 -                              u64 num_bytes, int reserve, int sinfo);
 +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 +                                     u64 start, u64 len);
  int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@@ -2350,23 -2245,25 +2355,23 @@@ void btrfs_init_block_rsv(struct btrfs_
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
  void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv);
 -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 -                               struct btrfs_block_rsv *rsv);
 -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *root,
 +int btrfs_block_rsv_add(struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv,
                        u64 num_bytes);
 -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 -                        struct btrfs_root *root,
 +int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
 +                              struct btrfs_block_rsv *block_rsv,
 +                              u64 num_bytes);
 +int btrfs_block_rsv_check(struct btrfs_root *root,
 +                        struct btrfs_block_rsv *block_rsv, int min_factor);
 +int btrfs_block_rsv_refill(struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv,
 -                        u64 min_reserved, int min_factor);
 +                        u64 min_reserved);
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes);
  void btrfs_block_rsv_release(struct btrfs_root *root,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
 -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
 -                                  struct btrfs_root *root,
 -                                  struct btrfs_block_rsv *rsv);
  int btrfs_set_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
  int btrfs_set_block_group_rw(struct btrfs_root *root,
@@@ -2487,18 -2384,6 +2492,18 @@@ static inline int btrfs_fs_closing(stru
        smp_mb();
        return fs_info->closing;
  }
 +static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 +{
 +      kfree(fs_info->delayed_root);   /* each member below is kfree()d on its own before fs_info */
 +      kfree(fs_info->extent_root);
 +      kfree(fs_info->tree_root);
 +      kfree(fs_info->chunk_root);
 +      kfree(fs_info->dev_root);
 +      kfree(fs_info->csum_root);
 +      kfree(fs_info->super_copy);
 +      kfree(fs_info->super_for_commit);
 +      kfree(fs_info); /* last: every pointer above is read out of fs_info */
 +}
  
  /* root-item.c */
  int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@@ -2699,6 -2584,11 +2704,6 @@@ int btrfs_update_inode(struct btrfs_tra
  int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
  int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
  int btrfs_orphan_cleanup(struct btrfs_root *root);
 -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
 -                              struct btrfs_pending_snapshot *pending,
 -                              u64 *bytes_to_reserve);
 -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
 -                              struct btrfs_pending_snapshot *pending);
  void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root);
  int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@@ -2812,4 -2702,20 +2817,20 @@@ int btrfs_scrub_cancel_devid(struct btr
  int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
                         struct btrfs_scrub_progress *progress);
  
+ /* reada.c */
+ struct reada_control {
+       struct btrfs_root       *root;          /* tree to prefetch */
+       struct btrfs_key        key_start;      /* start of the key range to prefetch */
+       struct btrfs_key        key_end;        /* exclusive */
+       atomic_t                elems;          /* NOTE(review): presumably a count of outstanding elements -- confirm in reada.c */
+       struct kref             refcnt;
+       wait_queue_head_t       wait;           /* NOTE(review): presumably what btrfs_reada_wait() sleeps on -- confirm in reada.c */
+ };
+ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+                             struct btrfs_key *start, struct btrfs_key *end);
+ int btrfs_reada_wait(void *handle);
+ void btrfs_reada_detach(void *handle);
+ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+                        u64 start, int err);
  #endif
diff --combined fs/btrfs/disk-io.c
index 23b6776477b73ed04adc180c77028a387db5d177,2151828aa1423fb6551129776e04907f2b875282..cedfbfb278eb6c7d7edb602d1ea42b1cd23b4efa
@@@ -256,7 -256,8 +256,7 @@@ void btrfs_csum_final(u32 crc, char *re
  static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
                           int verify)
  {
 -      u16 csum_size =
 -              btrfs_super_csum_size(&root->fs_info->super_copy);
 +      u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
        char *result = NULL;
        unsigned long len;
        unsigned long cur_len;
@@@ -366,7 -367,8 +366,8 @@@ static int btree_read_extent_buffer_pag
        clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
        while (1) {
-               ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+               ret = read_extent_buffer_pages(io_tree, eb, start,
+                                              WAIT_COMPLETE,
                                               btree_get_extent, mirror_num);
                if (!ret &&
                    !verify_parent_transid(io_tree, eb, parent_transid))
@@@ -607,11 -609,47 +608,47 @@@ static int btree_readpage_end_io_hook(s
        end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
        end = eb->start + end - 1;
  err:
+       if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+               clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+               btree_readahead_hook(root, eb, eb->start, ret);
+       }
        free_extent_buffer(eb);
  out:
        return ret;
  }
  
+ /*
+  * Failure hook for I/O on a btree page.
+  *
+  * If the page maps to an extent buffer that carries the
+  * EXTENT_BUFFER_READAHEAD flag, the flag is cleared and the failure is
+  * reported to btree_readahead_hook() with -EIO.
+  *
+  * Always returns -EIO: nothing is repaired here.
+  */
+ static int btree_io_failed_hook(struct bio *failed_bio,
+                        struct page *page, u64 start, u64 end,
+                        struct extent_state *state)
+ {
+       struct extent_io_tree *tree;
+       unsigned long len;
+       struct extent_buffer *eb;
+       struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+       tree = &BTRFS_I(page->mapping->host)->io_tree;
+       /*
+        * page->private >> 2 is used as the extent buffer length below;
+        * bail out when private does not hold such a value
+        */
+       if (page->private == EXTENT_PAGE_PRIVATE)
+               goto out;
+       if (!page->private)
+               goto out;
+       len = page->private >> 2;
+       WARN_ON(len == 0);
+       eb = alloc_extent_buffer(tree, start, len, page);
+       if (eb == NULL)
+               goto out;
+       if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+               clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+               btree_readahead_hook(root, eb, eb->start, -EIO);
+       }
+       /*
+        * drop the reference obtained from alloc_extent_buffer(), as
+        * btree_readpage_end_io_hook() does after its readahead hook call
+        */
+       free_extent_buffer(eb);
+ out:
+       return -EIO;    /* we fixed nothing */
+ }
  static void end_workqueue_bio(struct bio *bio, int err)
  {
        struct end_io_wq *end_io_wq = bio->bi_private;
@@@ -973,11 -1011,43 +1010,43 @@@ int readahead_tree_block(struct btrfs_r
        if (!buf)
                return 0;
        read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-                                buf, 0, 0, btree_get_extent, 0);
+                                buf, 0, WAIT_NONE, btree_get_extent, 0);
        free_extent_buffer(buf);
        return ret;
  }
  
+ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+                        int mirror_num, struct extent_buffer **eb)
+ {
+       struct extent_buffer *buf = NULL;
+       struct inode *btree_inode = root->fs_info->btree_inode;
+       struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+       int ret;
+       buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+       if (!buf)
+               return 0;       /* *eb is not written on this path */
+       set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);        /* flag tested by the btree read hooks before calling btree_readahead_hook() */
+       ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+                                      btree_get_extent, mirror_num);
+       if (ret) {
+               free_extent_buffer(buf);
+               return ret;
+       }
+       if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+               free_extent_buffer(buf);
+               return -EIO;
+       } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+               *eb = buf;      /* buffer is handed to the caller without being freed here */
+       } else {
+               free_extent_buffer(buf);        /* not up to date: *eb stays unwritten, 0 is still returned */
+       }
+       return 0;
+ }
  struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize)
  {
@@@ -1134,12 -1204,10 +1203,12 @@@ static int find_and_setup_root(struct b
  
        generation = btrfs_root_generation(&root->root_item);
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 +      root->commit_root = NULL;
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
        if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
                free_extent_buffer(root->node);
 +              root->node = NULL;
                return -EIO;
        }
        root->commit_root = btrfs_root_node(root);
@@@ -1578,228 -1646,6 +1647,228 @@@ sleep
        return 0;
  }
  
 +/*
 + * this will find the newest entry in the array of root backups,
 + * i.e. the one whose tree root generation equals newest_gen.
 + * The index of that entry is returned, or -1 if we can't find anything.
 + *
 + * We check to make sure the array is valid by comparing the
 + * generation of the latest root in the array with the generation
 + * in the super block.  If they don't match we pitch it.
 + */
 +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
 +{
 +      u64 cur;
 +      int newest_index = -1;
 +      struct btrfs_root_backup *root_backup;
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
 +              root_backup = info->super_copy->super_roots + i;
 +              cur = btrfs_backup_tree_root_gen(root_backup);
 +              if (cur == newest_gen)
 +                      newest_index = i;       /* no break: the last matching slot wins */
 +      }
 +
 +      /*
 +       * check to see if we actually wrapped around: if the match is the
 +       * last slot and slot 0 carries the same generation, report slot 0
 +       */
 +      if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
 +              root_backup = info->super_copy->super_roots;
 +              cur = btrfs_backup_tree_root_gen(root_backup);
 +              if (cur == newest_gen)
 +                      newest_index = 0;
 +      }
 +      return newest_index;
 +}
 +
 +
 +/*
 + * find the oldest backup so we know where to store new entries
 + * in the backup array.  This will set the backup_root_index
 + * field in the fs_info struct to the slot right after the newest
 + * backup (wrapping around at BTRFS_NUM_BACKUP_ROOTS), or to slot 0
 + * when no backup matches newest_gen.
 + */
 +static void find_oldest_super_backup(struct btrfs_fs_info *info,
 +                                   u64 newest_gen)
 +{
 +      int newest_index = -1;
 +
 +      newest_index = find_newest_super_backup(info, newest_gen);
 +      /* if there was garbage in there, just move along */
 +      if (newest_index == -1) {
 +              info->backup_root_index = 0;
 +      } else {
 +              info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
 +      }
 +}
 +
 +/*
 + * copy all the root pointers into the super backup array.
 + * this will bump the backup pointer by one when it is
 + * done
 + *
 + * The slot is written in super_for_commit; the whole array is then
 + * copied into super_copy as well.
 + */
 +static void backup_super_roots(struct btrfs_fs_info *info)
 +{
 +      int next_backup;
 +      struct btrfs_root_backup *root_backup;
 +      int last_backup;
 +
 +      next_backup = info->backup_root_index;
 +      last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
 +              BTRFS_NUM_BACKUP_ROOTS; /* slot just before next_backup, with wrap-around */
 +
 +      /*
 +       * just overwrite the last backup if we're at the same generation
 +       * this happens only at umount
 +       */
 +      root_backup = info->super_for_commit->super_roots + last_backup;
 +      if (btrfs_backup_tree_root_gen(root_backup) ==
 +          btrfs_header_generation(info->tree_root->node))
 +              next_backup = last_backup;
 +
 +      root_backup = info->super_for_commit->super_roots + next_backup;
 +
 +      /*
 +       * make sure all of our padding and empty slots get zero filled
 +       * regardless of which ones we use today
 +       */
 +      memset(root_backup, 0, sizeof(*root_backup));
 +
 +      info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
 +
 +      btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
 +      btrfs_set_backup_tree_root_gen(root_backup,
 +                             btrfs_header_generation(info->tree_root->node));
 +
 +      btrfs_set_backup_tree_root_level(root_backup,
 +                             btrfs_header_level(info->tree_root->node));
 +
 +      btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
 +      btrfs_set_backup_chunk_root_gen(root_backup,
 +                             btrfs_header_generation(info->chunk_root->node));
 +      btrfs_set_backup_chunk_root_level(root_backup,
 +                             btrfs_header_level(info->chunk_root->node));
 +
 +      btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
 +      btrfs_set_backup_extent_root_gen(root_backup,
 +                             btrfs_header_generation(info->extent_root->node));
 +      btrfs_set_backup_extent_root_level(root_backup,
 +                             btrfs_header_level(info->extent_root->node));
 +
 +      btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
 +      btrfs_set_backup_fs_root_gen(root_backup,
 +                             btrfs_header_generation(info->fs_root->node));
 +      btrfs_set_backup_fs_root_level(root_backup,
 +                             btrfs_header_level(info->fs_root->node));
 +
 +      btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
 +      btrfs_set_backup_dev_root_gen(root_backup,
 +                             btrfs_header_generation(info->dev_root->node));
 +      btrfs_set_backup_dev_root_level(root_backup,
 +                                     btrfs_header_level(info->dev_root->node));
 +
 +      btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
 +      btrfs_set_backup_csum_root_gen(root_backup,
 +                             btrfs_header_generation(info->csum_root->node));
 +      btrfs_set_backup_csum_root_level(root_backup,
 +                             btrfs_header_level(info->csum_root->node));
 +
 +      btrfs_set_backup_total_bytes(root_backup,
 +                           btrfs_super_total_bytes(info->super_copy));
 +      btrfs_set_backup_bytes_used(root_backup,
 +                           btrfs_super_bytes_used(info->super_copy));
 +      btrfs_set_backup_num_devices(root_backup,
 +                           btrfs_super_num_devices(info->super_copy));
 +
 +      /*
 +       * if we don't copy this out to the super_copy, it won't get remembered
 +       * for the next commit
 +       */
 +      memcpy(&info->super_copy->super_roots,
 +             &info->super_for_commit->super_roots,
 +             sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
 +}
 +
 +/*
 + * this copies info out of the root backup array and back into
 + * the in-memory super block.  It is meant to help iterate through
 + * the array, so you send it the number of backups you've already
 + * tried and the last backup index you used.  Both values are
 + * updated in place for the next call.
 + *
 + * On the first call (*num_backups_tried == 0) the backup matching the
 + * super block's generation is used; each later call steps back one slot.
 + *
 + * this returns -1 when it has tried all the backups, or when the first
 + * call finds no backup matching the super's generation.  Otherwise the
 + * super is updated from the chosen backup and 0 is returned.
 + */
 +static noinline int next_root_backup(struct btrfs_fs_info *info,
 +                                   struct btrfs_super_block *super,
 +                                   int *num_backups_tried, int *backup_index)
 +{
 +      struct btrfs_root_backup *root_backup;
 +      int newest = *backup_index;
 +
 +      if (*num_backups_tried == 0) {
 +              u64 gen = btrfs_super_generation(super);
 +
 +              newest = find_newest_super_backup(info, gen);
 +              if (newest == -1)
 +                      return -1;
 +
 +              *backup_index = newest;
 +              *num_backups_tried = 1;
 +      } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
 +              /* we've tried all the backups, all done */
 +              return -1;
 +      } else {
 +              /* jump to the next oldest backup */
 +              newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
 +                      BTRFS_NUM_BACKUP_ROOTS;
 +              *backup_index = newest;
 +              *num_backups_tried += 1;
 +      }
 +      root_backup = super->super_roots + newest;
 +
 +      btrfs_set_super_generation(super,
 +                                 btrfs_backup_tree_root_gen(root_backup));
 +      btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
 +      btrfs_set_super_root_level(super,
 +                                 btrfs_backup_tree_root_level(root_backup));
 +      btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
 +
 +      /*
 +       * fixme: the total bytes and num_devices need to match or we should
 +       * need a fsck
 +       */
 +      btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
 +      btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
 +      return 0;
 +}
 +
 +/*
 + * helper to cleanup tree roots
 + *
 + * Frees the node and commit_root extent buffers of the tree, dev,
 + * extent and csum roots and resets those pointers to NULL.  The chunk
 + * root gets the same treatment only when chunk_root is non-zero.
 + */
 +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 +{
 +      free_extent_buffer(info->tree_root->node);
 +      free_extent_buffer(info->tree_root->commit_root);
 +      free_extent_buffer(info->dev_root->node);
 +      free_extent_buffer(info->dev_root->commit_root);
 +      free_extent_buffer(info->extent_root->node);
 +      free_extent_buffer(info->extent_root->commit_root);
 +      free_extent_buffer(info->csum_root->node);
 +      free_extent_buffer(info->csum_root->commit_root);
 +
 +      info->tree_root->node = NULL;
 +      info->tree_root->commit_root = NULL;
 +      info->dev_root->node = NULL;
 +      info->dev_root->commit_root = NULL;
 +      info->extent_root->node = NULL;
 +      info->extent_root->commit_root = NULL;
 +      info->csum_root->node = NULL;
 +      info->csum_root->commit_root = NULL;
 +
 +      if (chunk_root) {
 +              free_extent_buffer(info->chunk_root->node);
 +              free_extent_buffer(info->chunk_root->commit_root);
 +              info->chunk_root->node = NULL;
 +              info->chunk_root->commit_root = NULL;
 +      }
 +}
 +
 +
  struct btrfs_root *open_ctree(struct super_block *sb,
                              struct btrfs_fs_devices *fs_devices,
                              char *options)
  
        int ret;
        int err = -EINVAL;
 +      int num_backups_tried = 0;
 +      int backup_index = 0;
  
        struct btrfs_super_block *disk_super;
  
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
 +      spin_lock_init(&fs_info->free_chunk_lock);
        mutex_init(&fs_info->reloc_mutex);
  
        init_completion(&fs_info->kobj_unregister);
        btrfs_init_block_rsv(&fs_info->trans_block_rsv);
        btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
        btrfs_init_block_rsv(&fs_info->empty_block_rsv);
 -      INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
 -      mutex_init(&fs_info->durable_block_rsv_mutex);
 +      btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
        atomic_set(&fs_info->nr_async_submits, 0);
        atomic_set(&fs_info->async_delalloc_pages, 0);
        atomic_set(&fs_info->async_submit_draining, 0);
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
        fs_info->trans_no_join = 0;
 +      fs_info->free_chunk_space = 0;
  
+       /* readahead state */
+       INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+       spin_lock_init(&fs_info->reada_lock);
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
  
                goto fail_alloc;
        }
  
 -      memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
 -      memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
 -             sizeof(fs_info->super_for_commit));
 +      memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
 +      memcpy(fs_info->super_for_commit, fs_info->super_copy,
 +             sizeof(*fs_info->super_for_commit));
        brelse(bh);
  
 -      memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
 +      memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
  
 -      disk_super = &fs_info->super_copy;
 +      disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                goto fail_alloc;
  
  
        btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
  
 +      /*
 +       * run through our array of backup supers and setup
 +       * our ring pointer to the oldest one
 +       */
 +      generation = btrfs_super_generation(disk_super);
 +      find_oldest_super_backup(fs_info, generation);
 +
        /*
         * In the long term, we'll store the compression type in the super
         * block, and it'll be used for per file compression control.
        btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
+       btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
  
        /*
         * endios are largely parallel and should have a very
  
        fs_info->endio_write_workers.idle_thresh = 2;
        fs_info->endio_meta_write_workers.idle_thresh = 2;
+       fs_info->readahead_workers.idle_thresh = 2;
  
        btrfs_start_workers(&fs_info->workers, 1);
        btrfs_start_workers(&fs_info->generic_worker, 1);
        btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
        btrfs_start_workers(&fs_info->delayed_workers, 1);
        btrfs_start_workers(&fs_info->caching_workers, 1);
+       btrfs_start_workers(&fs_info->readahead_workers, 1);
  
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
        if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
                printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
                       sb->s_id);
 -              goto fail_chunk_root;
 +              goto fail_tree_roots;
        }
        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
        chunk_root->commit_root = btrfs_root_node(chunk_root);
        if (ret) {
                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
                       sb->s_id);
 -              goto fail_chunk_root;
 +              goto fail_tree_roots;
        }
  
        btrfs_close_extra_devices(fs_devices);
  
 +retry_root_backup:
        blocksize = btrfs_level_size(tree_root,
                                     btrfs_super_root_level(disk_super));
        generation = btrfs_super_generation(disk_super);
        tree_root->node = read_tree_block(tree_root,
                                          btrfs_super_root(disk_super),
                                          blocksize, generation);
 -      if (!tree_root->node)
 -              goto fail_chunk_root;
 -      if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 +      if (!tree_root->node ||
 +          !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
                printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
                       sb->s_id);
 -              goto fail_tree_root;
 +
 +              goto recovery_tree_root;
        }
 +
        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
        tree_root->commit_root = btrfs_root_node(tree_root);
  
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
        if (ret)
 -              goto fail_tree_root;
 +              goto recovery_tree_root;
        extent_root->track_dirty = 1;
  
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
        if (ret)
 -              goto fail_extent_root;
 +              goto recovery_tree_root;
        dev_root->track_dirty = 1;
  
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_CSUM_TREE_OBJECTID, csum_root);
        if (ret)
 -              goto fail_dev_root;
 +              goto recovery_tree_root;
  
        csum_root->track_dirty = 1;
  
@@@ -2359,10 -2202,20 +2437,10 @@@ fail_cleaner
  
  fail_block_groups:
        btrfs_free_block_groups(fs_info);
 -      free_extent_buffer(csum_root->node);
 -      free_extent_buffer(csum_root->commit_root);
 -fail_dev_root:
 -      free_extent_buffer(dev_root->node);
 -      free_extent_buffer(dev_root->commit_root);
 -fail_extent_root:
 -      free_extent_buffer(extent_root->node);
 -      free_extent_buffer(extent_root->commit_root);
 -fail_tree_root:
 -      free_extent_buffer(tree_root->node);
 -      free_extent_buffer(tree_root->commit_root);
 -fail_chunk_root:
 -      free_extent_buffer(chunk_root->node);
 -      free_extent_buffer(chunk_root->commit_root);
 +
 +fail_tree_roots:
 +      free_root_pointers(fs_info, 1);
 +
  fail_sb_buffer:
        btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
  fail_alloc:
 -      kfree(fs_info->delayed_root);
  fail_iput:
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        iput(fs_info->btree_inode);
@@@ -2388,27 -2242,13 +2466,27 @@@ fail_bdi
  fail_srcu:
        cleanup_srcu_struct(&fs_info->subvol_srcu);
  fail:
 -      kfree(extent_root);
 -      kfree(tree_root);
 -      kfree(fs_info);
 -      kfree(chunk_root);
 -      kfree(dev_root);
 -      kfree(csum_root);
 +      free_fs_info(fs_info);
        return ERR_PTR(err);
 +
 +recovery_tree_root:
 +
 +      if (!btrfs_test_opt(tree_root, RECOVERY))
 +              goto fail_tree_roots;
 +
 +      free_root_pointers(fs_info, 0);
 +
 +      /* don't use the log in recovery mode, it won't be valid */
 +      btrfs_set_super_log_root(disk_super, 0);
 +
 +      /* we can't trust the free space cache either */
 +      btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
 +
 +      ret = next_root_backup(fs_info, fs_info->super_copy,
 +                             &num_backups_tried, &backup_index);
 +      if (ret == -1)
 +              goto fail_block_groups;
 +      goto retry_root_backup;
  }
  
  static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@@ -2576,11 -2416,10 +2654,11 @@@ int write_all_supers(struct btrfs_root 
        int total_errors = 0;
        u64 flags;
  
 -      max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 +      max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
        do_barriers = !btrfs_test_opt(root, NOBARRIER);
 +      backup_super_roots(root->fs_info);
  
 -      sb = &root->fs_info->super_for_commit;
 +      sb = root->fs_info->super_for_commit;
        dev_item = &sb->dev_item;
  
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@@ -2784,6 -2623,8 +2862,6 @@@ int close_ctree(struct btrfs_root *root
        /* clear out the rbtree of defraggable inodes */
        btrfs_run_defrag_inodes(root->fs_info);
  
 -      btrfs_put_block_group_cache(fs_info);
 -
        /*
         * Here come 2 situations when btrfs is broken to flip readonly:
         *
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
  
 +      btrfs_put_block_group_cache(fs_info);
 +
        kthread_stop(root->fs_info->transaction_kthread);
        kthread_stop(root->fs_info->cleaner_kthread);
  
        del_fs_roots(fs_info);
  
        iput(fs_info->btree_inode);
 -      kfree(fs_info->delayed_root);
  
        btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->submit_workers);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
+       btrfs_stop_workers(&fs_info->readahead_workers);
  
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
  
 -      kfree(fs_info->extent_root);
 -      kfree(fs_info->tree_root);
 -      kfree(fs_info->chunk_root);
 -      kfree(fs_info->dev_root);
 -      kfree(fs_info->csum_root);
 -      kfree(fs_info);
 +      free_fs_info(fs_info);
  
        return 0;
  }
@@@ -2968,8 -2814,7 +3047,8 @@@ int btrfs_read_buffer(struct extent_buf
        return ret;
  }
  
 -int btree_lock_page_hook(struct page *page)
 +static int btree_lock_page_hook(struct page *page, void *data,
 +                              void (*flush_fn)(void *))
  {
        struct inode *inode = page->mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        if (!eb)
                goto out;
  
 -      btrfs_tree_lock(eb);
 +      if (!btrfs_try_tree_write_lock(eb)) {
 +              flush_fn(data);
 +              btrfs_tree_lock(eb);
 +      }
        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
  
        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
  out:
 -      lock_page(page);
 +      if (!trylock_page(page)) {
 +              flush_fn(data);
 +              lock_page(page);
 +      }
        return 0;
  }
  
@@@ -3363,6 -3202,7 +3442,7 @@@ static int btrfs_cleanup_transaction(st
  static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
+       .readpage_io_failed_hook = btree_io_failed_hook,
        .submit_bio_hook = btree_submit_bio_hook,
        /* note we're sharing with inode.c for the merge bio hook */
        .merge_bio_hook = btrfs_merge_bio_hook,
diff --combined fs/btrfs/disk-io.h
index e678539c8519cf4eb081569225de648152d6a712,b3bdb5c1390f9011127f26595848a684eecf539f..c99d0a8f13fa2f38642f7826cb00c45070ac95ae
@@@ -40,6 -40,8 +40,8 @@@ struct extent_buffer *read_tree_block(s
                                      u32 blocksize, u64 parent_transid);
  int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
                         u64 parent_transid);
+ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+                        int mirror_num, struct extent_buffer **eb);
  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
                                                   u64 bytenr, u32 blocksize);
  int clean_tree_block(struct btrfs_trans_handle *trans,
@@@ -83,6 -85,8 +85,6 @@@ int btrfs_init_log_root_tree(struct btr
                             struct btrfs_fs_info *fs_info);
  int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root);
 -int btree_lock_page_hook(struct page *page);
 -
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  void btrfs_init_lockdep(void);
diff --combined fs/btrfs/extent_io.c
index cc3c58970d4e1fab554e591cb9735dca1d583e73,deba714236d7827f58a72e62e9d5399607fbabe3..c12705682c6543a4b180ccc32521cc4011a20dc4
@@@ -894,194 -894,6 +894,194 @@@ search_again
        goto again;
  }
  
 +/**
 + * convert_extent - convert all bits in a given range from one bit to another
 + * @tree:     the io tree to search
 + * @start:    the start offset in bytes
 + * @end:      the end offset in bytes (inclusive)
 + * @bits:     the bits to set in this range
 + * @clear_bits:       the bits to clear in this range
 + * @mask:     the allocation mask
 + *
 + * This will go through and set bits for the given range.  If any states exist
 + * already in this range they are set with the given bit and cleared of the
 + * clear_bits.  This is only meant to be used by things that are mergeable, ie
 + * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 + * boundary bits like LOCK.
 + */
 +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                     int bits, int clear_bits, gfp_t mask)
 +{
 +      struct extent_state *state;
 +      struct extent_state *prealloc = NULL;
 +      struct rb_node *node;
 +      int err = 0;
 +      u64 last_start;
 +      u64 last_end;
 +
 +again:
 +      if (!prealloc && (mask & __GFP_WAIT)) {
 +              prealloc = alloc_extent_state(mask);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +      }
 +
 +      spin_lock(&tree->lock);
 +      /*
 +       * this search will find all the extents that end after
 +       * our range starts.
 +       */
 +      node = tree_search(tree, start);
 +      if (!node) {
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +              err = insert_state(tree, prealloc, start, end, &bits);
 +              prealloc = NULL;
 +              BUG_ON(err == -EEXIST);
 +              goto out;
 +      }
 +      state = rb_entry(node, struct extent_state, rb_node);
 +hit_next:
 +      last_start = state->start;
 +      last_end = state->end;
 +
 +      /*
 +       * | ---- desired range ---- |
 +       * | state |
 +       *
 +       * Just lock what we found and keep going
 +       */
 +      if (state->start == start && state->end <= end) {
 +              struct rb_node *next_node;
 +
 +              set_state_bits(tree, state, &bits);
 +              clear_state_bit(tree, state, &clear_bits, 0);
 +
 +              merge_state(tree, state);
 +              if (last_end == (u64)-1)
 +                      goto out;
 +
 +              start = last_end + 1;
 +              next_node = rb_next(&state->rb_node);
 +              if (next_node && start < end && prealloc && !need_resched()) {
 +                      state = rb_entry(next_node, struct extent_state,
 +                                       rb_node);
 +                      if (state->start == start)
 +                              goto hit_next;
 +              }
 +              goto search_again;
 +      }
 +
 +      /*
 +       *     | ---- desired range ---- |
 +       * | state |
 +       *   or
 +       * | ------------- state -------------- |
 +       *
 +       * We need to split the extent we found, and may flip bits on
 +       * second half.
 +       *
 +       * If the extent we found extends past our
 +       * range, we just split and search again.  It'll get split
 +       * again the next time though.
 +       *
 +       * If the extent we found is inside our range, we set the
 +       * desired bit on it.
 +       */
 +      if (state->start < start) {
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +              err = split_state(tree, state, prealloc, start);
 +              BUG_ON(err == -EEXIST);
 +              prealloc = NULL;
 +              if (err)
 +                      goto out;
 +              if (state->end <= end) {
 +                      set_state_bits(tree, state, &bits);
 +                      clear_state_bit(tree, state, &clear_bits, 0);
 +                      merge_state(tree, state);
 +                      if (last_end == (u64)-1)
 +                              goto out;
 +                      start = last_end + 1;
 +              }
 +              goto search_again;
 +      }
 +      /*
 +       * | ---- desired range ---- |
 +       *     | state | or               | state |
 +       *
 +       * There's a hole, we need to insert something in it and
 +       * ignore the extent we found.
 +       */
 +      if (state->start > start) {
 +              u64 this_end;
 +              if (end < last_start)
 +                      this_end = end;
 +              else
 +                      this_end = last_start - 1;
 +
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +
 +              /*
 +               * Avoid to free 'prealloc' if it can be merged with
 +               * the later extent.
 +               */
 +              err = insert_state(tree, prealloc, start, this_end,
 +                                 &bits);
 +              BUG_ON(err == -EEXIST);
 +              if (err) {
 +                      free_extent_state(prealloc);
 +                      prealloc = NULL;
 +                      goto out;
 +              }
 +              prealloc = NULL;
 +              start = this_end + 1;
 +              goto search_again;
 +      }
 +      /*
 +       * | ---- desired range ---- |
 +       *                        | state |
 +       * We need to split the extent, and set the bit
 +       * on the first half
 +       */
 +      if (state->start <= end && state->end > end) {
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +
 +              err = split_state(tree, state, prealloc, end + 1);
 +              BUG_ON(err == -EEXIST);
 +
 +              set_state_bits(tree, prealloc, &bits);
 +              clear_state_bit(tree, prealloc, &clear_bits, 0);
 +
 +              merge_state(tree, prealloc);
 +              prealloc = NULL;
 +              goto out;
 +      }
 +
 +      goto search_again;
 +
 +out:
 +      spin_unlock(&tree->lock);
 +      if (prealloc)
 +              free_extent_state(prealloc);
 +
 +      return err;
 +
 +search_again:
 +      if (start > end)
 +              goto out;
 +      spin_unlock(&tree->lock);
 +      if (mask & __GFP_WAIT)
 +              cond_resched();
 +      goto again;
 +}
 +
  /* wrappers around set/clear extent bit */
  int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
@@@ -1107,7 -919,7 +1107,7 @@@ int set_extent_delalloc(struct extent_i
                        struct extent_state **cached_state, gfp_t mask)
  {
        return set_extent_bit(tree, start, end,
 -                            EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
 +                            EXTENT_DELALLOC | EXTENT_UPTODATE,
                              0, NULL, cached_state, mask);
  }
  
@@@ -1919,7 -1731,7 +1919,7 @@@ static void end_bio_extent_readpage(str
                if (!uptodate && tree->ops &&
                    tree->ops->readpage_io_failed_hook) {
                        ret = tree->ops->readpage_io_failed_hook(bio, page,
-                                                        start, end, NULL);
+                                                        start, end, state);
                        if (ret == 0) {
                                uptodate =
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
@@@ -2324,7 -2136,6 +2324,7 @@@ static int __extent_writepage(struct pa
        int compressed;
        int write_flags;
        unsigned long nr_written = 0;
 +      bool fill_delalloc = true;
  
        if (wbc->sync_mode == WB_SYNC_ALL)
                write_flags = WRITE_SYNC;
        trace___extent_writepage(page, inode, wbc);
  
        WARN_ON(!PageLocked(page));
 +
 +      ClearPageError(page);
 +
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
           (page->index == end_index && !pg_offset)) {
  
        set_page_extent_mapped(page);
  
 +      if (!tree->ops || !tree->ops->fill_delalloc)
 +              fill_delalloc = false;
 +
        delalloc_start = start;
        delalloc_end = 0;
        page_started = 0;
 -      if (!epd->extent_locked) {
 +      if (!epd->extent_locked && fill_delalloc) {
                u64 delalloc_to_write = 0;
                /*
                 * make sure the wbc mapping index is at least updated
@@@ -2616,16 -2421,10 +2616,16 @@@ retry
                         * swizzled back from swapper_space to tmpfs file
                         * mapping
                         */
 -                      if (tree->ops && tree->ops->write_cache_pages_lock_hook)
 -                              tree->ops->write_cache_pages_lock_hook(page);
 -                      else
 -                              lock_page(page);
 +                      if (tree->ops &&
 +                          tree->ops->write_cache_pages_lock_hook) {
 +                              tree->ops->write_cache_pages_lock_hook(page,
 +                                                             data, flush_fn);
 +                      } else {
 +                              if (!trylock_page(page)) {
 +                                      flush_fn(data);
 +                                      lock_page(page);
 +                              }
 +                      }
  
                        if (unlikely(page->mapping != mapping)) {
                                unlock_page(page);
@@@ -3405,7 -3204,6 +3405,7 @@@ int clear_extent_buffer_dirty(struct ex
                                                PAGECACHE_TAG_DIRTY);
                }
                spin_unlock_irq(&page->mapping->tree_lock);
 +              ClearPageError(page);
                unlock_page(page);
        }
        return 0;
@@@ -3551,8 -3349,7 +3551,7 @@@ int extent_buffer_uptodate(struct exten
  }
  
  int read_extent_buffer_pages(struct extent_io_tree *tree,
-                            struct extent_buffer *eb,
-                            u64 start, int wait,
+                            struct extent_buffer *eb, u64 start, int wait,
                             get_extent_t *get_extent, int mirror_num)
  {
        unsigned long i;
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               if (!wait) {
+               if (wait == WAIT_NONE) {
                        if (!trylock_page(page))
                                goto unlock_exit;
                } else {
        if (bio)
                submit_one_bio(READ, bio, mirror_num, bio_flags);
  
-       if (ret || !wait)
+       if (ret || wait != WAIT_COMPLETE)
                return ret;
  
        for (i = start_i; i < num_pages; i++) {
diff --combined fs/btrfs/extent_io.h
index cbd4824a7c9429ed16bfd8cb0985216ee0e5aae5,fcaf49bcb8809682a9f01555e5b2130fb48df8b7..697570eed9e8228c700782e9f6c3d93945d902ca
@@@ -17,7 -17,6 +17,7 @@@
  #define EXTENT_NODATASUM (1 << 10)
  #define EXTENT_DO_ACCOUNTING (1 << 11)
  #define EXTENT_FIRST_DELALLOC (1 << 12)
 +#define EXTENT_NEED_WAIT (1 << 13)
  #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
  #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
  
@@@ -33,6 -32,7 +33,7 @@@
  #define EXTENT_BUFFER_BLOCKING 1
  #define EXTENT_BUFFER_DIRTY 2
  #define EXTENT_BUFFER_CORRUPT 3
+ #define EXTENT_BUFFER_READAHEAD 4     /* this got triggered by readahead */
  
  /* these are flags for extent_clear_unlock_delalloc */
  #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@@ -86,8 -86,7 +87,8 @@@ struct extent_io_ops 
                                  struct extent_state *other);
        void (*split_extent_hook)(struct inode *inode,
                                  struct extent_state *orig, u64 split);
 -      int (*write_cache_pages_lock_hook)(struct page *page);
 +      int (*write_cache_pages_lock_hook)(struct page *page, void *data,
 +                                         void (*flush_fn)(void *));
  };
  
  struct extent_io_tree {
@@@ -216,8 -215,6 +217,8 @@@ int set_extent_dirty(struct extent_io_t
                     gfp_t mask);
  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask);
 +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                     int bits, int clear_bits, gfp_t mask);
  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
  int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@@ -252,6 -249,9 +253,9 @@@ struct extent_buffer *alloc_extent_buff
  struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len);
  void free_extent_buffer(struct extent_buffer *eb);
+ #define WAIT_NONE     0
+ #define WAIT_COMPLETE 1
+ #define WAIT_PAGE_LOCK        2
  int read_extent_buffer_pages(struct extent_io_tree *tree,
                             struct extent_buffer *eb, u64 start, int wait,
                             get_extent_t *get_extent, int mirror_num);
diff --combined fs/btrfs/scrub.c
index 69a600f07763dd71cc194707b9f4b5463cc06709,f930f2776589c9727f308c0be11ba5a9671ffba8..5bc4ec827b3d67bd4edb14daa1533955dd85bdb9
   * any can be found.
   *
   * Future enhancements:
-  *  - To enhance the performance, better read-ahead strategies for the
-  *    extent-tree can be employed.
   *  - In case an unrepairable extent is encountered, track which files are
   *    affected and report them
   *  - In case of a read error on files with nodatasum, map the file and read
   *    the extent to trigger a writeback of the good copy
   *  - track and record media errors, throw out bad devices
   *  - add a mode to also read unallocated space
-  *  - make the prefetch cancellable
   */
  
  struct scrub_bio;
@@@ -182,7 -179,7 +179,7 @@@ struct scrub_dev *scrub_setup_dev(struc
        sdev->curr = -1;
        atomic_set(&sdev->in_flight, 0);
        atomic_set(&sdev->cancel_req, 0);
 -      sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
 +      sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
        INIT_LIST_HEAD(&sdev->csum_list);
  
        spin_lock_init(&sdev->list_lock);
@@@ -741,13 -738,16 +738,16 @@@ static noinline_for_stack int scrub_str
        int slot;
        int i;
        u64 nstripes;
-       int start_stripe;
        struct extent_buffer *l;
        struct btrfs_key key;
        u64 physical;
        u64 logical;
        u64 generation;
        u64 mirror_num;
+       struct reada_control *reada1;
+       struct reada_control *reada2;
+       struct btrfs_key key_start;
+       struct btrfs_key key_end;
  
        u64 increment = map->stripe_len;
        u64 offset;
        if (!path)
                return -ENOMEM;
  
-       path->reada = 2;
        path->search_commit_root = 1;
        path->skip_locking = 1;
  
        /*
-        * find all extents for each stripe and just read them to get
-        * them into the page cache
-        * FIXME: we can do better. build a more intelligent prefetching
+        * trigger the readahead for the extent tree and the csum tree and
+        * wait for completion. During readahead, the scrub is officially
+        * paused to not hold off transaction commits
         */
        logical = base + offset;
-       physical = map->stripes[num].physical;
-       ret = 0;
-       for (i = 0; i < nstripes; ++i) {
-               key.objectid = logical;
-               key.type = BTRFS_EXTENT_ITEM_KEY;
-               key.offset = (u64)0;
  
-               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0)
-                       goto out_noplug;
-               /*
-                * we might miss half an extent here, but that doesn't matter,
-                * as it's only the prefetch
-                */
-               while (1) {
-                       l = path->nodes[0];
-                       slot = path->slots[0];
-                       if (slot >= btrfs_header_nritems(l)) {
-                               ret = btrfs_next_leaf(root, path);
-                               if (ret == 0)
-                                       continue;
-                               if (ret < 0)
-                                       goto out_noplug;
-                               break;
-                       }
-                       btrfs_item_key_to_cpu(l, &key, slot);
-                       if (key.objectid >= logical + map->stripe_len)
-                               break;
-                       path->slots[0]++;
-               }
-               btrfs_release_path(path);
-               logical += increment;
-               physical += map->stripe_len;
-               cond_resched();
+       wait_event(sdev->list_wait,
+                  atomic_read(&sdev->in_flight) == 0);
+       atomic_inc(&fs_info->scrubs_paused);
+       wake_up(&fs_info->scrub_pause_wait);
+       /* FIXME it might be better to start readahead at commit root */
+       key_start.objectid = logical;
+       key_start.type = BTRFS_EXTENT_ITEM_KEY;
+       key_start.offset = (u64)0;
+       key_end.objectid = base + offset + nstripes * increment;
+       key_end.type = BTRFS_EXTENT_ITEM_KEY;
+       key_end.offset = (u64)0;
+       reada1 = btrfs_reada_add(root, &key_start, &key_end);
+       key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+       key_start.type = BTRFS_EXTENT_CSUM_KEY;
+       key_start.offset = logical;
+       key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+       key_end.type = BTRFS_EXTENT_CSUM_KEY;
+       key_end.offset = base + offset + nstripes * increment;
+       reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+       if (!IS_ERR(reada1))
+               btrfs_reada_wait(reada1);
+       if (!IS_ERR(reada2))
+               btrfs_reada_wait(reada2);
+       mutex_lock(&fs_info->scrub_lock);
+       while (atomic_read(&fs_info->scrub_pause_req)) {
+               mutex_unlock(&fs_info->scrub_lock);
+               wait_event(fs_info->scrub_pause_wait,
+                  atomic_read(&fs_info->scrub_pause_req) == 0);
+               mutex_lock(&fs_info->scrub_lock);
        }
+       atomic_dec(&fs_info->scrubs_paused);
+       mutex_unlock(&fs_info->scrub_lock);
+       wake_up(&fs_info->scrub_pause_wait);
  
        /*
         * collect all data csums for the stripe to avoid seeking during
         * the scrub. This might currently (crc32) end up to be about 1MB
         */
-       start_stripe = 0;
        blk_start_plug(&plug);
- again:
-       logical = base + offset + start_stripe * increment;
-       for (i = start_stripe; i < nstripes; ++i) {
-               ret = btrfs_lookup_csums_range(csum_root, logical,
-                                              logical + map->stripe_len - 1,
-                                              &sdev->csum_list, 1);
-               if (ret)
-                       goto out;
  
-               logical += increment;
-               cond_resched();
-       }
        /*
         * now find all extents for each stripe and scrub them
         */
-       logical = base + offset + start_stripe * increment;
-       physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+       logical = base + offset;
+       physical = map->stripes[num].physical;
        ret = 0;
-       for (i = start_stripe; i < nstripes; ++i) {
+       for (i = 0; i < nstripes; ++i) {
                /*
                 * canceled?
                 */
                        atomic_dec(&fs_info->scrubs_paused);
                        mutex_unlock(&fs_info->scrub_lock);
                        wake_up(&fs_info->scrub_pause_wait);
-                       scrub_free_csums(sdev);
-                       start_stripe = i;
-                       goto again;
                }
  
+               ret = btrfs_lookup_csums_range(csum_root, logical,
+                                              logical + map->stripe_len - 1,
+                                              &sdev->csum_list, 1);
+               if (ret)
+                       goto out;
                key.objectid = logical;
                key.type = BTRFS_EXTENT_ITEM_KEY;
                key.offset = (u64)0;
@@@ -982,7 -971,6 +971,6 @@@ next
  
  out:
        blk_finish_plug(&plug);
- out_noplug:
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
  }
diff --combined fs/btrfs/volumes.c
index c3b45564048e738fac8eb1098c6d4f654ed06cc2,1dccce5bc93d0548c90a77f358dfb6842f95725e..f1685a2b45c88a8471d56c12351ac674fff0ca10
@@@ -366,6 -366,14 +366,14 @@@ static noinline int device_list_add(con
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);
  
+               /* init readahead state */
+               spin_lock_init(&device->reada_lock);
+               device->reada_curr_zone = NULL;
+               atomic_set(&device->reada_in_flight, 0);
+               device->reada_next = 0;
+               INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+               INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);
@@@ -597,8 -605,10 +605,8 @@@ static int __btrfs_open_devices(struct 
                set_blocksize(bdev, 4096);
  
                bh = btrfs_read_dev_super(bdev);
 -              if (!bh) {
 -                      ret = -EINVAL;
 +              if (!bh)
                        goto error_close;
 -              }
  
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
@@@ -653,7 -663,7 +661,7 @@@ error
                continue;
        }
        if (fs_devices->open_devices == 0) {
 -              ret = -EIO;
 +              ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
@@@ -1011,13 -1021,8 +1019,13 @@@ static int btrfs_free_dev_extent(struc
        }
        BUG_ON(ret);
  
 -      if (device->bytes_used > 0)
 -              device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 +      if (device->bytes_used > 0) {
 +              u64 len = btrfs_dev_extent_length(leaf, extent);
 +              device->bytes_used -= len;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space += len;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
 +      }
        ret = btrfs_del_item(trans, root, path);
  
  out:
@@@ -1359,11 -1364,6 +1367,11 @@@ int btrfs_rm_device(struct btrfs_root *
        if (ret)
                goto error_undo;
  
 +      spin_lock(&root->fs_info->free_chunk_lock);
 +      root->fs_info->free_chunk_space = device->total_bytes -
 +              device->bytes_used;
 +      spin_unlock(&root->fs_info->free_chunk_lock);
 +
        device->in_fs_metadata = 0;
        btrfs_scrub_cancel_dev(root, device);
  
        call_rcu(&device->rcu, free_device);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
 -      num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 -      btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
 +      num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 +      btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
  
        if (cur_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
@@@ -1458,7 -1458,7 +1466,7 @@@ static int btrfs_prepare_sprout(struct 
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
 -      struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
 +      struct btrfs_super_block *disk_super = root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;
  
@@@ -1699,19 -1699,15 +1707,19 @@@ int btrfs_init_new_device(struct btrfs_
                root->fs_info->fs_devices->num_can_discard++;
        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
  
 +      spin_lock(&root->fs_info->free_chunk_lock);
 +      root->fs_info->free_chunk_space += device->total_bytes;
 +      spin_unlock(&root->fs_info->free_chunk_lock);
 +
        if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                root->fs_info->fs_devices->rotating = 1;
  
 -      total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 -      btrfs_set_super_total_bytes(&root->fs_info->super_copy,
 +      total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
 +      btrfs_set_super_total_bytes(root->fs_info->super_copy,
                                    total_bytes + device->total_bytes);
  
 -      total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 -      btrfs_set_super_num_devices(&root->fs_info->super_copy,
 +      total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
 +      btrfs_set_super_num_devices(root->fs_info->super_copy,
                                    total_bytes + 1);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
@@@ -1802,7 -1798,7 +1810,7 @@@ static int __btrfs_grow_device(struct b
                      struct btrfs_device *device, u64 new_size)
  {
        struct btrfs_super_block *super_copy =
 -              &device->dev_root->fs_info->super_copy;
 +              device->dev_root->fs_info->super_copy;
        u64 old_total = btrfs_super_total_bytes(super_copy);
        u64 diff = new_size - device->total_bytes;
  
@@@ -1861,7 -1857,7 +1869,7 @@@ static int btrfs_free_chunk(struct btrf
  static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
                        chunk_offset)
  {
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
        u8 *ptr;
@@@ -2187,7 -2183,7 +2195,7 @@@ int btrfs_shrink_device(struct btrfs_de
        bool retried = false;
        struct extent_buffer *l;
        struct btrfs_key key;
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        u64 old_total = btrfs_super_total_bytes(super_copy);
        u64 old_size = device->total_bytes;
        u64 diff = device->total_bytes - new_size;
        lock_chunks(root);
  
        device->total_bytes = new_size;
 -      if (device->writeable)
 +      if (device->writeable) {
                device->fs_devices->total_rw_bytes -= diff;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space -= diff;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
 +      }
        unlock_chunks(root);
  
  again:
                device->total_bytes = old_size;
                if (device->writeable)
                        device->fs_devices->total_rw_bytes += diff;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space += diff;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
                unlock_chunks(root);
                goto done;
        }
@@@ -2311,7 -2300,7 +2319,7 @@@ static int btrfs_add_system_chunk(struc
                           struct btrfs_key *key,
                           struct btrfs_chunk *chunk, int item_size)
  {
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct btrfs_disk_key disk_key;
        u32 array_size;
        u8 *ptr;
@@@ -2634,11 -2623,6 +2642,11 @@@ static int __finish_chunk_alloc(struct 
                index++;
        }
  
 +      spin_lock(&extent_root->fs_info->free_chunk_lock);
 +      extent_root->fs_info->free_chunk_space -= (stripe_size *
 +                                                 map->num_stripes);
 +      spin_unlock(&extent_root->fs_info->free_chunk_lock);
 +
        index = 0;
        stripe = &chunk->stripe;
        while (index < map->num_stripes) {
@@@ -3640,20 -3624,15 +3648,20 @@@ static int read_one_dev(struct btrfs_ro
        fill_device_from_item(leaf, dev_item, device);
        device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
 -      if (device->writeable)
 +      if (device->writeable) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space += device->total_bytes -
 +                      device->bytes_used;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
 +      }
        ret = 0;
        return ret;
  }
  
  int btrfs_read_sys_array(struct btrfs_root *root)
  {
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;