git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge branch 'freespace-tree' into for-linus-4.5
author Chris Mason <clm@fb.com>
Fri, 18 Dec 2015 19:11:10 +0000 (11:11 -0800)
committer Chris Mason <clm@fb.com>
Fri, 18 Dec 2015 19:11:10 +0000 (11:11 -0800)
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/super.c
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/tests/free-space-tests.c
include/trace/events/btrfs.h

diff --combined fs/btrfs/ctree.h
index 35489e7129a7e8de9d0232279d41d3bbd19ae1df,ed610f9c04b29fc43d5efc51ae3737b8f2a053aa..cf87979a153e69c027b3a01cb96df1dc8a511158
@@@ -96,6 -96,9 +96,9 @@@ struct btrfs_ordered_sum
  /* for storing items that use the BTRFS_UUID_KEY* types */
  #define BTRFS_UUID_TREE_OBJECTID 9ULL
  
+ /* tracks free space in block groups. */
+ #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
  /* for storing balance parameters in the root tree */
  #define BTRFS_BALANCE_OBJECTID -4ULL
  
@@@ -500,6 -503,8 +503,8 @@@ struct btrfs_super_block 
   * Compat flags that we support.  If any incompat flags are set other than the
   * ones specified below then we will fail to mount
   */
+ #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE       (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF  (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
  #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS   (1ULL << 2)
  #define BTRFS_FEATURE_COMPAT_SUPP             0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_SET         0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR               0ULL
- #define BTRFS_FEATURE_COMPAT_RO_SUPP          0ULL
+ #define BTRFS_FEATURE_COMPAT_RO_SUPP                  \
+       (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET      0ULL
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR    0ULL
  
@@@ -823,18 -831,8 +831,18 @@@ struct btrfs_disk_balance_args 
         */
        __le64 profiles;
  
 -      /* usage filter */
 -      __le64 usage;
 +      /*
 +       * usage filter
 +       * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
 +       * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
 +       */
 +      union {
 +              __le64 usage;
 +              struct {
 +                      __le32 usage_min;
 +                      __le32 usage_max;
 +              };
 +      };
  
        /* devid filter */
        __le64 devid;
        /* BTRFS_BALANCE_ARGS_* */
        __le64 flags;
  
 -      /* BTRFS_BALANCE_ARGS_LIMIT value */
 -      __le64 limit;
 +      /*
 +       * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
 +       * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum
 +       * and maximum
 +       */
 +      union {
 +              __le64 limit;
 +              struct {
 +                      __le32 limit_min;
 +                      __le32 limit_max;
 +              };
 +      };
  
 -      __le64 unused[7];
 +      /*
 +       * Process chunks that cross stripes_min..stripes_max devices,
 +       * BTRFS_BALANCE_ARGS_STRIPES_RANGE
 +       */
 +      __le32 stripes_min;
 +      __le32 stripes_max;
 +
 +      __le64 unused[6];
  } __attribute__ ((__packed__));
  
  /*
@@@ -1088,6 -1069,13 +1096,13 @@@ struct btrfs_block_group_item 
        __le64 flags;
  } __attribute__ ((__packed__));
  
+ struct btrfs_free_space_info {
+       __le32 extent_count;
+       __le32 flags;
+ } __attribute__ ((__packed__));
+ #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
  #define BTRFS_QGROUP_LEVEL_SHIFT              48
  static inline u64 btrfs_qgroup_level(u64 qgroupid)
  {
@@@ -1181,10 -1169,6 +1196,10 @@@ struct btrfs_space_info 
                                   delalloc/allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
  
 +      u64 max_extent_size;    /* This will hold the maximum extent size of
 +                                 the space info if we had an ENOSPC in the
 +                                 allocator. */
 +
        unsigned int full:1;    /* indicates that we cannot allocate any more
                                   chunks for this space */
        unsigned int chunk_alloc:1;     /* set if we are allocating a chunk */
@@@ -1259,9 -1243,6 +1274,9 @@@ struct btrfs_free_cluster 
        /* first extent starting offset */
        u64 window_start;
  
 +      /* We did a full search and couldn't create a cluster */
 +      bool fragmented;
 +
        struct btrfs_block_group_cache *block_group;
        /*
         * when a cluster is allocated from a block group, we put the
@@@ -1296,6 -1277,9 +1311,9 @@@ struct btrfs_caching_control 
        atomic_t count;
  };
  
+ /* Once caching_thread() finds this much free space, it will wake up waiters. */
+ #define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
  struct btrfs_io_ctl {
        void *cur, *orig;
        struct page *page;
@@@ -1321,8 -1305,20 +1339,20 @@@ struct btrfs_block_group_cache 
        u64 delalloc_bytes;
        u64 bytes_super;
        u64 flags;
-       u64 sectorsize;
        u64 cache_generation;
+       u32 sectorsize;
+       /*
+        * If the free space extent count exceeds this number, convert the block
+        * group to bitmaps.
+        */
+       u32 bitmap_high_thresh;
+       /*
+        * If the free space extent count drops below this number, convert the
+        * block group back to extents.
+        */
+       u32 bitmap_low_thresh;
  
        /*
         * It is just used for the delayed data space allocation because
        struct list_head io_list;
  
        struct btrfs_io_ctl io_ctl;
+       /* Lock for free space tree operations. */
+       struct mutex free_space_lock;
+       /*
+        * Does the block group need to be added to the free space tree?
+        * Protected by free_space_lock.
+        */
+       int needs_free_space;
  };
  
  /* delayed seq elem */
@@@ -1429,6 -1434,7 +1468,7 @@@ struct btrfs_fs_info 
        struct btrfs_root *csum_root;
        struct btrfs_root *quota_root;
        struct btrfs_root *uuid_root;
+       struct btrfs_root *free_space_root;
  
        /* the log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;
@@@ -1977,9 -1983,6 +2017,9 @@@ struct btrfs_root 
        int send_in_progress;
        struct btrfs_subvolume_writers *subv_writers;
        atomic_t will_be_snapshoted;
 +
 +      /* For qgroup metadata space reserve */
 +      atomic_t qgroup_meta_rsv;
  };
  
  struct btrfs_ioctl_defrag_range_args {
   */
  #define BTRFS_BLOCK_GROUP_ITEM_KEY 192
  
+ /*
+  * Every block group is represented in the free space tree by a free space info
+  * item, which stores some accounting information. It is keyed on
+  * (block_group_start, FREE_SPACE_INFO, block_group_length).
+  */
+ #define BTRFS_FREE_SPACE_INFO_KEY 198
+ /*
+  * A free space extent tracks an extent of space that is free in a block group.
+  * It is keyed on (start, FREE_SPACE_EXTENT, length).
+  */
+ #define BTRFS_FREE_SPACE_EXTENT_KEY 199
+ /*
+  * When a block group becomes very fragmented, we convert it to use bitmaps
+  * instead of extents. A free space bitmap is keyed on
+  * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+  * (length / sectorsize) bits.
+  */
+ #define BTRFS_FREE_SPACE_BITMAP_KEY 200
  #define BTRFS_DEV_EXTENT_KEY  204
  #define BTRFS_DEV_ITEM_KEY    216
  #define BTRFS_CHUNK_ITEM_KEY  228
  #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
  #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR      (1 << 22)
  #define BTRFS_MOUNT_RESCAN_UUID_TREE  (1 << 23)
 -#define BTRFS_MOUNT_FREE_SPACE_TREE   (1 << 24)
 +#define BTRFS_MOUNT_FRAGMENT_DATA     (1 << 24)
 +#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
++#define BTRFS_MOUNT_FREE_SPACE_TREE   (1 << 26)
  
  #define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
  #define BTRFS_DEFAULT_MAX_INLINE      (8192)
        btrfs_clear_opt(root->fs_info->mount_opt, opt);                 \
  }
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +static inline int
 +btrfs_should_fragment_free_space(struct btrfs_root *root,
 +                               struct btrfs_block_group_cache *block_group)
 +{
 +      return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
 +              block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 +             (btrfs_test_opt(root, FRAGMENT_DATA) &&
 +              block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
 +}
 +#endif
 +
  /*
   * Requests for changes that need to be done during transaction commit.
   *
@@@ -2506,6 -2517,11 +2568,11 @@@ BTRFS_SETGET_FUNCS(disk_block_group_fla
  BTRFS_SETGET_STACK_FUNCS(block_group_flags,
                        struct btrfs_block_group_item, flags, 64);
  
+ /* struct btrfs_free_space_info */
+ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+                  extent_count, 32);
+ BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
  /* struct btrfs_inode_ref */
  BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
  BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@@ -3367,7 -3383,7 +3434,7 @@@ static inline bool btrfs_mixed_space_in
  
  static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
  {
 -      return mapping_gfp_mask(mapping) & ~__GFP_FS;
 +      return mapping_gfp_constraint(mapping, ~__GFP_FS);
  }
  
  /* extent-tree.c */
@@@ -3416,7 -3432,6 +3483,7 @@@ int btrfs_cross_ref_exist(struct btrfs_
  struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr);
 +void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
  int get_block_group_index(struct btrfs_block_group_cache *cache);
  struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@@ -3431,8 -3446,7 +3498,8 @@@ void btrfs_free_tree_block(struct btrfs
  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
 -                                   u64 offset, struct btrfs_key *ins);
 +                                   u64 offset, u64 ram_bytes,
 +                                   struct btrfs_key *ins);
  int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   u64 root_objectid, u64 owner, u64 offset,
@@@ -3451,7 -3465,7 +3518,7 @@@ int btrfs_set_disk_extent_flags(struct 
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 -                    u64 owner, u64 offset, int no_quota);
 +                    u64 owner, u64 offset);
  
  int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
                               int delalloc);
@@@ -3464,7 -3478,7 +3531,7 @@@ int btrfs_finish_extent_commit(struct b
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
 -                       u64 root_objectid, u64 owner, u64 offset, int no_quota);
 +                       u64 root_objectid, u64 owner, u64 offset);
  
  int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root);
@@@ -3480,9 -3494,6 +3547,9 @@@ int btrfs_make_block_group(struct btrfs
                           struct btrfs_root *root, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
                           u64 size);
 +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 +                              struct btrfs_fs_info *fs_info,
 +                              const u64 chunk_offset);
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start,
                             struct extent_map *em);
@@@ -3505,11 -3516,8 +3572,11 @@@ enum btrfs_reserve_flush_enum 
        BTRFS_RESERVE_FLUSH_ALL,
  };
  
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len);
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@@ -3525,8 -3533,8 +3592,8 @@@ void btrfs_subvolume_release_metadata(s
                                      u64 qgroup_reserved);
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
                                              unsigned short type);
@@@ -3573,6 -3581,9 +3640,9 @@@ void btrfs_end_write_no_snapshoting(str
  void check_system_chunk(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        const u64 type);
+ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+                      struct btrfs_fs_info *info, u64 start, u64 end);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@@ -3737,6 -3748,7 +3807,7 @@@ static inline void free_fs_info(struct 
        kfree(fs_info->csum_root);
        kfree(fs_info->quota_root);
        kfree(fs_info->uuid_root);
+       kfree(fs_info->free_space_root);
        kfree(fs_info->super_copy);
        kfree(fs_info->super_for_commit);
        security_free_mnt_opts(&fs_info->security_opts);
@@@ -4063,8 -4075,8 +4134,8 @@@ int btrfs_defrag_leaves(struct btrfs_tr
  /* sysfs.c */
  int btrfs_init_sysfs(void);
  void btrfs_exit_sysfs(void);
 -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
 -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
 +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
 +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
  
  /* xattr.c */
  ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@@ -4098,102 -4110,14 +4169,102 @@@ void btrfs_printk(const struct btrfs_fs
  #define btrfs_info(fs_info, fmt, args...) \
        btrfs_printk(fs_info, KERN_INFO fmt, ##args)
  
 +/*
 + * Wrappers that use printk_in_rcu
 + */
 +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
 +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
 +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
 +#define btrfs_err_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
 +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
 +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
 +#define btrfs_info_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
 +
 +/*
 + * Wrappers that use a ratelimited printk_in_rcu
 + */
 +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
 +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
 +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
 +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
 +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
 +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
 +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
 +
 +/*
 + * Wrappers that use a ratelimited printk
 + */
 +#define btrfs_emerg_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
 +#define btrfs_alert_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
 +#define btrfs_crit_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
 +#define btrfs_err_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
 +#define btrfs_warn_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
 +#define btrfs_notice_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
 +#define btrfs_info_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
  #ifdef DEBUG
  #define btrfs_debug(fs_info, fmt, args...) \
        btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
  #else
  #define btrfs_debug(fs_info, fmt, args...) \
      no_printk(KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
 +      no_printk(KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
 +      no_printk(KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl(fs_info, fmt, args...) \
 +      no_printk(KERN_DEBUG fmt, ##args)
  #endif
  
 +#define btrfs_printk_in_rcu(fs_info, fmt, args...)    \
 +do {                                                  \
 +      rcu_read_lock();                                \
 +      btrfs_printk(fs_info, fmt, ##args);             \
 +      rcu_read_unlock();                              \
 +} while (0)
 +
 +#define btrfs_printk_ratelimited(fs_info, fmt, args...)               \
 +do {                                                          \
 +      static DEFINE_RATELIMIT_STATE(_rs,                      \
 +              DEFAULT_RATELIMIT_INTERVAL,                     \
 +              DEFAULT_RATELIMIT_BURST);                       \
 +      if (__ratelimit(&_rs))                                  \
 +              btrfs_printk(fs_info, fmt, ##args);             \
 +} while (0)
 +
 +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...)         \
 +do {                                                          \
 +      rcu_read_lock();                                        \
 +      btrfs_printk_ratelimited(fs_info, fmt, ##args);         \
 +      rcu_read_unlock();                                      \
 +} while (0)
 +
  #ifdef CONFIG_BTRFS_ASSERT
  
  __cold
@@@ -4247,6 -4171,30 +4318,30 @@@ static inline void __btrfs_set_fs_incom
        }
  }
  
+ #define btrfs_clear_fs_incompat(__fs_info, opt) \
+       __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+                                            u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_incompat_flags(disk_super);
+       if (features & flag) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_incompat_flags(disk_super);
+               if (features & flag) {
+                       features &= ~flag;
+                       btrfs_set_super_incompat_flags(disk_super, features);
+                       btrfs_info(fs_info, "clearing %llu feature flag",
+                                        flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
  #define btrfs_fs_incompat(fs_info, opt) \
        __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
  
@@@ -4257,6 -4205,64 +4352,64 @@@ static inline int __btrfs_fs_incompat(s
        return !!(btrfs_super_incompat_flags(disk_super) & flag);
  }
  
+ #define btrfs_set_fs_compat_ro(__fs_info, opt) \
+       __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+                                           u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_compat_ro_flags(disk_super);
+       if (!(features & flag)) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_compat_ro_flags(disk_super);
+               if (!(features & flag)) {
+                       features |= flag;
+                       btrfs_set_super_compat_ro_flags(disk_super, features);
+                       btrfs_info(fs_info, "setting %llu ro feature flag",
+                                  flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
+ #define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+       __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+                                             u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_compat_ro_flags(disk_super);
+       if (features & flag) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_compat_ro_flags(disk_super);
+               if (features & flag) {
+                       features &= ~flag;
+                       btrfs_set_super_compat_ro_flags(disk_super, features);
+                       btrfs_info(fs_info, "clearing %llu ro feature flag",
+                                  flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
+ #define btrfs_fs_compat_ro(fs_info, opt) \
+       __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       disk_super = fs_info->super_copy;
+       return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+ }
  /*
   * Call btrfs_abort_transaction as early as possible when an error condition is
   * detected, that way the exact line number is reported.
@@@ -4274,7 -4280,14 +4427,7 @@@ do {                                                           
                                  __LINE__, (errno));           \
  } while (0)
  
 -#define btrfs_std_error(fs_info, errno)                               \
 -do {                                                          \
 -      if ((errno))                                            \
 -              __btrfs_std_error((fs_info), __func__,          \
 -                                 __LINE__, (errno), NULL);    \
 -} while (0)
 -
 -#define btrfs_error(fs_info, errno, fmt, args...)             \
 +#define btrfs_std_error(fs_info, errno, fmt, args...)         \
  do {                                                          \
        __btrfs_std_error((fs_info), __func__, __LINE__,        \
                          (errno), fmt, ##args);                \
  } while (0)
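
A minimal sketch, not part of this merge, of how the free space tree key types introduced above fit together. It assumes struct btrfs_key and the key-type constants from ctree.h; the helper name is hypothetical.

/*
 * Sketch only: one FREE_SPACE_INFO item describes a block group, and either
 * FREE_SPACE_EXTENT items or a single FREE_SPACE_BITMAP item (with
 * length / sectorsize bits) describe the free ranges inside it.
 */
static void free_space_tree_key_sketch(u64 bg_start, u64 bg_len,
				       u64 free_start, u64 free_len)
{
	struct btrfs_key info_key;
	struct btrfs_key extent_key;

	/* Accounting item: (block_group_start, FREE_SPACE_INFO, block_group_length) */
	info_key.objectid = bg_start;
	info_key.type = BTRFS_FREE_SPACE_INFO_KEY;
	info_key.offset = bg_len;

	/* One free range: (start, FREE_SPACE_EXTENT, length) */
	extent_key.objectid = free_start;
	extent_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
	extent_key.offset = free_len;
}
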
diff --combined fs/btrfs/disk-io.c
index 974be09e7556ca3f342cac89357364f4ce3cc016,af7ac28380c28d1d6b4326f8183573ea48b46782..52e623f598480881706586a75017da1f3a0abd69
@@@ -42,6 -42,7 +42,7 @@@
  #include "locking.h"
  #include "tree-log.h"
  #include "free-space-cache.h"
+ #include "free-space-tree.h"
  #include "inode-map.h"
  #include "check-integrity.h"
  #include "rcu-string.h"
@@@ -319,9 -320,9 +320,9 @@@ static int csum_tree_block(struct btrfs
                        memcpy(&found, result, csum_size);
  
                        read_extent_buffer(buf, &val, 0, csum_size);
 -                      printk_ratelimited(KERN_WARNING
 -                              "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
 -                              "level %d\n",
 +                      btrfs_warn_rl(fs_info,
 +                              "%s checksum verify failed on %llu wanted %X found %X "
 +                              "level %d",
                                fs_info->sb->s_id, buf->start,
                                val, found, btrfs_header_level(buf));
                        if (result != (char *)&inline_result)
@@@ -368,9 -369,9 +369,9 @@@ static int verify_parent_transid(struc
                ret = 0;
                goto out;
        }
 -      printk_ratelimited(KERN_ERR
 -          "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
 -                      eb->fs_info->sb->s_id, eb->start,
 +      btrfs_err_rl(eb->fs_info,
 +              "parent transid verify failed on %llu wanted %llu found %llu",
 +                      eb->start,
                        parent_transid, btrfs_header_generation(eb));
        ret = 1;
  
@@@ -629,14 -630,15 +630,14 @@@ static int btree_readpage_end_io_hook(s
  
        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
 -              printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
 -                             "%llu %llu\n",
 -                             eb->fs_info->sb->s_id, found_start, eb->start);
 +              btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
 +                             found_start, eb->start);
                ret = -EIO;
                goto err;
        }
        if (check_tree_block_fsid(root->fs_info, eb)) {
 -              printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
 -                             eb->fs_info->sb->s_id, eb->start);
 +              btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
 +                             eb->start);
                ret = -EIO;
                goto err;
        }
@@@ -801,9 -803,6 +802,9 @@@ static void run_one_async_done(struct b
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
  
 +      /*
 +       * atomic_dec_return implies a barrier for waitqueue_active
 +       */
        if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);
@@@ -1267,7 -1266,6 +1268,7 @@@ static void __setup_root(u32 nodesize, 
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshoted, 0);
 +      atomic_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@@ -1650,6 -1648,9 +1651,9 @@@ struct btrfs_root *btrfs_get_fs_root(st
        if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
                return fs_info->uuid_root ? fs_info->uuid_root :
                                            ERR_PTR(-ENOENT);
+       if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+               return fs_info->free_space_root ? fs_info->free_space_root :
+                                                 ERR_PTR(-ENOENT);
  again:
        root = btrfs_lookup_fs_root(fs_info, location->objectid);
        if (root) {
@@@ -1762,7 -1763,6 +1766,7 @@@ static int cleaner_kthread(void *arg
        int again;
        struct btrfs_trans_handle *trans;
  
 +      set_freezable();
        do {
                again = 0;
  
@@@ -2148,6 -2148,7 +2152,7 @@@ static void free_root_pointers(struct b
        free_root_extent_buffers(info->uuid_root);
        if (chunk_root)
                free_root_extent_buffers(info->chunk_root);
+       free_root_extent_buffers(info->free_space_root);
  }
  
  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@@ -2352,7 -2353,8 +2357,7 @@@ static int btrfs_replay_log(struct btrf
        u64 bytenr = btrfs_super_log_root(disk_super);
  
        if (fs_devices->rw_devices == 0) {
 -              printk(KERN_WARNING "BTRFS: log replay required "
 -                     "on RO media\n");
 +              btrfs_warn(fs_info, "log replay required on RO media");
                return -EIO;
        }
  
        log_tree_root->node = read_tree_block(tree_root, bytenr,
                        fs_info->generation + 1);
        if (IS_ERR(log_tree_root->node)) {
 -              printk(KERN_ERR "BTRFS: failed to read log tree\n");
 +              btrfs_warn(fs_info, "failed to read log tree");
                ret = PTR_ERR(log_tree_root->node);
                kfree(log_tree_root);
                return ret;
        } else if (!extent_buffer_uptodate(log_tree_root->node)) {
 -              printk(KERN_ERR "BTRFS: failed to read log tree\n");
 +              btrfs_err(fs_info, "failed to read log tree");
                free_extent_buffer(log_tree_root->node);
                kfree(log_tree_root);
                return -EIO;
        /* returns with log_tree_root freed on success */
        ret = btrfs_recover_log_trees(log_tree_root);
        if (ret) {
 -              btrfs_error(tree_root->fs_info, ret,
 +              btrfs_std_error(tree_root->fs_info, ret,
                            "Failed to recover log tree");
                free_extent_buffer(log_tree_root->node);
                kfree(log_tree_root);
@@@ -2448,6 -2450,15 +2453,15 @@@ static int btrfs_read_roots(struct btrf
                fs_info->uuid_root = root;
        }
  
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+               root = btrfs_read_tree_root(tree_root, &location);
+               if (IS_ERR(root))
+                       return PTR_ERR(root);
+               set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+               fs_info->free_space_root = root;
+       }
        return 0;
  }
  
@@@ -2575,7 -2586,7 +2589,7 @@@ int open_ctree(struct super_block *sb
        fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
        fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
        /* readahead state */
 -      INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 +      INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        spin_lock_init(&fs_info->reada_lock);
  
        fs_info->thread_pool_size = min_t(unsigned long,
         * Read super block and check the signature bytes only
         */
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 -      if (!bh) {
 -              err = -EINVAL;
 +      if (IS_ERR(bh)) {
 +              err = PTR_ERR(bh);
                goto fail_alloc;
        }
  
@@@ -2940,7 -2951,7 +2954,7 @@@ retry_root_backup
                goto fail_fsdev_sysfs;
        }
  
 -      ret = btrfs_sysfs_add_one(fs_info);
 +      ret = btrfs_sysfs_add_mounted(fs_info);
        if (ret) {
                pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
                goto fail_fsdev_sysfs;
  
        btrfs_qgroup_rescan_resume(fs_info);
  
+       if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+           btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               pr_info("BTRFS: clearing free space tree\n");
+               ret = btrfs_clear_free_space_tree(fs_info);
+               if (ret) {
+                       pr_warn("BTRFS: failed to clear free space tree %d\n",
+                               ret);
+                       close_ctree(tree_root);
+                       return ret;
+               }
+       }
+       if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+           !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               pr_info("BTRFS: creating free space tree\n");
+               ret = btrfs_create_free_space_tree(fs_info);
+               if (ret) {
+                       pr_warn("BTRFS: failed to create free space tree %d\n",
+                               ret);
+                       close_ctree(tree_root);
+                       return ret;
+               }
+       }
        if (!fs_info->uuid_root) {
                pr_info("BTRFS: creating UUID tree\n");
                ret = btrfs_create_uuid_tree(fs_info);
@@@ -3120,7 -3155,7 +3158,7 @@@ fail_cleaner
        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
  
  fail_sysfs:
 -      btrfs_sysfs_remove_one(fs_info);
 +      btrfs_sysfs_remove_mounted(fs_info);
  
  fail_fsdev_sysfs:
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@@ -3182,8 -3217,8 +3220,8 @@@ static void btrfs_end_buffer_write_sync
                struct btrfs_device *device = (struct btrfs_device *)
                        bh->b_private;
  
 -              printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
 -                                        "I/O error on %s\n",
 +              btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
 +                              "lost page write due to IO error on %s",
                                          rcu_str_deref(device->name));
                /* note, we don't set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
        put_bh(bh);
  }
  
 +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
 +                      struct buffer_head **bh_ret)
 +{
 +      struct buffer_head *bh;
 +      struct btrfs_super_block *super;
 +      u64 bytenr;
 +
 +      bytenr = btrfs_sb_offset(copy_num);
 +      if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
 +              return -EINVAL;
 +
 +      bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
 +      /*
 +       * If we fail to read from the underlying devices, as of now
 +       * the best option we have is to mark it EIO.
 +       */
 +      if (!bh)
 +              return -EIO;
 +
 +      super = (struct btrfs_super_block *)bh->b_data;
 +      if (btrfs_super_bytenr(super) != bytenr ||
 +                  btrfs_super_magic(super) != BTRFS_MAGIC) {
 +              brelse(bh);
 +              return -EINVAL;
 +      }
 +
 +      *bh_ret = bh;
 +      return 0;
 +}
 +
 +
  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
  {
        struct buffer_head *bh;
        struct btrfs_super_block *super;
        int i;
        u64 transid = 0;
 -      u64 bytenr;
 +      int ret = -EINVAL;
  
        /* we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        for (i = 0; i < 1; i++) {
 -              bytenr = btrfs_sb_offset(i);
 -              if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 -                                      i_size_read(bdev->bd_inode))
 -                      break;
 -              bh = __bread(bdev, bytenr / 4096,
 -                                      BTRFS_SUPER_INFO_SIZE);
 -              if (!bh)
 +              ret = btrfs_read_dev_one_super(bdev, i, &bh);
 +              if (ret)
                        continue;
  
                super = (struct btrfs_super_block *)bh->b_data;
 -              if (btrfs_super_bytenr(super) != bytenr ||
 -                  btrfs_super_magic(super) != BTRFS_MAGIC) {
 -                      brelse(bh);
 -                      continue;
 -              }
  
                if (!latest || btrfs_super_generation(super) > transid) {
                        brelse(latest);
                        brelse(bh);
                }
        }
 +
 +      if (!latest)
 +              return ERR_PTR(ret);
 +
        return latest;
  }
  
@@@ -3327,9 -3337,8 +3365,9 @@@ static int write_dev_supers(struct btrf
                        bh = __getblk(device->bdev, bytenr / 4096,
                                      BTRFS_SUPER_INFO_SIZE);
                        if (!bh) {
 -                              printk(KERN_ERR "BTRFS: couldn't get super "
 -                                     "buffer head for bytenr %Lu\n", bytenr);
 +                              btrfs_err(device->dev_root->fs_info,
 +                                  "couldn't get super buffer head for bytenr %llu",
 +                                  bytenr);
                                errors++;
                                continue;
                        }
@@@ -3478,31 -3487,22 +3516,31 @@@ static int barrier_all_devices(struct b
  
  int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
  {
 -      if ((flags & (BTRFS_BLOCK_GROUP_DUP |
 -                    BTRFS_BLOCK_GROUP_RAID0 |
 -                    BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
 -          ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
 -              return 0;
 +      int raid_type;
 +      int min_tolerated = INT_MAX;
  
 -      if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
 -                   BTRFS_BLOCK_GROUP_RAID5 |
 -                   BTRFS_BLOCK_GROUP_RAID10))
 -              return 1;
 +      if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
 +          (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
 +              min_tolerated = min(min_tolerated,
 +                                  btrfs_raid_array[BTRFS_RAID_SINGLE].
 +                                  tolerated_failures);
  
 -      if (flags & BTRFS_BLOCK_GROUP_RAID6)
 -              return 2;
 +      for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 +              if (raid_type == BTRFS_RAID_SINGLE)
 +                      continue;
 +              if (!(flags & btrfs_raid_group[raid_type]))
 +                      continue;
 +              min_tolerated = min(min_tolerated,
 +                                  btrfs_raid_array[raid_type].
 +                                  tolerated_failures);
 +      }
  
 -      pr_warn("BTRFS: unknown raid type: %llu\n", flags);
 -      return 0;
 +      if (min_tolerated == INT_MAX) {
 +              pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
 +              min_tolerated = 0;
 +      }
 +
 +      return min_tolerated;
  }
  
  int btrfs_calc_num_tolerated_disk_barrier_failures(
@@@ -3586,7 -3586,7 +3624,7 @@@ static int write_all_supers(struct btrf
                if (ret) {
                        mutex_unlock(
                                &root->fs_info->fs_devices->device_list_mutex);
 -                      btrfs_error(root->fs_info, ret,
 +                      btrfs_std_error(root->fs_info, ret,
                                    "errors while submitting device barriers.");
                        return ret;
                }
                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
                /* FUA is masked off if unsupported and can't be the reason */
 -              btrfs_error(root->fs_info, -EIO,
 +              btrfs_std_error(root->fs_info, -EIO,
                            "%d errors while writing supers", total_errors);
                return -EIO;
        }
        }
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        if (total_errors > max_errors) {
 -              btrfs_error(root->fs_info, -EIO,
 +              btrfs_std_error(root->fs_info, -EIO,
                            "%d errors while writing supers", total_errors);
                return -EIO;
        }
@@@ -3780,9 -3780,6 +3818,9 @@@ void close_ctree(struct btrfs_root *roo
        fs_info->closing = 1;
        smp_mb();
  
 +      /* wait for the qgroup rescan worker to stop */
 +      btrfs_qgroup_wait_for_completion(fs_info);
 +
        /* wait for the uuid_scan task to finish */
        down(&fs_info->uuid_tree_rescan_sem);
        /* avoid complains from lockdep et al., set sem back to initial state */
                       percpu_counter_sum(&fs_info->delalloc_bytes));
        }
  
 -      btrfs_sysfs_remove_one(fs_info);
 +      btrfs_sysfs_remove_mounted(fs_info);
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
  
        btrfs_free_fs_roots(fs_info);
@@@ -4331,6 -4328,25 +4369,6 @@@ again
        return 0;
  }
  
 -static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
 -                                     struct btrfs_fs_info *fs_info)
 -{
 -      struct btrfs_ordered_extent *ordered;
 -
 -      spin_lock(&fs_info->trans_lock);
 -      while (!list_empty(&cur_trans->pending_ordered)) {
 -              ordered = list_first_entry(&cur_trans->pending_ordered,
 -                                         struct btrfs_ordered_extent,
 -                                         trans_list);
 -              list_del_init(&ordered->trans_list);
 -              spin_unlock(&fs_info->trans_lock);
 -
 -              btrfs_put_ordered_extent(ordered);
 -              spin_lock(&fs_info->trans_lock);
 -      }
 -      spin_unlock(&fs_info->trans_lock);
 -}
 -
  void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
  {
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
  
 -      btrfs_free_pending_ordered(cur_trans, root->fs_info);
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
  
diff --combined fs/btrfs/extent-tree.c
index 4b89680a192338c7a70a4c909e3c4374abe41284,a4a4f593ec71d34bb988dcfa430db208866ca2c5..8abb344e3dcb003ec5a3410e9cbbd23073e49db9
@@@ -33,6 -33,7 +33,7 @@@
  #include "raid56.h"
  #include "locking.h"
  #include "free-space-cache.h"
+ #include "free-space-tree.h"
  #include "math.h"
  #include "sysfs.h"
  #include "qgroup.h"
@@@ -95,7 -96,8 +96,7 @@@ static int alloc_reserved_tree_block(st
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
 -                                   int level, struct btrfs_key *ins,
 -                                   int no_quota);
 +                                   int level, struct btrfs_key *ins);
  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
@@@ -124,7 -126,7 +125,7 @@@ static int block_group_bits(struct btrf
        return (cache->flags & bits) == bits;
  }
  
 -static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 +void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  {
        atomic_inc(&cache->count);
  }
@@@ -331,34 -333,13 +332,34 @@@ static void put_caching_control(struct 
                kfree(ctl);
  }
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +static void fragment_free_space(struct btrfs_root *root,
 +                              struct btrfs_block_group_cache *block_group)
 +{
 +      u64 start = block_group->key.objectid;
 +      u64 len = block_group->key.offset;
 +      u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 +              root->nodesize : root->sectorsize;
 +      u64 step = chunk << 1;
 +
 +      while (len > chunk) {
 +              btrfs_remove_free_space(block_group, start, chunk);
 +              start += step;
 +              if (len < step)
 +                      len = 0;
 +              else
 +                      len -= step;
 +      }
 +}
 +#endif
 +
  /*
   * this is only called by cache_block_group, since we could have freed extents
   * we need to check the pinned_extents for any extents that can't be used yet
   * since their free space will be released as soon as the transaction commits.
   */
- static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-                             struct btrfs_fs_info *info, u64 start, u64 end)
+ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+                      struct btrfs_fs_info *info, u64 start, u64 end)
  {
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;
        return total_added;
  }
  
- static noinline void caching_thread(struct btrfs_work *work)
+ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
  {
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
-       struct btrfs_caching_control *caching_ctl;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
-       int ret = -ENOMEM;
+       int ret;
 +      bool wakeup = true;
  
-       caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        extent_root = fs_info->extent_root;
  
        path = btrfs_alloc_path();
        if (!path)
-               goto out;
+               return -ENOMEM;
  
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +      /*
 +       * If we're fragmenting we don't want to make anybody think we can
 +       * allocate from this block group until we've had a chance to fragment
 +       * the free space.
 +       */
 +      if (btrfs_should_fragment_free_space(extent_root, block_group))
 +              wakeup = false;
 +#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
- again:
-       mutex_lock(&caching_ctl->mutex);
-       /* need to make sure the commit_root doesn't disappear */
-       down_read(&fs_info->commit_root_sem);
  
  next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
-               goto err;
+               goto out;
  
        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
  
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
 -                              caching_ctl->progress = last;
 +                              if (wakeup)
 +                                      caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
-                               goto again;
+                               mutex_lock(&caching_ctl->mutex);
+                               down_read(&fs_info->commit_root_sem);
+                               goto next;
                        }
  
                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
-                               goto err;
+                               goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
  
 -                      caching_ctl->progress = last;
 +                      if (wakeup)
 +                              caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
                        else
                                last = key.objectid + key.offset;
  
-                       if (total_found > (1024 * 1024 * 2)) {
+                       if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
 -                              wake_up(&caching_ctl->wait);
 +                              if (wakeup)
 +                                      wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
+       caching_ctl->progress = (u64)-1;
+ out:
+       btrfs_free_path(path);
+       return ret;
+ }
+ static noinline void caching_thread(struct btrfs_work *work)
+ {
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_caching_control *caching_ctl;
+       int ret;
+       caching_ctl = container_of(work, struct btrfs_caching_control, work);
+       block_group = caching_ctl->block_group;
+       fs_info = block_group->fs_info;
+       mutex_lock(&caching_ctl->mutex);
+       down_read(&fs_info->commit_root_sem);
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+               ret = load_free_space_tree(caching_ctl);
+       else
+               ret = load_extent_tree_free(caching_ctl);
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
-       block_group->cached = BTRFS_CACHE_FINISHED;
+       block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
  
- err:
-       btrfs_free_path(path);
-       up_read(&fs_info->commit_root_sem);
-       free_excluded_extents(extent_root, block_group);
 +#ifdef CONFIG_BTRFS_DEBUG
 +      if (btrfs_should_fragment_free_space(extent_root, block_group)) {
 +              u64 bytes_used;
 +
 +              spin_lock(&block_group->space_info->lock);
 +              spin_lock(&block_group->lock);
 +              bytes_used = block_group->key.offset -
 +                      btrfs_block_group_used(&block_group->item);
 +              block_group->space_info->bytes_used += bytes_used >> 1;
 +              spin_unlock(&block_group->lock);
 +              spin_unlock(&block_group->space_info->lock);
 +              fragment_free_space(extent_root, block_group);
 +      }
 +#endif
 +
 +      caching_ctl->progress = (u64)-1;
 +
+       up_read(&fs_info->commit_root_sem);
+       free_excluded_extents(fs_info->extent_root, block_group);
        mutex_unlock(&caching_ctl->mutex);
- out:
-       if (ret) {
-               spin_lock(&block_group->lock);
-               block_group->caching_ctl = NULL;
-               block_group->cached = BTRFS_CACHE_ERROR;
-               spin_unlock(&block_group->lock);
-       }
        wake_up(&caching_ctl->wait);
  
        put_caching_control(caching_ctl);
@@@ -654,22 -618,6 +668,22 @@@ static int cache_block_group(struct btr
                        }
                }
                spin_unlock(&cache->lock);
 +#ifdef CONFIG_BTRFS_DEBUG
 +              if (ret == 1 &&
 +                  btrfs_should_fragment_free_space(fs_info->extent_root,
 +                                                   cache)) {
 +                      u64 bytes_used;
 +
 +                      spin_lock(&cache->space_info->lock);
 +                      spin_lock(&cache->lock);
 +                      bytes_used = cache->key.offset -
 +                              btrfs_block_group_used(&cache->item);
 +                      cache->space_info->bytes_used += bytes_used >> 1;
 +                      spin_unlock(&cache->lock);
 +                      spin_unlock(&cache->space_info->lock);
 +                      fragment_free_space(fs_info->extent_root, cache);
 +              }
 +#endif
                mutex_unlock(&caching_ctl->mutex);
  
                wake_up(&caching_ctl->wait);
                }
        } else {
                /*
-                * We are not going to do the fast caching, set cached to the
-                * appropriate value and wakeup any waiters.
+                * We're either using the free space tree or no caching at all.
+                * Set cached to the appropriate value and wakeup any waiters.
                 */
                spin_lock(&cache->lock);
                if (load_cache_only) {
@@@ -2072,7 -2020,8 +2086,7 @@@ int btrfs_discard_extent(struct btrfs_r
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
 -                       u64 root_objectid, u64 owner, u64 offset,
 -                       int no_quota)
 +                       u64 root_objectid, u64 owner, u64 offset)
  {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
 -                                      BTRFS_ADD_DELAYED_REF, NULL, no_quota);
 +                                      BTRFS_ADD_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 -                                      num_bytes,
 -                                      parent, root_objectid, owner, offset,
 -                                      BTRFS_ADD_DELAYED_REF, NULL, no_quota);
 +                                      num_bytes, parent, root_objectid,
 +                                      owner, offset, 0,
 +                                      BTRFS_ADD_DELAYED_REF, NULL);
        }
        return ret;
  }
@@@ -2110,11 -2059,15 +2124,11 @@@ static int __btrfs_inc_extent_ref(struc
        u64 num_bytes = node->num_bytes;
        u64 refs;
        int ret;
 -      int no_quota = node->no_quota;
  
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
  
 -      if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
 -              no_quota = 1;
 -
        path->reada = 1;
        path->leave_spinning = 1;
        /* this will setup the path even if it fails to insert the back ref */
@@@ -2349,7 -2302,8 +2363,7 @@@ static int run_delayed_tree_ref(struct 
                                                parent, ref_root,
                                                extent_op->flags_to_set,
                                                &extent_op->key,
 -                                              ref->level, &ins,
 -                                              node->no_quota);
 +                                              ref->level, &ins);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, root, node,
                                             parent, ref_root,
@@@ -2402,11 -2356,6 +2416,11 @@@ static int run_one_delayed_ref(struct b
                                                      node->num_bytes);
                        }
                }
 +
 +              /* Also free its reserved qgroup space */
 +              btrfs_qgroup_free_delayed_ref(root->fs_info,
 +                                            head->qgroup_ref_root,
 +                                            head->qgroup_reserved);
                return ret;
        }
  
@@@ -2495,21 -2444,7 +2509,21 @@@ static noinline int __btrfs_run_delayed
                        }
                }
  
 +              /*
 +               * We need to try and merge add/drops of the same ref since we
 +               * can run into issues with relocate dropping the implicit ref
 +               * and then it being added back again before the drop can
 +               * finish.  If we merged anything we need to re-loop so we can
 +               * get a good ref.
 +               * Or we can get node references of the same type that weren't
 +               * merged when created due to bumps in the tree mod seq, and
 +               * we need to merge them to prevent adding an inline extent
 +               * backref before dropping it (triggering a BUG_ON at
 +               * insert_inline_extent_backref()).
 +               */
                spin_lock(&locked_ref->lock);
 +              btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
 +                                       locked_ref);
  
                /*
                 * locked_ref is the head node, so we have to go one
@@@ -3185,7 -3120,7 +3199,7 @@@ static int __btrfs_mod_ref(struct btrfs
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 -                          u64, u64, u64, u64, u64, u64, int);
 +                          u64, u64, u64, u64, u64, u64);
  
  
        if (btrfs_test_is_dummy_root(root))
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
 -                                         key.offset, 1);
 +                                         key.offset);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = root->nodesize;
                        ret = process_func(trans, root, bytenr, num_bytes,
 -                                         parent, ref_root, level - 1, 0,
 -                                         1);
 +                                         parent, ref_root, level - 1, 0);
                        if (ret)
                                goto fail;
                }
@@@ -3413,15 -3349,6 +3427,15 @@@ again
        }
        spin_unlock(&block_group->lock);
  
 +      /*
 +       * We hit an ENOSPC when setting up the cache in this transaction, so
 +       * just skip doing the setup; we've already cleared the cache so we're safe.
 +       */
 +      if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
 +              ret = -ENOSPC;
 +              goto out_put;
 +      }
 +
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
  
 -      ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
 +      ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
  
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
 +      /*
 +       * Our cache requires contiguous chunks so that we don't modify a bunch
 +       * of metadata or split extents when writing the cache out, which means
 +       * we can hit ENOSPC if we are heavily fragmented in addition to normal
 +       * out of space conditions.  So if we hit this, just skip setting up any
 +       * other block groups for this transaction; maybe we'll unpin enough
 +       * space the next time around.
 +       */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
 -      btrfs_free_reserved_data_space(inode, num_pages);
 +      else if (ret == -ENOSPC)
 +              set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
 +      btrfs_free_reserved_data_space(inode, 0, num_pages);
  
  out_put:
        iput(inode);
@@@ -3840,7 -3757,6 +3854,7 @@@ static int update_space_info(struct btr
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
 +      found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@@ -3917,8 -3833,7 +3931,8 @@@ static u64 btrfs_reduce_alloc_profile(s
  {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
        u64 target;
 -      u64 tmp;
 +      u64 raid_type;
 +      u64 allowed = 0;
  
        /*
         * see if restripe for this chunk_type is in progress, if so
        spin_unlock(&root->fs_info->balance_lock);
  
        /* First, mask out the RAID levels which aren't possible */
 -      if (num_devices == 1)
 -              flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
 -                         BTRFS_BLOCK_GROUP_RAID5);
 -      if (num_devices < 3)
 -              flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 -      if (num_devices < 4)
 -              flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 -
 -      tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
 -                     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
 -                     BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
 -      flags &= ~tmp;
 -
 -      if (tmp & BTRFS_BLOCK_GROUP_RAID6)
 -              tmp = BTRFS_BLOCK_GROUP_RAID6;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
 -              tmp = BTRFS_BLOCK_GROUP_RAID5;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
 -              tmp = BTRFS_BLOCK_GROUP_RAID10;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
 -              tmp = BTRFS_BLOCK_GROUP_RAID1;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
 -              tmp = BTRFS_BLOCK_GROUP_RAID0;
 -
 -      return extended_to_chunk(flags | tmp);
 +      for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 +              if (num_devices >= btrfs_raid_array[raid_type].devs_min)
 +                      allowed |= btrfs_raid_group[raid_type];
 +      }
 +      allowed &= flags;
 +
 +      if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 +              allowed = BTRFS_BLOCK_GROUP_RAID6;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 +              allowed = BTRFS_BLOCK_GROUP_RAID5;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 +              allowed = BTRFS_BLOCK_GROUP_RAID10;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 +              allowed = BTRFS_BLOCK_GROUP_RAID1;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 +              allowed = BTRFS_BLOCK_GROUP_RAID0;
 +
 +      flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
 +
 +      return extended_to_chunk(flags | allowed);
  }
  
  static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@@ -3994,7 -3914,11 +4008,7 @@@ u64 btrfs_get_alloc_profile(struct btrf
        return ret;
  }
  
 -/*
 - * This will check the space that the inode allocates from to make sure we have
 - * enough space for bytes.
 - */
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
  {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@@ -4093,8 -4017,7 +4107,8 @@@ commit_trans
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
 -                          trans->transaction->have_free_bgs ||
 +                          test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
 +                                   &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
 -      ret = btrfs_qgroup_reserve(root, write_bytes);
 -      if (ret)
 -              goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
 -out:
        spin_unlock(&data_sinfo->lock);
  
        return ret;
  }
  
  /*
 - * Called if we need to clear a data reservation for this inode.
 + * New check_data_free_space() with the ability to do precise data reservation.
 + * It will replace the old btrfs_check_data_free_space(), but to keep the
 + * patch series split, the new function is added first and the old one is
 + * replaced later.
   */
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 +{
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      int ret;
 +
 +      /* align the range */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
 +
 +      ret = btrfs_alloc_data_chunk_ondemand(inode, len);
 +      if (ret < 0)
 +              return ret;
 +
 +      /*
 +       * Use the new btrfs_qgroup_reserve_data() to reserve precise data space
 +       *
 +       * TODO: Find a good method to avoid reserving data space for a NOCOW
 +       * range without impacting performance when quotas are disabled.
 +       */
 +      ret = btrfs_qgroup_reserve_data(inode, start, len);
 +      return ret;
 +}
 +
 +/*
 + * Called if we need to clear a data reservation for this inode
 + * Normally in an error case.
 + *
 + * This one will *NOT* use the accurate qgroup reserved space API; it is only
 + * for cases where we can't sleep and are sure it won't affect qgroup
 + * reserved space.
 + * Like clear_bit_hook().
 + */
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
  
 -      /* make sure bytes are sectorsize aligned */
 -      bytes = ALIGN(bytes, root->sectorsize);
 +      /* Make sure the range is aligned to sectorsize */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
  
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
 -      WARN_ON(data_sinfo->bytes_may_use < bytes);
 -      data_sinfo->bytes_may_use -= bytes;
 +      if (WARN_ON(data_sinfo->bytes_may_use < len))
 +              data_sinfo->bytes_may_use = 0;
 +      else
 +              data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
 -                                    data_sinfo->flags, bytes, 0);
 +                                    data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
  }
  
 +/*
 + * Called if we need to clear a data reservation for this inode
 + * Normally in an error case.
 + *
 + * This one will handle the per-inode data rsv map for the accurate reserved
 + * space framework.
 + */
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 +{
 +      btrfs_free_reserved_data_space_noquota(inode, start, len);
 +      btrfs_qgroup_free_data(inode, start, len);
 +}
 +
  static void force_metadata_allocation(struct btrfs_fs_info *info)
  {
        struct list_head *head = &info->space_info;
@@@ -5027,9 -4902,13 +5041,9 @@@ static struct btrfs_block_rsv *get_bloc
  {
        struct btrfs_block_rsv *block_rsv = NULL;
  
 -      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 -              block_rsv = trans->block_rsv;
 -
 -      if (root == root->fs_info->csum_root && trans->adding_csums)
 -              block_rsv = trans->block_rsv;
 -
 -      if (root == root->fs_info->uuid_root)
 +      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +          (root == root->fs_info->csum_root && trans->adding_csums) ||
 +           (root == root->fs_info->uuid_root))
                block_rsv = trans->block_rsv;
  
        if (!block_rsv)
@@@ -5472,7 -5351,7 +5486,7 @@@ int btrfs_subvolume_reserve_metadata(st
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
 -              ret = btrfs_qgroup_reserve(root, num_bytes);
 +              ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
  
 -      if (ret) {
 -              if (*qgroup_reserved)
 -                      btrfs_qgroup_free(root, *qgroup_reserved);
 -      }
 +      if (ret && *qgroup_reserved)
 +              btrfs_qgroup_free_meta(root, *qgroup_reserved);
  
        return ret;
  }
@@@ -5652,15 -5533,15 +5666,15 @@@ int btrfs_delalloc_reserve_metadata(str
        spin_unlock(&BTRFS_I(inode)->lock);
  
        if (root->fs_info->quota_enabled) {
 -              ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
 +              ret = btrfs_qgroup_reserve_meta(root,
 +                              nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
  
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
 -              if (root->fs_info->quota_enabled)
 -                      btrfs_qgroup_free(root, nr_extents * root->nodesize);
 +              btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
  
@@@ -5783,48 -5664,41 +5797,48 @@@ void btrfs_delalloc_release_metadata(st
  }
  
  /**
 - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
 + * btrfs_delalloc_reserve_space - reserve data and metadata space for
 + * delalloc
   * @inode: inode we're writing to
 - * @num_bytes: the number of bytes we want to allocate
 + * @start: start of the range we are writing to
 + * @len: length of the range we are writing to
 + *
 + * TODO: This function will finally replace the old btrfs_delalloc_reserve_space()
   *
   * This will do the following things
   *
 - * o reserve space in the data space info for num_bytes
 - * o reserve space in the metadata space info based on number of outstanding
 + * o reserve space in the data space info for len bytes
 + *   and reserve the corresponding precise qgroup space
 + *   (Done in check_data_free_space)
 + *
 + * o reserve space in the metadata space info, based on the number of outstanding
   *   extents and how much csums will be needed
 - * o add to the inodes ->delalloc_bytes
 + *   also reserve metadata space in a per-root over-reserve manner.
 + * o add to the inode's ->delalloc_bytes
   * o add it to the fs_info's delalloc inodes list.
 + *   (Above 3 all done in delalloc_reserve_metadata)
   *
 - * This will return 0 for success and -ENOSPC if there is no space left.
 + * Return 0 for success.
 + * Return <0 for error (-ENOSPC or -EDQUOT).
   */
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
  {
        int ret;
  
 -      ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
 -      if (ret)
 -              return ret;
 -
 -      ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
 -      if (ret) {
 -              btrfs_free_reserved_data_space(inode, num_bytes);
 +      ret = btrfs_check_data_free_space(inode, start, len);
 +      if (ret < 0)
                return ret;
 -      }
 -
 -      return 0;
 +      ret = btrfs_delalloc_reserve_metadata(inode, len);
 +      if (ret < 0)
 +              btrfs_free_reserved_data_space(inode, start, len);
 +      return ret;
  }
  
  /**
   * btrfs_delalloc_release_space - release data and metadata space for delalloc
   * @inode: inode we're releasing space for
 - * @num_bytes: the number of bytes we want to free up
 + * @start: start position of the space already reserved
 + * @len: the len of the space already reserved
   *
   * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
   * called in the case that we don't need the metadata AND data reservations
   * This function will release the metadata space that was not used and will
   * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
   * list if there are no delalloc bytes left.
 + * Also it will handle the qgroup reserved space.
   */
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
  {
 -      btrfs_delalloc_release_metadata(inode, num_bytes);
 -      btrfs_free_reserved_data_space(inode, num_bytes);
 +      btrfs_delalloc_release_metadata(inode, len);
 +      btrfs_free_reserved_data_space(inode, start, len);
  }
  
  static int update_block_group(struct btrfs_trans_handle *trans,
                        set_extent_dirty(info->pinned_extents,
                                         bytenr, bytenr + num_bytes - 1,
                                         GFP_NOFS | __GFP_NOFAIL);
 -                      /*
 -                       * No longer have used bytes in this block group, queue
 -                       * it for deletion.
 -                       */
 -                      if (old_val == 0) {
 -                              spin_lock(&info->unused_bgs_lock);
 -                              if (list_empty(&cache->bg_list)) {
 -                                      btrfs_get_block_group(cache);
 -                                      list_add_tail(&cache->bg_list,
 -                                                    &info->unused_bgs);
 -                              }
 -                              spin_unlock(&info->unused_bgs_lock);
 -                      }
                }
  
                spin_lock(&trans->transaction->dirty_bgs_lock);
                }
                spin_unlock(&trans->transaction->dirty_bgs_lock);
  
 +              /*
 +               * No longer have used bytes in this block group, queue it for
 +               * deletion. We do this after adding the block group to the
 +               * dirty list to avoid races between cleaner kthread and space
 +               * cache writeout.
 +               */
 +              if (!alloc && old_val == 0) {
 +                      spin_lock(&info->unused_bgs_lock);
 +                      if (list_empty(&cache->bg_list)) {
 +                              btrfs_get_block_group(cache);
 +                              list_add_tail(&cache->bg_list,
 +                                            &info->unused_bgs);
 +                      }
 +                      spin_unlock(&info->unused_bgs_lock);
 +              }
 +
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@@ -6206,34 -6076,6 +6220,34 @@@ void btrfs_prepare_extent_commit(struc
        update_global_block_rsv(fs_info);
  }
  
 +/*
 + * Returns the free cluster for the given space info and sets empty_cluster to
 + * what it should be based on the mount options.
 + */
 +static struct btrfs_free_cluster *
 +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
 +                 u64 *empty_cluster)
 +{
 +      struct btrfs_free_cluster *ret = NULL;
 +      bool ssd = btrfs_test_opt(root, SSD);
 +
 +      *empty_cluster = 0;
 +      if (btrfs_mixed_space_info(space_info))
 +              return ret;
 +
 +      if (ssd)
 +              *empty_cluster = 2 * 1024 * 1024;
 +      if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 +              ret = &root->fs_info->meta_alloc_cluster;
 +              if (!ssd)
 +                      *empty_cluster = 64 * 1024;
 +      } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
 +              ret = &root->fs_info->data_alloc_cluster;
 +      }
 +
 +      return ret;
 +}
 +
  static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                              const bool return_free_space)
  {
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 +      struct btrfs_free_cluster *cluster = NULL;
        u64 len;
 +      u64 total_unpinned = 0;
 +      u64 empty_cluster = 0;
        bool readonly;
  
        while (start <= end) {
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
 +                      total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache); /* Logic error */
 +
 +                      cluster = fetch_cluster_info(root,
 +                                                   cache->space_info,
 +                                                   &empty_cluster);
 +                      empty_cluster <<= 1;
                }
  
                len = cache->key.objectid + cache->key.offset - start;
                }
  
                start += len;
 +              total_unpinned += len;
                space_info = cache->space_info;
  
 +              /*
 +               * If this space cluster has been marked as fragmented and we've
 +               * unpinned enough in this block group to potentially allow a
 +               * cluster to be created inside of it, go ahead and clear the
 +               * fragmented check.
 +               */
 +              if (cluster && cluster->fragmented &&
 +                  total_unpinned > empty_cluster) {
 +                      spin_lock(&cluster->lock);
 +                      cluster->fragmented = 0;
 +                      spin_unlock(&cluster->lock);
 +              }
 +
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
 +              space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@@ -6426,6 -6244,7 +6440,6 @@@ static int __btrfs_free_extent(struct b
        int extent_slot = 0;
        int found_extent = 0;
        int num_to_del = 1;
 -      int no_quota = node->no_quota;
        u32 item_size;
        u64 refs;
        u64 bytenr = node->bytenr;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
  
 -      if (!info->quota_enabled || !is_fstree(root_objectid))
 -              no_quota = 1;
 -
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
                        }
                }
  
+               ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+                                            num_bytes);
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, extent_root, ret);
@@@ -6759,7 -6588,7 +6780,7 @@@ void btrfs_free_tree_block(struct btrfs
                                        buf->start, buf->len,
                                        parent, root->root_key.objectid,
                                        btrfs_header_level(buf),
 -                                      BTRFS_DROP_DELAYED_REF, NULL, 0);
 +                                      BTRFS_DROP_DELAYED_REF, NULL);
                BUG_ON(ret); /* -ENOMEM */
        }
  
@@@ -6807,7 -6636,7 +6828,7 @@@ out
  /* Can return -ENOMEM */
  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 -                    u64 owner, u64 offset, int no_quota)
 +                    u64 owner, u64 offset)
  {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
 -                                      BTRFS_DROP_DELAYED_REF, NULL, no_quota);
 +                                      BTRFS_DROP_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
                                                num_bytes,
                                                parent, root_objectid, owner,
 -                                              offset, BTRFS_DROP_DELAYED_REF,
 -                                              NULL, no_quota);
 +                                              offset, 0,
 +                                              BTRFS_DROP_DELAYED_REF, NULL);
        }
        return ret;
  }
@@@ -7022,7 -6851,7 +7043,7 @@@ static noinline int find_free_extent(st
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
 -      int empty_cluster = 2 * 1024 * 1024;
 +      u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
 +      bool orig_have_caching_bg = false;
 +      bool full_search = false;
  
        WARN_ON(num_bytes < root->sectorsize);
        ins->type = BTRFS_EXTENT_ITEM_KEY;
        }
  
        /*
 -       * If the space info is for both data and metadata it means we have a
 -       * small filesystem and we can't use the clustering stuff.
 +       * If our free space is heavily fragmented we may not be able to make
 +       * big contiguous allocations, so instead of doing the expensive search
 +       * for free space, simply return ENOSPC with our max_extent_size so we
 +       * can go ahead and search for a more manageable chunk.
 +       *
 +       * If our max_extent_size is large enough for our allocation simply
 +       * disable clustering since we will likely not be able to find enough
 +       * space to create a cluster and induce latency trying.
         */
 -      if (btrfs_mixed_space_info(space_info))
 -              use_cluster = false;
 -
 -      if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 -              last_ptr = &root->fs_info->meta_alloc_cluster;
 -              if (!btrfs_test_opt(root, SSD))
 -                      empty_cluster = 64 * 1024;
 -      }
 -
 -      if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
 -          btrfs_test_opt(root, SSD)) {
 -              last_ptr = &root->fs_info->data_alloc_cluster;
 +      if (unlikely(space_info->max_extent_size)) {
 +              spin_lock(&space_info->lock);
 +              if (space_info->max_extent_size &&
 +                  num_bytes > space_info->max_extent_size) {
 +                      ins->offset = space_info->max_extent_size;
 +                      spin_unlock(&space_info->lock);
 +                      return -ENOSPC;
 +              } else if (space_info->max_extent_size) {
 +                      use_cluster = false;
 +              }
 +              spin_unlock(&space_info->lock);
        }
  
 +      last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
                        hint_byte = last_ptr->window_start;
 +              if (last_ptr->fragmented) {
 +                      /*
 +                       * We still set window_start so we can keep track of the
 +                       * last place we found an allocation to try and save
 +                       * some time.
 +                       */
 +                      hint_byte = last_ptr->window_start;
 +                      use_cluster = false;
 +              }
                spin_unlock(&last_ptr->lock);
        }
  
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
 -
 -      if (!last_ptr)
 -              empty_cluster = 0;
 -
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
        }
  search:
        have_caching_bg = false;
 +      if (index == 0 || index == __get_raid_index(flags))
 +              full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
  have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
 +                      have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
                 */
 -              if (last_ptr) {
 +              if (last_ptr && use_cluster) {
                        struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
@@@ -7300,16 -7113,6 +7321,16 @@@ refill_cluster
                }
  
  unclustered_alloc:
 +              /*
 +               * We are doing an unclustered alloc, set the fragmented flag so
 +               * we don't bother trying to set up a cluster again until we get
 +               * more space.
 +               */
 +              if (unlikely(last_ptr)) {
 +                      spin_lock(&last_ptr->lock);
 +                      last_ptr->fragmented = 1;
 +                      spin_unlock(&last_ptr->lock);
 +              }
                spin_lock(&block_group->free_space_ctl->tree_lock);
                if (cached &&
                    block_group->free_space_ctl->free_space <
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
 -                      if (!cached)
 -                              have_caching_bg = true;
                        goto loop;
                }
  checks:
@@@ -7382,10 -7187,6 +7403,10 @@@ loop
        }
        up_read(&space_info->groups_sem);
  
 +      if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
 +              && !orig_have_caching_bg)
 +              orig_have_caching_bg = true;
 +
        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
                goto search;
  
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
 -              loop++;
 +              if (loop == LOOP_CACHING_NOWAIT) {
 +                      /*
 +                       * We want to skip the LOOP_CACHING_WAIT step if we
 +                       * don't have any uncached bgs and we've already done a
 +                       * full search through.
 +                       */
 +                      if (orig_have_caching_bg || !full_search)
 +                              loop = LOOP_CACHING_WAIT;
 +                      else
 +                              loop = LOOP_ALLOC_CHUNK;
 +              } else {
 +                      loop++;
 +              }
 +
                if (loop == LOOP_ALLOC_CHUNK) {
                        struct btrfs_trans_handle *trans;
                        int exist = 0;
  
                        ret = do_chunk_alloc(trans, root, flags,
                                             CHUNK_ALLOC_FORCE);
 +
 +                      /*
 +                       * If we can't allocate a new chunk, we've already looped
 +                       * through at least once, so move on to the NO_EMPTY_SIZE
 +                       * case.
 +                       */
 +                      if (ret == -ENOSPC)
 +                              loop = LOOP_NO_EMPTY_SIZE;
 +
                        /*
                         * Do not bail out on ENOSPC since we
                         * can do more things.
                }
  
                if (loop == LOOP_NO_EMPTY_SIZE) {
 +                      /*
 +                       * Don't loop again if we already have no empty_size and
 +                       * no empty_cluster.
 +                       */
 +                      if (empty_size == 0 &&
 +                          empty_cluster == 0) {
 +                              ret = -ENOSPC;
 +                              goto out;
 +                      }
                        empty_size = 0;
                        empty_cluster = 0;
                }
        } else if (!ins->objectid) {
                ret = -ENOSPC;
        } else if (ins->objectid) {
 +              if (!use_cluster && last_ptr) {
 +                      spin_lock(&last_ptr->lock);
 +                      last_ptr->window_start = ins->objectid;
 +                      spin_unlock(&last_ptr->lock);
 +              }
                ret = 0;
        }
  out:
 -      if (ret == -ENOSPC)
 +      if (ret == -ENOSPC) {
 +              spin_lock(&space_info->lock);
 +              space_info->max_extent_size = max_extent_size;
 +              spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
 +      }
        return ret;
  }
  
@@@ -7537,7 -7298,7 +7558,7 @@@ int btrfs_reserve_extent(struct btrfs_r
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
  {
 -      bool final_tried = false;
 +      bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
  
@@@ -7672,6 -7433,11 +7693,11 @@@ static int alloc_reserved_file_extent(s
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         ins->offset);
+       if (ret)
+               return ret;
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
@@@ -7686,7 -7452,8 +7712,7 @@@ static int alloc_reserved_tree_block(st
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
 -                                   int level, struct btrfs_key *ins,
 -                                   int no_quota)
 +                                   int level, struct btrfs_key *ins)
  {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
  
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         num_bytes);
+       if (ret)
+               return ret;
        ret = update_block_group(trans, root, ins->objectid, root->nodesize,
                                 1);
        if (ret) { /* -ENOENT, logic error */
  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
 -                                   u64 offset, struct btrfs_key *ins)
 +                                   u64 offset, u64 ram_bytes,
 +                                   struct btrfs_key *ins)
  {
        int ret;
  
        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
                                         ins->offset, 0,
                                         root_objectid, owner, offset,
 -                                       BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
 +                                       ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
 +                                       NULL);
        return ret;
  }
  
@@@ -7992,7 -7762,7 +8023,7 @@@ struct extent_buffer *btrfs_alloc_tree_
                                                 ins.objectid, ins.offset,
                                                 parent, root_objectid, level,
                                                 BTRFS_ADD_DELAYED_EXTENT,
 -                                               extent_op, 0);
 +                                               extent_op);
                if (ret)
                        goto out_free_delayed;
        }
@@@ -8108,47 -7878,21 +8139,47 @@@ reada
  }
  
  /*
 - * TODO: Modify related function to add related node/leaf to dirty_extent_root,
 - * for later qgroup accounting.
 - *
 - * Current, this function does nothing.
 + * These may not be seen by the usual inc/dec ref code so we have to
 + * add them here.
   */
 +static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
 +                                   struct btrfs_root *root, u64 bytenr,
 +                                   u64 num_bytes)
 +{
 +      struct btrfs_qgroup_extent_record *qrecord;
 +      struct btrfs_delayed_ref_root *delayed_refs;
 +
 +      qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
 +      if (!qrecord)
 +              return -ENOMEM;
 +
 +      qrecord->bytenr = bytenr;
 +      qrecord->num_bytes = num_bytes;
 +      qrecord->old_roots = NULL;
 +
 +      delayed_refs = &trans->transaction->delayed_refs;
 +      spin_lock(&delayed_refs->lock);
 +      if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
 +              kfree(qrecord);
 +      spin_unlock(&delayed_refs->lock);
 +
 +      return 0;
 +}
 +
  static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
  {
        int nr = btrfs_header_nritems(eb);
 -      int i, extent_type;
 +      int i, extent_type, ret;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        u64 bytenr, num_bytes;
  
 +      /* We can be called directly from walk_up_proc() */
 +      if (!root->fs_info->quota_enabled)
 +              return 0;
 +
        for (i = 0; i < nr; i++) {
                btrfs_item_key_to_cpu(eb, &key, i);
  
                        continue;
  
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
 +
 +              ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
 +              if (ret)
 +                      return ret;
        }
        return 0;
  }
@@@ -8239,6 -7979,8 +8270,6 @@@ static int adjust_slots_upwards(struct 
  
  /*
   * root_eb is the subtree root and is locked before this function is called.
 - * TODO: Modify this function to mark all (including complete shared node)
 - * to dirty_extent_root to allow it get accounted in qgroup.
   */
  static int account_shared_subtree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
@@@ -8316,11 -8058,6 +8347,11 @@@ walk_down
                        btrfs_tree_read_lock(eb);
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 +
 +                      ret = record_one_subtree_extent(trans, root, child_bytenr,
 +                                                      root->nodesize);
 +                      if (ret)
 +                              goto out;
                }
  
                if (level == 0) {
@@@ -8566,15 -8303,14 +8597,15 @@@ skip
                        ret = account_shared_subtree(trans, root, next,
                                                     generation, level - 1);
                        if (ret) {
 -                              printk_ratelimited(KERN_ERR "BTRFS: %s Error "
 +                              btrfs_err_rl(root->fs_info,
 +                                      "Error "
                                        "%d accounting shared subtree. Quota "
 -                                      "is out of sync, rescan required.\n",
 -                                      root->fs_info->sb->s_id, ret);
 +                                      "is out of sync, rescan required.",
 +                                      ret);
                        }
                }
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
 -                              root->root_key.objectid, level - 1, 0, 0);
 +                              root->root_key.objectid, level - 1, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
        btrfs_tree_unlock(next);
@@@ -8659,11 -8395,10 +8690,11 @@@ static noinline int walk_up_proc(struc
                        BUG_ON(ret); /* -ENOMEM */
                        ret = account_leaf_items(trans, root, eb);
                        if (ret) {
 -                              printk_ratelimited(KERN_ERR "BTRFS: %s Error "
 +                              btrfs_err_rl(root->fs_info,
 +                                      "error "
                                        "%d accounting leaf items. Quota "
 -                                      "is out of sync, rescan required.\n",
 -                                      root->fs_info->sb->s_id, ret);
 +                                      "is out of sync, rescan required.",
 +                                      ret);
                        }
                }
                /* make block locked assertion in clean_tree_block happy */
@@@ -8985,7 -8720,7 +9016,7 @@@ out
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
 -              btrfs_std_error(root->fs_info, err);
 +              btrfs_std_error(root->fs_info, err, NULL);
        return err;
  }
  
@@@ -9173,7 -8908,7 +9204,7 @@@ again
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
 -      if (trans->transaction->dirty_bg_run) {
 +      if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
  
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@@ -9656,6 -9391,8 +9687,8 @@@ btrfs_create_block_group_cache(struct b
        cache->full_stripe_len = btrfs_full_stripe_len(root,
                                               &root->fs_info->mapping_tree,
                                               start);
+       set_free_space_tree_thresholds(cache);
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        init_rwsem(&cache->data_rwsem);
        INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
+       mutex_init(&cache->free_space_lock);
  
        return cache;
  }
@@@ -9877,6 -9615,8 +9911,8 @@@ void btrfs_create_pending_block_groups(
                                               key.objectid, key.offset);
                if (ret)
                        btrfs_abort_transaction(trans, extent_root, ret);
+               add_block_group_free_space(trans, root->fs_info, block_group);
+               /* already aborted the transaction if it failed. */
  next:
                list_del_init(&block_group->bg_list);
        }
@@@ -9907,6 -9647,7 +9943,7 @@@ int btrfs_make_block_group(struct btrfs
        cache->flags = type;
        cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
+       cache->needs_free_space = 1;
        ret = exclude_super_stripes(root, cache);
        if (ret) {
                /*
  
        free_excluded_extents(root, cache);
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +      if (btrfs_should_fragment_free_space(root, cache)) {
 +              u64 new_bytes_used = size - bytes_used;
 +
 +              bytes_used += new_bytes_used >> 1;
 +              fragment_free_space(root, cache);
 +      }
 +#endif
        /*
         * Call to ensure the corresponding space_info object is created and
         * assigned to our block group, but don't update its counters just yet.
@@@ -10277,6 -10010,10 +10314,10 @@@ int btrfs_remove_block_group(struct btr
  
        unlock_chunks(root);
  
+       ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+       if (ret)
+               goto out;
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
  
        return ret;
  }
  
 +struct btrfs_trans_handle *
 +btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
 +                                   const u64 chunk_offset)
 +{
 +      struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
 +      struct extent_map *em;
 +      struct map_lookup *map;
 +      unsigned int num_items;
 +
 +      read_lock(&em_tree->lock);
 +      em = lookup_extent_mapping(em_tree, chunk_offset, 1);
 +      read_unlock(&em_tree->lock);
 +      ASSERT(em && em->start == chunk_offset);
 +
 +      /*
 +       * We need to reserve 3 + N units from the metadata space info in order
 +       * to remove a block group (done at btrfs_remove_chunk() and at
 +       * btrfs_remove_block_group()), which are used for:
 +       *
 +       * 1 unit for adding the free space inode's orphan (located in the tree
 +       * of tree roots).
 +       * 1 unit for deleting the block group item (located in the extent
 +       * tree).
 +       * 1 unit for deleting the free space item (located in tree of tree
 +       * roots).
 +       * N units for deleting N device extent items corresponding to each
 +       * stripe (located in the device tree).
 +       *
 +       * In order to remove a block group we also need to reserve units in the
 +       * system space info in order to update the chunk tree (update one or
 +       * more device items and remove one chunk item), but this is done at
 +       * btrfs_remove_chunk() through a call to check_system_chunk().
 +       */
 +      map = (struct map_lookup *)em->bdev;
 +      num_items = 3 + map->num_stripes;
 +      free_extent_map(em);
 +
 +      return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
 +                                                         num_items, 1);
 +}
 +
  /*
   * Process the unused_bgs list and remove any that don't have any allocated
   * space inside of them.
@@@ -10356,25 -10052,22 +10397,25 @@@ void btrfs_delete_unused_bgs(struct btr
                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group_cache,
                                               bg_list);
 -              space_info = block_group->space_info;
                list_del_init(&block_group->bg_list);
 +
 +              space_info = block_group->space_info;
 +
                if (ret || btrfs_mixed_space_info(space_info)) {
                        btrfs_put_block_group(block_group);
                        continue;
                }
                spin_unlock(&fs_info->unused_bgs_lock);
  
 -              mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
 +              mutex_lock(&fs_info->delete_unused_bgs_mutex);
  
                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
                spin_lock(&block_group->lock);
                if (block_group->reserved ||
                    btrfs_block_group_used(&block_group->item) ||
 -                  block_group->ro) {
 +                  block_group->ro ||
 +                  list_is_singular(&block_group->list)) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group.  We do
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
 -              /* 1 for btrfs_orphan_reserve_metadata() */
 -              trans = btrfs_start_transaction(root, 1);
 +              trans = btrfs_start_trans_remove_block_group(fs_info,
 +                                                   block_group->key.objectid);
                if (IS_ERR(trans)) {
                        btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
  end_trans:
                btrfs_end_transaction(trans, root);
  next:
 -              mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 +              mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
@@@ -10715,7 -10408,8 +10756,7 @@@ void btrfs_end_write_no_snapshoting(str
  {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
 -       * Make sure counter is updated before we wake up
 -       * waiters.
 +       * Make sure counter is updated before we wake up waiters.
         */
        smp_mb();
        if (waitqueue_active(&root->subv_writers->wait))
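
The data reservation helpers changed above now take a byte range (start, len) instead of a plain byte count, and both btrfs_check_data_free_space() and btrfs_free_reserved_data_space_noquota() first widen that range to sector boundaries before accounting it. The following is a minimal userspace sketch of that alignment arithmetic, not kernel code: ROUND_DOWN/ROUND_UP stand in for the kernel's round_down()/round_up() helpers, and the 4096-byte sectorsize and the sample range are assumed example values.

#include <stdio.h>
#include <stdint.h>

/* power-of-two rounding, mirroring the kernel's round_down()/round_up() */
#define ROUND_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))
#define ROUND_UP(x, a)   ROUND_DOWN((x) + (a) - 1, (a))

int main(void)
{
	uint64_t sectorsize = 4096;         /* assumed example sector size */
	uint64_t start = 6000, len = 3000;  /* arbitrary unaligned range */

	/* same arithmetic as the new btrfs_check_data_free_space() */
	uint64_t aligned_len = ROUND_UP(start + len, sectorsize) -
			       ROUND_DOWN(start, sectorsize);
	uint64_t aligned_start = ROUND_DOWN(start, sectorsize);

	printf("reserve [%llu, %llu) -> %llu bytes\n",
	       (unsigned long long)aligned_start,
	       (unsigned long long)(aligned_start + aligned_len),
	       (unsigned long long)aligned_len);
	return 0;
}

Run as-is, it reports that a reservation for bytes 6000..8999 is accounted as the 8192-byte aligned range [4096, 12288), which is why the release paths above must round the range the same way before subtracting from bytes_may_use.
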
diff --combined fs/btrfs/extent_io.c
index 9abe18763a7fb001632246fd712a64fdffcc0225,a6eec2d0e254392a7f0c5047848586de682e0555..2b3f26326565c478f57a4860a948ff116ed534d9
@@@ -96,8 -96,8 +96,8 @@@ static inline void __btrfs_debug_check_
        inode = tree->mapping->host;
        isize = i_size_read(inode);
        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 -              printk_ratelimited(KERN_DEBUG
 -                  "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
 +              btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
 +                  "%s: ino %llu isize %llu odd range [%llu,%llu]",
                                caller, btrfs_ino(inode), isize, start, end);
        }
  }
@@@ -131,25 -131,6 +131,25 @@@ struct extent_page_data 
        unsigned int sync_io:1;
  };
  
 +static void add_extent_changeset(struct extent_state *state, unsigned bits,
 +                               struct extent_changeset *changeset,
 +                               int set)
 +{
 +      int ret;
 +
 +      if (!changeset)
 +              return;
 +      if (set && (state->state & bits) == bits)
 +              return;
 +      if (!set && (state->state & bits) == 0)
 +              return;
 +      changeset->bytes_changed += state->end - state->start + 1;
 +      ret = ulist_add(changeset->range_changed, state->start, state->end,
 +                      GFP_ATOMIC);
 +      /* ENOMEM */
 +      BUG_ON(ret < 0);
 +}
 +
  static noinline void flush_write_bio(void *data);
  static inline struct btrfs_fs_info *
  tree_fs_info(struct extent_io_tree *tree)
@@@ -429,8 -410,7 +429,8 @@@ static void clear_state_cb(struct exten
  }
  
  static void set_state_bits(struct extent_io_tree *tree,
 -                         struct extent_state *state, unsigned *bits);
 +                         struct extent_state *state, unsigned *bits,
 +                         struct extent_changeset *changeset);
  
  /*
   * insert an extent_state struct into the tree.  'bits' are set on the
@@@ -446,7 -426,7 +446,7 @@@ static int insert_state(struct extent_i
                        struct extent_state *state, u64 start, u64 end,
                        struct rb_node ***p,
                        struct rb_node **parent,
 -                      unsigned *bits)
 +                      unsigned *bits, struct extent_changeset *changeset)
  {
        struct rb_node *node;
  
        state->start = start;
        state->end = end;
  
 -      set_state_bits(tree, state, bits);
 +      set_state_bits(tree, state, bits, changeset);
  
        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
        if (node) {
@@@ -531,8 -511,7 +531,8 @@@ static struct extent_state *next_state(
   */
  static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
 -                                          unsigned *bits, int wake)
 +                                          unsigned *bits, int wake,
 +                                          struct extent_changeset *changeset)
  {
        struct extent_state *next;
        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
 +      add_extent_changeset(state, bits_to_clear, changeset, 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
@@@ -591,10 -569,10 +591,10 @@@ static void extent_io_tree_panic(struc
   *
   * This takes the tree lock, and returns 0 on success and < 0 on error.
   */
 -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 -                   unsigned bits, int wake, int delete,
 -                   struct extent_state **cached_state,
 -                   gfp_t mask)
 +static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                            unsigned bits, int wake, int delete,
 +                            struct extent_state **cached_state,
 +                            gfp_t mask, struct extent_changeset *changeset)
  {
        struct extent_state *state;
        struct extent_state *cached;
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
  again:
 -      if (!prealloc && (mask & __GFP_WAIT)) {
 +      if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
@@@ -693,8 -671,7 +693,8 @@@ hit_next
                if (err)
                        goto out;
                if (state->end <= end) {
 -                      state = clear_state_bit(tree, state, &bits, wake);
 +                      state = clear_state_bit(tree, state, &bits, wake,
 +                                              changeset);
                        goto next;
                }
                goto search_again;
                if (wake)
                        wake_up(&state->wq);
  
 -              clear_state_bit(tree, prealloc, &bits, wake);
 +              clear_state_bit(tree, prealloc, &bits, wake, changeset);
  
                prealloc = NULL;
                goto out;
        }
  
 -      state = clear_state_bit(tree, state, &bits, wake);
 +      state = clear_state_bit(tree, state, &bits, wake, changeset);
  next:
        if (last_end == (u64)-1)
                goto out;
@@@ -741,7 -718,7 +741,7 @@@ search_again
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
 -      if (mask & __GFP_WAIT)
 +      if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
  }
@@@ -812,7 -789,7 +812,7 @@@ out
  
  static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
 -                         unsigned *bits)
 +                         unsigned *bits, struct extent_changeset *changeset)
  {
        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
  
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
 +      add_extent_changeset(state, bits_to_set, changeset, 1);
        state->state |= bits_to_set;
  }
  
@@@ -859,7 -835,7 +859,7 @@@ static int __must_chec
  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                 unsigned bits, unsigned exclusive_bits,
                 u64 *failed_start, struct extent_state **cached_state,
 -               gfp_t mask)
 +               gfp_t mask, struct extent_changeset *changeset)
  {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
  
        bits |= EXTENT_FIRST_DELALLOC;
  again:
 -      if (!prealloc && (mask & __GFP_WAIT)) {
 +      if (!prealloc && gfpflags_allow_blocking(mask)) {
                prealloc = alloc_extent_state(mask);
                BUG_ON(!prealloc);
        }
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end,
 -                                 &p, &parent, &bits);
 +                                 &p, &parent, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
  
@@@ -923,7 -899,7 +923,7 @@@ hit_next
                        goto out;
                }
  
 -              set_state_bits(tree, state, &bits);
 +              set_state_bits(tree, state, &bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                if (err)
                        goto out;
                if (state->end <= end) {
 -                      set_state_bits(tree, state, &bits);
 +                      set_state_bits(tree, state, &bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
 -                                 NULL, NULL, &bits);
 +                                 NULL, NULL, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
  
                if (err)
                        extent_io_tree_panic(tree, err);
  
 -              set_state_bits(tree, prealloc, &bits);
 +              set_state_bits(tree, prealloc, &bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
@@@ -1052,7 -1028,7 +1052,7 @@@ search_again
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
 -      if (mask & __GFP_WAIT)
 +      if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
  }
@@@ -1062,7 -1038,7 +1062,7 @@@ int set_extent_bit(struct extent_io_tre
                   struct extent_state **cached_state, gfp_t mask)
  {
        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
 -                              cached_state, mask);
 +                              cached_state, mask, NULL);
  }
  
  
@@@ -1100,7 -1076,7 +1100,7 @@@ int convert_extent_bit(struct extent_io
        btrfs_debug_check_extent_io_range(tree, start, end);
  
  again:
 -      if (!prealloc && (mask & __GFP_WAIT)) {
 +      if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
                        goto out;
                }
                err = insert_state(tree, prealloc, start, end,
 -                                 &p, &parent, &bits);
 +                                 &p, &parent, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@@ -1154,9 -1130,9 +1154,9 @@@ hit_next
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
 -              set_state_bits(tree, state, &bits);
 +              set_state_bits(tree, state, &bits, NULL);
                cache_state(state, cached_state);
 -              state = clear_state_bit(tree, state, &clear_bits, 0);
 +              state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
                if (err)
                        goto out;
                if (state->end <= end) {
 -                      set_state_bits(tree, state, &bits);
 +                      set_state_bits(tree, state, &bits, NULL);
                        cache_state(state, cached_state);
 -                      state = clear_state_bit(tree, state, &clear_bits, 0);
 +                      state = clear_state_bit(tree, state, &clear_bits, 0,
 +                                              NULL);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
 -                                 NULL, NULL, &bits);
 +                                 NULL, NULL, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
                if (err)
                        extent_io_tree_panic(tree, err);
  
 -              set_state_bits(tree, prealloc, &bits);
 +              set_state_bits(tree, prealloc, &bits, NULL);
                cache_state(prealloc, cached_state);
 -              clear_state_bit(tree, prealloc, &clear_bits, 0);
 +              clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
                prealloc = NULL;
                goto out;
        }
@@@ -1278,7 -1253,7 +1278,7 @@@ search_again
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
 -      if (mask & __GFP_WAIT)
 +      if (gfpflags_allow_blocking(mask))
                cond_resched();
        first_iteration = false;
        goto again;
@@@ -1299,30 -1274,6 +1299,30 @@@ int set_extent_bits(struct extent_io_tr
                              NULL, mask);
  }
  
 +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                         unsigned bits, gfp_t mask,
 +                         struct extent_changeset *changeset)
 +{
 +      /*
 +       * We don't support EXTENT_LOCKED yet, as the current changeset
 +       * records any bit that changes, so for the EXTENT_LOCKED case it
 +       * will either fail with -EEXIST or record the whole range.
 +       */
 +      BUG_ON(bits & EXTENT_LOCKED);
 +
 +      return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
 +                              changeset);
 +}
 +
 +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                   unsigned bits, int wake, int delete,
 +                   struct extent_state **cached, gfp_t mask)
 +{
 +      return __clear_extent_bit(tree, start, end, bits, wake, delete,
 +                                cached, mask, NULL);
 +}
 +
  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask)
  {
        return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
  }
  
 +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                           unsigned bits, gfp_t mask,
 +                           struct extent_changeset *changeset)
 +{
 +      /*
 +       * Don't support EXTENT_LOCKED case, same reason as
 +       * set_record_extent_bits().
 +       */
 +      BUG_ON(bits & EXTENT_LOCKED);
 +
 +      return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
 +                                changeset);
 +}
 +
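The new set_record_extent_bits()/clear_record_extent_bits() variants thread an extent_changeset through the bit manipulation so that a caller can learn how many bytes actually changed state; the qgroup data reservation work in this series is the intended user. A minimal caller-side sketch, assuming the caller owns the changeset, backs range_changed with a ulist, and is tagging some range [start, start + len) of an inode's io_tree; error handling is omitted and the surrounding code is illustrative, not part of this commit:

	struct extent_changeset changeset;
	struct ulist_iterator uiter;
	struct ulist_node *unode;
	int ret;

	changeset.bytes_changed = 0;
	changeset.range_changed = ulist_alloc(GFP_NOFS);

	/* EXTENT_QGROUP_RESERVED does not include EXTENT_LOCKED, see the BUG_ON above */
	ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
				     start + len - 1, EXTENT_QGROUP_RESERVED,
				     GFP_NOFS, &changeset);

	/* bytes_changed now holds how many bytes were newly set;
	 * range_changed lists the sub-ranges that were touched */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(changeset.range_changed, &uiter)))
		pr_debug("changed range node: val=%llu aux=%llu\n",
			 unode->val, unode->aux);

	ulist_free(changeset.range_changed);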
  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask)
  {
@@@ -1406,7 -1343,7 +1406,7 @@@ int lock_extent_bits(struct extent_io_t
        while (1) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
                                       EXTENT_LOCKED, &failed_start,
 -                                     cached_state, GFP_NOFS);
 +                                     cached_state, GFP_NOFS, NULL);
                if (err == -EEXIST) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
@@@ -1428,7 -1365,7 +1428,7 @@@ int try_lock_extent(struct extent_io_tr
        u64 failed_start;
  
        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 -                             &failed_start, NULL, GFP_NOFS);
 +                             &failed_start, NULL, GFP_NOFS, NULL);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
@@@ -2141,8 -2078,8 +2141,8 @@@ int repair_io_failure(struct inode *ino
                return -EIO;
        }
  
 -      printk_ratelimited_in_rcu(KERN_INFO
 -                                "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
 +      btrfs_info_rl_in_rcu(fs_info,
 +              "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                                  btrfs_ino(inode), start,
                                  rcu_str_deref(dev->name), sector);
        bio_put(bio);
@@@ -3133,12 -3070,8 +3133,12 @@@ static int __do_readpage(struct extent_
  
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
 -                      unlock_extent_cached(tree, cur, cur + iosize - 1,
 -                                           &cached, GFP_NOFS);
 +                      if (parent_locked)
 +                              free_extent_state(cached);
 +                      else
 +                              unlock_extent_cached(tree, cur,
 +                                                   cur + iosize - 1,
 +                                                   &cached, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@@ -4386,7 -4319,7 +4386,7 @@@ int try_release_extent_mapping(struct e
        u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
  
 -      if ((mask & __GFP_WAIT) &&
 +      if (gfpflags_allow_blocking(mask) &&
            page->mapping->host->i_size > 16 * 1024 * 1024) {
                u64 len;
                while (start <= end) {
@@@ -4797,24 -4730,14 +4797,14 @@@ struct extent_buffer *btrfs_clone_exten
        return new;
  }
  
- struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-                                               u64 start)
+ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                                 u64 start, unsigned long len)
  {
        struct extent_buffer *eb;
-       unsigned long len;
        unsigned long num_pages;
        unsigned long i;
  
-       if (!fs_info) {
-               /*
-                * Called only from tests that don't always have a fs_info
-                * available, but we know that nodesize is 4096
-                */
-               len = 4096;
-       } else {
-               len = fs_info->tree_root->nodesize;
-       }
-       num_pages = num_extent_pages(0, len);
+       num_pages = num_extent_pages(start, len);
  
        eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
        return NULL;
  }
  
+ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                               u64 start)
+ {
+       unsigned long len;
+       if (!fs_info) {
+               /*
+                * Called only from tests that don't always have a fs_info
+                * available, but we know that nodesize is 4096
+                */
+               len = 4096;
+       } else {
+               len = fs_info->tree_root->nodesize;
+       }
+       return __alloc_dummy_extent_buffer(fs_info, start, len);
+ }
+
  static void check_buffer_tree_ref(struct extent_buffer *eb)
  {
        int refs;
@@@ -5594,6 -5535,155 +5602,155 @@@ void copy_extent_buffer(struct extent_b
        }
  }
  
+ /*
+  * The extent buffer bitmap operations are done with byte granularity because
+  * bitmap items are not guaranteed to be aligned to a word and therefore a
+  * single word in a bitmap may straddle two pages in the extent buffer.
+  */
+ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
+ #define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+ #define BITMAP_FIRST_BYTE_MASK(start) \
+       ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
+ #define BITMAP_LAST_BYTE_MASK(nbits) \
+       (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+ /*
+  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+  * given bit number
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @nr: bit number
+  * @page_index: return index of the page in the extent buffer that contains the
+  * given bit number
+  * @page_offset: return offset into the page given by page_index
+  *
+  * This helper hides the ugliness of finding the byte in an extent buffer which
+  * contains a given bit.
+  */
+ static inline void eb_bitmap_offset(struct extent_buffer *eb,
+                                   unsigned long start, unsigned long nr,
+                                   unsigned long *page_index,
+                                   size_t *page_offset)
+ {
+       size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+       size_t byte_offset = BIT_BYTE(nr);
+       size_t offset;
+       /*
+        * The byte we want is the offset of the extent buffer + the offset of
+        * the bitmap item in the extent buffer + the offset of the byte in the
+        * bitmap item.
+        */
+       offset = start_offset + start + byte_offset;
+       *page_index = offset >> PAGE_CACHE_SHIFT;
+       *page_offset = offset & (PAGE_CACHE_SIZE - 1);
+ }
+
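In other words, the byte's absolute position is the extent buffer's offset within its first page, plus the bitmap item's offset, plus nr / BITS_PER_BYTE, and that sum is then split into a page index and an in-page offset. A tiny standalone illustration of the same arithmetic, with 4096-byte pages and made-up values for eb->start and the bitmap location:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE	4096UL
	#define BITS_PER_BYTE	8

	int main(void)
	{
		unsigned long eb_start = 17408;	/* hypothetical eb->start, not page aligned */
		unsigned long start = 100;	/* bitmap item offset inside the extent buffer */
		unsigned long nr = 40000;	/* bit number we want */

		unsigned long start_offset = eb_start & (PAGE_CACHE_SIZE - 1);
		unsigned long offset = start_offset + start + nr / BITS_PER_BYTE;

		/* prints: page_index=1 page_offset=2028 */
		printf("page_index=%lu page_offset=%lu\n",
		       offset / PAGE_CACHE_SIZE, offset % PAGE_CACHE_SIZE);
		return 0;
	}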
+ /**
+  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @nr: bit number to test
+  */
+ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+                          unsigned long nr)
+ {
+       char *kaddr;
+       struct page *page;
+       unsigned long i;
+       size_t offset;
+       eb_bitmap_offset(eb, start, nr, &i, &offset);
+       page = eb->pages[i];
+       WARN_ON(!PageUptodate(page));
+       kaddr = page_address(page);
+       return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
+ }
+
+ /**
+  * extent_buffer_bitmap_set - set an area of a bitmap
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @pos: bit number of the first bit
+  * @len: number of bits to set
+  */
+ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+                             unsigned long pos, unsigned long len)
+ {
+       char *kaddr;
+       struct page *page;
+       unsigned long i;
+       size_t offset;
+       const unsigned int size = pos + len;
+       int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+       unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+       eb_bitmap_offset(eb, start, pos, &i, &offset);
+       page = eb->pages[i];
+       WARN_ON(!PageUptodate(page));
+       kaddr = page_address(page);
+       while (len >= bits_to_set) {
+               kaddr[offset] |= mask_to_set;
+               len -= bits_to_set;
+               bits_to_set = BITS_PER_BYTE;
+               mask_to_set = ~0U;
+               if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+                       offset = 0;
+                       page = eb->pages[++i];
+                       WARN_ON(!PageUptodate(page));
+                       kaddr = page_address(page);
+               }
+       }
+       if (len) {
+               mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+               kaddr[offset] |= mask_to_set;
+       }
+ }
+
+ /**
+  * extent_buffer_bitmap_clear - clear an area of a bitmap
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @pos: bit number of the first bit
+  * @len: number of bits to clear
+  */
+ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+                               unsigned long pos, unsigned long len)
+ {
+       char *kaddr;
+       struct page *page;
+       unsigned long i;
+       size_t offset;
+       const unsigned int size = pos + len;
+       int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+       unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+       eb_bitmap_offset(eb, start, pos, &i, &offset);
+       page = eb->pages[i];
+       WARN_ON(!PageUptodate(page));
+       kaddr = page_address(page);
+       while (len >= bits_to_clear) {
+               kaddr[offset] &= ~mask_to_clear;
+               len -= bits_to_clear;
+               bits_to_clear = BITS_PER_BYTE;
+               mask_to_clear = ~0U;
+               if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+                       offset = 0;
+                       page = eb->pages[++i];
+                       WARN_ON(!PageUptodate(page));
+                       kaddr = page_address(page);
+               }
+       }
+       if (len) {
+               mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+               kaddr[offset] &= ~mask_to_clear;
+       }
+ }
+
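Both helpers follow the usual split of a bit range into a partial first byte, whole middle bytes and a partial last byte, just with byte-sized masks so a run can stop at a page boundary and resume on the next page. The loop structure is easier to see in a self-contained userspace version that works on a flat buffer (no page crossing) and can be compiled and checked directly; names mirror the macros above:

	#include <stdio.h>

	#define BITS_PER_BYTE 8
	#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
	#define BITMAP_FIRST_BYTE_MASK(start) \
		((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
	#define BITMAP_LAST_BYTE_MASK(nbits) \
		(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))

	/* set bits [pos, pos + len) in a byte-granular bitmap */
	static void bitmap_set_bytewise(unsigned char *map, unsigned long pos,
					unsigned long len)
	{
		const unsigned int size = pos + len;
		int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
		unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
		unsigned long offset = pos / BITS_PER_BYTE;

		while (len >= bits_to_set) {
			/* first (partial) byte, then whole bytes */
			map[offset] |= mask_to_set;
			len -= bits_to_set;
			bits_to_set = BITS_PER_BYTE;
			mask_to_set = BYTE_MASK;
			offset++;
		}
		if (len) {
			/* trailing partial byte */
			mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
			map[offset] |= mask_to_set;
		}
	}

	int main(void)
	{
		unsigned char map[4] = { 0 };

		bitmap_set_bytewise(map, 5, 10);	/* bits 5..14 */
		/* prints: e0 7f 00 00  (bits 5-7 of byte 0, bits 0-6 of byte 1) */
		printf("%02x %02x %02x %02x\n", map[0], map[1], map[2], map[3]);
		return 0;
	}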
  static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
  {
        unsigned long distance = (src > dst) ? src - dst : dst - src;
@@@ -5633,15 -5723,13 +5790,15 @@@ void memcpy_extent_buffer(struct extent
        unsigned long src_i;
  
        if (src_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
 -                     "len %lu dst len %lu\n", src_offset, len, dst->len);
 +              btrfs_err(dst->fs_info,
 +                      "memmove bogus src_offset %lu move "
 +                     "len %lu dst len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
 -                     "len %lu dst len %lu\n", dst_offset, len, dst->len);
 +              btrfs_err(dst->fs_info,
 +                      "memmove bogus dst_offset %lu move "
 +                     "len %lu dst len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
  
@@@ -5681,13 -5769,13 +5838,13 @@@ void memmove_extent_buffer(struct exten
        unsigned long src_i;
  
        if (src_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
 -                     "len %lu len %lu\n", src_offset, len, dst->len);
 +              btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
 +                     "len %lu len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
 -                     "len %lu len %lu\n", dst_offset, len, dst->len);
 +              btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
 +                     "len %lu len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset < src_offset) {
diff --combined fs/btrfs/extent_io.h
index f4c1ae11855f0b613894ea44026faf143021b613,9f8d7d1a70157c9842de995ba5b95e9f907560bf..350c8b0a85826ece7fed1951682972e603195880
@@@ -2,7 -2,6 +2,7 @@@
  #define __EXTENTIO__
  
  #include <linux/rbtree.h>
 +#include "ulist.h"
  
  /* bits for the extent state */
  #define EXTENT_DIRTY          (1U << 0)
@@@ -19,7 -18,6 +19,7 @@@
  #define EXTENT_NEED_WAIT      (1U << 13)
  #define EXTENT_DAMAGED                (1U << 14)
  #define EXTENT_NORESERVE      (1U << 15)
 +#define EXTENT_QGROUP_RESERVED        (1U << 16)
  #define EXTENT_IOBITS         (EXTENT_LOCKED | EXTENT_WRITEBACK)
  #define EXTENT_CTLBITS                (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
  
@@@ -163,17 -161,6 +163,17 @@@ struct extent_buffer 
  #endif
  };
  
 +/*
 + * Structure to record how many bytes and which ranges are set/cleared
 + */
 +struct extent_changeset {
 +      /* How many bytes are set/cleared in this operation */
 +      u64 bytes_changed;
 +
 +      /* Changed ranges */
 +      struct ulist *range_changed;
 +};
 +
  static inline void extent_set_compress_type(unsigned long *bio_flags,
                                            int compress_type)
  {
@@@ -223,17 -210,11 +223,17 @@@ int test_range_bit(struct extent_io_tre
                   struct extent_state *cached_state);
  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask);
 +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                           unsigned bits, gfp_t mask,
 +                           struct extent_changeset *changeset);
  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     unsigned bits, int wake, int delete,
                     struct extent_state **cached, gfp_t mask);
  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    unsigned bits, gfp_t mask);
 +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                         unsigned bits, gfp_t mask,
 +                         struct extent_changeset *changeset);
  int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   unsigned bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask);
@@@ -282,8 -263,10 +282,10 @@@ void set_page_extent_mapped(struct pag
  
  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                                          u64 start);
+ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                                 u64 start, unsigned long len);
  struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-               u64 start);
+                                               u64 start);
  struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
  struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                                         u64 start);
@@@ -328,6 -311,12 +330,12 @@@ void memmove_extent_buffer(struct exten
                           unsigned long src_offset, unsigned long len);
  void memset_extent_buffer(struct extent_buffer *eb, char c,
                          unsigned long start, unsigned long len);
+ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+                          unsigned long pos);
+ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+                             unsigned long pos, unsigned long len);
+ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+                               unsigned long pos, unsigned long len);
  void clear_extent_buffer_dirty(struct extent_buffer *eb);
  int set_extent_buffer_dirty(struct extent_buffer *eb);
  int set_extent_buffer_uptodate(struct extent_buffer *eb);
diff --combined fs/btrfs/super.c
index 24154e422945167f474557887c62acaf6ed0779c,bfdaf123f4e9be781c58a567fc97984652b43ead..9153d54d27c87d16e6254e49ec59c268dd3a4cd1
@@@ -130,6 -130,7 +130,6 @@@ static void btrfs_handle_error(struct b
        }
  }
  
 -#ifdef CONFIG_PRINTK
  /*
   * __btrfs_std_error decodes expected errors from the caller and
  * invokes the appropriate error response.
@@@ -139,9 -140,7 +139,9 @@@ void __btrfs_std_error(struct btrfs_fs_
                       unsigned int line, int errno, const char *fmt, ...)
  {
        struct super_block *sb = fs_info->sb;
 +#ifdef CONFIG_PRINTK
        const char *errstr;
 +#endif
  
        /*
         * Special case: if the error is EROFS, and we're already
        if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
                return;
  
 +#ifdef CONFIG_PRINTK
        errstr = btrfs_decode_error(errno);
        if (fmt) {
                struct va_format vaf;
                printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
                        sb->s_id, function, line, errno, errstr);
        }
 +#endif
  
        /* Don't go through full error handling during mount */
        save_error_info(fs_info);
                btrfs_handle_error(fs_info);
  }
  
 +#ifdef CONFIG_PRINTK
  static const char * const logtypes[] = {
        "emergency",
        "alert",
@@@ -216,6 -212,27 +216,6 @@@ void btrfs_printk(const struct btrfs_fs
  
        va_end(args);
  }
 -
 -#else
 -
 -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 -                     unsigned int line, int errno, const char *fmt, ...)
 -{
 -      struct super_block *sb = fs_info->sb;
 -
 -      /*
 -       * Special case: if the error is EROFS, and we're already
 -       * under MS_RDONLY, then it is safe here.
 -       */
 -      if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
 -              return;
 -
 -      /* Don't go through full error handling during mount */
 -      if (sb->s_flags & MS_BORN) {
 -              save_error_info(fs_info);
 -              btrfs_handle_error(fs_info);
 -      }
 -}
  #endif
  
  /*
@@@ -295,17 -312,15 +295,18 @@@ enum 
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-       Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
-       Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
-       Opt_check_integrity, Opt_check_integrity_including_extent_data,
+       Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+       Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+       Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+       Opt_skip_balance, Opt_check_integrity,
+       Opt_check_integrity_including_extent_data,
        Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
        Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
        Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
        Opt_datasum, Opt_treelog, Opt_noinode_cache,
 +#ifdef CONFIG_BTRFS_DEBUG
 +      Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
 +#endif
        Opt_err,
  };
  
@@@ -340,6 -355,7 +341,7 @@@ static match_table_t tokens = 
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
        {Opt_space_cache, "space_cache"},
+       {Opt_space_cache_version, "space_cache=%s"},
        {Opt_clear_cache, "clear_cache"},
        {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
        {Opt_enospc_debug, "enospc_debug"},
        {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
        {Opt_fatal_errors, "fatal_errors=%s"},
        {Opt_commit_interval, "commit=%d"},
 +#ifdef CONFIG_BTRFS_DEBUG
 +      {Opt_fragment_data, "fragment=data"},
 +      {Opt_fragment_metadata, "fragment=metadata"},
 +      {Opt_fragment_all, "fragment=all"},
 +#endif
        {Opt_err, NULL},
  };
  
@@@ -383,7 -394,9 +385,9 @@@ int btrfs_parse_options(struct btrfs_ro
        bool compress_force = false;
  
        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
-       if (cache_gen)
+       if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+               btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+       else if (cache_gen)
                btrfs_set_opt(info->mount_opt, SPACE_CACHE);
  
        if (!options)
                                             "turning off discard");
                        break;
                case Opt_space_cache:
-                       btrfs_set_and_info(root, SPACE_CACHE,
-                                          "enabling disk space caching");
+               case Opt_space_cache_version:
+                       if (token == Opt_space_cache ||
+                           strcmp(args[0].from, "v1") == 0) {
+                               btrfs_clear_opt(root->fs_info->mount_opt,
+                                               FREE_SPACE_TREE);
+                               btrfs_set_and_info(root, SPACE_CACHE,
+                                                  "enabling disk space caching");
+                       } else if (strcmp(args[0].from, "v2") == 0) {
+                               btrfs_clear_opt(root->fs_info->mount_opt,
+                                               SPACE_CACHE);
+                               btrfs_set_and_info(root, FREE_SPACE_TREE,
+                                                  "enabling free space tree");
+                       } else {
+                               ret = -EINVAL;
+                               goto out;
+                       }
                        break;
                case Opt_rescan_uuid_tree:
                        btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
                        break;
                case Opt_no_space_cache:
-                       btrfs_clear_and_info(root, SPACE_CACHE,
-                                            "disabling disk space caching");
+                       if (btrfs_test_opt(root, SPACE_CACHE)) {
+                               btrfs_clear_and_info(root, SPACE_CACHE,
+                                                    "disabling disk space caching");
+                       }
+                       if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+                               btrfs_clear_and_info(root, FREE_SPACE_TREE,
+                                                    "disabling free space tree");
+                       }
                        break;
                case Opt_inode_cache:
                        btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
                                info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
                        }
                        break;
 +#ifdef CONFIG_BTRFS_DEBUG
 +              case Opt_fragment_all:
 +                      btrfs_info(root->fs_info, "fragmenting all space");
 +                      btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
 +                      btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
 +                      break;
 +              case Opt_fragment_metadata:
 +                      btrfs_info(root->fs_info, "fragmenting metadata");
 +                      btrfs_set_opt(info->mount_opt,
 +                                    FRAGMENT_METADATA);
 +                      break;
 +              case Opt_fragment_data:
 +                      btrfs_info(root->fs_info, "fragmenting data");
 +                      btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
 +                      break;
 +#endif
                case Opt_err:
                        btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
                        ret = -EINVAL;
                }
        }
  out:
+       if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+           !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+           !btrfs_test_opt(root, CLEAR_CACHE)) {
+               btrfs_err(root->fs_info, "cannot disable free space tree");
+               ret = -EINVAL;
+       }
        if (!ret && btrfs_test_opt(root, SPACE_CACHE))
                btrfs_info(root->fs_info, "disk space caching is enabled");
+       if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+               btrfs_info(root->fs_info, "using free space tree");
        kfree(orig);
        return ret;
  }
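With these hunks, space_cache (or space_cache=v1), space_cache=v2 and nospace_cache become mutually exclusive states (for example, mounting with -o space_cache=v2 selects the free space tree), and the check at out: refuses to silently drop the free space tree once its read-only compat bit is on disk, unless clear_cache is also given. A rough userspace model of just that option state machine, useful for eyeballing combinations; everything here besides the option strings is made up for the sketch:

	#include <stdio.h>
	#include <string.h>

	struct opts {
		int space_cache;	/* v1 free space cache */
		int free_space_tree;	/* v2, the free space tree */
	};

	/* returns 0 on success, -1 for an unknown value (stands in for -EINVAL) */
	static int apply(struct opts *o, const char *opt)
	{
		if (!strcmp(opt, "space_cache") || !strcmp(opt, "space_cache=v1")) {
			o->free_space_tree = 0;
			o->space_cache = 1;
		} else if (!strcmp(opt, "space_cache=v2")) {
			o->space_cache = 0;
			o->free_space_tree = 1;
		} else if (!strcmp(opt, "nospace_cache")) {
			o->space_cache = 0;
			o->free_space_tree = 0;
		} else {
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		/* FREE_SPACE_TREE compat_ro bit already on disk: opt starts enabled */
		struct opts o = { 0, 1 };

		apply(&o, "nospace_cache");

		/* mirrors the out: check, assuming clear_cache was not requested */
		if (!o.free_space_tree)
			printf("cannot disable free space tree\n");
		return 0;
	}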
@@@ -1162,6 -1188,8 +1195,8 @@@ static int btrfs_show_options(struct se
                seq_puts(seq, ",noacl");
        if (btrfs_test_opt(root, SPACE_CACHE))
                seq_puts(seq, ",space_cache");
+       else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+               seq_puts(seq, ",space_cache=v2");
        else
                seq_puts(seq, ",nospace_cache");
        if (btrfs_test_opt(root, RESCAN_UUID_TREE))
                seq_puts(seq, ",fatal_errors=panic");
        if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
                seq_printf(seq, ",commit=%d", info->commit_interval);
 +#ifdef CONFIG_BTRFS_DEBUG
 +      if (btrfs_test_opt(root, FRAGMENT_DATA))
 +              seq_puts(seq, ",fragment=data");
 +      if (btrfs_test_opt(root, FRAGMENT_METADATA))
 +              seq_puts(seq, ",fragment=metadata");
 +#endif
        seq_printf(seq, ",subvolid=%llu",
                  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
        seq_puts(seq, ",subvol=");
@@@ -2225,6 -2247,9 +2260,9 @@@ static int btrfs_run_sanity_tests(void
        if (ret)
                goto out;
        ret = btrfs_test_qgroups();
+       if (ret)
+               goto out;
+       ret = btrfs_test_free_space_tree();
  out:
        btrfs_destroy_test_fs();
        return ret;
diff --combined fs/btrfs/tests/btrfs-tests.c
index 9626252ee6b47d2b391f3383cfa9b3bb80e4110c,ba28cefdf9e7b6d0b787199136471d2d68649941..b1d920b3007017c2014d28217cb5efa1d7ed4d96
@@@ -21,6 -21,9 +21,9 @@@
  #include <linux/magic.h>
  #include "btrfs-tests.h"
  #include "../ctree.h"
+ #include "../free-space-cache.h"
+ #include "../free-space-tree.h"
+ #include "../transaction.h"
  #include "../volumes.h"
  #include "../disk-io.h"
  #include "../qgroup.h"
@@@ -122,6 -125,9 +125,9 @@@ struct btrfs_fs_info *btrfs_alloc_dummy
        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+       extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+       extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+       fs_info->pinned_extents = &fs_info->freed_extents[0];
        return fs_info;
  }
  
@@@ -169,3 -175,49 +175,55 @@@ void btrfs_free_dummy_root(struct btrfs
        kfree(root);
  }
  
+ struct btrfs_block_group_cache *
+ btrfs_alloc_dummy_block_group(unsigned long length)
+ {
+       struct btrfs_block_group_cache *cache;
+       cache = kzalloc(sizeof(*cache), GFP_NOFS);
+       if (!cache)
+               return NULL;
+       cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+                                       GFP_NOFS);
+       if (!cache->free_space_ctl) {
+               kfree(cache);
+               return NULL;
+       }
++      cache->fs_info = btrfs_alloc_dummy_fs_info();
++      if (!cache->fs_info) {
++              kfree(cache->free_space_ctl);
++              kfree(cache);
++              return NULL;
++      }
+       cache->key.objectid = 0;
+       cache->key.offset = length;
+       cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+       cache->sectorsize = 4096;
+       cache->full_stripe_len = 4096;
+       INIT_LIST_HEAD(&cache->list);
+       INIT_LIST_HEAD(&cache->cluster_list);
+       INIT_LIST_HEAD(&cache->bg_list);
+       btrfs_init_free_space_ctl(cache);
+       mutex_init(&cache->free_space_lock);
+       return cache;
+ }
+
+ void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+ {
+       if (!cache)
+               return;
+       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       kfree(cache->free_space_ctl);
+       kfree(cache);
+ }
+
+ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+ {
+       memset(trans, 0, sizeof(*trans));
+       trans->transid = 1;
+       INIT_LIST_HEAD(&trans->qgroup_ref_list);
+       trans->type = __TRANS_DUMMY;
+ }
diff --combined fs/btrfs/tests/free-space-tests.c
index 8b72b005bfb9a212518a711a2e476c2d70b47b24,bae6c599f6045a7eae995f179f123e180862e80b..cd3e300b9ba5c9152d9f2b3210469262dcd4a861
  #include <linux/slab.h>
  #include "btrfs-tests.h"
  #include "../ctree.h"
 +#include "../disk-io.h"
  #include "../free-space-cache.h"
  
  #define BITS_PER_BITMAP               (PAGE_CACHE_SIZE * 8)
- static struct btrfs_block_group_cache *init_test_block_group(void)
- {
-       struct btrfs_block_group_cache *cache;
-       cache = kzalloc(sizeof(*cache), GFP_NOFS);
-       if (!cache)
-               return NULL;
-       cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-                                       GFP_NOFS);
-       if (!cache->free_space_ctl) {
-               kfree(cache);
-               return NULL;
-       }
-       cache->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!cache->fs_info) {
-               kfree(cache->free_space_ctl);
-               kfree(cache);
-               return NULL;
-       }
-       cache->key.objectid = 0;
-       cache->key.offset = 1024 * 1024 * 1024;
-       cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-       cache->sectorsize = 4096;
-       cache->full_stripe_len = 4096;
-       spin_lock_init(&cache->lock);
-       INIT_LIST_HEAD(&cache->list);
-       INIT_LIST_HEAD(&cache->cluster_list);
-       INIT_LIST_HEAD(&cache->bg_list);
-       btrfs_init_free_space_ctl(cache);
-       return cache;
- }
  
  /*
  * This test just does basic sanity checking, making sure we can add an extent
@@@ -886,30 -850,16 +851,30 @@@ test_steal_space_from_bitmap_to_extent(
  int btrfs_test_free_space_cache(void)
  {
        struct btrfs_block_group_cache *cache;
 -      int ret;
 +      struct btrfs_root *root = NULL;
 +      int ret = -ENOMEM;
  
        test_msg("Running btrfs free space cache tests\n");
  
-       cache = init_test_block_group();
+       cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
        if (!cache) {
                test_msg("Couldn't run the tests\n");
                return 0;
        }
  
 +      root = btrfs_alloc_dummy_root();
 +      if (IS_ERR(root)) {
 +              ret = PTR_ERR(root);
 +              goto out;
 +      }
 +
 +      root->fs_info = btrfs_alloc_dummy_fs_info();
 +      if (!root->fs_info)
 +              goto out;
 +
 +      root->fs_info->extent_root = root;
 +      cache->fs_info = root->fs_info;
 +
        ret = test_extents(cache);
        if (ret)
                goto out;
  
        ret = test_steal_space_from_bitmap_to_extent(cache);
  out:
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
-       kfree(cache->free_space_ctl);
-       kfree(cache);
+       btrfs_free_dummy_block_group(cache);
 +      btrfs_free_dummy_root(root);
        test_msg("Free space cache tests finished\n");
        return ret;
  }
diff --combined include/trace/events/btrfs.h
index b4473dab39d613e58e4d4e58e0049d72522047cd,e6289e62a2a820c7d6c9a52f04d003ce2bf5b78a..d866f21efbbfd4722e6c23bde8320d7b54ecf2be
@@@ -45,7 -45,8 +45,8 @@@ struct btrfs_qgroup_operation
                { BTRFS_TREE_LOG_OBJECTID,      "TREE_LOG"      },      \
                { BTRFS_QUOTA_TREE_OBJECTID,    "QUOTA_TREE"    },      \
                { BTRFS_TREE_RELOC_OBJECTID,    "TREE_RELOC"    },      \
-               { BTRFS_UUID_TREE_OBJECTID,     "UUID_RELOC"    },      \
+               { BTRFS_UUID_TREE_OBJECTID,     "UUID_TREE"     },      \
+               { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },  \
                { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
  
  #define show_root_type(obj)                                           \
@@@ -1117,119 -1118,6 +1118,119 @@@ DEFINE_EVENT(btrfs__workqueue_done, btr
        TP_ARGS(wq)
  );
  
 +DECLARE_EVENT_CLASS(btrfs__qgroup_data_map,
 +
 +      TP_PROTO(struct inode *inode, u64 free_reserved),
 +
 +      TP_ARGS(inode, free_reserved),
 +
 +      TP_STRUCT__entry(
 +              __field(        u64,            rootid          )
 +              __field(        unsigned long,  ino             )
 +              __field(        u64,            free_reserved   )
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->rootid         =       BTRFS_I(inode)->root->objectid;
 +              __entry->ino            =       inode->i_ino;
 +              __entry->free_reserved  =       free_reserved;
 +      ),
 +
 +      TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu",
 +                __entry->rootid, __entry->ino, __entry->free_reserved)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map,
 +
 +      TP_PROTO(struct inode *inode, u64 free_reserved),
 +
 +      TP_ARGS(inode, free_reserved)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map,
 +
 +      TP_PROTO(struct inode *inode, u64 free_reserved),
 +
 +      TP_ARGS(inode, free_reserved)
 +);
 +
 +#define BTRFS_QGROUP_OPERATIONS                               \
 +      { QGROUP_RESERVE,       "reserve"       },      \
 +      { QGROUP_RELEASE,       "release"       },      \
 +      { QGROUP_FREE,          "free"          }
 +
 +DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
 +
 +      TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
 +
 +      TP_ARGS(inode, start, len, reserved, op),
 +
 +      TP_STRUCT__entry(
 +              __field(        u64,            rootid          )
 +              __field(        unsigned long,  ino             )
 +              __field(        u64,            start           )
 +              __field(        u64,            len             )
 +              __field(        u64,            reserved        )
 +              __field(        int,            op              )
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->rootid         = BTRFS_I(inode)->root->objectid;
 +              __entry->ino            = inode->i_ino;
 +              __entry->start          = start;
 +              __entry->len            = len;
 +              __entry->reserved       = reserved;
 +              __entry->op             = op;
 +      ),
 +
 +      TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s",
 +                __entry->rootid, __entry->ino, __entry->start, __entry->len,
 +                __entry->reserved,
 +                __print_flags((unsigned long)__entry->op, "",
 +                              BTRFS_QGROUP_OPERATIONS)
 +      )
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data,
 +
 +      TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
 +
 +      TP_ARGS(inode, start, len, reserved, op)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
 +
 +      TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
 +
 +      TP_ARGS(inode, start, len, reserved, op)
 +);
 +
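Each DEFINE_EVENT above expands into a trace_<event name>() call that kernel code invokes with the TP_PROTO arguments. A hypothetical call site in the qgroup data-reservation path (illustrative only, not part of this diff) would look like:

	/* after a successful set_record_extent_bits() in the reserve path */
	trace_btrfs_qgroup_reserve_data(inode, start, len,
					changeset.bytes_changed,
					QGROUP_RESERVE);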
 +DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
 +
 +      TP_PROTO(u64 ref_root, u64 reserved),
 +
 +      TP_ARGS(ref_root, reserved),
 +
 +      TP_STRUCT__entry(
 +              __field(        u64,            ref_root        )
 +              __field(        u64,            reserved        )
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->ref_root       = ref_root;
 +              __entry->reserved       = reserved;
 +      ),
 +
 +      TP_printk("root=%llu, reserved=%llu, op=free",
 +                __entry->ref_root, __entry->reserved)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
 +
 +      TP_PROTO(u64 ref_root, u64 reserved),
 +
 +      TP_ARGS(ref_root, reserved)
 +);
  #endif /* _TRACE_BTRFS_H */
  
  /* This part must be outside protection */