git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge branch 'freespace-tree' into for-linus-4.5
author Chris Mason <clm@fb.com>
Fri, 18 Dec 2015 19:11:10 +0000 (11:11 -0800)
committer Chris Mason <clm@fb.com>
Fri, 18 Dec 2015 19:11:10 +0000 (11:11 -0800)
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/super.c
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/tests/free-space-tests.c
include/trace/events/btrfs.h

diff --combined fs/btrfs/ctree.h
index 35489e7129a7e8de9d0232279d41d3bbd19ae1df,ed610f9c04b29fc43d5efc51ae3737b8f2a053aa..cf87979a153e69c027b3a01cb96df1dc8a511158
@@@ -96,6 -96,9 +96,9 @@@ struct btrfs_ordered_sum
  /* for storing items that use the BTRFS_UUID_KEY* types */
  #define BTRFS_UUID_TREE_OBJECTID 9ULL
  
+ /* tracks free space in block groups. */
+ #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
  /* for storing balance parameters in the root tree */
  #define BTRFS_BALANCE_OBJECTID -4ULL
  
@@@ -500,6 -503,8 +503,8 @@@ struct btrfs_super_block 
   * Compat flags that we support.  If any incompat flags are set other than the
   * ones specified below then we will fail to mount
   */
+ #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE       (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF  (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
  #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS   (1ULL << 2)
  #define BTRFS_FEATURE_COMPAT_SUPP             0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_SET         0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR               0ULL
- #define BTRFS_FEATURE_COMPAT_RO_SUPP          0ULL
+ #define BTRFS_FEATURE_COMPAT_RO_SUPP                  \
+       (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET      0ULL
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR    0ULL
  
@@@ -823,18 -831,8 +831,18 @@@ struct btrfs_disk_balance_args 
         */
        __le64 profiles;
  
 -      /* usage filter */
 -      __le64 usage;
 +      /*
 +       * usage filter
 +       * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
 +       * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
 +       */
 +      union {
 +              __le64 usage;
 +              struct {
 +                      __le32 usage_min;
 +                      __le32 usage_max;
 +              };
 +      };
  
        /* devid filter */
        __le64 devid;
        /* BTRFS_BALANCE_ARGS_* */
        __le64 flags;
  
 -      /* BTRFS_BALANCE_ARGS_LIMIT value */
 -      __le64 limit;
 +      /*
 +       * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
 +       * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum
 +       * and maximum
 +       */
 +      union {
 +              __le64 limit;
 +              struct {
 +                      __le32 limit_min;
 +                      __le32 limit_max;
 +              };
 +      };
  
 -      __le64 unused[7];
 +      /*
 +       * Process chunks that cross stripes_min..stripes_max devices,
 +       * BTRFS_BALANCE_ARGS_STRIPES_RANGE
 +       */
 +      __le32 stripes_min;
 +      __le32 stripes_max;
 +
 +      __le64 unused[6];
  } __attribute__ ((__packed__));
  
  /*
@@@ -1088,6 -1069,13 +1096,13 @@@ struct btrfs_block_group_item 
        __le64 flags;
  } __attribute__ ((__packed__));
  
+ struct btrfs_free_space_info {
+       __le32 extent_count;
+       __le32 flags;
+ } __attribute__ ((__packed__));
+ #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
  #define BTRFS_QGROUP_LEVEL_SHIFT              48
  static inline u64 btrfs_qgroup_level(u64 qgroupid)
  {
@@@ -1181,10 -1169,6 +1196,10 @@@ struct btrfs_space_info 
                                   delalloc/allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
  
 +      u64 max_extent_size;    /* This will hold the maximum extent size of
 +                                 the space info if we had an ENOSPC in the
 +                                 allocator. */
 +
        unsigned int full:1;    /* indicates that we cannot allocate any more
                                   chunks for this space */
        unsigned int chunk_alloc:1;     /* set if we are allocating a chunk */
@@@ -1259,9 -1243,6 +1274,9 @@@ struct btrfs_free_cluster 
        /* first extent starting offset */
        u64 window_start;
  
 +      /* We did a full search and couldn't create a cluster */
 +      bool fragmented;
 +
        struct btrfs_block_group_cache *block_group;
        /*
         * when a cluster is allocated from a block group, we put the
@@@ -1296,6 -1277,9 +1311,9 @@@ struct btrfs_caching_control 
        atomic_t count;
  };
  
+ /* Once caching_thread() finds this much free space, it will wake up waiters. */
+ #define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
  struct btrfs_io_ctl {
        void *cur, *orig;
        struct page *page;
@@@ -1321,8 -1305,20 +1339,20 @@@ struct btrfs_block_group_cache 
        u64 delalloc_bytes;
        u64 bytes_super;
        u64 flags;
-       u64 sectorsize;
        u64 cache_generation;
+       u32 sectorsize;
+       /*
+        * If the free space extent count exceeds this number, convert the block
+        * group to bitmaps.
+        */
+       u32 bitmap_high_thresh;
+       /*
+        * If the free space extent count drops below this number, convert the
+        * block group back to extents.
+        */
+       u32 bitmap_low_thresh;
  
        /*
         * It is just used for the delayed data space allocation because
        struct list_head io_list;
  
        struct btrfs_io_ctl io_ctl;
+       /* Lock for free space tree operations. */
+       struct mutex free_space_lock;
+       /*
+        * Does the block group need to be added to the free space tree?
+        * Protected by free_space_lock.
+        */
+       int needs_free_space;
  };
  
  /* delayed seq elem */
@@@ -1429,6 -1434,7 +1468,7 @@@ struct btrfs_fs_info 
        struct btrfs_root *csum_root;
        struct btrfs_root *quota_root;
        struct btrfs_root *uuid_root;
+       struct btrfs_root *free_space_root;
  
        /* the log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;
@@@ -1977,9 -1983,6 +2017,9 @@@ struct btrfs_root 
        int send_in_progress;
        struct btrfs_subvolume_writers *subv_writers;
        atomic_t will_be_snapshoted;
 +
 +      /* For qgroup metadata space reserve */
 +      atomic_t qgroup_meta_rsv;
  };
  
  struct btrfs_ioctl_defrag_range_args {
   */
  #define BTRFS_BLOCK_GROUP_ITEM_KEY 192
  
+ /*
+  * Every block group is represented in the free space tree by a free space info
+  * item, which stores some accounting information. It is keyed on
+  * (block_group_start, FREE_SPACE_INFO, block_group_length).
+  */
+ #define BTRFS_FREE_SPACE_INFO_KEY 198
+ /*
+  * A free space extent tracks an extent of space that is free in a block group.
+  * It is keyed on (start, FREE_SPACE_EXTENT, length).
+  */
+ #define BTRFS_FREE_SPACE_EXTENT_KEY 199
+ /*
+  * When a block group becomes very fragmented, we convert it to use bitmaps
+  * instead of extents. A free space bitmap is keyed on
+  * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+  * (length / sectorsize) bits.
+  */
+ #define BTRFS_FREE_SPACE_BITMAP_KEY 200
  #define BTRFS_DEV_EXTENT_KEY  204
  #define BTRFS_DEV_ITEM_KEY    216
  #define BTRFS_CHUNK_ITEM_KEY  228
  #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
  #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR      (1 << 22)
  #define BTRFS_MOUNT_RESCAN_UUID_TREE  (1 << 23)
 -#define BTRFS_MOUNT_FREE_SPACE_TREE   (1 << 24)
 +#define BTRFS_MOUNT_FRAGMENT_DATA     (1 << 24)
 +#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
++#define BTRFS_MOUNT_FREE_SPACE_TREE   (1 << 26)
  
  #define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
  #define BTRFS_DEFAULT_MAX_INLINE      (8192)
        btrfs_clear_opt(root->fs_info->mount_opt, opt);                 \
  }
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +static inline int
 +btrfs_should_fragment_free_space(struct btrfs_root *root,
 +                               struct btrfs_block_group_cache *block_group)
 +{
 +      return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
 +              block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 +             (btrfs_test_opt(root, FRAGMENT_DATA) &&
 +              block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
 +}
 +#endif
 +
  /*
   * Requests for changes that need to be done during transaction commit.
   *
@@@ -2506,6 -2517,11 +2568,11 @@@ BTRFS_SETGET_FUNCS(disk_block_group_fla
  BTRFS_SETGET_STACK_FUNCS(block_group_flags,
                        struct btrfs_block_group_item, flags, 64);
  
+ /* struct btrfs_free_space_info */
+ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+                  extent_count, 32);
+ BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
  /* struct btrfs_inode_ref */
  BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
  BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@@ -3367,7 -3383,7 +3434,7 @@@ static inline bool btrfs_mixed_space_in
  
  static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
  {
 -      return mapping_gfp_mask(mapping) & ~__GFP_FS;
 +      return mapping_gfp_constraint(mapping, ~__GFP_FS);
  }
  
  /* extent-tree.c */
@@@ -3416,7 -3432,6 +3483,7 @@@ int btrfs_cross_ref_exist(struct btrfs_
  struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr);
 +void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
  int get_block_group_index(struct btrfs_block_group_cache *cache);
  struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@@ -3431,8 -3446,7 +3498,8 @@@ void btrfs_free_tree_block(struct btrfs
  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
 -                                   u64 offset, struct btrfs_key *ins);
 +                                   u64 offset, u64 ram_bytes,
 +                                   struct btrfs_key *ins);
  int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   u64 root_objectid, u64 owner, u64 offset,
@@@ -3451,7 -3465,7 +3518,7 @@@ int btrfs_set_disk_extent_flags(struct 
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 -                    u64 owner, u64 offset, int no_quota);
 +                    u64 owner, u64 offset);
  
  int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
                               int delalloc);
@@@ -3464,7 -3478,7 +3531,7 @@@ int btrfs_finish_extent_commit(struct b
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
 -                       u64 root_objectid, u64 owner, u64 offset, int no_quota);
 +                       u64 root_objectid, u64 owner, u64 offset);
  
  int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root);
@@@ -3480,9 -3494,6 +3547,9 @@@ int btrfs_make_block_group(struct btrfs
                           struct btrfs_root *root, u64 bytes_used,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
                           u64 size);
 +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 +                              struct btrfs_fs_info *fs_info,
 +                              const u64 chunk_offset);
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start,
                             struct extent_map *em);
@@@ -3505,11 -3516,8 +3572,11 @@@ enum btrfs_reserve_flush_enum 
        BTRFS_RESERVE_FLUSH_ALL,
  };
  
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len);
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@@ -3525,8 -3533,8 +3592,8 @@@ void btrfs_subvolume_release_metadata(s
                                      u64 qgroup_reserved);
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
                                              unsigned short type);
@@@ -3573,6 -3581,9 +3640,9 @@@ void btrfs_end_write_no_snapshoting(str
  void check_system_chunk(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        const u64 type);
+ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+                      struct btrfs_fs_info *info, u64 start, u64 end);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@@ -3737,6 -3748,7 +3807,7 @@@ static inline void free_fs_info(struct 
        kfree(fs_info->csum_root);
        kfree(fs_info->quota_root);
        kfree(fs_info->uuid_root);
+       kfree(fs_info->free_space_root);
        kfree(fs_info->super_copy);
        kfree(fs_info->super_for_commit);
        security_free_mnt_opts(&fs_info->security_opts);
@@@ -4063,8 -4075,8 +4134,8 @@@ int btrfs_defrag_leaves(struct btrfs_tr
  /* sysfs.c */
  int btrfs_init_sysfs(void);
  void btrfs_exit_sysfs(void);
 -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
 -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
 +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
 +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
  
  /* xattr.c */
  ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@@ -4098,102 -4110,14 +4169,102 @@@ void btrfs_printk(const struct btrfs_fs
  #define btrfs_info(fs_info, fmt, args...) \
        btrfs_printk(fs_info, KERN_INFO fmt, ##args)
  
 +/*
 + * Wrappers that use printk_in_rcu
 + */
 +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
 +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
 +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
 +#define btrfs_err_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
 +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
 +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
 +#define btrfs_info_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
 +
 +/*
 + * Wrappers that use a ratelimited printk_in_rcu
 + */
 +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
 +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
 +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
 +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
 +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
 +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
 +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
 +
 +/*
 + * Wrappers that use a ratelimited printk
 + */
 +#define btrfs_emerg_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
 +#define btrfs_alert_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
 +#define btrfs_crit_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
 +#define btrfs_err_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
 +#define btrfs_warn_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
 +#define btrfs_notice_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
 +#define btrfs_info_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
  #ifdef DEBUG
  #define btrfs_debug(fs_info, fmt, args...) \
        btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
 +      btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl(fs_info, fmt, args...) \
 +      btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
  #else
  #define btrfs_debug(fs_info, fmt, args...) \
      no_printk(KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
 +      no_printk(KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
 +      no_printk(KERN_DEBUG fmt, ##args)
 +#define btrfs_debug_rl(fs_info, fmt, args...) \
 +      no_printk(KERN_DEBUG fmt, ##args)
  #endif
  
 +#define btrfs_printk_in_rcu(fs_info, fmt, args...)    \
 +do {                                                  \
 +      rcu_read_lock();                                \
 +      btrfs_printk(fs_info, fmt, ##args);             \
 +      rcu_read_unlock();                              \
 +} while (0)
 +
 +#define btrfs_printk_ratelimited(fs_info, fmt, args...)               \
 +do {                                                          \
 +      static DEFINE_RATELIMIT_STATE(_rs,                      \
 +              DEFAULT_RATELIMIT_INTERVAL,                     \
 +              DEFAULT_RATELIMIT_BURST);                       \
 +      if (__ratelimit(&_rs))                                  \
 +              btrfs_printk(fs_info, fmt, ##args);             \
 +} while (0)
 +
 +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...)         \
 +do {                                                          \
 +      rcu_read_lock();                                        \
 +      btrfs_printk_ratelimited(fs_info, fmt, ##args);         \
 +      rcu_read_unlock();                                      \
 +} while (0)
 +
  #ifdef CONFIG_BTRFS_ASSERT
  
  __cold
@@@ -4247,6 -4171,30 +4318,30 @@@ static inline void __btrfs_set_fs_incom
        }
  }
  
+ #define btrfs_clear_fs_incompat(__fs_info, opt) \
+       __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+                                            u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_incompat_flags(disk_super);
+       if (features & flag) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_incompat_flags(disk_super);
+               if (features & flag) {
+                       features &= ~flag;
+                       btrfs_set_super_incompat_flags(disk_super, features);
+                       btrfs_info(fs_info, "clearing %llu feature flag",
+                                        flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
  #define btrfs_fs_incompat(fs_info, opt) \
        __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
  
@@@ -4257,6 -4205,64 +4352,64 @@@ static inline int __btrfs_fs_incompat(s
        return !!(btrfs_super_incompat_flags(disk_super) & flag);
  }
  
+ #define btrfs_set_fs_compat_ro(__fs_info, opt) \
+       __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+                                           u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_compat_ro_flags(disk_super);
+       if (!(features & flag)) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_compat_ro_flags(disk_super);
+               if (!(features & flag)) {
+                       features |= flag;
+                       btrfs_set_super_compat_ro_flags(disk_super, features);
+                       btrfs_info(fs_info, "setting %llu ro feature flag",
+                                  flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
+ #define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+       __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+                                             u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_compat_ro_flags(disk_super);
+       if (features & flag) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_compat_ro_flags(disk_super);
+               if (features & flag) {
+                       features &= ~flag;
+                       btrfs_set_super_compat_ro_flags(disk_super, features);
+                       btrfs_info(fs_info, "clearing %llu ro feature flag",
+                                  flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
+ #define btrfs_fs_compat_ro(fs_info, opt) \
+       __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       disk_super = fs_info->super_copy;
+       return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+ }
  /*
   * Call btrfs_abort_transaction as early as possible when an error condition is
   * detected, that way the exact line number is reported.
@@@ -4274,7 -4280,14 +4427,7 @@@ do {                                                           
                                  __LINE__, (errno));           \
  } while (0)
  
 -#define btrfs_std_error(fs_info, errno)                               \
 -do {                                                          \
 -      if ((errno))                                            \
 -              __btrfs_std_error((fs_info), __func__,          \
 -                                 __LINE__, (errno), NULL);    \
 -} while (0)
 -
 -#define btrfs_error(fs_info, errno, fmt, args...)             \
 +#define btrfs_std_error(fs_info, errno, fmt, args...)         \
  do {                                                          \
        __btrfs_std_error((fs_info), __func__, __LINE__,        \
                          (errno), fmt, ##args);                \
  } while (0)
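
A minimal sketch, not part of this merge, of how the free space tree key types introduced above fit together. It assumes struct btrfs_key and the key-type constants from ctree.h; the helper name is hypothetical.

/*
 * Sketch only: one FREE_SPACE_INFO item describes a block group, and either
 * FREE_SPACE_EXTENT items or a single FREE_SPACE_BITMAP item (with
 * length / sectorsize bits) describe the free ranges inside it.
 */
static void free_space_tree_key_sketch(u64 bg_start, u64 bg_len,
				       u64 free_start, u64 free_len)
{
	struct btrfs_key info_key;
	struct btrfs_key extent_key;

	/* Accounting item: (block_group_start, FREE_SPACE_INFO, block_group_length) */
	info_key.objectid = bg_start;
	info_key.type = BTRFS_FREE_SPACE_INFO_KEY;
	info_key.offset = bg_len;

	/* One free range: (start, FREE_SPACE_EXTENT, length) */
	extent_key.objectid = free_start;
	extent_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
	extent_key.offset = free_len;
}
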
diff --combined fs/btrfs/disk-io.c
index 974be09e7556ca3f342cac89357364f4ce3cc016,af7ac28380c28d1d6b4326f8183573ea48b46782..52e623f598480881706586a75017da1f3a0abd69
@@@ -42,6 -42,7 +42,7 @@@
  #include "locking.h"
  #include "tree-log.h"
  #include "free-space-cache.h"
+ #include "free-space-tree.h"
  #include "inode-map.h"
  #include "check-integrity.h"
  #include "rcu-string.h"
@@@ -319,9 -320,9 +320,9 @@@ static int csum_tree_block(struct btrfs
                        memcpy(&found, result, csum_size);
  
                        read_extent_buffer(buf, &val, 0, csum_size);
 -                      printk_ratelimited(KERN_WARNING
 -                              "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
 -                              "level %d\n",
 +                      btrfs_warn_rl(fs_info,
 +                              "%s checksum verify failed on %llu wanted %X found %X "
 +                              "level %d",
                                fs_info->sb->s_id, buf->start,
                                val, found, btrfs_header_level(buf));
                        if (result != (char *)&inline_result)
@@@ -368,9 -369,9 +369,9 @@@ static int verify_parent_transid(struc
                ret = 0;
                goto out;
        }
 -      printk_ratelimited(KERN_ERR
 -          "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
 -                      eb->fs_info->sb->s_id, eb->start,
 +      btrfs_err_rl(eb->fs_info,
 +              "parent transid verify failed on %llu wanted %llu found %llu",
 +                      eb->start,
                        parent_transid, btrfs_header_generation(eb));
        ret = 1;
  
@@@ -629,14 -630,15 +630,14 @@@ static int btree_readpage_end_io_hook(s
  
        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
 -              printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
 -                             "%llu %llu\n",
 -                             eb->fs_info->sb->s_id, found_start, eb->start);
 +              btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
 +                             found_start, eb->start);
                ret = -EIO;
                goto err;
        }
        if (check_tree_block_fsid(root->fs_info, eb)) {
 -              printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
 -                             eb->fs_info->sb->s_id, eb->start);
 +              btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
 +                             eb->start);
                ret = -EIO;
                goto err;
        }
@@@ -801,9 -803,6 +802,9 @@@ static void run_one_async_done(struct b
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
  
 +      /*
 +       * atomic_dec_return implies a barrier for waitqueue_active
 +       */
        if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);
@@@ -1267,7 -1266,6 +1268,7 @@@ static void __setup_root(u32 nodesize, 
        atomic_set(&root->orphan_inodes, 0);
        atomic_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshoted, 0);
 +      atomic_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@@ -1650,6 -1648,9 +1651,9 @@@ struct btrfs_root *btrfs_get_fs_root(st
        if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
                return fs_info->uuid_root ? fs_info->uuid_root :
                                            ERR_PTR(-ENOENT);
+       if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+               return fs_info->free_space_root ? fs_info->free_space_root :
+                                                 ERR_PTR(-ENOENT);
  again:
        root = btrfs_lookup_fs_root(fs_info, location->objectid);
        if (root) {
@@@ -1762,7 -1763,6 +1766,7 @@@ static int cleaner_kthread(void *arg
        int again;
        struct btrfs_trans_handle *trans;
  
 +      set_freezable();
        do {
                again = 0;
  
@@@ -2148,6 -2148,7 +2152,7 @@@ static void free_root_pointers(struct b
        free_root_extent_buffers(info->uuid_root);
        if (chunk_root)
                free_root_extent_buffers(info->chunk_root);
+       free_root_extent_buffers(info->free_space_root);
  }
  
  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@@ -2352,7 -2353,8 +2357,7 @@@ static int btrfs_replay_log(struct btrf
        u64 bytenr = btrfs_super_log_root(disk_super);
  
        if (fs_devices->rw_devices == 0) {
 -              printk(KERN_WARNING "BTRFS: log replay required "
 -                     "on RO media\n");
 +              btrfs_warn(fs_info, "log replay required on RO media");
                return -EIO;
        }
  
        log_tree_root->node = read_tree_block(tree_root, bytenr,
                        fs_info->generation + 1);
        if (IS_ERR(log_tree_root->node)) {
 -              printk(KERN_ERR "BTRFS: failed to read log tree\n");
 +              btrfs_warn(fs_info, "failed to read log tree");
                ret = PTR_ERR(log_tree_root->node);
                kfree(log_tree_root);
                return ret;
        } else if (!extent_buffer_uptodate(log_tree_root->node)) {
 -              printk(KERN_ERR "BTRFS: failed to read log tree\n");
 +              btrfs_err(fs_info, "failed to read log tree");
                free_extent_buffer(log_tree_root->node);
                kfree(log_tree_root);
                return -EIO;
        /* returns with log_tree_root freed on success */
        ret = btrfs_recover_log_trees(log_tree_root);
        if (ret) {
 -              btrfs_error(tree_root->fs_info, ret,
 +              btrfs_std_error(tree_root->fs_info, ret,
                            "Failed to recover log tree");
                free_extent_buffer(log_tree_root->node);
                kfree(log_tree_root);
@@@ -2448,6 -2450,15 +2453,15 @@@ static int btrfs_read_roots(struct btrf
                fs_info->uuid_root = root;
        }
  
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+               root = btrfs_read_tree_root(tree_root, &location);
+               if (IS_ERR(root))
+                       return PTR_ERR(root);
+               set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+               fs_info->free_space_root = root;
+       }
        return 0;
  }
  
@@@ -2575,7 -2586,7 +2589,7 @@@ int open_ctree(struct super_block *sb
        fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
        fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
        /* readahead state */
 -      INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 +      INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        spin_lock_init(&fs_info->reada_lock);
  
        fs_info->thread_pool_size = min_t(unsigned long,
         * Read super block and check the signature bytes only
         */
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 -      if (!bh) {
 -              err = -EINVAL;
 +      if (IS_ERR(bh)) {
 +              err = PTR_ERR(bh);
                goto fail_alloc;
        }
  
@@@ -2940,7 -2951,7 +2954,7 @@@ retry_root_backup
                goto fail_fsdev_sysfs;
        }
  
 -      ret = btrfs_sysfs_add_one(fs_info);
 +      ret = btrfs_sysfs_add_mounted(fs_info);
        if (ret) {
                pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
                goto fail_fsdev_sysfs;
  
        btrfs_qgroup_rescan_resume(fs_info);
  
+       if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+           btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               pr_info("BTRFS: clearing free space tree\n");
+               ret = btrfs_clear_free_space_tree(fs_info);
+               if (ret) {
+                       pr_warn("BTRFS: failed to clear free space tree %d\n",
+                               ret);
+                       close_ctree(tree_root);
+                       return ret;
+               }
+       }
+       if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+           !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               pr_info("BTRFS: creating free space tree\n");
+               ret = btrfs_create_free_space_tree(fs_info);
+               if (ret) {
+                       pr_warn("BTRFS: failed to create free space tree %d\n",
+                               ret);
+                       close_ctree(tree_root);
+                       return ret;
+               }
+       }
        if (!fs_info->uuid_root) {
                pr_info("BTRFS: creating UUID tree\n");
                ret = btrfs_create_uuid_tree(fs_info);
@@@ -3120,7 -3155,7 +3158,7 @@@ fail_cleaner
        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
  
  fail_sysfs:
 -      btrfs_sysfs_remove_one(fs_info);
 +      btrfs_sysfs_remove_mounted(fs_info);
  
  fail_fsdev_sysfs:
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@@ -3182,8 -3217,8 +3220,8 @@@ static void btrfs_end_buffer_write_sync
                struct btrfs_device *device = (struct btrfs_device *)
                        bh->b_private;
  
 -              printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
 -                                        "I/O error on %s\n",
 +              btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
 +                              "lost page write due to IO error on %s",
                                          rcu_str_deref(device->name));
                /* note, we don't set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
        put_bh(bh);
  }
  
 +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
 +                      struct buffer_head **bh_ret)
 +{
 +      struct buffer_head *bh;
 +      struct btrfs_super_block *super;
 +      u64 bytenr;
 +
 +      bytenr = btrfs_sb_offset(copy_num);
 +      if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
 +              return -EINVAL;
 +
 +      bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
 +      /*
 +       * If we fail to read from the underlying devices, as of now
 +       * the best option we have is to mark it EIO.
 +       */
 +      if (!bh)
 +              return -EIO;
 +
 +      super = (struct btrfs_super_block *)bh->b_data;
 +      if (btrfs_super_bytenr(super) != bytenr ||
 +                  btrfs_super_magic(super) != BTRFS_MAGIC) {
 +              brelse(bh);
 +              return -EINVAL;
 +      }
 +
 +      *bh_ret = bh;
 +      return 0;
 +}
 +
 +
  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
  {
        struct buffer_head *bh;
        struct btrfs_super_block *super;
        int i;
        u64 transid = 0;
 -      u64 bytenr;
 +      int ret = -EINVAL;
  
        /* we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        for (i = 0; i < 1; i++) {
 -              bytenr = btrfs_sb_offset(i);
 -              if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 -                                      i_size_read(bdev->bd_inode))
 -                      break;
 -              bh = __bread(bdev, bytenr / 4096,
 -                                      BTRFS_SUPER_INFO_SIZE);
 -              if (!bh)
 +              ret = btrfs_read_dev_one_super(bdev, i, &bh);
 +              if (ret)
                        continue;
  
                super = (struct btrfs_super_block *)bh->b_data;
 -              if (btrfs_super_bytenr(super) != bytenr ||
 -                  btrfs_super_magic(super) != BTRFS_MAGIC) {
 -                      brelse(bh);
 -                      continue;
 -              }
  
                if (!latest || btrfs_super_generation(super) > transid) {
                        brelse(latest);
                        brelse(bh);
                }
        }
 +
 +      if (!latest)
 +              return ERR_PTR(ret);
 +
        return latest;
  }
  
@@@ -3327,9 -3337,8 +3365,9 @@@ static int write_dev_supers(struct btrf
                        bh = __getblk(device->bdev, bytenr / 4096,
                                      BTRFS_SUPER_INFO_SIZE);
                        if (!bh) {
 -                              printk(KERN_ERR "BTRFS: couldn't get super "
 -                                     "buffer head for bytenr %Lu\n", bytenr);
 +                              btrfs_err(device->dev_root->fs_info,
 +                                  "couldn't get super buffer head for bytenr %llu",
 +                                  bytenr);
                                errors++;
                                continue;
                        }
@@@ -3478,31 -3487,22 +3516,31 @@@ static int barrier_all_devices(struct b
  
  int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
  {
 -      if ((flags & (BTRFS_BLOCK_GROUP_DUP |
 -                    BTRFS_BLOCK_GROUP_RAID0 |
 -                    BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
 -          ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
 -              return 0;
 +      int raid_type;
 +      int min_tolerated = INT_MAX;
  
 -      if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
 -                   BTRFS_BLOCK_GROUP_RAID5 |
 -                   BTRFS_BLOCK_GROUP_RAID10))
 -              return 1;
 +      if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
 +          (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
 +              min_tolerated = min(min_tolerated,
 +                                  btrfs_raid_array[BTRFS_RAID_SINGLE].
 +                                  tolerated_failures);
  
 -      if (flags & BTRFS_BLOCK_GROUP_RAID6)
 -              return 2;
 +      for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 +              if (raid_type == BTRFS_RAID_SINGLE)
 +                      continue;
 +              if (!(flags & btrfs_raid_group[raid_type]))
 +                      continue;
 +              min_tolerated = min(min_tolerated,
 +                                  btrfs_raid_array[raid_type].
 +                                  tolerated_failures);
 +      }
  
 -      pr_warn("BTRFS: unknown raid type: %llu\n", flags);
 -      return 0;
 +      if (min_tolerated == INT_MAX) {
 +              pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
 +              min_tolerated = 0;
 +      }
 +
 +      return min_tolerated;
  }
  
  int btrfs_calc_num_tolerated_disk_barrier_failures(
@@@ -3586,7 -3586,7 +3624,7 @@@ static int write_all_supers(struct btrf
                if (ret) {
                        mutex_unlock(
                                &root->fs_info->fs_devices->device_list_mutex);
 -                      btrfs_error(root->fs_info, ret,
 +                      btrfs_std_error(root->fs_info, ret,
                                    "errors while submitting device barriers.");
                        return ret;
                }
                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
                /* FUA is masked off if unsupported and can't be the reason */
 -              btrfs_error(root->fs_info, -EIO,
 +              btrfs_std_error(root->fs_info, -EIO,
                            "%d errors while writing supers", total_errors);
                return -EIO;
        }
        }
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        if (total_errors > max_errors) {
 -              btrfs_error(root->fs_info, -EIO,
 +              btrfs_std_error(root->fs_info, -EIO,
                            "%d errors while writing supers", total_errors);
                return -EIO;
        }
@@@ -3780,9 -3780,6 +3818,9 @@@ void close_ctree(struct btrfs_root *roo
        fs_info->closing = 1;
        smp_mb();
  
 +      /* wait for the qgroup rescan worker to stop */
 +      btrfs_qgroup_wait_for_completion(fs_info);
 +
        /* wait for the uuid_scan task to finish */
        down(&fs_info->uuid_tree_rescan_sem);
        /* avoid complains from lockdep et al., set sem back to initial state */
                       percpu_counter_sum(&fs_info->delalloc_bytes));
        }
  
 -      btrfs_sysfs_remove_one(fs_info);
 +      btrfs_sysfs_remove_mounted(fs_info);
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
  
        btrfs_free_fs_roots(fs_info);
@@@ -4331,6 -4328,25 +4369,6 @@@ again
        return 0;
  }
  
 -static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
 -                                     struct btrfs_fs_info *fs_info)
 -{
 -      struct btrfs_ordered_extent *ordered;
 -
 -      spin_lock(&fs_info->trans_lock);
 -      while (!list_empty(&cur_trans->pending_ordered)) {
 -              ordered = list_first_entry(&cur_trans->pending_ordered,
 -                                         struct btrfs_ordered_extent,
 -                                         trans_list);
 -              list_del_init(&ordered->trans_list);
 -              spin_unlock(&fs_info->trans_lock);
 -
 -              btrfs_put_ordered_extent(ordered);
 -              spin_lock(&fs_info->trans_lock);
 -      }
 -      spin_unlock(&fs_info->trans_lock);
 -}
 -
  void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
  {
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
  
 -      btrfs_free_pending_ordered(cur_trans, root->fs_info);
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
  
diff --combined fs/btrfs/extent-tree.c
index 4b89680a192338c7a70a4c909e3c4374abe41284,a4a4f593ec71d34bb988dcfa430db208866ca2c5..8abb344e3dcb003ec5a3410e9cbbd23073e49db9
@@@ -33,6 -33,7 +33,7 @@@
  #include "raid56.h"
  #include "locking.h"
  #include "free-space-cache.h"
+ #include "free-space-tree.h"
  #include "math.h"
  #include "sysfs.h"
  #include "qgroup.h"
@@@ -95,7 -96,8 +96,7 @@@ static int alloc_reserved_tree_block(st
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
 -                                   int level, struct btrfs_key *ins,
 -                                   int no_quota);
 +                                   int level, struct btrfs_key *ins);
  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
@@@ -124,7 -126,7 +125,7 @@@ static int block_group_bits(struct btrf
        return (cache->flags & bits) == bits;
  }
  
 -static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 +void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  {
        atomic_inc(&cache->count);
  }
@@@ -331,34 -333,13 +332,34 @@@ static void put_caching_control(struct 
                kfree(ctl);
  }
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +static void fragment_free_space(struct btrfs_root *root,
 +                              struct btrfs_block_group_cache *block_group)
 +{
 +      u64 start = block_group->key.objectid;
 +      u64 len = block_group->key.offset;
 +      u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 +              root->nodesize : root->sectorsize;
 +      u64 step = chunk << 1;
 +
 +      while (len > chunk) {
 +              btrfs_remove_free_space(block_group, start, chunk);
 +              start += step;
 +              if (len < step)
 +                      len = 0;
 +              else
 +                      len -= step;
 +      }
 +}
 +#endif
 +
  /*
   * this is only called by cache_block_group, since we could have freed extents
   * we need to check the pinned_extents for any extents that can't be used yet
   * since their free space will be released as soon as the transaction commits.
   */
- static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-                             struct btrfs_fs_info *info, u64 start, u64 end)
+ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+                      struct btrfs_fs_info *info, u64 start, u64 end)
  {
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;
        return total_added;
  }
  
- static noinline void caching_thread(struct btrfs_work *work)
+ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
  {
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
-       struct btrfs_caching_control *caching_ctl;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
-       int ret = -ENOMEM;
+       int ret;
 +      bool wakeup = true;
  
-       caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        extent_root = fs_info->extent_root;
  
        path = btrfs_alloc_path();
        if (!path)
-               goto out;
+               return -ENOMEM;
  
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +      /*
 +       * If we're fragmenting we don't want to make anybody think we can
 +       * allocate from this block group until we've had a chance to fragment
 +       * the free space.
 +       */
 +      if (btrfs_should_fragment_free_space(extent_root, block_group))
 +              wakeup = false;
 +#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
- again:
-       mutex_lock(&caching_ctl->mutex);
-       /* need to make sure the commit_root doesn't disappear */
-       down_read(&fs_info->commit_root_sem);
  
  next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
-               goto err;
+               goto out;
  
        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
  
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
 -                              caching_ctl->progress = last;
 +                              if (wakeup)
 +                                      caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
-                               goto again;
+                               mutex_lock(&caching_ctl->mutex);
+                               down_read(&fs_info->commit_root_sem);
+                               goto next;
                        }
  
                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
-                               goto err;
+                               goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
  
 -                      caching_ctl->progress = last;
 +                      if (wakeup)
 +                              caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
                        else
                                last = key.objectid + key.offset;
  
-                       if (total_found > (1024 * 1024 * 2)) {
+                       if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
 -                              wake_up(&caching_ctl->wait);
 +                              if (wakeup)
 +                                      wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
+       caching_ctl->progress = (u64)-1;
+ out:
+       btrfs_free_path(path);
+       return ret;
+ }
+ static noinline void caching_thread(struct btrfs_work *work)
+ {
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_caching_control *caching_ctl;
+       int ret;
+       caching_ctl = container_of(work, struct btrfs_caching_control, work);
+       block_group = caching_ctl->block_group;
+       fs_info = block_group->fs_info;
+       mutex_lock(&caching_ctl->mutex);
+       down_read(&fs_info->commit_root_sem);
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+               ret = load_free_space_tree(caching_ctl);
+       else
+               ret = load_extent_tree_free(caching_ctl);
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
-       block_group->cached = BTRFS_CACHE_FINISHED;
+       block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
  
- err:
-       btrfs_free_path(path);
-       up_read(&fs_info->commit_root_sem);
-       free_excluded_extents(extent_root, block_group);
 +#ifdef CONFIG_BTRFS_DEBUG
 +      if (btrfs_should_fragment_free_space(extent_root, block_group)) {
 +              u64 bytes_used;
 +
 +              spin_lock(&block_group->space_info->lock);
 +              spin_lock(&block_group->lock);
 +              bytes_used = block_group->key.offset -
 +                      btrfs_block_group_used(&block_group->item);
 +              block_group->space_info->bytes_used += bytes_used >> 1;
 +              spin_unlock(&block_group->lock);
 +              spin_unlock(&block_group->space_info->lock);
 +              fragment_free_space(extent_root, block_group);
 +      }
 +#endif
 +
 +      caching_ctl->progress = (u64)-1;
 +
+       up_read(&fs_info->commit_root_sem);
+       free_excluded_extents(fs_info->extent_root, block_group);
        mutex_unlock(&caching_ctl->mutex);
- out:
-       if (ret) {
-               spin_lock(&block_group->lock);
-               block_group->caching_ctl = NULL;
-               block_group->cached = BTRFS_CACHE_ERROR;
-               spin_unlock(&block_group->lock);
-       }
        wake_up(&caching_ctl->wait);
  
        put_caching_control(caching_ctl);
@@@ -654,22 -618,6 +668,22 @@@ static int cache_block_group(struct btr
                        }
                }
                spin_unlock(&cache->lock);
 +#ifdef CONFIG_BTRFS_DEBUG
 +              if (ret == 1 &&
 +                  btrfs_should_fragment_free_space(fs_info->extent_root,
 +                                                   cache)) {
 +                      u64 bytes_used;
 +
 +                      spin_lock(&cache->space_info->lock);
 +                      spin_lock(&cache->lock);
 +                      bytes_used = cache->key.offset -
 +                              btrfs_block_group_used(&cache->item);
 +                      cache->space_info->bytes_used += bytes_used >> 1;
 +                      spin_unlock(&cache->lock);
 +                      spin_unlock(&cache->space_info->lock);
 +                      fragment_free_space(fs_info->extent_root, cache);
 +              }
 +#endif
                mutex_unlock(&caching_ctl->mutex);
  
                wake_up(&caching_ctl->wait);
                }
        } else {
                /*
-                * We are not going to do the fast caching, set cached to the
-                * appropriate value and wakeup any waiters.
+                * We're either using the free space tree or no caching at all.
+                * Set cached to the appropriate value and wakeup any waiters.
                 */
                spin_lock(&cache->lock);
                if (load_cache_only) {
@@@ -2072,7 -2020,8 +2086,7 @@@ int btrfs_discard_extent(struct btrfs_r
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
 -                       u64 root_objectid, u64 owner, u64 offset,
 -                       int no_quota)
 +                       u64 root_objectid, u64 owner, u64 offset)
  {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
 -                                      BTRFS_ADD_DELAYED_REF, NULL, no_quota);
 +                                      BTRFS_ADD_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
 -                                      num_bytes,
 -                                      parent, root_objectid, owner, offset,
 -                                      BTRFS_ADD_DELAYED_REF, NULL, no_quota);
 +                                      num_bytes, parent, root_objectid,
 +                                      owner, offset, 0,
 +                                      BTRFS_ADD_DELAYED_REF, NULL);
        }
        return ret;
  }
@@@ -2110,11 -2059,15 +2124,11 @@@ static int __btrfs_inc_extent_ref(struc
        u64 num_bytes = node->num_bytes;
        u64 refs;
        int ret;
 -      int no_quota = node->no_quota;
  
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
  
 -      if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
 -              no_quota = 1;
 -
        path->reada = 1;
        path->leave_spinning = 1;
        /* this will setup the path even if it fails to insert the back ref */
@@@ -2349,7 -2302,8 +2363,7 @@@ static int run_delayed_tree_ref(struct 
                                                parent, ref_root,
                                                extent_op->flags_to_set,
                                                &extent_op->key,
 -                                              ref->level, &ins,
 -                                              node->no_quota);
 +                                              ref->level, &ins);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, root, node,
                                             parent, ref_root,
@@@ -2402,11 -2356,6 +2416,11 @@@ static int run_one_delayed_ref(struct b
                                                      node->num_bytes);
                        }
                }
 +
 +              /* Also free its reserved qgroup space */
 +              btrfs_qgroup_free_delayed_ref(root->fs_info,
 +                                            head->qgroup_ref_root,
 +                                            head->qgroup_reserved);
                return ret;
        }
  
@@@ -2495,21 -2444,7 +2509,21 @@@ static noinline int __btrfs_run_delayed
                        }
                }
  
 +              /*
 +               * We need to try and merge add/drops of the same ref since we
 +               * can run into issues with relocate dropping the implicit ref
 +               * and then it being added back again before the drop can
 +               * finish.  If we merged anything we need to re-loop so we can
 +               * get a good ref.
 +               * Or we can get node references of the same type that weren't
 +               * merged when created due to bumps in the tree mod seq, and
 +               * we need to merge them to prevent adding an inline extent
 +               * backref before dropping it (triggering a BUG_ON at
 +               * insert_inline_extent_backref()).
 +               */
                spin_lock(&locked_ref->lock);
 +              btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
 +                                       locked_ref);
  
                /*
                 * locked_ref is the head node, so we have to go one
@@@ -3185,7 -3120,7 +3199,7 @@@ static int __btrfs_mod_ref(struct btrfs
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 -                          u64, u64, u64, u64, u64, u64, int);
 +                          u64, u64, u64, u64, u64, u64);
  
  
        if (btrfs_test_is_dummy_root(root))
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
 -                                         key.offset, 1);
 +                                         key.offset);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = root->nodesize;
                        ret = process_func(trans, root, bytenr, num_bytes,
 -                                         parent, ref_root, level - 1, 0,
 -                                         1);
 +                                         parent, ref_root, level - 1, 0);
                        if (ret)
                                goto fail;
                }
@@@ -3413,15 -3349,6 +3427,15 @@@ again
        }
        spin_unlock(&block_group->lock);
  
 +      /*
 +       * We hit an ENOSPC when setting up the cache in this transaction, so
 +       * just skip doing the setup; we've already cleared the cache so we're safe.
 +       */
 +      if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
 +              ret = -ENOSPC;
 +              goto out_put;
 +      }
 +
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
  
 -      ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
 +      ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
  
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
 +      /*
 +       * Our cache requires contiguous chunks so that we don't modify a bunch
 +       * of metadata or split extents when writing the cache out, which means
 +       * we can hit ENOSPC if we are heavily fragmented in addition to normal
 +       * out of space conditions.  So if we hit this, just skip setting up any
 +       * other block groups for this transaction; maybe we'll unpin enough
 +       * space the next time around.
 +       */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
 -      btrfs_free_reserved_data_space(inode, num_pages);
 +      else if (ret == -ENOSPC)
 +              set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
 +      btrfs_free_reserved_data_space(inode, 0, num_pages);
  
  out_put:
        iput(inode);
@@@ -3840,7 -3757,6 +3854,7 @@@ static int update_space_info(struct btr
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
 +      found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@@ -3917,8 -3833,7 +3931,8 @@@ static u64 btrfs_reduce_alloc_profile(s
  {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
        u64 target;
 -      u64 tmp;
 +      u64 raid_type;
 +      u64 allowed = 0;
  
        /*
         * see if restripe for this chunk_type is in progress, if so
        spin_unlock(&root->fs_info->balance_lock);
  
        /* First, mask out the RAID levels which aren't possible */
 -      if (num_devices == 1)
 -              flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
 -                         BTRFS_BLOCK_GROUP_RAID5);
 -      if (num_devices < 3)
 -              flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 -      if (num_devices < 4)
 -              flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 -
 -      tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
 -                     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
 -                     BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
 -      flags &= ~tmp;
 -
 -      if (tmp & BTRFS_BLOCK_GROUP_RAID6)
 -              tmp = BTRFS_BLOCK_GROUP_RAID6;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
 -              tmp = BTRFS_BLOCK_GROUP_RAID5;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
 -              tmp = BTRFS_BLOCK_GROUP_RAID10;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
 -              tmp = BTRFS_BLOCK_GROUP_RAID1;
 -      else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
 -              tmp = BTRFS_BLOCK_GROUP_RAID0;
 -
 -      return extended_to_chunk(flags | tmp);
 +      for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 +              if (num_devices >= btrfs_raid_array[raid_type].devs_min)
 +                      allowed |= btrfs_raid_group[raid_type];
 +      }
 +      allowed &= flags;
 +
 +      if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 +              allowed = BTRFS_BLOCK_GROUP_RAID6;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 +              allowed = BTRFS_BLOCK_GROUP_RAID5;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 +              allowed = BTRFS_BLOCK_GROUP_RAID10;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 +              allowed = BTRFS_BLOCK_GROUP_RAID1;
 +      else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 +              allowed = BTRFS_BLOCK_GROUP_RAID0;
 +
 +      flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
 +
 +      return extended_to_chunk(flags | allowed);
  }
  
  static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@@ -3994,7 -3914,11 +4008,7 @@@ u64 btrfs_get_alloc_profile(struct btrf
        return ret;
  }
  
 -/*
 - * This will check the space that the inode allocates from to make sure we have
 - * enough space for bytes.
 - */
 -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
  {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@@ -4093,8 -4017,7 +4107,8 @@@ commit_trans
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
 -                          trans->transaction->have_free_bgs ||
 +                          test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
 +                                   &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
 -      ret = btrfs_qgroup_reserve(root, write_bytes);
 -      if (ret)
 -              goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
 -out:
        spin_unlock(&data_sinfo->lock);
  
        return ret;
  }
  
  /*
 - * Called if we need to clear a data reservation for this inode.
 + * New check_data_free_space() with the ability to do precise data reservation.
 + * It will replace the old btrfs_check_data_free_space(), but to keep the
 + * patch series split, the new function is added first and the old one is
 + * replaced later.
   */
 -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 +{
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      int ret;
 +
 +      /* align the range */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
 +
 +      ret = btrfs_alloc_data_chunk_ondemand(inode, len);
 +      if (ret < 0)
 +              return ret;
 +
 +      /*
 +       * Use the new btrfs_qgroup_reserve_data() to reserve precise data space
 +       *
 +       * TODO: Find a good method to avoid reserving data space for a NOCOW
 +       * range without impacting performance when quotas are disabled.
 +       */
 +      ret = btrfs_qgroup_reserve_data(inode, start, len);
 +      return ret;
 +}
 +
 +/*
 + * Called if we need to clear a data reservation for this inode
 + * Normally in an error case.
 + *
 + * This one will *NOT* use the accurate qgroup reserved space API; it is only
 + * for cases where we can't sleep and are sure it won't affect qgroup
 + * reserved space.
 + * Like clear_bit_hook().
 + */
 +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 +                                          u64 len)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
  
 -      /* make sure bytes are sectorsize aligned */
 -      bytes = ALIGN(bytes, root->sectorsize);
 +      /* Make sure the range is aligned to sectorsize */
 +      len = round_up(start + len, root->sectorsize) -
 +            round_down(start, root->sectorsize);
 +      start = round_down(start, root->sectorsize);
  
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
 -      WARN_ON(data_sinfo->bytes_may_use < bytes);
 -      data_sinfo->bytes_may_use -= bytes;
 +      if (WARN_ON(data_sinfo->bytes_may_use < len))
 +              data_sinfo->bytes_may_use = 0;
 +      else
 +              data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
 -                                    data_sinfo->flags, bytes, 0);
 +                                    data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
  }
  
 +/*
 + * Called if we need to clear a data reservation for this inode
 + * Normally in an error case.
 + *
 + * This one will handle the per-inode data rsv map for the accurate reserved
 + * space framework.
 + */
 +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 +{
 +      btrfs_free_reserved_data_space_noquota(inode, start, len);
 +      btrfs_qgroup_free_data(inode, start, len);
 +}
 +
  static void force_metadata_allocation(struct btrfs_fs_info *info)
  {
        struct list_head *head = &info->space_info;
@@@ -5027,9 -4902,13 +5041,9 @@@ static struct btrfs_block_rsv *get_bloc
  {
        struct btrfs_block_rsv *block_rsv = NULL;
  
 -      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 -              block_rsv = trans->block_rsv;
 -
 -      if (root == root->fs_info->csum_root && trans->adding_csums)
 -              block_rsv = trans->block_rsv;
 -
 -      if (root == root->fs_info->uuid_root)
 +      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 +          (root == root->fs_info->csum_root && trans->adding_csums) ||
 +           (root == root->fs_info->uuid_root))
                block_rsv = trans->block_rsv;
  
        if (!block_rsv)
@@@ -5472,7 -5351,7 +5486,7 @@@ int btrfs_subvolume_reserve_metadata(st
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
 -              ret = btrfs_qgroup_reserve(root, num_bytes);
 +              ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
  
 -      if (ret) {
 -              if (*qgroup_reserved)
 -                      btrfs_qgroup_free(root, *qgroup_reserved);
 -      }
 +      if (ret && *qgroup_reserved)
 +              btrfs_qgroup_free_meta(root, *qgroup_reserved);
  
        return ret;
  }
@@@ -5652,15 -5533,15 +5666,15 @@@ int btrfs_delalloc_reserve_metadata(str
        spin_unlock(&BTRFS_I(inode)->lock);
  
        if (root->fs_info->quota_enabled) {
 -              ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
 +              ret = btrfs_qgroup_reserve_meta(root,
 +                              nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
  
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
 -              if (root->fs_info->quota_enabled)
 -                      btrfs_qgroup_free(root, nr_extents * root->nodesize);
 +              btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
  
@@@ -5783,48 -5664,41 +5797,48 @@@ void btrfs_delalloc_release_metadata(st
  }
  
  /**
 - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
 + * btrfs_delalloc_reserve_space - reserve data and metadata space for
 + * delalloc
   * @inode: inode we're writing to
 - * @num_bytes: the number of bytes we want to allocate
 + * @start: start of the range we are writing to
 + * @len: length of the range we are writing to
 + *
 + * TODO: This function will finally replace the old btrfs_delalloc_reserve_space()
   *
   * This will do the following things
   *
 - * o reserve space in the data space info for num_bytes
 - * o reserve space in the metadata space info based on number of outstanding
 + * o reserve space in the data space info for len bytes
 + *   and reserve the corresponding precise qgroup space
 + *   (Done in check_data_free_space)
 + *
 + * o reserve space in the metadata space info, based on the number of outstanding
   *   extents and how much csums will be needed
 - * o add to the inodes ->delalloc_bytes
 + *   also reserve metadata space in a per-root over-reserve manner.
 + * o add to the inode's ->delalloc_bytes
   * o add it to the fs_info's delalloc inodes list.
 + *   (Above 3 all done in delalloc_reserve_metadata)
   *
 - * This will return 0 for success and -ENOSPC if there is no space left.
 + * Return 0 for success.
 + * Return <0 for error (-ENOSPC or -EDQUOT).
   */
 -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
  {
        int ret;
  
 -      ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
 -      if (ret)
 -              return ret;
 -
 -      ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
 -      if (ret) {
 -              btrfs_free_reserved_data_space(inode, num_bytes);
 +      ret = btrfs_check_data_free_space(inode, start, len);
 +      if (ret < 0)
                return ret;
 -      }
 -
 -      return 0;
 +      ret = btrfs_delalloc_reserve_metadata(inode, len);
 +      if (ret < 0)
 +              btrfs_free_reserved_data_space(inode, start, len);
 +      return ret;
  }
  
  /**
   * btrfs_delalloc_release_space - release data and metadata space for delalloc
   * @inode: inode we're releasing space for
 - * @num_bytes: the number of bytes we want to free up
 + * @start: start position of the space already reserved
 + * @len: the len of the space already reserved
   *
   * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
   * called in the case that we don't need the metadata AND data reservations
   * This function will release the metadata space that was not used and will
   * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
   * list if there are no delalloc bytes left.
 + * Also it will handle the qgroup reserved space.
   */
 -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
  {
 -      btrfs_delalloc_release_metadata(inode, num_bytes);
 -      btrfs_free_reserved_data_space(inode, num_bytes);
 +      btrfs_delalloc_release_metadata(inode, len);
 +      btrfs_free_reserved_data_space(inode, start, len);
  }
  
  static int update_block_group(struct btrfs_trans_handle *trans,
                        set_extent_dirty(info->pinned_extents,
                                         bytenr, bytenr + num_bytes - 1,
                                         GFP_NOFS | __GFP_NOFAIL);
 -                      /*
 -                       * No longer have used bytes in this block group, queue
 -                       * it for deletion.
 -                       */
 -                      if (old_val == 0) {
 -                              spin_lock(&info->unused_bgs_lock);
 -                              if (list_empty(&cache->bg_list)) {
 -                                      btrfs_get_block_group(cache);
 -                                      list_add_tail(&cache->bg_list,
 -                                                    &info->unused_bgs);
 -                              }
 -                              spin_unlock(&info->unused_bgs_lock);
 -                      }
                }
  
                spin_lock(&trans->transaction->dirty_bgs_lock);
                }
                spin_unlock(&trans->transaction->dirty_bgs_lock);
  
 +              /*
 +               * No longer have used bytes in this block group, queue it for
 +               * deletion. We do this after adding the block group to the
 +               * dirty list to avoid races between cleaner kthread and space
 +               * cache writeout.
 +               */
 +              if (!alloc && old_val == 0) {
 +                      spin_lock(&info->unused_bgs_lock);
 +                      if (list_empty(&cache->bg_list)) {
 +                              btrfs_get_block_group(cache);
 +                              list_add_tail(&cache->bg_list,
 +                                            &info->unused_bgs);
 +                      }
 +                      spin_unlock(&info->unused_bgs_lock);
 +              }
 +
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@@ -6206,34 -6076,6 +6220,34 @@@ void btrfs_prepare_extent_commit(struc
        update_global_block_rsv(fs_info);
  }
  
 +/*
 + * Returns the free cluster for the given space info and sets empty_cluster to
 + * what it should be based on the mount options.
 + */
 +static struct btrfs_free_cluster *
 +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
 +                 u64 *empty_cluster)
 +{
 +      struct btrfs_free_cluster *ret = NULL;
 +      bool ssd = btrfs_test_opt(root, SSD);
 +
 +      *empty_cluster = 0;
 +      if (btrfs_mixed_space_info(space_info))
 +              return ret;
 +
 +      if (ssd)
 +              *empty_cluster = 2 * 1024 * 1024;
 +      if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 +              ret = &root->fs_info->meta_alloc_cluster;
 +              if (!ssd)
 +                      *empty_cluster = 64 * 1024;
 +      } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
 +              ret = &root->fs_info->data_alloc_cluster;
 +      }
 +
 +      return ret;
 +}
 +
  static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                              const bool return_free_space)
  {
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 +      struct btrfs_free_cluster *cluster = NULL;
        u64 len;
 +      u64 total_unpinned = 0;
 +      u64 empty_cluster = 0;
        bool readonly;
  
        while (start <= end) {
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
 +                      total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache); /* Logic error */
 +
 +                      cluster = fetch_cluster_info(root,
 +                                                   cache->space_info,
 +                                                   &empty_cluster);
 +                      empty_cluster <<= 1;
                }
  
                len = cache->key.objectid + cache->key.offset - start;
                }
  
                start += len;
 +              total_unpinned += len;
                space_info = cache->space_info;
  
 +              /*
 +               * If this space cluster has been marked as fragmented and we've
 +               * unpinned enough in this block group to potentially allow a
 +               * cluster to be created inside of it, go ahead and clear the
 +               * fragmented check.
 +               */
 +              if (cluster && cluster->fragmented &&
 +                  total_unpinned > empty_cluster) {
 +                      spin_lock(&cluster->lock);
 +                      cluster->fragmented = 0;
 +                      spin_unlock(&cluster->lock);
 +              }
 +
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
 +              space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@@ -6426,6 -6244,7 +6440,6 @@@ static int __btrfs_free_extent(struct b
        int extent_slot = 0;
        int found_extent = 0;
        int num_to_del = 1;
 -      int no_quota = node->no_quota;
        u32 item_size;
        u64 refs;
        u64 bytenr = node->bytenr;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
  
 -      if (!info->quota_enabled || !is_fstree(root_objectid))
 -              no_quota = 1;
 -
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
                        }
                }
  
+               ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+                                            num_bytes);
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, extent_root, ret);
@@@ -6759,7 -6588,7 +6780,7 @@@ void btrfs_free_tree_block(struct btrfs
                                        buf->start, buf->len,
                                        parent, root->root_key.objectid,
                                        btrfs_header_level(buf),
 -                                      BTRFS_DROP_DELAYED_REF, NULL, 0);
 +                                      BTRFS_DROP_DELAYED_REF, NULL);
                BUG_ON(ret); /* -ENOMEM */
        }
  
@@@ -6807,7 -6636,7 +6828,7 @@@ out
  /* Can return -ENOMEM */
  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 -                    u64 owner, u64 offset, int no_quota)
 +                    u64 owner, u64 offset)
  {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
 -                                      BTRFS_DROP_DELAYED_REF, NULL, no_quota);
 +                                      BTRFS_DROP_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
                                                num_bytes,
                                                parent, root_objectid, owner,
 -                                              offset, BTRFS_DROP_DELAYED_REF,
 -                                              NULL, no_quota);
 +                                              offset, 0,
 +                                              BTRFS_DROP_DELAYED_REF, NULL);
        }
        return ret;
  }
@@@ -7022,7 -6851,7 +7043,7 @@@ static noinline int find_free_extent(st
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
 -      int empty_cluster = 2 * 1024 * 1024;
 +      u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
 +      bool orig_have_caching_bg = false;
 +      bool full_search = false;
  
        WARN_ON(num_bytes < root->sectorsize);
        ins->type = BTRFS_EXTENT_ITEM_KEY;
        }
  
        /*
 -       * If the space info is for both data and metadata it means we have a
 -       * small filesystem and we can't use the clustering stuff.
 +       * If our free space is heavily fragmented we may not be able to make
 +       * big contiguous allocations, so instead of doing the expensive search
 +       * for free space, simply return ENOSPC with our max_extent_size so we
 +       * can go ahead and search for a more manageable chunk.
 +       *
 +       * If our max_extent_size is large enough for our allocation simply
 +       * disable clustering since we will likely not be able to find enough
 +       * space to create a cluster and induce latency trying.
         */
 -      if (btrfs_mixed_space_info(space_info))
 -              use_cluster = false;
 -
 -      if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 -              last_ptr = &root->fs_info->meta_alloc_cluster;
 -              if (!btrfs_test_opt(root, SSD))
 -                      empty_cluster = 64 * 1024;
 -      }
 -
 -      if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
 -          btrfs_test_opt(root, SSD)) {
 -              last_ptr = &root->fs_info->data_alloc_cluster;
 +      if (unlikely(space_info->max_extent_size)) {
 +              spin_lock(&space_info->lock);
 +              if (space_info->max_extent_size &&
 +                  num_bytes > space_info->max_extent_size) {
 +                      ins->offset = space_info->max_extent_size;
 +                      spin_unlock(&space_info->lock);
 +                      return -ENOSPC;
 +              } else if (space_info->max_extent_size) {
 +                      use_cluster = false;
 +              }
 +              spin_unlock(&space_info->lock);
        }
  
 +      last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
                        hint_byte = last_ptr->window_start;
 +              if (last_ptr->fragmented) {
 +                      /*
 +                       * We still set window_start so we can keep track of the
 +                       * last place we found an allocation to try and save
 +                       * some time.
 +                       */
 +                      hint_byte = last_ptr->window_start;
 +                      use_cluster = false;
 +              }
                spin_unlock(&last_ptr->lock);
        }
  
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
 -
 -      if (!last_ptr)
 -              empty_cluster = 0;
 -
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
        }
  search:
        have_caching_bg = false;
 +      if (index == 0 || index == __get_raid_index(flags))
 +              full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
  have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
 +                      have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
                 */
 -              if (last_ptr) {
 +              if (last_ptr && use_cluster) {
                        struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
@@@ -7300,16 -7113,6 +7321,16 @@@ refill_cluster
                }
  
  unclustered_alloc:
 +              /*
 +               * We are doing an unclustered alloc, set the fragmented flag so
 +               * we don't bother trying to set up a cluster again until we get
 +               * more space.
 +               */
 +              if (unlikely(last_ptr)) {
 +                      spin_lock(&last_ptr->lock);
 +                      last_ptr->fragmented = 1;
 +                      spin_unlock(&last_ptr->lock);
 +              }
                spin_lock(&block_group->free_space_ctl->tree_lock);
                if (cached &&
                    block_group->free_space_ctl->free_space <
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
 -                      if (!cached)
 -                              have_caching_bg = true;
                        goto loop;
                }
  checks:
@@@ -7382,10 -7187,6 +7403,10 @@@ loop
        }
        up_read(&space_info->groups_sem);
  
 +      if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
 +              && !orig_have_caching_bg)
 +              orig_have_caching_bg = true;
 +
        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
                goto search;
  
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
 -              loop++;
 +              if (loop == LOOP_CACHING_NOWAIT) {
 +                      /*
 +                       * We want to skip the LOOP_CACHING_WAIT step if we
 +                       * don't have any uncached bgs and we've already done a
 +                       * full search through.
 +                       */
 +                      if (orig_have_caching_bg || !full_search)
 +                              loop = LOOP_CACHING_WAIT;
 +                      else
 +                              loop = LOOP_ALLOC_CHUNK;
 +              } else {
 +                      loop++;
 +              }
 +
                if (loop == LOOP_ALLOC_CHUNK) {
                        struct btrfs_trans_handle *trans;
                        int exist = 0;
  
                        ret = do_chunk_alloc(trans, root, flags,
                                             CHUNK_ALLOC_FORCE);
 +
 +                      /*
 +                       * If we can't allocate a new chunk, we've already looped
 +                       * through at least once, so move on to the NO_EMPTY_SIZE
 +                       * case.
 +                       */
 +                      if (ret == -ENOSPC)
 +                              loop = LOOP_NO_EMPTY_SIZE;
 +
                        /*
                         * Do not bail out on ENOSPC since we
                         * can do more things.
                }
  
                if (loop == LOOP_NO_EMPTY_SIZE) {
 +                      /*
 +                       * Don't loop again if we already have no empty_size and
 +                       * no empty_cluster.
 +                       */
 +                      if (empty_size == 0 &&
 +                          empty_cluster == 0) {
 +                              ret = -ENOSPC;
 +                              goto out;
 +                      }
                        empty_size = 0;
                        empty_cluster = 0;
                }
        } else if (!ins->objectid) {
                ret = -ENOSPC;
        } else if (ins->objectid) {
 +              if (!use_cluster && last_ptr) {
 +                      spin_lock(&last_ptr->lock);
 +                      last_ptr->window_start = ins->objectid;
 +                      spin_unlock(&last_ptr->lock);
 +              }
                ret = 0;
        }
  out:
 -      if (ret == -ENOSPC)
 +      if (ret == -ENOSPC) {
 +              spin_lock(&space_info->lock);
 +              space_info->max_extent_size = max_extent_size;
 +              spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
 +      }
        return ret;
  }
  
@@@ -7537,7 -7298,7 +7558,7 @@@ int btrfs_reserve_extent(struct btrfs_r
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
  {
 -      bool final_tried = false;
 +      bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
  
@@@ -7672,6 -7433,11 +7693,11 @@@ static int alloc_reserved_file_extent(s
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         ins->offset);
+       if (ret)
+               return ret;
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
@@@ -7686,7 -7452,8 +7712,7 @@@ static int alloc_reserved_tree_block(st
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
 -                                   int level, struct btrfs_key *ins,
 -                                   int no_quota)
 +                                   int level, struct btrfs_key *ins)
  {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
  
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         num_bytes);
+       if (ret)
+               return ret;
        ret = update_block_group(trans, root, ins->objectid, root->nodesize,
                                 1);
        if (ret) { /* -ENOENT, logic error */
  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
 -                                   u64 offset, struct btrfs_key *ins)
 +                                   u64 offset, u64 ram_bytes,
 +                                   struct btrfs_key *ins)
  {
        int ret;
  
        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
                                         ins->offset, 0,
                                         root_objectid, owner, offset,
 -                                       BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
 +                                       ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
 +                                       NULL);
        return ret;
  }
  
@@@ -7992,7 -7762,7 +8023,7 @@@ struct extent_buffer *btrfs_alloc_tree_
                                                 ins.objectid, ins.offset,
                                                 parent, root_objectid, level,
                                                 BTRFS_ADD_DELAYED_EXTENT,
 -                                               extent_op, 0);
 +                                               extent_op);
                if (ret)
                        goto out_free_delayed;
        }
@@@ -8108,47 -7878,21 +8139,47 @@@ reada
  }
  
  /*
 - * TODO: Modify related function to add related node/leaf to dirty_extent_root,
 - * for later qgroup accounting.
 - *
 - * Current, this function does nothing.
 + * These may not be seen by the usual inc/dec ref code so we have to
 + * add them here.
   */
 +static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
 +                                   struct btrfs_root *root, u64 bytenr,
 +                                   u64 num_bytes)
 +{
 +      struct btrfs_qgroup_extent_record *qrecord;
 +      struct btrfs_delayed_ref_root *delayed_refs;
 +
 +      qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
 +      if (!qrecord)
 +              return -ENOMEM;
 +
 +      qrecord->bytenr = bytenr;
 +      qrecord->num_bytes = num_bytes;
 +      qrecord->old_roots = NULL;
 +
 +      delayed_refs = &trans->transaction->delayed_refs;
 +      spin_lock(&delayed_refs->lock);
 +      if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
 +              kfree(qrecord);
 +      spin_unlock(&delayed_refs->lock);
 +
 +      return 0;
 +}
 +
  static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
  {
        int nr = btrfs_header_nritems(eb);
 -      int i, extent_type;
 +      int i, extent_type, ret;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        u64 bytenr, num_bytes;
  
 +      /* We can be called directly from walk_up_proc() */
 +      if (!root->fs_info->quota_enabled)
 +              return 0;
 +
        for (i = 0; i < nr; i++) {
                btrfs_item_key_to_cpu(eb, &key, i);
  
                        continue;
  
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
 +
 +              ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
 +              if (ret)
 +                      return ret;
        }
        return 0;
  }
@@@ -8239,6 -7979,8 +8270,6 @@@ static int adjust_slots_upwards(struct 
  
  /*
   * root_eb is the subtree root and is locked before this function is called.
 - * TODO: Modify this function to mark all (including complete shared node)
 - * to dirty_extent_root to allow it get accounted in qgroup.
   */
  static int account_shared_subtree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
@@@ -8316,11 -8058,6 +8347,11 @@@ walk_down
                        btrfs_tree_read_lock(eb);
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 +
 +                      ret = record_one_subtree_extent(trans, root, child_bytenr,
 +                                                      root->nodesize);
 +                      if (ret)
 +                              goto out;
                }
  
                if (level == 0) {
@@@ -8566,15 -8303,14 +8597,15 @@@ skip
                        ret = account_shared_subtree(trans, root, next,
                                                     generation, level - 1);
                        if (ret) {
 -                              printk_ratelimited(KERN_ERR "BTRFS: %s Error "
 +                              btrfs_err_rl(root->fs_info,
 +                                      "Error "
                                        "%d accounting shared subtree. Quota "
 -                                      "is out of sync, rescan required.\n",
 -                                      root->fs_info->sb->s_id, ret);
 +                                      "is out of sync, rescan required.",
 +                                      ret);
                        }
                }
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
 -                              root->root_key.objectid, level - 1, 0, 0);
 +                              root->root_key.objectid, level - 1, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
        btrfs_tree_unlock(next);
@@@ -8659,11 -8395,10 +8690,11 @@@ static noinline int walk_up_proc(struc
                        BUG_ON(ret); /* -ENOMEM */
                        ret = account_leaf_items(trans, root, eb);
                        if (ret) {
 -                              printk_ratelimited(KERN_ERR "BTRFS: %s Error "
 +                              btrfs_err_rl(root->fs_info,
 +                                      "error "
                                        "%d accounting leaf items. Quota "
 -                                      "is out of sync, rescan required.\n",
 -                                      root->fs_info->sb->s_id, ret);
 +                                      "is out of sync, rescan required.",
 +                                      ret);
                        }
                }
                /* make block locked assertion in clean_tree_block happy */
@@@ -8985,7 -8720,7 +9016,7 @@@ out
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
 -              btrfs_std_error(root->fs_info, err);
 +              btrfs_std_error(root->fs_info, err, NULL);
        return err;
  }
  
@@@ -9173,7 -8908,7 +9204,7 @@@ again
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
 -      if (trans->transaction->dirty_bg_run) {
 +      if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
  
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@@ -9656,6 -9391,8 +9687,8 @@@ btrfs_create_block_group_cache(struct b
        cache->full_stripe_len = btrfs_full_stripe_len(root,
                                               &root->fs_info->mapping_tree,
                                               start);
+       set_free_space_tree_thresholds(cache);
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        init_rwsem(&cache->data_rwsem);
        INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
+       mutex_init(&cache->free_space_lock);
  
        return cache;
  }
@@@ -9877,6 -9615,8 +9911,8 @@@ void btrfs_create_pending_block_groups(
                                               key.objectid, key.offset);
                if (ret)
                        btrfs_abort_transaction(trans, extent_root, ret);
+               add_block_group_free_space(trans, root->fs_info, block_group);
+               /* already aborted the transaction if it failed. */
  next:
                list_del_init(&block_group->bg_list);
        }
@@@ -9907,6 -9647,7 +9943,7 @@@ int btrfs_make_block_group(struct btrfs
        cache->flags = type;
        cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
+       cache->needs_free_space = 1;
        ret = exclude_super_stripes(root, cache);
        if (ret) {
                /*
  
        free_excluded_extents(root, cache);
  
 +#ifdef CONFIG_BTRFS_DEBUG
 +      if (btrfs_should_fragment_free_space(root, cache)) {
 +              u64 new_bytes_used = size - bytes_used;
 +
 +              bytes_used += new_bytes_used >> 1;
 +              fragment_free_space(root, cache);
 +      }
 +#endif
        /*
         * Call to ensure the corresponding space_info object is created and
         * assigned to our block group, but don't update its counters just yet.
@@@ -10277,6 -10010,10 +10314,10 @@@ int btrfs_remove_block_group(struct btr
  
        unlock_chunks(root);
  
+       ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+       if (ret)
+               goto out;
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
  
        return ret;
  }
  
 +struct btrfs_trans_handle *
 +btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
 +                                   const u64 chunk_offset)
 +{
 +      struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
 +      struct extent_map *em;
 +      struct map_lookup *map;
 +      unsigned int num_items;
 +
 +      read_lock(&em_tree->lock);
 +      em = lookup_extent_mapping(em_tree, chunk_offset, 1);
 +      read_unlock(&em_tree->lock);
 +      ASSERT(em && em->start == chunk_offset);
 +
 +      /*
 +       * We need to reserve 3 + N units from the metadata space info in order
 +       * to remove a block group (done at btrfs_remove_chunk() and at
 +       * btrfs_remove_block_group()), which are used for:
 +       *
 +       * 1 unit for adding the free space inode's orphan (located in the tree
 +       * of tree roots).
 +       * 1 unit for deleting the block group item (located in the extent
 +       * tree).
 +       * 1 unit for deleting the free space item (located in tree of tree
 +       * roots).
 +       * N units for deleting N device extent items corresponding to each
 +       * stripe (located in the device tree).
 +       *
 +       * In order to remove a block group we also need to reserve units in the
 +       * system space info in order to update the chunk tree (update one or
 +       * more device items and remove one chunk item), but this is done at
 +       * btrfs_remove_chunk() through a call to check_system_chunk().
 +       */
 +      map = (struct map_lookup *)em->bdev;
 +      num_items = 3 + map->num_stripes;
 +      free_extent_map(em);
 +
 +      return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
 +                                                         num_items, 1);
 +}
 +
  /*
   * Process the unused_bgs list and remove any that don't have any allocated
   * space inside of them.
@@@ -10356,25 -10052,22 +10397,25 @@@ void btrfs_delete_unused_bgs(struct btr
                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group_cache,
                                               bg_list);
 -              space_info = block_group->space_info;
                list_del_init(&block_group->bg_list);
 +
 +              space_info = block_group->space_info;
 +
                if (ret || btrfs_mixed_space_info(space_info)) {
                        btrfs_put_block_group(block_group);
                        continue;
                }
                spin_unlock(&fs_info->unused_bgs_lock);
  
 -              mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
 +              mutex_lock(&fs_info->delete_unused_bgs_mutex);
  
                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
                spin_lock(&block_group->lock);
                if (block_group->reserved ||
                    btrfs_block_group_used(&block_group->item) ||
 -                  block_group->ro) {
 +                  block_group->ro ||
 +                  list_is_singular(&block_group->list)) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group.  We do
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
 -              /* 1 for btrfs_orphan_reserve_metadata() */
 -              trans = btrfs_start_transaction(root, 1);
 +              trans = btrfs_start_trans_remove_block_group(fs_info,
 +                                                   block_group->key.objectid);
                if (IS_ERR(trans)) {
                        btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
  end_trans:
                btrfs_end_transaction(trans, root);
  next:
 -              mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 +              mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
@@@ -10715,7 -10408,8 +10756,7 @@@ void btrfs_end_write_no_snapshoting(str
  {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
 -       * Make sure counter is updated before we wake up
 -       * waiters.
 +       * Make sure counter is updated before we wake up waiters.
         */
        smp_mb();
        if (waitqueue_active(&root->subv_writers->wait))
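
The data reservation helpers changed above now take a byte range (start, len) instead of a plain byte count, and both btrfs_check_data_free_space() and btrfs_free_reserved_data_space_noquota() first widen that range to sector boundaries before accounting it. The following is a minimal userspace sketch of that alignment arithmetic, not kernel code: ROUND_DOWN/ROUND_UP stand in for the kernel's round_down()/round_up() helpers, and the 4096-byte sectorsize and the sample range are assumed example values.

#include <stdio.h>
#include <stdint.h>

/* power-of-two rounding, mirroring the kernel's round_down()/round_up() */
#define ROUND_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))
#define ROUND_UP(x, a)   ROUND_DOWN((x) + (a) - 1, (a))

int main(void)
{
	uint64_t sectorsize = 4096;         /* assumed example sector size */
	uint64_t start = 6000, len = 3000;  /* arbitrary unaligned range */

	/* same arithmetic as the new btrfs_check_data_free_space() */
	uint64_t aligned_len = ROUND_UP(start + len, sectorsize) -
			       ROUND_DOWN(start, sectorsize);
	uint64_t aligned_start = ROUND_DOWN(start, sectorsize);

	printf("reserve [%llu, %llu) -> %llu bytes\n",
	       (unsigned long long)aligned_start,
	       (unsigned long long)(aligned_start + aligned_len),
	       (unsigned long long)aligned_len);
	return 0;
}

Run as-is, it reports that a reservation for bytes 6000..8999 is accounted as the 8192-byte aligned range [4096, 12288), which is why the release paths above must round the range the same way before subtracting from bytes_may_use.
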
diff --combined fs/btrfs/extent_io.c
index 9abe18763a7fb001632246fd712a64fdffcc0225,a6eec2d0e254392a7f0c5047848586de682e0555..2b3f26326565c478f57a4860a948ff116ed534d9
@@@ -96,8 -96,8 +96,8 @@@ static inline void __btrfs_debug_check_
        inode = tree->mapping->host;
        isize = i_size_read(inode);
        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 -              printk_ratelimited(KERN_DEBUG
 -                  "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
 +              btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
 +                  "%s: ino %llu isize %llu odd range [%llu,%llu]",
                                caller, btrfs_ino(inode), isize, start, end);
        }
  }
@@@ -131,25 -131,6 +131,25 @@@ struct extent_page_data 
        unsigned int sync_io:1;
  };
  
 +static void add_extent_changeset(struct extent_state *state, unsigned bits,
 +                               struct extent_changeset *changeset,
 +                               int set)
 +{
 +      int ret;
 +
 +      if (!changeset)
 +              return;
 +      if (set && (state->state & bits) == bits)
 +              return;
 +      if (!set && (state->state & bits) == 0)
 +              return;
 +      changeset->bytes_changed += state->end - state->start + 1;
 +      ret = ulist_add(changeset->range_changed, state->start, state->end,
 +                      GFP_ATOMIC);
 +      /* ENOMEM */
 +      BUG_ON(ret < 0);
 +}
 +
  static noinline void flush_write_bio(void *data);
  static inline struct btrfs_fs_info *
  tree_fs_info(struct extent_io_tree *tree)
@@@ -429,8 -410,7 +429,8 @@@ static void clear_state_cb(struct exten
  }
  
  static void set_state_bits(struct extent_io_tree *tree,
 -                         struct extent_state *state, unsigned *bits);
 +                         struct extent_state *state, unsigned *bits,
 +                         struct extent_changeset *changeset);
  
  /*
   * insert an extent_state struct into the tree.  'bits' are set on the
@@@ -446,7 -426,7 +446,7 @@@ static int insert_state(struct extent_i
                        struct extent_state *state, u64 start, u64 end,
                        struct rb_node ***p,
                        struct rb_node **parent,
 -                      unsigned *bits)
 +                      unsigned *bits, struct extent_changeset *changeset)
  {
        struct rb_node *node;
  
        state->start = start;
        state->end = end;
  
 -      set_state_bits(tree, state, bits);
 +      set_state_bits(tree, state, bits, changeset);
  
        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
        if (node) {
@@@ -531,8 -511,7 +531,8 @@@ static struct extent_state *next_state(
   */
  static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
 -                                          unsigned *bits, int wake)
 +                                          unsigned *bits, int wake,
 +                                          struct extent_changeset *changeset)
  {
        struct extent_state *next;
        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
 +      add_extent_changeset(state, bits_to_clear, changeset, 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
@@@ -591,10 -569,10 +591,10 @@@ static void extent_io_tree_panic(struc
   *
   * This takes the tree lock, and returns 0 on success and < 0 on error.
   */
 -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 -                   unsigned bits, int wake, int delete,
 -                   struct extent_state **cached_state,
 -                   gfp_t mask)
 +static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                            unsigned bits, int wake, int delete,
 +                            struct extent_state **cached_state,
 +                            gfp_t mask, struct extent_changeset *changeset)
  {
        struct extent_state *state;
        struct extent_state *cached;
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
  again:
 -      if (!prealloc && (mask & __GFP_WAIT)) {
 +      if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
@@@ -693,8 -671,7 +693,8 @@@ hit_next
                if (err)
                        goto out;
                if (state->end <= end) {
 -                      state = clear_state_bit(tree, state, &bits, wake);
 +                      state = clear_state_bit(tree, state, &bits, wake,
 +                                              changeset);
                        goto next;
                }
                goto search_again;
                if (wake)
                        wake_up(&state->wq);
  
 -              clear_state_bit(tree, prealloc, &bits, wake);
 +              clear_state_bit(tree, prealloc, &bits, wake, changeset);
  
                prealloc = NULL;
                goto out;
        }
  
 -      state = clear_state_bit(tree, state, &bits, wake);
 +      state = clear_state_bit(tree, state, &bits, wake, changeset);
  next:
        if (last_end == (u64)-1)
                goto out;
@@@ -741,7 -718,7 +741,7 @@@ search_again
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
 -      if (mask & __GFP_WAIT)
 +      if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
  }
@@@ -812,7 -789,7 +812,7 @@@ out
  
  static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
 -                         unsigned *bits)
 +                         unsigned *bits, struct extent_changeset *changeset)
  {
        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
  
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
 +      add_extent_changeset(state, bits_to_set, changeset, 1);
        state->state |= bits_to_set;
  }
  
@@@ -859,7 -835,7 +859,7 @@@ static int __must_chec
  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                 unsigned bits, unsigned exclusive_bits,
                 u64 *failed_start, struct extent_state **cached_state,
 -               gfp_t mask)
 +               gfp_t mask, struct extent_changeset *changeset)
  {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
  
        bits |= EXTENT_FIRST_DELALLOC;
  again:
 -      if (!prealloc && (mask & __GFP_WAIT)) {
 +      if (!prealloc && gfpflags_allow_blocking(mask)) {
                prealloc = alloc_extent_state(mask);
                BUG_ON(!prealloc);
        }
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end,
 -                                 &p, &parent, &bits);
 +                                 &p, &parent, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
  
@@@ -923,7 -899,7 +923,7 @@@ hit_next
                        goto out;
                }
  
 -              set_state_bits(tree, state, &bits);
 +              set_state_bits(tree, state, &bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                if (err)
                        goto out;
                if (state->end <= end) {
 -                      set_state_bits(tree, state, &bits);
 +                      set_state_bits(tree, state, &bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
 -                                 NULL, NULL, &bits);
 +                                 NULL, NULL, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
  
                if (err)
                        extent_io_tree_panic(tree, err);
  
 -              set_state_bits(tree, prealloc, &bits);
 +              set_state_bits(tree, prealloc, &bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
@@@ -1052,7 -1028,7 +1052,7 @@@ search_again
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
 -      if (mask & __GFP_WAIT)
 +      if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
  }
@@@ -1062,7 -1038,7 +1062,7 @@@ int set_extent_bit(struct extent_io_tre
                   struct extent_state **cached_state, gfp_t mask)
  {
        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
 -                              cached_state, mask);
 +                              cached_state, mask, NULL);
  }
  
  
@@@ -1100,7 -1076,7 +1100,7 @@@ int convert_extent_bit(struct extent_io
        btrfs_debug_check_extent_io_range(tree, start, end);
  
  again:
 -      if (!prealloc && (mask & __GFP_WAIT)) {
 +      if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
                        goto out;
                }
                err = insert_state(tree, prealloc, start, end,
 -                                 &p, &parent, &bits);
 +                                 &p, &parent, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@@ -1154,9 -1130,9 +1154,9 @@@ hit_next
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
 -              set_state_bits(tree, state, &bits);
 +              set_state_bits(tree, state, &bits, NULL);
                cache_state(state, cached_state);
 -              state = clear_state_bit(tree, state, &clear_bits, 0);
 +              state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
                if (err)
                        goto out;
                if (state->end <= end) {
 -                      set_state_bits(tree, state, &bits);
 +                      set_state_bits(tree, state, &bits, NULL);
                        cache_state(state, cached_state);
 -                      state = clear_state_bit(tree, state, &clear_bits, 0);
 +                      state = clear_state_bit(tree, state, &clear_bits, 0,
 +                                              NULL);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
 -                                 NULL, NULL, &bits);
 +                                 NULL, NULL, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
                if (err)
                        extent_io_tree_panic(tree, err);
  
 -              set_state_bits(tree, prealloc, &bits);
 +              set_state_bits(tree, prealloc, &bits, NULL);
                cache_state(prealloc, cached_state);
 -              clear_state_bit(tree, prealloc, &clear_bits, 0);
 +              clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
                prealloc = NULL;
                goto out;
        }
@@@ -1278,7 -1253,7 +1278,7 @@@ search_again
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
 -      if (mask & __GFP_WAIT)
 +      if (gfpflags_allow_blocking(mask))
                cond_resched();
        first_iteration = false;
        goto again;
@@@ -1299,30 -1274,6 +1299,30 @@@ int set_extent_bits(struct extent_io_tr
                              NULL, mask);
  }
  
 +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                         unsigned bits, gfp_t mask,
 +                         struct extent_changeset *changeset)
 +{
 +      /*
 +       * We don't support EXTENT_LOCKED yet, as the current changeset
 +       * records any bit that changes, so for the EXTENT_LOCKED case it
 +       * will either fail with -EEXIST or record the whole range.
 +       */
 +      BUG_ON(bits & EXTENT_LOCKED);
 +
 +      return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
 +                              changeset);
 +}
 +
 +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                   unsigned bits, int wake, int delete,
 +                   struct extent_state **cached, gfp_t mask)
 +{
 +      return __clear_extent_bit(tree, start, end, bits, wake, delete,
 +                                cached, mask, NULL);
 +}
 +
  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask)
  {
        return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
  }
  
 +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                           unsigned bits, gfp_t mask,
 +                           struct extent_changeset *changeset)
 +{
 +      /*
 +       * Don't support EXTENT_LOCKED case, same reason as
 +       * set_record_extent_bits().
 +       */
 +      BUG_ON(bits & EXTENT_LOCKED);
 +
 +      return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
 +                                changeset);
 +}
 +
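The new set_record_extent_bits()/clear_record_extent_bits() variants thread an extent_changeset through the bit manipulation so that a caller can learn how many bytes actually changed state; the qgroup data reservation work in this series is the intended user. A minimal caller-side sketch, assuming the caller owns the changeset, backs range_changed with a ulist, and is tagging some range [start, start + len) of an inode's io_tree; error handling is omitted and the surrounding code is illustrative, not part of this commit:

	struct extent_changeset changeset;
	struct ulist_iterator uiter;
	struct ulist_node *unode;
	int ret;

	changeset.bytes_changed = 0;
	changeset.range_changed = ulist_alloc(GFP_NOFS);

	/* EXTENT_QGROUP_RESERVED does not include EXTENT_LOCKED, see the BUG_ON above */
	ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
				     start + len - 1, EXTENT_QGROUP_RESERVED,
				     GFP_NOFS, &changeset);

	/* bytes_changed now holds how many bytes were newly set;
	 * range_changed lists the sub-ranges that were touched */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(changeset.range_changed, &uiter)))
		pr_debug("changed range node: val=%llu aux=%llu\n",
			 unode->val, unode->aux);

	ulist_free(changeset.range_changed);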
  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask)
  {
@@@ -1406,7 -1343,7 +1406,7 @@@ int lock_extent_bits(struct extent_io_t
        while (1) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
                                       EXTENT_LOCKED, &failed_start,
 -                                     cached_state, GFP_NOFS);
 +                                     cached_state, GFP_NOFS, NULL);
                if (err == -EEXIST) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
@@@ -1428,7 -1365,7 +1428,7 @@@ int try_lock_extent(struct extent_io_tr
        u64 failed_start;
  
        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 -                             &failed_start, NULL, GFP_NOFS);
 +                             &failed_start, NULL, GFP_NOFS, NULL);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
@@@ -2141,8 -2078,8 +2141,8 @@@ int repair_io_failure(struct inode *ino
                return -EIO;
        }
  
 -      printk_ratelimited_in_rcu(KERN_INFO
 -                                "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
 +      btrfs_info_rl_in_rcu(fs_info,
 +              "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                                  btrfs_ino(inode), start,
                                  rcu_str_deref(dev->name), sector);
        bio_put(bio);
@@@ -3133,12 -3070,8 +3133,12 @@@ static int __do_readpage(struct extent_
  
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
 -                      unlock_extent_cached(tree, cur, cur + iosize - 1,
 -                                           &cached, GFP_NOFS);
 +                      if (parent_locked)
 +                              free_extent_state(cached);
 +                      else
 +                              unlock_extent_cached(tree, cur,
 +                                                   cur + iosize - 1,
 +                                                   &cached, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@@ -4386,7 -4319,7 +4386,7 @@@ int try_release_extent_mapping(struct e
        u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
  
 -      if ((mask & __GFP_WAIT) &&
 +      if (gfpflags_allow_blocking(mask) &&
            page->mapping->host->i_size > 16 * 1024 * 1024) {
                u64 len;
                while (start <= end) {
@@@ -4797,24 -4730,14 +4797,14 @@@ struct extent_buffer *btrfs_clone_exten
        return new;
  }
  
- struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-                                               u64 start)
+ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                                 u64 start, unsigned long len)
  {
        struct extent_buffer *eb;
-       unsigned long len;
        unsigned long num_pages;
        unsigned long i;
  
-       if (!fs_info) {
-               /*
-                * Called only from tests that don't always have a fs_info
-                * available, but we know that nodesize is 4096
-                */
-               len = 4096;
-       } else {
-               len = fs_info->tree_root->nodesize;
-       }
-       num_pages = num_extent_pages(0, len);
+       num_pages = num_extent_pages(start, len);
  
        eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
        return NULL;
  }
  
+ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                               u64 start)
+ {
+       unsigned long len;
+       if (!fs_info) {
+               /*
+                * Called only from tests that don't always have a fs_info
+                * available, but we know that nodesize is 4096
+                */
+               len = 4096;
+       } else {
+               len = fs_info->tree_root->nodesize;
+       }
+       return __alloc_dummy_extent_buffer(fs_info, start, len);
+ }
+
  static void check_buffer_tree_ref(struct extent_buffer *eb)
  {
        int refs;
@@@ -5594,6 -5535,155 +5602,155 @@@ void copy_extent_buffer(struct extent_b
        }
  }
  
+ /*
+  * The extent buffer bitmap operations are done with byte granularity because
+  * bitmap items are not guaranteed to be aligned to a word and therefore a
+  * single word in a bitmap may straddle two pages in the extent buffer.
+  */
+ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
+ #define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+ #define BITMAP_FIRST_BYTE_MASK(start) \
+       ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
+ #define BITMAP_LAST_BYTE_MASK(nbits) \
+       (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+ /*
+  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+  * given bit number
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @nr: bit number
+  * @page_index: return index of the page in the extent buffer that contains the
+  * given bit number
+  * @page_offset: return offset into the page given by page_index
+  *
+  * This helper hides the ugliness of finding the byte in an extent buffer which
+  * contains a given bit.
+  */
+ static inline void eb_bitmap_offset(struct extent_buffer *eb,
+                                   unsigned long start, unsigned long nr,
+                                   unsigned long *page_index,
+                                   size_t *page_offset)
+ {
+       size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+       size_t byte_offset = BIT_BYTE(nr);
+       size_t offset;
+       /*
+        * The byte we want is the offset of the extent buffer + the offset of
+        * the bitmap item in the extent buffer + the offset of the byte in the
+        * bitmap item.
+        */
+       offset = start_offset + start + byte_offset;
+       *page_index = offset >> PAGE_CACHE_SHIFT;
+       *page_offset = offset & (PAGE_CACHE_SIZE - 1);
+ }
+
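In other words, the byte's absolute position is the extent buffer's offset within its first page, plus the bitmap item's offset, plus nr / BITS_PER_BYTE, and that sum is then split into a page index and an in-page offset. A tiny standalone illustration of the same arithmetic, with 4096-byte pages and made-up values for eb->start and the bitmap location:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE	4096UL
	#define BITS_PER_BYTE	8

	int main(void)
	{
		unsigned long eb_start = 17408;	/* hypothetical eb->start, not page aligned */
		unsigned long start = 100;	/* bitmap item offset inside the extent buffer */
		unsigned long nr = 40000;	/* bit number we want */

		unsigned long start_offset = eb_start & (PAGE_CACHE_SIZE - 1);
		unsigned long offset = start_offset + start + nr / BITS_PER_BYTE;

		/* prints: page_index=1 page_offset=2028 */
		printf("page_index=%lu page_offset=%lu\n",
		       offset / PAGE_CACHE_SIZE, offset % PAGE_CACHE_SIZE);
		return 0;
	}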
+ /**
+  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @nr: bit number to test
+  */
+ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+                          unsigned long nr)
+ {
+       char *kaddr;
+       struct page *page;
+       unsigned long i;
+       size_t offset;
+       eb_bitmap_offset(eb, start, nr, &i, &offset);
+       page = eb->pages[i];
+       WARN_ON(!PageUptodate(page));
+       kaddr = page_address(page);
+       return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
+ }
+
+ /**
+  * extent_buffer_bitmap_set - set an area of a bitmap
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @pos: bit number of the first bit
+  * @len: number of bits to set
+  */
+ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+                             unsigned long pos, unsigned long len)
+ {
+       char *kaddr;
+       struct page *page;
+       unsigned long i;
+       size_t offset;
+       const unsigned int size = pos + len;
+       int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+       unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+       eb_bitmap_offset(eb, start, pos, &i, &offset);
+       page = eb->pages[i];
+       WARN_ON(!PageUptodate(page));
+       kaddr = page_address(page);
+       while (len >= bits_to_set) {
+               kaddr[offset] |= mask_to_set;
+               len -= bits_to_set;
+               bits_to_set = BITS_PER_BYTE;
+               mask_to_set = ~0U;
+               if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+                       offset = 0;
+                       page = eb->pages[++i];
+                       WARN_ON(!PageUptodate(page));
+                       kaddr = page_address(page);
+               }
+       }
+       if (len) {
+               mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+               kaddr[offset] |= mask_to_set;
+       }
+ }
+
+ /**
+  * extent_buffer_bitmap_clear - clear an area of a bitmap
+  * @eb: the extent buffer
+  * @start: offset of the bitmap item in the extent buffer
+  * @pos: bit number of the first bit
+  * @len: number of bits to clear
+  */
+ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+                               unsigned long pos, unsigned long len)
+ {
+       char *kaddr;
+       struct page *page;
+       unsigned long i;
+       size_t offset;
+       const unsigned int size = pos + len;
+       int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+       unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+       eb_bitmap_offset(eb, start, pos, &i, &offset);
+       page = eb->pages[i];
+       WARN_ON(!PageUptodate(page));
+       kaddr = page_address(page);
+       while (len >= bits_to_clear) {
+               kaddr[offset] &= ~mask_to_clear;
+               len -= bits_to_clear;
+               bits_to_clear = BITS_PER_BYTE;
+               mask_to_clear = ~0U;
+               if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+                       offset = 0;
+                       page = eb->pages[++i];
+                       WARN_ON(!PageUptodate(page));
+                       kaddr = page_address(page);
+               }
+       }
+       if (len) {
+               mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+               kaddr[offset] &= ~mask_to_clear;
+       }
+ }
+
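Both helpers follow the usual split of a bit range into a partial first byte, whole middle bytes and a partial last byte, just with byte-sized masks so a run can stop at a page boundary and resume on the next page. The loop structure is easier to see in a self-contained userspace version that works on a flat buffer (no page crossing) and can be compiled and checked directly; names mirror the macros above:

	#include <stdio.h>

	#define BITS_PER_BYTE 8
	#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
	#define BITMAP_FIRST_BYTE_MASK(start) \
		((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
	#define BITMAP_LAST_BYTE_MASK(nbits) \
		(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))

	/* set bits [pos, pos + len) in a byte-granular bitmap */
	static void bitmap_set_bytewise(unsigned char *map, unsigned long pos,
					unsigned long len)
	{
		const unsigned int size = pos + len;
		int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
		unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
		unsigned long offset = pos / BITS_PER_BYTE;

		while (len >= bits_to_set) {
			/* first (partial) byte, then whole bytes */
			map[offset] |= mask_to_set;
			len -= bits_to_set;
			bits_to_set = BITS_PER_BYTE;
			mask_to_set = BYTE_MASK;
			offset++;
		}
		if (len) {
			/* trailing partial byte */
			mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
			map[offset] |= mask_to_set;
		}
	}

	int main(void)
	{
		unsigned char map[4] = { 0 };

		bitmap_set_bytewise(map, 5, 10);	/* bits 5..14 */
		/* prints: e0 7f 00 00  (bits 5-7 of byte 0, bits 0-6 of byte 1) */
		printf("%02x %02x %02x %02x\n", map[0], map[1], map[2], map[3]);
		return 0;
	}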
  static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
  {
        unsigned long distance = (src > dst) ? src - dst : dst - src;
@@@ -5633,15 -5723,13 +5790,15 @@@ void memcpy_extent_buffer(struct extent
        unsigned long src_i;
  
        if (src_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
 -                     "len %lu dst len %lu\n", src_offset, len, dst->len);
 +              btrfs_err(dst->fs_info,
 +                      "memmove bogus src_offset %lu move "
 +                     "len %lu dst len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
 -                     "len %lu dst len %lu\n", dst_offset, len, dst->len);
 +              btrfs_err(dst->fs_info,
 +                      "memmove bogus dst_offset %lu move "
 +                     "len %lu dst len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
  
@@@ -5681,13 -5769,13 +5838,13 @@@ void memmove_extent_buffer(struct exten
        unsigned long src_i;
  
        if (src_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
 -                     "len %lu len %lu\n", src_offset, len, dst->len);
 +              btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
 +                     "len %lu len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
 -              printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
 -                     "len %lu len %lu\n", dst_offset, len, dst->len);
 +              btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
 +                     "len %lu len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset < src_offset) {
diff --combined fs/btrfs/extent_io.h
index f4c1ae11855f0b613894ea44026faf143021b613,9f8d7d1a70157c9842de995ba5b95e9f907560bf..350c8b0a85826ece7fed1951682972e603195880
@@@ -2,7 -2,6 +2,7 @@@
  #define __EXTENTIO__
  
  #include <linux/rbtree.h>
 +#include "ulist.h"
  
  /* bits for the extent state */
  #define EXTENT_DIRTY          (1U << 0)
@@@ -19,7 -18,6 +19,7 @@@
  #define EXTENT_NEED_WAIT      (1U << 13)
  #define EXTENT_DAMAGED                (1U << 14)
  #define EXTENT_NORESERVE      (1U << 15)
 +#define EXTENT_QGROUP_RESERVED        (1U << 16)
  #define EXTENT_IOBITS         (EXTENT_LOCKED | EXTENT_WRITEBACK)
  #define EXTENT_CTLBITS                (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
  
@@@ -163,17 -161,6 +163,17 @@@ struct extent_buffer 
  #endif
  };
  
 +/*
 + * Structure to record how many bytes and which ranges are set/cleared
 + */
 +struct extent_changeset {
 +      /* How many bytes are set/cleared in this operation */
 +      u64 bytes_changed;
 +
 +      /* Changed ranges */
 +      struct ulist *range_changed;
 +};
 +
  static inline void extent_set_compress_type(unsigned long *bio_flags,
                                            int compress_type)
  {
@@@ -223,17 -210,11 +223,17 @@@ int test_range_bit(struct extent_io_tre
                   struct extent_state *cached_state);
  int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask);
 +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                           unsigned bits, gfp_t mask,
 +                           struct extent_changeset *changeset);
  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     unsigned bits, int wake, int delete,
                     struct extent_state **cached, gfp_t mask);
  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    unsigned bits, gfp_t mask);
 +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 +                         unsigned bits, gfp_t mask,
 +                         struct extent_changeset *changeset);
  int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   unsigned bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask);
@@@ -282,8 -263,10 +282,10 @@@ void set_page_extent_mapped(struct pag
  
  struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                                          u64 start);
+ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+                                                 u64 start, unsigned long len);
  struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-               u64 start);
+                                               u64 start);
  struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
  struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                                         u64 start);
@@@ -328,6 -311,12 +330,12 @@@ void memmove_extent_buffer(struct exten
                           unsigned long src_offset, unsigned long len);
  void memset_extent_buffer(struct extent_buffer *eb, char c,
                          unsigned long start, unsigned long len);
+ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+                          unsigned long pos);
+ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+                             unsigned long pos, unsigned long len);
+ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+                               unsigned long pos, unsigned long len);
  void clear_extent_buffer_dirty(struct extent_buffer *eb);
  int set_extent_buffer_dirty(struct extent_buffer *eb);
  int set_extent_buffer_uptodate(struct extent_buffer *eb);
diff --combined fs/btrfs/super.c
index 24154e422945167f474557887c62acaf6ed0779c,bfdaf123f4e9be781c58a567fc97984652b43ead..9153d54d27c87d16e6254e49ec59c268dd3a4cd1
@@@ -130,6 -130,7 +130,6 @@@ static void btrfs_handle_error(struct b
        }
  }
  
 -#ifdef CONFIG_PRINTK
  /*
   * __btrfs_std_error decodes expected errors from the caller and
  * invokes the appropriate error response.
@@@ -139,9 -140,7 +139,9 @@@ void __btrfs_std_error(struct btrfs_fs_
                       unsigned int line, int errno, const char *fmt, ...)
  {
        struct super_block *sb = fs_info->sb;
 +#ifdef CONFIG_PRINTK
        const char *errstr;
 +#endif
  
        /*
         * Special case: if the error is EROFS, and we're already
        if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
                return;
  
 +#ifdef CONFIG_PRINTK
        errstr = btrfs_decode_error(errno);
        if (fmt) {
                struct va_format vaf;
                printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
                        sb->s_id, function, line, errno, errstr);
        }
 +#endif
  
        /* Don't go through full error handling during mount */
        save_error_info(fs_info);
                btrfs_handle_error(fs_info);
  }
  
 +#ifdef CONFIG_PRINTK
  static const char * const logtypes[] = {
        "emergency",
        "alert",
@@@ -216,6 -212,27 +216,6 @@@ void btrfs_printk(const struct btrfs_fs
  
        va_end(args);
  }
 -
 -#else
 -
 -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 -                     unsigned int line, int errno, const char *fmt, ...)
 -{
 -      struct super_block *sb = fs_info->sb;
 -
 -      /*
 -       * Special case: if the error is EROFS, and we're already
 -       * under MS_RDONLY, then it is safe here.
 -       */
 -      if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
 -              return;
 -
 -      /* Don't go through full error handling during mount */
 -      if (sb->s_flags & MS_BORN) {
 -              save_error_info(fs_info);
 -              btrfs_handle_error(fs_info);
 -      }
 -}
  #endif
  
  /*
@@@ -295,17 -312,15 +295,18 @@@ enum 
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-       Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
-       Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
-       Opt_check_integrity, Opt_check_integrity_including_extent_data,
+       Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+       Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+       Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+       Opt_skip_balance, Opt_check_integrity,
+       Opt_check_integrity_including_extent_data,
        Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
        Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
        Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
        Opt_datasum, Opt_treelog, Opt_noinode_cache,
 +#ifdef CONFIG_BTRFS_DEBUG
 +      Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
 +#endif
        Opt_err,
  };
  
@@@ -340,6 -355,7 +341,7 @@@ static match_table_t tokens = 
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
        {Opt_space_cache, "space_cache"},
+       {Opt_space_cache_version, "space_cache=%s"},
        {Opt_clear_cache, "clear_cache"},
        {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
        {Opt_enospc_debug, "enospc_debug"},
        {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
        {Opt_fatal_errors, "fatal_errors=%s"},
        {Opt_commit_interval, "commit=%d"},
 +#ifdef CONFIG_BTRFS_DEBUG
 +      {Opt_fragment_data, "fragment=data"},
 +      {Opt_fragment_metadata, "fragment=metadata"},
 +      {Opt_fragment_all, "fragment=all"},
 +#endif
        {Opt_err, NULL},
  };
  
@@@ -383,7 -394,9 +385,9 @@@ int btrfs_parse_options(struct btrfs_ro
        bool compress_force = false;
  
        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
-       if (cache_gen)
+       if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+               btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+       else if (cache_gen)
                btrfs_set_opt(info->mount_opt, SPACE_CACHE);
  
        if (!options)
                                             "turning off discard");
                        break;
                case Opt_space_cache:
-                       btrfs_set_and_info(root, SPACE_CACHE,
-                                          "enabling disk space caching");
+               case Opt_space_cache_version:
+                       if (token == Opt_space_cache ||
+                           strcmp(args[0].from, "v1") == 0) {
+                               btrfs_clear_opt(root->fs_info->mount_opt,
+                                               FREE_SPACE_TREE);
+                               btrfs_set_and_info(root, SPACE_CACHE,
+                                                  "enabling disk space caching");
+                       } else if (strcmp(args[0].from, "v2") == 0) {
+                               btrfs_clear_opt(root->fs_info->mount_opt,
+                                               SPACE_CACHE);
+                               btrfs_set_and_info(root, FREE_SPACE_TREE,
+                                                  "enabling free space tree");
+                       } else {
+                               ret = -EINVAL;
+                               goto out;
+                       }
                        break;
                case Opt_rescan_uuid_tree:
                        btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
                        break;
                case Opt_no_space_cache:
-                       btrfs_clear_and_info(root, SPACE_CACHE,
-                                            "disabling disk space caching");
+                       if (btrfs_test_opt(root, SPACE_CACHE)) {
+                               btrfs_clear_and_info(root, SPACE_CACHE,
+                                                    "disabling disk space caching");
+                       }
+                       if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+                               btrfs_clear_and_info(root, FREE_SPACE_TREE,
+                                                    "disabling free space tree");
+                       }
                        break;
                case Opt_inode_cache:
                        btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
                                info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
                        }
                        break;
 +#ifdef CONFIG_BTRFS_DEBUG
 +              case Opt_fragment_all:
 +                      btrfs_info(root->fs_info, "fragmenting all space");
 +                      btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
 +                      btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
 +                      break;
 +              case Opt_fragment_metadata:
 +                      btrfs_info(root->fs_info, "fragmenting metadata");
 +                      btrfs_set_opt(info->mount_opt,
 +                                    FRAGMENT_METADATA);
 +                      break;
 +              case Opt_fragment_data:
 +                      btrfs_info(root->fs_info, "fragmenting data");
 +                      btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
 +                      break;
 +#endif
                case Opt_err:
                        btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
                        ret = -EINVAL;
                }
        }
  out:
+       if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+           !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+           !btrfs_test_opt(root, CLEAR_CACHE)) {
+               btrfs_err(root->fs_info, "cannot disable free space tree");
+               ret = -EINVAL;
+       }
        if (!ret && btrfs_test_opt(root, SPACE_CACHE))
                btrfs_info(root->fs_info, "disk space caching is enabled");
+       if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+               btrfs_info(root->fs_info, "using free space tree");
        kfree(orig);
        return ret;
  }
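With these hunks, space_cache (or space_cache=v1), space_cache=v2 and nospace_cache become mutually exclusive states (for example, mounting with -o space_cache=v2 selects the free space tree), and the check at out: refuses to silently drop the free space tree once its read-only compat bit is on disk, unless clear_cache is also given. A rough userspace model of just that option state machine, useful for eyeballing combinations; everything here besides the option strings is made up for the sketch:

	#include <stdio.h>
	#include <string.h>

	struct opts {
		int space_cache;	/* v1 free space cache */
		int free_space_tree;	/* v2, the free space tree */
	};

	/* returns 0 on success, -1 for an unknown value (stands in for -EINVAL) */
	static int apply(struct opts *o, const char *opt)
	{
		if (!strcmp(opt, "space_cache") || !strcmp(opt, "space_cache=v1")) {
			o->free_space_tree = 0;
			o->space_cache = 1;
		} else if (!strcmp(opt, "space_cache=v2")) {
			o->space_cache = 0;
			o->free_space_tree = 1;
		} else if (!strcmp(opt, "nospace_cache")) {
			o->space_cache = 0;
			o->free_space_tree = 0;
		} else {
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		/* FREE_SPACE_TREE compat_ro bit already on disk: opt starts enabled */
		struct opts o = { 0, 1 };

		apply(&o, "nospace_cache");

		/* mirrors the out: check, assuming clear_cache was not requested */
		if (!o.free_space_tree)
			printf("cannot disable free space tree\n");
		return 0;
	}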
@@@ -1162,6 -1188,8 +1195,8 @@@ static int btrfs_show_options(struct se
                seq_puts(seq, ",noacl");
        if (btrfs_test_opt(root, SPACE_CACHE))
                seq_puts(seq, ",space_cache");
+       else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+               seq_puts(seq, ",space_cache=v2");
        else
                seq_puts(seq, ",nospace_cache");
        if (btrfs_test_opt(root, RESCAN_UUID_TREE))
                seq_puts(seq, ",fatal_errors=panic");
        if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
                seq_printf(seq, ",commit=%d", info->commit_interval);
 +#ifdef CONFIG_BTRFS_DEBUG
 +      if (btrfs_test_opt(root, FRAGMENT_DATA))
 +              seq_puts(seq, ",fragment=data");
 +      if (btrfs_test_opt(root, FRAGMENT_METADATA))
 +              seq_puts(seq, ",fragment=metadata");
 +#endif
        seq_printf(seq, ",subvolid=%llu",
                  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
        seq_puts(seq, ",subvol=");
@@@ -2225,6 -2247,9 +2260,9 @@@ static int btrfs_run_sanity_tests(void
        if (ret)
                goto out;
        ret = btrfs_test_qgroups();
+       if (ret)
+               goto out;
+       ret = btrfs_test_free_space_tree();
  out:
        btrfs_destroy_test_fs();
        return ret;
diff --combined fs/btrfs/tests/btrfs-tests.c
index 9626252ee6b47d2b391f3383cfa9b3bb80e4110c,ba28cefdf9e7b6d0b787199136471d2d68649941..b1d920b3007017c2014d28217cb5efa1d7ed4d96
@@@ -21,6 -21,9 +21,9 @@@
  #include <linux/magic.h>
  #include "btrfs-tests.h"
  #include "../ctree.h"
+ #include "../free-space-cache.h"
+ #include "../free-space-tree.h"
+ #include "../transaction.h"
  #include "../volumes.h"
  #include "../disk-io.h"
  #include "../qgroup.h"
@@@ -122,6 -125,9 +125,9 @@@ struct btrfs_fs_info *btrfs_alloc_dummy
        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+       extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+       extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+       fs_info->pinned_extents = &fs_info->freed_extents[0];
        return fs_info;
  }
  
@@@ -169,3 -175,49 +175,55 @@@ void btrfs_free_dummy_root(struct btrfs
        kfree(root);
  }
  
+ struct btrfs_block_group_cache *
+ btrfs_alloc_dummy_block_group(unsigned long length)
+ {
+       struct btrfs_block_group_cache *cache;
+       cache = kzalloc(sizeof(*cache), GFP_NOFS);
+       if (!cache)
+               return NULL;
+       cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+                                       GFP_NOFS);
+       if (!cache->free_space_ctl) {
+               kfree(cache);
+               return NULL;
+       }
++      cache->fs_info = btrfs_alloc_dummy_fs_info();
++      if (!cache->fs_info) {
++              kfree(cache->free_space_ctl);
++              kfree(cache);
++              return NULL;
++      }
+       cache->key.objectid = 0;
+       cache->key.offset = length;
+       cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+       cache->sectorsize = 4096;
+       cache->full_stripe_len = 4096;
+       INIT_LIST_HEAD(&cache->list);
+       INIT_LIST_HEAD(&cache->cluster_list);
+       INIT_LIST_HEAD(&cache->bg_list);
+       btrfs_init_free_space_ctl(cache);
+       mutex_init(&cache->free_space_lock);
+       return cache;
+ }
+
+ void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+ {
+       if (!cache)
+               return;
+       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       kfree(cache->free_space_ctl);
+       kfree(cache);
+ }
+
+ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+ {
+       memset(trans, 0, sizeof(*trans));
+       trans->transid = 1;
+       INIT_LIST_HEAD(&trans->qgroup_ref_list);
+       trans->type = __TRANS_DUMMY;
+ }
diff --combined fs/btrfs/tests/free-space-tests.c
index 8b72b005bfb9a212518a711a2e476c2d70b47b24,bae6c599f6045a7eae995f179f123e180862e80b..cd3e300b9ba5c9152d9f2b3210469262dcd4a861
  #include <linux/slab.h>
  #include "btrfs-tests.h"
  #include "../ctree.h"
 +#include "../disk-io.h"
  #include "../free-space-cache.h"
  
  #define BITS_PER_BITMAP               (PAGE_CACHE_SIZE * 8)
- static struct btrfs_block_group_cache *init_test_block_group(void)
- {
-       struct btrfs_block_group_cache *cache;
-       cache = kzalloc(sizeof(*cache), GFP_NOFS);
-       if (!cache)
-               return NULL;
-       cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-                                       GFP_NOFS);
-       if (!cache->free_space_ctl) {
-               kfree(cache);
-               return NULL;
-       }
-       cache->fs_info = btrfs_alloc_dummy_fs_info();
-       if (!cache->fs_info) {
-               kfree(cache->free_space_ctl);
-               kfree(cache);
-               return NULL;
-       }
-       cache->key.objectid = 0;
-       cache->key.offset = 1024 * 1024 * 1024;
-       cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-       cache->sectorsize = 4096;
-       cache->full_stripe_len = 4096;
-       spin_lock_init(&cache->lock);
-       INIT_LIST_HEAD(&cache->list);
-       INIT_LIST_HEAD(&cache->cluster_list);
-       INIT_LIST_HEAD(&cache->bg_list);
-       btrfs_init_free_space_ctl(cache);
-       return cache;
- }
  
  /*
  * This test just does basic sanity checking, making sure we can add an extent
@@@ -886,30 -850,16 +851,30 @@@ test_steal_space_from_bitmap_to_extent(
  int btrfs_test_free_space_cache(void)
  {
        struct btrfs_block_group_cache *cache;
 -      int ret;
 +      struct btrfs_root *root = NULL;
 +      int ret = -ENOMEM;
  
        test_msg("Running btrfs free space cache tests\n");
  
-       cache = init_test_block_group();
+       cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
        if (!cache) {
                test_msg("Couldn't run the tests\n");
                return 0;
        }
  
 +      root = btrfs_alloc_dummy_root();
 +      if (IS_ERR(root)) {
 +              ret = PTR_ERR(root);
 +              goto out;
 +      }
 +
 +      root->fs_info = btrfs_alloc_dummy_fs_info();
 +      if (!root->fs_info)
 +              goto out;
 +
 +      root->fs_info->extent_root = root;
 +      cache->fs_info = root->fs_info;
 +
        ret = test_extents(cache);
        if (ret)
                goto out;
  
        ret = test_steal_space_from_bitmap_to_extent(cache);
  out:
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
-       kfree(cache->free_space_ctl);
-       kfree(cache);
+       btrfs_free_dummy_block_group(cache);
 +      btrfs_free_dummy_root(root);
        test_msg("Free space cache tests finished\n");
        return ret;
  }
diff --combined include/trace/events/btrfs.h
index b4473dab39d613e58e4d4e58e0049d72522047cd,e6289e62a2a820c7d6c9a52f04d003ce2bf5b78a..d866f21efbbfd4722e6c23bde8320d7b54ecf2be
@@@ -45,7 -45,8 +45,8 @@@ struct btrfs_qgroup_operation
                { BTRFS_TREE_LOG_OBJECTID,      "TREE_LOG"      },      \
                { BTRFS_QUOTA_TREE_OBJECTID,    "QUOTA_TREE"    },      \
                { BTRFS_TREE_RELOC_OBJECTID,    "TREE_RELOC"    },      \
-               { BTRFS_UUID_TREE_OBJECTID,     "UUID_RELOC"    },      \
+               { BTRFS_UUID_TREE_OBJECTID,     "UUID_TREE"     },      \
+               { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },  \
                { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
  
  #define show_root_type(obj)                                           \
@@@ -1117,119 -1118,6 +1118,119 @@@ DEFINE_EVENT(btrfs__workqueue_done, btr
        TP_ARGS(wq)
  );
  
 +DECLARE_EVENT_CLASS(btrfs__qgroup_data_map,
 +
 +      TP_PROTO(struct inode *inode, u64 free_reserved),
 +
 +      TP_ARGS(inode, free_reserved),
 +
 +      TP_STRUCT__entry(
 +              __field(        u64,            rootid          )
 +              __field(        unsigned long,  ino             )
 +              __field(        u64,            free_reserved   )
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->rootid         =       BTRFS_I(inode)->root->objectid;
 +              __entry->ino            =       inode->i_ino;
 +              __entry->free_reserved  =       free_reserved;
 +      ),
 +
 +      TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu",
 +                __entry->rootid, __entry->ino, __entry->free_reserved)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map,
 +
 +      TP_PROTO(struct inode *inode, u64 free_reserved),
 +
 +      TP_ARGS(inode, free_reserved)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map,
 +
 +      TP_PROTO(struct inode *inode, u64 free_reserved),
 +
 +      TP_ARGS(inode, free_reserved)
 +);
 +
 +#define BTRFS_QGROUP_OPERATIONS                               \
 +      { QGROUP_RESERVE,       "reserve"       },      \
 +      { QGROUP_RELEASE,       "release"       },      \
 +      { QGROUP_FREE,          "free"          }
 +
 +DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
 +
 +      TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
 +
 +      TP_ARGS(inode, start, len, reserved, op),
 +
 +      TP_STRUCT__entry(
 +              __field(        u64,            rootid          )
 +              __field(        unsigned long,  ino             )
 +              __field(        u64,            start           )
 +              __field(        u64,            len             )
 +              __field(        u64,            reserved        )
 +              __field(        int,            op              )
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->rootid         = BTRFS_I(inode)->root->objectid;
 +              __entry->ino            = inode->i_ino;
 +              __entry->start          = start;
 +              __entry->len            = len;
 +              __entry->reserved       = reserved;
 +              __entry->op             = op;
 +      ),
 +
 +      TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s",
 +                __entry->rootid, __entry->ino, __entry->start, __entry->len,
 +                __entry->reserved,
 +                __print_flags((unsigned long)__entry->op, "",
 +                              BTRFS_QGROUP_OPERATIONS)
 +      )
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data,
 +
 +      TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
 +
 +      TP_ARGS(inode, start, len, reserved, op)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
 +
 +      TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
 +
 +      TP_ARGS(inode, start, len, reserved, op)
 +);
 +
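Each DEFINE_EVENT above expands into a trace_<event name>() call that kernel code invokes with the TP_PROTO arguments. A hypothetical call site in the qgroup data-reservation path (illustrative only, not part of this diff) would look like:

	/* after a successful set_record_extent_bits() in the reserve path */
	trace_btrfs_qgroup_reserve_data(inode, start, len,
					changeset.bytes_changed,
					QGROUP_RESERVE);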
 +DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
 +
 +      TP_PROTO(u64 ref_root, u64 reserved),
 +
 +      TP_ARGS(ref_root, reserved),
 +
 +      TP_STRUCT__entry(
 +              __field(        u64,            ref_root        )
 +              __field(        u64,            reserved        )
 +      ),
 +
 +      TP_fast_assign(
 +              __entry->ref_root       = ref_root;
 +              __entry->reserved       = reserved;
 +      ),
 +
 +      TP_printk("root=%llu, reserved=%llu, op=free",
 +                __entry->ref_root, __entry->reserved)
 +);
 +
 +DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
 +
 +      TP_PROTO(u64 ref_root, u64 reserved),
 +
 +      TP_ARGS(ref_root, reserved)
 +);
  #endif /* _TRACE_BTRFS_H */
  
  /* This part must be outside protection */