Merge branch 'integrity-check-patch-v2' of git://btrfs.giantdisaster.de/git/btrfs...

author Chris Mason <chris.mason@oracle.com>
Mon, 16 Jan 2012 20:27:58 +0000 (15:27 -0500)
committer Chris Mason <chris.mason@oracle.com>
Mon, 16 Jan 2012 20:27:58 +0000 (15:27 -0500)
diff --combined fs/btrfs/Makefile

index 70798407b9a2ebd48498319496efd1e5ba33789b,bc5b3556cee689f02d8e9a0fb8c7752ab77307fa..0c4fa2befae793f1a6845322d7ba71aaa5da4374
--- 1/fs/btrfs/Makefile
--- 2/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@@ -8,6 -8,7 +8,7 @@@ btrfs-y += super.o ctree.o extent-tree.
            extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
            export.o tree-log.o free-space-cache.o zlib.o lzo.o \
            compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- -         reada.o backref.o
+ +         reada.o backref.o ulist.o
   
   btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --combined fs/btrfs/ctree.h

index b6d1020c4571870660998d61ea4cf34cfbaaca35,39f6188688e602174de1d0e91029522baafa9d55..3c2cbf7b666355b1106fb3fae80451b66ac56086
--- 1/fs/btrfs/ctree.h
--- 2/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@@ -86,9 -86,6 +86,9 @@@ struct btrfs_ordered_sum
   /* holds checksums of all the data extents */
   #define BTRFS_CSUM_TREE_OBJECTID 7ULL
   
+ +/* for storing balance parameters in the root tree */
+ +#define BTRFS_BALANCE_OBJECTID -4ULL
+ +
   /* orhpan objectid for tracking unlinked/truncated files */
   #define BTRFS_ORPHAN_OBJECTID -5ULL
   
@@@ -695,54 -692,6 +695,54 @@@ struct btrfs_root_ref 
         __le16 name_len;
   } __attribute__ ((__packed__));
   
+ +struct btrfs_disk_balance_args {
+ +      /*
+ +       * profiles to operate on, single is denoted by
+ +       * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ +       */
+ +      __le64 profiles;
+ +
+ +      /* usage filter */
+ +      __le64 usage;
+ +
+ +      /* devid filter */
+ +      __le64 devid;
+ +
+ +      /* devid subset filter [pstart..pend) */
+ +      __le64 pstart;
+ +      __le64 pend;
+ +
+ +      /* btrfs virtual address space subset filter [vstart..vend) */
+ +      __le64 vstart;
+ +      __le64 vend;
+ +
+ +      /*
+ +       * profile to convert to, single is denoted by
+ +       * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ +       */
+ +      __le64 target;
+ +
+ +      /* BTRFS_BALANCE_ARGS_* */
+ +      __le64 flags;
+ +
+ +      __le64 unused[8];
+ +} __attribute__ ((__packed__));
+ +
+ +/*
+ + * store balance parameters to disk so that balance can be properly
+ + * resumed after crash or unmount
+ + */
+ +struct btrfs_balance_item {
+ +      /* BTRFS_BALANCE_* */
+ +      __le64 flags;
+ +
+ +      struct btrfs_disk_balance_args data;
+ +      struct btrfs_disk_balance_args meta;
+ +      struct btrfs_disk_balance_args sys;
+ +
+ +      __le64 unused[4];
+ +} __attribute__ ((__packed__));
+ +
   #define BTRFS_FILE_EXTENT_INLINE 0
   #define BTRFS_FILE_EXTENT_REG 1
   #define BTRFS_FILE_EXTENT_PREALLOC 2
@@@ -802,32 -751,14 +802,32 @@@ struct btrfs_csum_item 
   } __attribute__ ((__packed__));
   
   /* different types of block groups (and chunks) */
- -#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
- -#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
- -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
- -#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
- -#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
- -#define BTRFS_BLOCK_GROUP_DUP    (1 << 5)
- -#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
- -#define BTRFS_NR_RAID_TYPES      5
+ +#define BTRFS_BLOCK_GROUP_DATA                (1ULL << 0)
+ +#define BTRFS_BLOCK_GROUP_SYSTEM      (1ULL << 1)
+ +#define BTRFS_BLOCK_GROUP_METADATA    (1ULL << 2)
+ +#define BTRFS_BLOCK_GROUP_RAID0               (1ULL << 3)
+ +#define BTRFS_BLOCK_GROUP_RAID1               (1ULL << 4)
+ +#define BTRFS_BLOCK_GROUP_DUP         (1ULL << 5)
+ +#define BTRFS_BLOCK_GROUP_RAID10      (1ULL << 6)
+ +#define BTRFS_BLOCK_GROUP_RESERVED    BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ +#define BTRFS_NR_RAID_TYPES           5
+ +
+ +#define BTRFS_BLOCK_GROUP_TYPE_MASK   (BTRFS_BLOCK_GROUP_DATA |    \
+ +                                       BTRFS_BLOCK_GROUP_SYSTEM |  \
+ +                                       BTRFS_BLOCK_GROUP_METADATA)
+ +
+ +#define BTRFS_BLOCK_GROUP_PROFILE_MASK        (BTRFS_BLOCK_GROUP_RAID0 |   \
+ +                                       BTRFS_BLOCK_GROUP_RAID1 |   \
+ +                                       BTRFS_BLOCK_GROUP_DUP |     \
+ +                                       BTRFS_BLOCK_GROUP_RAID10)
+ +/*
+ + * We need a bit for restriper to be able to tell when chunks of type
+ + * SINGLE are available.  This "extended" profile format is used in
+ + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ + * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ + * to avoid remappings between two formats in future.
+ + */
+ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE  (1ULL << 48)
   
   struct btrfs_block_group_item {
         __le64 used;
@@@ -985,7 -916,6 +985,7 @@@ struct btrfs_block_group_cache 
   struct reloc_control;
   struct btrfs_device;
   struct btrfs_fs_devices;
+ +struct btrfs_balance_control;
   struct btrfs_delayed_root;
   struct btrfs_fs_info {
         u8 fsid[BTRFS_FSID_SIZE];
@@@ -1041,7 -971,7 +1041,7 @@@
          * is required instead of the faster short fsync log commits
          */
         u64 last_trans_log_full_commit;
-       unsigned long mount_opt:20;
+       unsigned long mount_opt:21;
         unsigned long compress_type:4;
         u64 max_inline;
         u64 alloc_start;
@@@ -1202,23 -1132,12 +1202,23 @@@
         spinlock_t ref_cache_lock;
         u64 total_ref_cache_size;
   
+ +      /*
+ +       * these three are in extended format (availability of single
+ +       * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ +       * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ +       */
         u64 avail_data_alloc_bits;
         u64 avail_metadata_alloc_bits;
         u64 avail_system_alloc_bits;
- -      u64 data_alloc_profile;
- -      u64 metadata_alloc_profile;
- -      u64 system_alloc_profile;
+ +
+ +      /* restriper state */
+ +      spinlock_t balance_lock;
+ +      struct mutex balance_mutex;
+ +      atomic_t balance_running;
+ +      atomic_t balance_pause_req;
+ +      atomic_t balance_cancel_req;
+ +      struct btrfs_balance_control *balance_ctl;
+ +      wait_queue_head_t balance_wait_q;
   
         unsigned data_chunk_allocations;
         unsigned metadata_ratio;
@@@ -1236,6 -1155,10 +1236,10 @@@
         int scrub_workers_refcnt;
         struct btrfs_workers scrub_workers;
   
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       u32 check_integrity_print_mask;
+ #endif
+ 
         /* filesystem state */
         u64 fs_state;
   
@@@ -1464,8 -1387,6 +1468,8 @@@ struct btrfs_ioctl_defrag_range_args 
   #define BTRFS_DEV_ITEM_KEY    216
   #define BTRFS_CHUNK_ITEM_KEY  228
   
+ +#define BTRFS_BALANCE_ITEM_KEY        248
+ +
   /*
    * string items are for debugging.  They just store a short string of
    * data in the FS
@@@ -1496,7 -1417,8 +1500,9 @@@
   #define BTRFS_MOUNT_AUTO_DEFRAG               (1 << 16)
   #define BTRFS_MOUNT_INODE_MAP_CACHE   (1 << 17)
   #define BTRFS_MOUNT_RECOVERY          (1 << 18)
- -#define BTRFS_MOUNT_CHECK_INTEGRITY   (1 << 19)
- -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 20)
+ +#define BTRFS_MOUNT_SKIP_BALANCE      (1 << 19)
++#define BTRFS_MOUNT_CHECK_INTEGRITY   (1 << 20)
++#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
   
   #define btrfs_clear_opt(o, opt)               ((o) &= ~BTRFS_MOUNT_##opt)
   #define btrfs_set_opt(o, opt)         ((o) |= BTRFS_MOUNT_##opt)
@@@ -2161,86 -2083,8 +2167,86 @@@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_u
   BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                    num_devices, 64);
   
- -/* struct btrfs_super_block */
+ +/* struct btrfs_balance_item */
+ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+ +
+ +static inline void btrfs_balance_data(struct extent_buffer *eb,
+ +                                    struct btrfs_balance_item *bi,
+ +                                    struct btrfs_disk_balance_args *ba)
+ +{
+ +      read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+ +}
   
+ +static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ +                                        struct btrfs_balance_item *bi,
+ +                                        struct btrfs_disk_balance_args *ba)
+ +{
+ +      write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+ +}
+ +
+ +static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ +                                    struct btrfs_balance_item *bi,
+ +                                    struct btrfs_disk_balance_args *ba)
+ +{
+ +      read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+ +}
+ +
+ +static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ +                                        struct btrfs_balance_item *bi,
+ +                                        struct btrfs_disk_balance_args *ba)
+ +{
+ +      write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+ +}
+ +
+ +static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ +                                   struct btrfs_balance_item *bi,
+ +                                   struct btrfs_disk_balance_args *ba)
+ +{
+ +      read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+ +}
+ +
+ +static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ +                                       struct btrfs_balance_item *bi,
+ +                                       struct btrfs_disk_balance_args *ba)
+ +{
+ +      write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+ +}
+ +
+ +static inline void
+ +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ +                             struct btrfs_disk_balance_args *disk)
+ +{
+ +      memset(cpu, 0, sizeof(*cpu));
+ +
+ +      cpu->profiles = le64_to_cpu(disk->profiles);
+ +      cpu->usage = le64_to_cpu(disk->usage);
+ +      cpu->devid = le64_to_cpu(disk->devid);
+ +      cpu->pstart = le64_to_cpu(disk->pstart);
+ +      cpu->pend = le64_to_cpu(disk->pend);
+ +      cpu->vstart = le64_to_cpu(disk->vstart);
+ +      cpu->vend = le64_to_cpu(disk->vend);
+ +      cpu->target = le64_to_cpu(disk->target);
+ +      cpu->flags = le64_to_cpu(disk->flags);
+ +}
+ +
+ +static inline void
+ +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ +                             struct btrfs_balance_args *cpu)
+ +{
+ +      memset(disk, 0, sizeof(*disk));
+ +
+ +      disk->profiles = cpu_to_le64(cpu->profiles);
+ +      disk->usage = cpu_to_le64(cpu->usage);
+ +      disk->devid = cpu_to_le64(cpu->devid);
+ +      disk->pstart = cpu_to_le64(cpu->pstart);
+ +      disk->pend = cpu_to_le64(cpu->pend);
+ +      disk->vstart = cpu_to_le64(cpu->vstart);
+ +      disk->vend = cpu_to_le64(cpu->vend);
+ +      disk->target = cpu_to_le64(cpu->target);
+ +      disk->flags = cpu_to_le64(cpu->flags);
+ +}
+ +
+ +/* struct btrfs_super_block */
   BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
   BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
   BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@@ -2439,11 -2283,11 +2445,11 @@@ struct extent_buffer *btrfs_alloc_free_
                                         struct btrfs_root *root, u32 blocksize,
                                         u64 parent, u64 root_objectid,
                                         struct btrfs_disk_key *key, int level,
- -                                      u64 hint, u64 empty_size);
+ +                                      u64 hint, u64 empty_size, int for_cow);
   void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct extent_buffer *buf,
- -                         u64 parent, int last_ref);
+ +                         u64 parent, int last_ref, int for_cow);
   struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             u64 bytenr, u32 blocksize,
@@@ -2463,17 -2307,17 +2469,17 @@@ int btrfs_reserve_extent(struct btrfs_t
                                   u64 search_end, struct btrfs_key *ins,
                                   u64 data);
   int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- -                struct extent_buffer *buf, int full_backref);
+ +                struct extent_buffer *buf, int full_backref, int for_cow);
   int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- -                struct extent_buffer *buf, int full_backref);
+ +                struct extent_buffer *buf, int full_backref, int for_cow);
   int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 u64 bytenr, u64 num_bytes, u64 flags,
                                 int is_data);
   int btrfs_free_extent(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
- -                    u64 bytenr, u64 num_bytes, u64 parent,
- -                    u64 root_objectid, u64 owner, u64 offset);
+ +                    u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ +                    u64 owner, u64 offset, int for_cow);
   
   int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
   int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@@ -2485,7 -2329,7 +2491,7 @@@ int btrfs_finish_extent_commit(struct b
   int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          u64 bytenr, u64 num_bytes, u64 parent,
- -                       u64 root_objectid, u64 owner, u64 offset);
+ +                       u64 root_objectid, u64 owner, u64 offset, int for_cow);
   
   int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root);
@@@ -2644,18 -2488,10 +2650,18 @@@ static inline int btrfs_insert_empty_it
   }
   
   int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+ +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+ +{
+ +      ++p->slots[0];
+ +      if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+ +              return btrfs_next_leaf(root, p);
+ +      return 0;
+ +}
   int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
   int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
   void btrfs_drop_snapshot(struct btrfs_root *root,
- -                       struct btrfs_block_rsv *block_rsv, int update_ref);
+ +                       struct btrfs_block_rsv *block_rsv, int update_ref,
+ +                       int for_reloc);
   int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         struct extent_buffer *node,
@@@ -2670,7 -2506,6 +2676,7 @@@ static inline int btrfs_fs_closing(stru
   }
   static inline void free_fs_info(struct btrfs_fs_info *fs_info)
   {
+ +      kfree(fs_info->balance_ctl);
         kfree(fs_info->delayed_root);
         kfree(fs_info->extent_root);
         kfree(fs_info->tree_root);
@@@ -2681,24 -2516,6 +2687,24 @@@
         kfree(fs_info->super_for_commit);
         kfree(fs_info);
   }
+ +/**
+ + * profile_is_valid - tests whether a given profile is valid and reduced
+ + * @flags: profile to validate
+ + * @extended: if true @flags is treated as an extended profile
+ + */
+ +static inline int profile_is_valid(u64 flags, int extended)
+ +{
+ +      u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ +
+ +      flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+ +      if (extended)
+ +              mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ +
+ +      if (flags & mask)
+ +              return 0;
+ +      /* true if zero or exactly one bit set */
+ +      return (flags & (~flags + 1)) == flags;
+ +}
   
   /* root-item.c */
   int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --combined fs/btrfs/disk-io.c

index 9be97716c5e068153b83a5e2d493c0d010c16710,f363c6d9c3de428977ad4a1d77aeae214ef5865a..da4457f84d78d7246ac8ce4f1640664f90d4e92f
--- 1/fs/btrfs/disk-io.c
--- 2/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -43,6 -43,7 +43,7 @@@
   #include "tree-log.h"
   #include "free-space-cache.h"
   #include "inode-map.h"
+ #include "check-integrity.h"
   
   static struct extent_io_ops btree_extent_io_ops;
   static void end_workqueue_fn(struct btrfs_work *work);
@@@ -1243,8 -1244,7 +1244,8 @@@ static struct btrfs_root *alloc_log_tre
         root->ref_cows = 0;
   
         leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- -                                    BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+ +                                    BTRFS_TREE_LOG_OBJECTID, NULL,
+ +                                    0, 0, 0, 0);
         if (IS_ERR(leaf)) {
                 kfree(root);
                 return ERR_CAST(leaf);
@@@ -2002,21 -2002,16 +2003,24 @@@ struct btrfs_root *open_ctree(struct su
         init_waitqueue_head(&fs_info->scrub_pause_wait);
         init_rwsem(&fs_info->scrub_super_lock);
         fs_info->scrub_workers_refcnt = 0;
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       fs_info->check_integrity_print_mask = 0;
+ #endif
   
+ +      spin_lock_init(&fs_info->balance_lock);
+ +      mutex_init(&fs_info->balance_mutex);
+ +      atomic_set(&fs_info->balance_running, 0);
+ +      atomic_set(&fs_info->balance_pause_req, 0);
+ +      atomic_set(&fs_info->balance_cancel_req, 0);
+ +      fs_info->balance_ctl = NULL;
+ +      init_waitqueue_head(&fs_info->balance_wait_q);
+ +
         sb->s_blocksize = 4096;
         sb->s_blocksize_bits = blksize_bits(4096);
         sb->s_bdi = &fs_info->bdi;
   
         fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- -      fs_info->btree_inode->i_nlink = 1;
+ +      set_nlink(fs_info->btree_inode, 1);
         /*
          * we set the i_size on the btree inode to the max possible int.
          * the real end of the address space is determined by all of
@@@ -2279,7 -2274,9 +2283,7 @@@
            (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
            BTRFS_UUID_SIZE);
   
- -      mutex_lock(&fs_info->chunk_mutex);
         ret = btrfs_read_chunk_tree(chunk_root);
- -      mutex_unlock(&fs_info->chunk_mutex);
         if (ret) {
                 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
                        sb->s_id);
@@@ -2328,6 -2325,9 +2332,6 @@@ retry_root_backup
   
         fs_info->generation = generation;
         fs_info->last_trans_committed = generation;
- -      fs_info->data_alloc_profile = (u64)-1;
- -      fs_info->metadata_alloc_profile = (u64)-1;
- -      fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
   
         ret = btrfs_init_space_info(fs_info);
         if (ret) {
@@@ -2360,6 -2360,19 +2364,19 @@@
                 btrfs_set_opt(fs_info->mount_opt, SSD);
         }
   
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+               ret = btrfsic_mount(tree_root, fs_devices,
+                                   btrfs_test_opt(tree_root,
+                                       CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+                                   1 : 0,
+                                   fs_info->check_integrity_print_mask);
+               if (ret)
+                       printk(KERN_WARNING "btrfs: failed to initialize"
+                              " integrity check module %s\n", sb->s_id);
+       }
+ #endif
+ 
         /* do not make disk changes in broken FS */
         if (btrfs_super_log_root(disk_super) != 0 &&
             !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@@ -2430,10 -2443,6 +2447,10 @@@
                 if (!err)
                         err = btrfs_orphan_cleanup(fs_info->tree_root);
                 up_read(&fs_info->cleanup_work_sem);
+ +
+ +              if (!err)
+ +                      err = btrfs_recover_balance(fs_info->tree_root);
+ +
                 if (err) {
                         close_ctree(tree_root);
                         return ERR_PTR(err);
@@@ -2642,7 -2651,7 +2659,7 @@@ static int write_dev_supers(struct btrf
                  * we fua the first super.  The others we allow
                  * to go down lazy.
                  */
-               ret = submit_bh(WRITE_FUA, bh);
+               ret = btrfsic_submit_bh(WRITE_FUA, bh);
                 if (ret)
                         errors++;
         }
@@@ -2719,7 -2728,7 +2736,7 @@@ static int write_dev_flush(struct btrfs
         device->flush_bio = bio;
   
         bio_get(bio);
-       submit_bio(WRITE_FLUSH, bio);
+       btrfsic_submit_bio(WRITE_FLUSH, bio);
   
         return 0;
   }
@@@ -2983,9 -2992,6 +3000,9 @@@ int close_ctree(struct btrfs_root *root
         fs_info->closing = 1;
         smp_mb();
   
+ +      /* pause restriper - we want to resume on mount */
+ +      btrfs_pause_balance(root->fs_info);
+ +
         btrfs_scrub_cancel(root);
   
         /* wait for any defraggers to finish */
@@@ -3068,6 -3074,11 +3085,11 @@@
         btrfs_stop_workers(&fs_info->caching_workers);
         btrfs_stop_workers(&fs_info->readahead_workers);
   
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(root, CHECK_INTEGRITY))
+               btrfsic_unmount(root, fs_info->fs_devices);
+ #endif
+ 
         btrfs_close_devices(fs_info->fs_devices);
         btrfs_mapping_tree_free(&fs_info->mapping_tree);
   
diff --combined fs/btrfs/extent_io.c

index 3622cc22ff919d4477f6b3f930ac2b86443f6ba7,246669296e0252298d3944095f32c1c1bf2bc521..9d09a4f81875817ebc45a7c5b80cbe6008061b22
--- 1/fs/btrfs/extent_io.c
--- 2/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@@ -18,6 -18,7 +18,7 @@@
   #include "ctree.h"
   #include "btrfs_inode.h"
   #include "volumes.h"
+ #include "check-integrity.h"
   
   static struct kmem_cache *extent_state_cache;
   static struct kmem_cache *extent_buffer_cache;
@@@ -1895,7 -1896,7 +1896,7 @@@ int repair_io_failure(struct btrfs_mapp
         }
         bio->bi_bdev = dev->bdev;
         bio_add_page(bio, page, length, start-page_offset(page));
-       submit_bio(WRITE_SYNC, bio);
+       btrfsic_submit_bio(WRITE_SYNC, bio);
         wait_for_completion(&compl);
   
         if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@@ -2393,7 -2394,7 +2394,7 @@@ static int submit_one_bio(int rw, struc
                 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
                                            mirror_num, bio_flags, start);
         else
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
   
         if (bio_flagged(bio, BIO_EOPNOTSUPP))
                 ret = -EOPNOTSUPP;
@@@ -3579,7 -3580,6 +3580,7 @@@ static struct extent_buffer *__alloc_ex
         atomic_set(&eb->blocking_writers, 0);
         atomic_set(&eb->spinning_readers, 0);
         atomic_set(&eb->spinning_writers, 0);
+ +      eb->lock_nested = 0;
         init_waitqueue_head(&eb->write_lock_wq);
         init_waitqueue_head(&eb->read_lock_wq);
   
diff --combined fs/btrfs/scrub.c

index 6a6a51a809ba1c56b83471ac38a825e9af9e3689,567e148caca2689a162a17bc59f1135cd329ba5a..9770cc5bfb76c6829f96924bb82f9b3b564ca646
--- 1/fs/btrfs/scrub.c
--- 2/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@@ -25,6 -25,7 +25,7 @@@
   #include "transaction.h"
   #include "backref.h"
   #include "extent_io.h"
+ #include "check-integrity.h"
   
   /*
    * This is only the first step towards a full-features scrub. It reads all
@@@ -309,7 -310,7 +310,7 @@@ static void scrub_print_warning(const c
         u8 ref_level;
         unsigned long ptr = 0;
         const int bufsize = 4096;
- -      u64 extent_offset;
+ +      u64 extent_item_pos;
   
         path = btrfs_alloc_path();
   
@@@ -329,13 -330,12 +330,13 @@@
         if (ret < 0)
                 goto out;
   
- -      extent_offset = swarn.logical - found_key.objectid;
+ +      extent_item_pos = swarn.logical - found_key.objectid;
         swarn.extent_item_size = found_key.offset;
   
         eb = path->nodes[0];
         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
         item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ +      btrfs_release_path(path);
   
         if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                 do {
@@@ -352,7 -352,7 +353,7 @@@
         } else {
                 swarn.path = path;
                 iterate_extent_inodes(fs_info, path, found_key.objectid,
- -                                      extent_offset,
+ +                                      extent_item_pos,
                                         scrub_print_warning_inode, &swarn);
         }
   
@@@ -733,7 -733,7 +734,7 @@@ static int scrub_fixup_io(int rw, struc
         bio_add_page(bio, page, PAGE_SIZE, 0);
         bio->bi_end_io = scrub_fixup_end_io;
         bio->bi_private = &complete;
-       submit_bio(rw, bio);
+       btrfsic_submit_bio(rw, bio);
   
         /* this will also unplug the queue */
         wait_for_completion(&complete);
@@@ -959,7 -959,7 +960,7 @@@ static int scrub_submit(struct scrub_de
         sdev->curr = -1;
         atomic_inc(&sdev->in_flight);
   
-       submit_bio(READ, sbio->bio);
+       btrfsic_submit_bio(READ, sbio->bio);
   
         return 0;
   }
diff --combined fs/btrfs/super.c

index 5a7227fa93804c7b78bb205d6af94b52c55e1b32,22a2015f1d7be5a973f247c9f5c3edf341e2c4e9..61717a4eb14f78e3aee3902a38659a754637767d
--- 1/fs/btrfs/super.c
--- 2/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@@ -164,8 -164,10 +164,10 @@@ enum 
         Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
         Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
         Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- -      Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
- -      Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+ +      Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+ +      Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+       Opt_check_integrity, Opt_check_integrity_including_extent_data,
+       Opt_check_integrity_print_mask,
         Opt_err,
   };
   
@@@ -201,7 -203,9 +203,10 @@@ static match_table_t tokens = 
         {Opt_inode_cache, "inode_cache"},
         {Opt_no_space_cache, "nospace_cache"},
         {Opt_recovery, "recovery"},
+ +      {Opt_skip_balance, "skip_balance"},
+       {Opt_check_integrity, "check_int"},
+       {Opt_check_integrity_including_extent_data, "check_int_data"},
+       {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
         {Opt_err, NULL},
   };
   
@@@ -400,9 -404,37 +405,40 @@@ int btrfs_parse_options(struct btrfs_ro
                         printk(KERN_INFO "btrfs: enabling auto recovery");
                         btrfs_set_opt(info->mount_opt, RECOVERY);
                         break;
+ +              case Opt_skip_balance:
+ +                      btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ +                      break;
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+               case Opt_check_integrity_including_extent_data:
+                       printk(KERN_INFO "btrfs: enabling check integrity"
+                              " including extent data\n");
+                       btrfs_set_opt(info->mount_opt,
+                                     CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity:
+                       printk(KERN_INFO "btrfs: enabling check integrity\n");
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity_print_mask:
+                       intarg = 0;
+                       match_int(&args[0], &intarg);
+                       if (intarg) {
+                               info->check_integrity_print_mask = intarg;
+                               printk(KERN_INFO "btrfs:"
+                                      " check_integrity_print_mask 0x%x\n",
+                                      info->check_integrity_print_mask);
+                       }
+                       break;
+ #else
+               case Opt_check_integrity_including_extent_data:
+               case Opt_check_integrity:
+               case Opt_check_integrity_print_mask:
+                       printk(KERN_ERR "btrfs: support for check_integrity*"
+                              " not compiled in!\n");
+                       ret = -EINVAL;
+                       goto out;
+ #endif
                 case Opt_err:
                         printk(KERN_INFO "btrfs: unrecognized mount option "
                                "'%s'\n", p);
@@@ -728,8 -760,6 +764,8 @@@ static int btrfs_show_options(struct se
                 seq_puts(seq, ",autodefrag");
         if (btrfs_test_opt(root, INODE_MAP_CACHE))
                 seq_puts(seq, ",inode_cache");
+ +      if (btrfs_test_opt(root, SKIP_BALANCE))
+ +              seq_puts(seq, ",skip_balance");
         return 0;
   }
   
@@@ -833,9 -863,13 +869,9 @@@ static char *setup_root_args(char *args
   static struct dentry *mount_subvol(const char *subvol_name, int flags,
                                    const char *device_name, char *data)
   {
- -      struct super_block *s;
         struct dentry *root;
         struct vfsmount *mnt;
- -      struct mnt_namespace *ns_private;
         char *newargs;
- -      struct path path;
- -      int error;
   
         newargs = setup_root_args(data);
         if (!newargs)
@@@ -846,17 -880,39 +882,17 @@@
         if (IS_ERR(mnt))
                 return ERR_CAST(mnt);
   
- -      ns_private = create_mnt_ns(mnt);
- -      if (IS_ERR(ns_private)) {
- -              mntput(mnt);
- -              return ERR_CAST(ns_private);
- -      }
+ +      root = mount_subtree(mnt, subvol_name);
   
- -      /*
- -       * This will trigger the automount of the subvol so we can just
- -       * drop the mnt we have here and return the dentry that we
- -       * found.
- -       */
- -      error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
- -                              LOOKUP_FOLLOW, &path);
- -      put_mnt_ns(ns_private);
- -      if (error)
- -              return ERR_PTR(error);
- -
- -      if (!is_subvolume_inode(path.dentry->d_inode)) {
- -              path_put(&path);
- -              mntput(mnt);
- -              error = -EINVAL;
+ +      if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ +              struct super_block *s = root->d_sb;
+ +              dput(root);
+ +              root = ERR_PTR(-EINVAL);
+ +              deactivate_locked_super(s);
                 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
                                 subvol_name);
- -              return ERR_PTR(-EINVAL);
         }
   
- -      /* Get a ref to the sb and the dentry we found and return it */
- -      s = path.mnt->mnt_sb;
- -      atomic_inc(&s->s_active);
- -      root = dget(path.dentry);
- -      path_put(&path);
- -      down_write(&s->s_umount);
- -
         return root;
   }
   
diff --combined fs/btrfs/volumes.c

index e0b7bb92a170c3ad529fca30ed9f6539646dbbe5,821334f6e3a1cab35bd7853c2c83f275efb30840..59e878f9fdcc6e6bf0ef0e325f3a2fce94011baa
--- 1/fs/btrfs/volumes.c
--- 2/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@@ -23,7 -23,6 +23,7 @@@
   #include <linux/random.h>
   #include <linux/iocontext.h>
   #include <linux/capability.h>
+ +#include <linux/kthread.h>
   #include <asm/div64.h>
   #include "compat.h"
   #include "ctree.h"
@@@ -33,6 -32,7 +33,7 @@@
   #include "print-tree.h"
   #include "volumes.h"
   #include "async-thread.h"
+ #include "check-integrity.h"
   
   static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
@@@ -247,7 -247,7 +248,7 @@@ loop_lock
                         sync_pending = 0;
                 }
   
-               submit_bio(cur->bi_rw, cur);
+               btrfsic_submit_bio(cur->bi_rw, cur);
                 num_run++;
                 batch_run++;
                 if (need_resched())
@@@ -830,6 -830,7 +831,6 @@@ out
   
   /*
    * find_free_dev_extent - find free space in the specified device
- - * @trans:    transaction handler
    * @device:   the device which we search the free space in
    * @num_bytes:        the size of the free space that we need
    * @start:    store the start of the free space.
@@@ -848,7 -849,8 +849,7 @@@
    * But if we don't find suitable free space, it is used to store the size of
    * the max free space.
    */
- -int find_free_dev_extent(struct btrfs_trans_handle *trans,
- -                       struct btrfs_device *device, u64 num_bytes,
+ +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                          u64 *start, u64 *len)
   {
         struct btrfs_key key;
@@@ -892,7 -894,7 +893,7 @@@
         key.offset = search_start;
         key.type = BTRFS_DEV_EXTENT_KEY;
   
- -      ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ +      ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
         if (ret < 0)
                 goto out;
         if (ret > 0) {
@@@ -1281,6 -1283,7 +1282,6 @@@ int btrfs_rm_device(struct btrfs_root *
         bool clear_super = false;
   
         mutex_lock(&uuid_mutex);
- -      mutex_lock(&root->fs_info->volume_mutex);
   
         all_avail = root->fs_info->avail_data_alloc_bits |
                 root->fs_info->avail_system_alloc_bits |
@@@ -1450,6 -1453,7 +1451,6 @@@ error_close
         if (bdev)
                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
   out:
- -      mutex_unlock(&root->fs_info->volume_mutex);
         mutex_unlock(&uuid_mutex);
         return ret;
   error_undo:
@@@ -1466,7 -1470,8 +1467,7 @@@
   /*
    * does all the dirty work required for changing file system's UUID.
    */
- -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
- -                              struct btrfs_root *root)
+ +static int btrfs_prepare_sprout(struct btrfs_root *root)
   {
         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
         struct btrfs_fs_devices *old_devices;
@@@ -1625,6 -1630,7 +1626,6 @@@ int btrfs_init_new_device(struct btrfs_
         }
   
         filemap_write_and_wait(bdev->bd_inode->i_mapping);
- -      mutex_lock(&root->fs_info->volume_mutex);
   
         devices = &root->fs_info->fs_devices->devices;
         /*
@@@ -1690,7 -1696,7 +1691,7 @@@
   
         if (seeding_dev) {
                 sb->s_flags &= ~MS_RDONLY;
- -              ret = btrfs_prepare_sprout(trans, root);
+ +              ret = btrfs_prepare_sprout(root);
                 BUG_ON(ret);
         }
   
@@@ -1752,7 -1758,8 +1753,7 @@@
                 ret = btrfs_relocate_sys_chunks(root);
                 BUG_ON(ret);
         }
- -out:
- -      mutex_unlock(&root->fs_info->volume_mutex);
+ +
         return ret;
   error:
         blkdev_put(bdev, FMODE_EXCL);
@@@ -1760,7 -1767,7 +1761,7 @@@
                 mutex_unlock(&uuid_mutex);
                 up_write(&sb->s_umount);
         }
- -      goto out;
+ +      return ret;
   }
   
   static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@@ -2071,362 -2078,6 +2072,362 @@@ error
         return ret;
   }
   
+ +static int insert_balance_item(struct btrfs_root *root,
+ +                             struct btrfs_balance_control *bctl)
+ +{
+ +      struct btrfs_trans_handle *trans;
+ +      struct btrfs_balance_item *item;
+ +      struct btrfs_disk_balance_args disk_bargs;
+ +      struct btrfs_path *path;
+ +      struct extent_buffer *leaf;
+ +      struct btrfs_key key;
+ +      int ret, err;
+ +
+ +      path = btrfs_alloc_path();
+ +      if (!path)
+ +              return -ENOMEM;
+ +
+ +      trans = btrfs_start_transaction(root, 0);
+ +      if (IS_ERR(trans)) {
+ +              btrfs_free_path(path);
+ +              return PTR_ERR(trans);
+ +      }
+ +
+ +      key.objectid = BTRFS_BALANCE_OBJECTID;
+ +      key.type = BTRFS_BALANCE_ITEM_KEY;
+ +      key.offset = 0;
+ +
+ +      ret = btrfs_insert_empty_item(trans, root, path, &key,
+ +                                    sizeof(*item));
+ +      if (ret)
+ +              goto out;
+ +
+ +      leaf = path->nodes[0];
+ +      item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+ +
+ +      memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+ +
+ +      btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+ +      btrfs_set_balance_data(leaf, item, &disk_bargs);
+ +      btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+ +      btrfs_set_balance_meta(leaf, item, &disk_bargs);
+ +      btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+ +      btrfs_set_balance_sys(leaf, item, &disk_bargs);
+ +
+ +      btrfs_set_balance_flags(leaf, item, bctl->flags);
+ +
+ +      btrfs_mark_buffer_dirty(leaf);
+ +out:
+ +      btrfs_free_path(path);
+ +      err = btrfs_commit_transaction(trans, root);
+ +      if (err && !ret)
+ +              ret = err;
+ +      return ret;
+ +}
+ +
+ +static int del_balance_item(struct btrfs_root *root)
+ +{
+ +      struct btrfs_trans_handle *trans;
+ +      struct btrfs_path *path;
+ +      struct btrfs_key key;
+ +      int ret, err;
+ +
+ +      path = btrfs_alloc_path();
+ +      if (!path)
+ +              return -ENOMEM;
+ +
+ +      trans = btrfs_start_transaction(root, 0);
+ +      if (IS_ERR(trans)) {
+ +              btrfs_free_path(path);
+ +              return PTR_ERR(trans);
+ +      }
+ +
+ +      key.objectid = BTRFS_BALANCE_OBJECTID;
+ +      key.type = BTRFS_BALANCE_ITEM_KEY;
+ +      key.offset = 0;
+ +
+ +      ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ +      if (ret < 0)
+ +              goto out;
+ +      if (ret > 0) {
+ +              ret = -ENOENT;
+ +              goto out;
+ +      }
+ +
+ +      ret = btrfs_del_item(trans, root, path);
+ +out:
+ +      btrfs_free_path(path);
+ +      err = btrfs_commit_transaction(trans, root);
+ +      if (err && !ret)
+ +              ret = err;
+ +      return ret;
+ +}
+ +
+ +/*
+ + * This is a heuristic used to reduce the number of chunks balanced on
+ + * resume after balance was interrupted.
+ + */
+ +static void update_balance_args(struct btrfs_balance_control *bctl)
+ +{
+ +      /*
+ +       * Turn on soft mode for chunk types that were being converted.
+ +       */
+ +      if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ +              bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ +      if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ +              bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ +      if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ +              bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ +
+ +      /*
+ +       * Turn on usage filter if it is not already used.  The idea is
+ +       * that chunks that we have already balanced should be
+ +       * reasonably full.  Don't do it for chunks that are being
+ +       * converted - that will keep us from relocating unconverted
+ +       * (albeit full) chunks.
+ +       */
+ +      if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ +          !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ +              bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ +              bctl->data.usage = 90;
+ +      }
+ +      if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ +          !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ +              bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ +              bctl->sys.usage = 90;
+ +      }
+ +      if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ +          !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ +              bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ +              bctl->meta.usage = 90;
+ +      }
+ +}
+ +
+ +/*
+ + * Should be called with both balance and volume mutexes held to
+ + * serialize other volume operations (add_dev/rm_dev/resize) with
+ + * restriper.  Same goes for unset_balance_control.
+ + */
+ +static void set_balance_control(struct btrfs_balance_control *bctl)
+ +{
+ +      struct btrfs_fs_info *fs_info = bctl->fs_info;
+ +
+ +      BUG_ON(fs_info->balance_ctl);
+ +
+ +      spin_lock(&fs_info->balance_lock);
+ +      fs_info->balance_ctl = bctl;
+ +      spin_unlock(&fs_info->balance_lock);
+ +}
+ +
+ +static void unset_balance_control(struct btrfs_fs_info *fs_info)
+ +{
+ +      struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ +
+ +      BUG_ON(!fs_info->balance_ctl);
+ +
+ +      spin_lock(&fs_info->balance_lock);
+ +      fs_info->balance_ctl = NULL;
+ +      spin_unlock(&fs_info->balance_lock);
+ +
+ +      kfree(bctl);
+ +}
+ +
+ +/*
+ + * Balance filters.  Return 1 if chunk should be filtered out
+ + * (should not be balanced).
+ + */
+ +static int chunk_profiles_filter(u64 chunk_profile,
+ +                               struct btrfs_balance_args *bargs)
+ +{
+ +      chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ +
+ +      if (chunk_profile == 0)
+ +              chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ +
+ +      if (bargs->profiles & chunk_profile)
+ +              return 0;
+ +
+ +      return 1;
+ +}
+ +
+ +static u64 div_factor_fine(u64 num, int factor)
+ +{
+ +      if (factor <= 0)
+ +              return 0;
+ +      if (factor >= 100)
+ +              return num;
+ +
+ +      num *= factor;
+ +      do_div(num, 100);
+ +      return num;
+ +}
+ +
+ +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ +                            struct btrfs_balance_args *bargs)
+ +{
+ +      struct btrfs_block_group_cache *cache;
+ +      u64 chunk_used, user_thresh;
+ +      int ret = 1;
+ +
+ +      cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ +      chunk_used = btrfs_block_group_used(&cache->item);
+ +
+ +      user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+ +      if (chunk_used < user_thresh)
+ +              ret = 0;
+ +
+ +      btrfs_put_block_group(cache);
+ +      return ret;
+ +}
+ +
+ +static int chunk_devid_filter(struct extent_buffer *leaf,
+ +                            struct btrfs_chunk *chunk,
+ +                            struct btrfs_balance_args *bargs)
+ +{
+ +      struct btrfs_stripe *stripe;
+ +      int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ +      int i;
+ +
+ +      for (i = 0; i < num_stripes; i++) {
+ +              stripe = btrfs_stripe_nr(chunk, i);
+ +              if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+ +                      return 0;
+ +      }
+ +
+ +      return 1;
+ +}
+ +
+ +/* [pstart, pend) */
+ +static int chunk_drange_filter(struct extent_buffer *leaf,
+ +                             struct btrfs_chunk *chunk,
+ +                             u64 chunk_offset,
+ +                             struct btrfs_balance_args *bargs)
+ +{
+ +      struct btrfs_stripe *stripe;
+ +      int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ +      u64 stripe_offset;
+ +      u64 stripe_length;
+ +      int factor;
+ +      int i;
+ +
+ +      if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+ +              return 0;
+ +
+ +      if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+ +           BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+ +              factor = 2;
+ +      else
+ +              factor = 1;
+ +      factor = num_stripes / factor;
+ +
+ +      for (i = 0; i < num_stripes; i++) {
+ +              stripe = btrfs_stripe_nr(chunk, i);
+ +              if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+ +                      continue;
+ +
+ +              stripe_offset = btrfs_stripe_offset(leaf, stripe);
+ +              stripe_length = btrfs_chunk_length(leaf, chunk);
+ +              do_div(stripe_length, factor);
+ +
+ +              if (stripe_offset < bargs->pend &&
+ +                  stripe_offset + stripe_length > bargs->pstart)
+ +                      return 0;
+ +      }
+ +
+ +      return 1;
+ +}
+ +
+ +/* [vstart, vend) */
+ +static int chunk_vrange_filter(struct extent_buffer *leaf,
+ +                             struct btrfs_chunk *chunk,
+ +                             u64 chunk_offset,
+ +                             struct btrfs_balance_args *bargs)
+ +{
+ +      if (chunk_offset < bargs->vend &&
+ +          chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+ +              /* at least part of the chunk is inside this vrange */
+ +              return 0;
+ +
+ +      return 1;
+ +}
+ +
+ +static int chunk_soft_convert_filter(u64 chunk_profile,
+ +                                   struct btrfs_balance_args *bargs)
+ +{
+ +      if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ +              return 0;
+ +
+ +      chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ +
+ +      if (chunk_profile == 0)
+ +              chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ +
+ +      if (bargs->target & chunk_profile)
+ +              return 1;
+ +
+ +      return 0;
+ +}
+ +
+ +static int should_balance_chunk(struct btrfs_root *root,
+ +                              struct extent_buffer *leaf,
+ +                              struct btrfs_chunk *chunk, u64 chunk_offset)
+ +{
+ +      struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ +      struct btrfs_balance_args *bargs = NULL;
+ +      u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+ +
+ +      /* type filter */
+ +      if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+ +            (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+ +              return 0;
+ +      }
+ +
+ +      if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ +              bargs = &bctl->data;
+ +      else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ +              bargs = &bctl->sys;
+ +      else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ +              bargs = &bctl->meta;
+ +
+ +      /* profiles filter */
+ +      if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+ +          chunk_profiles_filter(chunk_type, bargs)) {
+ +              return 0;
+ +      }
+ +
+ +      /* usage filter */
+ +      if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ +          chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+ +              return 0;
+ +      }
+ +
+ +      /* devid filter */
+ +      if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+ +          chunk_devid_filter(leaf, chunk, bargs)) {
+ +              return 0;
+ +      }
+ +
+ +      /* drange filter, makes sense only with devid filter */
+ +      if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+ +          chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+ +              return 0;
+ +      }
+ +
+ +      /* vrange filter */
+ +      if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+ +          chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+ +              return 0;
+ +      }
+ +
+ +      /* soft profile changing mode */
+ +      if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+ +          chunk_soft_convert_filter(chunk_type, bargs)) {
+ +              return 0;
+ +      }
+ +
+ +      return 1;
+ +}
+ +
   static u64 div_factor(u64 num, int factor)
   {
         if (factor == 10)
@@@ -2436,28 -2087,29 +2437,28 @@@
         return num;
   }
   
- -int btrfs_balance(struct btrfs_root *dev_root)
+ +static int __btrfs_balance(struct btrfs_fs_info *fs_info)
   {
- -      int ret;
- -      struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+ +      struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ +      struct btrfs_root *chunk_root = fs_info->chunk_root;
+ +      struct btrfs_root *dev_root = fs_info->dev_root;
+ +      struct list_head *devices;
         struct btrfs_device *device;
         u64 old_size;
         u64 size_to_free;
+ +      struct btrfs_chunk *chunk;
         struct btrfs_path *path;
         struct btrfs_key key;
- -      struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
- -      struct btrfs_trans_handle *trans;
         struct btrfs_key found_key;
- -
- -      if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
- -              return -EROFS;
- -
- -      if (!capable(CAP_SYS_ADMIN))
- -              return -EPERM;
- -
- -      mutex_lock(&dev_root->fs_info->volume_mutex);
- -      dev_root = dev_root->fs_info->dev_root;
+ +      struct btrfs_trans_handle *trans;
+ +      struct extent_buffer *leaf;
+ +      int slot;
+ +      int ret;
+ +      int enospc_errors = 0;
+ +      bool counting = true;
   
         /* step one make some room on all the devices */
+ +      devices = &fs_info->fs_devices->devices;
         list_for_each_entry(device, devices, dev_list) {
                 old_size = device->total_bytes;
                 size_to_free = div_factor(old_size, 1);
@@@ -2486,23 -2138,11 +2487,23 @@@
                 ret = -ENOMEM;
                 goto error;
         }
+ +
+ +      /* zero out stat counters */
+ +      spin_lock(&fs_info->balance_lock);
+ +      memset(&bctl->stat, 0, sizeof(bctl->stat));
+ +      spin_unlock(&fs_info->balance_lock);
+ +again:
         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
         key.offset = (u64)-1;
         key.type = BTRFS_CHUNK_ITEM_KEY;
   
         while (1) {
+ +              if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ +                  atomic_read(&fs_info->balance_cancel_req)) {
+ +                      ret = -ECANCELED;
+ +                      goto error;
+ +              }
+ +
                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                 if (ret < 0)
                         goto error;
@@@ -2512,19 -2152,15 +2513,19 @@@
                  * failed
                  */
                 if (ret == 0)
- -                      break;
+ +                      BUG(); /* FIXME break ? */
   
                 ret = btrfs_previous_item(chunk_root, path, 0,
                                           BTRFS_CHUNK_ITEM_KEY);
- -              if (ret)
+ +              if (ret) {
+ +                      ret = 0;
                         break;
+ +              }
+ +
+ +              leaf = path->nodes[0];
+ +              slot = path->slots[0];
+ +              btrfs_item_key_to_cpu(leaf, &found_key, slot);
   
- -              btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- -                                    path->slots[0]);
                 if (found_key.objectid != key.objectid)
                         break;
   
@@@ -2532,375 -2168,22 +2533,375 @@@
                 if (found_key.offset == 0)
                         break;
   
+ +              chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ +
+ +              if (!counting) {
+ +                      spin_lock(&fs_info->balance_lock);
+ +                      bctl->stat.considered++;
+ +                      spin_unlock(&fs_info->balance_lock);
+ +              }
+ +
+ +              ret = should_balance_chunk(chunk_root, leaf, chunk,
+ +                                         found_key.offset);
                 btrfs_release_path(path);
+ +              if (!ret)
+ +                      goto loop;
+ +
+ +              if (counting) {
+ +                      spin_lock(&fs_info->balance_lock);
+ +                      bctl->stat.expected++;
+ +                      spin_unlock(&fs_info->balance_lock);
+ +                      goto loop;
+ +              }
+ +
                 ret = btrfs_relocate_chunk(chunk_root,
                                            chunk_root->root_key.objectid,
                                            found_key.objectid,
                                            found_key.offset);
                 if (ret && ret != -ENOSPC)
                         goto error;
+ +              if (ret == -ENOSPC) {
+ +                      enospc_errors++;
+ +              } else {
+ +                      spin_lock(&fs_info->balance_lock);
+ +                      bctl->stat.completed++;
+ +                      spin_unlock(&fs_info->balance_lock);
+ +              }
+ +loop:
                 key.offset = found_key.offset - 1;
         }
- -      ret = 0;
+ +
+ +      if (counting) {
+ +              btrfs_release_path(path);
+ +              counting = false;
+ +              goto again;
+ +      }
   error:
         btrfs_free_path(path);
- -      mutex_unlock(&dev_root->fs_info->volume_mutex);
+ +      if (enospc_errors) {
+ +              printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+ +                     enospc_errors);
+ +              if (!ret)
+ +                      ret = -ENOSPC;
+ +      }
+ +
+ +      return ret;
+ +}
+ +
+ +static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+ +{
+ +      /* cancel requested || normal exit path */
+ +      return atomic_read(&fs_info->balance_cancel_req) ||
+ +              (atomic_read(&fs_info->balance_pause_req) == 0 &&
+ +               atomic_read(&fs_info->balance_cancel_req) == 0);
+ +}
+ +
+ +static void __cancel_balance(struct btrfs_fs_info *fs_info)
+ +{
+ +      int ret;
+ +
+ +      unset_balance_control(fs_info);
+ +      ret = del_balance_item(fs_info->tree_root);
+ +      BUG_ON(ret);
+ +}
+ +
+ +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ +                             struct btrfs_ioctl_balance_args *bargs);
+ +
+ +/*
+ + * Should be called with both balance and volume mutexes held
+ + */
+ +int btrfs_balance(struct btrfs_balance_control *bctl,
+ +                struct btrfs_ioctl_balance_args *bargs)
+ +{
+ +      struct btrfs_fs_info *fs_info = bctl->fs_info;
+ +      u64 allowed;
+ +      int ret;
+ +
+ +      if (btrfs_fs_closing(fs_info) ||
+ +          atomic_read(&fs_info->balance_pause_req) ||
+ +          atomic_read(&fs_info->balance_cancel_req)) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
+ +      /*
+ +       * In case of mixed groups both data and meta should be picked,
+ +       * and identical options should be given for both of them.
+ +       */
+ +      allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ +      if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ +          (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+ +              if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+ +                  !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+ +                  memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+ +                      printk(KERN_ERR "btrfs: with mixed groups data and "
+ +                             "metadata balance options must be the same\n");
+ +                      ret = -EINVAL;
+ +                      goto out;
+ +              }
+ +      }
+ +
+ +      /*
+ +       * Profile changing sanity checks.  Skip them if a simple
+ +       * balance is requested.
+ +       */
+ +      if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+ +            BTRFS_BALANCE_ARGS_CONVERT))
+ +              goto do_balance;
+ +
+ +      allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ +      if (fs_info->fs_devices->num_devices == 1)
+ +              allowed |= BTRFS_BLOCK_GROUP_DUP;
+ +      else if (fs_info->fs_devices->num_devices < 4)
+ +              allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+ +      else
+ +              allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ +                              BTRFS_BLOCK_GROUP_RAID10);
+ +
+ +      if (!profile_is_valid(bctl->data.target, 1) ||
+ +          bctl->data.target & ~allowed) {
+ +              printk(KERN_ERR "btrfs: unable to start balance with target "
+ +                     "data profile %llu\n",
+ +                     (unsigned long long)bctl->data.target);
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +      if (!profile_is_valid(bctl->meta.target, 1) ||
+ +          bctl->meta.target & ~allowed) {
+ +              printk(KERN_ERR "btrfs: unable to start balance with target "
+ +                     "metadata profile %llu\n",
+ +                     (unsigned long long)bctl->meta.target);
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +      if (!profile_is_valid(bctl->sys.target, 1) ||
+ +          bctl->sys.target & ~allowed) {
+ +              printk(KERN_ERR "btrfs: unable to start balance with target "
+ +                     "system profile %llu\n",
+ +                     (unsigned long long)bctl->sys.target);
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
+ +      if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
+ +              printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
+ +      /* allow reducing meta or sys integrity only if force is set */
+ +      allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ +                      BTRFS_BLOCK_GROUP_RAID10;
+ +      if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ +           (fs_info->avail_system_alloc_bits & allowed) &&
+ +           !(bctl->sys.target & allowed)) ||
+ +          ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ +           (fs_info->avail_metadata_alloc_bits & allowed) &&
+ +           !(bctl->meta.target & allowed))) {
+ +              if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ +                      printk(KERN_INFO "btrfs: force reducing metadata "
+ +                             "integrity\n");
+ +              } else {
+ +                      printk(KERN_ERR "btrfs: balance will reduce metadata "
+ +                             "integrity, use force if you want this\n");
+ +                      ret = -EINVAL;
+ +                      goto out;
+ +              }
+ +      }
+ +
+ +do_balance:
+ +      ret = insert_balance_item(fs_info->tree_root, bctl);
+ +      if (ret && ret != -EEXIST)
+ +              goto out;
+ +
+ +      if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ +              BUG_ON(ret == -EEXIST);
+ +              set_balance_control(bctl);
+ +      } else {
+ +              BUG_ON(ret != -EEXIST);
+ +              spin_lock(&fs_info->balance_lock);
+ +              update_balance_args(bctl);
+ +              spin_unlock(&fs_info->balance_lock);
+ +      }
+ +
+ +      atomic_inc(&fs_info->balance_running);
+ +      mutex_unlock(&fs_info->balance_mutex);
+ +
+ +      ret = __btrfs_balance(fs_info);
+ +
+ +      mutex_lock(&fs_info->balance_mutex);
+ +      atomic_dec(&fs_info->balance_running);
+ +
+ +      if (bargs) {
+ +              memset(bargs, 0, sizeof(*bargs));
+ +              update_ioctl_balance_args(fs_info, 0, bargs);
+ +      }
+ +
+ +      if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+ +          balance_need_close(fs_info)) {
+ +              __cancel_balance(fs_info);
+ +      }
+ +
+ +      wake_up(&fs_info->balance_wait_q);
+ +
+ +      return ret;
+ +out:
+ +      if (bctl->flags & BTRFS_BALANCE_RESUME)
+ +              __cancel_balance(fs_info);
+ +      else
+ +              kfree(bctl);
+ +      return ret;
+ +}
+ +
+ +static int balance_kthread(void *data)
+ +{
+ +      struct btrfs_balance_control *bctl =
+ +                      (struct btrfs_balance_control *)data;
+ +      struct btrfs_fs_info *fs_info = bctl->fs_info;
+ +      int ret = 0;
+ +
+ +      mutex_lock(&fs_info->volume_mutex);
+ +      mutex_lock(&fs_info->balance_mutex);
+ +
+ +      set_balance_control(bctl);
+ +
+ +      if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ +              printk(KERN_INFO "btrfs: force skipping balance\n");
+ +      } else {
+ +              printk(KERN_INFO "btrfs: continuing balance\n");
+ +              ret = btrfs_balance(bctl, NULL);
+ +      }
+ +
+ +      mutex_unlock(&fs_info->balance_mutex);
+ +      mutex_unlock(&fs_info->volume_mutex);
+ +      return ret;
+ +}
+ +
+ +int btrfs_recover_balance(struct btrfs_root *tree_root)
+ +{
+ +      struct task_struct *tsk;
+ +      struct btrfs_balance_control *bctl;
+ +      struct btrfs_balance_item *item;
+ +      struct btrfs_disk_balance_args disk_bargs;
+ +      struct btrfs_path *path;
+ +      struct extent_buffer *leaf;
+ +      struct btrfs_key key;
+ +      int ret;
+ +
+ +      path = btrfs_alloc_path();
+ +      if (!path)
+ +              return -ENOMEM;
+ +
+ +      bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ +      if (!bctl) {
+ +              ret = -ENOMEM;
+ +              goto out;
+ +      }
+ +
+ +      key.objectid = BTRFS_BALANCE_OBJECTID;
+ +      key.type = BTRFS_BALANCE_ITEM_KEY;
+ +      key.offset = 0;
+ +
+ +      ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ +      if (ret < 0)
+ +              goto out_bctl;
+ +      if (ret > 0) { /* ret = -ENOENT; */
+ +              ret = 0;
+ +              goto out_bctl;
+ +      }
+ +
+ +      leaf = path->nodes[0];
+ +      item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+ +
+ +      bctl->fs_info = tree_root->fs_info;
+ +      bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+ +
+ +      btrfs_balance_data(leaf, item, &disk_bargs);
+ +      btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ +      btrfs_balance_meta(leaf, item, &disk_bargs);
+ +      btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ +      btrfs_balance_sys(leaf, item, &disk_bargs);
+ +      btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+ +
+ +      tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+ +      if (IS_ERR(tsk))
+ +              ret = PTR_ERR(tsk);
+ +      else
+ +              goto out;
+ +
+ +out_bctl:
+ +      kfree(bctl);
+ +out:
+ +      btrfs_free_path(path);
         return ret;
   }
   
+ +int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+ +{
+ +      int ret = 0;
+ +
+ +      mutex_lock(&fs_info->balance_mutex);
+ +      if (!fs_info->balance_ctl) {
+ +              mutex_unlock(&fs_info->balance_mutex);
+ +              return -ENOTCONN;
+ +      }
+ +
+ +      if (atomic_read(&fs_info->balance_running)) {
+ +              atomic_inc(&fs_info->balance_pause_req);
+ +              mutex_unlock(&fs_info->balance_mutex);
+ +
+ +              wait_event(fs_info->balance_wait_q,
+ +                         atomic_read(&fs_info->balance_running) == 0);
+ +
+ +              mutex_lock(&fs_info->balance_mutex);
+ +              /* we are good with balance_ctl ripped off from under us */
+ +              BUG_ON(atomic_read(&fs_info->balance_running));
+ +              atomic_dec(&fs_info->balance_pause_req);
+ +      } else {
+ +              ret = -ENOTCONN;
+ +      }
+ +
+ +      mutex_unlock(&fs_info->balance_mutex);
+ +      return ret;
+ +}
+ +
+ +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+ +{
+ +      mutex_lock(&fs_info->balance_mutex);
+ +      if (!fs_info->balance_ctl) {
+ +              mutex_unlock(&fs_info->balance_mutex);
+ +              return -ENOTCONN;
+ +      }
+ +
+ +      atomic_inc(&fs_info->balance_cancel_req);
+ +      /*
+ +       * if we are running just wait and return, balance item is
+ +       * deleted in btrfs_balance in this case
+ +       */
+ +      if (atomic_read(&fs_info->balance_running)) {
+ +              mutex_unlock(&fs_info->balance_mutex);
+ +              wait_event(fs_info->balance_wait_q,
+ +                         atomic_read(&fs_info->balance_running) == 0);
+ +              mutex_lock(&fs_info->balance_mutex);
+ +      } else {
+ +              /* __cancel_balance needs volume_mutex */
+ +              mutex_unlock(&fs_info->balance_mutex);
+ +              mutex_lock(&fs_info->volume_mutex);
+ +              mutex_lock(&fs_info->balance_mutex);
+ +
+ +              if (fs_info->balance_ctl)
+ +                      __cancel_balance(fs_info);
+ +
+ +              mutex_unlock(&fs_info->volume_mutex);
+ +      }
+ +
+ +      BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ +      atomic_dec(&fs_info->balance_cancel_req);
+ +      mutex_unlock(&fs_info->balance_mutex);
+ +      return 0;
+ +}
+ +
   /*
    * shrinking a device means finding all of the device extents past
    * the new size, and then following the back refs to the chunks.
@@@ -3041,7 -2324,8 +3042,7 @@@ done
         return ret;
   }
   
- -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
- -                         struct btrfs_root *root,
+ +static int btrfs_add_system_chunk(struct btrfs_root *root,
                            struct btrfs_key *key,
                            struct btrfs_chunk *chunk, int item_size)
   {
@@@ -3158,11 -2442,7 +3159,11 @@@ static int __btrfs_alloc_chunk(struct b
                 max_stripe_size = 1024 * 1024 * 1024;
                 max_chunk_size = 10 * max_stripe_size;
         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- -              max_stripe_size = 256 * 1024 * 1024;
+ +              /* for larger filesystems, use larger metadata chunks */
+ +              if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ +                      max_stripe_size = 1024 * 1024 * 1024;
+ +              else
+ +                      max_stripe_size = 256 * 1024 * 1024;
                 max_chunk_size = max_stripe_size;
         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                 max_stripe_size = 8 * 1024 * 1024;
@@@ -3217,7 -2497,7 +3218,7 @@@
                 if (total_avail == 0)
                         continue;
   
- -              ret = find_free_dev_extent(trans, device,
+ +              ret = find_free_dev_extent(device,
                                            max_stripe_size * dev_stripes,
                                            &dev_offset, &max_avail);
                 if (ret && ret != -ENOSPC)
@@@ -3408,7 -2688,7 +3409,7 @@@ static int __finish_chunk_alloc(struct 
         BUG_ON(ret);
   
         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- -              ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+ +              ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
                                              item_size);
                 BUG_ON(ret);
         }
@@@ -3473,7 -2753,8 +3474,7 @@@ static noinline int init_first_rw_devic
                 return ret;
   
         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
- -                      (fs_info->metadata_alloc_profile &
- -                       fs_info->avail_metadata_alloc_bits);
+ +                              fs_info->avail_metadata_alloc_bits;
         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
   
         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@@ -3483,7 -2764,8 +3484,7 @@@
         sys_chunk_offset = chunk_offset + chunk_size;
   
         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
- -                      (fs_info->system_alloc_profile &
- -                       fs_info->avail_system_alloc_bits);
+ +                              fs_info->avail_system_alloc_bits;
         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
   
         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@@ -3620,13 -2902,26 +3621,13 @@@ static int __btrfs_map_block(struct btr
         u64 stripe_nr;
         u64 stripe_nr_orig;
         u64 stripe_nr_end;
- -      int stripes_allocated = 8;
- -      int stripes_required = 1;
         int stripe_index;
         int i;
+ +      int ret = 0;
         int num_stripes;
         int max_errors = 0;
         struct btrfs_bio *bbio = NULL;
   
- -      if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
- -              stripes_allocated = 1;
- -again:
- -      if (bbio_ret) {
- -              bbio = kzalloc(btrfs_bio_size(stripes_allocated),
- -                              GFP_NOFS);
- -              if (!bbio)
- -                      return -ENOMEM;
- -
- -              atomic_set(&bbio->error, 0);
- -      }
- -
         read_lock(&em_tree->lock);
         em = lookup_extent_mapping(em_tree, logical, *length);
         read_unlock(&em_tree->lock);
@@@ -3645,6 -2940,32 +3646,6 @@@
         if (mirror_num > map->num_stripes)
                 mirror_num = 0;
   
- -      /* if our btrfs_bio struct is too small, back off and try again */
- -      if (rw & REQ_WRITE) {
- -              if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- -                               BTRFS_BLOCK_GROUP_DUP)) {
- -                      stripes_required = map->num_stripes;
- -                      max_errors = 1;
- -              } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- -                      stripes_required = map->sub_stripes;
- -                      max_errors = 1;
- -              }
- -      }
- -      if (rw & REQ_DISCARD) {
- -              if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- -                               BTRFS_BLOCK_GROUP_RAID1 |
- -                               BTRFS_BLOCK_GROUP_DUP |
- -                               BTRFS_BLOCK_GROUP_RAID10)) {
- -                      stripes_required = map->num_stripes;
- -              }
- -      }
- -      if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
- -          stripes_allocated < stripes_required) {
- -              stripes_allocated = map->num_stripes;
- -              free_extent_map(em);
- -              kfree(bbio);
- -              goto again;
- -      }
         stripe_nr = offset;
         /*
          * stripe_nr counts the total number of stripes we have to stride
@@@ -3660,7 -2981,10 +3661,7 @@@
   
         if (rw & REQ_DISCARD)
                 *length = min_t(u64, em->len - offset, *length);
- -      else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- -                            BTRFS_BLOCK_GROUP_RAID1 |
- -                            BTRFS_BLOCK_GROUP_RAID10 |
- -                            BTRFS_BLOCK_GROUP_DUP)) {
+ +      else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                 /* we limit the length of each bio to what fits in a stripe */
                 *length = min_t(u64, em->len - offset,
                                 map->stripe_len - stripe_offset);
@@@ -3736,55 -3060,81 +3737,55 @@@
         }
         BUG_ON(stripe_index >= map->num_stripes);
   
+ +      bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+ +      if (!bbio) {
+ +              ret = -ENOMEM;
+ +              goto out;
+ +      }
+ +      atomic_set(&bbio->error, 0);
+ +
         if (rw & REQ_DISCARD) {
+ +              int factor = 0;
+ +              int sub_stripes = 0;
+ +              u64 stripes_per_dev = 0;
+ +              u32 remaining_stripes = 0;
+ +
+ +              if (map->type &
+ +                  (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ +                      if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ +                              sub_stripes = 1;
+ +                      else
+ +                              sub_stripes = map->sub_stripes;
+ +
+ +                      factor = map->num_stripes / sub_stripes;
+ +                      stripes_per_dev = div_u64_rem(stripe_nr_end -
+ +                                                    stripe_nr_orig,
+ +                                                    factor,
+ +                                                    &remaining_stripes);
+ +              }
+ +
                 for (i = 0; i < num_stripes; i++) {
                         bbio->stripes[i].physical =
                                 map->stripes[stripe_index].physical +
                                 stripe_offset + stripe_nr * map->stripe_len;
                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
   
- -                      if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- -                              u64 stripes;
- -                              u32 last_stripe = 0;
- -                              int j;
- -
- -                              div_u64_rem(stripe_nr_end - 1,
- -                                          map->num_stripes,
- -                                          &last_stripe);
- -
- -                              for (j = 0; j < map->num_stripes; j++) {
- -                                      u32 test;
- -
- -                                      div_u64_rem(stripe_nr_end - 1 - j,
- -                                                  map->num_stripes, &test);
- -                                      if (test == stripe_index)
- -                                              break;
- -                              }
- -                              stripes = stripe_nr_end - 1 - j;
- -                              do_div(stripes, map->num_stripes);
- -                              bbio->stripes[i].length = map->stripe_len *
- -                                      (stripes - stripe_nr + 1);
- -
- -                              if (i == 0) {
- -                                      bbio->stripes[i].length -=
- -                                              stripe_offset;
- -                                      stripe_offset = 0;
- -                              }
- -                              if (stripe_index == last_stripe)
- -                                      bbio->stripes[i].length -=
- -                                              stripe_end_offset;
- -                      } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- -                              u64 stripes;
- -                              int j;
- -                              int factor = map->num_stripes /
- -                                           map->sub_stripes;
- -                              u32 last_stripe = 0;
- -
- -                              div_u64_rem(stripe_nr_end - 1,
- -                                          factor, &last_stripe);
- -                              last_stripe *= map->sub_stripes;
- -
- -                              for (j = 0; j < factor; j++) {
- -                                      u32 test;
- -
- -                                      div_u64_rem(stripe_nr_end - 1 - j,
- -                                                  factor, &test);
- -
- -                                      if (test ==
- -                                          stripe_index / map->sub_stripes)
- -                                              break;
- -                              }
- -                              stripes = stripe_nr_end - 1 - j;
- -                              do_div(stripes, factor);
- -                              bbio->stripes[i].length = map->stripe_len *
- -                                      (stripes - stripe_nr + 1);
- -
- -                              if (i < map->sub_stripes) {
+ +                      if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ +                                       BTRFS_BLOCK_GROUP_RAID10)) {
+ +                              bbio->stripes[i].length = stripes_per_dev *
+ +                                                        map->stripe_len;
+ +                              if (i / sub_stripes < remaining_stripes)
+ +                                      bbio->stripes[i].length +=
+ +                                              map->stripe_len;
+ +                              if (i < sub_stripes)
                                         bbio->stripes[i].length -=
                                                 stripe_offset;
- -                                      if (i == map->sub_stripes - 1)
- -                                              stripe_offset = 0;
- -                              }
- -                              if (stripe_index >= last_stripe &&
- -                                  stripe_index <= (last_stripe +
- -                                                   map->sub_stripes - 1)) {
+ +                              if ((i / sub_stripes + 1) %
+ +                                  sub_stripes == remaining_stripes)
                                         bbio->stripes[i].length -=
                                                 stripe_end_offset;
- -                              }
+ +                              if (i == sub_stripes - 1)
+ +                                      stripe_offset = 0;
                         } else
                                 bbio->stripes[i].length = *length;
   
@@@ -3806,22 -3156,15 +3807,22 @@@
                         stripe_index++;
                 }
         }
- -      if (bbio_ret) {
- -              *bbio_ret = bbio;
- -              bbio->num_stripes = num_stripes;
- -              bbio->max_errors = max_errors;
- -              bbio->mirror_num = mirror_num;
+ +
+ +      if (rw & REQ_WRITE) {
+ +              if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ +                               BTRFS_BLOCK_GROUP_RAID10 |
+ +                               BTRFS_BLOCK_GROUP_DUP)) {
+ +                      max_errors = 1;
+ +              }
         }
+ +
+ +      *bbio_ret = bbio;
+ +      bbio->num_stripes = num_stripes;
+ +      bbio->max_errors = max_errors;
+ +      bbio->mirror_num = mirror_num;
   out:
         free_extent_map(em);
- -      return 0;
+ +      return ret;
   }
   
   int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@@ -3962,7 -3305,7 +3963,7 @@@ static noinline int schedule_bio(struc
         /* don't bother with additional async steps for reads, right now */
         if (!(rw & REQ_WRITE)) {
                 bio_get(bio);
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
                 bio_put(bio);
                 return 0;
         }
@@@ -4057,7 -3400,7 +4058,7 @@@ int btrfs_map_bio(struct btrfs_root *ro
                         if (async_submit)
                                 schedule_bio(root, dev, rw, bio);
                         else
-                               submit_bio(rw, bio);
+                               btrfsic_submit_bio(rw, bio);
                 } else {
                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
                         bio->bi_sector = logical >> 9;
@@@ -4226,7 -3569,7 +4227,7 @@@ static int open_seed_devices(struct btr
         struct btrfs_fs_devices *fs_devices;
         int ret;
   
- -      mutex_lock(&uuid_mutex);
+ +      BUG_ON(!mutex_is_locked(&uuid_mutex));
   
         fs_devices = root->fs_info->fs_devices->seed;
         while (fs_devices) {
@@@ -4264,6 -3607,7 +4265,6 @@@
         fs_devices->seed = root->fs_info->fs_devices->seed;
         root->fs_info->fs_devices->seed = fs_devices;
   out:
- -      mutex_unlock(&uuid_mutex);
         return ret;
   }
   
@@@ -4406,9 -3750,6 +4407,9 @@@ int btrfs_read_chunk_tree(struct btrfs_
         if (!path)
                 return -ENOMEM;
   
+ +      mutex_lock(&uuid_mutex);
+ +      lock_chunks(root);
+ +
         /* first we search for all of the device items, and then we
          * read in all of the chunk items.  This way we can create chunk
          * mappings that reference all of the devices that are afound
@@@ -4459,9 -3800,6 +4460,9 @@@ again
         }
         ret = 0;
   error:
+ +      unlock_chunks(root);
+ +      mutex_unlock(&uuid_mutex);
+ +
         btrfs_free_path(path);
         return ret;
   }
author	Chris Mason <chris.mason@oracle.com>
	Mon, 16 Jan 2012 20:27:58 +0000 (15:27 -0500)
committer	Chris Mason <chris.mason@oracle.com>
	Mon, 16 Jan 2012 20:27:58 +0000 (15:27 -0500)
		1	2
fs/btrfs/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/ctree.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/disk-io.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/extent_io.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/scrub.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/volumes.c	patch \|	diff1 \|	diff2 \|	blob \| history