* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (25 commits)
Btrfs: forced readonly mounts on errors
btrfs: Require CAP_SYS_ADMIN for filesystem rebalance
Btrfs: don't warn if we get ENOSPC in btrfs_block_rsv_check
btrfs: Fix memory leak in btrfs_read_fs_root_no_radix()
btrfs: check NULL or not
btrfs: Don't pass NULL ptr to func that may deref it.
btrfs: mount failure return value fix
btrfs: Mem leak in btrfs_get_acl()
btrfs: fix wrong free space information of btrfs
btrfs: make the chunk allocator utilize the devices better
btrfs: restructure find_free_dev_extent()
btrfs: fix wrong calculation of stripe size
btrfs: try to reclaim some space when chunk allocation fails
btrfs: fix wrong data space statistics
fs/btrfs: Fix build of ctree
Btrfs: fix off by one while setting block groups readonly
Btrfs: Add BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS ioctls
Btrfs: Add readonly snapshots support
Btrfs: Refactor btrfs_ioctl_snap_create()
btrfs: Extract duplicate decompress code
...
size = __btrfs_getxattr(inode, name, value, size);
if (size > 0) {
acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
+ if (IS_ERR(acl)) {
+ kfree(value);
return acl;
+ }
set_cached_acl(inode, type, acl);
}
kfree(value);
return ret;
}
-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
{
- struct posix_acl *acl;
int error = -EAGAIN;
- acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (flags & IPERM_FLAG_RCU) {
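+		/*
+		 * In RCU walk we may not block to read the ACL from disk;
+		 * unless a "no ACL" result is already cached, return -ECHILD
+		 * so the VFS retries the lookup in ref-walk mode.
+		 */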
+ if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+ error = -ECHILD;
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
+ } else {
+ struct posix_acl *acl;
+ acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ }
}
return error;
#define BTRFS_FSID_SIZE 16
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
+
+ /*
+ * File system states
+ */
+
+ /* Errors detected */
+ #define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
+
#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
+ #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/*
* A leaf is full of items. offset and size tell us where to find
} __attribute__ ((__packed__));
enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LAST = 2,
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_TYPES = 2,
+ BTRFS_COMPRESS_LAST = 3,
};
struct btrfs_inode_item {
u8 type;
} __attribute__ ((__packed__));
+ #define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
*/
u64 last_trans_log_full_commit;
u64 open_ioctl_trans;
- unsigned long mount_opt;
+ unsigned long mount_opt:20;
+ unsigned long compress_type:4;
u64 max_inline;
u64 alloc_start;
struct btrfs_transaction *running_transaction;
unsigned metadata_ratio;
void *bdev_holder;
+
+ /* filesystem state */
+ u64 fs_state;
};
/*
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
last_snapshot, 64);
+ static inline bool btrfs_root_readonly(struct btrfs_root *root)
+ {
+ return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+ }
+
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+ u64 start, u64 end);
+ int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno);
+
+ #define btrfs_std_error(fs_info, errno) \
+ do { \
+ if ((errno)) \
+ __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+ } while (0)
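A sketch of the intended usage (the call site below is hypothetical, not part of this series): a caller that fails a metadata update reports the error and lets the helper log it and force the filesystem read-only:

	ret = btrfs_insert_some_item(trans, root);	/* hypothetical caller */
	btrfs_std_error(root->fs_info, ret);		/* no-op when ret == 0 */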
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
#else
#define btrfs_check_acl NULL
#endif
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
+ static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only);
+ static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+ static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark);
+ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents);
+ static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ WARN_ON(1);
+ goto out;
+ }
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ ret = -EIO;
+ goto out;
+ }
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
}
btrfs_free_path(path);
if (ret) {
+ kfree(root);
if (ret > 0)
ret = -ENOENT;
return ERR_PTR(ret);
fs_info, BTRFS_ROOT_TREE_OBJECTID);
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh)
+ if (!bh) {
+ err = -EINVAL;
goto fail_iput;
+ }
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
if (!btrfs_super_root(disk_super))
goto fail_iput;
+	/* check the FS state to see whether the FS has been marked broken */
+ fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+ btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
}
features = btrfs_super_incompat_flags(disk_super);
- if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- btrfs_set_super_incompat_flags(disk_super, features);
- }
+ features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+ if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
btrfs_set_opt(fs_info->mount_opt, SSD);
}
- if (btrfs_super_log_root(disk_super) != 0) {
+ /* do not make disk changes in broken FS */
+ if (btrfs_super_log_root(disk_super) != 0 &&
+ !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+ if (printk_ratelimit()) {
printk(KERN_WARNING "lost page write due to "
"I/O error on %s\n",
bdevname(bh->b_bdev, b));
bh->b_end_io = btrfs_end_buffer_write_sync;
}
- if (i == last_barrier && do_barriers && device->barriers) {
- ret = submit_bh(WRITE_BARRIER, bh);
- if (ret == -EOPNOTSUPP) {
- printk("btrfs: disabling barriers on dev %s\n",
- device->name);
- set_buffer_uptodate(bh);
- device->barriers = 0;
- /* one reference for submit_bh */
- get_bh(bh);
- lock_buffer(bh);
- ret = submit_bh(WRITE_SYNC, bh);
- }
- } else {
+ if (i == last_barrier && do_barriers)
+ ret = submit_bh(WRITE_FLUSH_FUA, bh);
+ else
ret = submit_bh(WRITE_SYNC, bh);
- }
if (ret)
errors++;
smp_mb();
btrfs_put_block_group_cache(fs_info);
+
+ /*
+	 * There are two situations in which btrfs is broken and flips readonly:
+	 *
+	 * 1. btrfs flips readonly somewhere else before btrfs_commit_super:
+	 *    sb->s_flags has the MS_RDONLY flag set, and btrfs skips writing
+	 *    the sb directly to keep the ERROR state on disk.
+	 *
+	 * 2. btrfs flips readonly just in btrfs_commit_super: in that case
+	 *    btrfs cannot write the sb via btrfs_commit_super, and since
+	 *    fs_state has the BTRFS_SUPER_FLAG_ERROR flag set, btrfs cleans
+	 *    up all FS resources first and writes the sb afterwards.
+ */
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
- ret = btrfs_commit_super(root);
+ ret = btrfs_commit_super(root);
+ if (ret)
+ printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ }
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ ret = btrfs_error_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
return 0;
}
+ static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only)
+ {
+ if (read_only)
+ return;
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ printk(KERN_WARNING "warning: mount fs with errors, "
+ "running btrfsck is recommended\n");
+ }
+
+ int btrfs_error_commit_super(struct btrfs_root *root)
+ {
+ int ret;
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
+ /* cleanup FS via transaction */
+ btrfs_cleanup_transaction(root);
+
+ ret = write_ctree_super(NULL, root, 0);
+
+ return ret;
+ }
+
+ static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+ {
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+ }
+
+ static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+ {
+ struct list_head splice;
+ struct btrfs_ordered_extent *ordered;
+ struct inode *inode;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_extents, &splice);
+ while (!list_empty(&splice)) {
+ ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+ root_extent_list);
+
+ list_del_init(&ordered->root_extent_list);
+ atomic_inc(&ordered->refs);
+
+ /* the inode may be getting freed (in sys_unlink path). */
+ inode = igrab(ordered->inode);
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ if (inode)
+ iput(inode);
+
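+		/* force the refcount to 1 so the put below frees the ordered extent */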
+ atomic_set(&ordered->refs, 1);
+ btrfs_put_ordered_extent(ordered);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+ }
+
+ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
+ {
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ int ret = 0;
+
+ delayed_refs = &trans->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+	if (delayed_refs->num_entries == 0) {
+		/* don't leak delayed_refs->lock on the early return */
+		spin_unlock(&delayed_refs->lock);
+		printk(KERN_INFO "delayed_refs has NO entry\n");
+		return ret;
+ }
+
+ node = rb_first(&delayed_refs->root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+
+ atomic_set(&ref->refs, 1);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ mutex_lock(&head->mutex);
+ kfree(head->extent_op);
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+ list_del_init(&head->cluster);
+ mutex_unlock(&head->mutex);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_delayed_ref(ref);
+
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+
+ return ret;
+ }
+
+ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+ {
+ struct btrfs_pending_snapshot *snapshot;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ list_splice_init(&t->pending_snapshots, &splice);
+
+ while (!list_empty(&splice)) {
+ snapshot = list_entry(splice.next,
+ struct btrfs_pending_snapshot,
+ list);
+
+ list_del_init(&snapshot->list);
+
+ kfree(snapshot);
+ }
+
+ return 0;
+ }
+
+ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+ {
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+
+	/* splice under the lock that protects the delalloc list */
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ delalloc_inodes);
+
+ list_del_init(&btrfs_inode->delalloc_inodes);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ return 0;
+ }
+
+ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark)
+ {
+ int ret;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+ u64 start = 0;
+ u64 end;
+ u64 offset;
+ unsigned long index;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark);
+ if (ret)
+ break;
+
+ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+ offset = page_offset(page);
+
+ spin_lock(&dirty_pages->buffer_lock);
+ eb = radix_tree_lookup(
+ &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+ offset >> PAGE_CACHE_SHIFT);
+ spin_unlock(&dirty_pages->buffer_lock);
+ if (eb) {
+ ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags);
+ atomic_set(&eb->refs, 1);
+ }
+ if (PageWriteback(page))
+ end_page_writeback(page);
+
+ lock_page(page);
+ if (PageDirty(page)) {
+ clear_page_dirty_for_io(page);
+ spin_lock_irq(&page->mapping->tree_lock);
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irq(&page->mapping->tree_lock);
+ }
+
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ }
+ }
+
+ return ret;
+ }
+
+ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents)
+ {
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
+ unpin = pinned_extents;
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ /* opt_discard */
+ ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ btrfs_error_unpin_extent_range(root, start, end);
+ cond_resched();
+ }
+
+ return 0;
+ }
+
+ static int btrfs_cleanup_transaction(struct btrfs_root *root)
+ {
+ struct btrfs_transaction *t;
+ LIST_HEAD(list);
+
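+	/* this path only runs after an unrecoverable error; leave a backtrace in the log */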
+ WARN_ON(1);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ list_splice_init(&root->fs_info->trans_list, &list);
+ while (!list_empty(&list)) {
+ t = list_entry(list.next, struct btrfs_transaction, list);
+ if (!t)
+ break;
+
+ btrfs_destroy_ordered_operations(root);
+
+ btrfs_destroy_ordered_extents(root);
+
+ btrfs_destroy_delayed_refs(t, root);
+
+ btrfs_block_rsv_release(root,
+ &root->fs_info->trans_block_rsv,
+ t->dirty_pages.dirty_bytes);
+
+ /* FIXME: cleanup wait for commit */
+ t->in_commit = 1;
+ t->blocked = 1;
+ if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ t->blocked = 0;
+ if (waitqueue_active(&root->fs_info->transaction_wait))
+ wake_up(&root->fs_info->transaction_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ t->commit_done = 1;
+ if (waitqueue_active(&t->commit_wait))
+ wake_up(&t->commit_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+
+ btrfs_destroy_pending_snapshots(t);
+
+ btrfs_destroy_delalloc_inodes(root);
+
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->new_trans_lock);
+
+ btrfs_destroy_marked_extents(root, &t->dirty_pages,
+ EXTENT_DIRTY);
+
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
+
+ t->use_count = 0;
+ list_del_init(&t->list);
+ memset(t, 0, sizeof(*t));
+ kmem_cache_free(btrfs_transaction_cachep, t);
+ }
+
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ return 0;
+ }
+
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
}
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
return btrfs_reduce_alloc_profile(root, flags);
}
- static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
bytes + 2 * 1024 * 1024,
alloc_target, 0);
btrfs_end_transaction(trans, root);
- if (ret < 0)
- return ret;
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ return ret;
+ else
+ goto commit_trans;
+ }
if (!data_sinfo) {
btrfs_set_inode_space_info(root, inode);
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
+ commit_trans:
if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
return 0;
}
- WARN_ON(1);
- printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
- block_rsv->size, block_rsv->reserved,
- block_rsv->freed[0], block_rsv->freed[1]);
-
return -ENOSPC;
}
if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
sinfo->bytes_may_use + sinfo->bytes_readonly +
- cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+ cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
sinfo->bytes_reserved += cache->reserved_pinned;
cache->reserved_pinned = 0;
cache->ro = 1;
ret = 0;
}
+
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
return ret;
return ret;
}
+ /*
+ * helper to account the unused space of all the readonly block groups in
+ * the list. takes mirrors into account.
+ */
+ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+ {
+ struct btrfs_block_group_cache *block_group;
+ u64 free_bytes = 0;
+ int factor;
+
+ list_for_each_entry(block_group, groups_list, list) {
+ spin_lock(&block_group->lock);
+
+ if (!block_group->ro) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP))
+ factor = 2;
+ else
+ factor = 1;
+
+ free_bytes += (block_group->key.offset -
+ btrfs_block_group_used(&block_group->item)) *
+ factor;
+
+ spin_unlock(&block_group->lock);
+ }
+
+ return free_bytes;
+ }
+
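Worked example for the factor logic above: a read-only RAID1 block group spanning 1GiB (key.offset) with 256MiB recorded as used contributes (1GiB - 256MiB) * 2 = 1.5GiB to free_bytes, since every unused byte still occupies space on two mirrors.
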
+ /*
+ * helper to account the unused space of all the readonly block groups in
+ * the space_info. takes mirrors into account.
+ */
+ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+ {
+ int i;
+ u64 free_bytes = 0;
+
+ spin_lock(&sinfo->lock);
+
+ for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ if (!list_empty(&sinfo->block_groups[i]))
+ free_bytes += __btrfs_get_ro_block_group_free_space(
+ &sinfo->block_groups[i]);
+
+ spin_unlock(&sinfo->lock);
+
+ return free_bytes;
+ }
+
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 min_free = btrfs_block_group_used(&block_group->item);
- u64 dev_offset, max_avail;
+ u64 dev_offset;
/*
* check to make sure we can actually find a chunk with enough
*/
if (device->total_bytes > device->bytes_used + min_free) {
ret = find_free_dev_extent(NULL, device, min_free,
- &dev_offset, &max_avail);
+ &dev_offset, NULL);
if (!ret)
break;
ret = -1;
btrfs_free_path(path);
return ret;
}
+
+ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+ {
+ return unpin_extent_range(root, start, end);
+ }
+
+ int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+ {
+ return btrfs_discard_extent(root, bytenr, num_bytes);
+ }
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
this_bio_flag = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&this_bio_flag,
+ em->compress_type);
+ }
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
#endif
eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ if (eb == NULL)
+ return NULL;
eb->start = start;
eb->len = len;
spin_lock_init(&eb->lock);
init_waitqueue_head(&eb->lock_wq);
- INIT_RCU_HEAD(&eb->rcu_head);
#if LEAK_DEBUG
spin_lock_irqsave(&leak_lock, flags);
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret);
free_extent_map(split);
split->len = em->start + em->len - (start + len);
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
if (compressed) {
split->block_len = em->block_len;
if (err)
goto out;
+ /*
+	 * If BTRFS has flipped readonly due to an unrecoverable error
+ * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+ * although we have opened a file as writable, we have
+ * to stop this write operation to ensure FS consistency.
+ */
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ err = -EROFS;
+ goto out;
+ }
+
file_update_time(file);
BTRFS_I(inode)->sequence++;
return 0;
}
+static long btrfs_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct extent_state *cached_state = NULL;
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 locked_end;
+ u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+ struct extent_map *em;
+ int ret;
+
+ alloc_start = offset & ~mask;
+ alloc_end = (offset + len + mask) & ~mask;
+
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+ mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, alloc_end);
+ if (ret)
+ goto out;
+
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(inode, alloc_start);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
+ locked_end = alloc_end - 1;
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ /* the extent lock is ordered inside the running
+ * transaction
+ */
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+ locked_end, 0, &cached_state, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ alloc_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset < alloc_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+ /*
+ * we can't wait on the range with the transaction
+ * running or with the extent lock held
+ */
+ btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ } else {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ }
+
+ cur_offset = alloc_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ alloc_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), alloc_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ }
+ free_extent_map(em);
+
+ cur_offset = last_byte;
+ if (cur_offset >= alloc_end) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
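As a worked example of the alignment above: with a 4KiB sectorsize (mask = 4095), a call with offset = 5000 and len = 3000 gives alloc_start = 5000 & ~4095 = 4096 and alloc_end = (5000 + 3000 + 4095) & ~4095 = 8192, so the preallocation covers the aligned range [4096, 8192) containing the request.
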
const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.open = generic_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
+ .fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
size_t cur_size = size;
size_t datasize;
unsigned long offset;
- int use_compress = 0;
+ int compress_type = BTRFS_COMPRESS_NONE;
if (compressed_size && compressed_pages) {
- use_compress = 1;
+ compress_type = root->fs_info->compress_type;
cur_size = compressed_size;
}
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
ptr = btrfs_file_extent_inline_start(ei);
- if (use_compress) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
struct page *cpage;
int i = 0;
while (compressed_size > 0) {
compressed_size -= cur_size;
}
btrfs_set_file_extent_compression(leaf, ei,
- BTRFS_COMPRESS_ZLIB);
+ compress_type);
} else {
page = find_get_page(inode->i_mapping,
start >> PAGE_CACHE_SHIFT);
u64 compressed_size;
struct page **pages;
unsigned long nr_pages;
+ int compress_type;
struct list_head list;
};
u64 start, u64 ram_size,
u64 compressed_size,
struct page **pages,
- unsigned long nr_pages)
+ unsigned long nr_pages,
+ int compress_type)
{
struct async_extent *async_extent;
async_extent->compressed_size = compressed_size;
async_extent->pages = pages;
async_extent->nr_pages = nr_pages;
+ async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
}
unsigned long max_uncompressed = 128 * 1024;
int i;
int will_compress;
+ int compress_type = root->fs_info->compress_type;
actual_end = min_t(u64, isize, end + 1);
again:
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
- ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
- total_compressed, pages,
- nr_pages, &nr_pages_ret,
- &total_in,
- &total_compressed,
- max_compressed);
+ if (BTRFS_I(inode)->force_compress)
+ compress_type = BTRFS_I(inode)->force_compress;
+
+ ret = btrfs_compress_pages(compress_type,
+ inode->i_mapping, start,
+ total_compressed, pages,
+ nr_pages, &nr_pages_ret,
+ &total_in,
+ &total_compressed,
+ max_compressed);
if (!ret) {
unsigned long offset = total_compressed &
* and will submit them to the elevator.
*/
add_async_extent(async_cow, start, num_bytes,
- total_compressed, pages, nr_pages_ret);
+ total_compressed, pages, nr_pages_ret,
+ compress_type);
if (start + num_bytes < end) {
start += num_bytes;
__set_page_dirty_nobuffers(locked_page);
/* unlocked later on in the async handlers */
}
- add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+ add_async_extent(async_cow, start, end - start + 1,
+ 0, NULL, 0, BTRFS_COMPRESS_NONE);
*num_added += 1;
}
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
async_extent->ram_size - 1, 0);
}
- ret = btrfs_add_ordered_extent(inode, async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- BTRFS_ORDERED_COMPRESSED);
+ ret = btrfs_add_ordered_extent_compress(inode,
+ async_extent->start,
+ ins.objectid,
+ async_extent->ram_size,
+ ins.offset,
+ BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
BUG_ON(ret);
/*
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- int compressed = 0;
+ int compress_type = 0;
int ret;
bool nolock = false;
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
- compressed = 1;
+ compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
- BUG_ON(compressed);
+ BUG_ON(compress_type);
ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->disk_len,
ordered_extent->len,
ordered_extent->len,
- compressed, 0, 0,
+ compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags,
+ em->compress_type);
}
failrec->logical = logical;
free_extent_map(em);
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
int err;
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
err = inode_change_ok(inode, attr);
if (err)
return err;
p = &root->inode_tree.rb_node;
parent = NULL;
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
return;
spin_lock(&root->inode_lock);
int index;
int ret;
- dentry->d_op = &btrfs_dentry_operations;
-
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
return inode;
}
-static int btrfs_dentry_delete(struct dentry *dentry)
+static int btrfs_dentry_delete(const struct dentry *dentry)
{
struct btrfs_root *root;
}
btrfs_set_trans_block_group(trans, dir);
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
size_t max_size;
unsigned long inline_size;
unsigned long ptr;
+ int compress_type;
WARN_ON(pg_offset != 0);
+ compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf,
btrfs_item_nr(leaf, path->slots[0]));
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
- ret = btrfs_zlib_decompress(tmp, page, extent_offset,
- inline_size, max_size);
+ ret = btrfs_decompress(compress_type, tmp, page,
+ extent_offset, inline_size, max_size);
if (ret) {
char *kaddr = kmap_atomic(page, KM_USER0);
unsigned long copy_size = min_t(u64,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
- int compressed;
+ int compress_type;
again:
read_lock(&em_tree->lock);
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- compressed = btrfs_file_extent_compression(leaf, item);
+ compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
em->block_start = EXTENT_MAP_HOLE;
goto insert;
}
- if (compressed) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
em->block_start = bytenr;
em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
item);
em->len = (copy_size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
em->orig_start = EXTENT_MAP_INLINE;
- if (compressed)
+ if (compress_type) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
- if (btrfs_file_extent_compression(leaf, item) ==
- BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_file_extent_compression(leaf, item) !=
+ BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
ei->ordered_data_close = 0;
ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0;
- ei->force_compress = 0;
+ ei->force_compress = BTRFS_COMPRESS_NONE;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
return inode;
}
+static void btrfs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
void btrfs_destroy_inode(struct inode *inode)
{
struct btrfs_ordered_extent *ordered;
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
- kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+ call_rcu(&inode->i_rcu, btrfs_i_callback);
}
int btrfs_drop_inode(struct inode *inode)
min_size, actual_len, alloc_hint, trans);
}
-static long btrfs_fallocate(struct inode *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct extent_state *cached_state = NULL;
- u64 cur_offset;
- u64 last_byte;
- u64 alloc_start;
- u64 alloc_end;
- u64 alloc_hint = 0;
- u64 locked_end;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
- struct extent_map *em;
- int ret;
-
- alloc_start = offset & ~mask;
- alloc_end = (offset + len + mask) & ~mask;
-
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
- mutex_lock(&inode->i_mutex);
- ret = inode_newsize_ok(inode, alloc_end);
- if (ret)
- goto out;
-
- if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, alloc_start);
- if (ret)
- goto out;
- }
-
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
- if (ret)
- goto out;
-
- locked_end = alloc_end - 1;
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- /* the extent lock is ordered inside the running
- * transaction
- */
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state, GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- alloc_end - 1);
- if (ordered &&
- ordered->file_offset + ordered->len > alloc_start &&
- ordered->file_offset < alloc_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end,
- &cached_state, GFP_NOFS);
- /*
- * we can't wait on the range with the transaction
- * running or with the extent lock held
- */
- btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
- } else {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- }
-
- cur_offset = alloc_start;
- while (1) {
- em = btrfs_get_extent(inode, NULL, 0, cur_offset,
- alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR(em) || !em);
- last_byte = min(extent_map_end(em), alloc_end);
- last_byte = (last_byte + mask) & ~mask;
- if (em->block_start == EXTENT_MAP_HOLE ||
- (cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- if (ret < 0) {
- free_extent_map(em);
- break;
- }
- }
- free_extent_map(em);
-
- cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
- break;
- }
- }
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state, GFP_NOFS);
-
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
-out:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static int btrfs_set_page_dirty(struct page *page)
{
return __set_page_dirty_nobuffers(page);
}
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
+ return -EROFS;
if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
return -EACCES;
- return generic_permission(inode, mask, btrfs_check_acl);
+ return generic_permission(inode, mask, flags, btrfs_check_acl);
}
static const struct inode_operations btrfs_dir_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
- .fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
static const struct inode_operations btrfs_special_inode_operations = {
static const struct super_operations btrfs_super_ops;
+ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
+ char nbuf[16])
+ {
+ char *errstr = NULL;
+
+ switch (errno) {
+ case -EIO:
+ errstr = "IO failure";
+ break;
+ case -ENOMEM:
+ errstr = "Out of memory";
+ break;
+ case -EROFS:
+ errstr = "Readonly filesystem";
+ break;
+ default:
+ if (nbuf) {
+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+ errstr = nbuf;
+ }
+ break;
+ }
+
+ return errstr;
+ }
+
+ static void __save_error_info(struct btrfs_fs_info *fs_info)
+ {
+ /*
+ * today we only save the error info into ram. Long term we'll
+ * also send it down to the disk
+ */
+ fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+ }
+
+ /*
+  * NOTE: we defer the write_super work to umount time in order to
+  * avoid a deadlock, since umount holds all the locks.
+  */
+ static void save_error_info(struct btrfs_fs_info *fs_info)
+ {
+ __save_error_info(fs_info);
+ }
+
+ /* btrfs handle error by forcing the filesystem readonly */
+ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+ {
+ struct super_block *sb = fs_info->sb;
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ sb->s_flags |= MS_RDONLY;
+ printk(KERN_INFO "btrfs is forced readonly\n");
+ }
+ }
+
+ /*
+ * __btrfs_std_error decodes expected errors from the caller and
+  * invokes the appropriate error response.
+ */
+ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno)
+ {
+ struct super_block *sb = fs_info->sb;
+ char nbuf[16];
+ const char *errstr;
+
+ /*
+ * Special case: if the error is EROFS, and we're already
+ * under MS_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+ return;
+
+ errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ save_error_info(fs_info);
+
+ btrfs_handle_error(fs_info);
+ }
+
static void btrfs_put_super(struct super_block *sb)
{
struct btrfs_root *root = btrfs_sb(sb);
Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
- Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
- Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
- Opt_user_subvol_rm_allowed,
+ Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+ Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+ Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
};
static match_table_t tokens = {
{Opt_alloc_start, "alloc_start=%s"},
{Opt_thread_pool, "thread_pool=%d"},
{Opt_compress, "compress"},
+ {Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
+ {Opt_compress_force_type, "compress-force=%s"},
{Opt_ssd, "ssd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd, "nossd"},
char *p, *num, *orig;
int intarg;
int ret = 0;
+ char *compress_type;
+ bool compress_force = false;
if (!options)
return 0;
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
- case Opt_compress:
- printk(KERN_INFO "btrfs: use compression\n");
- btrfs_set_opt(info->mount_opt, COMPRESS);
- break;
case Opt_compress_force:
- printk(KERN_INFO "btrfs: forcing compression\n");
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ case Opt_compress_force_type:
+ compress_force = true;
+ case Opt_compress:
+ case Opt_compress_type:
+ if (token == Opt_compress ||
+ token == Opt_compress_force ||
+ strcmp(args[0].from, "zlib") == 0) {
+ compress_type = "zlib";
+ info->compress_type = BTRFS_COMPRESS_ZLIB;
+ } else if (strcmp(args[0].from, "lzo") == 0) {
+ compress_type = "lzo";
+ info->compress_type = BTRFS_COMPRESS_LZO;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
btrfs_set_opt(info->mount_opt, COMPRESS);
+ if (compress_force) {
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ pr_info("btrfs: force %s compression\n",
+ compress_type);
+ } else
+ pr_info("btrfs: use %s compression\n",
+ compress_type);
break;
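For example (illustrative command lines, not taken from this changelog): mounting with -o compress=lzo selects LZO, plain -o compress keeps the zlib default, and -o compress-force=zlib additionally sets FORCE_COMPRESS; any other type string makes the mount fail with -EINVAL.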
case Opt_ssd:
printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
+ sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
* Note: This is based on get_sb_bdev from fs/super.c with a few additions
* for multiple device setup. Make sure to keep it in sync.
*/
-static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
&subvol_name, &subvol_objectid,
&fs_devices);
if (error)
- return error;
+ return ERR_PTR(error);
error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
if (error)
root = new_root;
}
- mnt->mnt_sb = s;
- mnt->mnt_root = root;
-
kfree(subvol_name);
- return 0;
+ return root;
error_s:
error = PTR_ERR(s);
kfree(tree_root);
error_free_subvol_name:
kfree(subvol_name);
- return error;
+ return ERR_PTR(error);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+ /*
+ * Helper to calculate the free space on the devices that can be used to
+ * store file data.
+ */
+ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_device_info *devices_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 skip_space;
+ u64 type;
+ u64 avail_space;
+ u64 used_space;
+ u64 min_stripe_size;
+ int min_stripes = 1;
+ int i = 0, nr_devices;
+ int ret;
+
+ nr_devices = fs_info->fs_devices->rw_devices;
+ BUG_ON(!nr_devices);
+
+ devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+	/* calc the min stripe number for data space allocation */
+ type = btrfs_get_alloc_profile(root, 1);
+ if (type & BTRFS_BLOCK_GROUP_RAID0)
+ min_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ min_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ min_stripes = 4;
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+ else
+ min_stripe_size = BTRFS_STRIPE_LEN;
+
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ if (!device->in_fs_metadata)
+ continue;
+
+ avail_space = device->total_bytes - device->bytes_used;
+
+ /* align with stripe_len */
+ do_div(avail_space, BTRFS_STRIPE_LEN);
+ avail_space *= BTRFS_STRIPE_LEN;
+
+ /*
+	 * In order to avoid overwriting the superblock on the drive,
+ * btrfs starts at an offset of at least 1MB when doing chunk
+ * allocation.
+ */
+ skip_space = 1024 * 1024;
+
+ /* user can set the offset in fs_info->alloc_start. */
+ if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+ device->total_bytes)
+ skip_space = max(fs_info->alloc_start, skip_space);
+
+ /*
+ * btrfs can not use the free space in [0, skip_space - 1],
+ * we must subtract it from the total. In order to implement
+ * it, we account the used space in this range first.
+ */
+ ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
+ &used_space);
+ if (ret) {
+ kfree(devices_info);
+ return ret;
+ }
+
+ /* calc the free space in [0, skip_space - 1] */
+ skip_space -= used_space;
+
+ /*
+ * we can use the free space in [0, skip_space - 1], subtract
+ * it from the total.
+ */
+ if (avail_space && avail_space >= skip_space)
+ avail_space -= skip_space;
+ else
+ avail_space = 0;
+
+ if (avail_space < min_stripe_size)
+ continue;
+
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail_space;
+
+ i++;
+ }
+
+ nr_devices = i;
+
+ btrfs_descending_sort_devices(devices_info, nr_devices);
+
+ i = nr_devices - 1;
+ avail_space = 0;
+ while (nr_devices >= min_stripes) {
+ if (devices_info[i].max_avail >= min_stripe_size) {
+ int j;
+ u64 alloc_size;
+
+ avail_space += devices_info[i].max_avail * min_stripes;
+ alloc_size = devices_info[i].max_avail;
+ for (j = i + 1 - min_stripes; j <= i; j++)
+ devices_info[j].max_avail -= alloc_size;
+ }
+ i--;
+ nr_devices--;
+ }
+
+ kfree(devices_info);
+ *free_bytes = avail_space;
+ return 0;
+ }
+
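A worked example of the tail loop above, assuming RAID1 (min_stripes = 2) and two devices with 10GiB and 6GiB of usable space after alignment and skip_space: the array is sorted descending and the loop starts from the smallest entry. The 6GiB entry is paired with the larger device, avail_space grows by 6GiB * 2 = 12GiB of raw space, and 6GiB is subtracted from both entries; the 4GiB left on the larger device cannot form another RAID1 pair and is not counted.
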
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
- u64 total_used_data = 0;
+ u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
+ int ret;
+	/* hold the chunk_mutex to avoid allocating new chunks */
+ mutex_lock(&root->fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
- if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
- BTRFS_BLOCK_GROUP_SYSTEM))
- total_used_data += found->disk_total;
- else
- total_used_data += found->disk_used;
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ total_free_data += found->disk_total - found->disk_used;
+ total_free_data -=
+ btrfs_account_ro_block_groups_free_space(found);
+ }
+
total_used += found->disk_used;
}
rcu_read_unlock();
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_bavail = total_free_data;
+ ret = btrfs_calc_avail_data_space(root, &total_free_data);
+ if (ret) {
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ return ret;
+ }
+ buf->f_bavail += total_free_data;
+ buf->f_bavail = buf->f_bavail >> bits;
+ mutex_unlock(&root->fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
- .get_sb = btrfs_get_sb,
+ .mount = btrfs_mount,
.kill_sb = kill_anon_super,
.fs_flags = FS_REQUIRES_DEV,
};
.unlocked_ioctl = btrfs_control_ioctl,
.compat_ioctl = btrfs_control_ioctl,
.owner = THIS_MODULE,
+ .llseek = noop_llseek,
};
static struct miscdevice btrfs_misc = {
if (err)
return err;
- err = btrfs_init_cachep();
+ err = btrfs_init_compress();
if (err)
goto free_sysfs;
+ err = btrfs_init_cachep();
+ if (err)
+ goto free_compress;
+
err = extent_io_init();
if (err)
goto free_cachep;
extent_io_exit();
free_cachep:
btrfs_destroy_cachep();
+ free_compress:
+ btrfs_exit_compress();
free_sysfs:
btrfs_exit_sysfs();
return err;
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
- btrfs_zlib_exit();
+ btrfs_exit_compress();
}
module_init(init_btrfs_fs)
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
+ #include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
device->work.func = pending_bios_fn;
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
- device->barriers = 1;
spin_lock_init(&device->io_lock);
device->name = kstrdup(path, GFP_NOFS);
if (!device->name) {
device->devid = orig_dev->devid;
device->work.func = pending_bios_fn;
memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
- device->barriers = 1;
spin_lock_init(&device->io_lock);
INIT_LIST_HEAD(&device->dev_list);
INIT_LIST_HEAD(&device->dev_alloc_list);
continue;
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
fs_devices->open_devices--;
}
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
fs_devices->open_devices--;
}
if (device->writeable) {
int seeding = 1;
int ret = 0;
+ flags |= FMODE_EXCL;
+
list_for_each_entry(device, head, dev_list) {
if (device->bdev)
continue;
if (!device->name)
continue;
- bdev = open_bdev_exclusive(device->name, flags, holder);
+ bdev = blkdev_get_by_path(device->name, flags, holder);
if (IS_ERR(bdev)) {
printk(KERN_INFO "open %s failed\n", device->name);
goto error;
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
- if (!bh)
+ if (!bh) {
+ ret = -EINVAL;
goto error_close;
+ }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
error_brelse:
brelse(bh);
error_close:
- close_bdev_exclusive(bdev, FMODE_READ);
+ blkdev_put(bdev, flags);
error:
continue;
}
mutex_lock(&uuid_mutex);
- bdev = open_bdev_exclusive(path, flags, holder);
+ flags |= FMODE_EXCL;
+ bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto error_close;
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
brelse(bh);
error_close:
- close_bdev_exclusive(bdev, flags);
+ blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
return ret;
}
+ /* helper to account the used device space in the range */
+ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length)
+ {
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_path *path;
+ u64 extent_end;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+
+ *length = 0;
+
+ if (start >= device->total_bytes)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto out;
+ }
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ break;
+
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (key.offset <= start && extent_end > end) {
+ *length = end - start + 1;
+ break;
+ } else if (key.offset <= start && extent_end > start)
+ *length += extent_end - start;
+ else if (key.offset > start && extent_end <= end)
+ *length += extent_end - key.offset;
+ else if (key.offset > start && key.offset <= end) {
+ *length += end - key.offset + 1;
+ break;
+ } else if (key.offset > end)
+ break;
+
+ next:
+ path->slots[0]++;
+ }
+ ret = 0;
+ out:
+ btrfs_free_path(path);
+ return ret;
+ }
+
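A quick check of the overlap cases above: accounting the range [0, 1MiB - 1] on a device whose only dev extent starts at key.offset = 512KiB with length 1MiB gives extent_end = 1.5MiB, so the "key.offset > start && key.offset <= end" branch fires and adds end - key.offset + 1 = 512KiB to *length.
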
/*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans: transaction handler
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @start: store the start of the free space.
+ * @len:	the size of the free space that we find, or the size of the max
+ * 		free space if we don't find suitable free space
+ *
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
+ *
+ * @start is used to store the start of the free space if we find any. But if
+ * we don't find suitable free space, it will be used to store the start
+ * position of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
*/
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *max_avail)
+ u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
- struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
- u64 hole_size = 0;
- u64 last_byte = 0;
- u64 search_start = 0;
+ u64 hole_size;
+ u64 max_hole_start;
+ u64 max_hole_size;
+ u64 extent_end;
+ u64 search_start;
u64 search_end = device->total_bytes;
int ret;
- int slot = 0;
- int start_found;
+ int slot;
struct extent_buffer *l;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = 2;
- start_found = 0;
-
/* FIXME use last free of some kind */
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
- search_start = max((u64)1024 * 1024, search_start);
+ search_start = 1024 * 1024;
- if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+ if (root->fs_info->alloc_start + num_bytes <= search_end)
search_start = max(root->fs_info->alloc_start, search_start);
+ max_hole_start = search_start;
+ max_hole_size = 0;
+
+ if (search_start >= search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ path->reada = 2;
+
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
if (ret < 0)
- goto error;
+ goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
- goto error;
- if (ret > 0)
- start_found = 1;
+ goto out;
}
- l = path->nodes[0];
- btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (ret == 0)
continue;
if (ret < 0)
- goto error;
- no_more_items:
- if (!start_found) {
- if (search_start >= search_end) {
- ret = -ENOSPC;
- goto error;
- }
- *start = search_start;
- start_found = 1;
- goto check_pending;
- }
- *start = last_byte > search_start ?
- last_byte : search_start;
- if (search_end <= *start) {
- ret = -ENOSPC;
- goto error;
- }
- goto check_pending;
+ goto out;
+
+ break;
}
btrfs_item_key_to_cpu(l, &key, slot);
goto next;
if (key.objectid > device->devid)
- goto no_more_items;
+ break;
- if (key.offset >= search_start && key.offset > last_byte &&
- start_found) {
- if (last_byte < search_start)
- last_byte = search_start;
- hole_size = key.offset - last_byte;
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
- if (hole_size > *max_avail)
- *max_avail = hole_size;
+ if (key.offset > search_start) {
+ hole_size = key.offset - search_start;
+
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
- if (key.offset > last_byte &&
- hole_size >= num_bytes) {
- *start = last_byte;
- goto check_pending;
+ /*
+ * If this free space is greater than what we need, it
+ * must be the max free space that we have found so
+ * far, so max_hole_start points to the start of this
+ * free space and max_hole_size holds its length.
+ * Thus, we can return max_hole_start and
+ * max_hole_size to the caller right away.
+ */
+ if (hole_size >= num_bytes) {
+ ret = 0;
+ goto out;
}
}
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
- goto next;
- start_found = 1;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
- last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (extent_end > search_start)
+ search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
- check_pending:
- /* we have to make sure we didn't find an extent that has already
- * been allocated by the map tree or the original allocation
- */
- BUG_ON(*start < search_start);
- if (*start + num_bytes > search_end) {
- ret = -ENOSPC;
- goto error;
+ hole_size = search_end - search_start;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
}
- /* check for pending inserts here */
- ret = 0;
- error:
+ /* See above. */
+ if (hole_size < num_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+
+ out:
btrfs_free_path(path);
+ error:
+ *start = max_hole_start;
+ if (len)
+ *len = max_hole_size;
return ret;
}
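
The rewrite also changes the function's contract, as the comment above documents: even on -ENOSPC, *start and *len describe the largest hole that was found, so a caller can fall back to a smaller allocation. A sketch of such a caller (hypothetical usage, not taken from the patch):

u64 start, len;
int ret;

ret = find_free_dev_extent(trans, device, num_bytes, &start, &len);
if (ret == 0) {
        /* a hole of at least num_bytes begins at start */
} else if (ret == -ENOSPC && len > 0) {
        /* nothing big enough; the largest hole is [start, start + len) */
} else {
        /* hard error (e.g. -ENOMEM), propagate */
}

This is exactly the property __btrfs_alloc_tiny_space() below relies on when chunk allocation at the default size fails.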
goto out;
}
} else {
- bdev = open_bdev_exclusive(device_path, FMODE_READ,
- root->fs_info->bdev_holder);
+ bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
+ root->fs_info->bdev_holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto out;
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
device->fs_devices->open_devices--;
}
brelse(bh);
error_close:
if (bdev)
- close_bdev_exclusive(bdev, FMODE_READ);
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
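
These hunks follow the 2.6.38 block-layer API change: open_bdev_exclusive()/close_bdev_exclusive() are gone, and an exclusive open is now requested explicitly by adding FMODE_EXCL to the mode. The pattern, with the mode mirrored at release time (a sketch using the names from the surrounding code):

bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                          root->fs_info->bdev_holder);
if (IS_ERR(bdev))
        return PTR_ERR(bdev);
/* ... read the super block, etc. ... */
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);

Note that the device-removal path releases with whatever mode the device was originally opened with: blkdev_put(device->bdev, device->mode).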
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EINVAL;
- bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+ bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+ root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
- device->barriers = 1;
device->writeable = 1;
device->work.func = pending_bios_fn;
generate_random_uuid(device->uuid);
mutex_unlock(&root->fs_info->volume_mutex);
return ret;
error:
- close_bdev_exclusive(bdev, 0);
+ blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
mutex_lock(&dev_root->fs_info->volume_mutex);
dev_root = dev_root->fs_info->dev_root;
return calc_size * num_stripes;
}
- static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes, u64 *stripe_size,
- u64 start, u64 type)
+ /* Used to sort the devices by max_avail (descending sort) */
+ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct list_head *cur;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct list_head private_devs;
- int min_stripe_size = 1 * 1024 * 1024;
- u64 calc_size = 1024 * 1024 * 1024;
- u64 max_chunk_size = calc_size;
- u64 min_free;
- u64 avail;
- u64 max_avail = 0;
- u64 dev_offset;
- int num_stripes = 1;
- int min_stripes = 1;
- int sub_stripes = 0;
- int looped = 0;
- int ret;
- int index;
- int stripe_len = 64 * 1024;
+ if (((struct btrfs_device_info *)dev_info1)->max_avail >
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return -1;
+ else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return 1;
+ else
+ return 0;
+ }
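
Returning -1 when the first element is larger inverts the usual comparator convention, which is what makes sort() produce descending order. A self-contained userspace sketch with qsort(), using a stand-in for struct btrfs_device_info:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dev_info { uint64_t max_avail; };

/* -1 for the larger element => descending sort */
static int cmp_desc(const void *a, const void *b)
{
        uint64_t x = ((const struct dev_info *)a)->max_avail;
        uint64_t y = ((const struct dev_info *)b)->max_avail;

        return x > y ? -1 : (x < y ? 1 : 0);
}

int main(void)
{
        struct dev_info d[] = { {10}, {30}, {20} };

        qsort(d, 3, sizeof(d[0]), cmp_desc);
        printf("%llu %llu %llu\n",      /* prints: 30 20 10 */
               (unsigned long long)d[0].max_avail,
               (unsigned long long)d[1].max_avail,
               (unsigned long long)d[2].max_avail);
        return 0;
}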
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
- }
- if (list_empty(&fs_devices->alloc_list))
- return -ENOSPC;
+ static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+ int *num_stripes, int *min_stripes,
+ int *sub_stripes)
+ {
+ *num_stripes = 1;
+ *min_stripes = 1;
+ *sub_stripes = 0;
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- num_stripes = fs_devices->rw_devices;
- min_stripes = 2;
+ *num_stripes = fs_devices->rw_devices;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
if (fs_devices->rw_devices < 2)
return -ENOSPC;
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes = fs_devices->rw_devices;
- if (num_stripes < 4)
+ *num_stripes = fs_devices->rw_devices;
+ if (*num_stripes < 4)
return -ENOSPC;
- num_stripes &= ~(u32)1;
- sub_stripes = 2;
- min_stripes = 4;
+ *num_stripes &= ~(u32)1;
+ *sub_stripes = 2;
+ *min_stripes = 4;
}
+ return 0;
+ }
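
A worked example of the RAID10 branch (assuming fs_devices->rw_devices == 5): num_stripes starts at 5, passes the >= 4 check, and is rounded down to an even 4 by the &= ~(u32)1, giving two mirrored pairs striped together:

int num_stripes, min_stripes, sub_stripes;
int ret;

/* hypothetical fs_devices with rw_devices == 5 */
ret = __btrfs_calc_nstripes(fs_devices, BTRFS_BLOCK_GROUP_RAID10,
                            &num_stripes, &min_stripes, &sub_stripes);
/* ret == 0, num_stripes == 4, min_stripes == 4, sub_stripes == 2 */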
+
+ static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+ u64 proposed_size, u64 type,
+ int num_stripes, int small_stripe)
+ {
+ int min_stripe_size = 1 * 1024 * 1024;
+ u64 calc_size = proposed_size;
+ u64 max_chunk_size = calc_size;
+ int ncopies = 1;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID10))
+ ncopies = 2;
+
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_chunk_size = 10 * calc_size;
min_stripe_size = 64 * 1024 * 1024;
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
- again:
- max_avail = 0;
- if (!map || map->num_stripes != num_stripes) {
- kfree(map);
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map)
- return -ENOMEM;
- map->num_stripes = num_stripes;
- }
-
- if (calc_size * num_stripes > max_chunk_size) {
- calc_size = max_chunk_size;
+ if (calc_size * num_stripes > max_chunk_size * ncopies) {
+ calc_size = max_chunk_size * ncopies;
do_div(calc_size, num_stripes);
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
}
/* we don't want tiny stripes */
- if (!looped)
+ if (!small_stripe)
calc_size = max_t(u64, min_stripe_size, calc_size);
/*
- * we're about to do_div by the stripe_len so lets make sure
+ * we're about to do_div by BTRFS_STRIPE_LEN so let's make sure
* we end up with something bigger than a stripe
*/
- calc_size = max_t(u64, calc_size, stripe_len * 4);
+ calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
+
+ return calc_size;
+ }
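
To see why the ncopies factor matters, consider a RAID1 data chunk with the default 1GiB proposed_size on a filesystem whose total_rw_bytes is 5GiB (illustrative numbers, not from the patch):

/*
 * ncopies        = 2 (RAID1)
 * max_chunk_size = min(10 * 1GiB, div_factor(5GiB, 1)) = 512MiB
 * calc_size * num_stripes (2GiB) > max_chunk_size * ncopies (1GiB),
 * so calc_size = 1GiB / num_stripes = 512MiB, which is already a
 * multiple of BTRFS_STRIPE_LEN (64KiB): the stripe size is 512MiB.
 */

The old code compared against max_chunk_size alone, so mirrored profiles ended up with chunks half the intended size; that is the bug the "fix wrong calculation of stripe size" commit addresses.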
+
+ static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
+ int num_stripes)
+ {
+ struct map_lookup *new;
+ size_t len = map_lookup_size(num_stripes);
+
+ BUG_ON(map->num_stripes < num_stripes);
+
+ if (map->num_stripes == num_stripes)
+ return map;
+
+ new = kmalloc(len, GFP_NOFS);
+ if (!new) {
+ /* just change map->num_stripes */
+ map->num_stripes = num_stripes;
+ return map;
+ }
+
+ memcpy(new, map, len);
+ new->num_stripes = num_stripes;
+ kfree(map);
+ return new;
+ }
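
Note the deliberate fallback above: if kmalloc() fails, the old, larger buffer is reused and only its num_stripes count is lowered, so shrinking can never itself fail; the map merely stays bigger than strictly necessary.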
+
+ /*
+ * helper to allocate device space from btrfs_device_info, in which we have
+ * stored the max free space information of every device. It is used when we
+ * can not allocate chunks of the default size.
+ *
+ * With this helper, we can allocate a new chunk that is as large as possible.
+ */
+ static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device_info *devices,
+ int nr_device, u64 type,
+ struct map_lookup **map_lookup,
+ int min_stripes, u64 *stripe_size)
+ {
+ int i, index, sort_again = 0;
+ int min_devices = min_stripes;
+ u64 max_avail, min_free;
+ struct map_lookup *map = *map_lookup;
+ int ret;
+
+ if (nr_device < min_stripes)
+ return -ENOSPC;
+
+ btrfs_descending_sort_devices(devices, nr_device);
+
+ max_avail = devices[0].max_avail;
+ if (!max_avail)
+ return -ENOSPC;
+
+ for (i = 0; i < nr_device; i++) {
+ /*
+ * if dev_offset == 0, it means the free space of this
+ * device is less than what we need, and we haven't yet
+ * searched for the max avail extent on it, so do it now.
+ */
+ if (!devices[i].dev_offset) {
+ ret = find_free_dev_extent(trans, devices[i].dev,
+ max_avail,
+ &devices[i].dev_offset,
+ &devices[i].max_avail);
+ if (ret != 0 && ret != -ENOSPC)
+ return ret;
+ sort_again = 1;
+ }
+ }
+
+ /* we updated the max avail free extent of each device, so sort again */
+ if (sort_again)
+ btrfs_descending_sort_devices(devices, nr_device);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_devices = 1;
+
+ if (!devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ max_avail = devices[min_devices - 1].max_avail;
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ do_div(max_avail, 2);
+
+ max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+ min_stripes, 1);
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_free = max_avail * 2;
+ else
+ min_free = max_avail;
+
+ if (min_free > devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ map = __shrink_map_lookup_stripes(map, min_stripes);
+ *stripe_size = max_avail;
+
+ index = 0;
+ for (i = 0; i < min_stripes; i++) {
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset;
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
+ i++;
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset +
+ max_avail;
+ }
+ index++;
+ }
+ *map_lookup = map;
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ return 0;
+ }
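
A worked example of the DUP handling in this helper (assuming a single device whose largest hole is 1GiB at offset X): min_devices drops to 1, max_avail is halved to 512MiB so that both copies fit in the one hole, __btrfs_calc_stripe_size() rounds that to a BTRFS_STRIPE_LEN multiple, and the two stripes land on the same device:

/*
 * map->stripes[0].physical = X
 * map->stripes[1].physical = X + max_avail    (the second DUP copy)
 * min_free = max_avail * 2, which must still fit in the 1GiB hole
 */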
+
+ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup **map_ret,
+ u64 *num_bytes, u64 *stripe_size,
+ u64 start, u64 type)
+ {
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_device *device = NULL;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_device_info *devices_info;
+ struct list_head private_devs;
+ u64 calc_size = 1024 * 1024 * 1024;
+ u64 min_free;
+ u64 avail;
+ u64 dev_offset;
+ int num_stripes;
+ int min_stripes;
+ int sub_stripes;
+ int min_devices; /* the min number of devices we need */
+ int i;
+ int ret;
+ int index;
+
+ if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+ (type & BTRFS_BLOCK_GROUP_DUP)) {
+ WARN_ON(1);
+ type &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+ &min_stripes, &sub_stripes);
+ if (ret)
+ return ret;
+
+ devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ map->num_stripes = num_stripes;
cur = fs_devices->alloc_list.next;
index = 0;
+ i = 0;
- if (type & BTRFS_BLOCK_GROUP_DUP)
+ calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+ num_stripes, 0);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
min_free = calc_size * 2;
- else
+ min_devices = 1;
+ } else {
min_free = calc_size;
-
- /*
- * we add 1MB because we never use the first 1MB of the device, unless
- * we've looped, then we are likely allocating the maximum amount of
- * space left already
- */
- if (!looped)
- min_free += 1024 * 1024;
+ min_devices = min_stripes;
+ }
INIT_LIST_HEAD(&private_devs);
while (index < num_stripes) {
cur = cur->next;
if (device->in_fs_metadata && avail >= min_free) {
- ret = find_free_dev_extent(trans, device,
- min_free, &dev_offset,
- &max_avail);
+ ret = find_free_dev_extent(trans, device, min_free,
+ &devices_info[i].dev_offset,
+ &devices_info[i].max_avail);
if (ret == 0) {
list_move_tail(&device->dev_alloc_list,
&private_devs);
map->stripes[index].dev = device;
- map->stripes[index].physical = dev_offset;
+ map->stripes[index].physical =
+ devices_info[i].dev_offset;
index++;
if (type & BTRFS_BLOCK_GROUP_DUP) {
map->stripes[index].dev = device;
map->stripes[index].physical =
- dev_offset + calc_size;
+ devices_info[i].dev_offset +
+ calc_size;
index++;
}
- }
- } else if (device->in_fs_metadata && avail > max_avail)
- max_avail = avail;
+ } else if (ret != -ENOSPC)
+ goto error;
+
+ devices_info[i].dev = device;
+ i++;
+ } else if (device->in_fs_metadata &&
+ avail >= BTRFS_STRIPE_LEN) {
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail;
+ i++;
+ }
+
if (cur == &fs_devices->alloc_list)
break;
}
+
list_splice(&private_devs, &fs_devices->alloc_list);
if (index < num_stripes) {
if (index >= min_stripes) {
num_stripes /= sub_stripes;
num_stripes *= sub_stripes;
}
- looped = 1;
- goto again;
- }
- if (!looped && max_avail > 0) {
- looped = 1;
- calc_size = max_avail;
- goto again;
+
+ map = __shrink_map_lookup_stripes(map, num_stripes);
+ } else if (i >= min_devices) {
+ ret = __btrfs_alloc_tiny_space(trans, fs_devices,
+ devices_info, i, type,
+ &map, min_stripes,
+ &calc_size);
+ if (ret)
+ goto error;
+ } else {
+ ret = -ENOSPC;
+ goto error;
}
- kfree(map);
- return -ENOSPC;
}
map->sector_size = extent_root->sectorsize;
- map->stripe_len = stripe_len;
- map->io_align = stripe_len;
- map->io_width = stripe_len;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
- map->num_stripes = num_stripes;
map->sub_stripes = sub_stripes;
*map_ret = map;
*stripe_size = calc_size;
*num_bytes = chunk_bytes_by_type(type, calc_size,
- num_stripes, sub_stripes);
+ map->num_stripes, sub_stripes);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
- kfree(map);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
em->bdev = (struct block_device *)map;
em->start = start;
index++;
}
+ kfree(devices_info);
return 0;
+
+ error:
+ kfree(map);
+ kfree(devices_info);
+ return ret;
}
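
Taken together, the restructuring replaces the old looped/goto-again retry with a straight pipeline of the helpers above (a summary, not code from the patch):

/*
 * 1. __btrfs_calc_nstripes()    - derive num/min/sub stripes from type
 * 2. __btrfs_calc_stripe_size() - clamp the default 1GiB stripe size
 * 3. walk fs_devices->alloc_list, recording each device's largest hole
 *    in devices_info[] and filling map->stripes[] where it fits
 * 4. on shortfall, __shrink_map_lookup_stripes() if enough stripes were
 *    placed, otherwise __btrfs_alloc_tiny_space() to build the largest
 *    chunk the recorded holes allow
 */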
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
return NULL;
list_add(&device->dev_list,
&fs_devices->devices);
- device->barriers = 1;
device->dev_root = root->fs_info->dev_root;
device->devid = devid;
device->work.func = pending_bios_fn;
#define __BTRFS_VOLUMES_
#include <linux/bio.h>
+ #include <linux/sort.h>
#include "async-thread.h"
+ #define BTRFS_STRIPE_LEN (64 * 1024)
+
struct buffer_head;
struct btrfs_pending_bios {
struct bio *head;
int running_pending;
u64 generation;
- int barriers;
int writeable;
int in_fs_metadata;
int missing;
struct block_device *bdev;
- /* the mode sent to open_bdev_exclusive */
+ /* the mode sent to blkdev_get */
fmode_t mode;
char *name;
struct btrfs_bio_stripe stripes[];
};
+ struct btrfs_device_info {
+ struct btrfs_device *dev;
+ u64 dev_offset;
+ u64 max_avail;
+ };
+
+ /* Used to sort the devices by max_avail (descending sort) */
+ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+
+ /*
+ * sort the devices by max_avail, in which the max free extent size of each
+ * device is stored (descending sort).
+ */
+ static inline void btrfs_descending_sort_devices(
+ struct btrfs_device_info *devices,
+ size_t nr_devices)
+ {
+ sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_free_bytes, NULL);
+ }
+
+ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length);
+
#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))