* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (25 commits)
Btrfs: forced readonly mounts on errors
btrfs: Require CAP_SYS_ADMIN for filesystem rebalance
Btrfs: don't warn if we get ENOSPC in btrfs_block_rsv_check
btrfs: Fix memory leak in btrfs_read_fs_root_no_radix()
btrfs: check NULL or not
btrfs: Don't pass NULL ptr to func that may deref it.
btrfs: mount failure return value fix
btrfs: Mem leak in btrfs_get_acl()
btrfs: fix wrong free space information of btrfs
btrfs: make the chunk allocator utilize the devices better
btrfs: restructure find_free_dev_extent()
btrfs: fix wrong calculation of stripe size
btrfs: try to reclaim some space when chunk allocation fails
btrfs: fix wrong data space statistics
fs/btrfs: Fix build of ctree
Btrfs: fix off by one while setting block groups readonly
Btrfs: Add BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS ioctls
Btrfs: Add readonly snapshots support
Btrfs: Refactor btrfs_ioctl_snap_create()
btrfs: Extract duplicate decompress code
...
size = __btrfs_getxattr(inode, name, value, size);
if (size > 0) {
acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
+ if (IS_ERR(acl)) {
+ kfree(value);
return acl;
+ }
set_cached_acl(inode, type, acl);
}
kfree(value);
return ret;
}
-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
{
- struct posix_acl *acl;
int error = -EAGAIN;
- acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (flags & IPERM_FLAG_RCU) {
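+		/*
+		 * In RCU walk we may not block to read the ACL from disk;
+		 * unless a "no ACL" result is already cached, return -ECHILD
+		 * so the VFS retries the lookup in ref-walk mode.
+		 */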
+ if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+ error = -ECHILD;
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
+ } else {
+ struct posix_acl *acl;
+ acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ }
}
return error;
#define BTRFS_FSID_SIZE 16
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
+
+ /*
+ * File system states
+ */
+
+ /* Errors detected */
+ #define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
+
#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
+ #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/*
* A leaf is full of items. offset and size tell us where to find
} __attribute__ ((__packed__));
enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LAST = 2,
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_TYPES = 2,
+ BTRFS_COMPRESS_LAST = 3,
};
struct btrfs_inode_item {
u8 type;
} __attribute__ ((__packed__));
+ #define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
*/
u64 last_trans_log_full_commit;
u64 open_ioctl_trans;
- unsigned long mount_opt;
+ unsigned long mount_opt:20;
+ unsigned long compress_type:4;
u64 max_inline;
u64 alloc_start;
struct btrfs_transaction *running_transaction;
unsigned metadata_ratio;
void *bdev_holder;
+
+ /* filesystem state */
+ u64 fs_state;
};
/*
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
last_snapshot, 64);
+ static inline bool btrfs_root_readonly(struct btrfs_root *root)
+ {
+ return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+ }
+
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+ u64 start, u64 end);
+ int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno);
+
+ #define btrfs_std_error(fs_info, errno) \
+ do { \
+ if ((errno)) \
+ __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+ } while (0)
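A sketch of the intended usage (the call site below is hypothetical, not part of this series): a caller that fails a metadata update reports the error and lets the helper log it and force the filesystem read-only:

	ret = btrfs_insert_some_item(trans, root);	/* hypothetical caller */
	btrfs_std_error(root->fs_info, ret);		/* no-op when ret == 0 */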
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
#else
#define btrfs_check_acl NULL
#endif
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
+ static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only);
+ static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+ static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark);
+ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents);
+ static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ WARN_ON(1);
+ goto out;
+ }
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ ret = -EIO;
+ goto out;
+ }
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
}
btrfs_free_path(path);
if (ret) {
+ kfree(root);
if (ret > 0)
ret = -ENOENT;
return ERR_PTR(ret);
fs_info, BTRFS_ROOT_TREE_OBJECTID);
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh)
+ if (!bh) {
+ err = -EINVAL;
goto fail_iput;
+ }
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
if (!btrfs_super_root(disk_super))
goto fail_iput;
+	/* check the FS state to see whether the FS has been marked broken */
+ fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+ btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
}
features = btrfs_super_incompat_flags(disk_super);
- if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- btrfs_set_super_incompat_flags(disk_super, features);
- }
+ features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+ if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
btrfs_set_opt(fs_info->mount_opt, SSD);
}
- if (btrfs_super_log_root(disk_super) != 0) {
+ /* do not make disk changes in broken FS */
+ if (btrfs_super_log_root(disk_super) != 0 &&
+ !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+ if (printk_ratelimit()) {
printk(KERN_WARNING "lost page write due to "
"I/O error on %s\n",
bdevname(bh->b_bdev, b));
bh->b_end_io = btrfs_end_buffer_write_sync;
}
- if (i == last_barrier && do_barriers && device->barriers) {
- ret = submit_bh(WRITE_BARRIER, bh);
- if (ret == -EOPNOTSUPP) {
- printk("btrfs: disabling barriers on dev %s\n",
- device->name);
- set_buffer_uptodate(bh);
- device->barriers = 0;
- /* one reference for submit_bh */
- get_bh(bh);
- lock_buffer(bh);
- ret = submit_bh(WRITE_SYNC, bh);
- }
- } else {
+ if (i == last_barrier && do_barriers)
+ ret = submit_bh(WRITE_FLUSH_FUA, bh);
+ else
ret = submit_bh(WRITE_SYNC, bh);
- }
if (ret)
errors++;
smp_mb();
btrfs_put_block_group_cache(fs_info);
+
+ /*
+	 * There are two situations in which btrfs is broken and flips readonly:
+	 *
+	 * 1. btrfs flips readonly somewhere else before btrfs_commit_super:
+	 *    sb->s_flags has the MS_RDONLY flag set, and btrfs skips writing
+	 *    the sb directly to keep the ERROR state on disk.
+	 *
+	 * 2. btrfs flips readonly just in btrfs_commit_super: in that case
+	 *    btrfs cannot write the sb via btrfs_commit_super, and since
+	 *    fs_state has the BTRFS_SUPER_FLAG_ERROR flag set, btrfs cleans
+	 *    up all FS resources first and writes the sb afterwards.
+ */
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
- ret = btrfs_commit_super(root);
+ ret = btrfs_commit_super(root);
+ if (ret)
+ printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ }
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ ret = btrfs_error_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
return 0;
}
+ static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only)
+ {
+ if (read_only)
+ return;
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ printk(KERN_WARNING "warning: mount fs with errors, "
+ "running btrfsck is recommended\n");
+ }
+
+ int btrfs_error_commit_super(struct btrfs_root *root)
+ {
+ int ret;
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
+ /* cleanup FS via transaction */
+ btrfs_cleanup_transaction(root);
+
+ ret = write_ctree_super(NULL, root, 0);
+
+ return ret;
+ }
+
+ static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+ {
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+ }
+
+ static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+ {
+ struct list_head splice;
+ struct btrfs_ordered_extent *ordered;
+ struct inode *inode;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_extents, &splice);
+ while (!list_empty(&splice)) {
+ ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+ root_extent_list);
+
+ list_del_init(&ordered->root_extent_list);
+ atomic_inc(&ordered->refs);
+
+ /* the inode may be getting freed (in sys_unlink path). */
+ inode = igrab(ordered->inode);
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ if (inode)
+ iput(inode);
+
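+		/* force the refcount to 1 so the put below frees the ordered extent */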
+ atomic_set(&ordered->refs, 1);
+ btrfs_put_ordered_extent(ordered);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+ }
+
+ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
+ {
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ int ret = 0;
+
+ delayed_refs = &trans->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+	if (delayed_refs->num_entries == 0) {
+		/* don't leak delayed_refs->lock on the early return */
+		spin_unlock(&delayed_refs->lock);
+		printk(KERN_INFO "delayed_refs has NO entry\n");
+		return ret;
+ }
+
+ node = rb_first(&delayed_refs->root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+
+ atomic_set(&ref->refs, 1);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ mutex_lock(&head->mutex);
+ kfree(head->extent_op);
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+ list_del_init(&head->cluster);
+ mutex_unlock(&head->mutex);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_delayed_ref(ref);
+
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+
+ return ret;
+ }
+
+ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+ {
+ struct btrfs_pending_snapshot *snapshot;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ list_splice_init(&t->pending_snapshots, &splice);
+
+ while (!list_empty(&splice)) {
+ snapshot = list_entry(splice.next,
+ struct btrfs_pending_snapshot,
+ list);
+
+ list_del_init(&snapshot->list);
+
+ kfree(snapshot);
+ }
+
+ return 0;
+ }
+
+ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+ {
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+
+	/* splice under the lock that protects the delalloc list */
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ delalloc_inodes);
+
+ list_del_init(&btrfs_inode->delalloc_inodes);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ return 0;
+ }
+
+ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark)
+ {
+ int ret;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+ u64 start = 0;
+ u64 end;
+ u64 offset;
+ unsigned long index;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark);
+ if (ret)
+ break;
+
+ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+ offset = page_offset(page);
+
+ spin_lock(&dirty_pages->buffer_lock);
+ eb = radix_tree_lookup(
+ &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+ offset >> PAGE_CACHE_SHIFT);
+ spin_unlock(&dirty_pages->buffer_lock);
+ if (eb) {
+ ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags);
+ atomic_set(&eb->refs, 1);
+ }
+ if (PageWriteback(page))
+ end_page_writeback(page);
+
+ lock_page(page);
+ if (PageDirty(page)) {
+ clear_page_dirty_for_io(page);
+ spin_lock_irq(&page->mapping->tree_lock);
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irq(&page->mapping->tree_lock);
+ }
+
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ }
+ }
+
+ return ret;
+ }
+
+ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents)
+ {
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
+ unpin = pinned_extents;
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ /* opt_discard */
+ ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ btrfs_error_unpin_extent_range(root, start, end);
+ cond_resched();
+ }
+
+ return 0;
+ }
+
+ static int btrfs_cleanup_transaction(struct btrfs_root *root)
+ {
+ struct btrfs_transaction *t;
+ LIST_HEAD(list);
+
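+	/* this path only runs after an unrecoverable error; leave a backtrace in the log */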
+ WARN_ON(1);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ list_splice_init(&root->fs_info->trans_list, &list);
+ while (!list_empty(&list)) {
+ t = list_entry(list.next, struct btrfs_transaction, list);
+ if (!t)
+ break;
+
+ btrfs_destroy_ordered_operations(root);
+
+ btrfs_destroy_ordered_extents(root);
+
+ btrfs_destroy_delayed_refs(t, root);
+
+ btrfs_block_rsv_release(root,
+ &root->fs_info->trans_block_rsv,
+ t->dirty_pages.dirty_bytes);
+
+ /* FIXME: cleanup wait for commit */
+ t->in_commit = 1;
+ t->blocked = 1;
+ if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ t->blocked = 0;
+ if (waitqueue_active(&root->fs_info->transaction_wait))
+ wake_up(&root->fs_info->transaction_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ t->commit_done = 1;
+ if (waitqueue_active(&t->commit_wait))
+ wake_up(&t->commit_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+
+ btrfs_destroy_pending_snapshots(t);
+
+ btrfs_destroy_delalloc_inodes(root);
+
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->new_trans_lock);
+
+ btrfs_destroy_marked_extents(root, &t->dirty_pages,
+ EXTENT_DIRTY);
+
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
+
+ t->use_count = 0;
+ list_del_init(&t->list);
+ memset(t, 0, sizeof(*t));
+ kmem_cache_free(btrfs_transaction_cachep, t);
+ }
+
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ return 0;
+ }
+
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
}
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
return btrfs_reduce_alloc_profile(root, flags);
}
- static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
bytes + 2 * 1024 * 1024,
alloc_target, 0);
btrfs_end_transaction(trans, root);
- if (ret < 0)
- return ret;
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ return ret;
+ else
+ goto commit_trans;
+ }
if (!data_sinfo) {
btrfs_set_inode_space_info(root, inode);
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
+ commit_trans:
if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
return 0;
}
- WARN_ON(1);
- printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
- block_rsv->size, block_rsv->reserved,
- block_rsv->freed[0], block_rsv->freed[1]);
-
return -ENOSPC;
}
if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
sinfo->bytes_may_use + sinfo->bytes_readonly +
- cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+ cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
sinfo->bytes_reserved += cache->reserved_pinned;
cache->reserved_pinned = 0;
cache->ro = 1;
ret = 0;
}
+
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
return ret;
return ret;
}
+ /*
+ * helper to account the unused space of all the readonly block groups in
+ * the list. takes mirrors into account.
+ */
+ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+ {
+ struct btrfs_block_group_cache *block_group;
+ u64 free_bytes = 0;
+ int factor;
+
+ list_for_each_entry(block_group, groups_list, list) {
+ spin_lock(&block_group->lock);
+
+ if (!block_group->ro) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP))
+ factor = 2;
+ else
+ factor = 1;
+
+ free_bytes += (block_group->key.offset -
+ btrfs_block_group_used(&block_group->item)) *
+ factor;
+
+ spin_unlock(&block_group->lock);
+ }
+
+ return free_bytes;
+ }
+
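Worked example for the factor logic above: a read-only RAID1 block group spanning 1GiB (key.offset) with 256MiB recorded as used contributes (1GiB - 256MiB) * 2 = 1.5GiB to free_bytes, since every unused byte still occupies space on two mirrors.
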
+ /*
+ * helper to account the unused space of all the readonly block groups in
+ * the space_info. takes mirrors into account.
+ */
+ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+ {
+ int i;
+ u64 free_bytes = 0;
+
+ spin_lock(&sinfo->lock);
+
+ for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ if (!list_empty(&sinfo->block_groups[i]))
+ free_bytes += __btrfs_get_ro_block_group_free_space(
+ &sinfo->block_groups[i]);
+
+ spin_unlock(&sinfo->lock);
+
+ return free_bytes;
+ }
+
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 min_free = btrfs_block_group_used(&block_group->item);
- u64 dev_offset, max_avail;
+ u64 dev_offset;
/*
* check to make sure we can actually find a chunk with enough
*/
if (device->total_bytes > device->bytes_used + min_free) {
ret = find_free_dev_extent(NULL, device, min_free,
- &dev_offset, &max_avail);
+ &dev_offset, NULL);
if (!ret)
break;
ret = -1;
btrfs_free_path(path);
return ret;
}
+
+ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+ {
+ return unpin_extent_range(root, start, end);
+ }
+
+ int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+ {
+ return btrfs_discard_extent(root, bytenr, num_bytes);
+ }
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
this_bio_flag = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&this_bio_flag,
+ em->compress_type);
+ }
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
#endif
eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ if (eb == NULL)
+ return NULL;
eb->start = start;
eb->len = len;
spin_lock_init(&eb->lock);
init_waitqueue_head(&eb->lock_wq);
- INIT_RCU_HEAD(&eb->rcu_head);
#if LEAK_DEBUG
spin_lock_irqsave(&leak_lock, flags);
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret);
free_extent_map(split);
split->len = em->start + em->len - (start + len);
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
if (compressed) {
split->block_len = em->block_len;
if (err)
goto out;
+ /*
+	 * If BTRFS has flipped readonly due to an unrecoverable error
+ * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+ * although we have opened a file as writable, we have
+ * to stop this write operation to ensure FS consistency.
+ */
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ err = -EROFS;
+ goto out;
+ }
+
file_update_time(file);
BTRFS_I(inode)->sequence++;
return 0;
}
+static long btrfs_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct extent_state *cached_state = NULL;
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 locked_end;
+ u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+ struct extent_map *em;
+ int ret;
+
+ alloc_start = offset & ~mask;
+ alloc_end = (offset + len + mask) & ~mask;
+
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+ mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, alloc_end);
+ if (ret)
+ goto out;
+
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(inode, alloc_start);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
+ locked_end = alloc_end - 1;
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ /* the extent lock is ordered inside the running
+ * transaction
+ */
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+ locked_end, 0, &cached_state, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ alloc_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset < alloc_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+ /*
+ * we can't wait on the range with the transaction
+ * running or with the extent lock held
+ */
+ btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ } else {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ }
+
+ cur_offset = alloc_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ alloc_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), alloc_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ }
+ free_extent_map(em);
+
+ cur_offset = last_byte;
+ if (cur_offset >= alloc_end) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
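As a worked example of the alignment above: with a 4KiB sectorsize (mask = 4095), a call with offset = 5000 and len = 3000 gives alloc_start = 5000 & ~4095 = 4096 and alloc_end = (5000 + 3000 + 4095) & ~4095 = 8192, so the preallocation covers the aligned range [4096, 8192) containing the request.
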
const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.open = generic_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
+ .fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
size_t cur_size = size;
size_t datasize;
unsigned long offset;
- int use_compress = 0;
+ int compress_type = BTRFS_COMPRESS_NONE;
if (compressed_size && compressed_pages) {
- use_compress = 1;
+ compress_type = root->fs_info->compress_type;
cur_size = compressed_size;
}
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
ptr = btrfs_file_extent_inline_start(ei);
- if (use_compress) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
struct page *cpage;
int i = 0;
while (compressed_size > 0) {
compressed_size -= cur_size;
}
btrfs_set_file_extent_compression(leaf, ei,
- BTRFS_COMPRESS_ZLIB);
+ compress_type);
} else {
page = find_get_page(inode->i_mapping,
start >> PAGE_CACHE_SHIFT);
u64 compressed_size;
struct page **pages;
unsigned long nr_pages;
+ int compress_type;
struct list_head list;
};
u64 start, u64 ram_size,
u64 compressed_size,
struct page **pages,
- unsigned long nr_pages)
+ unsigned long nr_pages,
+ int compress_type)
{
struct async_extent *async_extent;
async_extent->compressed_size = compressed_size;
async_extent->pages = pages;
async_extent->nr_pages = nr_pages;
+ async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
}
unsigned long max_uncompressed = 128 * 1024;
int i;
int will_compress;
+ int compress_type = root->fs_info->compress_type;
actual_end = min_t(u64, isize, end + 1);
again:
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
- ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
- total_compressed, pages,
- nr_pages, &nr_pages_ret,
- &total_in,
- &total_compressed,
- max_compressed);
+ if (BTRFS_I(inode)->force_compress)
+ compress_type = BTRFS_I(inode)->force_compress;
+
+ ret = btrfs_compress_pages(compress_type,
+ inode->i_mapping, start,
+ total_compressed, pages,
+ nr_pages, &nr_pages_ret,
+ &total_in,
+ &total_compressed,
+ max_compressed);
if (!ret) {
unsigned long offset = total_compressed &
* and will submit them to the elevator.
*/
add_async_extent(async_cow, start, num_bytes,
- total_compressed, pages, nr_pages_ret);
+ total_compressed, pages, nr_pages_ret,
+ compress_type);
if (start + num_bytes < end) {
start += num_bytes;
__set_page_dirty_nobuffers(locked_page);
/* unlocked later on in the async handlers */
}
- add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+ add_async_extent(async_cow, start, end - start + 1,
+ 0, NULL, 0, BTRFS_COMPRESS_NONE);
*num_added += 1;
}
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
async_extent->ram_size - 1, 0);
}
- ret = btrfs_add_ordered_extent(inode, async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- BTRFS_ORDERED_COMPRESSED);
+ ret = btrfs_add_ordered_extent_compress(inode,
+ async_extent->start,
+ ins.objectid,
+ async_extent->ram_size,
+ ins.offset,
+ BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
BUG_ON(ret);
/*
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- int compressed = 0;
+ int compress_type = 0;
int ret;
bool nolock = false;
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
- compressed = 1;
+ compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
- BUG_ON(compressed);
+ BUG_ON(compress_type);
ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
ordered_extent->disk_len,
ordered_extent->len,
ordered_extent->len,
- compressed, 0, 0,
+ compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags,
+ em->compress_type);
}
failrec->logical = logical;
free_extent_map(em);
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
int err;
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
err = inode_change_ok(inode, attr);
if (err)
return err;
p = &root->inode_tree.rb_node;
parent = NULL;
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
return;
spin_lock(&root->inode_lock);
int index;
int ret;
- dentry->d_op = &btrfs_dentry_operations;
-
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
return inode;
}
-static int btrfs_dentry_delete(struct dentry *dentry)
+static int btrfs_dentry_delete(const struct dentry *dentry)
{
struct btrfs_root *root;
}
btrfs_set_trans_block_group(trans, dir);
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
size_t max_size;
unsigned long inline_size;
unsigned long ptr;
+ int compress_type;
WARN_ON(pg_offset != 0);
+ compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf,
btrfs_item_nr(leaf, path->slots[0]));
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
- ret = btrfs_zlib_decompress(tmp, page, extent_offset,
- inline_size, max_size);
+ ret = btrfs_decompress(compress_type, tmp, page,
+ extent_offset, inline_size, max_size);
if (ret) {
char *kaddr = kmap_atomic(page, KM_USER0);
unsigned long copy_size = min_t(u64,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
- int compressed;
+ int compress_type;
again:
read_lock(&em_tree->lock);
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- compressed = btrfs_file_extent_compression(leaf, item);
+ compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
em->block_start = EXTENT_MAP_HOLE;
goto insert;
}
- if (compressed) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
em->block_start = bytenr;
em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
item);
em->len = (copy_size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
em->orig_start = EXTENT_MAP_INLINE;
- if (compressed)
+ if (compress_type) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
- if (btrfs_file_extent_compression(leaf, item) ==
- BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_file_extent_compression(leaf, item) !=
+ BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
ei->ordered_data_close = 0;
ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0;
- ei->force_compress = 0;
+ ei->force_compress = BTRFS_COMPRESS_NONE;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
return inode;
}
+static void btrfs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
void btrfs_destroy_inode(struct inode *inode)
{
struct btrfs_ordered_extent *ordered;
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
- kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+ call_rcu(&inode->i_rcu, btrfs_i_callback);
}
int btrfs_drop_inode(struct inode *inode)
min_size, actual_len, alloc_hint, trans);
}
-static long btrfs_fallocate(struct inode *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct extent_state *cached_state = NULL;
- u64 cur_offset;
- u64 last_byte;
- u64 alloc_start;
- u64 alloc_end;
- u64 alloc_hint = 0;
- u64 locked_end;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
- struct extent_map *em;
- int ret;
-
- alloc_start = offset & ~mask;
- alloc_end = (offset + len + mask) & ~mask;
-
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
- mutex_lock(&inode->i_mutex);
- ret = inode_newsize_ok(inode, alloc_end);
- if (ret)
- goto out;
-
- if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, alloc_start);
- if (ret)
- goto out;
- }
-
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
- if (ret)
- goto out;
-
- locked_end = alloc_end - 1;
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- /* the extent lock is ordered inside the running
- * transaction
- */
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state, GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- alloc_end - 1);
- if (ordered &&
- ordered->file_offset + ordered->len > alloc_start &&
- ordered->file_offset < alloc_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end,
- &cached_state, GFP_NOFS);
- /*
- * we can't wait on the range with the transaction
- * running or with the extent lock held
- */
- btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
- } else {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- }
-
- cur_offset = alloc_start;
- while (1) {
- em = btrfs_get_extent(inode, NULL, 0, cur_offset,
- alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR(em) || !em);
- last_byte = min(extent_map_end(em), alloc_end);
- last_byte = (last_byte + mask) & ~mask;
- if (em->block_start == EXTENT_MAP_HOLE ||
- (cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- if (ret < 0) {
- free_extent_map(em);
- break;
- }
- }
- free_extent_map(em);
-
- cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
- break;
- }
- }
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state, GFP_NOFS);
-
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
-out:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static int btrfs_set_page_dirty(struct page *page)
{
return __set_page_dirty_nobuffers(page);
}
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
+ return -EROFS;
if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
return -EACCES;
- return generic_permission(inode, mask, btrfs_check_acl);
+ return generic_permission(inode, mask, flags, btrfs_check_acl);
}
static const struct inode_operations btrfs_dir_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
- .fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
static const struct inode_operations btrfs_special_inode_operations = {
static const struct super_operations btrfs_super_ops;
+ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
+ char nbuf[16])
+ {
+ char *errstr = NULL;
+
+ switch (errno) {
+ case -EIO:
+ errstr = "IO failure";
+ break;
+ case -ENOMEM:
+ errstr = "Out of memory";
+ break;
+ case -EROFS:
+ errstr = "Readonly filesystem";
+ break;
+ default:
+ if (nbuf) {
+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+ errstr = nbuf;
+ }
+ break;
+ }
+
+ return errstr;
+ }
+
+ static void __save_error_info(struct btrfs_fs_info *fs_info)
+ {
+ /*
+ * today we only save the error info into ram. Long term we'll
+ * also send it down to the disk
+ */
+ fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+ }
+
+ /*
+  * NOTE: we defer the write_super work to umount time in order to
+  * avoid a deadlock, since umount holds all the locks.
+  */
+ static void save_error_info(struct btrfs_fs_info *fs_info)
+ {
+ __save_error_info(fs_info);
+ }
+
+ /* btrfs handle error by forcing the filesystem readonly */
+ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+ {
+ struct super_block *sb = fs_info->sb;
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ sb->s_flags |= MS_RDONLY;
+ printk(KERN_INFO "btrfs is forced readonly\n");
+ }
+ }
+
+ /*
+ * __btrfs_std_error decodes expected errors from the caller and
+  * invokes the appropriate error response.
+ */
+ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno)
+ {
+ struct super_block *sb = fs_info->sb;
+ char nbuf[16];
+ const char *errstr;
+
+ /*
+ * Special case: if the error is EROFS, and we're already
+ * under MS_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+ return;
+
+ errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ save_error_info(fs_info);
+
+ btrfs_handle_error(fs_info);
+ }
+
static void btrfs_put_super(struct super_block *sb)
{
struct btrfs_root *root = btrfs_sb(sb);
Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
- Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
- Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
- Opt_user_subvol_rm_allowed,
+ Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+ Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+ Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
};
static match_table_t tokens = {
{Opt_alloc_start, "alloc_start=%s"},
{Opt_thread_pool, "thread_pool=%d"},
{Opt_compress, "compress"},
+ {Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
+ {Opt_compress_force_type, "compress-force=%s"},
{Opt_ssd, "ssd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd, "nossd"},
char *p, *num, *orig;
int intarg;
int ret = 0;
+ char *compress_type;
+ bool compress_force = false;
if (!options)
return 0;
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
- case Opt_compress:
- printk(KERN_INFO "btrfs: use compression\n");
- btrfs_set_opt(info->mount_opt, COMPRESS);
- break;
case Opt_compress_force:
- printk(KERN_INFO "btrfs: forcing compression\n");
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ case Opt_compress_force_type:
+ compress_force = true;
+ case Opt_compress:
+ case Opt_compress_type:
+ if (token == Opt_compress ||
+ token == Opt_compress_force ||
+ strcmp(args[0].from, "zlib") == 0) {
+ compress_type = "zlib";
+ info->compress_type = BTRFS_COMPRESS_ZLIB;
+ } else if (strcmp(args[0].from, "lzo") == 0) {
+ compress_type = "lzo";
+ info->compress_type = BTRFS_COMPRESS_LZO;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
btrfs_set_opt(info->mount_opt, COMPRESS);
+ if (compress_force) {
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ pr_info("btrfs: force %s compression\n",
+ compress_type);
+ } else
+ pr_info("btrfs: use %s compression\n",
+ compress_type);
break;
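For example (illustrative command lines, not taken from this changelog): mounting with -o compress=lzo selects LZO, plain -o compress keeps the zlib default, and -o compress-force=zlib additionally sets FORCE_COMPRESS; any other type string makes the mount fail with -EINVAL.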
case Opt_ssd:
printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
+ sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
* Note: This is based on get_sb_bdev from fs/super.c with a few additions
* for multiple device setup. Make sure to keep it in sync.
*/
-static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
&subvol_name, &subvol_objectid,
&fs_devices);
if (error)
- return error;
+ return ERR_PTR(error);
error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
if (error)
root = new_root;
}
- mnt->mnt_sb = s;
- mnt->mnt_root = root;
-
kfree(subvol_name);
- return 0;
+ return root;
error_s:
error = PTR_ERR(s);
kfree(tree_root);
error_free_subvol_name:
kfree(subvol_name);
- return error;
+ return ERR_PTR(error);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+ /*
+ * Helper to calculate the free space on the devices that can be used to
+ * store file data.
+ */
+ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_device_info *devices_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 skip_space;
+ u64 type;
+ u64 avail_space;
+ u64 used_space;
+ u64 min_stripe_size;
+ int min_stripes = 1;
+ int i = 0, nr_devices;
+ int ret;
+
+ nr_devices = fs_info->fs_devices->rw_devices;
+ BUG_ON(!nr_devices);
+
+ devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+	/* calc the min stripe number for data space allocation */
+ type = btrfs_get_alloc_profile(root, 1);
+ if (type & BTRFS_BLOCK_GROUP_RAID0)
+ min_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ min_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ min_stripes = 4;
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+ else
+ min_stripe_size = BTRFS_STRIPE_LEN;
+
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ if (!device->in_fs_metadata)
+ continue;
+
+ avail_space = device->total_bytes - device->bytes_used;
+
+ /* align with stripe_len */
+ do_div(avail_space, BTRFS_STRIPE_LEN);
+ avail_space *= BTRFS_STRIPE_LEN;
+
+ /*
+	 * In order to avoid overwriting the superblock on the drive,
+ * btrfs starts at an offset of at least 1MB when doing chunk
+ * allocation.
+ */
+ skip_space = 1024 * 1024;
+
+ /* user can set the offset in fs_info->alloc_start. */
+ if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+ device->total_bytes)
+ skip_space = max(fs_info->alloc_start, skip_space);
+
+ /*
+ * btrfs can not use the free space in [0, skip_space - 1],
+ * we must subtract it from the total. In order to implement
+ * it, we account the used space in this range first.
+ */
+ ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
+ &used_space);
+ if (ret) {
+ kfree(devices_info);
+ return ret;
+ }
+
+ /* calc the free space in [0, skip_space - 1] */
+ skip_space -= used_space;
+
+ /*
+ * we can use the free space in [0, skip_space - 1], subtract
+ * it from the total.
+ */
+ if (avail_space && avail_space >= skip_space)
+ avail_space -= skip_space;
+ else
+ avail_space = 0;
+
+ if (avail_space < min_stripe_size)
+ continue;
+
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail_space;
+
+ i++;
+ }
+
+ nr_devices = i;
+
+ btrfs_descending_sort_devices(devices_info, nr_devices);
+
+ i = nr_devices - 1;
+ avail_space = 0;
+ while (nr_devices >= min_stripes) {
+ if (devices_info[i].max_avail >= min_stripe_size) {
+ int j;
+ u64 alloc_size;
+
+ avail_space += devices_info[i].max_avail * min_stripes;
+ alloc_size = devices_info[i].max_avail;
+ for (j = i + 1 - min_stripes; j <= i; j++)
+ devices_info[j].max_avail -= alloc_size;
+ }
+ i--;
+ nr_devices--;
+ }
+
+ kfree(devices_info);
+ *free_bytes = avail_space;
+ return 0;
+ }
+
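A worked example of the tail loop above, assuming RAID1 (min_stripes = 2) and two devices with 10GiB and 6GiB of usable space after alignment and skip_space: the array is sorted descending and the loop starts from the smallest entry. The 6GiB entry is paired with the larger device, avail_space grows by 6GiB * 2 = 12GiB of raw space, and 6GiB is subtracted from both entries; the 4GiB left on the larger device cannot form another RAID1 pair and is not counted.
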
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
- u64 total_used_data = 0;
+ u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
+ int ret;
+	/* hold the chunk_mutex to avoid allocating new chunks */
+ mutex_lock(&root->fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
- if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
- BTRFS_BLOCK_GROUP_SYSTEM))
- total_used_data += found->disk_total;
- else
- total_used_data += found->disk_used;
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ total_free_data += found->disk_total - found->disk_used;
+ total_free_data -=
+ btrfs_account_ro_block_groups_free_space(found);
+ }
+
total_used += found->disk_used;
}
rcu_read_unlock();
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_bavail = total_free_data;
+ ret = btrfs_calc_avail_data_space(root, &total_free_data);
+ if (ret) {
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ return ret;
+ }
+ buf->f_bavail += total_free_data;
+ buf->f_bavail = buf->f_bavail >> bits;
+ mutex_unlock(&root->fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
- .get_sb = btrfs_get_sb,
+ .mount = btrfs_mount,
.kill_sb = kill_anon_super,
.fs_flags = FS_REQUIRES_DEV,
};
.unlocked_ioctl = btrfs_control_ioctl,
.compat_ioctl = btrfs_control_ioctl,
.owner = THIS_MODULE,
+ .llseek = noop_llseek,
};
static struct miscdevice btrfs_misc = {
if (err)
return err;
- err = btrfs_init_cachep();
+ err = btrfs_init_compress();
if (err)
goto free_sysfs;
+ err = btrfs_init_cachep();
+ if (err)
+ goto free_compress;
+
err = extent_io_init();
if (err)
goto free_cachep;
extent_io_exit();
free_cachep:
btrfs_destroy_cachep();
+ free_compress:
+ btrfs_exit_compress();
free_sysfs:
btrfs_exit_sysfs();
return err;
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
- btrfs_zlib_exit();
+ btrfs_exit_compress();
}
module_init(init_btrfs_fs)
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
+ #include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
device->work.func = pending_bios_fn;
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
- device->barriers = 1;
spin_lock_init(&device->io_lock);
device->name = kstrdup(path, GFP_NOFS);
if (!device->name) {
device->devid = orig_dev->devid;
device->work.func = pending_bios_fn;
memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
- device->barriers = 1;
spin_lock_init(&device->io_lock);
INIT_LIST_HEAD(&device->dev_list);
INIT_LIST_HEAD(&device->dev_alloc_list);
continue;
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
fs_devices->open_devices--;
}
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
fs_devices->open_devices--;
}
if (device->writeable) {
int seeding = 1;
int ret = 0;
+ flags |= FMODE_EXCL;
+
list_for_each_entry(device, head, dev_list) {
if (device->bdev)
continue;
if (!device->name)
continue;
- bdev = open_bdev_exclusive(device->name, flags, holder);
+ bdev = blkdev_get_by_path(device->name, flags, holder);
if (IS_ERR(bdev)) {
printk(KERN_INFO "open %s failed\n", device->name);
goto error;
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
- if (!bh)
+ if (!bh) {
+ ret = -EINVAL;
goto error_close;
+ }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
error_brelse:
brelse(bh);
error_close:
- close_bdev_exclusive(bdev, FMODE_READ);
+ blkdev_put(bdev, flags);
error:
continue;
}
mutex_lock(&uuid_mutex);
- bdev = open_bdev_exclusive(path, flags, holder);
+ flags |= FMODE_EXCL;
+ bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto error_close;
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
brelse(bh);
error_close:
- close_bdev_exclusive(bdev, flags);
+ blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
return ret;
}
+ /* helper to account the used device space in the range */
+ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length)
+ {
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_path *path;
+ u64 extent_end;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+
+ *length = 0;
+
+ if (start >= device->total_bytes)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto out;
+ }
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ break;
+
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (key.offset <= start && extent_end > end) {
+ *length = end - start + 1;
+ break;
+ } else if (key.offset <= start && extent_end > start)
+ *length += extent_end - start;
+ else if (key.offset > start && extent_end <= end)
+ *length += extent_end - key.offset;
+ else if (key.offset > start && key.offset <= end) {
+ *length += end - key.offset + 1;
+ break;
+ } else if (key.offset > end)
+ break;
+
+ next:
+ path->slots[0]++;
+ }
+ ret = 0;
+ out:
+ btrfs_free_path(path);
+ return ret;
+ }
+
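A quick check of the overlap cases above: accounting the range [0, 1MiB - 1] on a device whose only dev extent starts at key.offset = 512KiB with length 1MiB gives extent_end = 1.5MiB, so the "key.offset > start && key.offset <= end" branch fires and adds end - key.offset + 1 = 512KiB to *length.
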
/*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans: transaction handler
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @start: store the start of the free space.
+ * @len:	the size of the free space that we find, or the size of the max
+ * 		free space if we don't find suitable free space
+ *
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
+ *
+ * @start is used to store the start of the free space if we find any. But if
+ * we don't find suitable free space, it will be used to store the start
+ * position of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
*/
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *max_avail)
+ u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
- struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
- u64 hole_size = 0;
- u64 last_byte = 0;
- u64 search_start = 0;
+ u64 hole_size;
+ u64 max_hole_start;
+ u64 max_hole_size;
+ u64 extent_end;
+ u64 search_start;
u64 search_end = device->total_bytes;
int ret;
- int slot = 0;
- int start_found;
+ int slot;
struct extent_buffer *l;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = 2;
- start_found = 0;
-
/* FIXME use last free of some kind */
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
- search_start = max((u64)1024 * 1024, search_start);
+ search_start = 1024 * 1024;
- if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+ if (root->fs_info->alloc_start + num_bytes <= search_end)
search_start = max(root->fs_info->alloc_start, search_start);
+ max_hole_start = search_start;
+ max_hole_size = 0;
+
+ if (search_start >= search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ path->reada = 2;
+
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
if (ret < 0)
- goto error;
+ goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
- goto error;
- if (ret > 0)
- start_found = 1;
+ goto out;
}
- l = path->nodes[0];
- btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (ret == 0)
continue;
if (ret < 0)
- goto error;
- no_more_items:
- if (!start_found) {
- if (search_start >= search_end) {
- ret = -ENOSPC;
- goto error;
- }
- *start = search_start;
- start_found = 1;
- goto check_pending;
- }
- *start = last_byte > search_start ?
- last_byte : search_start;
- if (search_end <= *start) {
- ret = -ENOSPC;
- goto error;
- }
- goto check_pending;
+ goto out;
+
+ break;
}
btrfs_item_key_to_cpu(l, &key, slot);
goto next;
if (key.objectid > device->devid)
- goto no_more_items;
+ break;
- if (key.offset >= search_start && key.offset > last_byte &&
- start_found) {
- if (last_byte < search_start)
- last_byte = search_start;
- hole_size = key.offset - last_byte;
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
- if (hole_size > *max_avail)
- *max_avail = hole_size;
+ if (key.offset > search_start) {
+ hole_size = key.offset - search_start;
+
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
- if (key.offset > last_byte &&
- hole_size >= num_bytes) {
- *start = last_byte;
- goto check_pending;
+ /*
+ * If this free space is greater than what we need, it
+ * must be the max free space that we have found so
+ * far, so max_hole_start points to the start of this
+ * free space and max_hole_size holds its length.
+ * Thus, we can return max_hole_start and
+ * max_hole_size to the caller right away.
+ */
+ if (hole_size >= num_bytes) {
+ ret = 0;
+ goto out;
}
}
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
- goto next;
- start_found = 1;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
- last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (extent_end > search_start)
+ search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
- check_pending:
- /* we have to make sure we didn't find an extent that has already
- * been allocated by the map tree or the original allocation
- */
- BUG_ON(*start < search_start);
- if (*start + num_bytes > search_end) {
- ret = -ENOSPC;
- goto error;
+ hole_size = search_end - search_start;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
}
- /* check for pending inserts here */
- ret = 0;
- error:
+ /* See above. */
+ if (hole_size < num_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+
+ out:
btrfs_free_path(path);
+ error:
+ *start = max_hole_start;
+ if (len)
+ *len = max_hole_size;
return ret;
}
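
The rewrite also changes the function's contract, as the comment above documents: even on -ENOSPC, *start and *len describe the largest hole that was found, so a caller can fall back to a smaller allocation. A sketch of such a caller (hypothetical usage, not taken from the patch):

u64 start, len;
int ret;

ret = find_free_dev_extent(trans, device, num_bytes, &start, &len);
if (ret == 0) {
        /* a hole of at least num_bytes begins at start */
} else if (ret == -ENOSPC && len > 0) {
        /* nothing big enough; the largest hole is [start, start + len) */
} else {
        /* hard error (e.g. -ENOMEM), propagate */
}

This is exactly the property __btrfs_alloc_tiny_space() below relies on when chunk allocation at the default size fails.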
goto out;
}
} else {
- bdev = open_bdev_exclusive(device_path, FMODE_READ,
- root->fs_info->bdev_holder);
+ bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
+ root->fs_info->bdev_holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto out;
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
device->fs_devices->open_devices--;
}
brelse(bh);
error_close:
if (bdev)
- close_bdev_exclusive(bdev, FMODE_READ);
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
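
These hunks follow the 2.6.38 block-layer API change: open_bdev_exclusive()/close_bdev_exclusive() are gone, and an exclusive open is now requested explicitly by adding FMODE_EXCL to the mode. The pattern, with the mode mirrored at release time (a sketch using the names from the surrounding code):

bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                          root->fs_info->bdev_holder);
if (IS_ERR(bdev))
        return PTR_ERR(bdev);
/* ... read the super block, etc. ... */
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);

Note that the device-removal path releases with whatever mode the device was originally opened with: blkdev_put(device->bdev, device->mode).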
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EINVAL;
- bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+ bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+ root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
- device->barriers = 1;
device->writeable = 1;
device->work.func = pending_bios_fn;
generate_random_uuid(device->uuid);
mutex_unlock(&root->fs_info->volume_mutex);
return ret;
error:
- close_bdev_exclusive(bdev, 0);
+ blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
mutex_lock(&dev_root->fs_info->volume_mutex);
dev_root = dev_root->fs_info->dev_root;
return calc_size * num_stripes;
}
- static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes, u64 *stripe_size,
- u64 start, u64 type)
+ /* Used to sort the devices by max_avail (descending sort) */
+ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct list_head *cur;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct list_head private_devs;
- int min_stripe_size = 1 * 1024 * 1024;
- u64 calc_size = 1024 * 1024 * 1024;
- u64 max_chunk_size = calc_size;
- u64 min_free;
- u64 avail;
- u64 max_avail = 0;
- u64 dev_offset;
- int num_stripes = 1;
- int min_stripes = 1;
- int sub_stripes = 0;
- int looped = 0;
- int ret;
- int index;
- int stripe_len = 64 * 1024;
+ if (((struct btrfs_device_info *)dev_info1)->max_avail >
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return -1;
+ else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return 1;
+ else
+ return 0;
+ }
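
Returning -1 when the first element is larger inverts the usual comparator convention, which is what makes sort() produce descending order. A self-contained userspace sketch with qsort(), using a stand-in for struct btrfs_device_info:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dev_info { uint64_t max_avail; };

/* -1 for the larger element => descending sort */
static int cmp_desc(const void *a, const void *b)
{
        uint64_t x = ((const struct dev_info *)a)->max_avail;
        uint64_t y = ((const struct dev_info *)b)->max_avail;

        return x > y ? -1 : (x < y ? 1 : 0);
}

int main(void)
{
        struct dev_info d[] = { {10}, {30}, {20} };

        qsort(d, 3, sizeof(d[0]), cmp_desc);
        printf("%llu %llu %llu\n",      /* prints: 30 20 10 */
               (unsigned long long)d[0].max_avail,
               (unsigned long long)d[1].max_avail,
               (unsigned long long)d[2].max_avail);
        return 0;
}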
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
- }
- if (list_empty(&fs_devices->alloc_list))
- return -ENOSPC;
+ static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+ int *num_stripes, int *min_stripes,
+ int *sub_stripes)
+ {
+ *num_stripes = 1;
+ *min_stripes = 1;
+ *sub_stripes = 0;
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- num_stripes = fs_devices->rw_devices;
- min_stripes = 2;
+ *num_stripes = fs_devices->rw_devices;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
if (fs_devices->rw_devices < 2)
return -ENOSPC;
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes = fs_devices->rw_devices;
- if (num_stripes < 4)
+ *num_stripes = fs_devices->rw_devices;
+ if (*num_stripes < 4)
return -ENOSPC;
- num_stripes &= ~(u32)1;
- sub_stripes = 2;
- min_stripes = 4;
+ *num_stripes &= ~(u32)1;
+ *sub_stripes = 2;
+ *min_stripes = 4;
}
+ return 0;
+ }
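
A worked example of the RAID10 branch (assuming fs_devices->rw_devices == 5): num_stripes starts at 5, passes the >= 4 check, and is rounded down to an even 4 by the &= ~(u32)1, giving two mirrored pairs striped together:

int num_stripes, min_stripes, sub_stripes;
int ret;

/* hypothetical fs_devices with rw_devices == 5 */
ret = __btrfs_calc_nstripes(fs_devices, BTRFS_BLOCK_GROUP_RAID10,
                            &num_stripes, &min_stripes, &sub_stripes);
/* ret == 0, num_stripes == 4, min_stripes == 4, sub_stripes == 2 */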
+
+ static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+ u64 proposed_size, u64 type,
+ int num_stripes, int small_stripe)
+ {
+ int min_stripe_size = 1 * 1024 * 1024;
+ u64 calc_size = proposed_size;
+ u64 max_chunk_size = calc_size;
+ int ncopies = 1;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID10))
+ ncopies = 2;
+
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_chunk_size = 10 * calc_size;
min_stripe_size = 64 * 1024 * 1024;
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
- again:
- max_avail = 0;
- if (!map || map->num_stripes != num_stripes) {
- kfree(map);
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map)
- return -ENOMEM;
- map->num_stripes = num_stripes;
- }
-
- if (calc_size * num_stripes > max_chunk_size) {
- calc_size = max_chunk_size;
+ if (calc_size * num_stripes > max_chunk_size * ncopies) {
+ calc_size = max_chunk_size * ncopies;
do_div(calc_size, num_stripes);
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
}
/* we don't want tiny stripes */
- if (!looped)
+ if (!small_stripe)
calc_size = max_t(u64, min_stripe_size, calc_size);
/*
- * we're about to do_div by the stripe_len so lets make sure
+ * we're about to do_div by BTRFS_STRIPE_LEN so let's make sure
* we end up with something bigger than a stripe
*/
- calc_size = max_t(u64, calc_size, stripe_len * 4);
+ calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
+
+ return calc_size;
+ }
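
To see why the ncopies factor matters, consider a RAID1 data chunk with the default 1GiB proposed_size on a filesystem whose total_rw_bytes is 5GiB (illustrative numbers, not from the patch):

/*
 * ncopies        = 2 (RAID1)
 * max_chunk_size = min(10 * 1GiB, div_factor(5GiB, 1)) = 512MiB
 * calc_size * num_stripes (2GiB) > max_chunk_size * ncopies (1GiB),
 * so calc_size = 1GiB / num_stripes = 512MiB, which is already a
 * multiple of BTRFS_STRIPE_LEN (64KiB): the stripe size is 512MiB.
 */

The old code compared against max_chunk_size alone, so mirrored profiles ended up with chunks half the intended size; that is the bug the "fix wrong calculation of stripe size" commit addresses.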
+
+ static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
+ int num_stripes)
+ {
+ struct map_lookup *new;
+ size_t len = map_lookup_size(num_stripes);
+
+ BUG_ON(map->num_stripes < num_stripes);
+
+ if (map->num_stripes == num_stripes)
+ return map;
+
+ new = kmalloc(len, GFP_NOFS);
+ if (!new) {
+ /* just change map->num_stripes */
+ map->num_stripes = num_stripes;
+ return map;
+ }
+
+ memcpy(new, map, len);
+ new->num_stripes = num_stripes;
+ kfree(map);
+ return new;
+ }
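
Note the deliberate fallback above: if kmalloc() fails, the old, larger buffer is reused and only its num_stripes count is lowered, so shrinking can never itself fail; the map merely stays bigger than strictly necessary.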
+
+ /*
+ * helper to allocate device space from btrfs_device_info, in which we have
+ * stored the max free space information of every device. It is used when we
+ * can not allocate chunks of the default size.
+ *
+ * With this helper, we can allocate a new chunk that is as large as possible.
+ */
+ static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device_info *devices,
+ int nr_device, u64 type,
+ struct map_lookup **map_lookup,
+ int min_stripes, u64 *stripe_size)
+ {
+ int i, index, sort_again = 0;
+ int min_devices = min_stripes;
+ u64 max_avail, min_free;
+ struct map_lookup *map = *map_lookup;
+ int ret;
+
+ if (nr_device < min_stripes)
+ return -ENOSPC;
+
+ btrfs_descending_sort_devices(devices, nr_device);
+
+ max_avail = devices[0].max_avail;
+ if (!max_avail)
+ return -ENOSPC;
+
+ for (i = 0; i < nr_device; i++) {
+ /*
+ * if dev_offset == 0, it means the free space of this
+ * device is less than what we need, and we haven't yet
+ * searched for the max avail extent on it, so do it now.
+ */
+ if (!devices[i].dev_offset) {
+ ret = find_free_dev_extent(trans, devices[i].dev,
+ max_avail,
+ &devices[i].dev_offset,
+ &devices[i].max_avail);
+ if (ret != 0 && ret != -ENOSPC)
+ return ret;
+ sort_again = 1;
+ }
+ }
+
+ /* we updated the max avail free extent of each device, so sort again */
+ if (sort_again)
+ btrfs_descending_sort_devices(devices, nr_device);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_devices = 1;
+
+ if (!devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ max_avail = devices[min_devices - 1].max_avail;
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ do_div(max_avail, 2);
+
+ max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+ min_stripes, 1);
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_free = max_avail * 2;
+ else
+ min_free = max_avail;
+
+ if (min_free > devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ map = __shrink_map_lookup_stripes(map, min_stripes);
+ *stripe_size = max_avail;
+
+ index = 0;
+ for (i = 0; i < min_stripes; i++) {
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset;
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
+ i++;
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset +
+ max_avail;
+ }
+ index++;
+ }
+ *map_lookup = map;
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ return 0;
+ }
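
A worked example of the DUP handling in this helper (assuming a single device whose largest hole is 1GiB at offset X): min_devices drops to 1, max_avail is halved to 512MiB so that both copies fit in the one hole, __btrfs_calc_stripe_size() rounds that to a BTRFS_STRIPE_LEN multiple, and the two stripes land on the same device:

/*
 * map->stripes[0].physical = X
 * map->stripes[1].physical = X + max_avail    (the second DUP copy)
 * min_free = max_avail * 2, which must still fit in the 1GiB hole
 */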
+
+ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup **map_ret,
+ u64 *num_bytes, u64 *stripe_size,
+ u64 start, u64 type)
+ {
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_device *device = NULL;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_device_info *devices_info;
+ struct list_head private_devs;
+ u64 calc_size = 1024 * 1024 * 1024;
+ u64 min_free;
+ u64 avail;
+ u64 dev_offset;
+ int num_stripes;
+ int min_stripes;
+ int sub_stripes;
+ int min_devices; /* the min number of devices we need */
+ int i;
+ int ret;
+ int index;
+
+ if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+ (type & BTRFS_BLOCK_GROUP_DUP)) {
+ WARN_ON(1);
+ type &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+ &min_stripes, &sub_stripes);
+ if (ret)
+ return ret;
+
+ devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ map->num_stripes = num_stripes;
cur = fs_devices->alloc_list.next;
index = 0;
+ i = 0;
- if (type & BTRFS_BLOCK_GROUP_DUP)
+ calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+ num_stripes, 0);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
min_free = calc_size * 2;
- else
+ min_devices = 1;
+ } else {
min_free = calc_size;
-
- /*
- * we add 1MB because we never use the first 1MB of the device, unless
- * we've looped, then we are likely allocating the maximum amount of
- * space left already
- */
- if (!looped)
- min_free += 1024 * 1024;
+ min_devices = min_stripes;
+ }
INIT_LIST_HEAD(&private_devs);
while (index < num_stripes) {
cur = cur->next;
if (device->in_fs_metadata && avail >= min_free) {
- ret = find_free_dev_extent(trans, device,
- min_free, &dev_offset,
- &max_avail);
+ ret = find_free_dev_extent(trans, device, min_free,
+ &devices_info[i].dev_offset,
+ &devices_info[i].max_avail);
if (ret == 0) {
list_move_tail(&device->dev_alloc_list,
&private_devs);
map->stripes[index].dev = device;
- map->stripes[index].physical = dev_offset;
+ map->stripes[index].physical =
+ devices_info[i].dev_offset;
index++;
if (type & BTRFS_BLOCK_GROUP_DUP) {
map->stripes[index].dev = device;
map->stripes[index].physical =
- dev_offset + calc_size;
+ devices_info[i].dev_offset +
+ calc_size;
index++;
}
- }
- } else if (device->in_fs_metadata && avail > max_avail)
- max_avail = avail;
+ } else if (ret != -ENOSPC)
+ goto error;
+
+ devices_info[i].dev = device;
+ i++;
+ } else if (device->in_fs_metadata &&
+ avail >= BTRFS_STRIPE_LEN) {
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail;
+ i++;
+ }
+
if (cur == &fs_devices->alloc_list)
break;
}
+
list_splice(&private_devs, &fs_devices->alloc_list);
if (index < num_stripes) {
if (index >= min_stripes) {
num_stripes /= sub_stripes;
num_stripes *= sub_stripes;
}
- looped = 1;
- goto again;
- }
- if (!looped && max_avail > 0) {
- looped = 1;
- calc_size = max_avail;
- goto again;
+
+ map = __shrink_map_lookup_stripes(map, num_stripes);
+ } else if (i >= min_devices) {
+ ret = __btrfs_alloc_tiny_space(trans, fs_devices,
+ devices_info, i, type,
+ &map, min_stripes,
+ &calc_size);
+ if (ret)
+ goto error;
+ } else {
+ ret = -ENOSPC;
+ goto error;
}
- kfree(map);
- return -ENOSPC;
}
map->sector_size = extent_root->sectorsize;
- map->stripe_len = stripe_len;
- map->io_align = stripe_len;
- map->io_width = stripe_len;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
- map->num_stripes = num_stripes;
map->sub_stripes = sub_stripes;
*map_ret = map;
*stripe_size = calc_size;
*num_bytes = chunk_bytes_by_type(type, calc_size,
- num_stripes, sub_stripes);
+ map->num_stripes, sub_stripes);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
- kfree(map);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
em->bdev = (struct block_device *)map;
em->start = start;
index++;
}
+ kfree(devices_info);
return 0;
+
+ error:
+ kfree(map);
+ kfree(devices_info);
+ return ret;
}
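
Taken together, the restructuring replaces the old looped/goto-again retry with a straight pipeline of the helpers above (a summary, not code from the patch):

/*
 * 1. __btrfs_calc_nstripes()    - derive num/min/sub stripes from type
 * 2. __btrfs_calc_stripe_size() - clamp the default 1GiB stripe size
 * 3. walk fs_devices->alloc_list, recording each device's largest hole
 *    in devices_info[] and filling map->stripes[] where it fits
 * 4. on shortfall, __shrink_map_lookup_stripes() if enough stripes were
 *    placed, otherwise __btrfs_alloc_tiny_space() to build the largest
 *    chunk the recorded holes allow
 */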
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
return NULL;
list_add(&device->dev_list,
&fs_devices->devices);
- device->barriers = 1;
device->dev_root = root->fs_info->dev_root;
device->devid = devid;
device->work.func = pending_bios_fn;
#define __BTRFS_VOLUMES_
#include <linux/bio.h>
+ #include <linux/sort.h>
#include "async-thread.h"
+ #define BTRFS_STRIPE_LEN (64 * 1024)
+
struct buffer_head;
struct btrfs_pending_bios {
struct bio *head;
int running_pending;
u64 generation;
- int barriers;
int writeable;
int in_fs_metadata;
int missing;
struct block_device *bdev;
- /* the mode sent to open_bdev_exclusive */
+ /* the mode sent to blkdev_get */
fmode_t mode;
char *name;
struct btrfs_bio_stripe stripes[];
};
+ struct btrfs_device_info {
+ struct btrfs_device *dev;
+ u64 dev_offset;
+ u64 max_avail;
+ };
+
+ /* Used to sort the devices by max_avail (descending sort) */
+ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+
+ /*
+ * sort the devices by max_avail, in which the max free extent size of each
+ * device is stored (descending sort).
+ */
+ static inline void btrfs_descending_sort_devices(
+ struct btrfs_device_info *devices,
+ size_t nr_devices)
+ {
+ sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_free_bytes, NULL);
+ }
+
+ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length);
+
#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))