// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add(spinlock_t *lock,
					struct list_head *new,
					struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(lock, flags);
}

static inline void btrfs_leak_debug_del(spinlock_t *lock,
					struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(lock, flags);
}

void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	unsigned long flags;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	while (!list_empty(&fs_info->allocated_ebs)) {
		eb = list_first_entry(&fs_info->allocated_ebs,
				      struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(lock, new, head)	do {} while (0)
#define btrfs_leak_debug_del(lock, entry)	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				       unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	if (tree->ops)
		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
						 mirror_num, bio_flags);
	else
		btrfsic_submit_bio(bio);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	if (epd->bio) {
		epd->bio->bi_status = errno_to_blk_status(ret);
		bio_endio(epd->bio);
		epd->bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;

	if (epd->bio) {
		ret = submit_one_bio(epd->bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio = NULL;
	}
	return ret;
}

int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because our io_tree we hold
 * the tree lock and get the inode lock when setting delalloc. These two things
 * are unrelated, so make a class for the file_extent_tree so we don't get the
 * two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}

void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * __etree_search - search @tree for an entry that contains @offset. Such
 * entry would have entry->start <= offset && entry->end >= offset.
 *
 * @tree - the tree to search
 * @offset - offset that should fall within an entry in @tree
 * @next_ret - pointer to the first entry whose range ends after @offset
 * @prev_ret - pointer to the first entry whose range begins before @offset
 * @p_ret - pointer where new node should be anchored (used when inserting an
 *	    entry in the tree)
 * @parent_ret - points to entry which would have been the parent of the entry,
 *		 containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}
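
/*
 * Illustration only (not part of the original file): with states covering
 * [0, 4095] and [8192, 12287] in a tree, tree_search(tree, 4096) has no
 * containing entry and therefore returns the next entry, [8192, 12287], while
 * tree_search(tree, 9000) returns [8192, 12287] directly. Callers such as
 * __clear_extent_bit() rely on this "containing entry or next entry" behaviour
 * to find the first state that ends after the start of their range.
 */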

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree. Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}
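
/*
 * Illustration only (not part of the original file): if the tree holds
 * [0, 4095] and [4096, 8191] and both carry exactly EXTENT_DELALLOC, calling
 * merge_state() on either of them collapses the pair into a single [0, 8191]
 * state and frees the absorbed one. States carrying EXTENT_LOCKED or
 * EXTENT_BOUNDARY are deliberately left alone, as the check at the top of
 * merge_state() shows.
 */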

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	struct inode *inode = tree->private_data;

	btrfs_panic(btrfs_sb(inode->i_sb), err,
	"locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree. This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}
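
/*
 * Illustration only (not part of the original file): most callers go through
 * the clear_extent_bit() wrapper further down rather than calling this
 * directly, e.g. to drop tracking bits over one page of an inode's io tree:
 *
 *	struct extent_state *cached = NULL;
 *
 *	clear_extent_bit(&BTRFS_I(inode)->io_tree, start,
 *			 start + PAGE_SIZE - 1,
 *			 EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, &cached);
 *
 * The bit combination above is made up for the example; the wrapper simply
 * forwards to __clear_extent_bit() with GFP_NOFS and no changeset.
 */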

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, u64 * failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask, NULL);
}
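
/*
 * Illustration only (not part of the original file): a caller that wants to
 * mark a byte range uptodate in an inode's io tree, keeping a cached state to
 * speed up a follow-up operation on the same range, could do roughly:
 *
 *	struct extent_state *cached = NULL;
 *
 *	set_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
 *		       EXTENT_UPTODATE, NULL, &cached, GFP_NOFS);
 *	...
 *	free_extent_state(cached);
 *
 * Passing a NULL failed_start is only safe when no exclusive bits are
 * requested, which is the case for this wrapper (exclusive_bits == 0).
 */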


/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, unsigned clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}
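
/*
 * Illustration only (not part of the original file): one known user of this
 * helper is the transaction code writing out dirty btree extents, which moves
 * a range from one tracking bit to another along the lines of:
 *
 *	convert_extent_bit(tree, start, end, EXTENT_NEED_WAIT,
 *			   EXTENT_DIRTY, &cached_state);
 *
 * Each state in [start, end] gains EXTENT_NEED_WAIT and loses EXTENT_DIRTY
 * while the tree lock is held, so no individual state is ever observable with
 * neither bit set.
 */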

/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
				changeset);
}

int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits)
{
	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
				GFP_NOWAIT, NULL);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned bits, int wake, int delete,
		     struct extent_state **cached)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, GFP_NOFS, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}

/*
 * either insert or lock state struct between start and end.  Use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	int err;
	u64 failed_start;

	while (1) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       EXTENT_LOCKED, &failed_start,
				       cached_state, GFP_NOFS, NULL);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			       &failed_start, NULL, GFP_NOFS, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL);
		return 0;
	}
	return 1;
}
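
/*
 * Illustration only (not part of the original file): EXTENT_LOCKED is an
 * exclusive bit, so lock_extent_bits() either inserts a locked state or sleeps
 * in wait_extent_bit() until the conflicting lock goes away. A caller pairs it
 * with unlock_extent_cached() (declared in extent-io-tree.h), e.g.:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, &cached);
 *	... read or modify the range ...
 *	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end, &cached);
 *
 * try_lock_extent() is the non-blocking variant: it returns 1 on success and 0
 * if any part of the range was already locked, undoing any partial lock it
 * managed to take.
 */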

void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		__set_page_dirty_nobuffers(page);
		account_page_redirty(page);
		put_page(page);
		index++;
	}
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
			    u64 start, unsigned bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned. If found something, return 0.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}
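
/*
 * Illustration only (not part of the original file): walking every range with
 * a given bit set follows the usual "search, then advance past the result"
 * pattern, e.g. to visit all EXTENT_DIRTY ranges starting from offset 0:
 *
 *	u64 start = 0, found_start, found_end;
 *
 *	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
 *				      EXTENT_DIRTY, NULL)) {
 *		... process [found_start, found_end] ...
 *		start = found_end + 1;
 *	}
 *
 * The cached_state argument is an optimisation for callers that repeatedly
 * resume right after the previous result; passing NULL is always correct.
 */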

/**
 * find_contiguous_extent_bit: find a contiguous area of bits
 * @tree - io tree to check
 * @start - offset to start the search from
 * @start_ret - the first offset we found with the bits set
 * @end_ret - the final contiguous range of the bits that were set
 * @bits - bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again.  During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits.  We will search to the first bit we find, and
 * then walk down the tree until we find a non-contiguous area.  The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, unsigned bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

/**
 * find_first_clear_extent_bit - find the first range that has @bits not set.
 * This range could start before @start.
 *
 * @tree - the tree to search
 * @start - the offset at/after which the found extent should start
 * @start_ret - records the beginning of the range
 * @end_ret - records the end of the range (inclusive)
 * @bits - the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, unsigned bits)
{
	struct extent_state *state;
	struct rb_node *node, *prev = NULL, *next;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		node = __etree_search(tree, start, &next, &prev, NULL, NULL);
		if (!node && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!node && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			state = rb_entry(prev, struct extent_state, rb_node);
			*start_ret = state->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!node) {
			node = next;
		}
		/*
		 * At this point 'node' either contains 'start' or start is
		 * before 'node'
		 */
		state = rb_entry(node, struct extent_state, rb_node);

		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits sets--|
				 *    |
				 *    start
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as
				 * the beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 * or
			 *
			 * |---hole/unset--||--first node--|
			 * 0   |
			 *    start
			 */
			if (prev) {
				state = rb_entry(prev, struct extent_state,
						 rb_node);
				*start_ret = state->end + 1;
			} else {
				*start_ret = 0;
			}
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			*end_ret = state->start - 1;
			break;
		}

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
}
1743
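
As a rough illustration of the search semantics documented above (the returned range may begin before the requested offset, and the end is left at -1 past the last allocated range), here is a minimal userspace sketch over a sorted array of "set" ranges; the names model_range and find_first_clear are hypothetical stand-ins for the extent state tree, not btrfs code.

#include <stdio.h>
#include <stdint.h>

struct model_range {            /* stands in for an extent_state with bits set */
        uint64_t start, end;    /* inclusive, sorted, non-overlapping */
};

/* Return the first [*start_ret, *end_ret] at or after 'from' that is not
 * covered by any range; the result may begin before 'from', and *end_ret is
 * UINT64_MAX when the hole extends past the last range. */
static void find_first_clear(const struct model_range *r, int nr, uint64_t from,
                             uint64_t *start_ret, uint64_t *end_ret)
{
        uint64_t hole_start = 0;
        int i;

        for (i = 0; i < nr; i++) {
                if (from >= r[i].start && from <= r[i].end) {
                        /* 'from' sits inside a set range: skip past it */
                        hole_start = r[i].end + 1;
                        from = hole_start;
                        continue;
                }
                if (r[i].start > from) {
                        *start_ret = hole_start;
                        *end_ret = r[i].start - 1;
                        return;
                }
                hole_start = r[i].end + 1;
        }
        *start_ret = hole_start;
        *end_ret = UINT64_MAX;          /* caller trims to the device size */
}

int main(void)
{
        struct model_range set[] = { { 0, 4095 }, { 8192, 12287 } };
        uint64_t s, e;

        find_first_clear(set, 2, 4096, &s, &e);
        printf("[%llu, %llu]\n", (unsigned long long)s, (unsigned long long)e);
        /* prints [4096, 8191]: the hole between the two set ranges */
        return 0;
}
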
d352ac68
CM
1744/*
1745 * find a contiguous range of bytes in the file marked as delalloc, not
 1746 * more than 'max_bytes'. start and end are used to return the range.
1747 *
3522e903 1748 * true is returned if we find something, false if nothing was in the tree
d352ac68 1749 */
083e75e7
JB
1750bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1751 u64 *end, u64 max_bytes,
1752 struct extent_state **cached_state)
d1310b2e
CM
1753{
1754 struct rb_node *node;
1755 struct extent_state *state;
1756 u64 cur_start = *start;
3522e903 1757 bool found = false;
d1310b2e
CM
1758 u64 total_bytes = 0;
1759
cad321ad 1760 spin_lock(&tree->lock);
c8b97818 1761
d1310b2e
CM
1762 /*
1763 * this search will find all the extents that end after
1764 * our range starts.
1765 */
80ea96b1 1766 node = tree_search(tree, cur_start);
2b114d1d 1767 if (!node) {
3522e903 1768 *end = (u64)-1;
d1310b2e
CM
1769 goto out;
1770 }
1771
d397712b 1772 while (1) {
d1310b2e 1773 state = rb_entry(node, struct extent_state, rb_node);
5b21f2ed
ZY
1774 if (found && (state->start != cur_start ||
1775 (state->state & EXTENT_BOUNDARY))) {
d1310b2e
CM
1776 goto out;
1777 }
1778 if (!(state->state & EXTENT_DELALLOC)) {
1779 if (!found)
1780 *end = state->end;
1781 goto out;
1782 }
c2a128d2 1783 if (!found) {
d1310b2e 1784 *start = state->start;
c2a128d2 1785 *cached_state = state;
b7ac31b7 1786 refcount_inc(&state->refs);
c2a128d2 1787 }
3522e903 1788 found = true;
d1310b2e
CM
1789 *end = state->end;
1790 cur_start = state->end + 1;
1791 node = rb_next(node);
d1310b2e 1792 total_bytes += state->end - state->start + 1;
7bf811a5 1793 if (total_bytes >= max_bytes)
573aecaf 1794 break;
573aecaf 1795 if (!node)
d1310b2e
CM
1796 break;
1797 }
1798out:
cad321ad 1799 spin_unlock(&tree->lock);
d1310b2e
CM
1800 return found;
1801}
1802
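
A minimal userspace model of the accumulation loop above, assuming hypothetical stand-ins (m_state, M_DELALLOC, M_BOUNDARY) for the extent states: starting at *start it extends over adjacent delalloc states until a gap, a boundary state, a non-delalloc state or max_bytes stops it.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define M_DELALLOC 0x1          /* hypothetical stand-in for EXTENT_DELALLOC */
#define M_BOUNDARY 0x2          /* hypothetical stand-in for EXTENT_BOUNDARY */

struct m_state { uint64_t start, end; unsigned bits; };

/* Returns true if at least one delalloc state was found at/after *start. */
static bool find_delalloc(const struct m_state *s, int nr, uint64_t *start,
                          uint64_t *end, uint64_t max_bytes)
{
        uint64_t cur = *start, total = 0;
        bool found = false;
        int i;

        for (i = 0; i < nr; i++) {
                if (s[i].end < cur)
                        continue;               /* ends before our range starts */
                if (found && (s[i].start != cur || (s[i].bits & M_BOUNDARY)))
                        break;                  /* gap or explicit boundary */
                if (!(s[i].bits & M_DELALLOC)) {
                        if (!found)
                                *end = s[i].end;
                        break;
                }
                if (!found)
                        *start = s[i].start;
                found = true;
                *end = s[i].end;
                cur = s[i].end + 1;
                total += s[i].end - s[i].start + 1;
                if (total >= max_bytes)
                        break;
        }
        return found;
}

int main(void)
{
        struct m_state st[] = {
                { 0,     4095,  M_DELALLOC },
                { 4096,  8191,  M_DELALLOC },
                { 12288, 16383, M_DELALLOC },   /* not contiguous: ignored */
        };
        uint64_t start = 0, end = 0;

        if (find_delalloc(st, 3, &start, &end, 1 << 20))
                printf("delalloc [%llu, %llu]\n",
                       (unsigned long long)start, (unsigned long long)end);
        /* prints: delalloc [0, 8191] */
        return 0;
}
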
da2c7009
LB
1803static int __process_pages_contig(struct address_space *mapping,
1804 struct page *locked_page,
1805 pgoff_t start_index, pgoff_t end_index,
1806 unsigned long page_ops, pgoff_t *index_ret);
1807
143bede5
JM
1808static noinline void __unlock_for_delalloc(struct inode *inode,
1809 struct page *locked_page,
1810 u64 start, u64 end)
c8b97818 1811{
09cbfeaf
KS
1812 unsigned long index = start >> PAGE_SHIFT;
1813 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 1814
76c0021d 1815 ASSERT(locked_page);
c8b97818 1816 if (index == locked_page->index && end_index == index)
143bede5 1817 return;
c8b97818 1818
76c0021d
LB
1819 __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1820 PAGE_UNLOCK, NULL);
c8b97818
CM
1821}
1822
1823static noinline int lock_delalloc_pages(struct inode *inode,
1824 struct page *locked_page,
1825 u64 delalloc_start,
1826 u64 delalloc_end)
1827{
09cbfeaf 1828 unsigned long index = delalloc_start >> PAGE_SHIFT;
76c0021d 1829 unsigned long index_ret = index;
09cbfeaf 1830 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
c8b97818 1831 int ret;
c8b97818 1832
76c0021d 1833 ASSERT(locked_page);
c8b97818
CM
1834 if (index == locked_page->index && index == end_index)
1835 return 0;
1836
76c0021d
LB
1837 ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1838 end_index, PAGE_LOCK, &index_ret);
1839 if (ret == -EAGAIN)
1840 __unlock_for_delalloc(inode, locked_page, delalloc_start,
1841 (u64)index_ret << PAGE_SHIFT);
c8b97818
CM
1842 return ret;
1843}
1844
1845/*
3522e903
LF
1846 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 1847 * more than @max_bytes. @start and @end are used to return the range.
c8b97818 1848 *
3522e903
LF
1849 * Return: true if we find something
1850 * false if nothing was in the tree
c8b97818 1851 */
ce9f967f 1852EXPORT_FOR_TESTS
3522e903 1853noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 1854 struct page *locked_page, u64 *start,
917aacec 1855 u64 *end)
c8b97818 1856{
9978059b 1857 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
917aacec 1858 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
1859 u64 delalloc_start;
1860 u64 delalloc_end;
3522e903 1861 bool found;
9655d298 1862 struct extent_state *cached_state = NULL;
c8b97818
CM
1863 int ret;
1864 int loops = 0;
1865
1866again:
1867 /* step one, find a bunch of delalloc bytes starting at start */
1868 delalloc_start = *start;
1869 delalloc_end = 0;
083e75e7
JB
1870 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1871 max_bytes, &cached_state);
70b99e69 1872 if (!found || delalloc_end <= *start) {
c8b97818
CM
1873 *start = delalloc_start;
1874 *end = delalloc_end;
c2a128d2 1875 free_extent_state(cached_state);
3522e903 1876 return false;
c8b97818
CM
1877 }
1878
70b99e69
CM
1879 /*
1880 * start comes from the offset of locked_page. We have to lock
1881 * pages in order, so we can't process delalloc bytes before
1882 * locked_page
1883 */
d397712b 1884 if (delalloc_start < *start)
70b99e69 1885 delalloc_start = *start;
70b99e69 1886
c8b97818
CM
1887 /*
1888 * make sure to limit the number of pages we try to lock down
c8b97818 1889 */
7bf811a5
JB
1890 if (delalloc_end + 1 - delalloc_start > max_bytes)
1891 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 1892
c8b97818
CM
1893 /* step two, lock all the pages after the page that has start */
1894 ret = lock_delalloc_pages(inode, locked_page,
1895 delalloc_start, delalloc_end);
9bfd61d9 1896 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
1897 if (ret == -EAGAIN) {
 1898 /* some of the pages are gone, let's avoid looping by
1899 * shortening the size of the delalloc range we're searching
1900 */
9655d298 1901 free_extent_state(cached_state);
7d788742 1902 cached_state = NULL;
c8b97818 1903 if (!loops) {
09cbfeaf 1904 max_bytes = PAGE_SIZE;
c8b97818
CM
1905 loops = 1;
1906 goto again;
1907 } else {
3522e903 1908 found = false;
c8b97818
CM
1909 goto out_failed;
1910 }
1911 }
c8b97818
CM
1912
1913 /* step three, lock the state bits for the whole range */
ff13db41 1914 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
1915
1916 /* then test to make sure it is all still delalloc */
1917 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 1918 EXTENT_DELALLOC, 1, cached_state);
c8b97818 1919 if (!ret) {
9655d298 1920 unlock_extent_cached(tree, delalloc_start, delalloc_end,
e43bbe5e 1921 &cached_state);
c8b97818
CM
1922 __unlock_for_delalloc(inode, locked_page,
1923 delalloc_start, delalloc_end);
1924 cond_resched();
1925 goto again;
1926 }
9655d298 1927 free_extent_state(cached_state);
c8b97818
CM
1928 *start = delalloc_start;
1929 *end = delalloc_end;
1930out_failed:
1931 return found;
1932}
1933
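
To illustrate the retry strategy above, here is a small self-contained sketch (m_lock_pages and lock_delalloc are hypothetical, and the failure condition is invented for the demo): when locking the pages fails with -EAGAIN, the window is shrunk to a single page and the whole search is retried exactly once before giving up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define M_PAGE_SIZE 4096u

/* Hypothetical page-locking step: pretend locking fails (pages "went away")
 * whenever the range spans more than 8 pages. */
static int m_lock_pages(uint64_t start, uint64_t end)
{
        return (end - start + 1) > 8 * M_PAGE_SIZE ? -11 /* -EAGAIN */ : 0;
}

/* On lock failure shrink the window to a single page and retry exactly once,
 * mirroring the loops/max_bytes dance above. */
static bool lock_delalloc(uint64_t *start, uint64_t *end)
{
        uint64_t max_bytes = 128 * 1024 * 1024;   /* stand-in for BTRFS_MAX_EXTENT_SIZE */
        uint64_t d_start, d_end;
        int loops = 0;

again:
        d_start = *start;
        d_end = *start + 64 * M_PAGE_SIZE - 1;    /* pretend this much delalloc was found */
        if (d_end + 1 - d_start > max_bytes)
                d_end = d_start + max_bytes - 1;

        if (m_lock_pages(d_start, d_end) == -11) {
                if (!loops) {
                        max_bytes = M_PAGE_SIZE;  /* shrink and try once more */
                        loops = 1;
                        goto again;
                }
                return false;
        }
        *start = d_start;
        *end = d_end;
        return true;
}

int main(void)
{
        uint64_t s = 0, e = 0;

        printf("locked=%d [%llu, %llu]\n", lock_delalloc(&s, &e),
               (unsigned long long)s, (unsigned long long)e);
        /* first attempt spans 64 pages and fails; the retry locks one page */
        return 0;
}
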
da2c7009
LB
1934static int __process_pages_contig(struct address_space *mapping,
1935 struct page *locked_page,
1936 pgoff_t start_index, pgoff_t end_index,
1937 unsigned long page_ops, pgoff_t *index_ret)
c8b97818 1938{
873695b3 1939 unsigned long nr_pages = end_index - start_index + 1;
da2c7009 1940 unsigned long pages_locked = 0;
873695b3 1941 pgoff_t index = start_index;
c8b97818 1942 struct page *pages[16];
873695b3 1943 unsigned ret;
da2c7009 1944 int err = 0;
c8b97818 1945 int i;
771ed689 1946
da2c7009
LB
1947 if (page_ops & PAGE_LOCK) {
1948 ASSERT(page_ops == PAGE_LOCK);
1949 ASSERT(index_ret && *index_ret == start_index);
1950 }
1951
704de49d 1952 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
873695b3 1953 mapping_set_error(mapping, -EIO);
704de49d 1954
d397712b 1955 while (nr_pages > 0) {
873695b3 1956 ret = find_get_pages_contig(mapping, index,
5b050f04
CM
1957 min_t(unsigned long,
1958 nr_pages, ARRAY_SIZE(pages)), pages);
da2c7009
LB
1959 if (ret == 0) {
1960 /*
1961 * Only if we're going to lock these pages,
1962 * can we find nothing at @index.
1963 */
1964 ASSERT(page_ops & PAGE_LOCK);
49d4a334
LB
1965 err = -EAGAIN;
1966 goto out;
da2c7009 1967 }
8b62b72b 1968
da2c7009 1969 for (i = 0; i < ret; i++) {
c2790a2e 1970 if (page_ops & PAGE_SET_PRIVATE2)
8b62b72b
CM
1971 SetPagePrivate2(pages[i]);
1972
1d53c9e6 1973 if (locked_page && pages[i] == locked_page) {
09cbfeaf 1974 put_page(pages[i]);
da2c7009 1975 pages_locked++;
c8b97818
CM
1976 continue;
1977 }
c2790a2e 1978 if (page_ops & PAGE_CLEAR_DIRTY)
c8b97818 1979 clear_page_dirty_for_io(pages[i]);
c2790a2e 1980 if (page_ops & PAGE_SET_WRITEBACK)
c8b97818 1981 set_page_writeback(pages[i]);
704de49d
FM
1982 if (page_ops & PAGE_SET_ERROR)
1983 SetPageError(pages[i]);
c2790a2e 1984 if (page_ops & PAGE_END_WRITEBACK)
c8b97818 1985 end_page_writeback(pages[i]);
c2790a2e 1986 if (page_ops & PAGE_UNLOCK)
771ed689 1987 unlock_page(pages[i]);
da2c7009
LB
1988 if (page_ops & PAGE_LOCK) {
1989 lock_page(pages[i]);
1990 if (!PageDirty(pages[i]) ||
1991 pages[i]->mapping != mapping) {
1992 unlock_page(pages[i]);
1993 put_page(pages[i]);
1994 err = -EAGAIN;
1995 goto out;
1996 }
1997 }
09cbfeaf 1998 put_page(pages[i]);
da2c7009 1999 pages_locked++;
c8b97818
CM
2000 }
2001 nr_pages -= ret;
2002 index += ret;
2003 cond_resched();
2004 }
da2c7009
LB
2005out:
2006 if (err && index_ret)
2007 *index_ret = start_index + pages_locked - 1;
2008 return err;
c8b97818 2009}
c8b97818 2010
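
A simplified userspace model of the batching pattern above, with m_process_one and m_process_contig as hypothetical stand-ins: pages are walked in chunks (the real code asks find_get_pages_contig() for up to 16 at a time), and on failure the caller learns via *index_ret how far processing got so it can unwind only those pages.

#include <stdio.h>

#define M_BATCH 16

/* Pretend "processing" page 40 fails; every other index succeeds. */
static int m_process_one(unsigned long index)
{
        return index == 40 ? -1 : 0;
}

/* Walk [start_index, end_index] in batches of up to M_BATCH; on failure
 * report the last successfully processed index via *index_ret. */
static int m_process_contig(unsigned long start_index, unsigned long end_index,
                            unsigned long *index_ret)
{
        unsigned long nr = end_index - start_index + 1;
        unsigned long index = start_index, done = 0;

        while (nr > 0) {
                unsigned long batch = nr < M_BATCH ? nr : M_BATCH;
                unsigned long i;

                for (i = 0; i < batch; i++) {
                        if (m_process_one(index + i)) {
                                *index_ret = start_index + done - 1;
                                return -1;
                        }
                        done++;
                }
                index += batch;
                nr -= batch;
        }
        return 0;
}

int main(void)
{
        unsigned long last = 0;

        if (m_process_contig(0, 63, &last))
                printf("failed, last processed index %lu\n", last);
        /* prints: failed, last processed index 39 */
        return 0;
}
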
873695b3 2011void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
74e9194a
NB
2012 struct page *locked_page,
2013 unsigned clear_bits,
2014 unsigned long page_ops)
873695b3
LB
2015{
2016 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
ae0f1625 2017 NULL);
873695b3
LB
2018
2019 __process_pages_contig(inode->i_mapping, locked_page,
2020 start >> PAGE_SHIFT, end >> PAGE_SHIFT,
da2c7009 2021 page_ops, NULL);
873695b3
LB
2022}
2023
d352ac68
CM
2024/*
2025 * count the number of bytes in the tree that have a given bit(s)
2026 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2027 * cached. The total number found is returned.
2028 */
d1310b2e
CM
2029u64 count_range_bits(struct extent_io_tree *tree,
2030 u64 *start, u64 search_end, u64 max_bytes,
9ee49a04 2031 unsigned bits, int contig)
d1310b2e
CM
2032{
2033 struct rb_node *node;
2034 struct extent_state *state;
2035 u64 cur_start = *start;
2036 u64 total_bytes = 0;
ec29ed5b 2037 u64 last = 0;
d1310b2e
CM
2038 int found = 0;
2039
fae7f21c 2040 if (WARN_ON(search_end <= cur_start))
d1310b2e 2041 return 0;
d1310b2e 2042
cad321ad 2043 spin_lock(&tree->lock);
d1310b2e
CM
2044 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2045 total_bytes = tree->dirty_bytes;
2046 goto out;
2047 }
2048 /*
2049 * this search will find all the extents that end after
2050 * our range starts.
2051 */
80ea96b1 2052 node = tree_search(tree, cur_start);
d397712b 2053 if (!node)
d1310b2e 2054 goto out;
d1310b2e 2055
d397712b 2056 while (1) {
d1310b2e
CM
2057 state = rb_entry(node, struct extent_state, rb_node);
2058 if (state->start > search_end)
2059 break;
ec29ed5b
CM
2060 if (contig && found && state->start > last + 1)
2061 break;
2062 if (state->end >= cur_start && (state->state & bits) == bits) {
d1310b2e
CM
2063 total_bytes += min(search_end, state->end) + 1 -
2064 max(cur_start, state->start);
2065 if (total_bytes >= max_bytes)
2066 break;
2067 if (!found) {
af60bed2 2068 *start = max(cur_start, state->start);
d1310b2e
CM
2069 found = 1;
2070 }
ec29ed5b
CM
2071 last = state->end;
2072 } else if (contig && found) {
2073 break;
d1310b2e
CM
2074 }
2075 node = rb_next(node);
2076 if (!node)
2077 break;
2078 }
2079out:
cad321ad 2080 spin_unlock(&tree->lock);
d1310b2e
CM
2081 return total_bytes;
2082}
b2950863 2083
d352ac68
CM
2084/*
2085 * set the private field for a given byte offset in the tree. If there isn't
2086 * an extent_state there already, this does nothing.
2087 */
b3f167aa
JB
2088int set_state_failrec(struct extent_io_tree *tree, u64 start,
2089 struct io_failure_record *failrec)
d1310b2e
CM
2090{
2091 struct rb_node *node;
2092 struct extent_state *state;
2093 int ret = 0;
2094
cad321ad 2095 spin_lock(&tree->lock);
d1310b2e
CM
2096 /*
2097 * this search will find all the extents that end after
2098 * our range starts.
2099 */
80ea96b1 2100 node = tree_search(tree, start);
2b114d1d 2101 if (!node) {
d1310b2e
CM
2102 ret = -ENOENT;
2103 goto out;
2104 }
2105 state = rb_entry(node, struct extent_state, rb_node);
2106 if (state->start != start) {
2107 ret = -ENOENT;
2108 goto out;
2109 }
47dc196a 2110 state->failrec = failrec;
d1310b2e 2111out:
cad321ad 2112 spin_unlock(&tree->lock);
d1310b2e
CM
2113 return ret;
2114}
2115
b3f167aa
JB
2116int get_state_failrec(struct extent_io_tree *tree, u64 start,
2117 struct io_failure_record **failrec)
d1310b2e
CM
2118{
2119 struct rb_node *node;
2120 struct extent_state *state;
2121 int ret = 0;
2122
cad321ad 2123 spin_lock(&tree->lock);
d1310b2e
CM
2124 /*
2125 * this search will find all the extents that end after
2126 * our range starts.
2127 */
80ea96b1 2128 node = tree_search(tree, start);
2b114d1d 2129 if (!node) {
d1310b2e
CM
2130 ret = -ENOENT;
2131 goto out;
2132 }
2133 state = rb_entry(node, struct extent_state, rb_node);
2134 if (state->start != start) {
2135 ret = -ENOENT;
2136 goto out;
2137 }
47dc196a 2138 *failrec = state->failrec;
d1310b2e 2139out:
cad321ad 2140 spin_unlock(&tree->lock);
d1310b2e
CM
2141 return ret;
2142}
2143
2144/*
2145 * searches a range in the state tree for a given mask.
70dec807 2146 * If 'filled' == 1, this returns 1 only if every extent in the range
d1310b2e
CM
2147 * has the bits set. Otherwise, 1 is returned if any bit in the
2148 * range is found set.
2149 */
2150int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
9ee49a04 2151 unsigned bits, int filled, struct extent_state *cached)
d1310b2e
CM
2152{
2153 struct extent_state *state = NULL;
2154 struct rb_node *node;
2155 int bitset = 0;
d1310b2e 2156
cad321ad 2157 spin_lock(&tree->lock);
27a3507d 2158 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
df98b6e2 2159 cached->end > start)
9655d298
CM
2160 node = &cached->rb_node;
2161 else
2162 node = tree_search(tree, start);
d1310b2e
CM
2163 while (node && start <= end) {
2164 state = rb_entry(node, struct extent_state, rb_node);
2165
2166 if (filled && state->start > start) {
2167 bitset = 0;
2168 break;
2169 }
2170
2171 if (state->start > end)
2172 break;
2173
2174 if (state->state & bits) {
2175 bitset = 1;
2176 if (!filled)
2177 break;
2178 } else if (filled) {
2179 bitset = 0;
2180 break;
2181 }
46562cec
CM
2182
2183 if (state->end == (u64)-1)
2184 break;
2185
d1310b2e
CM
2186 start = state->end + 1;
2187 if (start > end)
2188 break;
2189 node = rb_next(node);
2190 if (!node) {
2191 if (filled)
2192 bitset = 0;
2193 break;
2194 }
2195 }
cad321ad 2196 spin_unlock(&tree->lock);
d1310b2e
CM
2197 return bitset;
2198}
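
The filled/any distinction above can be modelled in a few lines of plain C (m_state and m_test_range_bit are hypothetical names): with filled set, any hole or any state missing the bits makes the whole test fail, while without it a single matching state is enough.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct m_state { uint64_t start, end; unsigned bits; };

/* filled=true: every byte of [start, end] must be covered by states carrying
 * 'bits' with no holes; filled=false: one matching state anywhere is enough. */
static bool m_test_range_bit(const struct m_state *s, int nr, uint64_t start,
                             uint64_t end, unsigned bits, bool filled)
{
        bool hit = false;
        int i;

        for (i = 0; i < nr && start <= end; i++) {
                if (s[i].end < start)
                        continue;
                if (filled && s[i].start > start)
                        return false;            /* hole before the next state */
                if (s[i].start > end)
                        break;
                if (s[i].bits & bits) {
                        hit = true;
                        if (!filled)
                                return true;
                } else if (filled) {
                        return false;
                }
                start = s[i].end + 1;
        }
        if (filled && start <= end)
                return false;                    /* ran out of states early */
        return hit;
}

int main(void)
{
        struct m_state st[] = { { 0, 4095, 0x1 }, { 8192, 12287, 0x1 } };

        printf("any=%d filled=%d\n",
               m_test_range_bit(st, 2, 0, 12287, 0x1, false),
               m_test_range_bit(st, 2, 0, 12287, 0x1, true));
        /* any=1 (bit present somewhere), filled=0 (hole at [4096, 8191]) */
        return 0;
}
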
d1310b2e
CM
2199
2200/*
2201 * helper function to set a given page up to date if all the
2202 * extents in the tree for that page are up to date
2203 */
143bede5 2204static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
d1310b2e 2205{
4eee4fa4 2206 u64 start = page_offset(page);
09cbfeaf 2207 u64 end = start + PAGE_SIZE - 1;
9655d298 2208 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
d1310b2e 2209 SetPageUptodate(page);
d1310b2e
CM
2210}
2211
7870d082
JB
2212int free_io_failure(struct extent_io_tree *failure_tree,
2213 struct extent_io_tree *io_tree,
2214 struct io_failure_record *rec)
4a54c8c1
JS
2215{
2216 int ret;
2217 int err = 0;
4a54c8c1 2218
47dc196a 2219 set_state_failrec(failure_tree, rec->start, NULL);
4a54c8c1
JS
2220 ret = clear_extent_bits(failure_tree, rec->start,
2221 rec->start + rec->len - 1,
91166212 2222 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1
JS
2223 if (ret)
2224 err = ret;
2225
7870d082 2226 ret = clear_extent_bits(io_tree, rec->start,
53b381b3 2227 rec->start + rec->len - 1,
91166212 2228 EXTENT_DAMAGED);
53b381b3
DW
2229 if (ret && !err)
2230 err = ret;
4a54c8c1
JS
2231
2232 kfree(rec);
2233 return err;
2234}
2235
4a54c8c1
JS
2236/*
2237 * this bypasses the standard btrfs submit functions deliberately, as
2238 * the standard behavior is to write all copies in a raid setup. here we only
2239 * want to write the one bad copy. so we do the mapping for ourselves and issue
2240 * submit_bio directly.
3ec706c8 2241 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
2242 * actually prevents the read that triggered the error from finishing.
2243 * currently, there can be no more than two copies of every data bit. thus,
2244 * exactly one rewrite is required.
2245 */
6ec656bc
JB
2246int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2247 u64 length, u64 logical, struct page *page,
2248 unsigned int pg_offset, int mirror_num)
4a54c8c1
JS
2249{
2250 struct bio *bio;
2251 struct btrfs_device *dev;
4a54c8c1
JS
2252 u64 map_length = 0;
2253 u64 sector;
2254 struct btrfs_bio *bbio = NULL;
2255 int ret;
2256
1751e8a6 2257 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
2258 BUG_ON(!mirror_num);
2259
c5e4c3d7 2260 bio = btrfs_io_bio_alloc(1);
4f024f37 2261 bio->bi_iter.bi_size = 0;
4a54c8c1
JS
2262 map_length = length;
2263
b5de8d0d
FM
2264 /*
2265 * Avoid races with device replace and make sure our bbio has devices
2266 * associated to its stripes that don't go away while we are doing the
2267 * read repair operation.
2268 */
2269 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 2270 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
2271 /*
2272 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2273 * to update all raid stripes, but here we just want to correct
2274 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2275 * stripe's dev and sector.
2276 */
2277 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2278 &map_length, &bbio, 0);
2279 if (ret) {
2280 btrfs_bio_counter_dec(fs_info);
2281 bio_put(bio);
2282 return -EIO;
2283 }
2284 ASSERT(bbio->mirror_num == 1);
2285 } else {
2286 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2287 &map_length, &bbio, mirror_num);
2288 if (ret) {
2289 btrfs_bio_counter_dec(fs_info);
2290 bio_put(bio);
2291 return -EIO;
2292 }
2293 BUG_ON(mirror_num != bbio->mirror_num);
4a54c8c1 2294 }
c725328c
LB
2295
2296 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
4f024f37 2297 bio->bi_iter.bi_sector = sector;
c725328c 2298 dev = bbio->stripes[bbio->mirror_num - 1].dev;
6e9606d2 2299 btrfs_put_bbio(bbio);
ebbede42
AJ
2300 if (!dev || !dev->bdev ||
2301 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
b5de8d0d 2302 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2303 bio_put(bio);
2304 return -EIO;
2305 }
74d46992 2306 bio_set_dev(bio, dev->bdev);
70fd7614 2307 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ffdd2018 2308 bio_add_page(bio, page, length, pg_offset);
4a54c8c1 2309
4e49ea4a 2310 if (btrfsic_submit_bio_wait(bio)) {
4a54c8c1 2311 /* try to remap that extent elsewhere? */
b5de8d0d 2312 btrfs_bio_counter_dec(fs_info);
4a54c8c1 2313 bio_put(bio);
442a4f63 2314 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4a54c8c1
JS
2315 return -EIO;
2316 }
2317
b14af3b4
DS
2318 btrfs_info_rl_in_rcu(fs_info,
2319 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 2320 ino, start,
1203b681 2321 rcu_str_deref(dev->name), sector);
b5de8d0d 2322 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2323 bio_put(bio);
2324 return 0;
2325}
2326
20a1fbf9 2327int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
ea466794 2328{
20a1fbf9 2329 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 2330 u64 start = eb->start;
cc5e31a4 2331 int i, num_pages = num_extent_pages(eb);
d95603b2 2332 int ret = 0;
ea466794 2333
bc98a42c 2334 if (sb_rdonly(fs_info->sb))
908960c6
ID
2335 return -EROFS;
2336
ea466794 2337 for (i = 0; i < num_pages; i++) {
fb85fc9a 2338 struct page *p = eb->pages[i];
1203b681 2339
6ec656bc 2340 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 2341 start - page_offset(p), mirror_num);
ea466794
JB
2342 if (ret)
2343 break;
09cbfeaf 2344 start += PAGE_SIZE;
ea466794
JB
2345 }
2346
2347 return ret;
2348}
2349
4a54c8c1
JS
2350/*
2351 * each time an IO finishes, we do a fast check in the IO failure tree
2352 * to see if we need to process or clean up an io_failure_record
2353 */
7870d082
JB
2354int clean_io_failure(struct btrfs_fs_info *fs_info,
2355 struct extent_io_tree *failure_tree,
2356 struct extent_io_tree *io_tree, u64 start,
2357 struct page *page, u64 ino, unsigned int pg_offset)
4a54c8c1
JS
2358{
2359 u64 private;
4a54c8c1 2360 struct io_failure_record *failrec;
4a54c8c1
JS
2361 struct extent_state *state;
2362 int num_copies;
4a54c8c1 2363 int ret;
4a54c8c1
JS
2364
2365 private = 0;
7870d082
JB
2366 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2367 EXTENT_DIRTY, 0);
4a54c8c1
JS
2368 if (!ret)
2369 return 0;
2370
7870d082 2371 ret = get_state_failrec(failure_tree, start, &failrec);
4a54c8c1
JS
2372 if (ret)
2373 return 0;
2374
4a54c8c1
JS
2375 BUG_ON(!failrec->this_mirror);
2376
2377 if (failrec->in_validation) {
2378 /* there was no real error, just free the record */
ab8d0fc4
JM
2379 btrfs_debug(fs_info,
2380 "clean_io_failure: freeing dummy error at %llu",
2381 failrec->start);
4a54c8c1
JS
2382 goto out;
2383 }
bc98a42c 2384 if (sb_rdonly(fs_info->sb))
908960c6 2385 goto out;
4a54c8c1 2386
7870d082
JB
2387 spin_lock(&io_tree->lock);
2388 state = find_first_extent_bit_state(io_tree,
4a54c8c1
JS
2389 failrec->start,
2390 EXTENT_LOCKED);
7870d082 2391 spin_unlock(&io_tree->lock);
4a54c8c1 2392
883d0de4
MX
2393 if (state && state->start <= failrec->start &&
2394 state->end >= failrec->start + failrec->len - 1) {
3ec706c8
SB
2395 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2396 failrec->len);
4a54c8c1 2397 if (num_copies > 1) {
7870d082
JB
2398 repair_io_failure(fs_info, ino, start, failrec->len,
2399 failrec->logical, page, pg_offset,
2400 failrec->failed_mirror);
4a54c8c1
JS
2401 }
2402 }
2403
2404out:
7870d082 2405 free_io_failure(failure_tree, io_tree, failrec);
4a54c8c1 2406
454ff3de 2407 return 0;
4a54c8c1
JS
2408}
2409
f612496b
MX
2410/*
2411 * Can be called when
2412 * - hold extent lock
2413 * - under ordered extent
2414 * - the inode is freeing
2415 */
7ab7956e 2416void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 2417{
7ab7956e 2418 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
f612496b
MX
2419 struct io_failure_record *failrec;
2420 struct extent_state *state, *next;
2421
2422 if (RB_EMPTY_ROOT(&failure_tree->state))
2423 return;
2424
2425 spin_lock(&failure_tree->lock);
2426 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2427 while (state) {
2428 if (state->start > end)
2429 break;
2430
2431 ASSERT(state->end <= end);
2432
2433 next = next_state(state);
2434
47dc196a 2435 failrec = state->failrec;
f612496b
MX
2436 free_extent_state(state);
2437 kfree(failrec);
2438
2439 state = next;
2440 }
2441 spin_unlock(&failure_tree->lock);
2442}
2443
2fe6303e 2444int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
47dc196a 2445 struct io_failure_record **failrec_ret)
4a54c8c1 2446{
ab8d0fc4 2447 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2448 struct io_failure_record *failrec;
4a54c8c1 2449 struct extent_map *em;
4a54c8c1
JS
2450 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2451 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2452 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4a54c8c1 2453 int ret;
4a54c8c1
JS
2454 u64 logical;
2455
47dc196a 2456 ret = get_state_failrec(failure_tree, start, &failrec);
4a54c8c1
JS
2457 if (ret) {
2458 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2459 if (!failrec)
2460 return -ENOMEM;
2fe6303e 2461
4a54c8c1
JS
2462 failrec->start = start;
2463 failrec->len = end - start + 1;
2464 failrec->this_mirror = 0;
2465 failrec->bio_flags = 0;
2466 failrec->in_validation = 0;
2467
2468 read_lock(&em_tree->lock);
2469 em = lookup_extent_mapping(em_tree, start, failrec->len);
2470 if (!em) {
2471 read_unlock(&em_tree->lock);
2472 kfree(failrec);
2473 return -EIO;
2474 }
2475
68ba990f 2476 if (em->start > start || em->start + em->len <= start) {
4a54c8c1
JS
2477 free_extent_map(em);
2478 em = NULL;
2479 }
2480 read_unlock(&em_tree->lock);
7a2d6a64 2481 if (!em) {
4a54c8c1
JS
2482 kfree(failrec);
2483 return -EIO;
2484 }
2fe6303e 2485
4a54c8c1
JS
2486 logical = start - em->start;
2487 logical = em->block_start + logical;
2488 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2489 logical = em->block_start;
2490 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2491 extent_set_compress_type(&failrec->bio_flags,
2492 em->compress_type);
2493 }
2fe6303e 2494
ab8d0fc4
JM
2495 btrfs_debug(fs_info,
2496 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2497 logical, start, failrec->len);
2fe6303e 2498
4a54c8c1
JS
2499 failrec->logical = logical;
2500 free_extent_map(em);
2501
2502 /* set the bits in the private failure tree */
2503 ret = set_extent_bits(failure_tree, start, end,
ceeb0ae7 2504 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1 2505 if (ret >= 0)
47dc196a 2506 ret = set_state_failrec(failure_tree, start, failrec);
4a54c8c1
JS
2507 /* set the bits in the inode's tree */
2508 if (ret >= 0)
ceeb0ae7 2509 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
4a54c8c1
JS
2510 if (ret < 0) {
2511 kfree(failrec);
2512 return ret;
2513 }
2514 } else {
ab8d0fc4
JM
2515 btrfs_debug(fs_info,
2516 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2517 failrec->logical, failrec->start, failrec->len,
2518 failrec->in_validation);
4a54c8c1
JS
2519 /*
2520 * when data can be on disk more than twice, add to failrec here
2521 * (e.g. with a list for failed_mirror) to make
2522 * clean_io_failure() clean all those errors at once.
2523 */
2524 }
2fe6303e
MX
2525
2526 *failrec_ret = failrec;
2527
2528 return 0;
2529}
2530
a0b60d72 2531bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2fe6303e
MX
2532 struct io_failure_record *failrec, int failed_mirror)
2533{
ab8d0fc4 2534 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2535 int num_copies;
2536
ab8d0fc4 2537 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
4a54c8c1
JS
2538 if (num_copies == 1) {
2539 /*
2540 * we only have a single copy of the data, so don't bother with
2541 * all the retry and error correction code that follows. no
2542 * matter what the error is, it is very likely to persist.
2543 */
ab8d0fc4
JM
2544 btrfs_debug(fs_info,
2545 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2546 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2547 return false;
4a54c8c1
JS
2548 }
2549
4a54c8c1
JS
2550 /*
2551 * there are two premises:
2552 * a) deliver good data to the caller
2553 * b) correct the bad sectors on disk
2554 */
a0b60d72 2555 if (failed_bio_pages > 1) {
4a54c8c1
JS
2556 /*
2557 * to fulfill b), we need to know the exact failing sectors, as
2558 * we don't want to rewrite any more than the failed ones. thus,
2559 * we need separate read requests for the failed bio
2560 *
2561 * if the following BUG_ON triggers, our validation request got
2562 * merged. we need separate requests for our algorithm to work.
2563 */
2564 BUG_ON(failrec->in_validation);
2565 failrec->in_validation = 1;
2566 failrec->this_mirror = failed_mirror;
4a54c8c1
JS
2567 } else {
2568 /*
2569 * we're ready to fulfill a) and b) alongside. get a good copy
2570 * of the failed sector and if we succeed, we have setup
2571 * everything for repair_io_failure to do the rest for us.
2572 */
2573 if (failrec->in_validation) {
2574 BUG_ON(failrec->this_mirror != failed_mirror);
2575 failrec->in_validation = 0;
2576 failrec->this_mirror = 0;
2577 }
2578 failrec->failed_mirror = failed_mirror;
2579 failrec->this_mirror++;
2580 if (failrec->this_mirror == failed_mirror)
2581 failrec->this_mirror++;
4a54c8c1
JS
2582 }
2583
facc8a22 2584 if (failrec->this_mirror > num_copies) {
ab8d0fc4
JM
2585 btrfs_debug(fs_info,
2586 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2587 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2588 return false;
4a54c8c1
JS
2589 }
2590
c3cfb656 2591 return true;
2fe6303e
MX
2592}
2593
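
A compact sketch of the mirror-selection rule implied above (next_mirror is a hypothetical helper, not a btrfs function): step to the next mirror, skip the one that already failed, and stop once every copy has been tried or there is only a single copy.

#include <stdio.h>

/* Returns the mirror to read next, or 0 when no copies are left to try. */
static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
        if (num_copies == 1)
                return 0;                 /* single copy: nothing to repair from */
        this_mirror++;
        if (this_mirror == failed_mirror)
                this_mirror++;            /* never re-read the known-bad mirror */
        return this_mirror > num_copies ? 0 : this_mirror;
}

int main(void)
{
        int m = 0, failed = 1, copies = 3;

        /* read of mirror 1 failed; retries walk mirror 2, then 3, then give up */
        while ((m = next_mirror(m, failed, copies)))
                printf("retry mirror %d\n", m);
        return 0;
}
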
2594
2595struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2596 struct io_failure_record *failrec,
2597 struct page *page, int pg_offset, int icsum,
8b110e39 2598 bio_end_io_t *endio_func, void *data)
2fe6303e 2599{
0b246afa 2600 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2601 struct bio *bio;
2602 struct btrfs_io_bio *btrfs_failed_bio;
2603 struct btrfs_io_bio *btrfs_bio;
2604
c5e4c3d7 2605 bio = btrfs_io_bio_alloc(1);
2fe6303e 2606 bio->bi_end_io = endio_func;
4f024f37 2607 bio->bi_iter.bi_sector = failrec->logical >> 9;
4f024f37 2608 bio->bi_iter.bi_size = 0;
8b110e39 2609 bio->bi_private = data;
4a54c8c1 2610
facc8a22
MX
2611 btrfs_failed_bio = btrfs_io_bio(failed_bio);
2612 if (btrfs_failed_bio->csum) {
facc8a22
MX
2613 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2614
2615 btrfs_bio = btrfs_io_bio(bio);
2616 btrfs_bio->csum = btrfs_bio->csum_inline;
2fe6303e
MX
2617 icsum *= csum_size;
2618 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
facc8a22
MX
2619 csum_size);
2620 }
2621
2fe6303e
MX
2622 bio_add_page(bio, page, failrec->len, pg_offset);
2623
2624 return bio;
2625}
2626
2627/*
78e62c02
NB
2628 * This is a generic handler for readpage errors. If other copies exist, read
 2629 * those and write back good data to the failed position. Does not attempt to
 2630 * remap the failed extent elsewhere, hoping the device will be smart enough to
 2631 * do this as needed.
2fe6303e 2632 */
2fe6303e
MX
2633static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2634 struct page *page, u64 start, u64 end,
2635 int failed_mirror)
2636{
2637 struct io_failure_record *failrec;
2638 struct inode *inode = page->mapping->host;
2639 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
7870d082 2640 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2fe6303e 2641 struct bio *bio;
70fd7614 2642 int read_mode = 0;
4e4cbee9 2643 blk_status_t status;
2fe6303e 2644 int ret;
8a2ee44a 2645 unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
2fe6303e 2646
1f7ad75b 2647 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e
MX
2648
2649 ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2650 if (ret)
2651 return ret;
2652
a0b60d72 2653 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
c3cfb656 2654 failed_mirror)) {
7870d082 2655 free_io_failure(failure_tree, tree, failrec);
2fe6303e
MX
2656 return -EIO;
2657 }
2658
a0b60d72 2659 if (failed_bio_pages > 1)
70fd7614 2660 read_mode |= REQ_FAILFAST_DEV;
2fe6303e
MX
2661
2662 phy_offset >>= inode->i_sb->s_blocksize_bits;
2663 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2664 start - page_offset(page),
8b110e39
MX
2665 (int)phy_offset, failed_bio->bi_end_io,
2666 NULL);
ebcc3263 2667 bio->bi_opf = REQ_OP_READ | read_mode;
4a54c8c1 2668
ab8d0fc4
JM
2669 btrfs_debug(btrfs_sb(inode->i_sb),
2670 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
2671 read_mode, failrec->this_mirror, failrec->in_validation);
4a54c8c1 2672
8c27cb35 2673 status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
50489a57 2674 failrec->bio_flags);
4e4cbee9 2675 if (status) {
7870d082 2676 free_io_failure(failure_tree, tree, failrec);
6c387ab2 2677 bio_put(bio);
4e4cbee9 2678 ret = blk_status_to_errno(status);
6c387ab2
MX
2679 }
2680
013bd4c3 2681 return ret;
4a54c8c1
JS
2682}
2683
d1310b2e
CM
2684/* lots and lots of room for performance fixes in the end_bio funcs */
2685
b5227c07 2686void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0
JM
2687{
2688 int uptodate = (err == 0);
3e2426bd 2689 int ret = 0;
87826df0 2690
c629732d 2691 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
87826df0 2692
87826df0 2693 if (!uptodate) {
87826df0
JM
2694 ClearPageUptodate(page);
2695 SetPageError(page);
bff5baf8 2696 ret = err < 0 ? err : -EIO;
5dca6eea 2697 mapping_set_error(page->mapping, ret);
87826df0 2698 }
87826df0
JM
2699}
2700
d1310b2e
CM
2701/*
2702 * after a writepage IO is done, we need to:
2703 * clear the uptodate bits on error
2704 * clear the writeback bits in the extent tree for this IO
2705 * end_page_writeback if the page has no more pending IO
2706 *
2707 * Scheduling is not allowed, so the extent state tree is expected
2708 * to have one and only one object corresponding to this IO.
2709 */
4246a0b6 2710static void end_bio_extent_writepage(struct bio *bio)
d1310b2e 2711{
4e4cbee9 2712 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 2713 struct bio_vec *bvec;
d1310b2e
CM
2714 u64 start;
2715 u64 end;
6dc4f100 2716 struct bvec_iter_all iter_all;
d1310b2e 2717
c09abff8 2718 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2719 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2720 struct page *page = bvec->bv_page;
0b246afa
JM
2721 struct inode *inode = page->mapping->host;
2722 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
902b22f3 2723
17a5adcc
AO
2724 /* We always issue full-page reads, but if some block
2725 * in a page fails to read, blk_update_request() will
2726 * advance bv_offset and adjust bv_len to compensate.
2727 * Print a warning for nonzero offsets, and an error
2728 * if they don't add up to a full page. */
09cbfeaf
KS
2729 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2730 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
0b246afa 2731 btrfs_err(fs_info,
efe120a0
FH
2732 "partial page write in btrfs with offset %u and length %u",
2733 bvec->bv_offset, bvec->bv_len);
2734 else
0b246afa 2735 btrfs_info(fs_info,
5d163e0e 2736 "incomplete page write in btrfs with offset %u and length %u",
efe120a0
FH
2737 bvec->bv_offset, bvec->bv_len);
2738 }
d1310b2e 2739
17a5adcc
AO
2740 start = page_offset(page);
2741 end = start + bvec->bv_offset + bvec->bv_len - 1;
d1310b2e 2742
4e4cbee9 2743 end_extent_writepage(page, error, start, end);
17a5adcc 2744 end_page_writeback(page);
2c30c71b 2745 }
2b1f55b0 2746
d1310b2e 2747 bio_put(bio);
d1310b2e
CM
2748}
2749
883d0de4
MX
2750static void
2751endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2752 int uptodate)
2753{
2754 struct extent_state *cached = NULL;
2755 u64 end = start + len - 1;
2756
2757 if (uptodate && tree->track_uptodate)
2758 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
d810a4be 2759 unlock_extent_cached_atomic(tree, start, end, &cached);
883d0de4
MX
2760}
2761
d1310b2e
CM
2762/*
2763 * after a readpage IO is done, we need to:
2764 * clear the uptodate bits on error
2765 * set the uptodate bits if things worked
2766 * set the page up to date if all extents in the tree are uptodate
2767 * clear the lock bit in the extent tree
2768 * unlock the page if there are no other extents locked for it
2769 *
2770 * Scheduling is not allowed, so the extent state tree is expected
2771 * to have one and only one object corresponding to this IO.
2772 */
4246a0b6 2773static void end_bio_extent_readpage(struct bio *bio)
d1310b2e 2774{
2c30c71b 2775 struct bio_vec *bvec;
4e4cbee9 2776 int uptodate = !bio->bi_status;
facc8a22 2777 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7870d082 2778 struct extent_io_tree *tree, *failure_tree;
facc8a22 2779 u64 offset = 0;
d1310b2e
CM
2780 u64 start;
2781 u64 end;
facc8a22 2782 u64 len;
883d0de4
MX
2783 u64 extent_start = 0;
2784 u64 extent_len = 0;
5cf1ab56 2785 int mirror;
d1310b2e 2786 int ret;
6dc4f100 2787 struct bvec_iter_all iter_all;
d1310b2e 2788
c09abff8 2789 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2790 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2791 struct page *page = bvec->bv_page;
a71754fc 2792 struct inode *inode = page->mapping->host;
ab8d0fc4 2793 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
78e62c02
NB
2794 bool data_inode = btrfs_ino(BTRFS_I(inode))
2795 != BTRFS_BTREE_INODE_OBJECTID;
507903b8 2796
ab8d0fc4
JM
2797 btrfs_debug(fs_info,
2798 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
4e4cbee9 2799 (u64)bio->bi_iter.bi_sector, bio->bi_status,
ab8d0fc4 2800 io_bio->mirror_num);
a71754fc 2801 tree = &BTRFS_I(inode)->io_tree;
7870d082 2802 failure_tree = &BTRFS_I(inode)->io_failure_tree;
902b22f3 2803
17a5adcc
AO
2804 /* We always issue full-page reads, but if some block
2805 * in a page fails to read, blk_update_request() will
2806 * advance bv_offset and adjust bv_len to compensate.
2807 * Print a warning for nonzero offsets, and an error
2808 * if they don't add up to a full page. */
09cbfeaf
KS
2809 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2810 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
ab8d0fc4
JM
2811 btrfs_err(fs_info,
2812 "partial page read in btrfs with offset %u and length %u",
efe120a0
FH
2813 bvec->bv_offset, bvec->bv_len);
2814 else
ab8d0fc4
JM
2815 btrfs_info(fs_info,
2816 "incomplete page read in btrfs with offset %u and length %u",
efe120a0
FH
2817 bvec->bv_offset, bvec->bv_len);
2818 }
d1310b2e 2819
17a5adcc
AO
2820 start = page_offset(page);
2821 end = start + bvec->bv_offset + bvec->bv_len - 1;
facc8a22 2822 len = bvec->bv_len;
d1310b2e 2823
9be3395b 2824 mirror = io_bio->mirror_num;
78e62c02 2825 if (likely(uptodate)) {
facc8a22
MX
2826 ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2827 page, start, end,
2828 mirror);
5ee0844d 2829 if (ret)
d1310b2e 2830 uptodate = 0;
5ee0844d 2831 else
7870d082
JB
2832 clean_io_failure(BTRFS_I(inode)->root->fs_info,
2833 failure_tree, tree, start,
2834 page,
2835 btrfs_ino(BTRFS_I(inode)), 0);
d1310b2e 2836 }
ea466794 2837
f2a09da9
MX
2838 if (likely(uptodate))
2839 goto readpage_ok;
2840
78e62c02 2841 if (data_inode) {
9d0d1c8b 2842
f4a8e656 2843 /*
78e62c02
NB
2844 * The generic bio_readpage_error handles errors the
2845 * following way: If possible, new read requests are
2846 * created and submitted and will end up in
2847 * end_bio_extent_readpage as well (if we're lucky,
2848 * not in the !uptodate case). In that case it returns
2849 * 0 and we just go on with the next page in our bio.
2850 * If it can't handle the error it will return -EIO and
2851 * we remain responsible for that page.
f4a8e656 2852 */
78e62c02
NB
2853 ret = bio_readpage_error(bio, offset, page, start, end,
2854 mirror);
2855 if (ret == 0) {
2856 uptodate = !bio->bi_status;
2857 offset += len;
2858 continue;
2859 }
2860 } else {
2861 struct extent_buffer *eb;
2862
2863 eb = (struct extent_buffer *)page->private;
2864 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
2865 eb->read_mirror = mirror;
2866 atomic_dec(&eb->io_pages);
2867 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
2868 &eb->bflags))
2869 btree_readahead_hook(eb, -EIO);
7e38326f 2870 }
f2a09da9 2871readpage_ok:
883d0de4 2872 if (likely(uptodate)) {
a71754fc 2873 loff_t i_size = i_size_read(inode);
09cbfeaf 2874 pgoff_t end_index = i_size >> PAGE_SHIFT;
a583c026 2875 unsigned off;
a71754fc
JB
2876
2877 /* Zero out the end if this page straddles i_size */
7073017a 2878 off = offset_in_page(i_size);
a583c026 2879 if (page->index == end_index && off)
09cbfeaf 2880 zero_user_segment(page, off, PAGE_SIZE);
17a5adcc 2881 SetPageUptodate(page);
70dec807 2882 } else {
17a5adcc
AO
2883 ClearPageUptodate(page);
2884 SetPageError(page);
70dec807 2885 }
17a5adcc 2886 unlock_page(page);
facc8a22 2887 offset += len;
883d0de4
MX
2888
2889 if (unlikely(!uptodate)) {
2890 if (extent_len) {
2891 endio_readpage_release_extent(tree,
2892 extent_start,
2893 extent_len, 1);
2894 extent_start = 0;
2895 extent_len = 0;
2896 }
2897 endio_readpage_release_extent(tree, start,
2898 end - start + 1, 0);
2899 } else if (!extent_len) {
2900 extent_start = start;
2901 extent_len = end + 1 - start;
2902 } else if (extent_start + extent_len == start) {
2903 extent_len += end + 1 - start;
2904 } else {
2905 endio_readpage_release_extent(tree, extent_start,
2906 extent_len, uptodate);
2907 extent_start = start;
2908 extent_len = end + 1 - start;
2909 }
2c30c71b 2910 }
d1310b2e 2911
883d0de4
MX
2912 if (extent_len)
2913 endio_readpage_release_extent(tree, extent_start, extent_len,
2914 uptodate);
b3a0dd50 2915 btrfs_io_bio_free_csum(io_bio);
d1310b2e 2916 bio_put(bio);
d1310b2e
CM
2917}
2918
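
The extent_start/extent_len bookkeeping in the read end_io path above coalesces adjacent completed ranges so they are released in as few calls as possible. A small standalone model of that run-length merging (release_range is a hypothetical stand-in for endio_readpage_release_extent):

#include <stdint.h>
#include <stdio.h>

static void release_range(uint64_t start, uint64_t len)
{
        printf("release [%llu, %llu]\n", (unsigned long long)start,
               (unsigned long long)(start + len - 1));
}

int main(void)
{
        /* page-sized ranges as they complete: three adjacent, then one gap */
        uint64_t starts[] = { 0, 4096, 8192, 20480 };
        uint64_t extent_start = 0, extent_len = 0;
        int i;

        for (i = 0; i < 4; i++) {
                uint64_t start = starts[i], end = start + 4095;

                if (!extent_len) {                          /* first range */
                        extent_start = start;
                        extent_len = end + 1 - start;
                } else if (extent_start + extent_len == start) {
                        extent_len += end + 1 - start;      /* extend the run */
                } else {
                        release_range(extent_start, extent_len);
                        extent_start = start;               /* start a new run */
                        extent_len = end + 1 - start;
                }
        }
        if (extent_len)
                release_range(extent_start, extent_len);
        /* prints: release [0, 12287] then release [20480, 24575] */
        return 0;
}
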
9be3395b 2919/*
184f999e
DS
2920 * Initialize the members up to but not including 'bio'. Use after allocating a
2921 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
2922 * 'bio' because use of __GFP_ZERO is not supported.
9be3395b 2923 */
184f999e 2924static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
d1310b2e 2925{
184f999e
DS
2926 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
2927}
d1310b2e 2928
9be3395b 2929/*
6e707bcd
DS
2930 * The following helpers allocate a bio. As it's backed by a bioset, it'll
2931 * never fail. We're returning a bio right now but you can call btrfs_io_bio
2932 * for the appropriate container_of magic
9be3395b 2933 */
e749af44 2934struct bio *btrfs_bio_alloc(u64 first_byte)
d1310b2e
CM
2935{
2936 struct bio *bio;
d1310b2e 2937
8ac9f7c1 2938 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
c821e7f3 2939 bio->bi_iter.bi_sector = first_byte >> 9;
184f999e 2940 btrfs_io_bio_init(btrfs_io_bio(bio));
d1310b2e
CM
2941 return bio;
2942}
2943
8b6c1d56 2944struct bio *btrfs_bio_clone(struct bio *bio)
9be3395b 2945{
23ea8e5a
MX
2946 struct btrfs_io_bio *btrfs_bio;
2947 struct bio *new;
9be3395b 2948
6e707bcd 2949 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 2950 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
6e707bcd 2951 btrfs_bio = btrfs_io_bio(new);
184f999e 2952 btrfs_io_bio_init(btrfs_bio);
6e707bcd 2953 btrfs_bio->iter = bio->bi_iter;
23ea8e5a
MX
2954 return new;
2955}
9be3395b 2956
c5e4c3d7 2957struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
9be3395b 2958{
facc8a22
MX
2959 struct bio *bio;
2960
6e707bcd 2961 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 2962 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
184f999e 2963 btrfs_io_bio_init(btrfs_io_bio(bio));
facc8a22 2964 return bio;
9be3395b
CM
2965}
2966
e477094f 2967struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2f8e9140
LB
2968{
2969 struct bio *bio;
2970 struct btrfs_io_bio *btrfs_bio;
2971
2972 /* this will never fail when it's backed by a bioset */
8ac9f7c1 2973 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2f8e9140
LB
2974 ASSERT(bio);
2975
2976 btrfs_bio = btrfs_io_bio(bio);
184f999e 2977 btrfs_io_bio_init(btrfs_bio);
2f8e9140
LB
2978
2979 bio_trim(bio, offset >> 9, size >> 9);
17347cec 2980 btrfs_bio->iter = bio->bi_iter;
2f8e9140
LB
2981 return bio;
2982}
9be3395b 2983
4b81ba48
DS
2984/*
2985 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625
DS
2986 * @wbc: optional writeback control for io accounting
2987 * @page: page to add to the bio
 2988 * @pg_offset: offset within the page at which to add the data, also used
 2989 * to check whether we are adding a contiguous page to the previous one
 2990 * @size: portion of page that we want to write
 2991 * @offset: starting disk byte offset of the IO (used to compute the sector)
5c2b1fd7 2992 * @bio_ret: must be valid pointer, newly allocated bio will be stored there
b8b3d625
DS
2993 * @end_io_func: end_io callback for new bio
2994 * @mirror_num: desired mirror to read/write
2995 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
2996 * @bio_flags: flags of the current bio to see if we can merge them
4b81ba48 2997 */
0ceb34bf 2998static int submit_extent_page(unsigned int opf,
da2f0f74 2999 struct writeback_control *wbc,
6273b7f8 3000 struct page *page, u64 offset,
6c5a4e2c 3001 size_t size, unsigned long pg_offset,
d1310b2e 3002 struct bio **bio_ret,
f188591e 3003 bio_end_io_t end_io_func,
c8b97818
CM
3004 int mirror_num,
3005 unsigned long prev_bio_flags,
005efedf
FM
3006 unsigned long bio_flags,
3007 bool force_bio_submit)
d1310b2e
CM
3008{
3009 int ret = 0;
3010 struct bio *bio;
09cbfeaf 3011 size_t page_size = min_t(size_t, size, PAGE_SIZE);
6273b7f8 3012 sector_t sector = offset >> 9;
0ceb34bf 3013 struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
d1310b2e 3014
5c2b1fd7
DS
3015 ASSERT(bio_ret);
3016
3017 if (*bio_ret) {
0c8508a6
DS
3018 bool contig;
3019 bool can_merge = true;
3020
d1310b2e 3021 bio = *bio_ret;
0c8508a6 3022 if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
4f024f37 3023 contig = bio->bi_iter.bi_sector == sector;
c8b97818 3024 else
f73a1c7d 3025 contig = bio_end_sector(bio) == sector;
c8b97818 3026
da12fe54
NB
3027 ASSERT(tree->ops);
3028 if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
0c8508a6
DS
3029 can_merge = false;
3030
3031 if (prev_bio_flags != bio_flags || !contig || !can_merge ||
005efedf 3032 force_bio_submit ||
6c5a4e2c 3033 bio_add_page(bio, page, page_size, pg_offset) < page_size) {
1f7ad75b 3034 ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
289454ad
NA
3035 if (ret < 0) {
3036 *bio_ret = NULL;
79787eaa 3037 return ret;
289454ad 3038 }
d1310b2e
CM
3039 bio = NULL;
3040 } else {
da2f0f74 3041 if (wbc)
34e51a5e 3042 wbc_account_cgroup_owner(wbc, page, page_size);
d1310b2e
CM
3043 return 0;
3044 }
3045 }
c8b97818 3046
e749af44 3047 bio = btrfs_bio_alloc(offset);
6c5a4e2c 3048 bio_add_page(bio, page, page_size, pg_offset);
d1310b2e
CM
3049 bio->bi_end_io = end_io_func;
3050 bio->bi_private = tree;
e6959b93 3051 bio->bi_write_hint = page->mapping->host->i_write_hint;
4b81ba48 3052 bio->bi_opf = opf;
da2f0f74 3053 if (wbc) {
429aebc0
DS
3054 struct block_device *bdev;
3055
3056 bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
3057 bio_set_dev(bio, bdev);
da2f0f74 3058 wbc_init_bio(wbc, bio);
34e51a5e 3059 wbc_account_cgroup_owner(wbc, page, page_size);
da2f0f74 3060 }
70dec807 3061
5c2b1fd7 3062 *bio_ret = bio;
d1310b2e
CM
3063
3064 return ret;
3065}
3066
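
A rough model of the merge-or-submit decision made above when a bio is already in flight (struct m_bio and can_merge are hypothetical): matching flags, physical contiguity, no forced submit and spare capacity are all required before a new chunk joins the current bio; otherwise the caller submits it and opens a new one.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct m_bio {
        uint64_t next_sector;   /* sector right after the data already queued */
        unsigned long flags;
        unsigned size, cap;
};

static bool can_merge(const struct m_bio *bio, uint64_t sector,
                      unsigned long flags, unsigned len, bool force_submit)
{
        if (force_submit)
                return false;
        if (bio->flags != flags)
                return false;
        if (bio->next_sector != sector)         /* not contiguous on disk */
                return false;
        return bio->size + len <= bio->cap;     /* room left in the bio */
}

int main(void)
{
        struct m_bio bio = { .next_sector = 8, .flags = 0, .size = 4096,
                             .cap = 1 << 20 };

        printf("%d %d\n",
               can_merge(&bio, 8, 0, 4096, false),    /* contiguous: merge */
               can_merge(&bio, 64, 0, 4096, false));  /* gap: submit + new bio */
        return 0;
}
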
48a3b636
ES
3067static void attach_extent_buffer_page(struct extent_buffer *eb,
3068 struct page *page)
d1310b2e
CM
3069{
3070 if (!PagePrivate(page)) {
3071 SetPagePrivate(page);
09cbfeaf 3072 get_page(page);
4f2de97a
JB
3073 set_page_private(page, (unsigned long)eb);
3074 } else {
3075 WARN_ON(page->private != (unsigned long)eb);
d1310b2e
CM
3076 }
3077}
3078
4f2de97a 3079void set_page_extent_mapped(struct page *page)
d1310b2e 3080{
4f2de97a
JB
3081 if (!PagePrivate(page)) {
3082 SetPagePrivate(page);
09cbfeaf 3083 get_page(page);
4f2de97a
JB
3084 set_page_private(page, EXTENT_PAGE_PRIVATE);
3085 }
d1310b2e
CM
3086}
3087
125bac01
MX
3088static struct extent_map *
3089__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3090 u64 start, u64 len, get_extent_t *get_extent,
3091 struct extent_map **em_cached)
3092{
3093 struct extent_map *em;
3094
3095 if (em_cached && *em_cached) {
3096 em = *em_cached;
cbc0e928 3097 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 3098 start < extent_map_end(em)) {
490b54d6 3099 refcount_inc(&em->refs);
125bac01
MX
3100 return em;
3101 }
3102
3103 free_extent_map(em);
3104 *em_cached = NULL;
3105 }
3106
39b07b5d 3107 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
125bac01
MX
3108 if (em_cached && !IS_ERR_OR_NULL(em)) {
3109 BUG_ON(*em_cached);
490b54d6 3110 refcount_inc(&em->refs);
125bac01
MX
3111 *em_cached = em;
3112 }
3113 return em;
3114}
d1310b2e
CM
3115/*
3116 * basic readpage implementation. Locked extent state structs are inserted
 3117 * into the tree and removed when the IO is done (by the end_io
3118 * handlers)
79787eaa 3119 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 3120 * return 0 on success, otherwise return error
d1310b2e 3121 */
f657a31c 3122static int __do_readpage(struct page *page,
9974090b 3123 get_extent_t *get_extent,
125bac01 3124 struct extent_map **em_cached,
9974090b 3125 struct bio **bio, int mirror_num,
f1c77c55 3126 unsigned long *bio_flags, unsigned int read_flags,
005efedf 3127 u64 *prev_em_start)
d1310b2e
CM
3128{
3129 struct inode *inode = page->mapping->host;
4eee4fa4 3130 u64 start = page_offset(page);
8eec8296 3131 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3132 u64 cur = start;
3133 u64 extent_offset;
3134 u64 last_byte = i_size_read(inode);
3135 u64 block_start;
3136 u64 cur_end;
d1310b2e 3137 struct extent_map *em;
baf863b9 3138 int ret = 0;
d1310b2e 3139 int nr = 0;
306e16ce 3140 size_t pg_offset = 0;
d1310b2e 3141 size_t iosize;
c8b97818 3142 size_t disk_io_size;
d1310b2e 3143 size_t blocksize = inode->i_sb->s_blocksize;
7f042a83 3144 unsigned long this_bio_flag = 0;
f657a31c 3145 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ae6957eb 3146
d1310b2e
CM
3147 set_page_extent_mapped(page);
3148
90a887c9
DM
3149 if (!PageUptodate(page)) {
3150 if (cleancache_get_page(page) == 0) {
3151 BUG_ON(blocksize != PAGE_SIZE);
9974090b 3152 unlock_extent(tree, start, end);
90a887c9
DM
3153 goto out;
3154 }
3155 }
3156
09cbfeaf 3157 if (page->index == last_byte >> PAGE_SHIFT) {
c8b97818 3158 char *userpage;
7073017a 3159 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
3160
3161 if (zero_offset) {
09cbfeaf 3162 iosize = PAGE_SIZE - zero_offset;
7ac687d9 3163 userpage = kmap_atomic(page);
c8b97818
CM
3164 memset(userpage + zero_offset, 0, iosize);
3165 flush_dcache_page(page);
7ac687d9 3166 kunmap_atomic(userpage);
c8b97818
CM
3167 }
3168 }
d1310b2e 3169 while (cur <= end) {
005efedf 3170 bool force_bio_submit = false;
6273b7f8 3171 u64 offset;
c8f2f24b 3172
d1310b2e
CM
3173 if (cur >= last_byte) {
3174 char *userpage;
507903b8
AJ
3175 struct extent_state *cached = NULL;
3176
09cbfeaf 3177 iosize = PAGE_SIZE - pg_offset;
7ac687d9 3178 userpage = kmap_atomic(page);
306e16ce 3179 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3180 flush_dcache_page(page);
7ac687d9 3181 kunmap_atomic(userpage);
d1310b2e 3182 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3183 &cached, GFP_NOFS);
7f042a83 3184 unlock_extent_cached(tree, cur,
e43bbe5e 3185 cur + iosize - 1, &cached);
d1310b2e
CM
3186 break;
3187 }
125bac01
MX
3188 em = __get_extent_map(inode, page, pg_offset, cur,
3189 end - cur + 1, get_extent, em_cached);
c704005d 3190 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3191 SetPageError(page);
7f042a83 3192 unlock_extent(tree, cur, end);
d1310b2e
CM
3193 break;
3194 }
d1310b2e
CM
3195 extent_offset = cur - em->start;
3196 BUG_ON(extent_map_end(em) <= cur);
3197 BUG_ON(end < cur);
3198
261507a0 3199 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4b384318 3200 this_bio_flag |= EXTENT_BIO_COMPRESSED;
261507a0
LZ
3201 extent_set_compress_type(&this_bio_flag,
3202 em->compress_type);
3203 }
c8b97818 3204
d1310b2e
CM
3205 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3206 cur_end = min(extent_map_end(em) - 1, end);
fda2832f 3207 iosize = ALIGN(iosize, blocksize);
c8b97818
CM
3208 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
3209 disk_io_size = em->block_len;
6273b7f8 3210 offset = em->block_start;
c8b97818 3211 } else {
6273b7f8 3212 offset = em->block_start + extent_offset;
c8b97818
CM
3213 disk_io_size = iosize;
3214 }
d1310b2e 3215 block_start = em->block_start;
d899e052
YZ
3216 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3217 block_start = EXTENT_MAP_HOLE;
005efedf
FM
3218
3219 /*
3220 * If we have a file range that points to a compressed extent
3221 * and it's followed by a consecutive file range that points to
 3222 * the same compressed extent (possibly with a different
3223 * offset and/or length, so it either points to the whole extent
3224 * or only part of it), we must make sure we do not submit a
3225 * single bio to populate the pages for the 2 ranges because
3226 * this makes the compressed extent read zero out the pages
3227 * belonging to the 2nd range. Imagine the following scenario:
3228 *
3229 * File layout
3230 * [0 - 8K] [8K - 24K]
3231 * | |
3232 * | |
3233 * points to extent X, points to extent X,
3234 * offset 4K, length of 8K offset 0, length 16K
3235 *
3236 * [extent X, compressed length = 4K uncompressed length = 16K]
3237 *
3238 * If the bio to read the compressed extent covers both ranges,
3239 * it will decompress extent X into the pages belonging to the
3240 * first range and then it will stop, zeroing out the remaining
3241 * pages that belong to the other range that points to extent X.
3242 * So here we make sure we submit 2 bios, one for the first
3243 * range and another one for the third range. Both will target
3244 * the same physical extent from disk, but we can't currently
3245 * make the compressed bio endio callback populate the pages
3246 * for both ranges because each compressed bio is tightly
3247 * coupled with a single extent map, and each range can have
3248 * an extent map with a different offset value relative to the
3249 * uncompressed data of our extent and different lengths. This
3250 * is a corner case so we prioritize correctness over
3251 * non-optimal behavior (submitting 2 bios for the same extent).
3252 */
3253 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3254 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 3255 *prev_em_start != em->start)
005efedf
FM
3256 force_bio_submit = true;
3257
3258 if (prev_em_start)
8e928218 3259 *prev_em_start = em->start;
005efedf 3260
d1310b2e
CM
3261 free_extent_map(em);
3262 em = NULL;
3263
3264 /* we've found a hole, just zero and go on */
3265 if (block_start == EXTENT_MAP_HOLE) {
3266 char *userpage;
507903b8
AJ
3267 struct extent_state *cached = NULL;
3268
7ac687d9 3269 userpage = kmap_atomic(page);
306e16ce 3270 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3271 flush_dcache_page(page);
7ac687d9 3272 kunmap_atomic(userpage);
d1310b2e
CM
3273
3274 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3275 &cached, GFP_NOFS);
7f042a83 3276 unlock_extent_cached(tree, cur,
e43bbe5e 3277 cur + iosize - 1, &cached);
d1310b2e 3278 cur = cur + iosize;
306e16ce 3279 pg_offset += iosize;
d1310b2e
CM
3280 continue;
3281 }
3282 /* the get_extent function already copied into the page */
9655d298
CM
3283 if (test_range_bit(tree, cur, cur_end,
3284 EXTENT_UPTODATE, 1, NULL)) {
a1b32a59 3285 check_page_uptodate(tree, page);
7f042a83 3286 unlock_extent(tree, cur, cur + iosize - 1);
d1310b2e 3287 cur = cur + iosize;
306e16ce 3288 pg_offset += iosize;
d1310b2e
CM
3289 continue;
3290 }
70dec807
CM
3291 /* we have an inline extent but it didn't get marked up
3292 * to date. Error out
3293 */
3294 if (block_start == EXTENT_MAP_INLINE) {
3295 SetPageError(page);
7f042a83 3296 unlock_extent(tree, cur, cur + iosize - 1);
70dec807 3297 cur = cur + iosize;
306e16ce 3298 pg_offset += iosize;
70dec807
CM
3299 continue;
3300 }
d1310b2e 3301
0ceb34bf 3302 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
6273b7f8 3303 page, offset, disk_io_size,
fa17ed06 3304 pg_offset, bio,
c8b97818
CM
3305 end_bio_extent_readpage, mirror_num,
3306 *bio_flags,
005efedf
FM
3307 this_bio_flag,
3308 force_bio_submit);
c8f2f24b
JB
3309 if (!ret) {
3310 nr++;
3311 *bio_flags = this_bio_flag;
3312 } else {
d1310b2e 3313 SetPageError(page);
7f042a83 3314 unlock_extent(tree, cur, cur + iosize - 1);
baf863b9 3315 goto out;
edd33c99 3316 }
d1310b2e 3317 cur = cur + iosize;
306e16ce 3318 pg_offset += iosize;
d1310b2e 3319 }
90a887c9 3320out:
d1310b2e
CM
3321 if (!nr) {
3322 if (!PageError(page))
3323 SetPageUptodate(page);
3324 unlock_page(page);
3325 }
baf863b9 3326 return ret;
d1310b2e
CM
3327}
3328
b6660e80 3329static inline void contiguous_readpages(struct page *pages[], int nr_pages,
9974090b 3330 u64 start, u64 end,
125bac01 3331 struct extent_map **em_cached,
d3fac6ba 3332 struct bio **bio,
1f7ad75b 3333 unsigned long *bio_flags,
808f80b4 3334 u64 *prev_em_start)
9974090b 3335{
23d31bd4 3336 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
9974090b
MX
3337 int index;
3338
b272ae22 3339 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
3340
3341 for (index = 0; index < nr_pages; index++) {
f657a31c 3342 __do_readpage(pages[index], btrfs_get_extent, em_cached,
5e9d3982 3343 bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
09cbfeaf 3344 put_page(pages[index]);
9974090b
MX
3345 }
3346}
3347
0d44fea7 3348static int __extent_read_full_page(struct page *page,
9974090b
MX
3349 get_extent_t *get_extent,
3350 struct bio **bio, int mirror_num,
f1c77c55
DS
3351 unsigned long *bio_flags,
3352 unsigned int read_flags)
9974090b 3353{
23d31bd4 3354 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
9974090b 3355 u64 start = page_offset(page);
09cbfeaf 3356 u64 end = start + PAGE_SIZE - 1;
9974090b
MX
3357 int ret;
3358
b272ae22 3359 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b 3360
f657a31c 3361 ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
1f7ad75b 3362 bio_flags, read_flags, NULL);
9974090b
MX
3363 return ret;
3364}
3365
71ad38b4
DS
3366int extent_read_full_page(struct page *page, get_extent_t *get_extent,
3367 int mirror_num)
d1310b2e
CM
3368{
3369 struct bio *bio = NULL;
c8b97818 3370 unsigned long bio_flags = 0;
d1310b2e
CM
3371 int ret;
3372
0d44fea7 3373 ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
1f7ad75b 3374 &bio_flags, 0);
d1310b2e 3375 if (bio)
1f7ad75b 3376 ret = submit_one_bio(bio, mirror_num, bio_flags);
d1310b2e
CM
3377 return ret;
3378}
d1310b2e 3379
3d4b9496 3380static void update_nr_written(struct writeback_control *wbc,
a9132667 3381 unsigned long nr_written)
11c8349b
CM
3382{
3383 wbc->nr_to_write -= nr_written;
11c8349b
CM
3384}
3385
d1310b2e 3386/*
40f76580
CM
3387 * helper for __extent_writepage, doing all of the delayed allocation setup.
3388 *
5eaad97a 3389 * This returns 1 if the btrfs_run_delalloc_range function did all the work required
40f76580
CM
3390 * to write the page (copy into inline extent). In this case the IO has
3391 * been started and the page is already unlocked.
3392 *
3393 * This returns 0 if all went well (page still locked)
3394 * This returns < 0 if there were errors (page still locked)
d1310b2e 3395 */
40f76580 3396static noinline_for_stack int writepage_delalloc(struct inode *inode,
8cc0237a
NB
3397 struct page *page, struct writeback_control *wbc,
3398 u64 delalloc_start, unsigned long *nr_written)
40f76580 3399{
09cbfeaf 3400 u64 page_end = delalloc_start + PAGE_SIZE - 1;
3522e903 3401 bool found;
40f76580
CM
3402 u64 delalloc_to_write = 0;
3403 u64 delalloc_end = 0;
3404 int ret;
3405 int page_started = 0;
3406
40f76580
CM
3407
3408 while (delalloc_end < page_end) {
9978059b 3409 found = find_lock_delalloc_range(inode, page,
40f76580 3410 &delalloc_start,
917aacec 3411 &delalloc_end);
3522e903 3412 if (!found) {
40f76580
CM
3413 delalloc_start = delalloc_end + 1;
3414 continue;
3415 }
5eaad97a
NB
3416 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3417 delalloc_end, &page_started, nr_written, wbc);
40f76580
CM
3418 if (ret) {
3419 SetPageError(page);
5eaad97a
NB
3420 /*
 3421 * btrfs_run_delalloc_range should only return < 0 on error, but
 3422 * just in case it returns something else: callers treat a return
 3423 * value > 0 from this function as "IO started", so don't pass a
 3424 * spurious positive value through unless things really went well.
40f76580
CM
3425 */
3426 ret = ret < 0 ? ret : -EIO;
3427 goto done;
3428 }
3429 /*
ea1754a0
KS
3430 * delalloc_end is already one less than the total length, so
3431 * we don't subtract one from PAGE_SIZE
40f76580
CM
3432 */
3433 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 3434 PAGE_SIZE) >> PAGE_SHIFT;
40f76580
CM
3435 delalloc_start = delalloc_end + 1;
3436 }
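	/*
	 * If the delalloc we just kicked off covers more pages than the
	 * caller asked us to write, raise nr_to_write (capped at 8192
	 * pages unless the range is smaller than twice that, in which
	 * case the whole range is allowed) so this writeback pass can
	 * cover the delalloc range we started.
	 */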
3437 if (wbc->nr_to_write < delalloc_to_write) {
3438 int thresh = 8192;
3439
3440 if (delalloc_to_write < thresh * 2)
3441 thresh = delalloc_to_write;
3442 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3443 thresh);
3444 }
3445
 3446 /* did btrfs_run_delalloc_range already unlock the page and
 3447 * start the IO?
3448 */
3449 if (page_started) {
3450 /*
3451 * we've unlocked the page, so we can't update
3452 * the mapping's writeback index, just update
3453 * nr_to_write.
3454 */
3455 wbc->nr_to_write -= *nr_written;
3456 return 1;
3457 }
3458
3459 ret = 0;
3460
3461done:
3462 return ret;
3463}
3464
3465/*
3466 * helper for __extent_writepage. This calls the writepage start hooks,
3467 * and does the loop to map the page into extents and bios.
3468 *
3469 * We return 1 if the IO is started and the page is unlocked,
3470 * 0 if all went well (page still locked)
3471 * < 0 if there were errors (page still locked)
3472 */
3473static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3474 struct page *page,
3475 struct writeback_control *wbc,
3476 struct extent_page_data *epd,
3477 loff_t i_size,
3478 unsigned long nr_written,
57e5ffeb 3479 int *nr_ret)
d1310b2e 3480{
45b08405 3481 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
4eee4fa4 3482 u64 start = page_offset(page);
09cbfeaf 3483 u64 page_end = start + PAGE_SIZE - 1;
d1310b2e
CM
3484 u64 end;
3485 u64 cur = start;
3486 u64 extent_offset;
d1310b2e
CM
3487 u64 block_start;
3488 u64 iosize;
d1310b2e 3489 struct extent_map *em;
7f3c74fb 3490 size_t pg_offset = 0;
d1310b2e 3491 size_t blocksize;
40f76580
CM
3492 int ret = 0;
3493 int nr = 0;
57e5ffeb 3494 const unsigned int write_flags = wbc_to_write_flags(wbc);
40f76580 3495 bool compressed;
c8b97818 3496
d75855b4
NB
3497 ret = btrfs_writepage_cow_fixup(page, start, page_end);
3498 if (ret) {
3499 /* Fixup worker will requeue */
5ab58055 3500 redirty_page_for_writepage(wbc, page);
d75855b4
NB
3501 update_nr_written(wbc, nr_written);
3502 unlock_page(page);
3503 return 1;
247e743c
CM
3504 }
3505
11c8349b
CM
3506 /*
3507 * we don't want to touch the inode after unlocking the page,
3508 * so we update the mapping writeback index now
3509 */
3d4b9496 3510 update_nr_written(wbc, nr_written + 1);
771ed689 3511
d1310b2e 3512 end = page_end;
d1310b2e
CM
3513 blocksize = inode->i_sb->s_blocksize;
3514
3515 while (cur <= end) {
40f76580 3516 u64 em_end;
6273b7f8 3517 u64 offset;
58409edd 3518
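		/*
		 * Everything from here to the end of the page is past
		 * i_size, so there is nothing left to write; finish the
		 * ordered IO accounting for the remainder and stop.
		 */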
40f76580 3519 if (cur >= i_size) {
7087a9d8 3520 btrfs_writepage_endio_finish_ordered(page, cur,
c629732d 3521 page_end, 1);
d1310b2e
CM
3522 break;
3523 }
39b07b5d
OS
3524 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
3525 end - cur + 1);
c704005d 3526 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3527 SetPageError(page);
61391d56 3528 ret = PTR_ERR_OR_ZERO(em);
d1310b2e
CM
3529 break;
3530 }
3531
3532 extent_offset = cur - em->start;
40f76580
CM
3533 em_end = extent_map_end(em);
3534 BUG_ON(em_end <= cur);
d1310b2e 3535 BUG_ON(end < cur);
40f76580 3536 iosize = min(em_end - cur, end - cur + 1);
fda2832f 3537 iosize = ALIGN(iosize, blocksize);
6273b7f8 3538 offset = em->block_start + extent_offset;
d1310b2e 3539 block_start = em->block_start;
c8b97818 3540 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
d1310b2e
CM
3541 free_extent_map(em);
3542 em = NULL;
3543
c8b97818
CM
3544 /*
3545 * compressed and inline extents are written through other
3546 * paths in the FS
3547 */
3548 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 3549 block_start == EXTENT_MAP_INLINE) {
c8b04030 3550 if (compressed)
c8b97818 3551 nr++;
c8b04030
OS
3552 else
3553 btrfs_writepage_endio_finish_ordered(page, cur,
3554 cur + iosize - 1, 1);
c8b97818 3555 cur += iosize;
7f3c74fb 3556 pg_offset += iosize;
d1310b2e
CM
3557 continue;
3558 }
c8b97818 3559
5cdc84bf 3560 btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
58409edd
DS
3561 if (!PageWriteback(page)) {
3562 btrfs_err(BTRFS_I(inode)->root->fs_info,
3563 "page %lu not writeback, cur %llu end %llu",
3564 page->index, cur, end);
d1310b2e 3565 }
7f3c74fb 3566
0ceb34bf 3567 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
6273b7f8 3568 page, offset, iosize, pg_offset,
fa17ed06 3569 &epd->bio,
58409edd
DS
3570 end_bio_extent_writepage,
3571 0, 0, 0, false);
fe01aa65 3572 if (ret) {
58409edd 3573 SetPageError(page);
fe01aa65
TK
3574 if (PageWriteback(page))
3575 end_page_writeback(page);
3576 }
d1310b2e 3577
d1310b2e 3578 cur = cur + iosize;
7f3c74fb 3579 pg_offset += iosize;
d1310b2e
CM
3580 nr++;
3581 }
40f76580 3582 *nr_ret = nr;
40f76580
CM
3583 return ret;
3584}
3585
3586/*
 3587 * The writepage semantics are similar to regular writepage. Extent
 3588 * records are inserted to lock ranges in the tree, and as dirty areas
 3589 * are found, they are marked writeback. Then the lock bits are removed
 3590 * and the end_io handler clears the writeback ranges.
3065976b
QW
3591 *
3592 * Return 0 if everything goes well.
3593 * Return <0 for error.
40f76580
CM
3594 */
3595static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 3596 struct extent_page_data *epd)
40f76580
CM
3597{
3598 struct inode *inode = page->mapping->host;
40f76580 3599 u64 start = page_offset(page);
09cbfeaf 3600 u64 page_end = start + PAGE_SIZE - 1;
40f76580
CM
3601 int ret;
3602 int nr = 0;
eb70d222 3603 size_t pg_offset;
40f76580 3604 loff_t i_size = i_size_read(inode);
09cbfeaf 3605 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580
CM
3606 unsigned long nr_written = 0;
3607
40f76580
CM
3608 trace___extent_writepage(page, inode, wbc);
3609
3610 WARN_ON(!PageLocked(page));
3611
3612 ClearPageError(page);
3613
7073017a 3614 pg_offset = offset_in_page(i_size);
40f76580
CM
3615 if (page->index > end_index ||
3616 (page->index == end_index && !pg_offset)) {
09cbfeaf 3617 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
40f76580
CM
3618 unlock_page(page);
3619 return 0;
3620 }
3621
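	/*
	 * This is the last, partially covered page of the file: zero the
	 * part of the page beyond i_size so no stale data makes it to disk
	 * when the full page is written out.
	 */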
3622 if (page->index == end_index) {
3623 char *userpage;
3624
3625 userpage = kmap_atomic(page);
3626 memset(userpage + pg_offset, 0,
09cbfeaf 3627 PAGE_SIZE - pg_offset);
40f76580
CM
3628 kunmap_atomic(userpage);
3629 flush_dcache_page(page);
3630 }
3631
40f76580
CM
3632 set_page_extent_mapped(page);
3633
7789a55a 3634 if (!epd->extent_locked) {
8cc0237a 3635 ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
7789a55a 3636 if (ret == 1)
169d2c87 3637 return 0;
7789a55a
NB
3638 if (ret)
3639 goto done;
3640 }
40f76580
CM
3641
3642 ret = __extent_writepage_io(inode, page, wbc, epd,
57e5ffeb 3643 i_size, nr_written, &nr);
40f76580 3644 if (ret == 1)
169d2c87 3645 return 0;
40f76580 3646
d1310b2e
CM
3647done:
3648 if (nr == 0) {
3649 /* make sure the mapping tag for page dirty gets cleared */
3650 set_page_writeback(page);
3651 end_page_writeback(page);
3652 }
61391d56
FM
3653 if (PageError(page)) {
3654 ret = ret < 0 ? ret : -EIO;
3655 end_extent_writepage(page, ret, start, page_end);
3656 }
d1310b2e 3657 unlock_page(page);
3065976b 3658 ASSERT(ret <= 0);
40f76580 3659 return ret;
d1310b2e
CM
3660}
3661
fd8b2b61 3662void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 3663{
74316201
N
3664 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3665 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
3666}
3667
18dfa711
FM
3668static void end_extent_buffer_writeback(struct extent_buffer *eb)
3669{
3670 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3671 smp_mb__after_atomic();
3672 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3673}
3674
2e3c2513
QW
3675/*
 3676 * Lock eb pages and flush the bio if we can't get the locks
3677 *
3678 * Return 0 if nothing went wrong
 3679 * Return >0 is the same as 0, except the bio is not submitted
3680 * Return <0 if something went wrong, no page is locked
3681 */
9df76fb5 3682static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 3683 struct extent_page_data *epd)
0b32f4bb 3684{
9df76fb5 3685 struct btrfs_fs_info *fs_info = eb->fs_info;
2e3c2513 3686 int i, num_pages, failed_page_nr;
0b32f4bb
JB
3687 int flush = 0;
3688 int ret = 0;
3689
3690 if (!btrfs_try_tree_write_lock(eb)) {
f4340622 3691 ret = flush_write_bio(epd);
2e3c2513
QW
3692 if (ret < 0)
3693 return ret;
3694 flush = 1;
0b32f4bb
JB
3695 btrfs_tree_lock(eb);
3696 }
3697
3698 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3699 btrfs_tree_unlock(eb);
3700 if (!epd->sync_io)
3701 return 0;
3702 if (!flush) {
f4340622 3703 ret = flush_write_bio(epd);
2e3c2513
QW
3704 if (ret < 0)
3705 return ret;
0b32f4bb
JB
3706 flush = 1;
3707 }
a098d8e8
CM
3708 while (1) {
3709 wait_on_extent_buffer_writeback(eb);
3710 btrfs_tree_lock(eb);
3711 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3712 break;
0b32f4bb 3713 btrfs_tree_unlock(eb);
0b32f4bb
JB
3714 }
3715 }
3716
51561ffe
JB
3717 /*
 3718 * We need to do this to prevent races with anyone who checks if the eb is
3719 * under IO since we can end up having no IO bits set for a short period
3720 * of time.
3721 */
3722 spin_lock(&eb->refs_lock);
0b32f4bb
JB
3723 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3724 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 3725 spin_unlock(&eb->refs_lock);
0b32f4bb 3726 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
3727 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3728 -eb->len,
3729 fs_info->dirty_metadata_batch);
0b32f4bb 3730 ret = 1;
51561ffe
JB
3731 } else {
3732 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
3733 }
3734
3735 btrfs_tree_unlock(eb);
3736
3737 if (!ret)
3738 return ret;
3739
65ad0104 3740 num_pages = num_extent_pages(eb);
0b32f4bb 3741 for (i = 0; i < num_pages; i++) {
fb85fc9a 3742 struct page *p = eb->pages[i];
0b32f4bb
JB
3743
3744 if (!trylock_page(p)) {
3745 if (!flush) {
18dfa711
FM
3746 int err;
3747
3748 err = flush_write_bio(epd);
3749 if (err < 0) {
3750 ret = err;
2e3c2513
QW
3751 failed_page_nr = i;
3752 goto err_unlock;
3753 }
0b32f4bb
JB
3754 flush = 1;
3755 }
3756 lock_page(p);
3757 }
3758 }
3759
3760 return ret;
2e3c2513
QW
3761err_unlock:
3762 /* Unlock already locked pages */
3763 for (i = 0; i < failed_page_nr; i++)
3764 unlock_page(eb->pages[i]);
18dfa711
FM
3765 /*
3766 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
 3767 * Also set back EXTENT_BUFFER_DIRTY so future write attempts on this
 3768 * eb can be made, and undo everything done before.
3769 */
3770 btrfs_tree_lock(eb);
3771 spin_lock(&eb->refs_lock);
3772 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3773 end_extent_buffer_writeback(eb);
3774 spin_unlock(&eb->refs_lock);
3775 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
3776 fs_info->dirty_metadata_batch);
3777 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3778 btrfs_tree_unlock(eb);
2e3c2513 3779 return ret;
0b32f4bb
JB
3780}
3781
656f30db
FM
3782static void set_btree_ioerr(struct page *page)
3783{
3784 struct extent_buffer *eb = (struct extent_buffer *)page->private;
eb5b64f1 3785 struct btrfs_fs_info *fs_info;
656f30db
FM
3786
3787 SetPageError(page);
3788 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3789 return;
3790
eb5b64f1
DZ
3791 /*
3792 * If we error out, we should add back the dirty_metadata_bytes
3793 * to make it consistent.
3794 */
3795 fs_info = eb->fs_info;
3796 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3797 eb->len, fs_info->dirty_metadata_batch);
3798
656f30db
FM
3799 /*
3800 * If writeback for a btree extent that doesn't belong to a log tree
3801 * failed, increment the counter transaction->eb_write_errors.
3802 * We do this because while the transaction is running and before it's
3803 * committing (when we call filemap_fdata[write|wait]_range against
3804 * the btree inode), we might have
3805 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3806 * returns an error or an error happens during writeback, when we're
3807 * committing the transaction we wouldn't know about it, since the pages
 3808 * may no longer be dirty nor marked for writeback (if a
3809 * subsequent modification to the extent buffer didn't happen before the
3810 * transaction commit), which makes filemap_fdata[write|wait]_range not
3811 * able to find the pages tagged with SetPageError at transaction
3812 * commit time. So if this happens we must abort the transaction,
3813 * otherwise we commit a super block with btree roots that point to
3814 * btree nodes/leafs whose content on disk is invalid - either garbage
3815 * or the content of some node/leaf from a past generation that got
3816 * cowed or deleted and is no longer valid.
3817 *
3818 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3819 * not be enough - we need to distinguish between log tree extents vs
3820 * non-log tree extents, and the next filemap_fdatawait_range() call
3821 * will catch and clear such errors in the mapping - and that call might
3822 * be from a log sync and not from a transaction commit. Also, checking
3823 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3824 * not done and would not be reliable - the eb might have been released
3825 * from memory and reading it back again means that flag would not be
3826 * set (since it's a runtime flag, not persisted on disk).
3827 *
 3828 * Using the flags below in the btree inode also covers the case where
 3829 * writepages() returns success after having started writeback for all
 3830 * dirty pages, but that writeback finishes with errors before
 3831 * filemap_fdatawait_range() is called. Because we were not using
 3832 * AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would then return
 3833 * success, as it has no way of knowing that writeback errors
 3834 * happened (the pages were no longer tagged for writeback at that
 3835 * point).
3836 */
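	/*
	 * eb->log_index is -1 for blocks that do not belong to a log tree;
	 * 0 and 1 select which of the two log error flags (LOG1/LOG2) is
	 * set, matching the log transaction the block was written for, so
	 * a later transaction commit or log sync can detect the failure.
	 */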
3837 switch (eb->log_index) {
3838 case -1:
afcdd129 3839 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
656f30db
FM
3840 break;
3841 case 0:
afcdd129 3842 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
656f30db
FM
3843 break;
3844 case 1:
afcdd129 3845 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
656f30db
FM
3846 break;
3847 default:
3848 BUG(); /* unexpected, logic error */
3849 }
3850}
3851
4246a0b6 3852static void end_bio_extent_buffer_writepage(struct bio *bio)
0b32f4bb 3853{
2c30c71b 3854 struct bio_vec *bvec;
0b32f4bb 3855 struct extent_buffer *eb;
2b070cfe 3856 int done;
6dc4f100 3857 struct bvec_iter_all iter_all;
0b32f4bb 3858
c09abff8 3859 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 3860 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
3861 struct page *page = bvec->bv_page;
3862
0b32f4bb
JB
3863 eb = (struct extent_buffer *)page->private;
3864 BUG_ON(!eb);
3865 done = atomic_dec_and_test(&eb->io_pages);
3866
4e4cbee9 3867 if (bio->bi_status ||
4246a0b6 3868 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 3869 ClearPageUptodate(page);
656f30db 3870 set_btree_ioerr(page);
0b32f4bb
JB
3871 }
3872
3873 end_page_writeback(page);
3874
3875 if (!done)
3876 continue;
3877
3878 end_extent_buffer_writeback(eb);
2c30c71b 3879 }
0b32f4bb
JB
3880
3881 bio_put(bio);
0b32f4bb
JB
3882}
3883
0e378df1 3884static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
3885 struct writeback_control *wbc,
3886 struct extent_page_data *epd)
3887{
0b32f4bb 3888 u64 offset = eb->start;
851cd173 3889 u32 nritems;
cc5e31a4 3890 int i, num_pages;
851cd173 3891 unsigned long start, end;
ff40adf7 3892 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
d7dbe9e7 3893 int ret = 0;
0b32f4bb 3894
656f30db 3895 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
65ad0104 3896 num_pages = num_extent_pages(eb);
0b32f4bb 3897 atomic_set(&eb->io_pages, num_pages);
de0022b9 3898
851cd173
LB
3899 /* set btree blocks beyond nritems with 0 to avoid stale content. */
3900 nritems = btrfs_header_nritems(eb);
3eb548ee 3901 if (btrfs_header_level(eb) > 0) {
3eb548ee
LB
3902 end = btrfs_node_key_ptr_offset(nritems);
3903
b159fa28 3904 memzero_extent_buffer(eb, end, eb->len - end);
851cd173
LB
3905 } else {
3906 /*
3907 * leaf:
3908 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3909 */
3910 start = btrfs_item_nr_offset(nritems);
8f881e8c 3911 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
b159fa28 3912 memzero_extent_buffer(eb, start, end - start);
3eb548ee
LB
3913 }
3914
0b32f4bb 3915 for (i = 0; i < num_pages; i++) {
fb85fc9a 3916 struct page *p = eb->pages[i];
0b32f4bb
JB
3917
3918 clear_page_dirty_for_io(p);
3919 set_page_writeback(p);
0ceb34bf 3920 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
fa17ed06 3921 p, offset, PAGE_SIZE, 0,
c2df8bb4 3922 &epd->bio,
1f7ad75b 3923 end_bio_extent_buffer_writepage,
18fdc679 3924 0, 0, 0, false);
0b32f4bb 3925 if (ret) {
656f30db 3926 set_btree_ioerr(p);
fe01aa65
TK
3927 if (PageWriteback(p))
3928 end_page_writeback(p);
0b32f4bb
JB
3929 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3930 end_extent_buffer_writeback(eb);
3931 ret = -EIO;
3932 break;
3933 }
09cbfeaf 3934 offset += PAGE_SIZE;
3d4b9496 3935 update_nr_written(wbc, 1);
0b32f4bb
JB
3936 unlock_page(p);
3937 }
3938
3939 if (unlikely(ret)) {
3940 for (; i < num_pages; i++) {
bbf65cf0 3941 struct page *p = eb->pages[i];
81465028 3942 clear_page_dirty_for_io(p);
0b32f4bb
JB
3943 unlock_page(p);
3944 }
3945 }
3946
3947 return ret;
3948}
3949
3950int btree_write_cache_pages(struct address_space *mapping,
3951 struct writeback_control *wbc)
3952{
0b32f4bb
JB
3953 struct extent_buffer *eb, *prev_eb = NULL;
3954 struct extent_page_data epd = {
3955 .bio = NULL,
0b32f4bb
JB
3956 .extent_locked = 0,
3957 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3958 };
b3ff8f1d 3959 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
0b32f4bb
JB
3960 int ret = 0;
3961 int done = 0;
3962 int nr_to_write_done = 0;
3963 struct pagevec pvec;
3964 int nr_pages;
3965 pgoff_t index;
3966 pgoff_t end; /* Inclusive */
3967 int scanned = 0;
10bbd235 3968 xa_mark_t tag;
0b32f4bb 3969
86679820 3970 pagevec_init(&pvec);
0b32f4bb
JB
3971 if (wbc->range_cyclic) {
3972 index = mapping->writeback_index; /* Start from prev offset */
3973 end = -1;
556755a8
JB
3974 /*
 3975 * Starting from the beginning does not need to cycle over the
 3976 * range, so mark it as scanned.
3977 */
3978 scanned = (index == 0);
0b32f4bb 3979 } else {
09cbfeaf
KS
3980 index = wbc->range_start >> PAGE_SHIFT;
3981 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
3982 scanned = 1;
3983 }
3984 if (wbc->sync_mode == WB_SYNC_ALL)
3985 tag = PAGECACHE_TAG_TOWRITE;
3986 else
3987 tag = PAGECACHE_TAG_DIRTY;
3988retry:
3989 if (wbc->sync_mode == WB_SYNC_ALL)
3990 tag_pages_for_writeback(mapping, index, end);
3991 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 3992 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 3993 tag))) {
0b32f4bb
JB
3994 unsigned i;
3995
0b32f4bb
JB
3996 for (i = 0; i < nr_pages; i++) {
3997 struct page *page = pvec.pages[i];
3998
3999 if (!PagePrivate(page))
4000 continue;
4001
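			/*
			 * Recheck PagePrivate under private_lock: the eb
			 * could have been released and detached from the
			 * page between the unlocked check above and here.
			 */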
b5bae261
JB
4002 spin_lock(&mapping->private_lock);
4003 if (!PagePrivate(page)) {
4004 spin_unlock(&mapping->private_lock);
4005 continue;
4006 }
4007
0b32f4bb 4008 eb = (struct extent_buffer *)page->private;
b5bae261
JB
4009
4010 /*
4011 * Shouldn't happen and normally this would be a BUG_ON
4012 * but no sense in crashing the users box for something
4013 * we can survive anyway.
4014 */
fae7f21c 4015 if (WARN_ON(!eb)) {
b5bae261 4016 spin_unlock(&mapping->private_lock);
0b32f4bb
JB
4017 continue;
4018 }
4019
b5bae261
JB
4020 if (eb == prev_eb) {
4021 spin_unlock(&mapping->private_lock);
0b32f4bb 4022 continue;
b5bae261 4023 }
0b32f4bb 4024
b5bae261
JB
4025 ret = atomic_inc_not_zero(&eb->refs);
4026 spin_unlock(&mapping->private_lock);
4027 if (!ret)
0b32f4bb 4028 continue;
0b32f4bb
JB
4029
4030 prev_eb = eb;
9df76fb5 4031 ret = lock_extent_buffer_for_io(eb, &epd);
0b32f4bb
JB
4032 if (!ret) {
4033 free_extent_buffer(eb);
4034 continue;
0607eb1d
FM
4035 } else if (ret < 0) {
4036 done = 1;
4037 free_extent_buffer(eb);
4038 break;
0b32f4bb
JB
4039 }
4040
0ab02063 4041 ret = write_one_eb(eb, wbc, &epd);
0b32f4bb
JB
4042 if (ret) {
4043 done = 1;
4044 free_extent_buffer(eb);
4045 break;
4046 }
4047 free_extent_buffer(eb);
4048
4049 /*
4050 * the filesystem may choose to bump up nr_to_write.
4051 * We have to make sure to honor the new nr_to_write
4052 * at any time
4053 */
4054 nr_to_write_done = wbc->nr_to_write <= 0;
4055 }
4056 pagevec_release(&pvec);
4057 cond_resched();
4058 }
4059 if (!scanned && !done) {
4060 /*
4061 * We hit the last page and there is more work to be done: wrap
4062 * back to the start of the file
4063 */
4064 scanned = 1;
4065 index = 0;
4066 goto retry;
4067 }
2b952eea
QW
4068 ASSERT(ret <= 0);
4069 if (ret < 0) {
4070 end_write_bio(&epd, ret);
4071 return ret;
4072 }
b3ff8f1d
QW
4073 /*
4074 * If something went wrong, don't allow any metadata write bio to be
4075 * submitted.
4076 *
4077 * This would prevent use-after-free if we had dirty pages not
 4078 * cleaned up, which can still happen with fuzzed images.
4079 *
4080 * - Bad extent tree
4081 * Allowing existing tree block to be allocated for other trees.
4082 *
4083 * - Log tree operations
 4084 * Existing tree blocks get allocated to the log tree, bump their
 4085 * generation, then get cleaned in tree re-balance.
4086 * Such tree block will not be written back, since it's clean,
4087 * thus no WRITTEN flag set.
4088 * And after log writes back, this tree block is not traced by
4089 * any dirty extent_io_tree.
4090 *
4091 * - Offending tree block gets re-dirtied from its original owner
4092 * Since it has bumped generation, no WRITTEN flag, it can be
4093 * reused without COWing. This tree block will not be traced
4094 * by btrfs_transaction::dirty_pages.
4095 *
4096 * Now such dirty tree block will not be cleaned by any dirty
4097 * extent io tree. Thus we don't want to submit such wild eb
4098 * if the fs already has error.
4099 */
4100 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4101 ret = flush_write_bio(&epd);
4102 } else {
4103 ret = -EUCLEAN;
4104 end_write_bio(&epd, ret);
4105 }
0b32f4bb
JB
4106 return ret;
4107}
4108
d1310b2e 4109/**
4bef0848 4110 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
d1310b2e
CM
4111 * @mapping: address space structure to write
4112 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
935db853 4113 * @epd: extent_page_data holding the state passed down to __extent_writepage
d1310b2e
CM
4114 *
4115 * If a page is already under I/O, write_cache_pages() skips it, even
4116 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4117 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4118 * and msync() need to guarantee that all the data which was dirty at the time
4119 * the call was made get new I/O started against them. If wbc->sync_mode is
4120 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4121 * existing IO to complete.
4122 */
4242b64a 4123static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 4124 struct writeback_control *wbc,
aab6e9ed 4125 struct extent_page_data *epd)
d1310b2e 4126{
7fd1a3f7 4127 struct inode *inode = mapping->host;
d1310b2e
CM
4128 int ret = 0;
4129 int done = 0;
f85d7d6c 4130 int nr_to_write_done = 0;
d1310b2e
CM
4131 struct pagevec pvec;
4132 int nr_pages;
4133 pgoff_t index;
4134 pgoff_t end; /* Inclusive */
a9132667
LB
4135 pgoff_t done_index;
4136 int range_whole = 0;
d1310b2e 4137 int scanned = 0;
10bbd235 4138 xa_mark_t tag;
d1310b2e 4139
7fd1a3f7
JB
4140 /*
4141 * We have to hold onto the inode so that ordered extents can do their
4142 * work when the IO finishes. The alternative to this is failing to add
4143 * an ordered extent if the igrab() fails there and that is a huge pain
4144 * to deal with, so instead just hold onto the inode throughout the
4145 * writepages operation. If it fails here we are freeing up the inode
4146 * anyway and we'd rather not waste our time writing out stuff that is
4147 * going to be truncated anyway.
4148 */
4149 if (!igrab(inode))
4150 return 0;
4151
86679820 4152 pagevec_init(&pvec);
d1310b2e
CM
4153 if (wbc->range_cyclic) {
4154 index = mapping->writeback_index; /* Start from prev offset */
4155 end = -1;
556755a8
JB
4156 /*
 4157 * Starting from the beginning does not need to cycle over the
 4158 * range, so mark it as scanned.
4159 */
4160 scanned = (index == 0);
d1310b2e 4161 } else {
09cbfeaf
KS
4162 index = wbc->range_start >> PAGE_SHIFT;
4163 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
4164 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4165 range_whole = 1;
d1310b2e
CM
4166 scanned = 1;
4167 }
3cd24c69
EL
4168
4169 /*
4170 * We do the tagged writepage as long as the snapshot flush bit is set
4171 * and we are the first one who do the filemap_flush() on this inode.
4172 *
4173 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4174 * not race in and drop the bit.
4175 */
4176 if (range_whole && wbc->nr_to_write == LONG_MAX &&
4177 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4178 &BTRFS_I(inode)->runtime_flags))
4179 wbc->tagged_writepages = 1;
4180
4181 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
4182 tag = PAGECACHE_TAG_TOWRITE;
4183 else
4184 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 4185retry:
3cd24c69 4186 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 4187 tag_pages_for_writeback(mapping, index, end);
a9132667 4188 done_index = index;
f85d7d6c 4189 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
4190 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4191 &index, end, tag))) {
d1310b2e
CM
4192 unsigned i;
4193
d1310b2e
CM
4194 for (i = 0; i < nr_pages; i++) {
4195 struct page *page = pvec.pages[i];
4196
f7bddf1e 4197 done_index = page->index + 1;
d1310b2e 4198 /*
b93b0163
MW
4199 * At this point we hold neither the i_pages lock nor
4200 * the page lock: the page may be truncated or
4201 * invalidated (changing page->mapping to NULL),
4202 * or even swizzled back from swapper_space to
4203 * tmpfs file mapping
d1310b2e 4204 */
c8f2f24b 4205 if (!trylock_page(page)) {
f4340622
QW
4206 ret = flush_write_bio(epd);
4207 BUG_ON(ret < 0);
c8f2f24b 4208 lock_page(page);
01d658f2 4209 }
d1310b2e
CM
4210
4211 if (unlikely(page->mapping != mapping)) {
4212 unlock_page(page);
4213 continue;
4214 }
4215
d2c3f4f6 4216 if (wbc->sync_mode != WB_SYNC_NONE) {
f4340622
QW
4217 if (PageWriteback(page)) {
4218 ret = flush_write_bio(epd);
4219 BUG_ON(ret < 0);
4220 }
d1310b2e 4221 wait_on_page_writeback(page);
d2c3f4f6 4222 }
d1310b2e
CM
4223
4224 if (PageWriteback(page) ||
4225 !clear_page_dirty_for_io(page)) {
4226 unlock_page(page);
4227 continue;
4228 }
4229
aab6e9ed 4230 ret = __extent_writepage(page, wbc, epd);
a9132667 4231 if (ret < 0) {
a9132667
LB
4232 done = 1;
4233 break;
4234 }
f85d7d6c
CM
4235
4236 /*
4237 * the filesystem may choose to bump up nr_to_write.
4238 * We have to make sure to honor the new nr_to_write
4239 * at any time
4240 */
4241 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
4242 }
4243 pagevec_release(&pvec);
4244 cond_resched();
4245 }
894b36e3 4246 if (!scanned && !done) {
d1310b2e
CM
4247 /*
4248 * We hit the last page and there is more work to be done: wrap
4249 * back to the start of the file
4250 */
4251 scanned = 1;
4252 index = 0;
42ffb0bf
JB
4253
4254 /*
4255 * If we're looping we could run into a page that is locked by a
4256 * writer and that writer could be waiting on writeback for a
4257 * page in our current bio, and thus deadlock, so flush the
4258 * write bio here.
4259 */
4260 ret = flush_write_bio(epd);
4261 if (!ret)
4262 goto retry;
d1310b2e 4263 }
a9132667
LB
4264
4265 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4266 mapping->writeback_index = done_index;
4267
7fd1a3f7 4268 btrfs_add_delayed_iput(inode);
894b36e3 4269 return ret;
d1310b2e 4270}
d1310b2e 4271
0a9b0e53 4272int extent_write_full_page(struct page *page, struct writeback_control *wbc)
d1310b2e
CM
4273{
4274 int ret;
d1310b2e
CM
4275 struct extent_page_data epd = {
4276 .bio = NULL,
771ed689 4277 .extent_locked = 0,
ffbd517d 4278 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e 4279 };
d1310b2e 4280
d1310b2e 4281 ret = __extent_writepage(page, wbc, &epd);
3065976b
QW
4282 ASSERT(ret <= 0);
4283 if (ret < 0) {
4284 end_write_bio(&epd, ret);
4285 return ret;
4286 }
d1310b2e 4287
3065976b
QW
4288 ret = flush_write_bio(&epd);
4289 ASSERT(ret <= 0);
d1310b2e
CM
4290 return ret;
4291}
d1310b2e 4292
5e3ee236 4293int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
771ed689
CM
4294 int mode)
4295{
4296 int ret = 0;
4297 struct address_space *mapping = inode->i_mapping;
4298 struct page *page;
09cbfeaf
KS
4299 unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4300 PAGE_SHIFT;
771ed689
CM
4301
4302 struct extent_page_data epd = {
4303 .bio = NULL,
771ed689 4304 .extent_locked = 1,
ffbd517d 4305 .sync_io = mode == WB_SYNC_ALL,
771ed689
CM
4306 };
4307 struct writeback_control wbc_writepages = {
771ed689 4308 .sync_mode = mode,
771ed689
CM
4309 .nr_to_write = nr_pages * 2,
4310 .range_start = start,
4311 .range_end = end + 1,
ec39f769
CM
4312 /* We're called from an async helper function */
4313 .punt_to_cgroup = 1,
4314 .no_cgroup_owner = 1,
771ed689
CM
4315 };
4316
dbb70bec 4317 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
d397712b 4318 while (start <= end) {
09cbfeaf 4319 page = find_get_page(mapping, start >> PAGE_SHIFT);
771ed689
CM
4320 if (clear_page_dirty_for_io(page))
4321 ret = __extent_writepage(page, &wbc_writepages, &epd);
4322 else {
7087a9d8 4323 btrfs_writepage_endio_finish_ordered(page, start,
c629732d 4324 start + PAGE_SIZE - 1, 1);
771ed689
CM
4325 unlock_page(page);
4326 }
09cbfeaf
KS
4327 put_page(page);
4328 start += PAGE_SIZE;
771ed689
CM
4329 }
4330
02c6db4f 4331 ASSERT(ret <= 0);
dbb70bec
CM
4332 if (ret == 0)
4333 ret = flush_write_bio(&epd);
4334 else
02c6db4f 4335 end_write_bio(&epd, ret);
dbb70bec
CM
4336
4337 wbc_detach_inode(&wbc_writepages);
771ed689
CM
4338 return ret;
4339}
d1310b2e 4340
8ae225a8 4341int extent_writepages(struct address_space *mapping,
d1310b2e
CM
4342 struct writeback_control *wbc)
4343{
4344 int ret = 0;
4345 struct extent_page_data epd = {
4346 .bio = NULL,
771ed689 4347 .extent_locked = 0,
ffbd517d 4348 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
4349 };
4350
935db853 4351 ret = extent_write_cache_pages(mapping, wbc, &epd);
a2a72fbd
QW
4352 ASSERT(ret <= 0);
4353 if (ret < 0) {
4354 end_write_bio(&epd, ret);
4355 return ret;
4356 }
4357 ret = flush_write_bio(&epd);
d1310b2e
CM
4358 return ret;
4359}
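/*
 * Illustrative sketch, not part of this file: extent_writepages() is meant
 * to sit behind an address_space_operations ->writepages hook. The wrapper
 * name below is hypothetical:
 *
 *	static int example_writepages(struct address_space *mapping,
 *				      struct writeback_control *wbc)
 *	{
 *		return extent_writepages(mapping, wbc);
 *	}
 */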
d1310b2e 4360
2a3ff0ad
NB
4361int extent_readpages(struct address_space *mapping, struct list_head *pages,
4362 unsigned nr_pages)
d1310b2e
CM
4363{
4364 struct bio *bio = NULL;
c8b97818 4365 unsigned long bio_flags = 0;
67c9684f 4366 struct page *pagepool[16];
125bac01 4367 struct extent_map *em_cached = NULL;
67c9684f 4368 int nr = 0;
808f80b4 4369 u64 prev_em_start = (u64)-1;
d1310b2e 4370
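	/*
	 * Gather batches of up to 16 pages that are contiguous in file
	 * offset; a page that is already present in the page cache ends
	 * the current batch, which is then read as one contiguous range.
	 */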
61ed3a14 4371 while (!list_empty(pages)) {
e65ef21e
NB
4372 u64 contig_end = 0;
4373
61ed3a14 4374 for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
f86196ea 4375 struct page *page = lru_to_page(pages);
d1310b2e 4376
61ed3a14
NB
4377 prefetchw(&page->flags);
4378 list_del(&page->lru);
4379 if (add_to_page_cache_lru(page, mapping, page->index,
4380 readahead_gfp_mask(mapping))) {
4381 put_page(page);
e65ef21e 4382 break;
61ed3a14
NB
4383 }
4384
4385 pagepool[nr++] = page;
e65ef21e 4386 contig_end = page_offset(page) + PAGE_SIZE - 1;
d1310b2e 4387 }
67c9684f 4388
e65ef21e
NB
4389 if (nr) {
4390 u64 contig_start = page_offset(pagepool[0]);
4391
4392 ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
4393
b6660e80 4394 contiguous_readpages(pagepool, nr, contig_start,
e65ef21e
NB
4395 contig_end, &em_cached, &bio, &bio_flags,
4396 &prev_em_start);
4397 }
d1310b2e 4398 }
67c9684f 4399
125bac01
MX
4400 if (em_cached)
4401 free_extent_map(em_cached);
4402
d1310b2e 4403 if (bio)
1f7ad75b 4404 return submit_one_bio(bio, 0, bio_flags);
d1310b2e
CM
4405 return 0;
4406}
d1310b2e
CM
4407
4408/*
 4409 * Basic invalidatepage code: this waits on any locked or writeback
4410 * ranges corresponding to the page, and then deletes any extent state
4411 * records from the tree
4412 */
4413int extent_invalidatepage(struct extent_io_tree *tree,
4414 struct page *page, unsigned long offset)
4415{
2ac55d41 4416 struct extent_state *cached_state = NULL;
4eee4fa4 4417 u64 start = page_offset(page);
09cbfeaf 4418 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
4419 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4420
fda2832f 4421 start += ALIGN(offset, blocksize);
d1310b2e
CM
4422 if (start > end)
4423 return 0;
4424
ff13db41 4425 lock_extent_bits(tree, start, end, &cached_state);
1edbb734 4426 wait_on_page_writeback(page);
e182163d
OS
4427 clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
4428 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
d1310b2e
CM
4429 return 0;
4430}
d1310b2e 4431
7b13b7b1
CM
4432/*
4433 * a helper for releasepage, this tests for areas of the page that
4434 * are locked or under IO and drops the related state bits if it is safe
4435 * to drop the page.
4436 */
29c68b2d 4437static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 4438 struct page *page, gfp_t mask)
7b13b7b1 4439{
4eee4fa4 4440 u64 start = page_offset(page);
09cbfeaf 4441 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
4442 int ret = 1;
4443
8882679e 4444 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 4445 ret = 0;
8882679e 4446 } else {
11ef160f
CM
4447 /*
4448 * at this point we can safely clear everything except the
4449 * locked bit and the nodatasum bit
4450 */
66b0c887 4451 ret = __clear_extent_bit(tree, start, end,
11ef160f 4452 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
66b0c887 4453 0, 0, NULL, mask, NULL);
e3f24cc5
CM
4454
4455 /* if clear_extent_bit failed for enomem reasons,
4456 * we can't allow the release to continue.
4457 */
4458 if (ret < 0)
4459 ret = 0;
4460 else
4461 ret = 1;
7b13b7b1
CM
4462 }
4463 return ret;
4464}
7b13b7b1 4465
d1310b2e
CM
4466/*
4467 * a helper for releasepage. As long as there are no locked extents
4468 * in the range corresponding to the page, both state records and extent
4469 * map records are removed
4470 */
477a30ba 4471int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
4472{
4473 struct extent_map *em;
4eee4fa4 4474 u64 start = page_offset(page);
09cbfeaf 4475 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
4476 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4477 struct extent_io_tree *tree = &btrfs_inode->io_tree;
4478 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 4479
d0164adc 4480 if (gfpflags_allow_blocking(mask) &&
ee22184b 4481 page->mapping->host->i_size > SZ_16M) {
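		/*
		 * Only try to drop cached extent maps when the caller allows
		 * blocking and the file is larger than 16M; for small files
		 * keeping the maps cached costs little.
		 */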
39b5637f 4482 u64 len;
70dec807 4483 while (start <= end) {
39b5637f 4484 len = end - start + 1;
890871be 4485 write_lock(&map->lock);
39b5637f 4486 em = lookup_extent_mapping(map, start, len);
285190d9 4487 if (!em) {
890871be 4488 write_unlock(&map->lock);
70dec807
CM
4489 break;
4490 }
7f3c74fb
CM
4491 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4492 em->start != start) {
890871be 4493 write_unlock(&map->lock);
70dec807
CM
4494 free_extent_map(em);
4495 break;
4496 }
4497 if (!test_range_bit(tree, em->start,
4498 extent_map_end(em) - 1,
4e586ca3 4499 EXTENT_LOCKED, 0, NULL)) {
bd3599a0
FM
4500 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4501 &btrfs_inode->runtime_flags);
70dec807
CM
4502 remove_extent_mapping(map, em);
4503 /* once for the rb tree */
4504 free_extent_map(em);
4505 }
4506 start = extent_map_end(em);
890871be 4507 write_unlock(&map->lock);
70dec807
CM
4508
4509 /* once for us */
d1310b2e
CM
4510 free_extent_map(em);
4511 }
d1310b2e 4512 }
29c68b2d 4513 return try_release_extent_state(tree, page, mask);
d1310b2e 4514}
d1310b2e 4515
ec29ed5b
CM
4516/*
4517 * helper function for fiemap, which doesn't want to see any holes.
4518 * This maps until we find something past 'last'
4519 */
4520static struct extent_map *get_extent_skip_holes(struct inode *inode,
e3350e16 4521 u64 offset, u64 last)
ec29ed5b 4522{
da17066c 4523 u64 sectorsize = btrfs_inode_sectorsize(inode);
ec29ed5b
CM
4524 struct extent_map *em;
4525 u64 len;
4526
4527 if (offset >= last)
4528 return NULL;
4529
67871254 4530 while (1) {
ec29ed5b
CM
4531 len = last - offset;
4532 if (len == 0)
4533 break;
fda2832f 4534 len = ALIGN(len, sectorsize);
4ab47a8d 4535 em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
c704005d 4536 if (IS_ERR_OR_NULL(em))
ec29ed5b
CM
4537 return em;
4538
4539 /* if this isn't a hole return it */
4a2d25cd 4540 if (em->block_start != EXTENT_MAP_HOLE)
ec29ed5b 4541 return em;
ec29ed5b
CM
4542
4543 /* this is a hole, advance to the next extent */
4544 offset = extent_map_end(em);
4545 free_extent_map(em);
4546 if (offset >= last)
4547 break;
4548 }
4549 return NULL;
4550}
4551
4751832d
QW
4552/*
4553 * To cache previous fiemap extent
4554 *
4555 * Will be used for merging fiemap extent
4556 */
4557struct fiemap_cache {
4558 u64 offset;
4559 u64 phys;
4560 u64 len;
4561 u32 flags;
4562 bool cached;
4563};
4564
4565/*
4566 * Helper to submit fiemap extent.
4567 *
4568 * Will try to merge current fiemap extent specified by @offset, @phys,
4569 * @len and @flags with cached one.
 4570 * Only when the merge fails is the cached one submitted as a
 4571 * fiemap extent.
4572 *
4573 * Return value is the same as fiemap_fill_next_extent().
4574 */
4575static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4576 struct fiemap_cache *cache,
4577 u64 offset, u64 phys, u64 len, u32 flags)
4578{
4579 int ret = 0;
4580
4581 if (!cache->cached)
4582 goto assign;
4583
4584 /*
4585 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 4586 * fiemap extent won't overlap with cached one.
4751832d
QW
4587 * Not recoverable.
4588 *
4589 * NOTE: Physical address can overlap, due to compression
4590 */
4591 if (cache->offset + cache->len > offset) {
4592 WARN_ON(1);
4593 return -EINVAL;
4594 }
4595
4596 /*
4597 * Only merges fiemap extents if
4598 * 1) Their logical addresses are continuous
4599 *
4600 * 2) Their physical addresses are continuous
4601 * So truly compressed (physical size smaller than logical size)
4602 * extents won't get merged with each other
4603 *
4604 * 3) Share same flags except FIEMAP_EXTENT_LAST
4605 * So regular extent won't get merged with prealloc extent
4606 */
4607 if (cache->offset + cache->len == offset &&
4608 cache->phys + cache->len == phys &&
4609 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4610 (flags & ~FIEMAP_EXTENT_LAST)) {
4611 cache->len += len;
4612 cache->flags |= flags;
4613 goto try_submit_last;
4614 }
4615
4616 /* Not mergeable, need to submit cached one */
4617 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4618 cache->len, cache->flags);
4619 cache->cached = false;
4620 if (ret)
4621 return ret;
4622assign:
4623 cache->cached = true;
4624 cache->offset = offset;
4625 cache->phys = phys;
4626 cache->len = len;
4627 cache->flags = flags;
4628try_submit_last:
4629 if (cache->flags & FIEMAP_EXTENT_LAST) {
4630 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4631 cache->phys, cache->len, cache->flags);
4632 cache->cached = false;
4633 }
4634 return ret;
4635}
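/*
 * Worked example of the merge rules above (values are illustrative only):
 * with a cached extent (offset 0, phys 1M, len 4K, flags 0), a new extent
 * (offset 4K, phys 1M + 4K, len 4K, flags 0) is logically and physically
 * contiguous with identical flags, so the cache simply grows to
 * (offset 0, phys 1M, len 8K) and nothing is emitted yet. If the new
 * extent instead started at phys 2M, the cached entry would be emitted
 * via fiemap_fill_next_extent() and the new one would become the cache.
 */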
4636
4637/*
848c23b7 4638 * Emit last fiemap cache
4751832d 4639 *
848c23b7
QW
4640 * The last fiemap cache may still be cached in the following case:
4641 * 0 4k 8k
4642 * |<- Fiemap range ->|
4643 * |<------------ First extent ----------->|
4644 *
4645 * In this case, the first extent range will be cached but not emitted.
4646 * So we must emit it before ending extent_fiemap().
4751832d 4647 */
5c5aff98 4648static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 4649 struct fiemap_cache *cache)
4751832d
QW
4650{
4651 int ret;
4652
4653 if (!cache->cached)
4654 return 0;
4655
4751832d
QW
4656 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4657 cache->len, cache->flags);
4658 cache->cached = false;
4659 if (ret > 0)
4660 ret = 0;
4661 return ret;
4662}
4663
1506fcc8 4664int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2135fb9b 4665 __u64 start, __u64 len)
1506fcc8 4666{
975f84fe 4667 int ret = 0;
1506fcc8
YS
4668 u64 off = start;
4669 u64 max = start + len;
4670 u32 flags = 0;
975f84fe
JB
4671 u32 found_type;
4672 u64 last;
ec29ed5b 4673 u64 last_for_get_extent = 0;
1506fcc8 4674 u64 disko = 0;
ec29ed5b 4675 u64 isize = i_size_read(inode);
975f84fe 4676 struct btrfs_key found_key;
1506fcc8 4677 struct extent_map *em = NULL;
2ac55d41 4678 struct extent_state *cached_state = NULL;
975f84fe 4679 struct btrfs_path *path;
dc046b10 4680 struct btrfs_root *root = BTRFS_I(inode)->root;
4751832d 4681 struct fiemap_cache cache = { 0 };
5911c8fe
DS
4682 struct ulist *roots;
4683 struct ulist *tmp_ulist;
1506fcc8 4684 int end = 0;
ec29ed5b
CM
4685 u64 em_start = 0;
4686 u64 em_len = 0;
4687 u64 em_end = 0;
1506fcc8
YS
4688
4689 if (len == 0)
4690 return -EINVAL;
4691
975f84fe
JB
4692 path = btrfs_alloc_path();
4693 if (!path)
4694 return -ENOMEM;
4695 path->leave_spinning = 1;
4696
5911c8fe
DS
4697 roots = ulist_alloc(GFP_KERNEL);
4698 tmp_ulist = ulist_alloc(GFP_KERNEL);
4699 if (!roots || !tmp_ulist) {
4700 ret = -ENOMEM;
4701 goto out_free_ulist;
4702 }
4703
da17066c
JM
4704 start = round_down(start, btrfs_inode_sectorsize(inode));
4705 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4d479cf0 4706
ec29ed5b
CM
4707 /*
4708 * lookup the last file extent. We're not using i_size here
4709 * because there might be preallocation past i_size
4710 */
f85b7379
DS
4711 ret = btrfs_lookup_file_extent(NULL, root, path,
4712 btrfs_ino(BTRFS_I(inode)), -1, 0);
975f84fe 4713 if (ret < 0) {
5911c8fe 4714 goto out_free_ulist;
2d324f59
LB
4715 } else {
4716 WARN_ON(!ret);
4717 if (ret == 1)
4718 ret = 0;
975f84fe 4719 }
2d324f59 4720
975f84fe 4721 path->slots[0]--;
975f84fe 4722 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
962a298f 4723 found_type = found_key.type;
975f84fe 4724
ec29ed5b 4725 /* No extents, but there might be delalloc bits */
4a0cc7ca 4726 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
975f84fe 4727 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
4728 /* have to trust i_size as the end */
4729 last = (u64)-1;
4730 last_for_get_extent = isize;
4731 } else {
4732 /*
4733 * remember the start of the last extent. There are a
4734 * bunch of different factors that go into the length of the
 4735 * extent, so it's much less complex to remember where it started
4736 */
4737 last = found_key.offset;
4738 last_for_get_extent = last + 1;
975f84fe 4739 }
fe09e16c 4740 btrfs_release_path(path);
975f84fe 4741
ec29ed5b
CM
4742 /*
4743 * we might have some extents allocated but more delalloc past those
4744 * extents. so, we trust isize unless the start of the last extent is
4745 * beyond isize
4746 */
4747 if (last < isize) {
4748 last = (u64)-1;
4749 last_for_get_extent = isize;
4750 }
4751
ff13db41 4752 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
d0082371 4753 &cached_state);
ec29ed5b 4754
e3350e16 4755 em = get_extent_skip_holes(inode, start, last_for_get_extent);
1506fcc8
YS
4756 if (!em)
4757 goto out;
4758 if (IS_ERR(em)) {
4759 ret = PTR_ERR(em);
4760 goto out;
4761 }
975f84fe 4762
1506fcc8 4763 while (!end) {
b76bb701 4764 u64 offset_in_extent = 0;
ea8efc74
CM
4765
4766 /* break if the extent we found is outside the range */
4767 if (em->start >= max || extent_map_end(em) < off)
4768 break;
4769
4770 /*
4771 * get_extent may return an extent that starts before our
4772 * requested range. We have to make sure the ranges
4773 * we return to fiemap always move forward and don't
4774 * overlap, so adjust the offsets here
4775 */
4776 em_start = max(em->start, off);
1506fcc8 4777
ea8efc74
CM
4778 /*
4779 * record the offset from the start of the extent
b76bb701
JB
4780 * for adjusting the disk offset below. Only do this if the
 4781 * extent isn't compressed, since our in-RAM offset may be past
4782 * what we have actually allocated on disk.
ea8efc74 4783 */
b76bb701
JB
4784 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4785 offset_in_extent = em_start - em->start;
ec29ed5b 4786 em_end = extent_map_end(em);
ea8efc74 4787 em_len = em_end - em_start;
1506fcc8 4788 flags = 0;
f0986318
FM
4789 if (em->block_start < EXTENT_MAP_LAST_BYTE)
4790 disko = em->block_start + offset_in_extent;
4791 else
4792 disko = 0;
1506fcc8 4793
ea8efc74
CM
4794 /*
4795 * bump off for our next call to get_extent
4796 */
4797 off = extent_map_end(em);
4798 if (off >= max)
4799 end = 1;
4800
93dbfad7 4801 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
4802 end = 1;
4803 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 4804 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
4805 flags |= (FIEMAP_EXTENT_DATA_INLINE |
4806 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 4807 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
4808 flags |= (FIEMAP_EXTENT_DELALLOC |
4809 FIEMAP_EXTENT_UNKNOWN);
dc046b10
JB
4810 } else if (fieinfo->fi_extents_max) {
4811 u64 bytenr = em->block_start -
4812 (em->start - em->orig_start);
fe09e16c 4813
fe09e16c
LB
4814 /*
4815 * As btrfs supports shared space, this information
4816 * can be exported to userspace tools via
dc046b10
JB
4817 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
4818 * then we're just getting a count and we can skip the
4819 * lookup stuff.
fe09e16c 4820 */
bb739cf0
EN
4821 ret = btrfs_check_shared(root,
4822 btrfs_ino(BTRFS_I(inode)),
5911c8fe 4823 bytenr, roots, tmp_ulist);
dc046b10 4824 if (ret < 0)
fe09e16c 4825 goto out_free;
dc046b10 4826 if (ret)
fe09e16c 4827 flags |= FIEMAP_EXTENT_SHARED;
dc046b10 4828 ret = 0;
1506fcc8
YS
4829 }
4830 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4831 flags |= FIEMAP_EXTENT_ENCODED;
0d2b2372
JB
4832 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4833 flags |= FIEMAP_EXTENT_UNWRITTEN;
1506fcc8 4834
1506fcc8
YS
4835 free_extent_map(em);
4836 em = NULL;
ec29ed5b
CM
4837 if ((em_start >= last) || em_len == (u64)-1 ||
4838 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
4839 flags |= FIEMAP_EXTENT_LAST;
4840 end = 1;
4841 }
4842
ec29ed5b 4843 /* now scan forward to see if this is really the last extent. */
e3350e16 4844 em = get_extent_skip_holes(inode, off, last_for_get_extent);
ec29ed5b
CM
4845 if (IS_ERR(em)) {
4846 ret = PTR_ERR(em);
4847 goto out;
4848 }
4849 if (!em) {
975f84fe
JB
4850 flags |= FIEMAP_EXTENT_LAST;
4851 end = 1;
4852 }
4751832d
QW
4853 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
4854 em_len, flags);
26e726af
CS
4855 if (ret) {
4856 if (ret == 1)
4857 ret = 0;
ec29ed5b 4858 goto out_free;
26e726af 4859 }
1506fcc8
YS
4860 }
4861out_free:
4751832d 4862 if (!ret)
5c5aff98 4863 ret = emit_last_fiemap_cache(fieinfo, &cache);
1506fcc8
YS
4864 free_extent_map(em);
4865out:
a52f4cd2 4866 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
e43bbe5e 4867 &cached_state);
5911c8fe
DS
4868
4869out_free_ulist:
e02d48ea 4870 btrfs_free_path(path);
5911c8fe
DS
4871 ulist_free(roots);
4872 ulist_free(tmp_ulist);
1506fcc8
YS
4873 return ret;
4874}
4875
727011e0
CM
4876static void __free_extent_buffer(struct extent_buffer *eb)
4877{
3fd63727 4878 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
727011e0
CM
4879 kmem_cache_free(extent_buffer_cache, eb);
4880}
4881
a26e8c9f 4882int extent_buffer_under_io(struct extent_buffer *eb)
db7f3436
JB
4883{
4884 return (atomic_read(&eb->io_pages) ||
4885 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4886 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4887}
4888
4889/*
55ac0139 4890 * Release all pages attached to the extent buffer.
db7f3436 4891 */
55ac0139 4892static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
db7f3436 4893{
d64766fd
NB
4894 int i;
4895 int num_pages;
b0132a3b 4896 int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
db7f3436
JB
4897
4898 BUG_ON(extent_buffer_under_io(eb));
4899
d64766fd
NB
4900 num_pages = num_extent_pages(eb);
4901 for (i = 0; i < num_pages; i++) {
4902 struct page *page = eb->pages[i];
db7f3436 4903
5d2361db
FL
4904 if (!page)
4905 continue;
4906 if (mapped)
db7f3436 4907 spin_lock(&page->mapping->private_lock);
5d2361db
FL
4908 /*
4909 * We do this since we'll remove the pages after we've
4910 * removed the eb from the radix tree, so we could race
4911 * and have this page now attached to the new eb. So
4912 * only clear page_private if it's still connected to
4913 * this eb.
4914 */
4915 if (PagePrivate(page) &&
4916 page->private == (unsigned long)eb) {
4917 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4918 BUG_ON(PageDirty(page));
4919 BUG_ON(PageWriteback(page));
db7f3436 4920 /*
5d2361db
FL
 4921 * We need to make sure we haven't been attached
4922 * to a new eb.
db7f3436 4923 */
5d2361db
FL
4924 ClearPagePrivate(page);
4925 set_page_private(page, 0);
4926 /* One for the page private */
09cbfeaf 4927 put_page(page);
db7f3436 4928 }
5d2361db
FL
4929
4930 if (mapped)
4931 spin_unlock(&page->mapping->private_lock);
4932
01327610 4933 /* One for when we allocated the page */
09cbfeaf 4934 put_page(page);
d64766fd 4935 }
db7f3436
JB
4936}
4937
4938/*
4939 * Helper for releasing the extent buffer.
4940 */
4941static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4942{
55ac0139 4943 btrfs_release_extent_buffer_pages(eb);
db7f3436
JB
4944 __free_extent_buffer(eb);
4945}
4946
f28491e0
JB
4947static struct extent_buffer *
4948__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 4949 unsigned long len)
d1310b2e
CM
4950{
4951 struct extent_buffer *eb = NULL;
4952
d1b5c567 4953 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
4954 eb->start = start;
4955 eb->len = len;
f28491e0 4956 eb->fs_info = fs_info;
815a51c7 4957 eb->bflags = 0;
bd681513 4958 rwlock_init(&eb->lock);
bd681513 4959 atomic_set(&eb->blocking_readers, 0);
06297d8c 4960 eb->blocking_writers = 0;
ed1b4ed7 4961 eb->lock_nested = false;
bd681513
CM
4962 init_waitqueue_head(&eb->write_lock_wq);
4963 init_waitqueue_head(&eb->read_lock_wq);
b4ce94de 4964
3fd63727
JB
4965 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
4966 &fs_info->allocated_ebs);
6d49ba1b 4967
3083ee2e 4968 spin_lock_init(&eb->refs_lock);
d1310b2e 4969 atomic_set(&eb->refs, 1);
0b32f4bb 4970 atomic_set(&eb->io_pages, 0);
727011e0 4971
b8dae313
DS
4972 /*
4973 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4974 */
4975 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4976 > MAX_INLINE_EXTENT_BUFFER_SIZE);
4977 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
d1310b2e 4978
843ccf9f 4979#ifdef CONFIG_BTRFS_DEBUG
f3dc24c5 4980 eb->spinning_writers = 0;
afd495a8 4981 atomic_set(&eb->spinning_readers, 0);
5c9c799a 4982 atomic_set(&eb->read_locks, 0);
00801ae4 4983 eb->write_locks = 0;
843ccf9f
DS
4984#endif
4985
d1310b2e
CM
4986 return eb;
4987}
4988
815a51c7
JS
4989struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4990{
cc5e31a4 4991 int i;
815a51c7
JS
4992 struct page *p;
4993 struct extent_buffer *new;
cc5e31a4 4994 int num_pages = num_extent_pages(src);
815a51c7 4995
3f556f78 4996 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
4997 if (new == NULL)
4998 return NULL;
4999
5000 for (i = 0; i < num_pages; i++) {
9ec72677 5001 p = alloc_page(GFP_NOFS);
db7f3436
JB
5002 if (!p) {
5003 btrfs_release_extent_buffer(new);
5004 return NULL;
5005 }
815a51c7
JS
5006 attach_extent_buffer_page(new, p);
5007 WARN_ON(PageDirty(p));
5008 SetPageUptodate(p);
5009 new->pages[i] = p;
fba1acf9 5010 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7
JS
5011 }
5012
815a51c7 5013 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
b0132a3b 5014 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
815a51c7
JS
5015
5016 return new;
5017}
5018
0f331229
OS
5019struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5020 u64 start, unsigned long len)
815a51c7
JS
5021{
5022 struct extent_buffer *eb;
cc5e31a4
DS
5023 int num_pages;
5024 int i;
815a51c7 5025
3f556f78 5026 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
5027 if (!eb)
5028 return NULL;
5029
65ad0104 5030 num_pages = num_extent_pages(eb);
815a51c7 5031 for (i = 0; i < num_pages; i++) {
9ec72677 5032 eb->pages[i] = alloc_page(GFP_NOFS);
815a51c7
JS
5033 if (!eb->pages[i])
5034 goto err;
5035 }
5036 set_extent_buffer_uptodate(eb);
5037 btrfs_set_header_nritems(eb, 0);
b0132a3b 5038 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
5039
5040 return eb;
5041err:
84167d19
SB
5042 for (; i > 0; i--)
5043 __free_page(eb->pages[i - 1]);
815a51c7
JS
5044 __free_extent_buffer(eb);
5045 return NULL;
5046}
5047
0f331229 5048struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5049 u64 start)
0f331229 5050{
da17066c 5051 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
5052}
5053
0b32f4bb
JB
5054static void check_buffer_tree_ref(struct extent_buffer *eb)
5055{
242e18c7 5056 int refs;
0b32f4bb
JB
5057 /* the ref bit is tricky. We have to make sure it is set
 5058 * if the buffer is dirty. Otherwise the
5059 * code to free a buffer can end up dropping a dirty
5060 * page
5061 *
5062 * Once the ref bit is set, it won't go away while the
5063 * buffer is dirty or in writeback, and it also won't
5064 * go away while we have the reference count on the
5065 * eb bumped.
5066 *
5067 * We can't just set the ref bit without bumping the
5068 * ref on the eb because free_extent_buffer might
5069 * see the ref bit and try to clear it. If this happens
5070 * free_extent_buffer might end up dropping our original
5071 * ref by mistake and freeing the page before we are able
5072 * to add one more ref.
5073 *
5074 * So bump the ref count first, then set the bit. If someone
5075 * beat us to it, drop the ref we added.
5076 */
242e18c7
CM
5077 refs = atomic_read(&eb->refs);
5078 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5079 return;
5080
594831c4
JB
5081 spin_lock(&eb->refs_lock);
5082 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 5083 atomic_inc(&eb->refs);
594831c4 5084 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
5085}
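
The ordering the comment above describes (take the extra reference before publishing the TREE_REF flag, and give it back if another task beat us to it) can be modelled with plain C11 atomics. The sketch below is illustrative only, not the kernel code; model_eb and model_tree_ref are made-up names, and the real function additionally serializes the flag update under eb->refs_lock.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct model_eb {
        atomic_int refs;        /* stands in for eb->refs */
        atomic_bool tree_ref;   /* stands in for the EXTENT_BUFFER_TREE_REF bit */
};

static void model_tree_ref(struct model_eb *eb)
{
        /* Take the extra reference first ... */
        atomic_fetch_add(&eb->refs, 1);
        /*
         * ... then publish the flag.  If another thread already set it,
         * drop the reference we just added, mirroring test_and_set_bit().
         */
        if (atomic_exchange(&eb->tree_ref, true))
                atomic_fetch_sub(&eb->refs, 1);
}

int main(void)
{
        struct model_eb eb = { .refs = 1, .tree_ref = false };

        model_tree_ref(&eb);
        model_tree_ref(&eb);    /* second call must not add another ref */
        printf("refs=%d tree_ref=%d\n", atomic_load(&eb.refs),
               atomic_load(&eb.tree_ref));
        return 0;
}
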
5086
2457aec6
MG
5087static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5088 struct page *accessed)
5df4235e 5089{
cc5e31a4 5090 int num_pages, i;
5df4235e 5091
0b32f4bb
JB
5092 check_buffer_tree_ref(eb);
5093
65ad0104 5094 num_pages = num_extent_pages(eb);
5df4235e 5095 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
5096 struct page *p = eb->pages[i];
5097
2457aec6
MG
5098 if (p != accessed)
5099 mark_page_accessed(p);
5df4235e
JB
5100 }
5101}
5102
f28491e0
JB
5103struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5104 u64 start)
452c75c3
CS
5105{
5106 struct extent_buffer *eb;
5107
5108 rcu_read_lock();
f28491e0 5109 eb = radix_tree_lookup(&fs_info->buffer_radix,
09cbfeaf 5110 start >> PAGE_SHIFT);
452c75c3
CS
5111 if (eb && atomic_inc_not_zero(&eb->refs)) {
5112 rcu_read_unlock();
062c19e9
FM
5113 /*
5114 * Lock our eb's refs_lock to avoid races with
5115 * free_extent_buffer. When we get our eb it might be flagged
5116 * with EXTENT_BUFFER_STALE, and another task running
5117 * free_extent_buffer might have seen that flag set, observed
5118 * eb->refs == 2, seen that the buffer isn't under IO (dirty and
5119 * writeback flags not set) and that it's still in the tree (flag
5120 * EXTENT_BUFFER_TREE_REF set), and therefore be in the process
5121 * of decrementing the extent buffer's reference count twice.
5122 * So here we could race and increment the eb's reference count,
5123 * clear its stale flag, mark it as dirty and drop our reference
5124 * before the other task finishes executing free_extent_buffer,
5125 * which would later result in an attempt to free an extent
5126 * buffer that is dirty.
5127 */
5128 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5129 spin_lock(&eb->refs_lock);
5130 spin_unlock(&eb->refs_lock);
5131 }
2457aec6 5132 mark_extent_buffer_accessed(eb, NULL);
452c75c3
CS
5133 return eb;
5134 }
5135 rcu_read_unlock();
5136
5137 return NULL;
5138}
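
find_extent_buffer relies on rcu_read_lock plus atomic_inc_not_zero: a reference is taken only if the count has not already dropped to zero, and RCU keeps the memory valid long enough to attempt that. Below is a minimal userspace model of the inc-not-zero pattern using a C11 compare-exchange loop; inc_not_zero and its test are hypothetical names, not the kernel API.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the object is still live (refs > 0). */
static bool inc_not_zero(atomic_int *refs)
{
        int old = atomic_load(refs);

        while (old != 0) {
                /* Try old -> old + 1; on failure 'old' is reloaded. */
                if (atomic_compare_exchange_weak(refs, &old, old + 1))
                        return true;
        }
        return false;   /* already on its way to being freed */
}

int main(void)
{
        atomic_int live = 2, dying = 0;

        assert(inc_not_zero(&live) && atomic_load(&live) == 3);
        assert(!inc_not_zero(&dying));
        return 0;
}
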
5139
faa2dbf0
JB
5140#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5141struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5142 u64 start)
faa2dbf0
JB
5143{
5144 struct extent_buffer *eb, *exists = NULL;
5145 int ret;
5146
5147 eb = find_extent_buffer(fs_info, start);
5148 if (eb)
5149 return eb;
da17066c 5150 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 5151 if (!eb)
b6293c82 5152 return ERR_PTR(-ENOMEM);
faa2dbf0
JB
5153 eb->fs_info = fs_info;
5154again:
e1860a77 5155 ret = radix_tree_preload(GFP_NOFS);
b6293c82
DC
5156 if (ret) {
5157 exists = ERR_PTR(ret);
faa2dbf0 5158 goto free_eb;
b6293c82 5159 }
faa2dbf0
JB
5160 spin_lock(&fs_info->buffer_lock);
5161 ret = radix_tree_insert(&fs_info->buffer_radix,
09cbfeaf 5162 start >> PAGE_SHIFT, eb);
faa2dbf0
JB
5163 spin_unlock(&fs_info->buffer_lock);
5164 radix_tree_preload_end();
5165 if (ret == -EEXIST) {
5166 exists = find_extent_buffer(fs_info, start);
5167 if (exists)
5168 goto free_eb;
5169 else
5170 goto again;
5171 }
5172 check_buffer_tree_ref(eb);
5173 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5174
faa2dbf0
JB
5175 return eb;
5176free_eb:
5177 btrfs_release_extent_buffer(eb);
5178 return exists;
5179}
5180#endif
5181
f28491e0 5182struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
ce3e6984 5183 u64 start)
d1310b2e 5184{
da17066c 5185 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
5186 int num_pages;
5187 int i;
09cbfeaf 5188 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 5189 struct extent_buffer *eb;
6af118ce 5190 struct extent_buffer *exists = NULL;
d1310b2e 5191 struct page *p;
f28491e0 5192 struct address_space *mapping = fs_info->btree_inode->i_mapping;
d1310b2e 5193 int uptodate = 1;
19fe0a8b 5194 int ret;
d1310b2e 5195
da17066c 5196 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
c871b0f2
LB
5197 btrfs_err(fs_info, "bad tree block start %llu", start);
5198 return ERR_PTR(-EINVAL);
5199 }
5200
f28491e0 5201 eb = find_extent_buffer(fs_info, start);
452c75c3 5202 if (eb)
6af118ce 5203 return eb;
6af118ce 5204
23d79d81 5205 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 5206 if (!eb)
c871b0f2 5207 return ERR_PTR(-ENOMEM);
d1310b2e 5208
65ad0104 5209 num_pages = num_extent_pages(eb);
727011e0 5210 for (i = 0; i < num_pages; i++, index++) {
d1b5c567 5211 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
5212 if (!p) {
5213 exists = ERR_PTR(-ENOMEM);
6af118ce 5214 goto free_eb;
c871b0f2 5215 }
4f2de97a
JB
5216
5217 spin_lock(&mapping->private_lock);
5218 if (PagePrivate(p)) {
5219 /*
5220 * An eb may already have been allocated and attached to
5221 * this page, so let's see if we can get a ref on the
5222 * existing eb. If we can, we know it's good and we can
5223 * just return that one; otherwise we can safely
5224 * overwrite page->private.
5225 */
5226 exists = (struct extent_buffer *)p->private;
5227 if (atomic_inc_not_zero(&exists->refs)) {
5228 spin_unlock(&mapping->private_lock);
5229 unlock_page(p);
09cbfeaf 5230 put_page(p);
2457aec6 5231 mark_extent_buffer_accessed(exists, p);
4f2de97a
JB
5232 goto free_eb;
5233 }
5ca64f45 5234 exists = NULL;
4f2de97a 5235
0b32f4bb 5236 /*
4f2de97a
JB
5237 * Clear the private bit so attach doesn't complain, and
5238 * drop the page reference the previous owner held.
5239 */
5240 ClearPagePrivate(p);
0b32f4bb 5241 WARN_ON(PageDirty(p));
09cbfeaf 5242 put_page(p);
d1310b2e 5243 }
4f2de97a
JB
5244 attach_extent_buffer_page(eb, p);
5245 spin_unlock(&mapping->private_lock);
0b32f4bb 5246 WARN_ON(PageDirty(p));
727011e0 5247 eb->pages[i] = p;
d1310b2e
CM
5248 if (!PageUptodate(p))
5249 uptodate = 0;
eb14ab8e
CM
5250
5251 /*
b16d011e
NB
5252 * We can't unlock the pages just yet since the extent buffer
5253 * hasn't been properly inserted into the radix tree; doing so
5254 * would open a race with btree_releasepage, which could free a
5255 * page while we are still filling in all pages for the buffer,
5256 * and we could crash.
eb14ab8e 5257 */
d1310b2e
CM
5258 }
5259 if (uptodate)
b4ce94de 5260 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
115391d2 5261again:
e1860a77 5262 ret = radix_tree_preload(GFP_NOFS);
c871b0f2
LB
5263 if (ret) {
5264 exists = ERR_PTR(ret);
19fe0a8b 5265 goto free_eb;
c871b0f2 5266 }
19fe0a8b 5267
f28491e0
JB
5268 spin_lock(&fs_info->buffer_lock);
5269 ret = radix_tree_insert(&fs_info->buffer_radix,
09cbfeaf 5270 start >> PAGE_SHIFT, eb);
f28491e0 5271 spin_unlock(&fs_info->buffer_lock);
452c75c3 5272 radix_tree_preload_end();
19fe0a8b 5273 if (ret == -EEXIST) {
f28491e0 5274 exists = find_extent_buffer(fs_info, start);
452c75c3
CS
5275 if (exists)
5276 goto free_eb;
5277 else
115391d2 5278 goto again;
6af118ce 5279 }
6af118ce 5280 /* add one reference for the tree */
0b32f4bb 5281 check_buffer_tree_ref(eb);
34b41ace 5282 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
5283
5284 /*
b16d011e
NB
5285 * Now it's safe to unlock the pages because any calls to
5286 * btree_releasepage will correctly detect that a page belongs to a
5287 * live buffer and won't free them prematurely.
eb14ab8e 5288 */
28187ae5
NB
5289 for (i = 0; i < num_pages; i++)
5290 unlock_page(eb->pages[i]);
d1310b2e
CM
5291 return eb;
5292
6af118ce 5293free_eb:
5ca64f45 5294 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
5295 for (i = 0; i < num_pages; i++) {
5296 if (eb->pages[i])
5297 unlock_page(eb->pages[i]);
5298 }
eb14ab8e 5299
897ca6e9 5300 btrfs_release_extent_buffer(eb);
6af118ce 5301 return exists;
d1310b2e 5302}
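
The body of alloc_extent_buffer follows a publish-or-reuse pattern: build the new buffer, try to insert it into the radix tree, and on -EEXIST take a reference on whoever won and release our copy, retrying only if the winner vanished again in between. A stripped-down, single-threaded userspace model of that control flow is sketched below; the one-slot "tree" and every name in it are hypothetical.

#include <errno.h>
#include <stddef.h>

struct slot { void *ptr; };     /* stand-in for one radix tree slot */

static int slot_insert(struct slot *s, void *p)
{
        if (s->ptr)
                return -EEXIST;
        s->ptr = p;
        return 0;
}

/*
 * Returns the buffer that ends up published; the caller frees 'fresh'
 * when the return value is somebody else's buffer.
 */
static void *publish_or_reuse(struct slot *s, void *fresh)
{
        for (;;) {
                void *existing;

                if (slot_insert(s, fresh) == 0)
                        return fresh;           /* we won the race */
                existing = s->ptr;
                if (existing)
                        return existing;        /* reuse the winner */
                /* winner was removed again before we saw it: retry */
        }
}

int main(void)
{
        struct slot s = { 0 };
        int a, b;

        return !(publish_or_reuse(&s, &a) == &a &&
                 publish_or_reuse(&s, &b) == &a);
}
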
d1310b2e 5303
3083ee2e
JB
5304static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5305{
5306 struct extent_buffer *eb =
5307 container_of(head, struct extent_buffer, rcu_head);
5308
5309 __free_extent_buffer(eb);
5310}
5311
f7a52a40 5312static int release_extent_buffer(struct extent_buffer *eb)
5ce48d0f 5313 __releases(&eb->refs_lock)
3083ee2e 5314{
07e21c4d
NB
5315 lockdep_assert_held(&eb->refs_lock);
5316
3083ee2e
JB
5317 WARN_ON(atomic_read(&eb->refs) == 0);
5318 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 5319 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 5320 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 5321
815a51c7 5322 spin_unlock(&eb->refs_lock);
3083ee2e 5323
f28491e0
JB
5324 spin_lock(&fs_info->buffer_lock);
5325 radix_tree_delete(&fs_info->buffer_radix,
09cbfeaf 5326 eb->start >> PAGE_SHIFT);
f28491e0 5327 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
5328 } else {
5329 spin_unlock(&eb->refs_lock);
815a51c7 5330 }
3083ee2e
JB
5331
5332 /* Should be safe to release our pages at this point */
55ac0139 5333 btrfs_release_extent_buffer_pages(eb);
bcb7e449 5334#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 5335 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
5336 __free_extent_buffer(eb);
5337 return 1;
5338 }
5339#endif
3083ee2e 5340 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 5341 return 1;
3083ee2e
JB
5342 }
5343 spin_unlock(&eb->refs_lock);
e64860aa
JB
5344
5345 return 0;
3083ee2e
JB
5346}
5347
d1310b2e
CM
5348void free_extent_buffer(struct extent_buffer *eb)
5349{
242e18c7
CM
5350 int refs;
5351 int old;
d1310b2e
CM
5352 if (!eb)
5353 return;
5354
242e18c7
CM
5355 while (1) {
5356 refs = atomic_read(&eb->refs);
46cc775e
NB
5357 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
5358 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
5359 refs == 1))
242e18c7
CM
5360 break;
5361 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5362 if (old == refs)
5363 return;
5364 }
5365
3083ee2e
JB
5366 spin_lock(&eb->refs_lock);
5367 if (atomic_read(&eb->refs) == 2 &&
5368 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 5369 !extent_buffer_under_io(eb) &&
3083ee2e
JB
5370 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5371 atomic_dec(&eb->refs);
5372
5373 /*
5374 * I know this is terrible, but it's temporary until we stop tracking
5375 * the uptodate bits and such for the extent buffers.
5376 */
f7a52a40 5377 release_extent_buffer(eb);
3083ee2e
JB
5378}
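
free_extent_buffer first tries to drop the reference without taking eb->refs_lock: as long as the count stays above the threshold that would require the locked slow path (3 for mapped buffers, 1 for UNMAPPED ones), a cmpxchg decrement is enough. Below is a hedged userspace model of that fast path; model_put_fast is a made-up name, and the real function also handles the STALE/TREE_REF interplay under the lock.

#include <stdatomic.h>
#include <stdbool.h>

/* Returns true if the caller still has to take the slow (locked) path. */
static bool model_put_fast(atomic_int *refs, int slow_path_threshold)
{
        int old = atomic_load(refs);

        while (old > slow_path_threshold) {
                /* Try old -> old - 1; on failure 'old' is reloaded. */
                if (atomic_compare_exchange_weak(refs, &old, old - 1))
                        return false;   /* dropped without the lock */
        }
        return true;
}

int main(void)
{
        atomic_int refs = 5;

        while (!model_put_fast(&refs, 3))
                ;       /* drops 5 -> 4 -> stops at 3, needing the slow path */
        return atomic_load(&refs) != 3;
}
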
5379
5380void free_extent_buffer_stale(struct extent_buffer *eb)
5381{
5382 if (!eb)
d1310b2e
CM
5383 return;
5384
3083ee2e
JB
5385 spin_lock(&eb->refs_lock);
5386 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5387
0b32f4bb 5388 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
5389 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5390 atomic_dec(&eb->refs);
f7a52a40 5391 release_extent_buffer(eb);
d1310b2e 5392}
d1310b2e 5393
1d4284bd 5394void clear_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 5395{
cc5e31a4
DS
5396 int i;
5397 int num_pages;
d1310b2e
CM
5398 struct page *page;
5399
65ad0104 5400 num_pages = num_extent_pages(eb);
d1310b2e
CM
5401
5402 for (i = 0; i < num_pages; i++) {
fb85fc9a 5403 page = eb->pages[i];
b9473439 5404 if (!PageDirty(page))
d2c3f4f6
CM
5405 continue;
5406
a61e6f29 5407 lock_page(page);
eb14ab8e
CM
5408 WARN_ON(!PagePrivate(page));
5409
d1310b2e 5410 clear_page_dirty_for_io(page);
b93b0163 5411 xa_lock_irq(&page->mapping->i_pages);
0a943c65
MW
5412 if (!PageDirty(page))
5413 __xa_clear_mark(&page->mapping->i_pages,
5414 page_index(page), PAGECACHE_TAG_DIRTY);
b93b0163 5415 xa_unlock_irq(&page->mapping->i_pages);
bf0da8c1 5416 ClearPageError(page);
a61e6f29 5417 unlock_page(page);
d1310b2e 5418 }
0b32f4bb 5419 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 5420}
d1310b2e 5421
abb57ef3 5422bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 5423{
cc5e31a4
DS
5424 int i;
5425 int num_pages;
abb57ef3 5426 bool was_dirty;
d1310b2e 5427
0b32f4bb
JB
5428 check_buffer_tree_ref(eb);
5429
b9473439 5430 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 5431
65ad0104 5432 num_pages = num_extent_pages(eb);
3083ee2e 5433 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
5434 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5435
abb57ef3
LB
5436 if (!was_dirty)
5437 for (i = 0; i < num_pages; i++)
5438 set_page_dirty(eb->pages[i]);
51995c39
LB
5439
5440#ifdef CONFIG_BTRFS_DEBUG
5441 for (i = 0; i < num_pages; i++)
5442 ASSERT(PageDirty(eb->pages[i]));
5443#endif
5444
b9473439 5445 return was_dirty;
d1310b2e 5446}
d1310b2e 5447
69ba3927 5448void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 5449{
cc5e31a4 5450 int i;
1259ab75 5451 struct page *page;
cc5e31a4 5452 int num_pages;
1259ab75 5453
b4ce94de 5454 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5455 num_pages = num_extent_pages(eb);
1259ab75 5456 for (i = 0; i < num_pages; i++) {
fb85fc9a 5457 page = eb->pages[i];
33958dc6
CM
5458 if (page)
5459 ClearPageUptodate(page);
1259ab75 5460 }
1259ab75
CM
5461}
5462
09c25a8c 5463void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 5464{
cc5e31a4 5465 int i;
d1310b2e 5466 struct page *page;
cc5e31a4 5467 int num_pages;
d1310b2e 5468
0b32f4bb 5469 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5470 num_pages = num_extent_pages(eb);
d1310b2e 5471 for (i = 0; i < num_pages; i++) {
fb85fc9a 5472 page = eb->pages[i];
d1310b2e
CM
5473 SetPageUptodate(page);
5474 }
d1310b2e 5475}
d1310b2e 5476
c2ccfbc6 5477int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 5478{
cc5e31a4 5479 int i;
d1310b2e
CM
5480 struct page *page;
5481 int err;
5482 int ret = 0;
ce9adaa5
CM
5483 int locked_pages = 0;
5484 int all_uptodate = 1;
cc5e31a4 5485 int num_pages;
727011e0 5486 unsigned long num_reads = 0;
a86c12c7 5487 struct bio *bio = NULL;
c8b97818 5488 unsigned long bio_flags = 0;
a86c12c7 5489
b4ce94de 5490 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
5491 return 0;
5492
65ad0104 5493 num_pages = num_extent_pages(eb);
8436ea91 5494 for (i = 0; i < num_pages; i++) {
fb85fc9a 5495 page = eb->pages[i];
bb82ab88 5496 if (wait == WAIT_NONE) {
2db04966 5497 if (!trylock_page(page))
ce9adaa5 5498 goto unlock_exit;
d1310b2e
CM
5499 } else {
5500 lock_page(page);
5501 }
ce9adaa5 5502 locked_pages++;
2571e739
LB
5503 }
5504 /*
5505 * We need to lock all pages first to make sure that the
5506 * uptodate bit of our pages won't be affected by
5507 * clear_extent_buffer_uptodate().
5508 */
8436ea91 5509 for (i = 0; i < num_pages; i++) {
2571e739 5510 page = eb->pages[i];
727011e0
CM
5511 if (!PageUptodate(page)) {
5512 num_reads++;
ce9adaa5 5513 all_uptodate = 0;
727011e0 5514 }
ce9adaa5 5515 }
2571e739 5516
ce9adaa5 5517 if (all_uptodate) {
8436ea91 5518 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
5519 goto unlock_exit;
5520 }
5521
656f30db 5522 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 5523 eb->read_mirror = 0;
0b32f4bb 5524 atomic_set(&eb->io_pages, num_reads);
8436ea91 5525 for (i = 0; i < num_pages; i++) {
fb85fc9a 5526 page = eb->pages[i];
baf863b9 5527
ce9adaa5 5528 if (!PageUptodate(page)) {
baf863b9
LB
5529 if (ret) {
5530 atomic_dec(&eb->io_pages);
5531 unlock_page(page);
5532 continue;
5533 }
5534
f188591e 5535 ClearPageError(page);
0d44fea7 5536 err = __extent_read_full_page(page,
6af49dbd 5537 btree_get_extent, &bio,
d4c7ca86 5538 mirror_num, &bio_flags,
1f7ad75b 5539 REQ_META);
baf863b9 5540 if (err) {
d1310b2e 5541 ret = err;
baf863b9
LB
5542 /*
5543 * We passed &bio to __extent_read_full_page above, so
5544 * if it returns an error the current page was never
5545 * added to the bio and has already been unlocked.
5546 *
5547 * Since the endio path will never see this page, we
5548 * must decrement io_pages ourselves.
5549 */
5550 atomic_dec(&eb->io_pages);
5551 }
d1310b2e
CM
5552 } else {
5553 unlock_page(page);
5554 }
5555 }
5556
355808c2 5557 if (bio) {
1f7ad75b 5558 err = submit_one_bio(bio, mirror_num, bio_flags);
79787eaa
JM
5559 if (err)
5560 return err;
355808c2 5561 }
a86c12c7 5562
bb82ab88 5563 if (ret || wait != WAIT_COMPLETE)
d1310b2e 5564 return ret;
d397712b 5565
8436ea91 5566 for (i = 0; i < num_pages; i++) {
fb85fc9a 5567 page = eb->pages[i];
d1310b2e 5568 wait_on_page_locked(page);
d397712b 5569 if (!PageUptodate(page))
d1310b2e 5570 ret = -EIO;
d1310b2e 5571 }
d397712b 5572
d1310b2e 5573 return ret;
ce9adaa5
CM
5574
5575unlock_exit:
d397712b 5576 while (locked_pages > 0) {
ce9adaa5 5577 locked_pages--;
8436ea91
JB
5578 page = eb->pages[locked_pages];
5579 unlock_page(page);
ce9adaa5
CM
5580 }
5581 return ret;
d1310b2e 5582}
d1310b2e 5583
1cbb1f45
JM
5584void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5585 unsigned long start, unsigned long len)
d1310b2e
CM
5586{
5587 size_t cur;
5588 size_t offset;
5589 struct page *page;
5590 char *kaddr;
5591 char *dst = (char *)dstv;
7073017a 5592 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5593 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e 5594
f716abd5
LB
5595 if (start + len > eb->len) {
5596 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5597 eb->start, eb->len, start, len);
5598 memset(dst, 0, len);
5599 return;
5600 }
d1310b2e 5601
7073017a 5602 offset = offset_in_page(start_offset + start);
d1310b2e 5603
d397712b 5604 while (len > 0) {
fb85fc9a 5605 page = eb->pages[i];
d1310b2e 5606
09cbfeaf 5607 cur = min(len, (PAGE_SIZE - offset));
a6591715 5608 kaddr = page_address(page);
d1310b2e 5609 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
5610
5611 dst += cur;
5612 len -= cur;
5613 offset = 0;
5614 i++;
5615 }
5616}
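
The accessors from here on all walk the buffer page by page with the same arithmetic: find the page index and in-page offset of the first byte, copy min(len, PAGE_SIZE - offset) from that page, then continue from offset 0 on the following pages. A self-contained userspace model of read_extent_buffer's loop is sketched below, assuming 4K pages; it is illustrative only.

#include <stddef.h>
#include <string.h>

#define MODEL_PAGE_SIZE 4096

static void model_read(char **pages, size_t start_offset,
                       size_t start, size_t len, char *dst)
{
        size_t i = (start_offset + start) / MODEL_PAGE_SIZE;
        size_t offset = (start_offset + start) % MODEL_PAGE_SIZE;

        while (len > 0) {
                size_t cur = len < MODEL_PAGE_SIZE - offset ?
                             len : MODEL_PAGE_SIZE - offset;

                memcpy(dst, pages[i] + offset, cur);
                dst += cur;
                len -= cur;
                offset = 0;     /* later pages are read from their start */
                i++;
        }
}

int main(void)
{
        char page0[MODEL_PAGE_SIZE], page1[MODEL_PAGE_SIZE];
        char *pages[] = { page0, page1 };
        char dst[8];

        memset(page0, 'A', sizeof(page0));
        memset(page1, 'B', sizeof(page1));
        /* Read 8 bytes straddling the page boundary: expect "AAAABBBB". */
        model_read(pages, 0, MODEL_PAGE_SIZE - 4, sizeof(dst), dst);
        return memcmp(dst, "AAAABBBB", 8) != 0;
}
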
d1310b2e 5617
1cbb1f45
JM
5618int read_extent_buffer_to_user(const struct extent_buffer *eb,
5619 void __user *dstv,
5620 unsigned long start, unsigned long len)
550ac1d8
GH
5621{
5622 size_t cur;
5623 size_t offset;
5624 struct page *page;
5625 char *kaddr;
5626 char __user *dst = (char __user *)dstv;
7073017a 5627 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5628 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
550ac1d8
GH
5629 int ret = 0;
5630
5631 WARN_ON(start > eb->len);
5632 WARN_ON(start + len > eb->start + eb->len);
5633
7073017a 5634 offset = offset_in_page(start_offset + start);
550ac1d8
GH
5635
5636 while (len > 0) {
fb85fc9a 5637 page = eb->pages[i];
550ac1d8 5638
09cbfeaf 5639 cur = min(len, (PAGE_SIZE - offset));
550ac1d8
GH
5640 kaddr = page_address(page);
5641 if (copy_to_user(dst, kaddr + offset, cur)) {
5642 ret = -EFAULT;
5643 break;
5644 }
5645
5646 dst += cur;
5647 len -= cur;
5648 offset = 0;
5649 i++;
5650 }
5651
5652 return ret;
5653}
5654
415b35a5
LB
5655/*
5656 * return 0 if the item is found within a page.
5657 * return 1 if the item spans two pages.
5658 * return -EINVAL otherwise.
5659 */
1cbb1f45
JM
5660int map_private_extent_buffer(const struct extent_buffer *eb,
5661 unsigned long start, unsigned long min_len,
5662 char **map, unsigned long *map_start,
5663 unsigned long *map_len)
d1310b2e 5664{
cc2c39d6 5665 size_t offset;
d1310b2e
CM
5666 char *kaddr;
5667 struct page *p;
7073017a 5668 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5669 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e 5670 unsigned long end_i = (start_offset + start + min_len - 1) >>
09cbfeaf 5671 PAGE_SHIFT;
d1310b2e 5672
f716abd5
LB
5673 if (start + min_len > eb->len) {
5674 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5675 eb->start, eb->len, start, min_len);
5676 return -EINVAL;
5677 }
5678
d1310b2e 5679 if (i != end_i)
415b35a5 5680 return 1;
d1310b2e
CM
5681
5682 if (i == 0) {
5683 offset = start_offset;
5684 *map_start = 0;
5685 } else {
5686 offset = 0;
09cbfeaf 5687 *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
d1310b2e 5688 }
d397712b 5689
fb85fc9a 5690 p = eb->pages[i];
a6591715 5691 kaddr = page_address(p);
d1310b2e 5692 *map = kaddr + offset;
09cbfeaf 5693 *map_len = PAGE_SIZE - offset;
d1310b2e
CM
5694 return 0;
5695}
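
The return values above boil down to whether the requested range stays inside one page: compare the page index of the first byte with that of the last byte. A small userspace model, assuming 4K pages and hypothetical names:

#include <stdbool.h>
#include <stddef.h>

#define MODEL_PAGE_SIZE 4096UL

static bool fits_in_one_page(size_t start_offset, size_t start, size_t min_len)
{
        size_t first = (start_offset + start) / MODEL_PAGE_SIZE;
        size_t last = (start_offset + start + min_len - 1) / MODEL_PAGE_SIZE;

        return first == last;
}

int main(void)
{
        /* 8 bytes starting at offset 4090 cross the first page boundary. */
        return fits_in_one_page(0, 4090, 8) ? 1 : 0;
}
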
d1310b2e 5696
1cbb1f45
JM
5697int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5698 unsigned long start, unsigned long len)
d1310b2e
CM
5699{
5700 size_t cur;
5701 size_t offset;
5702 struct page *page;
5703 char *kaddr;
5704 char *ptr = (char *)ptrv;
7073017a 5705 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5706 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5707 int ret = 0;
5708
5709 WARN_ON(start > eb->len);
5710 WARN_ON(start + len > eb->start + eb->len);
5711
7073017a 5712 offset = offset_in_page(start_offset + start);
d1310b2e 5713
d397712b 5714 while (len > 0) {
fb85fc9a 5715 page = eb->pages[i];
d1310b2e 5716
09cbfeaf 5717 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 5718
a6591715 5719 kaddr = page_address(page);
d1310b2e 5720 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
5721 if (ret)
5722 break;
5723
5724 ptr += cur;
5725 len -= cur;
5726 offset = 0;
5727 i++;
5728 }
5729 return ret;
5730}
d1310b2e 5731
f157bf76
DS
5732void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
5733 const void *srcv)
5734{
5735 char *kaddr;
5736
5737 WARN_ON(!PageUptodate(eb->pages[0]));
5738 kaddr = page_address(eb->pages[0]);
5739 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5740 BTRFS_FSID_SIZE);
5741}
5742
5743void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
5744{
5745 char *kaddr;
5746
5747 WARN_ON(!PageUptodate(eb->pages[0]));
5748 kaddr = page_address(eb->pages[0]);
5749 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5750 BTRFS_FSID_SIZE);
5751}
5752
d1310b2e
CM
5753void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5754 unsigned long start, unsigned long len)
5755{
5756 size_t cur;
5757 size_t offset;
5758 struct page *page;
5759 char *kaddr;
5760 char *src = (char *)srcv;
7073017a 5761 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5762 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5763
5764 WARN_ON(start > eb->len);
5765 WARN_ON(start + len > eb->start + eb->len);
5766
7073017a 5767 offset = offset_in_page(start_offset + start);
d1310b2e 5768
d397712b 5769 while (len > 0) {
fb85fc9a 5770 page = eb->pages[i];
d1310b2e
CM
5771 WARN_ON(!PageUptodate(page));
5772
09cbfeaf 5773 cur = min(len, PAGE_SIZE - offset);
a6591715 5774 kaddr = page_address(page);
d1310b2e 5775 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
5776
5777 src += cur;
5778 len -= cur;
5779 offset = 0;
5780 i++;
5781 }
5782}
d1310b2e 5783
b159fa28
DS
5784void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
5785 unsigned long len)
d1310b2e
CM
5786{
5787 size_t cur;
5788 size_t offset;
5789 struct page *page;
5790 char *kaddr;
7073017a 5791 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5792 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5793
5794 WARN_ON(start > eb->len);
5795 WARN_ON(start + len > eb->start + eb->len);
5796
7073017a 5797 offset = offset_in_page(start_offset + start);
d1310b2e 5798
d397712b 5799 while (len > 0) {
fb85fc9a 5800 page = eb->pages[i];
d1310b2e
CM
5801 WARN_ON(!PageUptodate(page));
5802
09cbfeaf 5803 cur = min(len, PAGE_SIZE - offset);
a6591715 5804 kaddr = page_address(page);
b159fa28 5805 memset(kaddr + offset, 0, cur);
d1310b2e
CM
5806
5807 len -= cur;
5808 offset = 0;
5809 i++;
5810 }
5811}
d1310b2e 5812
58e8012c
DS
5813void copy_extent_buffer_full(struct extent_buffer *dst,
5814 struct extent_buffer *src)
5815{
5816 int i;
cc5e31a4 5817 int num_pages;
58e8012c
DS
5818
5819 ASSERT(dst->len == src->len);
5820
65ad0104 5821 num_pages = num_extent_pages(dst);
58e8012c
DS
5822 for (i = 0; i < num_pages; i++)
5823 copy_page(page_address(dst->pages[i]),
5824 page_address(src->pages[i]));
5825}
5826
d1310b2e
CM
5827void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5828 unsigned long dst_offset, unsigned long src_offset,
5829 unsigned long len)
5830{
5831 u64 dst_len = dst->len;
5832 size_t cur;
5833 size_t offset;
5834 struct page *page;
5835 char *kaddr;
7073017a 5836 size_t start_offset = offset_in_page(dst->start);
09cbfeaf 5837 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
d1310b2e
CM
5838
5839 WARN_ON(src->len != dst_len);
5840
7073017a 5841 offset = offset_in_page(start_offset + dst_offset);
d1310b2e 5842
d397712b 5843 while (len > 0) {
fb85fc9a 5844 page = dst->pages[i];
d1310b2e
CM
5845 WARN_ON(!PageUptodate(page));
5846
09cbfeaf 5847 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 5848
a6591715 5849 kaddr = page_address(page);
d1310b2e 5850 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
5851
5852 src_offset += cur;
5853 len -= cur;
5854 offset = 0;
5855 i++;
5856 }
5857}
d1310b2e 5858
3e1e8bb7
OS
5859/*
5860 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
5861 * given bit number
5862 * @eb: the extent buffer
5863 * @start: offset of the bitmap item in the extent buffer
5864 * @nr: bit number
5865 * @page_index: return index of the page in the extent buffer that contains the
5866 * given bit number
5867 * @page_offset: return offset into the page given by page_index
5868 *
5869 * This helper hides the ugliness of finding the byte in an extent buffer which
5870 * contains a given bit.
5871 */
5872static inline void eb_bitmap_offset(struct extent_buffer *eb,
5873 unsigned long start, unsigned long nr,
5874 unsigned long *page_index,
5875 size_t *page_offset)
5876{
7073017a 5877 size_t start_offset = offset_in_page(eb->start);
3e1e8bb7
OS
5878 size_t byte_offset = BIT_BYTE(nr);
5879 size_t offset;
5880
5881 /*
5882 * The byte we want is the offset of the extent buffer + the offset of
5883 * the bitmap item in the extent buffer + the offset of the byte in the
5884 * bitmap item.
5885 */
5886 offset = start_offset + start + byte_offset;
5887
09cbfeaf 5888 *page_index = offset >> PAGE_SHIFT;
7073017a 5889 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
5890}
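
A worked example of the offset calculation above, as plain userspace C: the target byte lives at start_offset + start + nr/8, and the page index and in-page offset fall out of dividing by the page size. The 4K page size and all names here are assumptions of the example.

#include <stddef.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL

static void model_bitmap_offset(size_t start_offset, size_t start,
                                size_t nr, size_t *page_index,
                                size_t *page_offset)
{
        size_t offset = start_offset + start + nr / 8;

        *page_index = offset / MODEL_PAGE_SIZE;
        *page_offset = offset % MODEL_PAGE_SIZE;
}

int main(void)
{
        size_t idx, off;

        /* Bit 40000 of a bitmap that starts 100 bytes into the buffer. */
        model_bitmap_offset(0, 100, 40000, &idx, &off);
        printf("page %zu, byte %zu, bit %zu\n", idx, off, (size_t)(40000 % 8));
        return 0;
}
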
5891
5892/**
5893 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
5894 * @eb: the extent buffer
5895 * @start: offset of the bitmap item in the extent buffer
5896 * @nr: bit number to test
5897 */
5898int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
5899 unsigned long nr)
5900{
2fe1d551 5901 u8 *kaddr;
3e1e8bb7
OS
5902 struct page *page;
5903 unsigned long i;
5904 size_t offset;
5905
5906 eb_bitmap_offset(eb, start, nr, &i, &offset);
5907 page = eb->pages[i];
5908 WARN_ON(!PageUptodate(page));
5909 kaddr = page_address(page);
5910 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5911}
5912
5913/**
5914 * extent_buffer_bitmap_set - set an area of a bitmap
5915 * @eb: the extent buffer
5916 * @start: offset of the bitmap item in the extent buffer
5917 * @pos: bit number of the first bit
5918 * @len: number of bits to set
5919 */
5920void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
5921 unsigned long pos, unsigned long len)
5922{
2fe1d551 5923 u8 *kaddr;
3e1e8bb7
OS
5924 struct page *page;
5925 unsigned long i;
5926 size_t offset;
5927 const unsigned int size = pos + len;
5928 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5929 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5930
5931 eb_bitmap_offset(eb, start, pos, &i, &offset);
5932 page = eb->pages[i];
5933 WARN_ON(!PageUptodate(page));
5934 kaddr = page_address(page);
5935
5936 while (len >= bits_to_set) {
5937 kaddr[offset] |= mask_to_set;
5938 len -= bits_to_set;
5939 bits_to_set = BITS_PER_BYTE;
9c894696 5940 mask_to_set = ~0;
09cbfeaf 5941 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5942 offset = 0;
5943 page = eb->pages[++i];
5944 WARN_ON(!PageUptodate(page));
5945 kaddr = page_address(page);
5946 }
5947 }
5948 if (len) {
5949 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5950 kaddr[offset] |= mask_to_set;
5951 }
5952}
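
The set/clear helpers use the classic first-byte/last-byte mask technique: a partial mask for the byte containing the first bit, whole bytes in the middle, and a partial mask for the byte containing the last bit. Below is a flat (non-paged) userspace model of the set case; FIRST_BYTE_MASK and LAST_BYTE_MASK are local stand-ins mirroring what BITMAP_FIRST_BYTE_MASK()/BITMAP_LAST_BYTE_MASK() produce for a little-endian byte bitmap.

#include <stddef.h>
#include <stdint.h>

#define FIRST_BYTE_MASK(pos)  ((uint8_t)(0xff << ((pos) & 7)))
#define LAST_BYTE_MASK(size)  ((uint8_t)(0xff >> ((-(size)) & 7)))

static void model_bitmap_set(uint8_t *map, size_t pos, size_t len)
{
        size_t size = pos + len;
        size_t bits_to_set = 8 - (pos % 8);
        uint8_t mask = FIRST_BYTE_MASK(pos);
        size_t byte = pos / 8;

        while (len >= bits_to_set) {
                map[byte++] |= mask;    /* finish the current byte */
                len -= bits_to_set;
                bits_to_set = 8;        /* whole bytes from now on */
                mask = 0xff;
        }
        if (len) {
                mask &= LAST_BYTE_MASK(size);
                map[byte] |= mask;      /* partial final byte */
        }
}

int main(void)
{
        uint8_t map[4] = { 0 };

        model_bitmap_set(map, 6, 4);    /* sets bits 6..9: 0xc0, 0x03 */
        return (map[0] == 0xc0 && map[1] == 0x03) ? 0 : 1;
}
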
5953
5954
5955/**
5956 * extent_buffer_bitmap_clear - clear an area of a bitmap
5957 * @eb: the extent buffer
5958 * @start: offset of the bitmap item in the extent buffer
5959 * @pos: bit number of the first bit
5960 * @len: number of bits to clear
5961 */
5962void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
5963 unsigned long pos, unsigned long len)
5964{
2fe1d551 5965 u8 *kaddr;
3e1e8bb7
OS
5966 struct page *page;
5967 unsigned long i;
5968 size_t offset;
5969 const unsigned int size = pos + len;
5970 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5971 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5972
5973 eb_bitmap_offset(eb, start, pos, &i, &offset);
5974 page = eb->pages[i];
5975 WARN_ON(!PageUptodate(page));
5976 kaddr = page_address(page);
5977
5978 while (len >= bits_to_clear) {
5979 kaddr[offset] &= ~mask_to_clear;
5980 len -= bits_to_clear;
5981 bits_to_clear = BITS_PER_BYTE;
9c894696 5982 mask_to_clear = ~0;
09cbfeaf 5983 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5984 offset = 0;
5985 page = eb->pages[++i];
5986 WARN_ON(!PageUptodate(page));
5987 kaddr = page_address(page);
5988 }
5989 }
5990 if (len) {
5991 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5992 kaddr[offset] &= ~mask_to_clear;
5993 }
5994}
5995
3387206f
ST
5996static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5997{
5998 unsigned long distance = (src > dst) ? src - dst : dst - src;
5999 return distance < len;
6000}
6001
d1310b2e
CM
6002static void copy_pages(struct page *dst_page, struct page *src_page,
6003 unsigned long dst_off, unsigned long src_off,
6004 unsigned long len)
6005{
a6591715 6006 char *dst_kaddr = page_address(dst_page);
d1310b2e 6007 char *src_kaddr;
727011e0 6008 int must_memmove = 0;
d1310b2e 6009
3387206f 6010 if (dst_page != src_page) {
a6591715 6011 src_kaddr = page_address(src_page);
3387206f 6012 } else {
d1310b2e 6013 src_kaddr = dst_kaddr;
727011e0
CM
6014 if (areas_overlap(src_off, dst_off, len))
6015 must_memmove = 1;
3387206f 6016 }
d1310b2e 6017
727011e0
CM
6018 if (must_memmove)
6019 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
6020 else
6021 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
6022}
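
areas_overlap() reduces overlap detection within a single page to one comparison: two ranges of length len overlap exactly when their start offsets are closer than len, and only then is memmove() (rather than memcpy()) needed. A tiny self-contained check of that predicate, with a hypothetical model_ prefix:

#include <assert.h>
#include <stdbool.h>

static bool model_areas_overlap(unsigned long src, unsigned long dst,
                                unsigned long len)
{
        unsigned long distance = src > dst ? src - dst : dst - src;

        return distance < len;
}

int main(void)
{
        assert(model_areas_overlap(100, 164, 100));     /* 64 < 100: overlap */
        assert(!model_areas_overlap(100, 300, 100));    /* disjoint ranges */
        return 0;
}
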
6023
6024void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
6025 unsigned long src_offset, unsigned long len)
6026{
0b246afa 6027 struct btrfs_fs_info *fs_info = dst->fs_info;
d1310b2e
CM
6028 size_t cur;
6029 size_t dst_off_in_page;
6030 size_t src_off_in_page;
7073017a 6031 size_t start_offset = offset_in_page(dst->start);
d1310b2e
CM
6032 unsigned long dst_i;
6033 unsigned long src_i;
6034
6035 if (src_offset + len > dst->len) {
0b246afa 6036 btrfs_err(fs_info,
5d163e0e
JM
6037 "memmove bogus src_offset %lu move len %lu dst len %lu",
6038 src_offset, len, dst->len);
290342f6 6039 BUG();
d1310b2e
CM
6040 }
6041 if (dst_offset + len > dst->len) {
0b246afa 6042 btrfs_err(fs_info,
5d163e0e
JM
6043 "memmove bogus dst_offset %lu move len %lu dst len %lu",
6044 dst_offset, len, dst->len);
290342f6 6045 BUG();
d1310b2e
CM
6046 }
6047
d397712b 6048 while (len > 0) {
7073017a
JT
6049 dst_off_in_page = offset_in_page(start_offset + dst_offset);
6050 src_off_in_page = offset_in_page(start_offset + src_offset);
d1310b2e 6051
09cbfeaf
KS
6052 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
6053 src_i = (start_offset + src_offset) >> PAGE_SHIFT;
d1310b2e 6054
09cbfeaf 6055 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
6056 src_off_in_page));
6057 cur = min_t(unsigned long, cur,
09cbfeaf 6058 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 6059
fb85fc9a 6060 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6061 dst_off_in_page, src_off_in_page, cur);
6062
6063 src_offset += cur;
6064 dst_offset += cur;
6065 len -= cur;
6066 }
6067}
d1310b2e
CM
6068
6069void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
6070 unsigned long src_offset, unsigned long len)
6071{
0b246afa 6072 struct btrfs_fs_info *fs_info = dst->fs_info;
d1310b2e
CM
6073 size_t cur;
6074 size_t dst_off_in_page;
6075 size_t src_off_in_page;
6076 unsigned long dst_end = dst_offset + len - 1;
6077 unsigned long src_end = src_offset + len - 1;
7073017a 6078 size_t start_offset = offset_in_page(dst->start);
d1310b2e
CM
6079 unsigned long dst_i;
6080 unsigned long src_i;
6081
6082 if (src_offset + len > dst->len) {
0b246afa 6083 btrfs_err(fs_info,
5d163e0e
JM
6084 "memmove bogus src_offset %lu move len %lu len %lu",
6085 src_offset, len, dst->len);
290342f6 6086 BUG();
d1310b2e
CM
6087 }
6088 if (dst_offset + len > dst->len) {
0b246afa 6089 btrfs_err(fs_info,
5d163e0e
JM
6090 "memmove bogus dst_offset %lu move len %lu len %lu",
6091 dst_offset, len, dst->len);
290342f6 6092 BUG();
d1310b2e 6093 }
727011e0 6094 if (dst_offset < src_offset) {
d1310b2e
CM
6095 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6096 return;
6097 }
d397712b 6098 while (len > 0) {
09cbfeaf
KS
6099 dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
6100 src_i = (start_offset + src_end) >> PAGE_SHIFT;
d1310b2e 6101
7073017a
JT
6102 dst_off_in_page = offset_in_page(start_offset + dst_end);
6103 src_off_in_page = offset_in_page(start_offset + src_end);
d1310b2e
CM
6104
6105 cur = min_t(unsigned long, len, src_off_in_page + 1);
6106 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 6107 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6108 dst_off_in_page - cur + 1,
6109 src_off_in_page - cur + 1, cur);
6110
6111 dst_end -= cur;
6112 src_end -= cur;
6113 len -= cur;
6114 }
6115}
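
memmove_extent_buffer picks the copy direction from the offsets: forward when the destination starts below the source, otherwise backwards from the tail so overlapping bytes are read before they are overwritten. A flat-buffer userspace model of that direction choice, without the per-page chunking done above:

#include <stddef.h>

static void model_move(char *buf, size_t dst_off, size_t src_off, size_t len)
{
        if (dst_off < src_off) {
                for (size_t i = 0; i < len; i++)        /* forward copy */
                        buf[dst_off + i] = buf[src_off + i];
        } else {
                for (size_t i = len; i > 0; i--)        /* backward copy */
                        buf[dst_off + i - 1] = buf[src_off + i - 1];
        }
}

int main(void)
{
        char buf[] = "abcdef";

        model_move(buf, 2, 0, 4);       /* overlapping move: "ababcd" */
        return buf[5] != 'd';
}
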
6af118ce 6116
f7a52a40 6117int try_release_extent_buffer(struct page *page)
19fe0a8b 6118{
6af118ce 6119 struct extent_buffer *eb;
6af118ce 6120
3083ee2e 6121 /*
01327610 6122 * We need to make sure nobody is attaching this page to an eb right
3083ee2e
JB
6123 * now.
6124 */
6125 spin_lock(&page->mapping->private_lock);
6126 if (!PagePrivate(page)) {
6127 spin_unlock(&page->mapping->private_lock);
4f2de97a 6128 return 1;
45f49bce 6129 }
6af118ce 6130
3083ee2e
JB
6131 eb = (struct extent_buffer *)page->private;
6132 BUG_ON(!eb);
19fe0a8b
MX
6133
6134 /*
3083ee2e
JB
6135 * This is a little awful but should be OK: we need to make sure
6136 * that the eb doesn't disappear out from under us while we're
6137 * looking at this page.
19fe0a8b 6138 */
3083ee2e 6139 spin_lock(&eb->refs_lock);
0b32f4bb 6140 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
6141 spin_unlock(&eb->refs_lock);
6142 spin_unlock(&page->mapping->private_lock);
6143 return 0;
b9473439 6144 }
3083ee2e 6145 spin_unlock(&page->mapping->private_lock);
897ca6e9 6146
19fe0a8b 6147 /*
3083ee2e
JB
6148 * If the tree ref isn't set then we know the ref on this eb is a real
6149 * ref, so just return; this page will likely be freed soon anyway.
19fe0a8b 6150 */
3083ee2e
JB
6151 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6152 spin_unlock(&eb->refs_lock);
6153 return 0;
b9473439 6154 }
19fe0a8b 6155
f7a52a40 6156 return release_extent_buffer(eb);
6af118ce 6157}