git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blame - fs/btrfs/extent_io.c
btrfs: sink argument tree to contiguous_readpages
b2441318 1// SPDX-License-Identifier: GPL-2.0
c1d7c514 2
d1310b2e
CM
3#include <linux/bitops.h>
4#include <linux/slab.h>
5#include <linux/bio.h>
6#include <linux/mm.h>
d1310b2e
CM
7#include <linux/pagemap.h>
8#include <linux/page-flags.h>
d1310b2e
CM
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
d1310b2e
CM
12#include <linux/writeback.h>
13#include <linux/pagevec.h>
268bb0ce 14#include <linux/prefetch.h>
90a887c9 15#include <linux/cleancache.h>
d1310b2e 16#include "extent_io.h"
9c7d3a54 17#include "extent-io-tree.h"
d1310b2e 18#include "extent_map.h"
902b22f3
DW
19#include "ctree.h"
20#include "btrfs_inode.h"
4a54c8c1 21#include "volumes.h"
21adbd5c 22#include "check-integrity.h"
0b32f4bb 23#include "locking.h"
606686ee 24#include "rcu-string.h"
fe09e16c 25#include "backref.h"
6af49dbd 26#include "disk-io.h"
d1310b2e 27
d1310b2e
CM
28static struct kmem_cache *extent_state_cache;
29static struct kmem_cache *extent_buffer_cache;
8ac9f7c1 30static struct bio_set btrfs_bioset;
d1310b2e 31
27a3507d
FM
32static inline bool extent_state_in_tree(const struct extent_state *state)
33{
34 return !RB_EMPTY_NODE(&state->rb_node);
35}
36
6d49ba1b 37#ifdef CONFIG_BTRFS_DEBUG
d1310b2e
CM
38static LIST_HEAD(buffers);
39static LIST_HEAD(states);
4bef0848 40
d397712b 41static DEFINE_SPINLOCK(leak_lock);
6d49ba1b
ES
42
43static inline
44void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
45{
46 unsigned long flags;
47
48 spin_lock_irqsave(&leak_lock, flags);
49 list_add(new, head);
50 spin_unlock_irqrestore(&leak_lock, flags);
51}
52
53static inline
54void btrfs_leak_debug_del(struct list_head *entry)
55{
56 unsigned long flags;
57
58 spin_lock_irqsave(&leak_lock, flags);
59 list_del(entry);
60 spin_unlock_irqrestore(&leak_lock, flags);
61}
62
33ca832f 63static inline void btrfs_extent_buffer_leak_debug_check(void)
6d49ba1b 64{
6d49ba1b
ES
65 struct extent_buffer *eb;
66
33ca832f
JB
67 while (!list_empty(&buffers)) {
68 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
69 pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
70 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
71 list_del(&eb->leak_list);
72 kmem_cache_free(extent_buffer_cache, eb);
73 }
74}
75
76static inline void btrfs_extent_state_leak_debug_check(void)
77{
78 struct extent_state *state;
79
6d49ba1b
ES
80 while (!list_empty(&states)) {
81 state = list_entry(states.next, struct extent_state, leak_list);
9ee49a04 82 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
27a3507d
FM
83 state->start, state->end, state->state,
84 extent_state_in_tree(state),
b7ac31b7 85 refcount_read(&state->refs));
6d49ba1b
ES
86 list_del(&state->leak_list);
87 kmem_cache_free(extent_state_cache, state);
88 }
6d49ba1b 89}
8d599ae1 90
a5dee37d
JB
91#define btrfs_debug_check_extent_io_range(tree, start, end) \
92 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
8d599ae1 93static inline void __btrfs_debug_check_extent_io_range(const char *caller,
a5dee37d 94 struct extent_io_tree *tree, u64 start, u64 end)
8d599ae1 95{
65a680f6
NB
96 struct inode *inode = tree->private_data;
97 u64 isize;
98
99 if (!inode || !is_data_inode(inode))
100 return;
101
102 isize = i_size_read(inode);
103 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
104 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
105 "%s: ino %llu isize %llu odd range [%llu,%llu]",
106 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
107 }
8d599ae1 108}
6d49ba1b
ES
109#else
110#define btrfs_leak_debug_add(new, head) do {} while (0)
111#define btrfs_leak_debug_del(entry) do {} while (0)
33ca832f
JB
112#define btrfs_extent_buffer_leak_debug_check() do {} while (0)
113#define btrfs_extent_state_leak_debug_check() do {} while (0)
8d599ae1 114#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
4bef0848 115#endif
d1310b2e 116
d1310b2e
CM
117struct tree_entry {
118 u64 start;
119 u64 end;
d1310b2e
CM
120 struct rb_node rb_node;
121};
122
123struct extent_page_data {
124 struct bio *bio;
771ed689
CM
125 /* tells writepage not to lock the state bits for this range;
126 * it still does the unlocking
127 */
ffbd517d
CM
128 unsigned int extent_locked:1;
129
70fd7614 130 /* tells the submit_bio code to use REQ_SYNC */
ffbd517d 131 unsigned int sync_io:1;
d1310b2e
CM
132};
133
57599c7e 134static int add_extent_changeset(struct extent_state *state, unsigned bits,
d38ed27f
QW
135 struct extent_changeset *changeset,
136 int set)
137{
138 int ret;
139
140 if (!changeset)
57599c7e 141 return 0;
d38ed27f 142 if (set && (state->state & bits) == bits)
57599c7e 143 return 0;
fefdc557 144 if (!set && (state->state & bits) == 0)
57599c7e 145 return 0;
d38ed27f 146 changeset->bytes_changed += state->end - state->start + 1;
53d32359 147 ret = ulist_add(&changeset->range_changed, state->start, state->end,
d38ed27f 148 GFP_ATOMIC);
57599c7e 149 return ret;
d38ed27f
QW
150}
151
bb58eb9e
QW
152static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
153 unsigned long bio_flags)
154{
155 blk_status_t ret = 0;
bb58eb9e 156 struct extent_io_tree *tree = bio->bi_private;
bb58eb9e
QW
157
158 bio->bi_private = NULL;
159
160 if (tree->ops)
161 ret = tree->ops->submit_bio_hook(tree->private_data, bio,
50489a57 162 mirror_num, bio_flags);
bb58eb9e
QW
163 else
164 btrfsic_submit_bio(bio);
165
166 return blk_status_to_errno(ret);
167}
168
3065976b
QW
169/* Cleanup unsubmitted bios */
170static void end_write_bio(struct extent_page_data *epd, int ret)
171{
172 if (epd->bio) {
173 epd->bio->bi_status = errno_to_blk_status(ret);
174 bio_endio(epd->bio);
175 epd->bio = NULL;
176 }
177}
178
f4340622
QW
179/*
180 * Submit bio from extent page data via submit_one_bio
181 *
182 * Return 0 if everything is OK.
183 * Return <0 for error.
184 */
185static int __must_check flush_write_bio(struct extent_page_data *epd)
bb58eb9e 186{
f4340622 187 int ret = 0;
bb58eb9e 188
f4340622 189 if (epd->bio) {
bb58eb9e 190 ret = submit_one_bio(epd->bio, 0, 0);
f4340622
QW
191 /*
192 * Clean up of epd->bio is handled by its endio function.
193 * And endio is either triggered by successful bio execution
194 * or the error handler of submit bio hook.
195 * So at this point, no matter what happened, we don't need
196 * to clean up epd->bio.
197 */
bb58eb9e
QW
198 epd->bio = NULL;
199 }
f4340622 200 return ret;
bb58eb9e 201}
e2932ee0 202
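/*
 * Illustrative sketch (editor addition, not part of extent_io.c): how a
 * write path that batches pages into epd->bio typically finishes.  On
 * success the queued bio is submitted; on error it is failed instead of
 * being sent to disk.  The 'write_ret' parameter is hypothetical.
 */
static int example_finish_write(struct extent_page_data *epd, int write_ret)
{
        if (write_ret) {
                /* Fail the unsubmitted bio with the original error. */
                end_write_bio(epd, write_ret);
                return write_ret;
        }
        /* Submit whatever is still pending; returns 0 or a negative errno. */
        return flush_write_bio(epd);
}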
6f0d04f8 203int __init extent_state_cache_init(void)
d1310b2e 204{
837e1972 205 extent_state_cache = kmem_cache_create("btrfs_extent_state",
9601e3f6 206 sizeof(struct extent_state), 0,
fba4b697 207 SLAB_MEM_SPREAD, NULL);
d1310b2e
CM
208 if (!extent_state_cache)
209 return -ENOMEM;
6f0d04f8
JB
210 return 0;
211}
d1310b2e 212
6f0d04f8
JB
213int __init extent_io_init(void)
214{
837e1972 215 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
9601e3f6 216 sizeof(struct extent_buffer), 0,
fba4b697 217 SLAB_MEM_SPREAD, NULL);
d1310b2e 218 if (!extent_buffer_cache)
6f0d04f8 219 return -ENOMEM;
9be3395b 220
8ac9f7c1
KO
221 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
222 offsetof(struct btrfs_io_bio, bio),
223 BIOSET_NEED_BVECS))
9be3395b 224 goto free_buffer_cache;
b208c2f7 225
8ac9f7c1 226 if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
b208c2f7
DW
227 goto free_bioset;
228
d1310b2e
CM
229 return 0;
230
b208c2f7 231free_bioset:
8ac9f7c1 232 bioset_exit(&btrfs_bioset);
b208c2f7 233
9be3395b
CM
234free_buffer_cache:
235 kmem_cache_destroy(extent_buffer_cache);
236 extent_buffer_cache = NULL;
6f0d04f8
JB
237 return -ENOMEM;
238}
9be3395b 239
6f0d04f8
JB
240void __cold extent_state_cache_exit(void)
241{
242 btrfs_extent_state_leak_debug_check();
d1310b2e 243 kmem_cache_destroy(extent_state_cache);
d1310b2e
CM
244}
245
e67c718b 246void __cold extent_io_exit(void)
d1310b2e 247{
33ca832f 248 btrfs_extent_buffer_leak_debug_check();
8c0a8537
KS
249
250 /*
251 * Make sure all delayed rcu free are flushed before we
252 * destroy caches.
253 */
254 rcu_barrier();
5598e900 255 kmem_cache_destroy(extent_buffer_cache);
8ac9f7c1 256 bioset_exit(&btrfs_bioset);
d1310b2e
CM
257}
258
41a2ee75
JB
259/*
260 * For the file_extent_tree, we want to hold the inode lock when we lookup and
261 * update the disk_i_size, but lockdep will complain because for our io_tree we
262 * hold the tree lock and then take the inode lock when setting delalloc. These two things
263 * are unrelated, so make a class for the file_extent_tree so we don't get the
264 * two locking patterns mixed up.
265 */
266static struct lock_class_key file_extent_tree_class;
267
c258d6e3 268void extent_io_tree_init(struct btrfs_fs_info *fs_info,
43eb5f29
QW
269 struct extent_io_tree *tree, unsigned int owner,
270 void *private_data)
d1310b2e 271{
c258d6e3 272 tree->fs_info = fs_info;
6bef4d31 273 tree->state = RB_ROOT;
d1310b2e
CM
274 tree->ops = NULL;
275 tree->dirty_bytes = 0;
70dec807 276 spin_lock_init(&tree->lock);
c6100a4b 277 tree->private_data = private_data;
43eb5f29 278 tree->owner = owner;
41a2ee75
JB
279 if (owner == IO_TREE_INODE_FILE_EXTENT)
280 lockdep_set_class(&tree->lock, &file_extent_tree_class);
d1310b2e 281}
d1310b2e 282
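/*
 * Illustrative sketch (editor addition, not part of the original file): an
 * inode-owned tree initialized with IO_TREE_INODE_FILE_EXTENT picks up the
 * file_extent_tree lockdep class set above.  The btrfs_inode field names
 * used here are an assumption about the surrounding headers.
 */
static void example_init_file_extent_tree(struct btrfs_inode *inode)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;

        extent_io_tree_init(fs_info, &inode->file_extent_tree,
                            IO_TREE_INODE_FILE_EXTENT, &inode->vfs_inode);
}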
41e7acd3
NB
283void extent_io_tree_release(struct extent_io_tree *tree)
284{
285 spin_lock(&tree->lock);
286 /*
287 * Do a single barrier for the waitqueue_active check here, the state
288 * of the waitqueue should not change once extent_io_tree_release is
289 * called.
290 */
291 smp_mb();
292 while (!RB_EMPTY_ROOT(&tree->state)) {
293 struct rb_node *node;
294 struct extent_state *state;
295
296 node = rb_first(&tree->state);
297 state = rb_entry(node, struct extent_state, rb_node);
298 rb_erase(&state->rb_node, &tree->state);
299 RB_CLEAR_NODE(&state->rb_node);
300 /*
301 * btree io trees aren't supposed to have tasks waiting for
302 * changes in the flags of extent states ever.
303 */
304 ASSERT(!waitqueue_active(&state->wq));
305 free_extent_state(state);
306
307 cond_resched_lock(&tree->lock);
308 }
309 spin_unlock(&tree->lock);
310}
311
b2950863 312static struct extent_state *alloc_extent_state(gfp_t mask)
d1310b2e
CM
313{
314 struct extent_state *state;
d1310b2e 315
3ba7ab22
MH
316 /*
317 * The given mask might be not appropriate for the slab allocator,
318 * drop the unsupported bits
319 */
320 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
d1310b2e 321 state = kmem_cache_alloc(extent_state_cache, mask);
2b114d1d 322 if (!state)
d1310b2e
CM
323 return state;
324 state->state = 0;
47dc196a 325 state->failrec = NULL;
27a3507d 326 RB_CLEAR_NODE(&state->rb_node);
6d49ba1b 327 btrfs_leak_debug_add(&state->leak_list, &states);
b7ac31b7 328 refcount_set(&state->refs, 1);
d1310b2e 329 init_waitqueue_head(&state->wq);
143bede5 330 trace_alloc_extent_state(state, mask, _RET_IP_);
d1310b2e
CM
331 return state;
332}
d1310b2e 333
4845e44f 334void free_extent_state(struct extent_state *state)
d1310b2e 335{
d1310b2e
CM
336 if (!state)
337 return;
b7ac31b7 338 if (refcount_dec_and_test(&state->refs)) {
27a3507d 339 WARN_ON(extent_state_in_tree(state));
6d49ba1b 340 btrfs_leak_debug_del(&state->leak_list);
143bede5 341 trace_free_extent_state(state, _RET_IP_);
d1310b2e
CM
342 kmem_cache_free(extent_state_cache, state);
343 }
344}
d1310b2e 345
f2071b21
FM
346static struct rb_node *tree_insert(struct rb_root *root,
347 struct rb_node *search_start,
348 u64 offset,
12cfbad9
FDBM
349 struct rb_node *node,
350 struct rb_node ***p_in,
351 struct rb_node **parent_in)
d1310b2e 352{
f2071b21 353 struct rb_node **p;
d397712b 354 struct rb_node *parent = NULL;
d1310b2e
CM
355 struct tree_entry *entry;
356
12cfbad9
FDBM
357 if (p_in && parent_in) {
358 p = *p_in;
359 parent = *parent_in;
360 goto do_insert;
361 }
362
f2071b21 363 p = search_start ? &search_start : &root->rb_node;
d397712b 364 while (*p) {
d1310b2e
CM
365 parent = *p;
366 entry = rb_entry(parent, struct tree_entry, rb_node);
367
368 if (offset < entry->start)
369 p = &(*p)->rb_left;
370 else if (offset > entry->end)
371 p = &(*p)->rb_right;
372 else
373 return parent;
374 }
375
12cfbad9 376do_insert:
d1310b2e
CM
377 rb_link_node(node, parent, p);
378 rb_insert_color(node, root);
379 return NULL;
380}
381
8666e638
NB
382/**
383 * __etree_search - search @tree for an entry that contains @offset. Such
384 * entry would have entry->start <= offset && entry->end >= offset.
385 *
386 * @tree - the tree to search
387 * @offset - offset that should fall within an entry in @tree
388 * @next_ret - pointer to the first entry whose range ends after @offset
389 * @prev_ret - pointer to the first entry whose range begins before @offset
390 * @p_ret - pointer where new node should be anchored (used when inserting an
391 * entry in the tree)
392 * @parent_ret - points to entry which would have been the parent of the entry,
393 * containing @offset
394 *
395 * This function returns a pointer to the entry that contains @offset byte
396 * address. If no such entry exists, then NULL is returned and the other
397 * pointer arguments to the function are filled, otherwise the found entry is
398 * returned and other pointers are left untouched.
399 */
80ea96b1 400static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
12cfbad9 401 struct rb_node **next_ret,
352646c7 402 struct rb_node **prev_ret,
12cfbad9
FDBM
403 struct rb_node ***p_ret,
404 struct rb_node **parent_ret)
d1310b2e 405{
80ea96b1 406 struct rb_root *root = &tree->state;
12cfbad9 407 struct rb_node **n = &root->rb_node;
d1310b2e
CM
408 struct rb_node *prev = NULL;
409 struct rb_node *orig_prev = NULL;
410 struct tree_entry *entry;
411 struct tree_entry *prev_entry = NULL;
412
12cfbad9
FDBM
413 while (*n) {
414 prev = *n;
415 entry = rb_entry(prev, struct tree_entry, rb_node);
d1310b2e
CM
416 prev_entry = entry;
417
418 if (offset < entry->start)
12cfbad9 419 n = &(*n)->rb_left;
d1310b2e 420 else if (offset > entry->end)
12cfbad9 421 n = &(*n)->rb_right;
d397712b 422 else
12cfbad9 423 return *n;
d1310b2e
CM
424 }
425
12cfbad9
FDBM
426 if (p_ret)
427 *p_ret = n;
428 if (parent_ret)
429 *parent_ret = prev;
430
352646c7 431 if (next_ret) {
d1310b2e 432 orig_prev = prev;
d397712b 433 while (prev && offset > prev_entry->end) {
d1310b2e
CM
434 prev = rb_next(prev);
435 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
436 }
352646c7 437 *next_ret = prev;
d1310b2e
CM
438 prev = orig_prev;
439 }
440
352646c7 441 if (prev_ret) {
d1310b2e 442 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
d397712b 443 while (prev && offset < prev_entry->start) {
d1310b2e
CM
444 prev = rb_prev(prev);
445 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
446 }
352646c7 447 *prev_ret = prev;
d1310b2e
CM
448 }
449 return NULL;
450}
451
12cfbad9
FDBM
452static inline struct rb_node *
453tree_search_for_insert(struct extent_io_tree *tree,
454 u64 offset,
455 struct rb_node ***p_ret,
456 struct rb_node **parent_ret)
d1310b2e 457{
352646c7 458 struct rb_node *next = NULL;
d1310b2e 459 struct rb_node *ret;
70dec807 460
352646c7 461 ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
d397712b 462 if (!ret)
352646c7 463 return next;
d1310b2e
CM
464 return ret;
465}
466
12cfbad9
FDBM
467static inline struct rb_node *tree_search(struct extent_io_tree *tree,
468 u64 offset)
469{
470 return tree_search_for_insert(tree, offset, NULL, NULL);
471}
472
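/*
 * Illustrative sketch (editor addition): how the search helpers above are
 * meant to be used.  Under tree->lock, tree_search() returns the first
 * state ending at or after 'offset'; the caller still has to check whether
 * that state really contains the offset.
 */
static struct extent_state *example_lookup(struct extent_io_tree *tree,
                                           u64 offset)
{
        struct extent_state *state = NULL;
        struct rb_node *node;

        spin_lock(&tree->lock);
        node = tree_search(tree, offset);
        if (node) {
                state = rb_entry(node, struct extent_state, rb_node);
                if (state->start > offset)      /* offset falls in a hole */
                        state = NULL;
                else
                        refcount_inc(&state->refs);
        }
        spin_unlock(&tree->lock);
        /* Caller drops the reference with free_extent_state(). */
        return state;
}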
d1310b2e
CM
473/*
474 * utility function to look for merge candidates inside a given range.
475 * Any extents with matching state are merged together into a single
476 * extent in the tree. Extents with EXTENT_LOCKED or EXTENT_BOUNDARY in their
477 * state field are not merged because the end_io handlers need to be able to do
478 * operations on them without sleeping (or doing allocations/splits).
479 *
480 * This should be called with the tree lock held.
481 */
1bf85046
JM
482static void merge_state(struct extent_io_tree *tree,
483 struct extent_state *state)
d1310b2e
CM
484{
485 struct extent_state *other;
486 struct rb_node *other_node;
487
8882679e 488 if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
1bf85046 489 return;
d1310b2e
CM
490
491 other_node = rb_prev(&state->rb_node);
492 if (other_node) {
493 other = rb_entry(other_node, struct extent_state, rb_node);
494 if (other->end == state->start - 1 &&
495 other->state == state->state) {
5c848198
NB
496 if (tree->private_data &&
497 is_data_inode(tree->private_data))
498 btrfs_merge_delalloc_extent(tree->private_data,
499 state, other);
d1310b2e 500 state->start = other->start;
d1310b2e 501 rb_erase(&other->rb_node, &tree->state);
27a3507d 502 RB_CLEAR_NODE(&other->rb_node);
d1310b2e
CM
503 free_extent_state(other);
504 }
505 }
506 other_node = rb_next(&state->rb_node);
507 if (other_node) {
508 other = rb_entry(other_node, struct extent_state, rb_node);
509 if (other->start == state->end + 1 &&
510 other->state == state->state) {
5c848198
NB
511 if (tree->private_data &&
512 is_data_inode(tree->private_data))
513 btrfs_merge_delalloc_extent(tree->private_data,
514 state, other);
df98b6e2 515 state->end = other->end;
df98b6e2 516 rb_erase(&other->rb_node, &tree->state);
27a3507d 517 RB_CLEAR_NODE(&other->rb_node);
df98b6e2 518 free_extent_state(other);
d1310b2e
CM
519 }
520 }
d1310b2e
CM
521}
522
3150b699 523static void set_state_bits(struct extent_io_tree *tree,
d38ed27f
QW
524 struct extent_state *state, unsigned *bits,
525 struct extent_changeset *changeset);
3150b699 526
d1310b2e
CM
527/*
528 * insert an extent_state struct into the tree. 'bits' are set on the
529 * struct before it is inserted.
530 *
531 * This may return -EEXIST if the extent is already there, in which case the
532 * state struct is freed.
533 *
534 * The tree lock is not taken internally. This is a utility function and
535 * probably isn't what you want to call (see set/clear_extent_bit).
536 */
537static int insert_state(struct extent_io_tree *tree,
538 struct extent_state *state, u64 start, u64 end,
12cfbad9
FDBM
539 struct rb_node ***p,
540 struct rb_node **parent,
d38ed27f 541 unsigned *bits, struct extent_changeset *changeset)
d1310b2e
CM
542{
543 struct rb_node *node;
544
2792237d
DS
545 if (end < start) {
546 btrfs_err(tree->fs_info,
547 "insert state: end < start %llu %llu", end, start);
548 WARN_ON(1);
549 }
d1310b2e
CM
550 state->start = start;
551 state->end = end;
9ed74f2d 552
d38ed27f 553 set_state_bits(tree, state, bits, changeset);
3150b699 554
f2071b21 555 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
d1310b2e
CM
556 if (node) {
557 struct extent_state *found;
558 found = rb_entry(node, struct extent_state, rb_node);
2792237d
DS
559 btrfs_err(tree->fs_info,
560 "found node %llu %llu on insert of %llu %llu",
c1c9ff7c 561 found->start, found->end, start, end);
d1310b2e
CM
562 return -EEXIST;
563 }
564 merge_state(tree, state);
565 return 0;
566}
567
568/*
569 * split a given extent state struct in two, inserting the preallocated
570 * struct 'prealloc' as the newly created second half. 'split' indicates an
571 * offset inside 'orig' where it should be split.
572 *
573 * Before calling,
574 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
575 * are two extent state structs in the tree:
576 * prealloc: [orig->start, split - 1]
577 * orig: [ split, orig->end ]
578 *
579 * The tree locks are not taken by this function. They need to be held
580 * by the caller.
581 */
582static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
583 struct extent_state *prealloc, u64 split)
584{
585 struct rb_node *node;
9ed74f2d 586
abbb55f4
NB
587 if (tree->private_data && is_data_inode(tree->private_data))
588 btrfs_split_delalloc_extent(tree->private_data, orig, split);
9ed74f2d 589
d1310b2e
CM
590 prealloc->start = orig->start;
591 prealloc->end = split - 1;
592 prealloc->state = orig->state;
593 orig->start = split;
594
f2071b21
FM
595 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
596 &prealloc->rb_node, NULL, NULL);
d1310b2e 597 if (node) {
d1310b2e
CM
598 free_extent_state(prealloc);
599 return -EEXIST;
600 }
601 return 0;
602}
603
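/*
 * Illustrative sketch (editor addition): the split described above, as the
 * range helpers later in this file use it.  Splitting a state that covers
 * [0, 8191] at offset 4096 leaves 'prealloc' owning [0, 4095] and 'orig'
 * shrunk to [4096, 8191]; both records stay linked in the tree.  tree->lock
 * must already be held, as split_state() requires.
 */
static int example_split_front(struct extent_io_tree *tree,
                               struct extent_state *orig, u64 split)
{
        struct extent_state *prealloc;

        /*
         * Callers normally preallocate outside tree->lock; GFP_ATOMIC here
         * only keeps the sketch self-contained.
         */
        prealloc = alloc_extent_state(GFP_ATOMIC);
        if (!prealloc)
                return -ENOMEM;

        /* On -EEXIST, split_state() has already freed prealloc. */
        return split_state(tree, orig, prealloc, split);
}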
cdc6a395
LZ
604static struct extent_state *next_state(struct extent_state *state)
605{
606 struct rb_node *next = rb_next(&state->rb_node);
607 if (next)
608 return rb_entry(next, struct extent_state, rb_node);
609 else
610 return NULL;
611}
612
d1310b2e
CM
613/*
614 * utility function to clear some bits in an extent state struct.
52042d8e 615 * it will optionally wake up anyone waiting on this state (wake == 1).
d1310b2e
CM
616 *
617 * If no bits are set on the state struct after clearing things, the
618 * struct is freed and removed from the tree
619 */
cdc6a395
LZ
620static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
621 struct extent_state *state,
fefdc557
QW
622 unsigned *bits, int wake,
623 struct extent_changeset *changeset)
d1310b2e 624{
cdc6a395 625 struct extent_state *next;
9ee49a04 626 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
57599c7e 627 int ret;
d1310b2e 628
0ca1f7ce 629 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
d1310b2e
CM
630 u64 range = state->end - state->start + 1;
631 WARN_ON(range > tree->dirty_bytes);
632 tree->dirty_bytes -= range;
633 }
a36bb5f9
NB
634
635 if (tree->private_data && is_data_inode(tree->private_data))
636 btrfs_clear_delalloc_extent(tree->private_data, state, bits);
637
57599c7e
DS
638 ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
639 BUG_ON(ret < 0);
32c00aff 640 state->state &= ~bits_to_clear;
d1310b2e
CM
641 if (wake)
642 wake_up(&state->wq);
0ca1f7ce 643 if (state->state == 0) {
cdc6a395 644 next = next_state(state);
27a3507d 645 if (extent_state_in_tree(state)) {
d1310b2e 646 rb_erase(&state->rb_node, &tree->state);
27a3507d 647 RB_CLEAR_NODE(&state->rb_node);
d1310b2e
CM
648 free_extent_state(state);
649 } else {
650 WARN_ON(1);
651 }
652 } else {
653 merge_state(tree, state);
cdc6a395 654 next = next_state(state);
d1310b2e 655 }
cdc6a395 656 return next;
d1310b2e
CM
657}
658
8233767a
XG
659static struct extent_state *
660alloc_extent_state_atomic(struct extent_state *prealloc)
661{
662 if (!prealloc)
663 prealloc = alloc_extent_state(GFP_ATOMIC);
664
665 return prealloc;
666}
667
48a3b636 668static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
c2d904e0 669{
05912a3c
DS
670 struct inode *inode = tree->private_data;
671
672 btrfs_panic(btrfs_sb(inode->i_sb), err,
673 "locking error: extent tree was modified by another thread while locked");
c2d904e0
JM
674}
675
d1310b2e
CM
676/*
677 * clear some bits on a range in the tree. This may require splitting
678 * or inserting elements in the tree, so the gfp mask is used to
679 * indicate which allocations or sleeping are allowed.
680 *
681 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
682 * the given range from the tree regardless of state (ie for truncate).
683 *
684 * the range [start, end] is inclusive.
685 *
6763af84 686 * This takes the tree lock, and returns 0 on success and < 0 on error.
d1310b2e 687 */
66b0c887 688int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
fefdc557
QW
689 unsigned bits, int wake, int delete,
690 struct extent_state **cached_state,
691 gfp_t mask, struct extent_changeset *changeset)
d1310b2e
CM
692{
693 struct extent_state *state;
2c64c53d 694 struct extent_state *cached;
d1310b2e
CM
695 struct extent_state *prealloc = NULL;
696 struct rb_node *node;
5c939df5 697 u64 last_end;
d1310b2e 698 int err;
2ac55d41 699 int clear = 0;
d1310b2e 700
a5dee37d 701 btrfs_debug_check_extent_io_range(tree, start, end);
a1d19847 702 trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
8d599ae1 703
7ee9e440
JB
704 if (bits & EXTENT_DELALLOC)
705 bits |= EXTENT_NORESERVE;
706
0ca1f7ce
YZ
707 if (delete)
708 bits |= ~EXTENT_CTLBITS;
0ca1f7ce 709
8882679e 710 if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
2ac55d41 711 clear = 1;
d1310b2e 712again:
d0164adc 713 if (!prealloc && gfpflags_allow_blocking(mask)) {
c7bc6319
FM
714 /*
715 * Don't care for allocation failure here because we might end
716 * up not needing the pre-allocated extent state at all, which
717 * is the case if we only have in the tree extent states that
718 * cover our input range and don't cover any other range.
719 * If we end up needing a new extent state we allocate it later.
720 */
d1310b2e 721 prealloc = alloc_extent_state(mask);
d1310b2e
CM
722 }
723
cad321ad 724 spin_lock(&tree->lock);
2c64c53d
CM
725 if (cached_state) {
726 cached = *cached_state;
2ac55d41
JB
727
728 if (clear) {
729 *cached_state = NULL;
730 cached_state = NULL;
731 }
732
27a3507d
FM
733 if (cached && extent_state_in_tree(cached) &&
734 cached->start <= start && cached->end > start) {
2ac55d41 735 if (clear)
b7ac31b7 736 refcount_dec(&cached->refs);
2c64c53d 737 state = cached;
42daec29 738 goto hit_next;
2c64c53d 739 }
2ac55d41
JB
740 if (clear)
741 free_extent_state(cached);
2c64c53d 742 }
d1310b2e
CM
743 /*
744 * this search will find the extents that end after
745 * our range starts
746 */
80ea96b1 747 node = tree_search(tree, start);
d1310b2e
CM
748 if (!node)
749 goto out;
750 state = rb_entry(node, struct extent_state, rb_node);
2c64c53d 751hit_next:
d1310b2e
CM
752 if (state->start > end)
753 goto out;
754 WARN_ON(state->end < start);
5c939df5 755 last_end = state->end;
d1310b2e 756
0449314a 757 /* the state doesn't have the wanted bits, go ahead */
cdc6a395
LZ
758 if (!(state->state & bits)) {
759 state = next_state(state);
0449314a 760 goto next;
cdc6a395 761 }
0449314a 762
d1310b2e
CM
763 /*
764 * | ---- desired range ---- |
765 * | state | or
766 * | ------------- state -------------- |
767 *
768 * We need to split the extent we found, and may flip
769 * bits on second half.
770 *
771 * If the extent we found extends past our range, we
772 * just split and search again. It'll get split again
773 * the next time though.
774 *
775 * If the extent we found is inside our range, we clear
776 * the desired bit on it.
777 */
778
779 if (state->start < start) {
8233767a
XG
780 prealloc = alloc_extent_state_atomic(prealloc);
781 BUG_ON(!prealloc);
d1310b2e 782 err = split_state(tree, state, prealloc, start);
c2d904e0
JM
783 if (err)
784 extent_io_tree_panic(tree, err);
785
d1310b2e
CM
786 prealloc = NULL;
787 if (err)
788 goto out;
789 if (state->end <= end) {
fefdc557
QW
790 state = clear_state_bit(tree, state, &bits, wake,
791 changeset);
d1ac6e41 792 goto next;
d1310b2e
CM
793 }
794 goto search_again;
795 }
796 /*
797 * | ---- desired range ---- |
798 * | state |
799 * We need to split the extent, and clear the bit
800 * on the first half
801 */
802 if (state->start <= end && state->end > end) {
8233767a
XG
803 prealloc = alloc_extent_state_atomic(prealloc);
804 BUG_ON(!prealloc);
d1310b2e 805 err = split_state(tree, state, prealloc, end + 1);
c2d904e0
JM
806 if (err)
807 extent_io_tree_panic(tree, err);
808
d1310b2e
CM
809 if (wake)
810 wake_up(&state->wq);
42daec29 811
fefdc557 812 clear_state_bit(tree, prealloc, &bits, wake, changeset);
9ed74f2d 813
d1310b2e
CM
814 prealloc = NULL;
815 goto out;
816 }
42daec29 817
fefdc557 818 state = clear_state_bit(tree, state, &bits, wake, changeset);
0449314a 819next:
5c939df5
YZ
820 if (last_end == (u64)-1)
821 goto out;
822 start = last_end + 1;
cdc6a395 823 if (start <= end && state && !need_resched())
692e5759 824 goto hit_next;
d1310b2e
CM
825
826search_again:
827 if (start > end)
828 goto out;
cad321ad 829 spin_unlock(&tree->lock);
d0164adc 830 if (gfpflags_allow_blocking(mask))
d1310b2e
CM
831 cond_resched();
832 goto again;
7ab5cb2a
DS
833
834out:
835 spin_unlock(&tree->lock);
836 if (prealloc)
837 free_extent_state(prealloc);
838
839 return 0;
840
d1310b2e 841}
d1310b2e 842
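/*
 * Illustrative sketch (editor addition): per the comment above, passing
 * wake == 1 and delete == 1 drops every state record in the range no matter
 * which bits it carries, which is what truncation wants.  The bit mask and
 * the whole-file range below are arbitrary example values.
 */
static void example_drop_all_states(struct extent_io_tree *tree)
{
        __clear_extent_bit(tree, 0, (u64)-1, EXTENT_LOCKED | EXTENT_DELALLOC,
                           1, 1, NULL, GFP_NOFS, NULL);
}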
143bede5
JM
843static void wait_on_state(struct extent_io_tree *tree,
844 struct extent_state *state)
641f5219
CH
845 __releases(tree->lock)
846 __acquires(tree->lock)
d1310b2e
CM
847{
848 DEFINE_WAIT(wait);
849 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
cad321ad 850 spin_unlock(&tree->lock);
d1310b2e 851 schedule();
cad321ad 852 spin_lock(&tree->lock);
d1310b2e 853 finish_wait(&state->wq, &wait);
d1310b2e
CM
854}
855
856/*
857 * waits for one or more bits to clear on a range in the state tree.
858 * The range [start, end] is inclusive.
859 * The tree lock is taken by this function
860 */
41074888
DS
861static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
862 unsigned long bits)
d1310b2e
CM
863{
864 struct extent_state *state;
865 struct rb_node *node;
866
a5dee37d 867 btrfs_debug_check_extent_io_range(tree, start, end);
8d599ae1 868
cad321ad 869 spin_lock(&tree->lock);
d1310b2e
CM
870again:
871 while (1) {
872 /*
873 * this search will find all the extents that end after
874 * our range starts
875 */
80ea96b1 876 node = tree_search(tree, start);
c50d3e71 877process_node:
d1310b2e
CM
878 if (!node)
879 break;
880
881 state = rb_entry(node, struct extent_state, rb_node);
882
883 if (state->start > end)
884 goto out;
885
886 if (state->state & bits) {
887 start = state->start;
b7ac31b7 888 refcount_inc(&state->refs);
d1310b2e
CM
889 wait_on_state(tree, state);
890 free_extent_state(state);
891 goto again;
892 }
893 start = state->end + 1;
894
895 if (start > end)
896 break;
897
c50d3e71
FM
898 if (!cond_resched_lock(&tree->lock)) {
899 node = rb_next(node);
900 goto process_node;
901 }
d1310b2e
CM
902 }
903out:
cad321ad 904 spin_unlock(&tree->lock);
d1310b2e 905}
d1310b2e 906
1bf85046 907static void set_state_bits(struct extent_io_tree *tree,
d1310b2e 908 struct extent_state *state,
d38ed27f 909 unsigned *bits, struct extent_changeset *changeset)
d1310b2e 910{
9ee49a04 911 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
57599c7e 912 int ret;
9ed74f2d 913
e06a1fc9
NB
914 if (tree->private_data && is_data_inode(tree->private_data))
915 btrfs_set_delalloc_extent(tree->private_data, state, bits);
916
0ca1f7ce 917 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
d1310b2e
CM
918 u64 range = state->end - state->start + 1;
919 tree->dirty_bytes += range;
920 }
57599c7e
DS
921 ret = add_extent_changeset(state, bits_to_set, changeset, 1);
922 BUG_ON(ret < 0);
0ca1f7ce 923 state->state |= bits_to_set;
d1310b2e
CM
924}
925
e38e2ed7
FM
926static void cache_state_if_flags(struct extent_state *state,
927 struct extent_state **cached_ptr,
9ee49a04 928 unsigned flags)
2c64c53d
CM
929{
930 if (cached_ptr && !(*cached_ptr)) {
e38e2ed7 931 if (!flags || (state->state & flags)) {
2c64c53d 932 *cached_ptr = state;
b7ac31b7 933 refcount_inc(&state->refs);
2c64c53d
CM
934 }
935 }
936}
937
e38e2ed7
FM
938static void cache_state(struct extent_state *state,
939 struct extent_state **cached_ptr)
940{
941 return cache_state_if_flags(state, cached_ptr,
8882679e 942 EXTENT_LOCKED | EXTENT_BOUNDARY);
e38e2ed7
FM
943}
944
d1310b2e 945/*
1edbb734
CM
946 * set some bits on a range in the tree. This may require allocations or
947 * sleeping, so the gfp mask is used to indicate what is allowed.
d1310b2e 948 *
1edbb734
CM
949 * If any of the exclusive bits are set, this will fail with -EEXIST if some
950 * part of the range already has the desired bits set. The start of the
951 * existing range is returned in failed_start in this case.
d1310b2e 952 *
1edbb734 953 * [start, end] is inclusive. This takes the tree lock.
d1310b2e 954 */
1edbb734 955
3fbe5c02
JM
956static int __must_check
957__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
9ee49a04 958 unsigned bits, unsigned exclusive_bits,
41074888 959 u64 *failed_start, struct extent_state **cached_state,
d38ed27f 960 gfp_t mask, struct extent_changeset *changeset)
d1310b2e
CM
961{
962 struct extent_state *state;
963 struct extent_state *prealloc = NULL;
964 struct rb_node *node;
12cfbad9
FDBM
965 struct rb_node **p;
966 struct rb_node *parent;
d1310b2e 967 int err = 0;
d1310b2e
CM
968 u64 last_start;
969 u64 last_end;
42daec29 970
a5dee37d 971 btrfs_debug_check_extent_io_range(tree, start, end);
a1d19847 972 trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
8d599ae1 973
d1310b2e 974again:
d0164adc 975 if (!prealloc && gfpflags_allow_blocking(mask)) {
059f791c
DS
976 /*
977 * Don't care for allocation failure here because we might end
978 * up not needing the pre-allocated extent state at all, which
979 * is the case if we only have in the tree extent states that
980 * cover our input range and don't cover any other range.
981 * If we end up needing a new extent state we allocate it later.
982 */
d1310b2e 983 prealloc = alloc_extent_state(mask);
d1310b2e
CM
984 }
985
cad321ad 986 spin_lock(&tree->lock);
9655d298
CM
987 if (cached_state && *cached_state) {
988 state = *cached_state;
df98b6e2 989 if (state->start <= start && state->end > start &&
27a3507d 990 extent_state_in_tree(state)) {
9655d298
CM
991 node = &state->rb_node;
992 goto hit_next;
993 }
994 }
d1310b2e
CM
995 /*
996 * this search will find all the extents that end after
997 * our range starts.
998 */
12cfbad9 999 node = tree_search_for_insert(tree, start, &p, &parent);
d1310b2e 1000 if (!node) {
8233767a
XG
1001 prealloc = alloc_extent_state_atomic(prealloc);
1002 BUG_ON(!prealloc);
12cfbad9 1003 err = insert_state(tree, prealloc, start, end,
d38ed27f 1004 &p, &parent, &bits, changeset);
c2d904e0
JM
1005 if (err)
1006 extent_io_tree_panic(tree, err);
1007
c42ac0bc 1008 cache_state(prealloc, cached_state);
d1310b2e 1009 prealloc = NULL;
d1310b2e
CM
1010 goto out;
1011 }
d1310b2e 1012 state = rb_entry(node, struct extent_state, rb_node);
40431d6c 1013hit_next:
d1310b2e
CM
1014 last_start = state->start;
1015 last_end = state->end;
1016
1017 /*
1018 * | ---- desired range ---- |
1019 * | state |
1020 *
1021 * Just lock what we found and keep going
1022 */
1023 if (state->start == start && state->end <= end) {
1edbb734 1024 if (state->state & exclusive_bits) {
d1310b2e
CM
1025 *failed_start = state->start;
1026 err = -EEXIST;
1027 goto out;
1028 }
42daec29 1029
d38ed27f 1030 set_state_bits(tree, state, &bits, changeset);
2c64c53d 1031 cache_state(state, cached_state);
d1310b2e 1032 merge_state(tree, state);
5c939df5
YZ
1033 if (last_end == (u64)-1)
1034 goto out;
1035 start = last_end + 1;
d1ac6e41
LB
1036 state = next_state(state);
1037 if (start < end && state && state->start == start &&
1038 !need_resched())
1039 goto hit_next;
d1310b2e
CM
1040 goto search_again;
1041 }
1042
1043 /*
1044 * | ---- desired range ---- |
1045 * | state |
1046 * or
1047 * | ------------- state -------------- |
1048 *
1049 * We need to split the extent we found, and may flip bits on
1050 * second half.
1051 *
1052 * If the extent we found extends past our
1053 * range, we just split and search again. It'll get split
1054 * again the next time though.
1055 *
1056 * If the extent we found is inside our range, we set the
1057 * desired bit on it.
1058 */
1059 if (state->start < start) {
1edbb734 1060 if (state->state & exclusive_bits) {
d1310b2e
CM
1061 *failed_start = start;
1062 err = -EEXIST;
1063 goto out;
1064 }
8233767a
XG
1065
1066 prealloc = alloc_extent_state_atomic(prealloc);
1067 BUG_ON(!prealloc);
d1310b2e 1068 err = split_state(tree, state, prealloc, start);
c2d904e0
JM
1069 if (err)
1070 extent_io_tree_panic(tree, err);
1071
d1310b2e
CM
1072 prealloc = NULL;
1073 if (err)
1074 goto out;
1075 if (state->end <= end) {
d38ed27f 1076 set_state_bits(tree, state, &bits, changeset);
2c64c53d 1077 cache_state(state, cached_state);
d1310b2e 1078 merge_state(tree, state);
5c939df5
YZ
1079 if (last_end == (u64)-1)
1080 goto out;
1081 start = last_end + 1;
d1ac6e41
LB
1082 state = next_state(state);
1083 if (start < end && state && state->start == start &&
1084 !need_resched())
1085 goto hit_next;
d1310b2e
CM
1086 }
1087 goto search_again;
1088 }
1089 /*
1090 * | ---- desired range ---- |
1091 * | state | or | state |
1092 *
1093 * There's a hole, we need to insert something in it and
1094 * ignore the extent we found.
1095 */
1096 if (state->start > start) {
1097 u64 this_end;
1098 if (end < last_start)
1099 this_end = end;
1100 else
d397712b 1101 this_end = last_start - 1;
8233767a
XG
1102
1103 prealloc = alloc_extent_state_atomic(prealloc);
1104 BUG_ON(!prealloc);
c7f895a2
XG
1105
1106 /*
1107 * Avoid to free 'prealloc' if it can be merged with
1108 * the later extent.
1109 */
d1310b2e 1110 err = insert_state(tree, prealloc, start, this_end,
d38ed27f 1111 NULL, NULL, &bits, changeset);
c2d904e0
JM
1112 if (err)
1113 extent_io_tree_panic(tree, err);
1114
9ed74f2d
JB
1115 cache_state(prealloc, cached_state);
1116 prealloc = NULL;
d1310b2e
CM
1117 start = this_end + 1;
1118 goto search_again;
1119 }
1120 /*
1121 * | ---- desired range ---- |
1122 * | state |
1123 * We need to split the extent, and set the bit
1124 * on the first half
1125 */
1126 if (state->start <= end && state->end > end) {
1edbb734 1127 if (state->state & exclusive_bits) {
d1310b2e
CM
1128 *failed_start = start;
1129 err = -EEXIST;
1130 goto out;
1131 }
8233767a
XG
1132
1133 prealloc = alloc_extent_state_atomic(prealloc);
1134 BUG_ON(!prealloc);
d1310b2e 1135 err = split_state(tree, state, prealloc, end + 1);
c2d904e0
JM
1136 if (err)
1137 extent_io_tree_panic(tree, err);
d1310b2e 1138
d38ed27f 1139 set_state_bits(tree, prealloc, &bits, changeset);
2c64c53d 1140 cache_state(prealloc, cached_state);
d1310b2e
CM
1141 merge_state(tree, prealloc);
1142 prealloc = NULL;
1143 goto out;
1144 }
1145
b5a4ba14
DS
1146search_again:
1147 if (start > end)
1148 goto out;
1149 spin_unlock(&tree->lock);
1150 if (gfpflags_allow_blocking(mask))
1151 cond_resched();
1152 goto again;
d1310b2e
CM
1153
1154out:
cad321ad 1155 spin_unlock(&tree->lock);
d1310b2e
CM
1156 if (prealloc)
1157 free_extent_state(prealloc);
1158
1159 return err;
1160
d1310b2e 1161}
d1310b2e 1162
41074888 1163int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
9ee49a04 1164 unsigned bits, u64 * failed_start,
41074888 1165 struct extent_state **cached_state, gfp_t mask)
3fbe5c02
JM
1166{
1167 return __set_extent_bit(tree, start, end, bits, 0, failed_start,
d38ed27f 1168 cached_state, mask, NULL);
3fbe5c02
JM
1169}
1170
1171
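/*
 * Illustrative sketch (editor addition): marking a byte range dirty through
 * the wrapper above.  Because EXTENT_DIRTY is not an exclusive bit the call
 * cannot return -EEXIST, so failed_start may be NULL; exclusive locking is
 * what lock_extent_bits() further below is for.
 */
static int example_mark_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
        return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL,
                              GFP_NOFS);
}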
462d6fac 1172/**
10983f2e
LB
1173 * convert_extent_bit - convert all bits in a given range from one bit to
1174 * another
462d6fac
JB
1175 * @tree: the io tree to search
1176 * @start: the start offset in bytes
1177 * @end: the end offset in bytes (inclusive)
1178 * @bits: the bits to set in this range
1179 * @clear_bits: the bits to clear in this range
e6138876 1180 * @cached_state: state that we're going to cache
462d6fac
JB
1181 *
1182 * This will go through and set bits for the given range. If any states exist
1183 * already in this range they are set with the given bit and cleared of the
1184 * clear_bits. This is only meant to be used by things that are mergeable, ie
1185 * converting from say DELALLOC to DIRTY. This is not meant to be used with
1186 * boundary bits like LOCK.
210aa277
DS
1187 *
1188 * All allocations are done with GFP_NOFS.
462d6fac
JB
1189 */
1190int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
9ee49a04 1191 unsigned bits, unsigned clear_bits,
210aa277 1192 struct extent_state **cached_state)
462d6fac
JB
1193{
1194 struct extent_state *state;
1195 struct extent_state *prealloc = NULL;
1196 struct rb_node *node;
12cfbad9
FDBM
1197 struct rb_node **p;
1198 struct rb_node *parent;
462d6fac
JB
1199 int err = 0;
1200 u64 last_start;
1201 u64 last_end;
c8fd3de7 1202 bool first_iteration = true;
462d6fac 1203
a5dee37d 1204 btrfs_debug_check_extent_io_range(tree, start, end);
a1d19847
QW
1205 trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1206 clear_bits);
8d599ae1 1207
462d6fac 1208again:
210aa277 1209 if (!prealloc) {
c8fd3de7
FM
1210 /*
1211 * Best effort, don't worry if extent state allocation fails
1212 * here for the first iteration. We might have a cached state
1213 * that matches exactly the target range, in which case no
1214 * extent state allocations are needed. We'll only know this
1215 * after locking the tree.
1216 */
210aa277 1217 prealloc = alloc_extent_state(GFP_NOFS);
c8fd3de7 1218 if (!prealloc && !first_iteration)
462d6fac
JB
1219 return -ENOMEM;
1220 }
1221
1222 spin_lock(&tree->lock);
e6138876
JB
1223 if (cached_state && *cached_state) {
1224 state = *cached_state;
1225 if (state->start <= start && state->end > start &&
27a3507d 1226 extent_state_in_tree(state)) {
e6138876
JB
1227 node = &state->rb_node;
1228 goto hit_next;
1229 }
1230 }
1231
462d6fac
JB
1232 /*
1233 * this search will find all the extents that end after
1234 * our range starts.
1235 */
12cfbad9 1236 node = tree_search_for_insert(tree, start, &p, &parent);
462d6fac
JB
1237 if (!node) {
1238 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1239 if (!prealloc) {
1240 err = -ENOMEM;
1241 goto out;
1242 }
12cfbad9 1243 err = insert_state(tree, prealloc, start, end,
d38ed27f 1244 &p, &parent, &bits, NULL);
c2d904e0
JM
1245 if (err)
1246 extent_io_tree_panic(tree, err);
c42ac0bc
FDBM
1247 cache_state(prealloc, cached_state);
1248 prealloc = NULL;
462d6fac
JB
1249 goto out;
1250 }
1251 state = rb_entry(node, struct extent_state, rb_node);
1252hit_next:
1253 last_start = state->start;
1254 last_end = state->end;
1255
1256 /*
1257 * | ---- desired range ---- |
1258 * | state |
1259 *
1260 * Just lock what we found and keep going
1261 */
1262 if (state->start == start && state->end <= end) {
d38ed27f 1263 set_state_bits(tree, state, &bits, NULL);
e6138876 1264 cache_state(state, cached_state);
fefdc557 1265 state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
462d6fac
JB
1266 if (last_end == (u64)-1)
1267 goto out;
462d6fac 1268 start = last_end + 1;
d1ac6e41
LB
1269 if (start < end && state && state->start == start &&
1270 !need_resched())
1271 goto hit_next;
462d6fac
JB
1272 goto search_again;
1273 }
1274
1275 /*
1276 * | ---- desired range ---- |
1277 * | state |
1278 * or
1279 * | ------------- state -------------- |
1280 *
1281 * We need to split the extent we found, and may flip bits on
1282 * second half.
1283 *
1284 * If the extent we found extends past our
1285 * range, we just split and search again. It'll get split
1286 * again the next time though.
1287 *
1288 * If the extent we found is inside our range, we set the
1289 * desired bit on it.
1290 */
1291 if (state->start < start) {
1292 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1293 if (!prealloc) {
1294 err = -ENOMEM;
1295 goto out;
1296 }
462d6fac 1297 err = split_state(tree, state, prealloc, start);
c2d904e0
JM
1298 if (err)
1299 extent_io_tree_panic(tree, err);
462d6fac
JB
1300 prealloc = NULL;
1301 if (err)
1302 goto out;
1303 if (state->end <= end) {
d38ed27f 1304 set_state_bits(tree, state, &bits, NULL);
e6138876 1305 cache_state(state, cached_state);
fefdc557
QW
1306 state = clear_state_bit(tree, state, &clear_bits, 0,
1307 NULL);
462d6fac
JB
1308 if (last_end == (u64)-1)
1309 goto out;
1310 start = last_end + 1;
d1ac6e41
LB
1311 if (start < end && state && state->start == start &&
1312 !need_resched())
1313 goto hit_next;
462d6fac
JB
1314 }
1315 goto search_again;
1316 }
1317 /*
1318 * | ---- desired range ---- |
1319 * | state | or | state |
1320 *
1321 * There's a hole, we need to insert something in it and
1322 * ignore the extent we found.
1323 */
1324 if (state->start > start) {
1325 u64 this_end;
1326 if (end < last_start)
1327 this_end = end;
1328 else
1329 this_end = last_start - 1;
1330
1331 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1332 if (!prealloc) {
1333 err = -ENOMEM;
1334 goto out;
1335 }
462d6fac
JB
1336
1337 /*
1338 * Avoid to free 'prealloc' if it can be merged with
1339 * the later extent.
1340 */
1341 err = insert_state(tree, prealloc, start, this_end,
d38ed27f 1342 NULL, NULL, &bits, NULL);
c2d904e0
JM
1343 if (err)
1344 extent_io_tree_panic(tree, err);
e6138876 1345 cache_state(prealloc, cached_state);
462d6fac
JB
1346 prealloc = NULL;
1347 start = this_end + 1;
1348 goto search_again;
1349 }
1350 /*
1351 * | ---- desired range ---- |
1352 * | state |
1353 * We need to split the extent, and set the bit
1354 * on the first half
1355 */
1356 if (state->start <= end && state->end > end) {
1357 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1358 if (!prealloc) {
1359 err = -ENOMEM;
1360 goto out;
1361 }
462d6fac
JB
1362
1363 err = split_state(tree, state, prealloc, end + 1);
c2d904e0
JM
1364 if (err)
1365 extent_io_tree_panic(tree, err);
462d6fac 1366
d38ed27f 1367 set_state_bits(tree, prealloc, &bits, NULL);
e6138876 1368 cache_state(prealloc, cached_state);
fefdc557 1369 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
462d6fac
JB
1370 prealloc = NULL;
1371 goto out;
1372 }
1373
462d6fac
JB
1374search_again:
1375 if (start > end)
1376 goto out;
1377 spin_unlock(&tree->lock);
210aa277 1378 cond_resched();
c8fd3de7 1379 first_iteration = false;
462d6fac 1380 goto again;
462d6fac
JB
1381
1382out:
1383 spin_unlock(&tree->lock);
1384 if (prealloc)
1385 free_extent_state(prealloc);
1386
1387 return err;
462d6fac
JB
1388}
1389
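/*
 * Illustrative sketch (editor addition): the DELALLOC -> DIRTY conversion
 * that the comment above uses as its example.  Passing a cached state is
 * optional but saves a tree search when the caller already holds one.
 */
static int example_convert_delalloc(struct extent_io_tree *tree, u64 start,
                                    u64 end, struct extent_state **cached)
{
        return convert_extent_bit(tree, start, end, EXTENT_DIRTY,
                                  EXTENT_DELALLOC, cached);
}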
d1310b2e 1390/* wrappers around set/clear extent bit */
d38ed27f 1391int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
2c53b912 1392 unsigned bits, struct extent_changeset *changeset)
d38ed27f
QW
1393{
1394 /*
1395 * We don't support EXTENT_LOCKED yet, as current changeset will
1396 * record any bits changed, so for EXTENT_LOCKED case, it will
1397 * either fail with -EEXIST or changeset will record the whole
1398 * range.
1399 */
1400 BUG_ON(bits & EXTENT_LOCKED);
1401
2c53b912 1402 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
d38ed27f
QW
1403 changeset);
1404}
1405
4ca73656
NB
1406int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1407 unsigned bits)
1408{
1409 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1410 GFP_NOWAIT, NULL);
1411}
1412
fefdc557
QW
1413int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1414 unsigned bits, int wake, int delete,
ae0f1625 1415 struct extent_state **cached)
fefdc557
QW
1416{
1417 return __clear_extent_bit(tree, start, end, bits, wake, delete,
ae0f1625 1418 cached, GFP_NOFS, NULL);
fefdc557
QW
1419}
1420
fefdc557 1421int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
f734c44a 1422 unsigned bits, struct extent_changeset *changeset)
fefdc557
QW
1423{
1424 /*
1425 * Don't support EXTENT_LOCKED case, same reason as
1426 * set_record_extent_bits().
1427 */
1428 BUG_ON(bits & EXTENT_LOCKED);
1429
f734c44a 1430 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
fefdc557
QW
1431 changeset);
1432}
1433
d352ac68
CM
1434/*
1435 * either insert or lock state struct between start and end. Use mask to tell
1436 * us if waiting is desired.
1437 */
1edbb734 1438int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
ff13db41 1439 struct extent_state **cached_state)
d1310b2e
CM
1440{
1441 int err;
1442 u64 failed_start;
9ee49a04 1443
d1310b2e 1444 while (1) {
ff13db41 1445 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
3fbe5c02 1446 EXTENT_LOCKED, &failed_start,
d38ed27f 1447 cached_state, GFP_NOFS, NULL);
d0082371 1448 if (err == -EEXIST) {
d1310b2e
CM
1449 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1450 start = failed_start;
d0082371 1451 } else
d1310b2e 1452 break;
d1310b2e
CM
1453 WARN_ON(start > end);
1454 }
1455 return err;
1456}
d1310b2e 1457
d0082371 1458int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
25179201
JB
1459{
1460 int err;
1461 u64 failed_start;
1462
3fbe5c02 1463 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
d38ed27f 1464 &failed_start, NULL, GFP_NOFS, NULL);
6643558d
YZ
1465 if (err == -EEXIST) {
1466 if (failed_start > start)
1467 clear_extent_bit(tree, start, failed_start - 1,
ae0f1625 1468 EXTENT_LOCKED, 1, 0, NULL);
25179201 1469 return 0;
6643558d 1470 }
25179201
JB
1471 return 1;
1472}
25179201 1473
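/*
 * Illustrative sketch (editor addition): the usual pairing of the locking
 * helpers above.  lock_extent_bits() sleeps until it owns [start, end]; the
 * matching unlock is a clear of EXTENT_LOCKED via clear_extent_bit(), which
 * is what the unlock_extent_cached()-style wrappers in extent-io-tree.h
 * boil down to.
 */
static void example_locked_region(struct extent_io_tree *tree, u64 start,
                                  u64 end)
{
        struct extent_state *cached = NULL;

        lock_extent_bits(tree, start, end, &cached);
        /* ... operate on [start, end] while it is locked ... */
        clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, &cached);
}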
bd1fa4f0 1474void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
4adaa611 1475{
09cbfeaf
KS
1476 unsigned long index = start >> PAGE_SHIFT;
1477 unsigned long end_index = end >> PAGE_SHIFT;
4adaa611
CM
1478 struct page *page;
1479
1480 while (index <= end_index) {
1481 page = find_get_page(inode->i_mapping, index);
1482 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1483 clear_page_dirty_for_io(page);
09cbfeaf 1484 put_page(page);
4adaa611
CM
1485 index++;
1486 }
4adaa611
CM
1487}
1488
f6311572 1489void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
4adaa611 1490{
09cbfeaf
KS
1491 unsigned long index = start >> PAGE_SHIFT;
1492 unsigned long end_index = end >> PAGE_SHIFT;
4adaa611
CM
1493 struct page *page;
1494
1495 while (index <= end_index) {
1496 page = find_get_page(inode->i_mapping, index);
1497 BUG_ON(!page); /* Pages should be in the extent_io_tree */
4adaa611 1498 __set_page_dirty_nobuffers(page);
8d38633c 1499 account_page_redirty(page);
09cbfeaf 1500 put_page(page);
4adaa611
CM
1501 index++;
1502 }
4adaa611
CM
1503}
1504
d352ac68
CM
1505/* find the first state struct with 'bits' set after 'start', and
1506 * return it. tree->lock must be held. NULL will be returned if
1507 * nothing was found after 'start'
1508 */
48a3b636
ES
1509static struct extent_state *
1510find_first_extent_bit_state(struct extent_io_tree *tree,
9ee49a04 1511 u64 start, unsigned bits)
d7fc640e
CM
1512{
1513 struct rb_node *node;
1514 struct extent_state *state;
1515
1516 /*
1517 * this search will find all the extents that end after
1518 * our range starts.
1519 */
1520 node = tree_search(tree, start);
d397712b 1521 if (!node)
d7fc640e 1522 goto out;
d7fc640e 1523
d397712b 1524 while (1) {
d7fc640e 1525 state = rb_entry(node, struct extent_state, rb_node);
d397712b 1526 if (state->end >= start && (state->state & bits))
d7fc640e 1527 return state;
d397712b 1528
d7fc640e
CM
1529 node = rb_next(node);
1530 if (!node)
1531 break;
1532 }
1533out:
1534 return NULL;
1535}
d7fc640e 1536
69261c4b
XG
1537/*
1538 * find the first offset in the io tree with 'bits' set. zero is
1539 * returned if we find something, and *start_ret and *end_ret are
1540 * set to reflect the state struct that was found.
1541 *
477d7eaf 1542 * If nothing was found, 1 is returned. If something was found, 0 is returned.
69261c4b
XG
1543 */
1544int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
9ee49a04 1545 u64 *start_ret, u64 *end_ret, unsigned bits,
e6138876 1546 struct extent_state **cached_state)
69261c4b
XG
1547{
1548 struct extent_state *state;
1549 int ret = 1;
1550
1551 spin_lock(&tree->lock);
e6138876
JB
1552 if (cached_state && *cached_state) {
1553 state = *cached_state;
27a3507d 1554 if (state->end == start - 1 && extent_state_in_tree(state)) {
9688e9a9 1555 while ((state = next_state(state)) != NULL) {
e6138876
JB
1556 if (state->state & bits)
1557 goto got_it;
e6138876
JB
1558 }
1559 free_extent_state(*cached_state);
1560 *cached_state = NULL;
1561 goto out;
1562 }
1563 free_extent_state(*cached_state);
1564 *cached_state = NULL;
1565 }
1566
69261c4b 1567 state = find_first_extent_bit_state(tree, start, bits);
e6138876 1568got_it:
69261c4b 1569 if (state) {
e38e2ed7 1570 cache_state_if_flags(state, cached_state, 0);
69261c4b
XG
1571 *start_ret = state->start;
1572 *end_ret = state->end;
1573 ret = 0;
1574 }
e6138876 1575out:
69261c4b
XG
1576 spin_unlock(&tree->lock);
1577 return ret;
1578}
1579
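/*
 * Illustrative sketch (editor addition): walking every range that has a
 * given bit set with the helper above.  Restarting the search at end + 1 is
 * what moves the loop forward; a range ending at (u64)-1 must terminate it
 * to avoid wrapping around.
 */
static void example_walk_bit(struct extent_io_tree *tree, unsigned bits)
{
        u64 start = 0;
        u64 found_start, found_end;

        while (!find_first_extent_bit(tree, start, &found_start, &found_end,
                                      bits, NULL)) {
                /* [found_start, found_end] has 'bits' set. */
                if (found_end == (u64)-1)
                        break;
                start = found_end + 1;
        }
}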
41a2ee75
JB
1580/**
1581 * find_contiguous_extent_bit: find a contiguous area of bits
1582 * @tree - io tree to check
1583 * @start - offset to start the search from
1584 * @start_ret - the first offset we found with the bits set
1585 * @end_ret - the final contiguous range of the bits that were set
1586 * @bits - bits to look for
1587 *
1588 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1589 * to set bits appropriately, and then merge them again. During this time it
1590 * will drop the tree->lock, so use this helper if you want to find the actual
1591 * contiguous area for given bits. We will search to the first bit we find, and
1592 * then walk down the tree until we find a non-contiguous area. The area
1593 * returned will be the full contiguous area with the bits set.
1594 */
1595int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1596 u64 *start_ret, u64 *end_ret, unsigned bits)
1597{
1598 struct extent_state *state;
1599 int ret = 1;
1600
1601 spin_lock(&tree->lock);
1602 state = find_first_extent_bit_state(tree, start, bits);
1603 if (state) {
1604 *start_ret = state->start;
1605 *end_ret = state->end;
1606 while ((state = next_state(state)) != NULL) {
1607 if (state->start > (*end_ret + 1))
1608 break;
1609 *end_ret = state->end;
1610 }
1611 ret = 0;
1612 }
1613 spin_unlock(&tree->lock);
1614 return ret;
1615}
1616
45bfcfc1 1617/**
1eaebb34
NB
1618 * find_first_clear_extent_bit - find the first range that has @bits not set.
1619 * This range could start before @start.
45bfcfc1
NB
1620 *
1621 * @tree - the tree to search
1622 * @start - the offset at/after which the found extent should start
1623 * @start_ret - records the beginning of the range
1624 * @end_ret - records the end of the range (inclusive)
1625 * @bits - the set of bits which must be unset
1626 *
1627 * Since unallocated range is also considered one which doesn't have the bits
1628 * set, it's possible that @end_ret contains -1; this happens in case the range
1629 * spans (last_range_end, end of device]. In this case it's up to the caller to
1630 * trim @end_ret to the appropriate size.
1631 */
1632void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1633 u64 *start_ret, u64 *end_ret, unsigned bits)
1634{
1635 struct extent_state *state;
1636 struct rb_node *node, *prev = NULL, *next;
1637
1638 spin_lock(&tree->lock);
1639
1640 /* Find first extent with bits cleared */
1641 while (1) {
1642 node = __etree_search(tree, start, &next, &prev, NULL, NULL);
5750c375
NB
1643 if (!node && !next && !prev) {
1644 /*
1645 * Tree is completely empty, send full range and let
1646 * caller deal with it
1647 */
1648 *start_ret = 0;
1649 *end_ret = -1;
1650 goto out;
1651 } else if (!node && !next) {
1652 /*
1653 * We are past the last allocated chunk, set start at
1654 * the end of the last extent.
1655 */
1656 state = rb_entry(prev, struct extent_state, rb_node);
1657 *start_ret = state->end + 1;
1658 *end_ret = -1;
1659 goto out;
1660 } else if (!node) {
45bfcfc1 1661 node = next;
45bfcfc1 1662 }
1eaebb34
NB
1663 /*
1664 * At this point 'node' either contains 'start' or start is
1665 * before 'node'
1666 */
45bfcfc1 1667 state = rb_entry(node, struct extent_state, rb_node);
1eaebb34
NB
1668
1669 if (in_range(start, state->start, state->end - state->start + 1)) {
1670 if (state->state & bits) {
1671 /*
1672 * |--range with bits sets--|
1673 * |
1674 * start
1675 */
1676 start = state->end + 1;
1677 } else {
1678 /*
1679 * 'start' falls within a range that doesn't
1680 * have the bits set, so take its start as
1681 * the beginning of the desired range
1682 *
1683 * |--range with bits cleared----|
1684 * |
1685 * start
1686 */
1687 *start_ret = state->start;
1688 break;
1689 }
45bfcfc1 1690 } else {
1eaebb34
NB
1691 /*
1692 * |---prev range---|---hole/unset---|---node range---|
1693 * |
1694 * start
1695 *
1696 * or
1697 *
1698 * |---hole/unset--||--first node--|
1699 * 0 |
1700 * start
1701 */
1702 if (prev) {
1703 state = rb_entry(prev, struct extent_state,
1704 rb_node);
1705 *start_ret = state->end + 1;
1706 } else {
1707 *start_ret = 0;
1708 }
45bfcfc1
NB
1709 break;
1710 }
1711 }
1712
1713 /*
1714 * Find the longest stretch from start until an entry which has the
1715 * bits set
1716 */
1717 while (1) {
1718 state = rb_entry(node, struct extent_state, rb_node);
1719 if (state->end >= start && !(state->state & bits)) {
1720 *end_ret = state->end;
1721 } else {
1722 *end_ret = state->start - 1;
1723 break;
1724 }
1725
1726 node = rb_next(node);
1727 if (!node)
1728 break;
1729 }
1730out:
1731 spin_unlock(&tree->lock);
1732}
1733
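/*
 * Illustrative sketch (editor addition): the caller-side trimming the
 * comment above asks for when the returned range runs past the end of the
 * device.  'device_size' is a hypothetical total size in bytes.
 */
static u64 example_first_unset_len(struct extent_io_tree *tree, u64 start,
                                   u64 device_size, unsigned bits)
{
        u64 range_start, range_end;

        find_first_clear_extent_bit(tree, start, &range_start, &range_end,
                                    bits);
        /* Trim an open-ended result ((u64)-1) back to the device. */
        if (range_end > device_size - 1)
                range_end = device_size - 1;
        return range_end - range_start + 1;
}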
d352ac68
CM
1734/*
1735 * find a contiguous range of bytes in the file marked as delalloc, not
1736 * more than 'max_bytes'. start and end are used to return the range.
1737 *
3522e903 1738 * true is returned if we find something, false if nothing was in the tree
d352ac68 1739 */
083e75e7
JB
1740bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1741 u64 *end, u64 max_bytes,
1742 struct extent_state **cached_state)
d1310b2e
CM
1743{
1744 struct rb_node *node;
1745 struct extent_state *state;
1746 u64 cur_start = *start;
3522e903 1747 bool found = false;
d1310b2e
CM
1748 u64 total_bytes = 0;
1749
cad321ad 1750 spin_lock(&tree->lock);
c8b97818 1751
d1310b2e
CM
1752 /*
1753 * this search will find all the extents that end after
1754 * our range starts.
1755 */
80ea96b1 1756 node = tree_search(tree, cur_start);
2b114d1d 1757 if (!node) {
3522e903 1758 *end = (u64)-1;
d1310b2e
CM
1759 goto out;
1760 }
1761
d397712b 1762 while (1) {
d1310b2e 1763 state = rb_entry(node, struct extent_state, rb_node);
5b21f2ed
ZY
1764 if (found && (state->start != cur_start ||
1765 (state->state & EXTENT_BOUNDARY))) {
d1310b2e
CM
1766 goto out;
1767 }
1768 if (!(state->state & EXTENT_DELALLOC)) {
1769 if (!found)
1770 *end = state->end;
1771 goto out;
1772 }
c2a128d2 1773 if (!found) {
d1310b2e 1774 *start = state->start;
c2a128d2 1775 *cached_state = state;
b7ac31b7 1776 refcount_inc(&state->refs);
c2a128d2 1777 }
3522e903 1778 found = true;
d1310b2e
CM
1779 *end = state->end;
1780 cur_start = state->end + 1;
1781 node = rb_next(node);
d1310b2e 1782 total_bytes += state->end - state->start + 1;
7bf811a5 1783 if (total_bytes >= max_bytes)
573aecaf 1784 break;
573aecaf 1785 if (!node)
d1310b2e
CM
1786 break;
1787 }
1788out:
cad321ad 1789 spin_unlock(&tree->lock);
d1310b2e
CM
1790 return found;
1791}
1792
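A hedged usage sketch for the lookup above: the caller must drop the extra reference taken on the cached extent_state. The wrapper name is illustrative only.

/* Illustrative only: query up to BTRFS_MAX_EXTENT_SIZE of delalloc starting
 * at 'pos' and release the cached state reference taken by the search. */
static bool example_probe_delalloc(struct extent_io_tree *tree, u64 pos)
{
	struct extent_state *cached = NULL;
	u64 start = pos;
	u64 end = 0;
	bool found;

	found = btrfs_find_delalloc_range(tree, &start, &end,
					  BTRFS_MAX_EXTENT_SIZE, &cached);
	free_extent_state(cached);	/* NULL-safe, as in the callers above */
	return found;
}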
da2c7009
LB
1793static int __process_pages_contig(struct address_space *mapping,
1794 struct page *locked_page,
1795 pgoff_t start_index, pgoff_t end_index,
1796 unsigned long page_ops, pgoff_t *index_ret);
1797
143bede5
JM
1798static noinline void __unlock_for_delalloc(struct inode *inode,
1799 struct page *locked_page,
1800 u64 start, u64 end)
c8b97818 1801{
09cbfeaf
KS
1802 unsigned long index = start >> PAGE_SHIFT;
1803 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 1804
76c0021d 1805 ASSERT(locked_page);
c8b97818 1806 if (index == locked_page->index && end_index == index)
143bede5 1807 return;
c8b97818 1808
76c0021d
LB
1809 __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1810 PAGE_UNLOCK, NULL);
c8b97818
CM
1811}
1812
1813static noinline int lock_delalloc_pages(struct inode *inode,
1814 struct page *locked_page,
1815 u64 delalloc_start,
1816 u64 delalloc_end)
1817{
09cbfeaf 1818 unsigned long index = delalloc_start >> PAGE_SHIFT;
76c0021d 1819 unsigned long index_ret = index;
09cbfeaf 1820 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
c8b97818 1821 int ret;
c8b97818 1822
76c0021d 1823 ASSERT(locked_page);
c8b97818
CM
1824 if (index == locked_page->index && index == end_index)
1825 return 0;
1826
76c0021d
LB
1827 ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1828 end_index, PAGE_LOCK, &index_ret);
1829 if (ret == -EAGAIN)
1830 __unlock_for_delalloc(inode, locked_page, delalloc_start,
1831 (u64)index_ret << PAGE_SHIFT);
c8b97818
CM
1832 return ret;
1833}
1834
1835/*
3522e903
LF
1836 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1837 * more than @max_bytes. @start and @end are used to return the range.
c8b97818 1838 *
3522e903
LF
1839 * Return: true if we find something
1840 * false if nothing was in the tree
c8b97818 1841 */
ce9f967f 1842EXPORT_FOR_TESTS
3522e903 1843noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 1844 struct page *locked_page, u64 *start,
917aacec 1845 u64 *end)
c8b97818 1846{
9978059b 1847 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
917aacec 1848 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
1849 u64 delalloc_start;
1850 u64 delalloc_end;
3522e903 1851 bool found;
9655d298 1852 struct extent_state *cached_state = NULL;
c8b97818
CM
1853 int ret;
1854 int loops = 0;
1855
1856again:
1857 /* step one, find a bunch of delalloc bytes starting at start */
1858 delalloc_start = *start;
1859 delalloc_end = 0;
083e75e7
JB
1860 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1861 max_bytes, &cached_state);
70b99e69 1862 if (!found || delalloc_end <= *start) {
c8b97818
CM
1863 *start = delalloc_start;
1864 *end = delalloc_end;
c2a128d2 1865 free_extent_state(cached_state);
3522e903 1866 return false;
c8b97818
CM
1867 }
1868
70b99e69
CM
1869 /*
1870 * start comes from the offset of locked_page. We have to lock
1871 * pages in order, so we can't process delalloc bytes before
1872 * locked_page
1873 */
d397712b 1874 if (delalloc_start < *start)
70b99e69 1875 delalloc_start = *start;
70b99e69 1876
c8b97818
CM
1877 /*
1878 * make sure to limit the number of pages we try to lock down
c8b97818 1879 */
7bf811a5
JB
1880 if (delalloc_end + 1 - delalloc_start > max_bytes)
1881 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 1882
c8b97818
CM
1883 /* step two, lock all the pages after the page that has start */
1884 ret = lock_delalloc_pages(inode, locked_page,
1885 delalloc_start, delalloc_end);
9bfd61d9 1886 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
1887 if (ret == -EAGAIN) {
1888 /* some of the pages are gone, let's avoid looping by
1889 * shortening the size of the delalloc range we're searching
1890 */
9655d298 1891 free_extent_state(cached_state);
7d788742 1892 cached_state = NULL;
c8b97818 1893 if (!loops) {
09cbfeaf 1894 max_bytes = PAGE_SIZE;
c8b97818
CM
1895 loops = 1;
1896 goto again;
1897 } else {
3522e903 1898 found = false;
c8b97818
CM
1899 goto out_failed;
1900 }
1901 }
c8b97818
CM
1902
1903 /* step three, lock the state bits for the whole range */
ff13db41 1904 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
1905
1906 /* then test to make sure it is all still delalloc */
1907 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 1908 EXTENT_DELALLOC, 1, cached_state);
c8b97818 1909 if (!ret) {
9655d298 1910 unlock_extent_cached(tree, delalloc_start, delalloc_end,
e43bbe5e 1911 &cached_state);
c8b97818
CM
1912 __unlock_for_delalloc(inode, locked_page,
1913 delalloc_start, delalloc_end);
1914 cond_resched();
1915 goto again;
1916 }
9655d298 1917 free_extent_state(cached_state);
c8b97818
CM
1918 *start = delalloc_start;
1919 *end = delalloc_end;
1920out_failed:
1921 return found;
1922}
1923
da2c7009
LB
1924static int __process_pages_contig(struct address_space *mapping,
1925 struct page *locked_page,
1926 pgoff_t start_index, pgoff_t end_index,
1927 unsigned long page_ops, pgoff_t *index_ret)
c8b97818 1928{
873695b3 1929 unsigned long nr_pages = end_index - start_index + 1;
da2c7009 1930 unsigned long pages_locked = 0;
873695b3 1931 pgoff_t index = start_index;
c8b97818 1932 struct page *pages[16];
873695b3 1933 unsigned ret;
da2c7009 1934 int err = 0;
c8b97818 1935 int i;
771ed689 1936
da2c7009
LB
1937 if (page_ops & PAGE_LOCK) {
1938 ASSERT(page_ops == PAGE_LOCK);
1939 ASSERT(index_ret && *index_ret == start_index);
1940 }
1941
704de49d 1942 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
873695b3 1943 mapping_set_error(mapping, -EIO);
704de49d 1944
d397712b 1945 while (nr_pages > 0) {
873695b3 1946 ret = find_get_pages_contig(mapping, index,
5b050f04
CM
1947 min_t(unsigned long,
1948 nr_pages, ARRAY_SIZE(pages)), pages);
da2c7009
LB
1949 if (ret == 0) {
1950 /*
1951 * Only if we're going to lock these pages,
1952 * can we find nothing at @index.
1953 */
1954 ASSERT(page_ops & PAGE_LOCK);
49d4a334
LB
1955 err = -EAGAIN;
1956 goto out;
da2c7009 1957 }
8b62b72b 1958
da2c7009 1959 for (i = 0; i < ret; i++) {
c2790a2e 1960 if (page_ops & PAGE_SET_PRIVATE2)
8b62b72b
CM
1961 SetPagePrivate2(pages[i]);
1962
1d53c9e6 1963 if (locked_page && pages[i] == locked_page) {
09cbfeaf 1964 put_page(pages[i]);
da2c7009 1965 pages_locked++;
c8b97818
CM
1966 continue;
1967 }
c2790a2e 1968 if (page_ops & PAGE_CLEAR_DIRTY)
c8b97818 1969 clear_page_dirty_for_io(pages[i]);
c2790a2e 1970 if (page_ops & PAGE_SET_WRITEBACK)
c8b97818 1971 set_page_writeback(pages[i]);
704de49d
FM
1972 if (page_ops & PAGE_SET_ERROR)
1973 SetPageError(pages[i]);
c2790a2e 1974 if (page_ops & PAGE_END_WRITEBACK)
c8b97818 1975 end_page_writeback(pages[i]);
c2790a2e 1976 if (page_ops & PAGE_UNLOCK)
771ed689 1977 unlock_page(pages[i]);
da2c7009
LB
1978 if (page_ops & PAGE_LOCK) {
1979 lock_page(pages[i]);
1980 if (!PageDirty(pages[i]) ||
1981 pages[i]->mapping != mapping) {
1982 unlock_page(pages[i]);
1983 put_page(pages[i]);
1984 err = -EAGAIN;
1985 goto out;
1986 }
1987 }
09cbfeaf 1988 put_page(pages[i]);
da2c7009 1989 pages_locked++;
c8b97818
CM
1990 }
1991 nr_pages -= ret;
1992 index += ret;
1993 cond_resched();
1994 }
da2c7009
LB
1995out:
1996 if (err && index_ret)
1997 *index_ret = start_index + pages_locked - 1;
1998 return err;
c8b97818 1999}
c8b97818 2000
873695b3 2001void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
74e9194a
NB
2002 struct page *locked_page,
2003 unsigned clear_bits,
2004 unsigned long page_ops)
873695b3
LB
2005{
2006 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
ae0f1625 2007 NULL);
873695b3
LB
2008
2009 __process_pages_contig(inode->i_mapping, locked_page,
2010 start >> PAGE_SHIFT, end >> PAGE_SHIFT,
da2c7009 2011 page_ops, NULL);
873695b3
LB
2012}
2013
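A sketch of how the clear_bits/page_ops pair above is typically composed by a caller releasing a failed range; the exact flag combination is an example, not prescribed by this file, and the wrapper name is hypothetical.

/* Illustrative only: clear delalloc accounting and fully release the pages
 * of a range that will not be written after all. */
static void example_release_range(struct inode *inode, struct page *locked_page,
				  u64 start, u64 end)
{
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
}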
d352ac68
CM
2014/*
2015 * count the number of bytes in the tree that have the given bit(s)
2016 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2017 * cached. The total number found is returned.
2018 */
d1310b2e
CM
2019u64 count_range_bits(struct extent_io_tree *tree,
2020 u64 *start, u64 search_end, u64 max_bytes,
9ee49a04 2021 unsigned bits, int contig)
d1310b2e
CM
2022{
2023 struct rb_node *node;
2024 struct extent_state *state;
2025 u64 cur_start = *start;
2026 u64 total_bytes = 0;
ec29ed5b 2027 u64 last = 0;
d1310b2e
CM
2028 int found = 0;
2029
fae7f21c 2030 if (WARN_ON(search_end <= cur_start))
d1310b2e 2031 return 0;
d1310b2e 2032
cad321ad 2033 spin_lock(&tree->lock);
d1310b2e
CM
2034 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2035 total_bytes = tree->dirty_bytes;
2036 goto out;
2037 }
2038 /*
2039 * this search will find all the extents that end after
2040 * our range starts.
2041 */
80ea96b1 2042 node = tree_search(tree, cur_start);
d397712b 2043 if (!node)
d1310b2e 2044 goto out;
d1310b2e 2045
d397712b 2046 while (1) {
d1310b2e
CM
2047 state = rb_entry(node, struct extent_state, rb_node);
2048 if (state->start > search_end)
2049 break;
ec29ed5b
CM
2050 if (contig && found && state->start > last + 1)
2051 break;
2052 if (state->end >= cur_start && (state->state & bits) == bits) {
d1310b2e
CM
2053 total_bytes += min(search_end, state->end) + 1 -
2054 max(cur_start, state->start);
2055 if (total_bytes >= max_bytes)
2056 break;
2057 if (!found) {
af60bed2 2058 *start = max(cur_start, state->start);
d1310b2e
CM
2059 found = 1;
2060 }
ec29ed5b
CM
2061 last = state->end;
2062 } else if (contig && found) {
2063 break;
d1310b2e
CM
2064 }
2065 node = rb_next(node);
2066 if (!node)
2067 break;
2068 }
2069out:
cad321ad 2070 spin_unlock(&tree->lock);
d1310b2e
CM
2071 return total_bytes;
2072}
b2950863 2073
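The EXTENT_DIRTY fast path and the in/out 'start' parameter described above can be exercised as in this sketch; the wrapper name is hypothetical.

/* Illustrative only: total dirty bytes in the whole tree.  Starting at
 * offset 0 with EXTENT_DIRTY hits the cached tree->dirty_bytes path. */
static u64 example_total_dirty(struct extent_io_tree *tree)
{
	u64 start = 0;

	return count_range_bits(tree, &start, (u64)-1, (u64)-1,
				EXTENT_DIRTY, 0);
}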
d352ac68
CM
2074/*
2075 * set the failure record for a given byte offset in the tree. If there isn't
2076 * an extent_state starting exactly at that offset, nothing is set and -ENOENT is returned.
2077 */
b3f167aa
JB
2078int set_state_failrec(struct extent_io_tree *tree, u64 start,
2079 struct io_failure_record *failrec)
d1310b2e
CM
2080{
2081 struct rb_node *node;
2082 struct extent_state *state;
2083 int ret = 0;
2084
cad321ad 2085 spin_lock(&tree->lock);
d1310b2e
CM
2086 /*
2087 * this search will find all the extents that end after
2088 * our range starts.
2089 */
80ea96b1 2090 node = tree_search(tree, start);
2b114d1d 2091 if (!node) {
d1310b2e
CM
2092 ret = -ENOENT;
2093 goto out;
2094 }
2095 state = rb_entry(node, struct extent_state, rb_node);
2096 if (state->start != start) {
2097 ret = -ENOENT;
2098 goto out;
2099 }
47dc196a 2100 state->failrec = failrec;
d1310b2e 2101out:
cad321ad 2102 spin_unlock(&tree->lock);
d1310b2e
CM
2103 return ret;
2104}
2105
b3f167aa
JB
2106int get_state_failrec(struct extent_io_tree *tree, u64 start,
2107 struct io_failure_record **failrec)
d1310b2e
CM
2108{
2109 struct rb_node *node;
2110 struct extent_state *state;
2111 int ret = 0;
2112
cad321ad 2113 spin_lock(&tree->lock);
d1310b2e
CM
2114 /*
2115 * this search will find all the extents that end after
2116 * our range starts.
2117 */
80ea96b1 2118 node = tree_search(tree, start);
2b114d1d 2119 if (!node) {
d1310b2e
CM
2120 ret = -ENOENT;
2121 goto out;
2122 }
2123 state = rb_entry(node, struct extent_state, rb_node);
2124 if (state->start != start) {
2125 ret = -ENOENT;
2126 goto out;
2127 }
47dc196a 2128 *failrec = state->failrec;
d1310b2e 2129out:
cad321ad 2130 spin_unlock(&tree->lock);
d1310b2e
CM
2131 return ret;
2132}
2133
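A small sketch of the failrec round trip implemented by the two helpers above; the wrapper is hypothetical and assumes the record was allocated by the caller.

/* Illustrative only: store a failure record at 'start' and read it back.
 * Real callers allocate the record with kzalloc(..., GFP_NOFS), as
 * btrfs_get_io_failure_record() below does. */
static int example_failrec_roundtrip(struct extent_io_tree *failure_tree,
				     u64 start, struct io_failure_record *rec)
{
	struct io_failure_record *found = NULL;
	int ret;

	ret = set_state_failrec(failure_tree, start, rec);
	if (ret)
		return ret;	/* no extent_state starts exactly at 'start' */
	return get_state_failrec(failure_tree, start, &found);
}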
2134/*
2135 * searches a range in the state tree for a given mask.
70dec807 2136 * If 'filled' == 1, this returns 1 only if every extent in the range
d1310b2e
CM
2137 * has the bits set. Otherwise, 1 is returned if any bit in the
2138 * range is found set.
2139 */
2140int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
9ee49a04 2141 unsigned bits, int filled, struct extent_state *cached)
d1310b2e
CM
2142{
2143 struct extent_state *state = NULL;
2144 struct rb_node *node;
2145 int bitset = 0;
d1310b2e 2146
cad321ad 2147 spin_lock(&tree->lock);
27a3507d 2148 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
df98b6e2 2149 cached->end > start)
9655d298
CM
2150 node = &cached->rb_node;
2151 else
2152 node = tree_search(tree, start);
d1310b2e
CM
2153 while (node && start <= end) {
2154 state = rb_entry(node, struct extent_state, rb_node);
2155
2156 if (filled && state->start > start) {
2157 bitset = 0;
2158 break;
2159 }
2160
2161 if (state->start > end)
2162 break;
2163
2164 if (state->state & bits) {
2165 bitset = 1;
2166 if (!filled)
2167 break;
2168 } else if (filled) {
2169 bitset = 0;
2170 break;
2171 }
46562cec
CM
2172
2173 if (state->end == (u64)-1)
2174 break;
2175
d1310b2e
CM
2176 start = state->end + 1;
2177 if (start > end)
2178 break;
2179 node = rb_next(node);
2180 if (!node) {
2181 if (filled)
2182 bitset = 0;
2183 break;
2184 }
2185 }
cad321ad 2186 spin_unlock(&tree->lock);
d1310b2e
CM
2187 return bitset;
2188}
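The filled/unfilled distinction documented above, as a small hedged sketch; the helper is hypothetical.

/* Illustrative only: 'filled' semantics of test_range_bit().
 * whole_range_uptodate: every byte of [start, end] must carry the bit.
 * any_byte_locked: a single byte carrying the bit is enough. */
static void example_test_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	int whole_range_uptodate = test_range_bit(tree, start, end,
						  EXTENT_UPTODATE, 1, NULL);
	int any_byte_locked = test_range_bit(tree, start, end,
					     EXTENT_LOCKED, 0, NULL);

	if (whole_range_uptodate && !any_byte_locked) {
		/* e.g. safe to mark the corresponding page up to date */
	}
}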
d1310b2e
CM
2189
2190/*
2191 * helper function to set a given page up to date if all the
2192 * extents in the tree for that page are up to date
2193 */
143bede5 2194static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
d1310b2e 2195{
4eee4fa4 2196 u64 start = page_offset(page);
09cbfeaf 2197 u64 end = start + PAGE_SIZE - 1;
9655d298 2198 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
d1310b2e 2199 SetPageUptodate(page);
d1310b2e
CM
2200}
2201
7870d082
JB
2202int free_io_failure(struct extent_io_tree *failure_tree,
2203 struct extent_io_tree *io_tree,
2204 struct io_failure_record *rec)
4a54c8c1
JS
2205{
2206 int ret;
2207 int err = 0;
4a54c8c1 2208
47dc196a 2209 set_state_failrec(failure_tree, rec->start, NULL);
4a54c8c1
JS
2210 ret = clear_extent_bits(failure_tree, rec->start,
2211 rec->start + rec->len - 1,
91166212 2212 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1
JS
2213 if (ret)
2214 err = ret;
2215
7870d082 2216 ret = clear_extent_bits(io_tree, rec->start,
53b381b3 2217 rec->start + rec->len - 1,
91166212 2218 EXTENT_DAMAGED);
53b381b3
DW
2219 if (ret && !err)
2220 err = ret;
4a54c8c1
JS
2221
2222 kfree(rec);
2223 return err;
2224}
2225
4a54c8c1
JS
2226/*
2227 * this bypasses the standard btrfs submit functions deliberately, as
2228 * the standard behavior is to write all copies in a raid setup. here we only
2229 * want to write the one bad copy. so we do the mapping for ourselves and issue
2230 * submit_bio directly.
3ec706c8 2231 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
2232 * actually prevents the read that triggered the error from finishing.
2233 * currently, there can be no more than two copies of every data bit. thus,
2234 * exactly one rewrite is required.
2235 */
6ec656bc
JB
2236int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2237 u64 length, u64 logical, struct page *page,
2238 unsigned int pg_offset, int mirror_num)
4a54c8c1
JS
2239{
2240 struct bio *bio;
2241 struct btrfs_device *dev;
4a54c8c1
JS
2242 u64 map_length = 0;
2243 u64 sector;
2244 struct btrfs_bio *bbio = NULL;
2245 int ret;
2246
1751e8a6 2247 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
2248 BUG_ON(!mirror_num);
2249
c5e4c3d7 2250 bio = btrfs_io_bio_alloc(1);
4f024f37 2251 bio->bi_iter.bi_size = 0;
4a54c8c1
JS
2252 map_length = length;
2253
b5de8d0d
FM
2254 /*
2255 * Avoid races with device replace and make sure our bbio has devices
2256 * associated to its stripes that don't go away while we are doing the
2257 * read repair operation.
2258 */
2259 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 2260 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
2261 /*
2262 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2263 * to update all raid stripes, but here we just want to correct
2264 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2265 * stripe's dev and sector.
2266 */
2267 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2268 &map_length, &bbio, 0);
2269 if (ret) {
2270 btrfs_bio_counter_dec(fs_info);
2271 bio_put(bio);
2272 return -EIO;
2273 }
2274 ASSERT(bbio->mirror_num == 1);
2275 } else {
2276 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2277 &map_length, &bbio, mirror_num);
2278 if (ret) {
2279 btrfs_bio_counter_dec(fs_info);
2280 bio_put(bio);
2281 return -EIO;
2282 }
2283 BUG_ON(mirror_num != bbio->mirror_num);
4a54c8c1 2284 }
c725328c
LB
2285
2286 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
4f024f37 2287 bio->bi_iter.bi_sector = sector;
c725328c 2288 dev = bbio->stripes[bbio->mirror_num - 1].dev;
6e9606d2 2289 btrfs_put_bbio(bbio);
ebbede42
AJ
2290 if (!dev || !dev->bdev ||
2291 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
b5de8d0d 2292 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2293 bio_put(bio);
2294 return -EIO;
2295 }
74d46992 2296 bio_set_dev(bio, dev->bdev);
70fd7614 2297 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ffdd2018 2298 bio_add_page(bio, page, length, pg_offset);
4a54c8c1 2299
4e49ea4a 2300 if (btrfsic_submit_bio_wait(bio)) {
4a54c8c1 2301 /* try to remap that extent elsewhere? */
b5de8d0d 2302 btrfs_bio_counter_dec(fs_info);
4a54c8c1 2303 bio_put(bio);
442a4f63 2304 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4a54c8c1
JS
2305 return -EIO;
2306 }
2307
b14af3b4
DS
2308 btrfs_info_rl_in_rcu(fs_info,
2309 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 2310 ino, start,
1203b681 2311 rcu_str_deref(dev->name), sector);
b5de8d0d 2312 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2313 bio_put(bio);
2314 return 0;
2315}
2316
20a1fbf9 2317int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
ea466794 2318{
20a1fbf9 2319 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 2320 u64 start = eb->start;
cc5e31a4 2321 int i, num_pages = num_extent_pages(eb);
d95603b2 2322 int ret = 0;
ea466794 2323
bc98a42c 2324 if (sb_rdonly(fs_info->sb))
908960c6
ID
2325 return -EROFS;
2326
ea466794 2327 for (i = 0; i < num_pages; i++) {
fb85fc9a 2328 struct page *p = eb->pages[i];
1203b681 2329
6ec656bc 2330 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 2331 start - page_offset(p), mirror_num);
ea466794
JB
2332 if (ret)
2333 break;
09cbfeaf 2334 start += PAGE_SIZE;
ea466794
JB
2335 }
2336
2337 return ret;
2338}
2339
4a54c8c1
JS
2340/*
2341 * each time an IO finishes, we do a fast check in the IO failure tree
2342 * to see if we need to process or clean up an io_failure_record
2343 */
7870d082
JB
2344int clean_io_failure(struct btrfs_fs_info *fs_info,
2345 struct extent_io_tree *failure_tree,
2346 struct extent_io_tree *io_tree, u64 start,
2347 struct page *page, u64 ino, unsigned int pg_offset)
4a54c8c1
JS
2348{
2349 u64 private;
4a54c8c1 2350 struct io_failure_record *failrec;
4a54c8c1
JS
2351 struct extent_state *state;
2352 int num_copies;
4a54c8c1 2353 int ret;
4a54c8c1
JS
2354
2355 private = 0;
7870d082
JB
2356 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2357 EXTENT_DIRTY, 0);
4a54c8c1
JS
2358 if (!ret)
2359 return 0;
2360
7870d082 2361 ret = get_state_failrec(failure_tree, start, &failrec);
4a54c8c1
JS
2362 if (ret)
2363 return 0;
2364
4a54c8c1
JS
2365 BUG_ON(!failrec->this_mirror);
2366
2367 if (failrec->in_validation) {
2368 /* there was no real error, just free the record */
ab8d0fc4
JM
2369 btrfs_debug(fs_info,
2370 "clean_io_failure: freeing dummy error at %llu",
2371 failrec->start);
4a54c8c1
JS
2372 goto out;
2373 }
bc98a42c 2374 if (sb_rdonly(fs_info->sb))
908960c6 2375 goto out;
4a54c8c1 2376
7870d082
JB
2377 spin_lock(&io_tree->lock);
2378 state = find_first_extent_bit_state(io_tree,
4a54c8c1
JS
2379 failrec->start,
2380 EXTENT_LOCKED);
7870d082 2381 spin_unlock(&io_tree->lock);
4a54c8c1 2382
883d0de4
MX
2383 if (state && state->start <= failrec->start &&
2384 state->end >= failrec->start + failrec->len - 1) {
3ec706c8
SB
2385 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2386 failrec->len);
4a54c8c1 2387 if (num_copies > 1) {
7870d082
JB
2388 repair_io_failure(fs_info, ino, start, failrec->len,
2389 failrec->logical, page, pg_offset,
2390 failrec->failed_mirror);
4a54c8c1
JS
2391 }
2392 }
2393
2394out:
7870d082 2395 free_io_failure(failure_tree, io_tree, failrec);
4a54c8c1 2396
454ff3de 2397 return 0;
4a54c8c1
JS
2398}
2399
f612496b
MX
2400/*
2401 * Can be called when
2402 * - hold extent lock
2403 * - under ordered extent
2404 * - the inode is freeing
2405 */
7ab7956e 2406void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 2407{
7ab7956e 2408 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
f612496b
MX
2409 struct io_failure_record *failrec;
2410 struct extent_state *state, *next;
2411
2412 if (RB_EMPTY_ROOT(&failure_tree->state))
2413 return;
2414
2415 spin_lock(&failure_tree->lock);
2416 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2417 while (state) {
2418 if (state->start > end)
2419 break;
2420
2421 ASSERT(state->end <= end);
2422
2423 next = next_state(state);
2424
47dc196a 2425 failrec = state->failrec;
f612496b
MX
2426 free_extent_state(state);
2427 kfree(failrec);
2428
2429 state = next;
2430 }
2431 spin_unlock(&failure_tree->lock);
2432}
2433
2fe6303e 2434int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
47dc196a 2435 struct io_failure_record **failrec_ret)
4a54c8c1 2436{
ab8d0fc4 2437 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2438 struct io_failure_record *failrec;
4a54c8c1 2439 struct extent_map *em;
4a54c8c1
JS
2440 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2441 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2442 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4a54c8c1 2443 int ret;
4a54c8c1
JS
2444 u64 logical;
2445
47dc196a 2446 ret = get_state_failrec(failure_tree, start, &failrec);
4a54c8c1
JS
2447 if (ret) {
2448 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2449 if (!failrec)
2450 return -ENOMEM;
2fe6303e 2451
4a54c8c1
JS
2452 failrec->start = start;
2453 failrec->len = end - start + 1;
2454 failrec->this_mirror = 0;
2455 failrec->bio_flags = 0;
2456 failrec->in_validation = 0;
2457
2458 read_lock(&em_tree->lock);
2459 em = lookup_extent_mapping(em_tree, start, failrec->len);
2460 if (!em) {
2461 read_unlock(&em_tree->lock);
2462 kfree(failrec);
2463 return -EIO;
2464 }
2465
68ba990f 2466 if (em->start > start || em->start + em->len <= start) {
4a54c8c1
JS
2467 free_extent_map(em);
2468 em = NULL;
2469 }
2470 read_unlock(&em_tree->lock);
7a2d6a64 2471 if (!em) {
4a54c8c1
JS
2472 kfree(failrec);
2473 return -EIO;
2474 }
2fe6303e 2475
4a54c8c1
JS
2476 logical = start - em->start;
2477 logical = em->block_start + logical;
2478 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2479 logical = em->block_start;
2480 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2481 extent_set_compress_type(&failrec->bio_flags,
2482 em->compress_type);
2483 }
2fe6303e 2484
ab8d0fc4
JM
2485 btrfs_debug(fs_info,
2486 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2487 logical, start, failrec->len);
2fe6303e 2488
4a54c8c1
JS
2489 failrec->logical = logical;
2490 free_extent_map(em);
2491
2492 /* set the bits in the private failure tree */
2493 ret = set_extent_bits(failure_tree, start, end,
ceeb0ae7 2494 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1 2495 if (ret >= 0)
47dc196a 2496 ret = set_state_failrec(failure_tree, start, failrec);
4a54c8c1
JS
2497 /* set the bits in the inode's tree */
2498 if (ret >= 0)
ceeb0ae7 2499 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
4a54c8c1
JS
2500 if (ret < 0) {
2501 kfree(failrec);
2502 return ret;
2503 }
2504 } else {
ab8d0fc4
JM
2505 btrfs_debug(fs_info,
2506 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2507 failrec->logical, failrec->start, failrec->len,
2508 failrec->in_validation);
4a54c8c1
JS
2509 /*
2510 * when data can be on disk more than twice, add to failrec here
2511 * (e.g. with a list for failed_mirror) to make
2512 * clean_io_failure() clean all those errors at once.
2513 */
2514 }
2fe6303e
MX
2515
2516 *failrec_ret = failrec;
2517
2518 return 0;
2519}
2520
a0b60d72 2521bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2fe6303e
MX
2522 struct io_failure_record *failrec, int failed_mirror)
2523{
ab8d0fc4 2524 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2525 int num_copies;
2526
ab8d0fc4 2527 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
4a54c8c1
JS
2528 if (num_copies == 1) {
2529 /*
2530 * we only have a single copy of the data, so don't bother with
2531 * all the retry and error correction code that follows. no
2532 * matter what the error is, it is very likely to persist.
2533 */
ab8d0fc4
JM
2534 btrfs_debug(fs_info,
2535 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2536 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2537 return false;
4a54c8c1
JS
2538 }
2539
4a54c8c1
JS
2540 /*
2541 * there are two premises:
2542 * a) deliver good data to the caller
2543 * b) correct the bad sectors on disk
2544 */
a0b60d72 2545 if (failed_bio_pages > 1) {
4a54c8c1
JS
2546 /*
2547 * to fulfill b), we need to know the exact failing sectors, as
2548 * we don't want to rewrite any more than the failed ones. thus,
2549 * we need separate read requests for the failed bio
2550 *
2551 * if the following BUG_ON triggers, our validation request got
2552 * merged. we need separate requests for our algorithm to work.
2553 */
2554 BUG_ON(failrec->in_validation);
2555 failrec->in_validation = 1;
2556 failrec->this_mirror = failed_mirror;
4a54c8c1
JS
2557 } else {
2558 /*
2559 * we're ready to fulfill a) and b) alongside. get a good copy
2560 * of the failed sector and if we succeed, we have setup
2561 * everything for repair_io_failure to do the rest for us.
2562 */
2563 if (failrec->in_validation) {
2564 BUG_ON(failrec->this_mirror != failed_mirror);
2565 failrec->in_validation = 0;
2566 failrec->this_mirror = 0;
2567 }
2568 failrec->failed_mirror = failed_mirror;
2569 failrec->this_mirror++;
2570 if (failrec->this_mirror == failed_mirror)
2571 failrec->this_mirror++;
4a54c8c1
JS
2572 }
2573
facc8a22 2574 if (failrec->this_mirror > num_copies) {
ab8d0fc4
JM
2575 btrfs_debug(fs_info,
2576 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2577 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2578 return false;
4a54c8c1
JS
2579 }
2580
c3cfb656 2581 return true;
2fe6303e
MX
2582}
2583
2584
2585struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2586 struct io_failure_record *failrec,
2587 struct page *page, int pg_offset, int icsum,
8b110e39 2588 bio_end_io_t *endio_func, void *data)
2fe6303e 2589{
0b246afa 2590 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2591 struct bio *bio;
2592 struct btrfs_io_bio *btrfs_failed_bio;
2593 struct btrfs_io_bio *btrfs_bio;
2594
c5e4c3d7 2595 bio = btrfs_io_bio_alloc(1);
2fe6303e 2596 bio->bi_end_io = endio_func;
4f024f37 2597 bio->bi_iter.bi_sector = failrec->logical >> 9;
4f024f37 2598 bio->bi_iter.bi_size = 0;
8b110e39 2599 bio->bi_private = data;
4a54c8c1 2600
facc8a22
MX
2601 btrfs_failed_bio = btrfs_io_bio(failed_bio);
2602 if (btrfs_failed_bio->csum) {
facc8a22
MX
2603 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2604
2605 btrfs_bio = btrfs_io_bio(bio);
2606 btrfs_bio->csum = btrfs_bio->csum_inline;
2fe6303e
MX
2607 icsum *= csum_size;
2608 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
facc8a22
MX
2609 csum_size);
2610 }
2611
2fe6303e
MX
2612 bio_add_page(bio, page, failrec->len, pg_offset);
2613
2614 return bio;
2615}
2616
2617/*
78e62c02
NB
2618 * This is a generic handler for readpage errors. If other copies exist, read
2619 * those and write back good data to the failed position. It does not attempt
2620 * to remap the failed extent elsewhere, hoping the device will be smart
2621 * enough to do this as needed.
2fe6303e 2622 */
2fe6303e
MX
2623static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2624 struct page *page, u64 start, u64 end,
2625 int failed_mirror)
2626{
2627 struct io_failure_record *failrec;
2628 struct inode *inode = page->mapping->host;
2629 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
7870d082 2630 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2fe6303e 2631 struct bio *bio;
70fd7614 2632 int read_mode = 0;
4e4cbee9 2633 blk_status_t status;
2fe6303e 2634 int ret;
8a2ee44a 2635 unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
2fe6303e 2636
1f7ad75b 2637 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e
MX
2638
2639 ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2640 if (ret)
2641 return ret;
2642
a0b60d72 2643 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
c3cfb656 2644 failed_mirror)) {
7870d082 2645 free_io_failure(failure_tree, tree, failrec);
2fe6303e
MX
2646 return -EIO;
2647 }
2648
a0b60d72 2649 if (failed_bio_pages > 1)
70fd7614 2650 read_mode |= REQ_FAILFAST_DEV;
2fe6303e
MX
2651
2652 phy_offset >>= inode->i_sb->s_blocksize_bits;
2653 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2654 start - page_offset(page),
8b110e39
MX
2655 (int)phy_offset, failed_bio->bi_end_io,
2656 NULL);
ebcc3263 2657 bio->bi_opf = REQ_OP_READ | read_mode;
4a54c8c1 2658
ab8d0fc4
JM
2659 btrfs_debug(btrfs_sb(inode->i_sb),
2660 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
2661 read_mode, failrec->this_mirror, failrec->in_validation);
4a54c8c1 2662
8c27cb35 2663 status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
50489a57 2664 failrec->bio_flags);
4e4cbee9 2665 if (status) {
7870d082 2666 free_io_failure(failure_tree, tree, failrec);
6c387ab2 2667 bio_put(bio);
4e4cbee9 2668 ret = blk_status_to_errno(status);
6c387ab2
MX
2669 }
2670
013bd4c3 2671 return ret;
4a54c8c1
JS
2672}
2673
d1310b2e
CM
2674/* lots and lots of room for performance fixes in the end_bio funcs */
2675
b5227c07 2676void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0
JM
2677{
2678 int uptodate = (err == 0);
3e2426bd 2679 int ret = 0;
87826df0 2680
c629732d 2681 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
87826df0 2682
87826df0 2683 if (!uptodate) {
87826df0
JM
2684 ClearPageUptodate(page);
2685 SetPageError(page);
bff5baf8 2686 ret = err < 0 ? err : -EIO;
5dca6eea 2687 mapping_set_error(page->mapping, ret);
87826df0 2688 }
87826df0
JM
2689}
2690
d1310b2e
CM
2691/*
2692 * after a writepage IO is done, we need to:
2693 * clear the uptodate bits on error
2694 * clear the writeback bits in the extent tree for this IO
2695 * end_page_writeback if the page has no more pending IO
2696 *
2697 * Scheduling is not allowed, so the extent state tree is expected
2698 * to have one and only one object corresponding to this IO.
2699 */
4246a0b6 2700static void end_bio_extent_writepage(struct bio *bio)
d1310b2e 2701{
4e4cbee9 2702 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 2703 struct bio_vec *bvec;
d1310b2e
CM
2704 u64 start;
2705 u64 end;
6dc4f100 2706 struct bvec_iter_all iter_all;
d1310b2e 2707
c09abff8 2708 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2709 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2710 struct page *page = bvec->bv_page;
0b246afa
JM
2711 struct inode *inode = page->mapping->host;
2712 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
902b22f3 2713
17a5adcc
AO
2714 /* We always issue full-page writes, but if some block
2715 * in a page fails to be written, blk_update_request() will
2716 * advance bv_offset and adjust bv_len to compensate.
2717 * Print a warning for nonzero offsets, and an error
2718 * if they don't add up to a full page. */
09cbfeaf
KS
2719 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2720 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
0b246afa 2721 btrfs_err(fs_info,
efe120a0
FH
2722 "partial page write in btrfs with offset %u and length %u",
2723 bvec->bv_offset, bvec->bv_len);
2724 else
0b246afa 2725 btrfs_info(fs_info,
5d163e0e 2726 "incomplete page write in btrfs with offset %u and length %u",
efe120a0
FH
2727 bvec->bv_offset, bvec->bv_len);
2728 }
d1310b2e 2729
17a5adcc
AO
2730 start = page_offset(page);
2731 end = start + bvec->bv_offset + bvec->bv_len - 1;
d1310b2e 2732
4e4cbee9 2733 end_extent_writepage(page, error, start, end);
17a5adcc 2734 end_page_writeback(page);
2c30c71b 2735 }
2b1f55b0 2736
d1310b2e 2737 bio_put(bio);
d1310b2e
CM
2738}
2739
883d0de4
MX
2740static void
2741endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2742 int uptodate)
2743{
2744 struct extent_state *cached = NULL;
2745 u64 end = start + len - 1;
2746
2747 if (uptodate && tree->track_uptodate)
2748 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
d810a4be 2749 unlock_extent_cached_atomic(tree, start, end, &cached);
883d0de4
MX
2750}
2751
d1310b2e
CM
2752/*
2753 * after a readpage IO is done, we need to:
2754 * clear the uptodate bits on error
2755 * set the uptodate bits if things worked
2756 * set the page up to date if all extents in the tree are uptodate
2757 * clear the lock bit in the extent tree
2758 * unlock the page if there are no other extents locked for it
2759 *
2760 * Scheduling is not allowed, so the extent state tree is expected
2761 * to have one and only one object corresponding to this IO.
2762 */
4246a0b6 2763static void end_bio_extent_readpage(struct bio *bio)
d1310b2e 2764{
2c30c71b 2765 struct bio_vec *bvec;
4e4cbee9 2766 int uptodate = !bio->bi_status;
facc8a22 2767 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7870d082 2768 struct extent_io_tree *tree, *failure_tree;
facc8a22 2769 u64 offset = 0;
d1310b2e
CM
2770 u64 start;
2771 u64 end;
facc8a22 2772 u64 len;
883d0de4
MX
2773 u64 extent_start = 0;
2774 u64 extent_len = 0;
5cf1ab56 2775 int mirror;
d1310b2e 2776 int ret;
6dc4f100 2777 struct bvec_iter_all iter_all;
d1310b2e 2778
c09abff8 2779 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2780 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2781 struct page *page = bvec->bv_page;
a71754fc 2782 struct inode *inode = page->mapping->host;
ab8d0fc4 2783 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
78e62c02
NB
2784 bool data_inode = btrfs_ino(BTRFS_I(inode))
2785 != BTRFS_BTREE_INODE_OBJECTID;
507903b8 2786
ab8d0fc4
JM
2787 btrfs_debug(fs_info,
2788 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
4e4cbee9 2789 (u64)bio->bi_iter.bi_sector, bio->bi_status,
ab8d0fc4 2790 io_bio->mirror_num);
a71754fc 2791 tree = &BTRFS_I(inode)->io_tree;
7870d082 2792 failure_tree = &BTRFS_I(inode)->io_failure_tree;
902b22f3 2793
17a5adcc
AO
2794 /* We always issue full-page reads, but if some block
2795 * in a page fails to read, blk_update_request() will
2796 * advance bv_offset and adjust bv_len to compensate.
2797 * Print a warning for nonzero offsets, and an error
2798 * if they don't add up to a full page. */
09cbfeaf
KS
2799 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2800 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
ab8d0fc4
JM
2801 btrfs_err(fs_info,
2802 "partial page read in btrfs with offset %u and length %u",
efe120a0
FH
2803 bvec->bv_offset, bvec->bv_len);
2804 else
ab8d0fc4
JM
2805 btrfs_info(fs_info,
2806 "incomplete page read in btrfs with offset %u and length %u",
efe120a0
FH
2807 bvec->bv_offset, bvec->bv_len);
2808 }
d1310b2e 2809
17a5adcc
AO
2810 start = page_offset(page);
2811 end = start + bvec->bv_offset + bvec->bv_len - 1;
facc8a22 2812 len = bvec->bv_len;
d1310b2e 2813
9be3395b 2814 mirror = io_bio->mirror_num;
78e62c02 2815 if (likely(uptodate)) {
facc8a22
MX
2816 ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2817 page, start, end,
2818 mirror);
5ee0844d 2819 if (ret)
d1310b2e 2820 uptodate = 0;
5ee0844d 2821 else
7870d082
JB
2822 clean_io_failure(BTRFS_I(inode)->root->fs_info,
2823 failure_tree, tree, start,
2824 page,
2825 btrfs_ino(BTRFS_I(inode)), 0);
d1310b2e 2826 }
ea466794 2827
f2a09da9
MX
2828 if (likely(uptodate))
2829 goto readpage_ok;
2830
78e62c02 2831 if (data_inode) {
9d0d1c8b 2832
f4a8e656 2833 /*
78e62c02
NB
2834 * The generic bio_readpage_error handles errors the
2835 * following way: If possible, new read requests are
2836 * created and submitted and will end up in
2837 * end_bio_extent_readpage as well (if we're lucky,
2838 * not in the !uptodate case). In that case it returns
2839 * 0 and we just go on with the next page in our bio.
2840 * If it can't handle the error it will return -EIO and
2841 * we remain responsible for that page.
f4a8e656 2842 */
78e62c02
NB
2843 ret = bio_readpage_error(bio, offset, page, start, end,
2844 mirror);
2845 if (ret == 0) {
2846 uptodate = !bio->bi_status;
2847 offset += len;
2848 continue;
2849 }
2850 } else {
2851 struct extent_buffer *eb;
2852
2853 eb = (struct extent_buffer *)page->private;
2854 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
2855 eb->read_mirror = mirror;
2856 atomic_dec(&eb->io_pages);
2857 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
2858 &eb->bflags))
2859 btree_readahead_hook(eb, -EIO);
7e38326f 2860 }
f2a09da9 2861readpage_ok:
883d0de4 2862 if (likely(uptodate)) {
a71754fc 2863 loff_t i_size = i_size_read(inode);
09cbfeaf 2864 pgoff_t end_index = i_size >> PAGE_SHIFT;
a583c026 2865 unsigned off;
a71754fc
JB
2866
2867 /* Zero out the end if this page straddles i_size */
7073017a 2868 off = offset_in_page(i_size);
a583c026 2869 if (page->index == end_index && off)
09cbfeaf 2870 zero_user_segment(page, off, PAGE_SIZE);
17a5adcc 2871 SetPageUptodate(page);
70dec807 2872 } else {
17a5adcc
AO
2873 ClearPageUptodate(page);
2874 SetPageError(page);
70dec807 2875 }
17a5adcc 2876 unlock_page(page);
facc8a22 2877 offset += len;
883d0de4
MX
2878
2879 if (unlikely(!uptodate)) {
2880 if (extent_len) {
2881 endio_readpage_release_extent(tree,
2882 extent_start,
2883 extent_len, 1);
2884 extent_start = 0;
2885 extent_len = 0;
2886 }
2887 endio_readpage_release_extent(tree, start,
2888 end - start + 1, 0);
2889 } else if (!extent_len) {
2890 extent_start = start;
2891 extent_len = end + 1 - start;
2892 } else if (extent_start + extent_len == start) {
2893 extent_len += end + 1 - start;
2894 } else {
2895 endio_readpage_release_extent(tree, extent_start,
2896 extent_len, uptodate);
2897 extent_start = start;
2898 extent_len = end + 1 - start;
2899 }
2c30c71b 2900 }
d1310b2e 2901
883d0de4
MX
2902 if (extent_len)
2903 endio_readpage_release_extent(tree, extent_start, extent_len,
2904 uptodate);
b3a0dd50 2905 btrfs_io_bio_free_csum(io_bio);
d1310b2e 2906 bio_put(bio);
d1310b2e
CM
2907}
2908
9be3395b 2909/*
184f999e
DS
2910 * Initialize the members up to but not including 'bio'. Use after allocating a
2911 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
2912 * 'bio' because use of __GFP_ZERO is not supported.
9be3395b 2913 */
184f999e 2914static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
d1310b2e 2915{
184f999e
DS
2916 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
2917}
d1310b2e 2918
9be3395b 2919/*
6e707bcd
DS
2920 * The following helpers allocate a bio. As they're backed by a bioset, they'll
2921 * never fail. We're returning a bio right now but you can call btrfs_io_bio()
2922 * for the appropriate container_of magic.
9be3395b 2923 */
e749af44 2924struct bio *btrfs_bio_alloc(u64 first_byte)
d1310b2e
CM
2925{
2926 struct bio *bio;
d1310b2e 2927
8ac9f7c1 2928 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
c821e7f3 2929 bio->bi_iter.bi_sector = first_byte >> 9;
184f999e 2930 btrfs_io_bio_init(btrfs_io_bio(bio));
d1310b2e
CM
2931 return bio;
2932}
2933
8b6c1d56 2934struct bio *btrfs_bio_clone(struct bio *bio)
9be3395b 2935{
23ea8e5a
MX
2936 struct btrfs_io_bio *btrfs_bio;
2937 struct bio *new;
9be3395b 2938
6e707bcd 2939 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 2940 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
6e707bcd 2941 btrfs_bio = btrfs_io_bio(new);
184f999e 2942 btrfs_io_bio_init(btrfs_bio);
6e707bcd 2943 btrfs_bio->iter = bio->bi_iter;
23ea8e5a
MX
2944 return new;
2945}
9be3395b 2946
c5e4c3d7 2947struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
9be3395b 2948{
facc8a22
MX
2949 struct bio *bio;
2950
6e707bcd 2951 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 2952 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
184f999e 2953 btrfs_io_bio_init(btrfs_io_bio(bio));
facc8a22 2954 return bio;
9be3395b
CM
2955}
2956
e477094f 2957struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2f8e9140
LB
2958{
2959 struct bio *bio;
2960 struct btrfs_io_bio *btrfs_bio;
2961
2962 /* this will never fail when it's backed by a bioset */
8ac9f7c1 2963 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2f8e9140
LB
2964 ASSERT(bio);
2965
2966 btrfs_bio = btrfs_io_bio(bio);
184f999e 2967 btrfs_io_bio_init(btrfs_bio);
2f8e9140
LB
2968
2969 bio_trim(bio, offset >> 9, size >> 9);
17347cec 2970 btrfs_bio->iter = bio->bi_iter;
2f8e9140
LB
2971 return bio;
2972}
9be3395b 2973
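A hedged sketch of the allocation helpers above: allocate a one-segment bio and set up the basics. The wrapper name and parameters are illustrative; only calls that appear in this file are used.

/* Illustrative only: the helpers above never return NULL because the bios
 * are backed by btrfs_bioset, so no NULL check is needed here. */
static struct bio *example_alloc_read_bio(u64 disk_byte, struct page *page,
					  bio_end_io_t *end_io)
{
	struct bio *bio = btrfs_io_bio_alloc(1);

	/* btrfs_io_bio(bio) reaches the embedding btrfs_io_bio if the
	 * csum/mirror_num/iter fields are needed later. */
	bio->bi_iter.bi_sector = disk_byte >> 9;	/* byte offset -> sector */
	bio->bi_opf = REQ_OP_READ;
	bio->bi_end_io = end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	return bio;
}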
4b81ba48
DS
2974/*
2975 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625
DS
2976 * @wbc: optional writeback control for io accounting
2977 * @page: page to add to the bio
2978 * @pg_offset: offset within @page at which the data is added to the bio
2979 * @size: portion of the page that we want to write
2980 * @offset: logical (disk) byte offset of the IO, used to compute the bio
2981 * sector and to check contiguity with the previous bio
5c2b1fd7 2982 * @bio_ret: must be valid pointer, newly allocated bio will be stored there
b8b3d625
DS
2983 * @end_io_func: end_io callback for new bio
2984 * @mirror_num: desired mirror to read/write
2985 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
2986 * @bio_flags: flags of the current bio to see if we can merge them
4b81ba48 2987 */
0ceb34bf 2988static int submit_extent_page(unsigned int opf,
da2f0f74 2989 struct writeback_control *wbc,
6273b7f8 2990 struct page *page, u64 offset,
6c5a4e2c 2991 size_t size, unsigned long pg_offset,
d1310b2e 2992 struct bio **bio_ret,
f188591e 2993 bio_end_io_t end_io_func,
c8b97818
CM
2994 int mirror_num,
2995 unsigned long prev_bio_flags,
005efedf
FM
2996 unsigned long bio_flags,
2997 bool force_bio_submit)
d1310b2e
CM
2998{
2999 int ret = 0;
3000 struct bio *bio;
09cbfeaf 3001 size_t page_size = min_t(size_t, size, PAGE_SIZE);
6273b7f8 3002 sector_t sector = offset >> 9;
0ceb34bf 3003 struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
d1310b2e 3004
5c2b1fd7
DS
3005 ASSERT(bio_ret);
3006
3007 if (*bio_ret) {
0c8508a6
DS
3008 bool contig;
3009 bool can_merge = true;
3010
d1310b2e 3011 bio = *bio_ret;
0c8508a6 3012 if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
4f024f37 3013 contig = bio->bi_iter.bi_sector == sector;
c8b97818 3014 else
f73a1c7d 3015 contig = bio_end_sector(bio) == sector;
c8b97818 3016
da12fe54
NB
3017 ASSERT(tree->ops);
3018 if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
0c8508a6
DS
3019 can_merge = false;
3020
3021 if (prev_bio_flags != bio_flags || !contig || !can_merge ||
005efedf 3022 force_bio_submit ||
6c5a4e2c 3023 bio_add_page(bio, page, page_size, pg_offset) < page_size) {
1f7ad75b 3024 ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
289454ad
NA
3025 if (ret < 0) {
3026 *bio_ret = NULL;
79787eaa 3027 return ret;
289454ad 3028 }
d1310b2e
CM
3029 bio = NULL;
3030 } else {
da2f0f74 3031 if (wbc)
34e51a5e 3032 wbc_account_cgroup_owner(wbc, page, page_size);
d1310b2e
CM
3033 return 0;
3034 }
3035 }
c8b97818 3036
e749af44 3037 bio = btrfs_bio_alloc(offset);
6c5a4e2c 3038 bio_add_page(bio, page, page_size, pg_offset);
d1310b2e
CM
3039 bio->bi_end_io = end_io_func;
3040 bio->bi_private = tree;
e6959b93 3041 bio->bi_write_hint = page->mapping->host->i_write_hint;
4b81ba48 3042 bio->bi_opf = opf;
da2f0f74 3043 if (wbc) {
429aebc0
DS
3044 struct block_device *bdev;
3045
3046 bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
3047 bio_set_dev(bio, bdev);
da2f0f74 3048 wbc_init_bio(wbc, bio);
34e51a5e 3049 wbc_account_cgroup_owner(wbc, page, page_size);
da2f0f74 3050 }
70dec807 3051
5c2b1fd7 3052 *bio_ret = bio;
d1310b2e
CM
3053
3054 return ret;
3055}
3056
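A hedged sketch of a single-page read submission through the helper above, mirroring how __do_readpage drives it; the wrapper and its parameter choices are illustrative.

/* Illustrative only: submit one full-page read at disk byte 'disk_bytenr'.
 * A NULL wbc is fine for reads; *bio_ret carries the bio across calls so
 * contiguous pages can be merged before the caller issues submit_one_bio(). */
static int example_submit_page_read(struct page *page, u64 disk_bytenr,
				    struct bio **bio_ret,
				    unsigned long *bio_flags)
{
	return submit_extent_page(REQ_OP_READ, NULL, page, disk_bytenr,
				  PAGE_SIZE, 0, bio_ret,
				  end_bio_extent_readpage, 0,
				  *bio_flags, 0, false);
}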
48a3b636
ES
3057static void attach_extent_buffer_page(struct extent_buffer *eb,
3058 struct page *page)
d1310b2e
CM
3059{
3060 if (!PagePrivate(page)) {
3061 SetPagePrivate(page);
09cbfeaf 3062 get_page(page);
4f2de97a
JB
3063 set_page_private(page, (unsigned long)eb);
3064 } else {
3065 WARN_ON(page->private != (unsigned long)eb);
d1310b2e
CM
3066 }
3067}
3068
4f2de97a 3069void set_page_extent_mapped(struct page *page)
d1310b2e 3070{
4f2de97a
JB
3071 if (!PagePrivate(page)) {
3072 SetPagePrivate(page);
09cbfeaf 3073 get_page(page);
4f2de97a
JB
3074 set_page_private(page, EXTENT_PAGE_PRIVATE);
3075 }
d1310b2e
CM
3076}
3077
125bac01
MX
3078static struct extent_map *
3079__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3080 u64 start, u64 len, get_extent_t *get_extent,
3081 struct extent_map **em_cached)
3082{
3083 struct extent_map *em;
3084
3085 if (em_cached && *em_cached) {
3086 em = *em_cached;
cbc0e928 3087 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 3088 start < extent_map_end(em)) {
490b54d6 3089 refcount_inc(&em->refs);
125bac01
MX
3090 return em;
3091 }
3092
3093 free_extent_map(em);
3094 *em_cached = NULL;
3095 }
3096
39b07b5d 3097 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
125bac01
MX
3098 if (em_cached && !IS_ERR_OR_NULL(em)) {
3099 BUG_ON(*em_cached);
490b54d6 3100 refcount_inc(&em->refs);
125bac01
MX
3101 *em_cached = em;
3102 }
3103 return em;
3104}
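A hedged sketch of the em_cached contract above: the caller keeps one cached extent_map pointer across consecutive lookups and drops the final reference when done. The wrapper and loop bounds are illustrative.

/* Illustrative only: reuse one cached extent_map across adjacent lookups,
 * as the readahead path does, and drop the last reference at the end. */
static void example_walk_extent_maps(struct inode *inode, struct page *page,
				     u64 start, u64 len)
{
	struct extent_map *em_cached = NULL;
	u64 cur = start;

	while (cur < start + len) {
		struct extent_map *em;

		em = __get_extent_map(inode, page, 0, cur, start + len - cur,
				      btrfs_get_extent, &em_cached);
		if (IS_ERR_OR_NULL(em))
			break;
		cur = extent_map_end(em);
		free_extent_map(em);	/* drop the per-iteration reference */
	}
	if (em_cached)
		free_extent_map(em_cached);
}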
d1310b2e
CM
3105/*
3106 * basic readpage implementation. Locked extent state structs are inserted
3107 * into the tree and are removed when the IO is done (by the end_io
3108 * handlers).
79787eaa 3109 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 3110 * return 0 on success, otherwise return error
d1310b2e 3111 */
9974090b
MX
3112static int __do_readpage(struct extent_io_tree *tree,
3113 struct page *page,
3114 get_extent_t *get_extent,
125bac01 3115 struct extent_map **em_cached,
9974090b 3116 struct bio **bio, int mirror_num,
f1c77c55 3117 unsigned long *bio_flags, unsigned int read_flags,
005efedf 3118 u64 *prev_em_start)
d1310b2e
CM
3119{
3120 struct inode *inode = page->mapping->host;
4eee4fa4 3121 u64 start = page_offset(page);
8eec8296 3122 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3123 u64 cur = start;
3124 u64 extent_offset;
3125 u64 last_byte = i_size_read(inode);
3126 u64 block_start;
3127 u64 cur_end;
d1310b2e 3128 struct extent_map *em;
baf863b9 3129 int ret = 0;
d1310b2e 3130 int nr = 0;
306e16ce 3131 size_t pg_offset = 0;
d1310b2e 3132 size_t iosize;
c8b97818 3133 size_t disk_io_size;
d1310b2e 3134 size_t blocksize = inode->i_sb->s_blocksize;
7f042a83 3135 unsigned long this_bio_flag = 0;
d1310b2e 3136
ae6957eb
DS
3137 ASSERT(tree == &BTRFS_I(inode)->io_tree);
3138
d1310b2e
CM
3139 set_page_extent_mapped(page);
3140
90a887c9
DM
3141 if (!PageUptodate(page)) {
3142 if (cleancache_get_page(page) == 0) {
3143 BUG_ON(blocksize != PAGE_SIZE);
9974090b 3144 unlock_extent(tree, start, end);
90a887c9
DM
3145 goto out;
3146 }
3147 }
3148
09cbfeaf 3149 if (page->index == last_byte >> PAGE_SHIFT) {
c8b97818 3150 char *userpage;
7073017a 3151 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
3152
3153 if (zero_offset) {
09cbfeaf 3154 iosize = PAGE_SIZE - zero_offset;
7ac687d9 3155 userpage = kmap_atomic(page);
c8b97818
CM
3156 memset(userpage + zero_offset, 0, iosize);
3157 flush_dcache_page(page);
7ac687d9 3158 kunmap_atomic(userpage);
c8b97818
CM
3159 }
3160 }
d1310b2e 3161 while (cur <= end) {
005efedf 3162 bool force_bio_submit = false;
6273b7f8 3163 u64 offset;
c8f2f24b 3164
d1310b2e
CM
3165 if (cur >= last_byte) {
3166 char *userpage;
507903b8
AJ
3167 struct extent_state *cached = NULL;
3168
09cbfeaf 3169 iosize = PAGE_SIZE - pg_offset;
7ac687d9 3170 userpage = kmap_atomic(page);
306e16ce 3171 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3172 flush_dcache_page(page);
7ac687d9 3173 kunmap_atomic(userpage);
d1310b2e 3174 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3175 &cached, GFP_NOFS);
7f042a83 3176 unlock_extent_cached(tree, cur,
e43bbe5e 3177 cur + iosize - 1, &cached);
d1310b2e
CM
3178 break;
3179 }
125bac01
MX
3180 em = __get_extent_map(inode, page, pg_offset, cur,
3181 end - cur + 1, get_extent, em_cached);
c704005d 3182 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3183 SetPageError(page);
7f042a83 3184 unlock_extent(tree, cur, end);
d1310b2e
CM
3185 break;
3186 }
d1310b2e
CM
3187 extent_offset = cur - em->start;
3188 BUG_ON(extent_map_end(em) <= cur);
3189 BUG_ON(end < cur);
3190
261507a0 3191 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4b384318 3192 this_bio_flag |= EXTENT_BIO_COMPRESSED;
261507a0
LZ
3193 extent_set_compress_type(&this_bio_flag,
3194 em->compress_type);
3195 }
c8b97818 3196
d1310b2e
CM
3197 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3198 cur_end = min(extent_map_end(em) - 1, end);
fda2832f 3199 iosize = ALIGN(iosize, blocksize);
c8b97818
CM
3200 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
3201 disk_io_size = em->block_len;
6273b7f8 3202 offset = em->block_start;
c8b97818 3203 } else {
6273b7f8 3204 offset = em->block_start + extent_offset;
c8b97818
CM
3205 disk_io_size = iosize;
3206 }
d1310b2e 3207 block_start = em->block_start;
d899e052
YZ
3208 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3209 block_start = EXTENT_MAP_HOLE;
005efedf
FM
3210
3211 /*
3212 * If we have a file range that points to a compressed extent
3213 * and it's followed by a consecutive file range that points to
3214 * the same compressed extent (possibly with a different
3215 * offset and/or length, so it either points to the whole extent
3216 * or only part of it), we must make sure we do not submit a
3217 * single bio to populate the pages for the 2 ranges because
3218 * this makes the compressed extent read zero out the pages
3219 * belonging to the 2nd range. Imagine the following scenario:
3220 *
3221 * File layout
3222 * [0 - 8K] [8K - 24K]
3223 * | |
3224 * | |
3225 * points to extent X, points to extent X,
3226 * offset 4K, length of 8K offset 0, length 16K
3227 *
3228 * [extent X, compressed length = 4K uncompressed length = 16K]
3229 *
3230 * If the bio to read the compressed extent covers both ranges,
3231 * it will decompress extent X into the pages belonging to the
3232 * first range and then it will stop, zeroing out the remaining
3233 * pages that belong to the other range that points to extent X.
3234 * So here we make sure we submit 2 bios, one for the first
3235 * range and another one for the third range. Both will target
3236 * the same physical extent from disk, but we can't currently
3237 * make the compressed bio endio callback populate the pages
3238 * for both ranges because each compressed bio is tightly
3239 * coupled with a single extent map, and each range can have
3240 * an extent map with a different offset value relative to the
3241 * uncompressed data of our extent and different lengths. This
3242 * is a corner case so we prioritize correctness over
3243 * non-optimal behavior (submitting 2 bios for the same extent).
3244 */
3245 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3246 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 3247 *prev_em_start != em->start)
005efedf
FM
3248 force_bio_submit = true;
3249
3250 if (prev_em_start)
8e928218 3251 *prev_em_start = em->start;
005efedf 3252
d1310b2e
CM
3253 free_extent_map(em);
3254 em = NULL;
3255
3256 /* we've found a hole, just zero and go on */
3257 if (block_start == EXTENT_MAP_HOLE) {
3258 char *userpage;
507903b8
AJ
3259 struct extent_state *cached = NULL;
3260
7ac687d9 3261 userpage = kmap_atomic(page);
306e16ce 3262 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3263 flush_dcache_page(page);
7ac687d9 3264 kunmap_atomic(userpage);
d1310b2e
CM
3265
3266 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3267 &cached, GFP_NOFS);
7f042a83 3268 unlock_extent_cached(tree, cur,
e43bbe5e 3269 cur + iosize - 1, &cached);
d1310b2e 3270 cur = cur + iosize;
306e16ce 3271 pg_offset += iosize;
d1310b2e
CM
3272 continue;
3273 }
3274 /* the get_extent function already copied into the page */
9655d298
CM
3275 if (test_range_bit(tree, cur, cur_end,
3276 EXTENT_UPTODATE, 1, NULL)) {
a1b32a59 3277 check_page_uptodate(tree, page);
7f042a83 3278 unlock_extent(tree, cur, cur + iosize - 1);
d1310b2e 3279 cur = cur + iosize;
306e16ce 3280 pg_offset += iosize;
d1310b2e
CM
3281 continue;
3282 }
70dec807
CM
3283 /* we have an inline extent but it didn't get marked up
3284 * to date. Error out
3285 */
3286 if (block_start == EXTENT_MAP_INLINE) {
3287 SetPageError(page);
7f042a83 3288 unlock_extent(tree, cur, cur + iosize - 1);
70dec807 3289 cur = cur + iosize;
306e16ce 3290 pg_offset += iosize;
70dec807
CM
3291 continue;
3292 }
d1310b2e 3293
0ceb34bf 3294 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
6273b7f8 3295 page, offset, disk_io_size,
fa17ed06 3296 pg_offset, bio,
c8b97818
CM
3297 end_bio_extent_readpage, mirror_num,
3298 *bio_flags,
005efedf
FM
3299 this_bio_flag,
3300 force_bio_submit);
c8f2f24b
JB
3301 if (!ret) {
3302 nr++;
3303 *bio_flags = this_bio_flag;
3304 } else {
d1310b2e 3305 SetPageError(page);
7f042a83 3306 unlock_extent(tree, cur, cur + iosize - 1);
baf863b9 3307 goto out;
edd33c99 3308 }
d1310b2e 3309 cur = cur + iosize;
306e16ce 3310 pg_offset += iosize;
d1310b2e 3311 }
90a887c9 3312out:
d1310b2e
CM
3313 if (!nr) {
3314 if (!PageError(page))
3315 SetPageUptodate(page);
3316 unlock_page(page);
3317 }
baf863b9 3318 return ret;
d1310b2e
CM
3319}
3320
b6660e80 3321static inline void contiguous_readpages(struct page *pages[], int nr_pages,
9974090b 3322 u64 start, u64 end,
125bac01 3323 struct extent_map **em_cached,
d3fac6ba 3324 struct bio **bio,
1f7ad75b 3325 unsigned long *bio_flags,
808f80b4 3326 u64 *prev_em_start)
9974090b 3327{
23d31bd4 3328 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
b6660e80 3329 struct extent_io_tree *tree = &inode->io_tree;
9974090b
MX
3330 int index;
3331
b272ae22 3332 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
3333
3334 for (index = 0; index < nr_pages; index++) {
4ef77695 3335 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
5e9d3982 3336 bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
09cbfeaf 3337 put_page(pages[index]);
9974090b
MX
3338 }
3339}
3340
0d44fea7 3341static int __extent_read_full_page(struct page *page,
9974090b
MX
3342 get_extent_t *get_extent,
3343 struct bio **bio, int mirror_num,
f1c77c55
DS
3344 unsigned long *bio_flags,
3345 unsigned int read_flags)
9974090b 3346{
23d31bd4 3347 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
9974090b 3348 u64 start = page_offset(page);
09cbfeaf 3349 u64 end = start + PAGE_SIZE - 1;
0d44fea7 3350 struct extent_io_tree *tree = &inode->io_tree;
9974090b
MX
3351 int ret;
3352
b272ae22 3353 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b 3354
125bac01 3355 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
1f7ad75b 3356 bio_flags, read_flags, NULL);
9974090b
MX
3357 return ret;
3358}
3359
71ad38b4
DS
3360int extent_read_full_page(struct page *page, get_extent_t *get_extent,
3361 int mirror_num)
d1310b2e
CM
3362{
3363 struct bio *bio = NULL;
c8b97818 3364 unsigned long bio_flags = 0;
d1310b2e
CM
3365 int ret;
3366
0d44fea7 3367 ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
1f7ad75b 3368 &bio_flags, 0);
d1310b2e 3369 if (bio)
1f7ad75b 3370 ret = submit_one_bio(bio, mirror_num, bio_flags);
d1310b2e
CM
3371 return ret;
3372}
d1310b2e 3373
3d4b9496 3374static void update_nr_written(struct writeback_control *wbc,
a9132667 3375 unsigned long nr_written)
11c8349b
CM
3376{
3377 wbc->nr_to_write -= nr_written;
11c8349b
CM
3378}
3379
d1310b2e 3380/*
40f76580
CM
3381 * helper for __extent_writepage, doing all of the delayed allocation setup.
3382 *
5eaad97a 3383 * This returns 1 if the btrfs_run_delalloc_range function did all the work required
40f76580
CM
3384 * to write the page (copy into inline extent). In this case the IO has
3385 * been started and the page is already unlocked.
3386 *
3387 * This returns 0 if all went well (page still locked)
3388 * This returns < 0 if there were errors (page still locked)
d1310b2e 3389 */
40f76580 3390static noinline_for_stack int writepage_delalloc(struct inode *inode,
8cc0237a
NB
3391 struct page *page, struct writeback_control *wbc,
3392 u64 delalloc_start, unsigned long *nr_written)
40f76580 3393{
09cbfeaf 3394 u64 page_end = delalloc_start + PAGE_SIZE - 1;
3522e903 3395 bool found;
40f76580
CM
3396 u64 delalloc_to_write = 0;
3397 u64 delalloc_end = 0;
3398 int ret;
3399 int page_started = 0;
3400
40f76580
CM
3401
3402 while (delalloc_end < page_end) {
9978059b 3403 found = find_lock_delalloc_range(inode, page,
40f76580 3404 &delalloc_start,
917aacec 3405 &delalloc_end);
3522e903 3406 if (!found) {
40f76580
CM
3407 delalloc_start = delalloc_end + 1;
3408 continue;
3409 }
5eaad97a
NB
3410 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3411 delalloc_end, &page_started, nr_written, wbc);
40f76580
CM
3412 if (ret) {
3413 SetPageError(page);
5eaad97a
NB
3414 /*
3415 * btrfs_run_delalloc_range should return < 0 on error, but just
3416 * in case it returns > 0, treat that as the IO having been
3417 * started; we must not propagate a value > 0 unless things
3418 * actually went well.
40f76580
CM
3419 */
3420 ret = ret < 0 ? ret : -EIO;
3421 goto done;
3422 }
3423 /*
ea1754a0
KS
3424 * delalloc_end is already one less than the total length, so
3425 * we don't subtract one from PAGE_SIZE
40f76580
CM
3426 */
3427 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 3428 PAGE_SIZE) >> PAGE_SHIFT;
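 /*
 * e.g. with 4K pages, delalloc_start == 0 and delalloc_end == 8191
 * gives (8191 - 0 + 4096) >> 12 == 2 pages of delalloc to write.
 */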
40f76580
CM
3429 delalloc_start = delalloc_end + 1;
3430 }
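 /*
 * If the delalloc work we just started needs more pages written than
 * nr_to_write currently allows, let nr_to_write track that range:
 * use the full page count while the range is below 2 * 8192 pages,
 * otherwise cap it at 8192 pages.
 */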
3431 if (wbc->nr_to_write < delalloc_to_write) {
3432 int thresh = 8192;
3433
3434 if (delalloc_to_write < thresh * 2)
3435 thresh = delalloc_to_write;
3436 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3437 thresh);
3438 }
3439
3440 /* did btrfs_run_delalloc_range already unlock and start
3441 * the IO?
3442 */
3443 if (page_started) {
3444 /*
3445 * we've unlocked the page, so we can't update
3446 * the mapping's writeback index, just update
3447 * nr_to_write.
3448 */
3449 wbc->nr_to_write -= *nr_written;
3450 return 1;
3451 }
3452
3453 ret = 0;
3454
3455done:
3456 return ret;
3457}
3458
3459/*
3460 * helper for __extent_writepage. This calls the writepage start hooks,
3461 * and does the loop to map the page into extents and bios.
3462 *
3463 * We return 1 if the IO is started and the page is unlocked,
3464 * 0 if all went well (page still locked)
3465 * < 0 if there were errors (page still locked)
3466 */
3467static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3468 struct page *page,
3469 struct writeback_control *wbc,
3470 struct extent_page_data *epd,
3471 loff_t i_size,
3472 unsigned long nr_written,
57e5ffeb 3473 int *nr_ret)
d1310b2e 3474{
45b08405 3475 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
4eee4fa4 3476 u64 start = page_offset(page);
09cbfeaf 3477 u64 page_end = start + PAGE_SIZE - 1;
d1310b2e
CM
3478 u64 end;
3479 u64 cur = start;
3480 u64 extent_offset;
d1310b2e
CM
3481 u64 block_start;
3482 u64 iosize;
d1310b2e 3483 struct extent_map *em;
7f3c74fb 3484 size_t pg_offset = 0;
d1310b2e 3485 size_t blocksize;
40f76580
CM
3486 int ret = 0;
3487 int nr = 0;
57e5ffeb 3488 const unsigned int write_flags = wbc_to_write_flags(wbc);
40f76580 3489 bool compressed;
c8b97818 3490
d75855b4
NB
3491 ret = btrfs_writepage_cow_fixup(page, start, page_end);
3492 if (ret) {
3493 /* Fixup worker will requeue */
5ab58055 3494 redirty_page_for_writepage(wbc, page);
d75855b4
NB
3495 update_nr_written(wbc, nr_written);
3496 unlock_page(page);
3497 return 1;
247e743c
CM
3498 }
3499
11c8349b
CM
3500 /*
3501 * we don't want to touch the inode after unlocking the page,
3502 * so we update the mapping writeback index now
3503 */
3d4b9496 3504 update_nr_written(wbc, nr_written + 1);
771ed689 3505
d1310b2e 3506 end = page_end;
d1310b2e
CM
3507 blocksize = inode->i_sb->s_blocksize;
3508
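 /*
 * Walk the page one block at a time, mapping each range to an extent
 * and submitting it for write; compressed, inline and hole extents
 * are handled through other paths in the FS.
 */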
3509 while (cur <= end) {
40f76580 3510 u64 em_end;
6273b7f8 3511 u64 offset;
58409edd 3512
40f76580 3513 if (cur >= i_size) {
7087a9d8 3514 btrfs_writepage_endio_finish_ordered(page, cur,
c629732d 3515 page_end, 1);
d1310b2e
CM
3516 break;
3517 }
39b07b5d
OS
3518 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
3519 end - cur + 1);
c704005d 3520 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3521 SetPageError(page);
61391d56 3522 ret = PTR_ERR_OR_ZERO(em);
d1310b2e
CM
3523 break;
3524 }
3525
3526 extent_offset = cur - em->start;
40f76580
CM
3527 em_end = extent_map_end(em);
3528 BUG_ON(em_end <= cur);
d1310b2e 3529 BUG_ON(end < cur);
40f76580 3530 iosize = min(em_end - cur, end - cur + 1);
fda2832f 3531 iosize = ALIGN(iosize, blocksize);
6273b7f8 3532 offset = em->block_start + extent_offset;
d1310b2e 3533 block_start = em->block_start;
c8b97818 3534 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
d1310b2e
CM
3535 free_extent_map(em);
3536 em = NULL;
3537
c8b97818
CM
3538 /*
3539 * compressed and inline extents are written through other
3540 * paths in the FS
3541 */
3542 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 3543 block_start == EXTENT_MAP_INLINE) {
c8b04030 3544 if (compressed)
c8b97818 3545 nr++;
c8b04030
OS
3546 else
3547 btrfs_writepage_endio_finish_ordered(page, cur,
3548 cur + iosize - 1, 1);
c8b97818 3549 cur += iosize;
7f3c74fb 3550 pg_offset += iosize;
d1310b2e
CM
3551 continue;
3552 }
c8b97818 3553
5cdc84bf 3554 btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
58409edd
DS
3555 if (!PageWriteback(page)) {
3556 btrfs_err(BTRFS_I(inode)->root->fs_info,
3557 "page %lu not writeback, cur %llu end %llu",
3558 page->index, cur, end);
d1310b2e 3559 }
7f3c74fb 3560
0ceb34bf 3561 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
6273b7f8 3562 page, offset, iosize, pg_offset,
fa17ed06 3563 &epd->bio,
58409edd
DS
3564 end_bio_extent_writepage,
3565 0, 0, 0, false);
fe01aa65 3566 if (ret) {
58409edd 3567 SetPageError(page);
fe01aa65
TK
3568 if (PageWriteback(page))
3569 end_page_writeback(page);
3570 }
d1310b2e 3571
d1310b2e 3572 cur = cur + iosize;
7f3c74fb 3573 pg_offset += iosize;
d1310b2e
CM
3574 nr++;
3575 }
40f76580 3576 *nr_ret = nr;
40f76580
CM
3577 return ret;
3578}
3579
3580/*
3581 * the writepage semantics are similar to regular writepage. extent
3582 * records are inserted to lock ranges in the tree, and as dirty areas
3583 * are found, they are marked writeback. Then the lock bits are removed
3584 * and the end_io handler clears the writeback ranges
3065976b
QW
3585 *
3586 * Return 0 if everything goes well.
3587 * Return <0 for error.
40f76580
CM
3588 */
3589static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 3590 struct extent_page_data *epd)
40f76580
CM
3591{
3592 struct inode *inode = page->mapping->host;
40f76580 3593 u64 start = page_offset(page);
09cbfeaf 3594 u64 page_end = start + PAGE_SIZE - 1;
40f76580
CM
3595 int ret;
3596 int nr = 0;
eb70d222 3597 size_t pg_offset;
40f76580 3598 loff_t i_size = i_size_read(inode);
09cbfeaf 3599 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580
CM
3600 unsigned long nr_written = 0;
3601
40f76580
CM
3602 trace___extent_writepage(page, inode, wbc);
3603
3604 WARN_ON(!PageLocked(page));
3605
3606 ClearPageError(page);
3607
7073017a 3608 pg_offset = offset_in_page(i_size);
40f76580
CM
3609 if (page->index > end_index ||
3610 (page->index == end_index && !pg_offset)) {
09cbfeaf 3611 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
40f76580
CM
3612 unlock_page(page);
3613 return 0;
3614 }
3615
3616 if (page->index == end_index) {
3617 char *userpage;
3618
3619 userpage = kmap_atomic(page);
3620 memset(userpage + pg_offset, 0,
09cbfeaf 3621 PAGE_SIZE - pg_offset);
40f76580
CM
3622 kunmap_atomic(userpage);
3623 flush_dcache_page(page);
3624 }
3625
40f76580
CM
3626 set_page_extent_mapped(page);
3627
7789a55a 3628 if (!epd->extent_locked) {
8cc0237a 3629 ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
7789a55a 3630 if (ret == 1)
169d2c87 3631 return 0;
7789a55a
NB
3632 if (ret)
3633 goto done;
3634 }
40f76580
CM
3635
3636 ret = __extent_writepage_io(inode, page, wbc, epd,
57e5ffeb 3637 i_size, nr_written, &nr);
40f76580 3638 if (ret == 1)
169d2c87 3639 return 0;
40f76580 3640
d1310b2e
CM
3641done:
3642 if (nr == 0) {
3643 /* make sure the mapping tag for page dirty gets cleared */
3644 set_page_writeback(page);
3645 end_page_writeback(page);
3646 }
61391d56
FM
3647 if (PageError(page)) {
3648 ret = ret < 0 ? ret : -EIO;
3649 end_extent_writepage(page, ret, start, page_end);
3650 }
d1310b2e 3651 unlock_page(page);
3065976b 3652 ASSERT(ret <= 0);
40f76580 3653 return ret;
d1310b2e
CM
3654}
3655
fd8b2b61 3656void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 3657{
74316201
N
3658 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3659 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
3660}
3661
18dfa711
FM
3662static void end_extent_buffer_writeback(struct extent_buffer *eb)
3663{
3664 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3665 smp_mb__after_atomic();
3666 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3667}
3668
2e3c2513
QW
3669/*
3670 * Lock eb pages and flush the bio if we can't get the locks
3671 *
3672 * Return 0 if nothing went wrong
3673 * Return >0 is the same as 0, except the bio is not submitted
3674 * Return <0 if something went wrong; no page is locked
3675 */
9df76fb5 3676static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 3677 struct extent_page_data *epd)
0b32f4bb 3678{
9df76fb5 3679 struct btrfs_fs_info *fs_info = eb->fs_info;
2e3c2513 3680 int i, num_pages, failed_page_nr;
0b32f4bb
JB
3681 int flush = 0;
3682 int ret = 0;
3683
3684 if (!btrfs_try_tree_write_lock(eb)) {
f4340622 3685 ret = flush_write_bio(epd);
2e3c2513
QW
3686 if (ret < 0)
3687 return ret;
3688 flush = 1;
0b32f4bb
JB
3689 btrfs_tree_lock(eb);
3690 }
3691
3692 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3693 btrfs_tree_unlock(eb);
3694 if (!epd->sync_io)
3695 return 0;
3696 if (!flush) {
f4340622 3697 ret = flush_write_bio(epd);
2e3c2513
QW
3698 if (ret < 0)
3699 return ret;
0b32f4bb
JB
3700 flush = 1;
3701 }
a098d8e8
CM
3702 while (1) {
3703 wait_on_extent_buffer_writeback(eb);
3704 btrfs_tree_lock(eb);
3705 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3706 break;
0b32f4bb 3707 btrfs_tree_unlock(eb);
0b32f4bb
JB
3708 }
3709 }
3710
51561ffe
JB
3711 /*
3712 * We need to do this to prevent races with anyone checking whether the eb is
3713 * under IO since we can end up having no IO bits set for a short period
3714 * of time.
3715 */
3716 spin_lock(&eb->refs_lock);
0b32f4bb
JB
3717 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3718 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 3719 spin_unlock(&eb->refs_lock);
0b32f4bb 3720 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
3721 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3722 -eb->len,
3723 fs_info->dirty_metadata_batch);
0b32f4bb 3724 ret = 1;
51561ffe
JB
3725 } else {
3726 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
3727 }
3728
3729 btrfs_tree_unlock(eb);
3730
3731 if (!ret)
3732 return ret;
3733
65ad0104 3734 num_pages = num_extent_pages(eb);
0b32f4bb 3735 for (i = 0; i < num_pages; i++) {
fb85fc9a 3736 struct page *p = eb->pages[i];
0b32f4bb
JB
3737
3738 if (!trylock_page(p)) {
3739 if (!flush) {
18dfa711
FM
3740 int err;
3741
3742 err = flush_write_bio(epd);
3743 if (err < 0) {
3744 ret = err;
2e3c2513
QW
3745 failed_page_nr = i;
3746 goto err_unlock;
3747 }
0b32f4bb
JB
3748 flush = 1;
3749 }
3750 lock_page(p);
3751 }
3752 }
3753
3754 return ret;
2e3c2513
QW
3755err_unlock:
3756 /* Unlock already locked pages */
3757 for (i = 0; i < failed_page_nr; i++)
3758 unlock_page(eb->pages[i]);
18dfa711
FM
3759 /*
3760 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
3761 * Also set back EXTENT_BUFFER_DIRTY so that future attempts to write
3762 * this eb can be made, and undo everything done before.
3763 */
3764 btrfs_tree_lock(eb);
3765 spin_lock(&eb->refs_lock);
3766 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3767 end_extent_buffer_writeback(eb);
3768 spin_unlock(&eb->refs_lock);
3769 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
3770 fs_info->dirty_metadata_batch);
3771 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3772 btrfs_tree_unlock(eb);
2e3c2513 3773 return ret;
0b32f4bb
JB
3774}
3775
656f30db
FM
3776static void set_btree_ioerr(struct page *page)
3777{
3778 struct extent_buffer *eb = (struct extent_buffer *)page->private;
eb5b64f1 3779 struct btrfs_fs_info *fs_info;
656f30db
FM
3780
3781 SetPageError(page);
3782 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3783 return;
3784
eb5b64f1
DZ
3785 /*
3786 * If we error out, we should add back the dirty_metadata_bytes
3787 * to make it consistent.
3788 */
3789 fs_info = eb->fs_info;
3790 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3791 eb->len, fs_info->dirty_metadata_batch);
3792
656f30db
FM
3793 /*
3794 * If writeback for a btree extent that doesn't belong to a log tree
3795 * failed, increment the counter transaction->eb_write_errors.
3796 * We do this because while the transaction is running and before it's
3797 * committing (when we call filemap_fdata[write|wait]_range against
3798 * the btree inode), we might have
3799 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3800 * returns an error or an error happens during writeback, when we're
3801 * committing the transaction we wouldn't know about it, since the pages
3802 * may no longer be dirty nor marked for writeback (if a
3803 * subsequent modification to the extent buffer didn't happen before the
3804 * transaction commit), which makes filemap_fdata[write|wait]_range not
3805 * able to find the pages tagged with SetPageError at transaction
3806 * commit time. So if this happens we must abort the transaction,
3807 * otherwise we commit a super block with btree roots that point to
3808 * btree nodes/leafs whose content on disk is invalid - either garbage
3809 * or the content of some node/leaf from a past generation that got
3810 * cowed or deleted and is no longer valid.
3811 *
3812 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3813 * not be enough - we need to distinguish between log tree extents vs
3814 * non-log tree extents, and the next filemap_fdatawait_range() call
3815 * will catch and clear such errors in the mapping - and that call might
3816 * be from a log sync and not from a transaction commit. Also, checking
3817 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3818 * not done and would not be reliable - the eb might have been released
3819 * from memory and reading it back again means that flag would not be
3820 * set (since it's a runtime flag, not persisted on disk).
3821 *
3822 * Using the flags below in the btree inode also covers the case where
3823 * writepages() returns success after having started writeback for all
3824 * dirty pages, but by the time filemap_fdatawait_range() is called the
3825 * writeback for all those pages has already finished with errors -
3826 * because we were not using AS_EIO/AS_ENOSPC,
3827 * filemap_fdatawait_range() would return success, as it could not know
3828 * that writeback errors happened (the pages were no longer tagged for
3829 * writeback).
3830 */
3831 switch (eb->log_index) {
3832 case -1:
afcdd129 3833 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
656f30db
FM
3834 break;
3835 case 0:
afcdd129 3836 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
656f30db
FM
3837 break;
3838 case 1:
afcdd129 3839 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
656f30db
FM
3840 break;
3841 default:
3842 BUG(); /* unexpected, logic error */
3843 }
3844}
3845
4246a0b6 3846static void end_bio_extent_buffer_writepage(struct bio *bio)
0b32f4bb 3847{
2c30c71b 3848 struct bio_vec *bvec;
0b32f4bb 3849 struct extent_buffer *eb;
2b070cfe 3850 int done;
6dc4f100 3851 struct bvec_iter_all iter_all;
0b32f4bb 3852
c09abff8 3853 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 3854 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
3855 struct page *page = bvec->bv_page;
3856
0b32f4bb
JB
3857 eb = (struct extent_buffer *)page->private;
3858 BUG_ON(!eb);
3859 done = atomic_dec_and_test(&eb->io_pages);
3860
4e4cbee9 3861 if (bio->bi_status ||
4246a0b6 3862 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 3863 ClearPageUptodate(page);
656f30db 3864 set_btree_ioerr(page);
0b32f4bb
JB
3865 }
3866
3867 end_page_writeback(page);
3868
3869 if (!done)
3870 continue;
3871
3872 end_extent_buffer_writeback(eb);
2c30c71b 3873 }
0b32f4bb
JB
3874
3875 bio_put(bio);
0b32f4bb
JB
3876}
3877
0e378df1 3878static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
3879 struct writeback_control *wbc,
3880 struct extent_page_data *epd)
3881{
0b32f4bb 3882 u64 offset = eb->start;
851cd173 3883 u32 nritems;
cc5e31a4 3884 int i, num_pages;
851cd173 3885 unsigned long start, end;
ff40adf7 3886 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
d7dbe9e7 3887 int ret = 0;
0b32f4bb 3888
656f30db 3889 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
65ad0104 3890 num_pages = num_extent_pages(eb);
0b32f4bb 3891 atomic_set(&eb->io_pages, num_pages);
de0022b9 3892
851cd173
LB
3893 /* set btree blocks beyond nritems to 0 to avoid stale content. */
3894 nritems = btrfs_header_nritems(eb);
3eb548ee 3895 if (btrfs_header_level(eb) > 0) {
3eb548ee
LB
3896 end = btrfs_node_key_ptr_offset(nritems);
3897
b159fa28 3898 memzero_extent_buffer(eb, end, eb->len - end);
851cd173
LB
3899 } else {
3900 /*
3901 * leaf:
3902 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3903 */
3904 start = btrfs_item_nr_offset(nritems);
8f881e8c 3905 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
b159fa28 3906 memzero_extent_buffer(eb, start, end - start);
3eb548ee
LB
3907 }
3908
0b32f4bb 3909 for (i = 0; i < num_pages; i++) {
fb85fc9a 3910 struct page *p = eb->pages[i];
0b32f4bb
JB
3911
3912 clear_page_dirty_for_io(p);
3913 set_page_writeback(p);
0ceb34bf 3914 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
fa17ed06 3915 p, offset, PAGE_SIZE, 0,
c2df8bb4 3916 &epd->bio,
1f7ad75b 3917 end_bio_extent_buffer_writepage,
18fdc679 3918 0, 0, 0, false);
0b32f4bb 3919 if (ret) {
656f30db 3920 set_btree_ioerr(p);
fe01aa65
TK
3921 if (PageWriteback(p))
3922 end_page_writeback(p);
0b32f4bb
JB
3923 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3924 end_extent_buffer_writeback(eb);
3925 ret = -EIO;
3926 break;
3927 }
09cbfeaf 3928 offset += PAGE_SIZE;
3d4b9496 3929 update_nr_written(wbc, 1);
0b32f4bb
JB
3930 unlock_page(p);
3931 }
3932
3933 if (unlikely(ret)) {
3934 for (; i < num_pages; i++) {
bbf65cf0 3935 struct page *p = eb->pages[i];
81465028 3936 clear_page_dirty_for_io(p);
0b32f4bb
JB
3937 unlock_page(p);
3938 }
3939 }
3940
3941 return ret;
3942}
3943
3944int btree_write_cache_pages(struct address_space *mapping,
3945 struct writeback_control *wbc)
3946{
0b32f4bb
JB
3947 struct extent_buffer *eb, *prev_eb = NULL;
3948 struct extent_page_data epd = {
3949 .bio = NULL,
0b32f4bb
JB
3950 .extent_locked = 0,
3951 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3952 };
3953 int ret = 0;
3954 int done = 0;
3955 int nr_to_write_done = 0;
3956 struct pagevec pvec;
3957 int nr_pages;
3958 pgoff_t index;
3959 pgoff_t end; /* Inclusive */
3960 int scanned = 0;
10bbd235 3961 xa_mark_t tag;
0b32f4bb 3962
86679820 3963 pagevec_init(&pvec);
0b32f4bb
JB
3964 if (wbc->range_cyclic) {
3965 index = mapping->writeback_index; /* Start from prev offset */
3966 end = -1;
556755a8
JB
3967 /*
3968 * Starting from the beginning means we do not need to cycle over
3969 * the whole range again, so mark it as scanned.
3970 */
3971 scanned = (index == 0);
0b32f4bb 3972 } else {
09cbfeaf
KS
3973 index = wbc->range_start >> PAGE_SHIFT;
3974 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
3975 scanned = 1;
3976 }
3977 if (wbc->sync_mode == WB_SYNC_ALL)
3978 tag = PAGECACHE_TAG_TOWRITE;
3979 else
3980 tag = PAGECACHE_TAG_DIRTY;
3981retry:
3982 if (wbc->sync_mode == WB_SYNC_ALL)
3983 tag_pages_for_writeback(mapping, index, end);
3984 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 3985 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 3986 tag))) {
0b32f4bb
JB
3987 unsigned i;
3988
0b32f4bb
JB
3989 for (i = 0; i < nr_pages; i++) {
3990 struct page *page = pvec.pages[i];
3991
3992 if (!PagePrivate(page))
3993 continue;
3994
b5bae261
JB
3995 spin_lock(&mapping->private_lock);
3996 if (!PagePrivate(page)) {
3997 spin_unlock(&mapping->private_lock);
3998 continue;
3999 }
4000
0b32f4bb 4001 eb = (struct extent_buffer *)page->private;
b5bae261
JB
4002
4003 /*
4004 * Shouldn't happen and normally this would be a BUG_ON
4005 * but there's no sense in crashing the user's box for something
4006 * we can survive anyway.
4007 */
fae7f21c 4008 if (WARN_ON(!eb)) {
b5bae261 4009 spin_unlock(&mapping->private_lock);
0b32f4bb
JB
4010 continue;
4011 }
4012
b5bae261
JB
4013 if (eb == prev_eb) {
4014 spin_unlock(&mapping->private_lock);
0b32f4bb 4015 continue;
b5bae261 4016 }
0b32f4bb 4017
b5bae261
JB
4018 ret = atomic_inc_not_zero(&eb->refs);
4019 spin_unlock(&mapping->private_lock);
4020 if (!ret)
0b32f4bb 4021 continue;
0b32f4bb
JB
4022
4023 prev_eb = eb;
9df76fb5 4024 ret = lock_extent_buffer_for_io(eb, &epd);
0b32f4bb
JB
4025 if (!ret) {
4026 free_extent_buffer(eb);
4027 continue;
0607eb1d
FM
4028 } else if (ret < 0) {
4029 done = 1;
4030 free_extent_buffer(eb);
4031 break;
0b32f4bb
JB
4032 }
4033
0ab02063 4034 ret = write_one_eb(eb, wbc, &epd);
0b32f4bb
JB
4035 if (ret) {
4036 done = 1;
4037 free_extent_buffer(eb);
4038 break;
4039 }
4040 free_extent_buffer(eb);
4041
4042 /*
4043 * the filesystem may choose to bump up nr_to_write.
4044 * We have to make sure to honor the new nr_to_write
4045 * at any time
4046 */
4047 nr_to_write_done = wbc->nr_to_write <= 0;
4048 }
4049 pagevec_release(&pvec);
4050 cond_resched();
4051 }
4052 if (!scanned && !done) {
4053 /*
4054 * We hit the last page and there is more work to be done: wrap
4055 * back to the start of the file
4056 */
4057 scanned = 1;
4058 index = 0;
4059 goto retry;
4060 }
2b952eea
QW
4061 ASSERT(ret <= 0);
4062 if (ret < 0) {
4063 end_write_bio(&epd, ret);
4064 return ret;
4065 }
4066 ret = flush_write_bio(&epd);
0b32f4bb
JB
4067 return ret;
4068}
4069
d1310b2e 4070/**
4bef0848 4071 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
d1310b2e
CM
4072 * @mapping: address space structure to write
4073 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
935db853 4074 * @epd: extent_page_data passed on to the __extent_writepage function
d1310b2e
CM
4075 *
4076 * If a page is already under I/O, write_cache_pages() skips it, even
4077 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4078 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4079 * and msync() need to guarantee that all the data which was dirty at the time
4080 * the call was made get new I/O started against them. If wbc->sync_mode is
4081 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4082 * existing IO to complete.
4083 */
4242b64a 4084static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 4085 struct writeback_control *wbc,
aab6e9ed 4086 struct extent_page_data *epd)
d1310b2e 4087{
7fd1a3f7 4088 struct inode *inode = mapping->host;
d1310b2e
CM
4089 int ret = 0;
4090 int done = 0;
f85d7d6c 4091 int nr_to_write_done = 0;
d1310b2e
CM
4092 struct pagevec pvec;
4093 int nr_pages;
4094 pgoff_t index;
4095 pgoff_t end; /* Inclusive */
a9132667
LB
4096 pgoff_t done_index;
4097 int range_whole = 0;
d1310b2e 4098 int scanned = 0;
10bbd235 4099 xa_mark_t tag;
d1310b2e 4100
7fd1a3f7
JB
4101 /*
4102 * We have to hold onto the inode so that ordered extents can do their
4103 * work when the IO finishes. The alternative to this is failing to add
4104 * an ordered extent if the igrab() fails there and that is a huge pain
4105 * to deal with, so instead just hold onto the inode throughout the
4106 * writepages operation. If it fails here we are freeing up the inode
4107 * anyway and we'd rather not waste our time writing out stuff that is
4108 * going to be truncated anyway.
4109 */
4110 if (!igrab(inode))
4111 return 0;
4112
86679820 4113 pagevec_init(&pvec);
d1310b2e
CM
4114 if (wbc->range_cyclic) {
4115 index = mapping->writeback_index; /* Start from prev offset */
4116 end = -1;
556755a8
JB
4117 /*
4118 * Starting from the beginning means we do not need to cycle over
4119 * the whole range again, so mark it as scanned.
4120 */
4121 scanned = (index == 0);
d1310b2e 4122 } else {
09cbfeaf
KS
4123 index = wbc->range_start >> PAGE_SHIFT;
4124 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
4125 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4126 range_whole = 1;
d1310b2e
CM
4127 scanned = 1;
4128 }
3cd24c69
EL
4129
4130 /*
4131 * We do the tagged writepage as long as the snapshot flush bit is set
4132 * and we are the first one to do the filemap_flush() on this inode.
4133 *
4134 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4135 * not race in and drop the bit.
4136 */
4137 if (range_whole && wbc->nr_to_write == LONG_MAX &&
4138 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4139 &BTRFS_I(inode)->runtime_flags))
4140 wbc->tagged_writepages = 1;
4141
4142 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
4143 tag = PAGECACHE_TAG_TOWRITE;
4144 else
4145 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 4146retry:
3cd24c69 4147 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 4148 tag_pages_for_writeback(mapping, index, end);
a9132667 4149 done_index = index;
f85d7d6c 4150 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
4151 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4152 &index, end, tag))) {
d1310b2e
CM
4153 unsigned i;
4154
d1310b2e
CM
4155 for (i = 0; i < nr_pages; i++) {
4156 struct page *page = pvec.pages[i];
4157
f7bddf1e 4158 done_index = page->index + 1;
d1310b2e 4159 /*
b93b0163
MW
4160 * At this point we hold neither the i_pages lock nor
4161 * the page lock: the page may be truncated or
4162 * invalidated (changing page->mapping to NULL),
4163 * or even swizzled back from swapper_space to
4164 * tmpfs file mapping
d1310b2e 4165 */
c8f2f24b 4166 if (!trylock_page(page)) {
f4340622
QW
4167 ret = flush_write_bio(epd);
4168 BUG_ON(ret < 0);
c8f2f24b 4169 lock_page(page);
01d658f2 4170 }
d1310b2e
CM
4171
4172 if (unlikely(page->mapping != mapping)) {
4173 unlock_page(page);
4174 continue;
4175 }
4176
d2c3f4f6 4177 if (wbc->sync_mode != WB_SYNC_NONE) {
f4340622
QW
4178 if (PageWriteback(page)) {
4179 ret = flush_write_bio(epd);
4180 BUG_ON(ret < 0);
4181 }
d1310b2e 4182 wait_on_page_writeback(page);
d2c3f4f6 4183 }
d1310b2e
CM
4184
4185 if (PageWriteback(page) ||
4186 !clear_page_dirty_for_io(page)) {
4187 unlock_page(page);
4188 continue;
4189 }
4190
aab6e9ed 4191 ret = __extent_writepage(page, wbc, epd);
a9132667 4192 if (ret < 0) {
a9132667
LB
4193 done = 1;
4194 break;
4195 }
f85d7d6c
CM
4196
4197 /*
4198 * the filesystem may choose to bump up nr_to_write.
4199 * We have to make sure to honor the new nr_to_write
4200 * at any time
4201 */
4202 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
4203 }
4204 pagevec_release(&pvec);
4205 cond_resched();
4206 }
894b36e3 4207 if (!scanned && !done) {
d1310b2e
CM
4208 /*
4209 * We hit the last page and there is more work to be done: wrap
4210 * back to the start of the file
4211 */
4212 scanned = 1;
4213 index = 0;
42ffb0bf
JB
4214
4215 /*
4216 * If we're looping we could run into a page that is locked by a
4217 * writer and that writer could be waiting on writeback for a
4218 * page in our current bio, and thus deadlock, so flush the
4219 * write bio here.
4220 */
4221 ret = flush_write_bio(epd);
4222 if (!ret)
4223 goto retry;
d1310b2e 4224 }
a9132667
LB
4225
4226 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4227 mapping->writeback_index = done_index;
4228
7fd1a3f7 4229 btrfs_add_delayed_iput(inode);
894b36e3 4230 return ret;
d1310b2e 4231}
d1310b2e 4232
0a9b0e53 4233int extent_write_full_page(struct page *page, struct writeback_control *wbc)
d1310b2e
CM
4234{
4235 int ret;
d1310b2e
CM
4236 struct extent_page_data epd = {
4237 .bio = NULL,
771ed689 4238 .extent_locked = 0,
ffbd517d 4239 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e 4240 };
d1310b2e 4241
d1310b2e 4242 ret = __extent_writepage(page, wbc, &epd);
3065976b
QW
4243 ASSERT(ret <= 0);
4244 if (ret < 0) {
4245 end_write_bio(&epd, ret);
4246 return ret;
4247 }
d1310b2e 4248
3065976b
QW
4249 ret = flush_write_bio(&epd);
4250 ASSERT(ret <= 0);
d1310b2e
CM
4251 return ret;
4252}
d1310b2e 4253
5e3ee236 4254int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
771ed689
CM
4255 int mode)
4256{
4257 int ret = 0;
4258 struct address_space *mapping = inode->i_mapping;
4259 struct page *page;
09cbfeaf
KS
4260 unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4261 PAGE_SHIFT;
771ed689
CM
4262
4263 struct extent_page_data epd = {
4264 .bio = NULL,
771ed689 4265 .extent_locked = 1,
ffbd517d 4266 .sync_io = mode == WB_SYNC_ALL,
771ed689
CM
4267 };
4268 struct writeback_control wbc_writepages = {
771ed689 4269 .sync_mode = mode,
771ed689
CM
4270 .nr_to_write = nr_pages * 2,
4271 .range_start = start,
4272 .range_end = end + 1,
ec39f769
CM
4273 /* We're called from an async helper function */
4274 .punt_to_cgroup = 1,
4275 .no_cgroup_owner = 1,
771ed689
CM
4276 };
4277
dbb70bec 4278 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
d397712b 4279 while (start <= end) {
09cbfeaf 4280 page = find_get_page(mapping, start >> PAGE_SHIFT);
771ed689
CM
4281 if (clear_page_dirty_for_io(page))
4282 ret = __extent_writepage(page, &wbc_writepages, &epd);
4283 else {
7087a9d8 4284 btrfs_writepage_endio_finish_ordered(page, start,
c629732d 4285 start + PAGE_SIZE - 1, 1);
771ed689
CM
4286 unlock_page(page);
4287 }
09cbfeaf
KS
4288 put_page(page);
4289 start += PAGE_SIZE;
771ed689
CM
4290 }
4291
02c6db4f 4292 ASSERT(ret <= 0);
dbb70bec
CM
4293 if (ret == 0)
4294 ret = flush_write_bio(&epd);
4295 else
02c6db4f 4296 end_write_bio(&epd, ret);
dbb70bec
CM
4297
4298 wbc_detach_inode(&wbc_writepages);
771ed689
CM
4299 return ret;
4300}
d1310b2e 4301
8ae225a8 4302int extent_writepages(struct address_space *mapping,
d1310b2e
CM
4303 struct writeback_control *wbc)
4304{
4305 int ret = 0;
4306 struct extent_page_data epd = {
4307 .bio = NULL,
771ed689 4308 .extent_locked = 0,
ffbd517d 4309 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
4310 };
4311
935db853 4312 ret = extent_write_cache_pages(mapping, wbc, &epd);
a2a72fbd
QW
4313 ASSERT(ret <= 0);
4314 if (ret < 0) {
4315 end_write_bio(&epd, ret);
4316 return ret;
4317 }
4318 ret = flush_write_bio(&epd);
d1310b2e
CM
4319 return ret;
4320}
d1310b2e 4321
2a3ff0ad
NB
4322int extent_readpages(struct address_space *mapping, struct list_head *pages,
4323 unsigned nr_pages)
d1310b2e
CM
4324{
4325 struct bio *bio = NULL;
c8b97818 4326 unsigned long bio_flags = 0;
67c9684f 4327 struct page *pagepool[16];
125bac01 4328 struct extent_map *em_cached = NULL;
67c9684f 4329 int nr = 0;
808f80b4 4330 u64 prev_em_start = (u64)-1;
d1310b2e 4331
61ed3a14 4332 while (!list_empty(pages)) {
e65ef21e
NB
4333 u64 contig_end = 0;
4334
61ed3a14 4335 for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
f86196ea 4336 struct page *page = lru_to_page(pages);
d1310b2e 4337
61ed3a14
NB
4338 prefetchw(&page->flags);
4339 list_del(&page->lru);
4340 if (add_to_page_cache_lru(page, mapping, page->index,
4341 readahead_gfp_mask(mapping))) {
4342 put_page(page);
e65ef21e 4343 break;
61ed3a14
NB
4344 }
4345
4346 pagepool[nr++] = page;
e65ef21e 4347 contig_end = page_offset(page) + PAGE_SIZE - 1;
d1310b2e 4348 }
67c9684f 4349
e65ef21e
NB
4350 if (nr) {
4351 u64 contig_start = page_offset(pagepool[0]);
4352
4353 ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
4354
b6660e80 4355 contiguous_readpages(pagepool, nr, contig_start,
e65ef21e
NB
4356 contig_end, &em_cached, &bio, &bio_flags,
4357 &prev_em_start);
4358 }
d1310b2e 4359 }
67c9684f 4360
125bac01
MX
4361 if (em_cached)
4362 free_extent_map(em_cached);
4363
d1310b2e 4364 if (bio)
1f7ad75b 4365 return submit_one_bio(bio, 0, bio_flags);
d1310b2e
CM
4366 return 0;
4367}
d1310b2e
CM
4368
4369/*
4370 * basic invalidatepage code, this waits on any locked or writeback
4371 * ranges corresponding to the page, and then deletes any extent state
4372 * records from the tree
4373 */
4374int extent_invalidatepage(struct extent_io_tree *tree,
4375 struct page *page, unsigned long offset)
4376{
2ac55d41 4377 struct extent_state *cached_state = NULL;
4eee4fa4 4378 u64 start = page_offset(page);
09cbfeaf 4379 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
4380 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4381
fda2832f 4382 start += ALIGN(offset, blocksize);
d1310b2e
CM
4383 if (start > end)
4384 return 0;
4385
ff13db41 4386 lock_extent_bits(tree, start, end, &cached_state);
1edbb734 4387 wait_on_page_writeback(page);
e182163d
OS
4388 clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
4389 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
d1310b2e
CM
4390 return 0;
4391}
d1310b2e 4392
7b13b7b1
CM
4393/*
4394 * a helper for releasepage, this tests for areas of the page that
4395 * are locked or under IO and drops the related state bits if it is safe
4396 * to drop the page.
4397 */
29c68b2d 4398static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 4399 struct page *page, gfp_t mask)
7b13b7b1 4400{
4eee4fa4 4401 u64 start = page_offset(page);
09cbfeaf 4402 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
4403 int ret = 1;
4404
8882679e 4405 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 4406 ret = 0;
8882679e 4407 } else {
11ef160f
CM
4408 /*
4409 * at this point we can safely clear everything except the
4410 * locked bit and the nodatasum bit
4411 */
66b0c887 4412 ret = __clear_extent_bit(tree, start, end,
11ef160f 4413 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
66b0c887 4414 0, 0, NULL, mask, NULL);
e3f24cc5
CM
4415
4416 /* if clear_extent_bit failed for enomem reasons,
4417 * we can't allow the release to continue.
4418 */
4419 if (ret < 0)
4420 ret = 0;
4421 else
4422 ret = 1;
7b13b7b1
CM
4423 }
4424 return ret;
4425}
7b13b7b1 4426
d1310b2e
CM
4427/*
4428 * a helper for releasepage. As long as there are no locked extents
4429 * in the range corresponding to the page, both state records and extent
4430 * map records are removed
4431 */
477a30ba 4432int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
4433{
4434 struct extent_map *em;
4eee4fa4 4435 u64 start = page_offset(page);
09cbfeaf 4436 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
4437 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4438 struct extent_io_tree *tree = &btrfs_inode->io_tree;
4439 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 4440
d0164adc 4441 if (gfpflags_allow_blocking(mask) &&
ee22184b 4442 page->mapping->host->i_size > SZ_16M) {
39b5637f 4443 u64 len;
70dec807 4444 while (start <= end) {
39b5637f 4445 len = end - start + 1;
890871be 4446 write_lock(&map->lock);
39b5637f 4447 em = lookup_extent_mapping(map, start, len);
285190d9 4448 if (!em) {
890871be 4449 write_unlock(&map->lock);
70dec807
CM
4450 break;
4451 }
7f3c74fb
CM
4452 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4453 em->start != start) {
890871be 4454 write_unlock(&map->lock);
70dec807
CM
4455 free_extent_map(em);
4456 break;
4457 }
4458 if (!test_range_bit(tree, em->start,
4459 extent_map_end(em) - 1,
4e586ca3 4460 EXTENT_LOCKED, 0, NULL)) {
bd3599a0
FM
4461 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4462 &btrfs_inode->runtime_flags);
70dec807
CM
4463 remove_extent_mapping(map, em);
4464 /* once for the rb tree */
4465 free_extent_map(em);
4466 }
4467 start = extent_map_end(em);
890871be 4468 write_unlock(&map->lock);
70dec807
CM
4469
4470 /* once for us */
d1310b2e
CM
4471 free_extent_map(em);
4472 }
d1310b2e 4473 }
29c68b2d 4474 return try_release_extent_state(tree, page, mask);
d1310b2e 4475}
d1310b2e 4476
ec29ed5b
CM
4477/*
4478 * helper function for fiemap, which doesn't want to see any holes.
4479 * This maps until we find something past 'last'
4480 */
4481static struct extent_map *get_extent_skip_holes(struct inode *inode,
e3350e16 4482 u64 offset, u64 last)
ec29ed5b 4483{
da17066c 4484 u64 sectorsize = btrfs_inode_sectorsize(inode);
ec29ed5b
CM
4485 struct extent_map *em;
4486 u64 len;
4487
4488 if (offset >= last)
4489 return NULL;
4490
67871254 4491 while (1) {
ec29ed5b
CM
4492 len = last - offset;
4493 if (len == 0)
4494 break;
fda2832f 4495 len = ALIGN(len, sectorsize);
4ab47a8d 4496 em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
c704005d 4497 if (IS_ERR_OR_NULL(em))
ec29ed5b
CM
4498 return em;
4499
4500 /* if this isn't a hole return it */
4a2d25cd 4501 if (em->block_start != EXTENT_MAP_HOLE)
ec29ed5b 4502 return em;
ec29ed5b
CM
4503
4504 /* this is a hole, advance to the next extent */
4505 offset = extent_map_end(em);
4506 free_extent_map(em);
4507 if (offset >= last)
4508 break;
4509 }
4510 return NULL;
4511}
4512
4751832d
QW
4513/*
4514 * To cache previous fiemap extent
4515 *
4516 * Will be used for merging fiemap extent
4517 */
4518struct fiemap_cache {
4519 u64 offset;
4520 u64 phys;
4521 u64 len;
4522 u32 flags;
4523 bool cached;
4524};
4525
4526/*
4527 * Helper to submit fiemap extent.
4528 *
4529 * Will try to merge current fiemap extent specified by @offset, @phys,
4530 * @len and @flags with cached one.
4531 * Only when we fail to merge is the cached one submitted as a
4532 * fiemap extent.
4533 *
4534 * Return value is the same as fiemap_fill_next_extent().
4535 */
4536static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4537 struct fiemap_cache *cache,
4538 u64 offset, u64 phys, u64 len, u32 flags)
4539{
4540 int ret = 0;
4541
4542 if (!cache->cached)
4543 goto assign;
4544
4545 /*
4546 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 4547 * fiemap extent won't overlap with cached one.
4751832d
QW
4548 * Not recoverable.
4549 *
4550 * NOTE: Physical address can overlap, due to compression
4551 */
4552 if (cache->offset + cache->len > offset) {
4553 WARN_ON(1);
4554 return -EINVAL;
4555 }
4556
4557 /*
4558 * Only merges fiemap extents if
4559 * 1) Their logical addresses are continuous
4560 *
4561 * 2) Their physical addresses are continuous
4562 * So truly compressed (physical size smaller than logical size)
4563 * extents won't get merged with each other
4564 *
4565 * 3) They share the same flags except FIEMAP_EXTENT_LAST
4566 * So a regular extent won't get merged with a prealloc extent
4567 */
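 /*
 * Illustration (hypothetical values): a cached extent
 * {offset=0, phys=1M, len=64K, flags=0} merges with a new one
 * {offset=64K, phys=1M+64K, len=64K, flags=0}, but not with one that
 * leaves a logical gap, jumps to a discontiguous physical address,
 * or differs in flags other than FIEMAP_EXTENT_LAST.
 */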
4568 if (cache->offset + cache->len == offset &&
4569 cache->phys + cache->len == phys &&
4570 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4571 (flags & ~FIEMAP_EXTENT_LAST)) {
4572 cache->len += len;
4573 cache->flags |= flags;
4574 goto try_submit_last;
4575 }
4576
4577 /* Not mergeable, need to submit cached one */
4578 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4579 cache->len, cache->flags);
4580 cache->cached = false;
4581 if (ret)
4582 return ret;
4583assign:
4584 cache->cached = true;
4585 cache->offset = offset;
4586 cache->phys = phys;
4587 cache->len = len;
4588 cache->flags = flags;
4589try_submit_last:
4590 if (cache->flags & FIEMAP_EXTENT_LAST) {
4591 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4592 cache->phys, cache->len, cache->flags);
4593 cache->cached = false;
4594 }
4595 return ret;
4596}
4597
4598/*
848c23b7 4599 * Emit last fiemap cache
4751832d 4600 *
848c23b7
QW
4601 * The last fiemap cache may still be cached in the following case:
4602 * 0 4k 8k
4603 * |<- Fiemap range ->|
4604 * |<------------ First extent ----------->|
4605 *
4606 * In this case, the first extent range will be cached but not emitted.
4607 * So we must emit it before ending extent_fiemap().
4751832d 4608 */
5c5aff98 4609static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 4610 struct fiemap_cache *cache)
4751832d
QW
4611{
4612 int ret;
4613
4614 if (!cache->cached)
4615 return 0;
4616
4751832d
QW
4617 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4618 cache->len, cache->flags);
4619 cache->cached = false;
4620 if (ret > 0)
4621 ret = 0;
4622 return ret;
4623}
4624
1506fcc8 4625int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2135fb9b 4626 __u64 start, __u64 len)
1506fcc8 4627{
975f84fe 4628 int ret = 0;
1506fcc8
YS
4629 u64 off = start;
4630 u64 max = start + len;
4631 u32 flags = 0;
975f84fe
JB
4632 u32 found_type;
4633 u64 last;
ec29ed5b 4634 u64 last_for_get_extent = 0;
1506fcc8 4635 u64 disko = 0;
ec29ed5b 4636 u64 isize = i_size_read(inode);
975f84fe 4637 struct btrfs_key found_key;
1506fcc8 4638 struct extent_map *em = NULL;
2ac55d41 4639 struct extent_state *cached_state = NULL;
975f84fe 4640 struct btrfs_path *path;
dc046b10 4641 struct btrfs_root *root = BTRFS_I(inode)->root;
4751832d 4642 struct fiemap_cache cache = { 0 };
5911c8fe
DS
4643 struct ulist *roots;
4644 struct ulist *tmp_ulist;
1506fcc8 4645 int end = 0;
ec29ed5b
CM
4646 u64 em_start = 0;
4647 u64 em_len = 0;
4648 u64 em_end = 0;
1506fcc8
YS
4649
4650 if (len == 0)
4651 return -EINVAL;
4652
975f84fe
JB
4653 path = btrfs_alloc_path();
4654 if (!path)
4655 return -ENOMEM;
4656 path->leave_spinning = 1;
4657
5911c8fe
DS
4658 roots = ulist_alloc(GFP_KERNEL);
4659 tmp_ulist = ulist_alloc(GFP_KERNEL);
4660 if (!roots || !tmp_ulist) {
4661 ret = -ENOMEM;
4662 goto out_free_ulist;
4663 }
4664
da17066c
JM
4665 start = round_down(start, btrfs_inode_sectorsize(inode));
4666 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4d479cf0 4667
ec29ed5b
CM
4668 /*
4669 * lookup the last file extent. We're not using i_size here
4670 * because there might be preallocation past i_size
4671 */
f85b7379
DS
4672 ret = btrfs_lookup_file_extent(NULL, root, path,
4673 btrfs_ino(BTRFS_I(inode)), -1, 0);
975f84fe 4674 if (ret < 0) {
5911c8fe 4675 goto out_free_ulist;
2d324f59
LB
4676 } else {
4677 WARN_ON(!ret);
4678 if (ret == 1)
4679 ret = 0;
975f84fe 4680 }
2d324f59 4681
975f84fe 4682 path->slots[0]--;
975f84fe 4683 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
962a298f 4684 found_type = found_key.type;
975f84fe 4685
ec29ed5b 4686 /* No extents, but there might be delalloc bits */
4a0cc7ca 4687 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
975f84fe 4688 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
4689 /* have to trust i_size as the end */
4690 last = (u64)-1;
4691 last_for_get_extent = isize;
4692 } else {
4693 /*
4694 * remember the start of the last extent. There are a
4695 * bunch of different factors that go into the length of the
4696 * extent, so it's much less complex to remember where it
4697 */
4698 last = found_key.offset;
4699 last_for_get_extent = last + 1;
975f84fe 4700 }
fe09e16c 4701 btrfs_release_path(path);
975f84fe 4702
ec29ed5b
CM
4703 /*
4704 * we might have some extents allocated but more delalloc past those
4705 * extents. so, we trust isize unless the start of the last extent is
4706 * beyond isize
4707 */
4708 if (last < isize) {
4709 last = (u64)-1;
4710 last_for_get_extent = isize;
4711 }
4712
ff13db41 4713 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
d0082371 4714 &cached_state);
ec29ed5b 4715
e3350e16 4716 em = get_extent_skip_holes(inode, start, last_for_get_extent);
1506fcc8
YS
4717 if (!em)
4718 goto out;
4719 if (IS_ERR(em)) {
4720 ret = PTR_ERR(em);
4721 goto out;
4722 }
975f84fe 4723
1506fcc8 4724 while (!end) {
b76bb701 4725 u64 offset_in_extent = 0;
ea8efc74
CM
4726
4727 /* break if the extent we found is outside the range */
4728 if (em->start >= max || extent_map_end(em) < off)
4729 break;
4730
4731 /*
4732 * get_extent may return an extent that starts before our
4733 * requested range. We have to make sure the ranges
4734 * we return to fiemap always move forward and don't
4735 * overlap, so adjust the offsets here
4736 */
4737 em_start = max(em->start, off);
1506fcc8 4738
ea8efc74
CM
4739 /*
4740 * record the offset from the start of the extent
b76bb701
JB
4741 * for adjusting the disk offset below. Only do this if the
4742 * extent isn't compressed since our in ram offset may be past
4743 * what we have actually allocated on disk.
ea8efc74 4744 */
b76bb701
JB
4745 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4746 offset_in_extent = em_start - em->start;
ec29ed5b 4747 em_end = extent_map_end(em);
ea8efc74 4748 em_len = em_end - em_start;
1506fcc8 4749 flags = 0;
f0986318
FM
4750 if (em->block_start < EXTENT_MAP_LAST_BYTE)
4751 disko = em->block_start + offset_in_extent;
4752 else
4753 disko = 0;
1506fcc8 4754
ea8efc74
CM
4755 /*
4756 * bump off for our next call to get_extent
4757 */
4758 off = extent_map_end(em);
4759 if (off >= max)
4760 end = 1;
4761
93dbfad7 4762 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
4763 end = 1;
4764 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 4765 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
4766 flags |= (FIEMAP_EXTENT_DATA_INLINE |
4767 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 4768 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
4769 flags |= (FIEMAP_EXTENT_DELALLOC |
4770 FIEMAP_EXTENT_UNKNOWN);
dc046b10
JB
4771 } else if (fieinfo->fi_extents_max) {
4772 u64 bytenr = em->block_start -
4773 (em->start - em->orig_start);
fe09e16c 4774
fe09e16c
LB
4775 /*
4776 * As btrfs supports shared space, this information
4777 * can be exported to userspace tools via
dc046b10
JB
4778 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
4779 * then we're just getting a count and we can skip the
4780 * lookup stuff.
fe09e16c 4781 */
bb739cf0
EN
4782 ret = btrfs_check_shared(root,
4783 btrfs_ino(BTRFS_I(inode)),
5911c8fe 4784 bytenr, roots, tmp_ulist);
dc046b10 4785 if (ret < 0)
fe09e16c 4786 goto out_free;
dc046b10 4787 if (ret)
fe09e16c 4788 flags |= FIEMAP_EXTENT_SHARED;
dc046b10 4789 ret = 0;
1506fcc8
YS
4790 }
4791 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4792 flags |= FIEMAP_EXTENT_ENCODED;
0d2b2372
JB
4793 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4794 flags |= FIEMAP_EXTENT_UNWRITTEN;
1506fcc8 4795
1506fcc8
YS
4796 free_extent_map(em);
4797 em = NULL;
ec29ed5b
CM
4798 if ((em_start >= last) || em_len == (u64)-1 ||
4799 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
4800 flags |= FIEMAP_EXTENT_LAST;
4801 end = 1;
4802 }
4803
ec29ed5b 4804 /* now scan forward to see if this is really the last extent. */
e3350e16 4805 em = get_extent_skip_holes(inode, off, last_for_get_extent);
ec29ed5b
CM
4806 if (IS_ERR(em)) {
4807 ret = PTR_ERR(em);
4808 goto out;
4809 }
4810 if (!em) {
975f84fe
JB
4811 flags |= FIEMAP_EXTENT_LAST;
4812 end = 1;
4813 }
4751832d
QW
4814 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
4815 em_len, flags);
26e726af
CS
4816 if (ret) {
4817 if (ret == 1)
4818 ret = 0;
ec29ed5b 4819 goto out_free;
26e726af 4820 }
1506fcc8
YS
4821 }
4822out_free:
4751832d 4823 if (!ret)
5c5aff98 4824 ret = emit_last_fiemap_cache(fieinfo, &cache);
1506fcc8
YS
4825 free_extent_map(em);
4826out:
a52f4cd2 4827 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
e43bbe5e 4828 &cached_state);
5911c8fe
DS
4829
4830out_free_ulist:
e02d48ea 4831 btrfs_free_path(path);
5911c8fe
DS
4832 ulist_free(roots);
4833 ulist_free(tmp_ulist);
1506fcc8
YS
4834 return ret;
4835}
4836
727011e0
CM
4837static void __free_extent_buffer(struct extent_buffer *eb)
4838{
6d49ba1b 4839 btrfs_leak_debug_del(&eb->leak_list);
727011e0
CM
4840 kmem_cache_free(extent_buffer_cache, eb);
4841}
4842
a26e8c9f 4843int extent_buffer_under_io(struct extent_buffer *eb)
db7f3436
JB
4844{
4845 return (atomic_read(&eb->io_pages) ||
4846 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4847 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4848}
4849
4850/*
55ac0139 4851 * Release all pages attached to the extent buffer.
db7f3436 4852 */
55ac0139 4853static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
db7f3436 4854{
d64766fd
NB
4855 int i;
4856 int num_pages;
b0132a3b 4857 int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
db7f3436
JB
4858
4859 BUG_ON(extent_buffer_under_io(eb));
4860
d64766fd
NB
4861 num_pages = num_extent_pages(eb);
4862 for (i = 0; i < num_pages; i++) {
4863 struct page *page = eb->pages[i];
db7f3436 4864
5d2361db
FL
4865 if (!page)
4866 continue;
4867 if (mapped)
db7f3436 4868 spin_lock(&page->mapping->private_lock);
5d2361db
FL
4869 /*
4870 * We do this since we'll remove the pages after we've
4871 * removed the eb from the radix tree, so we could race
4872 * and have this page now attached to the new eb. So
4873 * only clear page_private if it's still connected to
4874 * this eb.
4875 */
4876 if (PagePrivate(page) &&
4877 page->private == (unsigned long)eb) {
4878 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4879 BUG_ON(PageDirty(page));
4880 BUG_ON(PageWriteback(page));
db7f3436 4881 /*
5d2361db
FL
4882 * We need to make sure we haven't been attached
4883 * to a new eb.
db7f3436 4884 */
5d2361db
FL
4885 ClearPagePrivate(page);
4886 set_page_private(page, 0);
4887 /* One for the page private */
09cbfeaf 4888 put_page(page);
db7f3436 4889 }
5d2361db
FL
4890
4891 if (mapped)
4892 spin_unlock(&page->mapping->private_lock);
4893
01327610 4894 /* One for when we allocated the page */
09cbfeaf 4895 put_page(page);
d64766fd 4896 }
db7f3436
JB
4897}
4898
4899/*
4900 * Helper for releasing the extent buffer.
4901 */
4902static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4903{
55ac0139 4904 btrfs_release_extent_buffer_pages(eb);
db7f3436
JB
4905 __free_extent_buffer(eb);
4906}
4907
f28491e0
JB
4908static struct extent_buffer *
4909__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 4910 unsigned long len)
d1310b2e
CM
4911{
4912 struct extent_buffer *eb = NULL;
4913
d1b5c567 4914 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
4915 eb->start = start;
4916 eb->len = len;
f28491e0 4917 eb->fs_info = fs_info;
815a51c7 4918 eb->bflags = 0;
bd681513 4919 rwlock_init(&eb->lock);
bd681513 4920 atomic_set(&eb->blocking_readers, 0);
06297d8c 4921 eb->blocking_writers = 0;
ed1b4ed7 4922 eb->lock_nested = false;
bd681513
CM
4923 init_waitqueue_head(&eb->write_lock_wq);
4924 init_waitqueue_head(&eb->read_lock_wq);
b4ce94de 4925
6d49ba1b
ES
4926 btrfs_leak_debug_add(&eb->leak_list, &buffers);
4927
3083ee2e 4928 spin_lock_init(&eb->refs_lock);
d1310b2e 4929 atomic_set(&eb->refs, 1);
0b32f4bb 4930 atomic_set(&eb->io_pages, 0);
727011e0 4931
b8dae313
DS
4932 /*
4933 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4934 */
4935 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4936 > MAX_INLINE_EXTENT_BUFFER_SIZE);
4937 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
d1310b2e 4938
843ccf9f 4939#ifdef CONFIG_BTRFS_DEBUG
f3dc24c5 4940 eb->spinning_writers = 0;
afd495a8 4941 atomic_set(&eb->spinning_readers, 0);
5c9c799a 4942 atomic_set(&eb->read_locks, 0);
00801ae4 4943 eb->write_locks = 0;
843ccf9f
DS
4944#endif
4945
d1310b2e
CM
4946 return eb;
4947}
4948
815a51c7
JS
4949struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4950{
cc5e31a4 4951 int i;
815a51c7
JS
4952 struct page *p;
4953 struct extent_buffer *new;
cc5e31a4 4954 int num_pages = num_extent_pages(src);
815a51c7 4955
3f556f78 4956 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
4957 if (new == NULL)
4958 return NULL;
4959
4960 for (i = 0; i < num_pages; i++) {
9ec72677 4961 p = alloc_page(GFP_NOFS);
db7f3436
JB
4962 if (!p) {
4963 btrfs_release_extent_buffer(new);
4964 return NULL;
4965 }
815a51c7
JS
4966 attach_extent_buffer_page(new, p);
4967 WARN_ON(PageDirty(p));
4968 SetPageUptodate(p);
4969 new->pages[i] = p;
fba1acf9 4970 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7
JS
4971 }
4972
815a51c7 4973 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
b0132a3b 4974 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
815a51c7
JS
4975
4976 return new;
4977}
4978
0f331229
OS
4979struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4980 u64 start, unsigned long len)
815a51c7
JS
4981{
4982 struct extent_buffer *eb;
cc5e31a4
DS
4983 int num_pages;
4984 int i;
815a51c7 4985
3f556f78 4986 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
4987 if (!eb)
4988 return NULL;
4989
65ad0104 4990 num_pages = num_extent_pages(eb);
815a51c7 4991 for (i = 0; i < num_pages; i++) {
9ec72677 4992 eb->pages[i] = alloc_page(GFP_NOFS);
815a51c7
JS
4993 if (!eb->pages[i])
4994 goto err;
4995 }
4996 set_extent_buffer_uptodate(eb);
4997 btrfs_set_header_nritems(eb, 0);
b0132a3b 4998 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
4999
5000 return eb;
5001err:
84167d19
SB
5002 for (; i > 0; i--)
5003 __free_page(eb->pages[i - 1]);
815a51c7
JS
5004 __free_extent_buffer(eb);
5005 return NULL;
5006}
5007
0f331229 5008struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5009 u64 start)
0f331229 5010{
da17066c 5011 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
5012}
5013
0b32f4bb
JB
5014static void check_buffer_tree_ref(struct extent_buffer *eb)
5015{
242e18c7 5016 int refs;
0b32f4bb
JB
5017 /* the ref bit is tricky. We have to make sure it is set
5018 * if we have the buffer dirty. Otherwise the
5019 * code to free a buffer can end up dropping a dirty
5020 * page
5021 *
5022 * Once the ref bit is set, it won't go away while the
5023 * buffer is dirty or in writeback, and it also won't
5024 * go away while we have the reference count on the
5025 * eb bumped.
5026 *
5027 * We can't just set the ref bit without bumping the
5028 * ref on the eb because free_extent_buffer might
5029 * see the ref bit and try to clear it. If this happens
5030 * free_extent_buffer might end up dropping our original
5031 * ref by mistake and freeing the page before we are able
5032 * to add one more ref.
5033 *
5034 * So bump the ref count first, then set the bit. If someone
5035 * beat us to it, drop the ref we added.
5036 */
242e18c7
CM
5037 refs = atomic_read(&eb->refs);
5038 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5039 return;
5040
594831c4
JB
5041 spin_lock(&eb->refs_lock);
5042 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 5043 atomic_inc(&eb->refs);
594831c4 5044 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
5045}
5046
2457aec6
MG
5047static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5048 struct page *accessed)
5df4235e 5049{
cc5e31a4 5050 int num_pages, i;
5df4235e 5051
0b32f4bb
JB
5052 check_buffer_tree_ref(eb);
5053
65ad0104 5054 num_pages = num_extent_pages(eb);
5df4235e 5055 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
5056 struct page *p = eb->pages[i];
5057
2457aec6
MG
5058 if (p != accessed)
5059 mark_page_accessed(p);
5df4235e
JB
5060 }
5061}
5062
f28491e0
JB
5063struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5064 u64 start)
452c75c3
CS
5065{
5066 struct extent_buffer *eb;
5067
5068 rcu_read_lock();
f28491e0 5069 eb = radix_tree_lookup(&fs_info->buffer_radix,
09cbfeaf 5070 start >> PAGE_SHIFT);
452c75c3
CS
5071 if (eb && atomic_inc_not_zero(&eb->refs)) {
5072 rcu_read_unlock();
062c19e9
FM
5073 /*
5074 * Lock our eb's refs_lock to avoid races with
5075 * free_extent_buffer. When we get our eb it might be flagged
5076 * with EXTENT_BUFFER_STALE and another task running
5077 * free_extent_buffer might have seen that flag set,
5078 * eb->refs == 2, that the buffer isn't under IO (dirty and
5079 * writeback flags not set) and it's still in the tree (flag
5080 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
5081 * of decrementing the extent buffer's reference count twice.
5082 * So here we could race and increment the eb's reference count,
5083 * clear its stale flag, mark it as dirty and drop our reference
5084 * before the other task finishes executing free_extent_buffer,
5085 * which would later result in an attempt to free an extent
5086 * buffer that is dirty.
5087 */
5088 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5089 spin_lock(&eb->refs_lock);
5090 spin_unlock(&eb->refs_lock);
5091 }
2457aec6 5092 mark_extent_buffer_accessed(eb, NULL);
452c75c3
CS
5093 return eb;
5094 }
5095 rcu_read_unlock();
5096
5097 return NULL;
5098}
5099
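/*
 * Editor's illustrative sketch (not part of extent_io.c): typical use of
 * find_extent_buffer().  On success the eb is returned with its refcount
 * elevated, so the lookup must be paired with free_extent_buffer().
 */
static bool example_eb_is_cached(struct btrfs_fs_info *fs_info, u64 start)
{
	struct extent_buffer *eb;

	eb = find_extent_buffer(fs_info, start);
	if (!eb)
		return false;

	/* ... read-only peeking at the cached buffer would go here ... */

	free_extent_buffer(eb);	/* drop the reference taken by the lookup */
	return true;
}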
faa2dbf0
JB
5100#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5101struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5102 u64 start)
faa2dbf0
JB
5103{
5104 struct extent_buffer *eb, *exists = NULL;
5105 int ret;
5106
5107 eb = find_extent_buffer(fs_info, start);
5108 if (eb)
5109 return eb;
da17066c 5110 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 5111 if (!eb)
b6293c82 5112 return ERR_PTR(-ENOMEM);
faa2dbf0
JB
5113 eb->fs_info = fs_info;
5114again:
e1860a77 5115 ret = radix_tree_preload(GFP_NOFS);
b6293c82
DC
5116 if (ret) {
5117 exists = ERR_PTR(ret);
faa2dbf0 5118 goto free_eb;
b6293c82 5119 }
faa2dbf0
JB
5120 spin_lock(&fs_info->buffer_lock);
5121 ret = radix_tree_insert(&fs_info->buffer_radix,
09cbfeaf 5122 start >> PAGE_SHIFT, eb);
faa2dbf0
JB
5123 spin_unlock(&fs_info->buffer_lock);
5124 radix_tree_preload_end();
5125 if (ret == -EEXIST) {
5126 exists = find_extent_buffer(fs_info, start);
5127 if (exists)
5128 goto free_eb;
5129 else
5130 goto again;
5131 }
5132 check_buffer_tree_ref(eb);
5133 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5134
faa2dbf0
JB
5135 return eb;
5136free_eb:
5137 btrfs_release_extent_buffer(eb);
5138 return exists;
5139}
5140#endif
5141
f28491e0 5142struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
ce3e6984 5143 u64 start)
d1310b2e 5144{
da17066c 5145 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
5146 int num_pages;
5147 int i;
09cbfeaf 5148 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 5149 struct extent_buffer *eb;
6af118ce 5150 struct extent_buffer *exists = NULL;
d1310b2e 5151 struct page *p;
f28491e0 5152 struct address_space *mapping = fs_info->btree_inode->i_mapping;
d1310b2e 5153 int uptodate = 1;
19fe0a8b 5154 int ret;
d1310b2e 5155
da17066c 5156 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
c871b0f2
LB
5157 btrfs_err(fs_info, "bad tree block start %llu", start);
5158 return ERR_PTR(-EINVAL);
5159 }
5160
f28491e0 5161 eb = find_extent_buffer(fs_info, start);
452c75c3 5162 if (eb)
6af118ce 5163 return eb;
6af118ce 5164
23d79d81 5165 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 5166 if (!eb)
c871b0f2 5167 return ERR_PTR(-ENOMEM);
d1310b2e 5168
65ad0104 5169 num_pages = num_extent_pages(eb);
727011e0 5170 for (i = 0; i < num_pages; i++, index++) {
d1b5c567 5171 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
5172 if (!p) {
5173 exists = ERR_PTR(-ENOMEM);
6af118ce 5174 goto free_eb;
c871b0f2 5175 }
4f2de97a
JB
5176
5177 spin_lock(&mapping->private_lock);
5178 if (PagePrivate(p)) {
5179 /*
5180 * We could have already allocated an eb for this page
5181 * and attached one, so let's see if we can get a ref on
5182 * the existing eb. If we can, we know it's good and we
5183 * can just return that one; otherwise we know we can
5184 * safely overwrite page->private.
5185 */
5186 exists = (struct extent_buffer *)p->private;
5187 if (atomic_inc_not_zero(&exists->refs)) {
5188 spin_unlock(&mapping->private_lock);
5189 unlock_page(p);
09cbfeaf 5190 put_page(p);
2457aec6 5191 mark_extent_buffer_accessed(exists, p);
4f2de97a
JB
5192 goto free_eb;
5193 }
5ca64f45 5194 exists = NULL;
4f2de97a 5195
0b32f4bb 5196 /*
4f2de97a
JB
5197 * Do this so attach_extent_buffer_page() doesn't complain
5198 * and so we drop the ref the old owner had.
5199 */
5200 ClearPagePrivate(p);
0b32f4bb 5201 WARN_ON(PageDirty(p));
09cbfeaf 5202 put_page(p);
d1310b2e 5203 }
4f2de97a
JB
5204 attach_extent_buffer_page(eb, p);
5205 spin_unlock(&mapping->private_lock);
0b32f4bb 5206 WARN_ON(PageDirty(p));
727011e0 5207 eb->pages[i] = p;
d1310b2e
CM
5208 if (!PageUptodate(p))
5209 uptodate = 0;
eb14ab8e
CM
5210
5211 /*
b16d011e
NB
5212 * We can't unlock the pages just yet since the extent buffer
5213 * hasn't been properly inserted into the radix tree; doing so
5214 * would open a race with btree_releasepage, which can free a
5215 * page while we are still filling in all pages for the buffer,
5216 * and we could crash.
eb14ab8e 5217 */
d1310b2e
CM
5218 }
5219 if (uptodate)
b4ce94de 5220 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
115391d2 5221again:
e1860a77 5222 ret = radix_tree_preload(GFP_NOFS);
c871b0f2
LB
5223 if (ret) {
5224 exists = ERR_PTR(ret);
19fe0a8b 5225 goto free_eb;
c871b0f2 5226 }
19fe0a8b 5227
f28491e0
JB
5228 spin_lock(&fs_info->buffer_lock);
5229 ret = radix_tree_insert(&fs_info->buffer_radix,
09cbfeaf 5230 start >> PAGE_SHIFT, eb);
f28491e0 5231 spin_unlock(&fs_info->buffer_lock);
452c75c3 5232 radix_tree_preload_end();
19fe0a8b 5233 if (ret == -EEXIST) {
f28491e0 5234 exists = find_extent_buffer(fs_info, start);
452c75c3
CS
5235 if (exists)
5236 goto free_eb;
5237 else
115391d2 5238 goto again;
6af118ce 5239 }
6af118ce 5240 /* add one reference for the tree */
0b32f4bb 5241 check_buffer_tree_ref(eb);
34b41ace 5242 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
5243
5244 /*
b16d011e
NB
5245 * Now it's safe to unlock the pages because any calls to
5246 * btree_releasepage will correctly detect that the pages belong to a
5247 * live buffer and won't free them prematurely.
eb14ab8e 5248 */
28187ae5
NB
5249 for (i = 0; i < num_pages; i++)
5250 unlock_page(eb->pages[i]);
d1310b2e
CM
5251 return eb;
5252
6af118ce 5253free_eb:
5ca64f45 5254 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
5255 for (i = 0; i < num_pages; i++) {
5256 if (eb->pages[i])
5257 unlock_page(eb->pages[i]);
5258 }
eb14ab8e 5259
897ca6e9 5260 btrfs_release_extent_buffer(eb);
6af118ce 5261 return exists;
d1310b2e 5262}
d1310b2e 5263
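/*
 * Editor's illustrative sketch (not part of extent_io.c): alloc_extent_buffer()
 * returns ERR_PTR(-EINVAL) for a misaligned start and ERR_PTR(-ENOMEM) on
 * allocation failure, so callers must use IS_ERR()/PTR_ERR() rather than a
 * NULL check, and drop their reference with free_extent_buffer() when done.
 */
static int example_grab_tree_block(struct btrfs_fs_info *fs_info, u64 start)
{
	struct extent_buffer *eb;

	eb = alloc_extent_buffer(fs_info, start);
	if (IS_ERR(eb))
		return PTR_ERR(eb);

	/* ... the buffer is cached here but not necessarily uptodate yet ... */

	free_extent_buffer(eb);
	return 0;
}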
3083ee2e
JB
5264static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5265{
5266 struct extent_buffer *eb =
5267 container_of(head, struct extent_buffer, rcu_head);
5268
5269 __free_extent_buffer(eb);
5270}
5271
f7a52a40 5272static int release_extent_buffer(struct extent_buffer *eb)
3083ee2e 5273{
07e21c4d
NB
5274 lockdep_assert_held(&eb->refs_lock);
5275
3083ee2e
JB
5276 WARN_ON(atomic_read(&eb->refs) == 0);
5277 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 5278 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 5279 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 5280
815a51c7 5281 spin_unlock(&eb->refs_lock);
3083ee2e 5282
f28491e0
JB
5283 spin_lock(&fs_info->buffer_lock);
5284 radix_tree_delete(&fs_info->buffer_radix,
09cbfeaf 5285 eb->start >> PAGE_SHIFT);
f28491e0 5286 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
5287 } else {
5288 spin_unlock(&eb->refs_lock);
815a51c7 5289 }
3083ee2e
JB
5290
5291 /* Should be safe to release our pages at this point */
55ac0139 5292 btrfs_release_extent_buffer_pages(eb);
bcb7e449 5293#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 5294 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
5295 __free_extent_buffer(eb);
5296 return 1;
5297 }
5298#endif
3083ee2e 5299 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 5300 return 1;
3083ee2e
JB
5301 }
5302 spin_unlock(&eb->refs_lock);
e64860aa
JB
5303
5304 return 0;
3083ee2e
JB
5305}
5306
d1310b2e
CM
5307void free_extent_buffer(struct extent_buffer *eb)
5308{
242e18c7
CM
5309 int refs;
5310 int old;
d1310b2e
CM
5311 if (!eb)
5312 return;
5313
242e18c7
CM
5314 while (1) {
5315 refs = atomic_read(&eb->refs);
46cc775e
NB
5316 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
5317 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
5318 refs == 1))
242e18c7
CM
5319 break;
5320 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5321 if (old == refs)
5322 return;
5323 }
5324
3083ee2e
JB
5325 spin_lock(&eb->refs_lock);
5326 if (atomic_read(&eb->refs) == 2 &&
5327 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 5328 !extent_buffer_under_io(eb) &&
3083ee2e
JB
5329 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5330 atomic_dec(&eb->refs);
5331
5332 /*
5333 * I know this is terrible, but it's temporary until we stop tracking
5334 * the uptodate bits and such for the extent buffers.
5335 */
f7a52a40 5336 release_extent_buffer(eb);
3083ee2e
JB
5337}
5338
5339void free_extent_buffer_stale(struct extent_buffer *eb)
5340{
5341 if (!eb)
d1310b2e
CM
5342 return;
5343
3083ee2e
JB
5344 spin_lock(&eb->refs_lock);
5345 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5346
0b32f4bb 5347 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
5348 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5349 atomic_dec(&eb->refs);
f7a52a40 5350 release_extent_buffer(eb);
d1310b2e 5351}
d1310b2e 5352
1d4284bd 5353void clear_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 5354{
cc5e31a4
DS
5355 int i;
5356 int num_pages;
d1310b2e
CM
5357 struct page *page;
5358
65ad0104 5359 num_pages = num_extent_pages(eb);
d1310b2e
CM
5360
5361 for (i = 0; i < num_pages; i++) {
fb85fc9a 5362 page = eb->pages[i];
b9473439 5363 if (!PageDirty(page))
d2c3f4f6
CM
5364 continue;
5365
a61e6f29 5366 lock_page(page);
eb14ab8e
CM
5367 WARN_ON(!PagePrivate(page));
5368
d1310b2e 5369 clear_page_dirty_for_io(page);
b93b0163 5370 xa_lock_irq(&page->mapping->i_pages);
0a943c65
MW
5371 if (!PageDirty(page))
5372 __xa_clear_mark(&page->mapping->i_pages,
5373 page_index(page), PAGECACHE_TAG_DIRTY);
b93b0163 5374 xa_unlock_irq(&page->mapping->i_pages);
bf0da8c1 5375 ClearPageError(page);
a61e6f29 5376 unlock_page(page);
d1310b2e 5377 }
0b32f4bb 5378 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 5379}
d1310b2e 5380
abb57ef3 5381bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 5382{
cc5e31a4
DS
5383 int i;
5384 int num_pages;
abb57ef3 5385 bool was_dirty;
d1310b2e 5386
0b32f4bb
JB
5387 check_buffer_tree_ref(eb);
5388
b9473439 5389 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 5390
65ad0104 5391 num_pages = num_extent_pages(eb);
3083ee2e 5392 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
5393 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5394
abb57ef3
LB
5395 if (!was_dirty)
5396 for (i = 0; i < num_pages; i++)
5397 set_page_dirty(eb->pages[i]);
51995c39
LB
5398
5399#ifdef CONFIG_BTRFS_DEBUG
5400 for (i = 0; i < num_pages; i++)
5401 ASSERT(PageDirty(eb->pages[i]));
5402#endif
5403
b9473439 5404 return was_dirty;
d1310b2e 5405}
d1310b2e 5406
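/*
 * Editor's illustrative sketch (not part of extent_io.c): the usual pattern
 * for modifying a tree block is to update its contents with the
 * write_extent_buffer() helpers and then mark the whole buffer dirty.  The
 * offset below is hypothetical; real callers compute it from the item layout,
 * hold the proper tree locks, and operate on an uptodate eb.
 */
static void example_update_block(struct extent_buffer *eb,
				 const void *data, unsigned long offset,
				 unsigned long len)
{
	bool was_dirty;

	write_extent_buffer(eb, data, offset, len);

	/*
	 * set_extent_buffer_dirty() returns the previous dirty state; if the
	 * eb was clean, its pages have now been marked dirty as well, so
	 * btree writeback will pick the block up.
	 */
	was_dirty = set_extent_buffer_dirty(eb);
	WARN_ON(!was_dirty && !PageDirty(eb->pages[0]));
}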
69ba3927 5407void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 5408{
cc5e31a4 5409 int i;
1259ab75 5410 struct page *page;
cc5e31a4 5411 int num_pages;
1259ab75 5412
b4ce94de 5413 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5414 num_pages = num_extent_pages(eb);
1259ab75 5415 for (i = 0; i < num_pages; i++) {
fb85fc9a 5416 page = eb->pages[i];
33958dc6
CM
5417 if (page)
5418 ClearPageUptodate(page);
1259ab75 5419 }
1259ab75
CM
5420}
5421
09c25a8c 5422void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 5423{
cc5e31a4 5424 int i;
d1310b2e 5425 struct page *page;
cc5e31a4 5426 int num_pages;
d1310b2e 5427
0b32f4bb 5428 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5429 num_pages = num_extent_pages(eb);
d1310b2e 5430 for (i = 0; i < num_pages; i++) {
fb85fc9a 5431 page = eb->pages[i];
d1310b2e
CM
5432 SetPageUptodate(page);
5433 }
d1310b2e 5434}
d1310b2e 5435
c2ccfbc6 5436int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 5437{
cc5e31a4 5438 int i;
d1310b2e
CM
5439 struct page *page;
5440 int err;
5441 int ret = 0;
ce9adaa5
CM
5442 int locked_pages = 0;
5443 int all_uptodate = 1;
cc5e31a4 5444 int num_pages;
727011e0 5445 unsigned long num_reads = 0;
a86c12c7 5446 struct bio *bio = NULL;
c8b97818 5447 unsigned long bio_flags = 0;
a86c12c7 5448
b4ce94de 5449 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
5450 return 0;
5451
65ad0104 5452 num_pages = num_extent_pages(eb);
8436ea91 5453 for (i = 0; i < num_pages; i++) {
fb85fc9a 5454 page = eb->pages[i];
bb82ab88 5455 if (wait == WAIT_NONE) {
2db04966 5456 if (!trylock_page(page))
ce9adaa5 5457 goto unlock_exit;
d1310b2e
CM
5458 } else {
5459 lock_page(page);
5460 }
ce9adaa5 5461 locked_pages++;
2571e739
LB
5462 }
5463 /*
5464 * We need to lock all pages first to make sure that the
5465 * uptodate bit of our pages won't be affected by
5466 * clear_extent_buffer_uptodate().
5467 */
8436ea91 5468 for (i = 0; i < num_pages; i++) {
2571e739 5469 page = eb->pages[i];
727011e0
CM
5470 if (!PageUptodate(page)) {
5471 num_reads++;
ce9adaa5 5472 all_uptodate = 0;
727011e0 5473 }
ce9adaa5 5474 }
2571e739 5475
ce9adaa5 5476 if (all_uptodate) {
8436ea91 5477 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
5478 goto unlock_exit;
5479 }
5480
656f30db 5481 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 5482 eb->read_mirror = 0;
0b32f4bb 5483 atomic_set(&eb->io_pages, num_reads);
8436ea91 5484 for (i = 0; i < num_pages; i++) {
fb85fc9a 5485 page = eb->pages[i];
baf863b9 5486
ce9adaa5 5487 if (!PageUptodate(page)) {
baf863b9
LB
5488 if (ret) {
5489 atomic_dec(&eb->io_pages);
5490 unlock_page(page);
5491 continue;
5492 }
5493
f188591e 5494 ClearPageError(page);
0d44fea7 5495 err = __extent_read_full_page(page,
6af49dbd 5496 btree_get_extent, &bio,
d4c7ca86 5497 mirror_num, &bio_flags,
1f7ad75b 5498 REQ_META);
baf863b9 5499 if (err) {
d1310b2e 5500 ret = err;
baf863b9
LB
5501 /*
5502 * We passed &bio to __extent_read_full_page above,
5503 * which ensures that if it returns an error the
5504 * current page was not added to the bio and has
5505 * already been unlocked.
5506 *
5507 * We must decrement io_pages ourselves.
5508 */
5509 atomic_dec(&eb->io_pages);
5510 }
d1310b2e
CM
5511 } else {
5512 unlock_page(page);
5513 }
5514 }
5515
355808c2 5516 if (bio) {
1f7ad75b 5517 err = submit_one_bio(bio, mirror_num, bio_flags);
79787eaa
JM
5518 if (err)
5519 return err;
355808c2 5520 }
a86c12c7 5521
bb82ab88 5522 if (ret || wait != WAIT_COMPLETE)
d1310b2e 5523 return ret;
d397712b 5524
8436ea91 5525 for (i = 0; i < num_pages; i++) {
fb85fc9a 5526 page = eb->pages[i];
d1310b2e 5527 wait_on_page_locked(page);
d397712b 5528 if (!PageUptodate(page))
d1310b2e 5529 ret = -EIO;
d1310b2e 5530 }
d397712b 5531
d1310b2e 5532 return ret;
ce9adaa5
CM
5533
5534unlock_exit:
d397712b 5535 while (locked_pages > 0) {
ce9adaa5 5536 locked_pages--;
8436ea91
JB
5537 page = eb->pages[locked_pages];
5538 unlock_page(page);
ce9adaa5
CM
5539 }
5540 return ret;
d1310b2e 5541}
d1310b2e 5542
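/*
 * Editor's illustrative sketch (not part of extent_io.c): the common
 * "get a block and make sure it is read" flow built from the helpers above.
 * WAIT_COMPLETE makes read_extent_buffer_pages() wait for the IO and return
 * -EIO if any page failed; mirror_num 0 lets the lower layers pick a copy.
 */
static struct extent_buffer *example_read_tree_block(struct btrfs_fs_info *fs_info,
						     u64 start)
{
	struct extent_buffer *eb;
	int ret;

	eb = alloc_extent_buffer(fs_info, start);
	if (IS_ERR(eb))
		return eb;

	ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, 0);
	if (ret) {
		free_extent_buffer(eb);
		return ERR_PTR(ret);
	}
	return eb;
}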
1cbb1f45
JM
5543void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5544 unsigned long start, unsigned long len)
d1310b2e
CM
5545{
5546 size_t cur;
5547 size_t offset;
5548 struct page *page;
5549 char *kaddr;
5550 char *dst = (char *)dstv;
7073017a 5551 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5552 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e 5553
f716abd5
LB
5554 if (start + len > eb->len) {
5555 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5556 eb->start, eb->len, start, len);
5557 memset(dst, 0, len);
5558 return;
5559 }
d1310b2e 5560
7073017a 5561 offset = offset_in_page(start_offset + start);
d1310b2e 5562
d397712b 5563 while (len > 0) {
fb85fc9a 5564 page = eb->pages[i];
d1310b2e 5565
09cbfeaf 5566 cur = min(len, (PAGE_SIZE - offset));
a6591715 5567 kaddr = page_address(page);
d1310b2e 5568 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
5569
5570 dst += cur;
5571 len -= cur;
5572 offset = 0;
5573 i++;
5574 }
5575}
d1310b2e 5576
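/*
 * Editor's illustrative sketch (not part of extent_io.c): reading a fixed
 * size field out of an eb with read_extent_buffer().  The offset below is a
 * hypothetical byte offset into the block; real callers derive it from the
 * btrfs header/item layout.
 */
static u64 example_read_u64_field(const struct extent_buffer *eb,
				  unsigned long offset)
{
	__le64 raw;

	read_extent_buffer(eb, &raw, offset, sizeof(raw));
	return le64_to_cpu(raw);	/* on-disk fields are little endian */
}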
1cbb1f45
JM
5577int read_extent_buffer_to_user(const struct extent_buffer *eb,
5578 void __user *dstv,
5579 unsigned long start, unsigned long len)
550ac1d8
GH
5580{
5581 size_t cur;
5582 size_t offset;
5583 struct page *page;
5584 char *kaddr;
5585 char __user *dst = (char __user *)dstv;
7073017a 5586 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5587 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
550ac1d8
GH
5588 int ret = 0;
5589
5590 WARN_ON(start > eb->len);
5591 WARN_ON(start + len > eb->start + eb->len);
5592
7073017a 5593 offset = offset_in_page(start_offset + start);
550ac1d8
GH
5594
5595 while (len > 0) {
fb85fc9a 5596 page = eb->pages[i];
550ac1d8 5597
09cbfeaf 5598 cur = min(len, (PAGE_SIZE - offset));
550ac1d8
GH
5599 kaddr = page_address(page);
5600 if (copy_to_user(dst, kaddr + offset, cur)) {
5601 ret = -EFAULT;
5602 break;
5603 }
5604
5605 dst += cur;
5606 len -= cur;
5607 offset = 0;
5608 i++;
5609 }
5610
5611 return ret;
5612}
5613
415b35a5
LB
5614/*
5615 * Return 0 if the item is found within a page.
5616 * Return 1 if the item spans two pages.
5617 * Return -EINVAL otherwise.
5618 */
1cbb1f45
JM
5619int map_private_extent_buffer(const struct extent_buffer *eb,
5620 unsigned long start, unsigned long min_len,
5621 char **map, unsigned long *map_start,
5622 unsigned long *map_len)
d1310b2e 5623{
cc2c39d6 5624 size_t offset;
d1310b2e
CM
5625 char *kaddr;
5626 struct page *p;
7073017a 5627 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5628 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e 5629 unsigned long end_i = (start_offset + start + min_len - 1) >>
09cbfeaf 5630 PAGE_SHIFT;
d1310b2e 5631
f716abd5
LB
5632 if (start + min_len > eb->len) {
5633 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5634 eb->start, eb->len, start, min_len);
5635 return -EINVAL;
5636 }
5637
d1310b2e 5638 if (i != end_i)
415b35a5 5639 return 1;
d1310b2e
CM
5640
5641 if (i == 0) {
5642 offset = start_offset;
5643 *map_start = 0;
5644 } else {
5645 offset = 0;
09cbfeaf 5646 *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
d1310b2e 5647 }
d397712b 5648
fb85fc9a 5649 p = eb->pages[i];
a6591715 5650 kaddr = page_address(p);
d1310b2e 5651 *map = kaddr + offset;
09cbfeaf 5652 *map_len = PAGE_SIZE - offset;
d1310b2e
CM
5653 return 0;
5654}
d1310b2e 5655
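/*
 * Editor's illustrative sketch (not part of extent_io.c): handling the three
 * possible results of map_private_extent_buffer().  0 means the requested
 * range is fully inside one page and can be accessed through *map directly;
 * 1 means it straddles a page boundary, so fall back to the copying helper;
 * anything negative is a bad request.
 */
static int example_peek_range(const struct extent_buffer *eb,
			      unsigned long start, unsigned long len,
			      void *out)
{
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int ret;

	ret = map_private_extent_buffer(eb, start, len, &kaddr,
					&map_start, &map_len);
	if (ret < 0)
		return ret;
	if (ret == 1) {
		/* Spans two pages: let read_extent_buffer() do the split. */
		read_extent_buffer(eb, out, start, len);
		return 0;
	}
	/* kaddr points at eb offset map_start within the mapped page. */
	memcpy(out, kaddr + (start - map_start), len);
	return 0;
}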
1cbb1f45
JM
5656int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5657 unsigned long start, unsigned long len)
d1310b2e
CM
5658{
5659 size_t cur;
5660 size_t offset;
5661 struct page *page;
5662 char *kaddr;
5663 char *ptr = (char *)ptrv;
7073017a 5664 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5665 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5666 int ret = 0;
5667
5668 WARN_ON(start > eb->len);
5669 WARN_ON(start + len > eb->start + eb->len);
5670
7073017a 5671 offset = offset_in_page(start_offset + start);
d1310b2e 5672
d397712b 5673 while (len > 0) {
fb85fc9a 5674 page = eb->pages[i];
d1310b2e 5675
09cbfeaf 5676 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 5677
a6591715 5678 kaddr = page_address(page);
d1310b2e 5679 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
5680 if (ret)
5681 break;
5682
5683 ptr += cur;
5684 len -= cur;
5685 offset = 0;
5686 i++;
5687 }
5688 return ret;
5689}
d1310b2e 5690
f157bf76
DS
5691void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
5692 const void *srcv)
5693{
5694 char *kaddr;
5695
5696 WARN_ON(!PageUptodate(eb->pages[0]));
5697 kaddr = page_address(eb->pages[0]);
5698 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5699 BTRFS_FSID_SIZE);
5700}
5701
5702void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
5703{
5704 char *kaddr;
5705
5706 WARN_ON(!PageUptodate(eb->pages[0]));
5707 kaddr = page_address(eb->pages[0]);
5708 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5709 BTRFS_FSID_SIZE);
5710}
5711
d1310b2e
CM
5712void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5713 unsigned long start, unsigned long len)
5714{
5715 size_t cur;
5716 size_t offset;
5717 struct page *page;
5718 char *kaddr;
5719 char *src = (char *)srcv;
7073017a 5720 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5721 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5722
5723 WARN_ON(start > eb->len);
5724 WARN_ON(start + len > eb->start + eb->len);
5725
7073017a 5726 offset = offset_in_page(start_offset + start);
d1310b2e 5727
d397712b 5728 while (len > 0) {
fb85fc9a 5729 page = eb->pages[i];
d1310b2e
CM
5730 WARN_ON(!PageUptodate(page));
5731
09cbfeaf 5732 cur = min(len, PAGE_SIZE - offset);
a6591715 5733 kaddr = page_address(page);
d1310b2e 5734 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
5735
5736 src += cur;
5737 len -= cur;
5738 offset = 0;
5739 i++;
5740 }
5741}
d1310b2e 5742
b159fa28
DS
5743void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
5744 unsigned long len)
d1310b2e
CM
5745{
5746 size_t cur;
5747 size_t offset;
5748 struct page *page;
5749 char *kaddr;
7073017a 5750 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5751 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5752
5753 WARN_ON(start > eb->len);
5754 WARN_ON(start + len > eb->start + eb->len);
5755
7073017a 5756 offset = offset_in_page(start_offset + start);
d1310b2e 5757
d397712b 5758 while (len > 0) {
fb85fc9a 5759 page = eb->pages[i];
d1310b2e
CM
5760 WARN_ON(!PageUptodate(page));
5761
09cbfeaf 5762 cur = min(len, PAGE_SIZE - offset);
a6591715 5763 kaddr = page_address(page);
b159fa28 5764 memset(kaddr + offset, 0, cur);
d1310b2e
CM
5765
5766 len -= cur;
5767 offset = 0;
5768 i++;
5769 }
5770}
d1310b2e 5771
58e8012c
DS
5772void copy_extent_buffer_full(struct extent_buffer *dst,
5773 struct extent_buffer *src)
5774{
5775 int i;
cc5e31a4 5776 int num_pages;
58e8012c
DS
5777
5778 ASSERT(dst->len == src->len);
5779
65ad0104 5780 num_pages = num_extent_pages(dst);
58e8012c
DS
5781 for (i = 0; i < num_pages; i++)
5782 copy_page(page_address(dst->pages[i]),
5783 page_address(src->pages[i]));
5784}
5785
d1310b2e
CM
5786void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5787 unsigned long dst_offset, unsigned long src_offset,
5788 unsigned long len)
5789{
5790 u64 dst_len = dst->len;
5791 size_t cur;
5792 size_t offset;
5793 struct page *page;
5794 char *kaddr;
7073017a 5795 size_t start_offset = offset_in_page(dst->start);
09cbfeaf 5796 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
d1310b2e
CM
5797
5798 WARN_ON(src->len != dst_len);
5799
7073017a 5800 offset = offset_in_page(start_offset + dst_offset);
d1310b2e 5801
d397712b 5802 while (len > 0) {
fb85fc9a 5803 page = dst->pages[i];
d1310b2e
CM
5804 WARN_ON(!PageUptodate(page));
5805
09cbfeaf 5806 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 5807
a6591715 5808 kaddr = page_address(page);
d1310b2e 5809 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
5810
5811 src_offset += cur;
5812 len -= cur;
5813 offset = 0;
5814 i++;
5815 }
5816}
d1310b2e 5817
3e1e8bb7
OS
5818/*
5819 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
5820 * given bit number
5821 * @eb: the extent buffer
5822 * @start: offset of the bitmap item in the extent buffer
5823 * @nr: bit number
5824 * @page_index: return index of the page in the extent buffer that contains the
5825 * given bit number
5826 * @page_offset: return offset into the page given by page_index
5827 *
5828 * This helper hides the ugliness of finding the byte in an extent buffer which
5829 * contains a given bit.
5830 */
5831static inline void eb_bitmap_offset(struct extent_buffer *eb,
5832 unsigned long start, unsigned long nr,
5833 unsigned long *page_index,
5834 size_t *page_offset)
5835{
7073017a 5836 size_t start_offset = offset_in_page(eb->start);
3e1e8bb7
OS
5837 size_t byte_offset = BIT_BYTE(nr);
5838 size_t offset;
5839
5840 /*
5841 * The byte we want is the offset of the extent buffer + the offset of
5842 * the bitmap item in the extent buffer + the offset of the byte in the
5843 * bitmap item.
5844 */
5845 offset = start_offset + start + byte_offset;
5846
09cbfeaf 5847 *page_index = offset >> PAGE_SHIFT;
7073017a 5848 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
5849}
5850
5851/**
5852 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
5853 * @eb: the extent buffer
5854 * @start: offset of the bitmap item in the extent buffer
5855 * @nr: bit number to test
5856 */
5857int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
5858 unsigned long nr)
5859{
2fe1d551 5860 u8 *kaddr;
3e1e8bb7
OS
5861 struct page *page;
5862 unsigned long i;
5863 size_t offset;
5864
5865 eb_bitmap_offset(eb, start, nr, &i, &offset);
5866 page = eb->pages[i];
5867 WARN_ON(!PageUptodate(page));
5868 kaddr = page_address(page);
5869 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5870}
5871
5872/**
5873 * extent_buffer_bitmap_set - set an area of a bitmap
5874 * @eb: the extent buffer
5875 * @start: offset of the bitmap item in the extent buffer
5876 * @pos: bit number of the first bit
5877 * @len: number of bits to set
5878 */
5879void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
5880 unsigned long pos, unsigned long len)
5881{
2fe1d551 5882 u8 *kaddr;
3e1e8bb7
OS
5883 struct page *page;
5884 unsigned long i;
5885 size_t offset;
5886 const unsigned int size = pos + len;
5887 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5888 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5889
5890 eb_bitmap_offset(eb, start, pos, &i, &offset);
5891 page = eb->pages[i];
5892 WARN_ON(!PageUptodate(page));
5893 kaddr = page_address(page);
5894
5895 while (len >= bits_to_set) {
5896 kaddr[offset] |= mask_to_set;
5897 len -= bits_to_set;
5898 bits_to_set = BITS_PER_BYTE;
9c894696 5899 mask_to_set = ~0;
09cbfeaf 5900 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5901 offset = 0;
5902 page = eb->pages[++i];
5903 WARN_ON(!PageUptodate(page));
5904 kaddr = page_address(page);
5905 }
5906 }
5907 if (len) {
5908 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5909 kaddr[offset] |= mask_to_set;
5910 }
5911}
5912
5913
5914/**
5915 * extent_buffer_bitmap_clear - clear an area of a bitmap
5916 * @eb: the extent buffer
5917 * @start: offset of the bitmap item in the extent buffer
5918 * @pos: bit number of the first bit
5919 * @len: number of bits to clear
5920 */
5921void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
5922 unsigned long pos, unsigned long len)
5923{
2fe1d551 5924 u8 *kaddr;
3e1e8bb7
OS
5925 struct page *page;
5926 unsigned long i;
5927 size_t offset;
5928 const unsigned int size = pos + len;
5929 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5930 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5931
5932 eb_bitmap_offset(eb, start, pos, &i, &offset);
5933 page = eb->pages[i];
5934 WARN_ON(!PageUptodate(page));
5935 kaddr = page_address(page);
5936
5937 while (len >= bits_to_clear) {
5938 kaddr[offset] &= ~mask_to_clear;
5939 len -= bits_to_clear;
5940 bits_to_clear = BITS_PER_BYTE;
9c894696 5941 mask_to_clear = ~0;
09cbfeaf 5942 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5943 offset = 0;
5944 page = eb->pages[++i];
5945 WARN_ON(!PageUptodate(page));
5946 kaddr = page_address(page);
5947 }
5948 }
5949 if (len) {
5950 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5951 kaddr[offset] &= ~mask_to_clear;
5952 }
5953}
5954
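/*
 * Editor's illustrative sketch (not part of extent_io.c): the bitmap helpers
 * above operate on a bitmap item stored inside an extent buffer (as the free
 * space tree does).  'bitmap_start' is a hypothetical byte offset of that
 * item within the eb.
 */
static void example_toggle_bits(struct extent_buffer *eb,
				unsigned long bitmap_start)
{
	/* Set 8 bits starting at bit position 3 ... */
	extent_buffer_bitmap_set(eb, bitmap_start, 3, 8);
	WARN_ON(!extent_buffer_test_bit(eb, bitmap_start, 5));

	/* ... then clear them again. */
	extent_buffer_bitmap_clear(eb, bitmap_start, 3, 8);
	WARN_ON(extent_buffer_test_bit(eb, bitmap_start, 5));
}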
3387206f
ST
5955static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5956{
5957 unsigned long distance = (src > dst) ? src - dst : dst - src;
5958 return distance < len;
5959}
5960
d1310b2e
CM
5961static void copy_pages(struct page *dst_page, struct page *src_page,
5962 unsigned long dst_off, unsigned long src_off,
5963 unsigned long len)
5964{
a6591715 5965 char *dst_kaddr = page_address(dst_page);
d1310b2e 5966 char *src_kaddr;
727011e0 5967 int must_memmove = 0;
d1310b2e 5968
3387206f 5969 if (dst_page != src_page) {
a6591715 5970 src_kaddr = page_address(src_page);
3387206f 5971 } else {
d1310b2e 5972 src_kaddr = dst_kaddr;
727011e0
CM
5973 if (areas_overlap(src_off, dst_off, len))
5974 must_memmove = 1;
3387206f 5975 }
d1310b2e 5976
727011e0
CM
5977 if (must_memmove)
5978 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5979 else
5980 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
5981}
5982
5983void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5984 unsigned long src_offset, unsigned long len)
5985{
0b246afa 5986 struct btrfs_fs_info *fs_info = dst->fs_info;
d1310b2e
CM
5987 size_t cur;
5988 size_t dst_off_in_page;
5989 size_t src_off_in_page;
7073017a 5990 size_t start_offset = offset_in_page(dst->start);
d1310b2e
CM
5991 unsigned long dst_i;
5992 unsigned long src_i;
5993
5994 if (src_offset + len > dst->len) {
0b246afa 5995 btrfs_err(fs_info,
5d163e0e
JM
5996 "memmove bogus src_offset %lu move len %lu dst len %lu",
5997 src_offset, len, dst->len);
290342f6 5998 BUG();
d1310b2e
CM
5999 }
6000 if (dst_offset + len > dst->len) {
0b246afa 6001 btrfs_err(fs_info,
5d163e0e
JM
6002 "memmove bogus dst_offset %lu move len %lu dst len %lu",
6003 dst_offset, len, dst->len);
290342f6 6004 BUG();
d1310b2e
CM
6005 }
6006
d397712b 6007 while (len > 0) {
7073017a
JT
6008 dst_off_in_page = offset_in_page(start_offset + dst_offset);
6009 src_off_in_page = offset_in_page(start_offset + src_offset);
d1310b2e 6010
09cbfeaf
KS
6011 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
6012 src_i = (start_offset + src_offset) >> PAGE_SHIFT;
d1310b2e 6013
09cbfeaf 6014 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
6015 src_off_in_page));
6016 cur = min_t(unsigned long, cur,
09cbfeaf 6017 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 6018
fb85fc9a 6019 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6020 dst_off_in_page, src_off_in_page, cur);
6021
6022 src_offset += cur;
6023 dst_offset += cur;
6024 len -= cur;
6025 }
6026}
d1310b2e
CM
6027
6028void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
6029 unsigned long src_offset, unsigned long len)
6030{
0b246afa 6031 struct btrfs_fs_info *fs_info = dst->fs_info;
d1310b2e
CM
6032 size_t cur;
6033 size_t dst_off_in_page;
6034 size_t src_off_in_page;
6035 unsigned long dst_end = dst_offset + len - 1;
6036 unsigned long src_end = src_offset + len - 1;
7073017a 6037 size_t start_offset = offset_in_page(dst->start);
d1310b2e
CM
6038 unsigned long dst_i;
6039 unsigned long src_i;
6040
6041 if (src_offset + len > dst->len) {
0b246afa 6042 btrfs_err(fs_info,
5d163e0e
JM
6043 "memmove bogus src_offset %lu move len %lu len %lu",
6044 src_offset, len, dst->len);
290342f6 6045 BUG();
d1310b2e
CM
6046 }
6047 if (dst_offset + len > dst->len) {
0b246afa 6048 btrfs_err(fs_info,
5d163e0e
JM
6049 "memmove bogus dst_offset %lu move len %lu len %lu",
6050 dst_offset, len, dst->len);
290342f6 6051 BUG();
d1310b2e 6052 }
727011e0 6053 if (dst_offset < src_offset) {
d1310b2e
CM
6054 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6055 return;
6056 }
d397712b 6057 while (len > 0) {
09cbfeaf
KS
6058 dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
6059 src_i = (start_offset + src_end) >> PAGE_SHIFT;
d1310b2e 6060
7073017a
JT
6061 dst_off_in_page = offset_in_page(start_offset + dst_end);
6062 src_off_in_page = offset_in_page(start_offset + src_end);
d1310b2e
CM
6063
6064 cur = min_t(unsigned long, len, src_off_in_page + 1);
6065 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 6066 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6067 dst_off_in_page - cur + 1,
6068 src_off_in_page - cur + 1, cur);
6069
6070 dst_end -= cur;
6071 src_end -= cur;
6072 len -= cur;
6073 }
6074}
6af118ce 6075
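/*
 * Editor's illustrative sketch (not part of extent_io.c): moving data inside
 * one eb.  memmove_extent_buffer() handles overlapping ranges (it copies
 * backwards when the destination is above the source), so shifting items
 * "to the right" within the same block uses it rather than
 * memcpy_extent_buffer().  The offsets and sizes below are hypothetical.
 */
static void example_shift_right(struct extent_buffer *eb,
				unsigned long item_start,
				unsigned long item_size,
				unsigned long nr_items)
{
	/* Overlapping move: open a gap of one item at item_start. */
	memmove_extent_buffer(eb, item_start + item_size, item_start,
			      item_size * nr_items);

	/* The freed-up slot can then be cleared before it is filled in. */
	memzero_extent_buffer(eb, item_start, item_size);
}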
f7a52a40 6076int try_release_extent_buffer(struct page *page)
19fe0a8b 6077{
6af118ce 6078 struct extent_buffer *eb;
6af118ce 6079
3083ee2e 6080 /*
01327610 6081 * We need to make sure nobody is attaching this page to an eb right
3083ee2e
JB
6082 * now.
6083 */
6084 spin_lock(&page->mapping->private_lock);
6085 if (!PagePrivate(page)) {
6086 spin_unlock(&page->mapping->private_lock);
4f2de97a 6087 return 1;
45f49bce 6088 }
6af118ce 6089
3083ee2e
JB
6090 eb = (struct extent_buffer *)page->private;
6091 BUG_ON(!eb);
19fe0a8b
MX
6092
6093 /*
3083ee2e
JB
6094 * This is a little awful but should be OK; we need to make sure that
6095 * the eb doesn't disappear out from under us while we're looking at
6096 * this page.
19fe0a8b 6097 */
3083ee2e 6098 spin_lock(&eb->refs_lock);
0b32f4bb 6099 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
6100 spin_unlock(&eb->refs_lock);
6101 spin_unlock(&page->mapping->private_lock);
6102 return 0;
b9473439 6103 }
3083ee2e 6104 spin_unlock(&page->mapping->private_lock);
897ca6e9 6105
19fe0a8b 6106 /*
3083ee2e
JB
6107 * If tree ref isn't set then we know the ref on this eb is a real ref,
6108 * so just return; this page will likely be freed soon anyway.
19fe0a8b 6109 */
3083ee2e
JB
6110 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6111 spin_unlock(&eb->refs_lock);
6112 return 0;
b9473439 6113 }
19fe0a8b 6114
f7a52a40 6115 return release_extent_buffer(eb);
6af118ce 6116}