// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_extent_buffer_leak_debug_check(void)
{
	struct extent_buffer *eb;

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
			"%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)			do {} while (0)
#define btrfs_leak_debug_del(entry)			do {} while (0)
#define btrfs_extent_buffer_leak_debug_check()		do {} while (0)
#define btrfs_extent_state_leak_debug_check()		do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, unsigned bits,
				struct extent_changeset *changeset,
				int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}
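
/*
 * Editor's illustration (assumed numbers, not kernel code): clearing
 * EXTENT_DELALLOC from a state covering [0, 1MiB - 1] with a changeset
 * attached adds 1MiB to changeset->bytes_changed and records the pair
 * (0, 1MiB - 1) in changeset->range_changed; if the state never had the
 * bit, nothing is accounted at all.
 */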

static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				       unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	if (tree->ops)
		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
						 mirror_num, bio_flags);
	else
		btrfsic_submit_bio(bio);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	if (epd->bio) {
		epd->bio->bi_status = errno_to_blk_status(ret);
		bio_endio(epd->bio);
		epd->bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;

	if (epd->bio) {
		ret = submit_one_bio(epd->bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio = NULL;
	}
	return ret;
}
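
/*
 * Editor's note (illustration only, not part of the original file): callers
 * that batch pages into an extent_page_data typically finish with one of the
 * two helpers above, roughly:
 *
 *	ret = ...write pages, filling epd.bio...;
 *	if (ret)
 *		end_write_bio(&epd, ret);	// fail the pending bio
 *	else
 *		ret = flush_write_bio(&epd);	// submit what was built up
 *
 * so that epd.bio never leaks a half-built bio.
 */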

int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	btrfs_extent_buffer_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because for the io_tree we
 * hold the tree lock and take the inode lock when setting delalloc. These two
 * things are unrelated, so make a class for the file_extent_tree so we don't
 * get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}

void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * __etree_search - search @tree for an entry that contains @offset. Such an
 * entry would have entry->start <= offset && entry->end >= offset.
 *
 * @tree - the tree to search
 * @offset - offset that should fall within an entry in @tree
 * @next_ret - pointer to the first entry whose range ends after @offset
 * @prev_ret - pointer to the first entry whose range begins before @offset
 * @p_ret - pointer where new node should be anchored (used when inserting an
 *	    entry in the tree)
 * @parent_ret - points to the entry which would have been the parent of the
 *		 entry containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}
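
/*
 * Editor's illustration (assumed example, not kernel code): with two entries
 * [0, 4095] and [8192, 12287] in the tree, __etree_search(tree, 5000, ...)
 * returns NULL because no entry contains offset 5000; *next_ret then points
 * at the node for [8192, 12287] (first entry ending after 5000) and *prev_ret
 * at the node for [0, 4095] (first entry starting before 5000).
 */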

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree. Extents with EXTENT_LOCKED or EXTENT_BOUNDARY in
 * their state field are not merged because the end_io handlers need to
 * be able to do operations on them without sleeping (or doing
 * allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
			"found node %llu %llu on insert of %llu %llu",
			found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created first half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}
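
/*
 * Editor's illustration (assumed numbers): splitting an existing state
 * covering [4096, 16383] at split == 8192 leaves 'orig' as [8192, 16383]
 * and inserts 'prealloc' as [4096, 8191], both carrying the same state bits.
 */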

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	struct inode *inode = tree->private_data;

	btrfs_panic(btrfs_sb(inode->i_sb), err,
	"locking error: extent tree was modified by another thread while locked");
}

/*
 * clear some bits on a range in the tree. This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again. It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree. This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set. The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive. This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again. It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask, NULL);
}


/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range. If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits. This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY. This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, unsigned clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again. It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
				changeset);
}

int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits)
{
	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
				GFP_NOWAIT, NULL);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned bits, int wake, int delete,
		     struct extent_state **cached)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, GFP_NOFS, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}

/*
 * either insert or lock the state struct between start and end, waiting
 * until any conflicting lock on part of the range is released.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	int err;
	u64 failed_start;

	while (1) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       EXTENT_LOCKED, &failed_start,
				       cached_state, GFP_NOFS, NULL);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			       &failed_start, NULL, GFP_NOFS, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL);
		return 0;
	}
	return 1;
}
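
/*
 * Editor's illustration, not used by btrfs itself: the usual pairing of the
 * locking helpers above. lock_extent_bits() blocks until [start, end] is
 * exclusively locked; dropping EXTENT_LOCKED again is done here with
 * clear_extent_bit() (wake == 1 so other waiters are woken). The function
 * name is made up for the example.
 */
static void __maybe_unused example_locked_range_sketch(struct extent_io_tree *tree,
						       u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	lock_extent_bits(tree, start, end, &cached);

	/* ... operate on [start, end] while it is EXTENT_LOCKED ... */

	clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, &cached);
}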

void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		__set_page_dirty_nobuffers(page);
		account_page_redirty(page);
		put_page(page);
		index++;
	}
}

/* find the first state struct with 'bits' set after 'start', and
 * return it. tree->lock must be held. NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
			    u64 start, unsigned bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned; if something was found, 0 is returned.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}
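
/*
 * Editor's illustration, not used by btrfs itself: walking every range that
 * has one of the given bits set by repeatedly calling find_first_extent_bit()
 * and restarting just past each hit. The helper name is made up for the
 * example.
 */
static void __maybe_unused example_walk_extent_bits(struct extent_io_tree *tree,
						    unsigned bits)
{
	u64 start = 0;
	u64 found_start;
	u64 found_end;

	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      bits, NULL)) {
		/* [found_start, found_end] has at least one of 'bits' set */
		if (found_end == (u64)-1)
			break;
		start = found_end + 1;
	}
}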

/**
 * find_contiguous_extent_bit: find a contiguous area of bits
 * @tree - io tree to check
 * @start - offset to start the search from
 * @start_ret - the first offset we found with the bits set
 * @end_ret - the final contiguous range of the bits that were set
 * @bits - bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again. During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits. We will search to the first bit we find, and
 * then walk down the tree until we find a non-contiguous area. The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, unsigned bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

/**
 * find_first_clear_extent_bit - find the first range that has @bits not set.
 * This range could start before @start.
 *
 * @tree - the tree to search
 * @start - the offset at/after which the found extent should start
 * @start_ret - records the beginning of the range
 * @end_ret - records the end of the range (inclusive)
 * @bits - the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set, it's possible that @end_ret contains -1; this happens when the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, unsigned bits)
{
	struct extent_state *state;
	struct rb_node *node, *prev = NULL, *next;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		node = __etree_search(tree, start, &next, &prev, NULL, NULL);
		if (!node && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!node && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			state = rb_entry(prev, struct extent_state, rb_node);
			*start_ret = state->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!node) {
			node = next;
		}
		/*
		 * At this point 'node' either contains 'start' or start is
		 * before 'node'
		 */
		state = rb_entry(node, struct extent_state, rb_node);

		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits set--|
				 *    |
				 *    start
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as
				 * the beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 * or
			 *
			 * |---hole/unset--||--first node--|
			 * 0   |
			 *    start
			 */
			if (prev) {
				state = rb_entry(prev, struct extent_state,
						 rb_node);
				*start_ret = state->end + 1;
			} else {
				*start_ret = 0;
			}
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			*end_ret = state->start - 1;
			break;
		}

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
}
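
/*
 * Editor's illustration (assumed numbers): if @bits is set on [0, 64K - 1]
 * and on [128K, 192K - 1], a call with start == 64K returns the gap between
 * them, i.e. *start_ret == 64K and *end_ret == 128K - 1. With start beyond
 * the last range, *end_ret is -1 as described in the comment above.
 */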
1733
d352ac68
CM
1734/*
1735 * find a contiguous range of bytes in the file marked as delalloc, not
1736 * more than 'max_bytes'. start and end are used to return the range,
1737 *
3522e903 1738 * true is returned if we find something, false if nothing was in the tree
d352ac68 1739 */
083e75e7
JB
1740bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1741 u64 *end, u64 max_bytes,
1742 struct extent_state **cached_state)
d1310b2e
CM
1743{
1744 struct rb_node *node;
1745 struct extent_state *state;
1746 u64 cur_start = *start;
3522e903 1747 bool found = false;
d1310b2e
CM
1748 u64 total_bytes = 0;
1749
cad321ad 1750 spin_lock(&tree->lock);
c8b97818 1751
d1310b2e
CM
1752 /*
1753 * this search will find all the extents that end after
1754 * our range starts.
1755 */
80ea96b1 1756 node = tree_search(tree, cur_start);
2b114d1d 1757 if (!node) {
3522e903 1758 *end = (u64)-1;
d1310b2e
CM
1759 goto out;
1760 }
1761
d397712b 1762 while (1) {
d1310b2e 1763 state = rb_entry(node, struct extent_state, rb_node);
5b21f2ed
ZY
1764 if (found && (state->start != cur_start ||
1765 (state->state & EXTENT_BOUNDARY))) {
d1310b2e
CM
1766 goto out;
1767 }
1768 if (!(state->state & EXTENT_DELALLOC)) {
1769 if (!found)
1770 *end = state->end;
1771 goto out;
1772 }
c2a128d2 1773 if (!found) {
d1310b2e 1774 *start = state->start;
c2a128d2 1775 *cached_state = state;
b7ac31b7 1776 refcount_inc(&state->refs);
c2a128d2 1777 }
3522e903 1778 found = true;
d1310b2e
CM
1779 *end = state->end;
1780 cur_start = state->end + 1;
1781 node = rb_next(node);
d1310b2e 1782 total_bytes += state->end - state->start + 1;
7bf811a5 1783 if (total_bytes >= max_bytes)
573aecaf 1784 break;
573aecaf 1785 if (!node)
d1310b2e
CM
1786 break;
1787 }
1788out:
cad321ad 1789 spin_unlock(&tree->lock);
d1310b2e
CM
1790 return found;
1791}
1792
da2c7009
LB
1793static int __process_pages_contig(struct address_space *mapping,
1794 struct page *locked_page,
1795 pgoff_t start_index, pgoff_t end_index,
1796 unsigned long page_ops, pgoff_t *index_ret);
1797
143bede5
JM
1798static noinline void __unlock_for_delalloc(struct inode *inode,
1799 struct page *locked_page,
1800 u64 start, u64 end)
c8b97818 1801{
09cbfeaf
KS
1802 unsigned long index = start >> PAGE_SHIFT;
1803 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 1804
76c0021d 1805 ASSERT(locked_page);
c8b97818 1806 if (index == locked_page->index && end_index == index)
143bede5 1807 return;
c8b97818 1808
76c0021d
LB
1809 __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1810 PAGE_UNLOCK, NULL);
c8b97818
CM
1811}
1812
1813static noinline int lock_delalloc_pages(struct inode *inode,
1814 struct page *locked_page,
1815 u64 delalloc_start,
1816 u64 delalloc_end)
1817{
09cbfeaf 1818 unsigned long index = delalloc_start >> PAGE_SHIFT;
76c0021d 1819 unsigned long index_ret = index;
09cbfeaf 1820 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
c8b97818 1821 int ret;
c8b97818 1822
76c0021d 1823 ASSERT(locked_page);
c8b97818
CM
1824 if (index == locked_page->index && index == end_index)
1825 return 0;
1826
76c0021d
LB
1827 ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1828 end_index, PAGE_LOCK, &index_ret);
1829 if (ret == -EAGAIN)
1830 __unlock_for_delalloc(inode, locked_page, delalloc_start,
1831 (u64)index_ret << PAGE_SHIFT);
c8b97818
CM
1832 return ret;
1833}
1834
1835/*
3522e903
LF
1836 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1837 * more than @max_bytes. @start and @end are used to return the range.
c8b97818 1838 *
3522e903
LF
1839 * Return: true if we find something
1840 * false if nothing was in the tree
c8b97818 1841 */
ce9f967f 1842EXPORT_FOR_TESTS
3522e903 1843noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 1844 struct page *locked_page, u64 *start,
917aacec 1845 u64 *end)
c8b97818 1846{
9978059b 1847 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
917aacec 1848 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
1849 u64 delalloc_start;
1850 u64 delalloc_end;
3522e903 1851 bool found;
9655d298 1852 struct extent_state *cached_state = NULL;
c8b97818
CM
1853 int ret;
1854 int loops = 0;
1855
1856again:
1857 /* step one, find a bunch of delalloc bytes starting at start */
1858 delalloc_start = *start;
1859 delalloc_end = 0;
083e75e7
JB
1860 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1861 max_bytes, &cached_state);
70b99e69 1862 if (!found || delalloc_end <= *start) {
c8b97818
CM
1863 *start = delalloc_start;
1864 *end = delalloc_end;
c2a128d2 1865 free_extent_state(cached_state);
3522e903 1866 return false;
c8b97818
CM
1867 }
1868
70b99e69
CM
1869 /*
1870 * start comes from the offset of locked_page. We have to lock
1871 * pages in order, so we can't process delalloc bytes before
1872 * locked_page
1873 */
d397712b 1874 if (delalloc_start < *start)
70b99e69 1875 delalloc_start = *start;
70b99e69 1876
c8b97818
CM
1877 /*
1878 * make sure to limit the number of pages we try to lock down
c8b97818 1879 */
7bf811a5
JB
1880 if (delalloc_end + 1 - delalloc_start > max_bytes)
1881 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 1882
c8b97818
CM
1883 /* step two, lock all the pages after the page that has start */
1884 ret = lock_delalloc_pages(inode, locked_page,
1885 delalloc_start, delalloc_end);
9bfd61d9 1886 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
1887 if (ret == -EAGAIN) {
1888		/* some of the pages are gone, let's avoid looping by
1889 * shortening the size of the delalloc range we're searching
1890 */
9655d298 1891 free_extent_state(cached_state);
7d788742 1892 cached_state = NULL;
c8b97818 1893 if (!loops) {
09cbfeaf 1894 max_bytes = PAGE_SIZE;
c8b97818
CM
1895 loops = 1;
1896 goto again;
1897 } else {
3522e903 1898 found = false;
c8b97818
CM
1899 goto out_failed;
1900 }
1901 }
c8b97818
CM
1902
1903 /* step three, lock the state bits for the whole range */
ff13db41 1904 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
1905
1906 /* then test to make sure it is all still delalloc */
1907 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 1908 EXTENT_DELALLOC, 1, cached_state);
c8b97818 1909 if (!ret) {
9655d298 1910 unlock_extent_cached(tree, delalloc_start, delalloc_end,
e43bbe5e 1911 &cached_state);
c8b97818
CM
1912 __unlock_for_delalloc(inode, locked_page,
1913 delalloc_start, delalloc_end);
1914 cond_resched();
1915 goto again;
1916 }
9655d298 1917 free_extent_state(cached_state);
c8b97818
CM
1918 *start = delalloc_start;
1919 *end = delalloc_end;
1920out_failed:
1921 return found;
1922}
1923
da2c7009
LB
1924static int __process_pages_contig(struct address_space *mapping,
1925 struct page *locked_page,
1926 pgoff_t start_index, pgoff_t end_index,
1927 unsigned long page_ops, pgoff_t *index_ret)
c8b97818 1928{
873695b3 1929 unsigned long nr_pages = end_index - start_index + 1;
da2c7009 1930 unsigned long pages_locked = 0;
873695b3 1931 pgoff_t index = start_index;
c8b97818 1932 struct page *pages[16];
873695b3 1933 unsigned ret;
da2c7009 1934 int err = 0;
c8b97818 1935 int i;
771ed689 1936
da2c7009
LB
1937 if (page_ops & PAGE_LOCK) {
1938 ASSERT(page_ops == PAGE_LOCK);
1939 ASSERT(index_ret && *index_ret == start_index);
1940 }
1941
704de49d 1942 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
873695b3 1943 mapping_set_error(mapping, -EIO);
704de49d 1944
d397712b 1945 while (nr_pages > 0) {
873695b3 1946 ret = find_get_pages_contig(mapping, index,
5b050f04
CM
1947 min_t(unsigned long,
1948 nr_pages, ARRAY_SIZE(pages)), pages);
da2c7009
LB
1949 if (ret == 0) {
1950 /*
1951 * Only if we're going to lock these pages,
1952 * can we find nothing at @index.
1953 */
1954 ASSERT(page_ops & PAGE_LOCK);
49d4a334
LB
1955 err = -EAGAIN;
1956 goto out;
da2c7009 1957 }
8b62b72b 1958
da2c7009 1959 for (i = 0; i < ret; i++) {
c2790a2e 1960 if (page_ops & PAGE_SET_PRIVATE2)
8b62b72b
CM
1961 SetPagePrivate2(pages[i]);
1962
1d53c9e6 1963 if (locked_page && pages[i] == locked_page) {
09cbfeaf 1964 put_page(pages[i]);
da2c7009 1965 pages_locked++;
c8b97818
CM
1966 continue;
1967 }
c2790a2e 1968 if (page_ops & PAGE_CLEAR_DIRTY)
c8b97818 1969 clear_page_dirty_for_io(pages[i]);
c2790a2e 1970 if (page_ops & PAGE_SET_WRITEBACK)
c8b97818 1971 set_page_writeback(pages[i]);
704de49d
FM
1972 if (page_ops & PAGE_SET_ERROR)
1973 SetPageError(pages[i]);
c2790a2e 1974 if (page_ops & PAGE_END_WRITEBACK)
c8b97818 1975 end_page_writeback(pages[i]);
c2790a2e 1976 if (page_ops & PAGE_UNLOCK)
771ed689 1977 unlock_page(pages[i]);
da2c7009
LB
1978 if (page_ops & PAGE_LOCK) {
1979 lock_page(pages[i]);
1980 if (!PageDirty(pages[i]) ||
1981 pages[i]->mapping != mapping) {
1982 unlock_page(pages[i]);
1983 put_page(pages[i]);
1984 err = -EAGAIN;
1985 goto out;
1986 }
1987 }
09cbfeaf 1988 put_page(pages[i]);
da2c7009 1989 pages_locked++;
c8b97818
CM
1990 }
1991 nr_pages -= ret;
1992 index += ret;
1993 cond_resched();
1994 }
da2c7009
LB
1995out:
1996 if (err && index_ret)
1997 *index_ret = start_index + pages_locked - 1;
1998 return err;
c8b97818 1999}
c8b97818 2000
873695b3 2001void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
74e9194a
NB
2002 struct page *locked_page,
2003 unsigned clear_bits,
2004 unsigned long page_ops)
873695b3
LB
2005{
2006 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
ae0f1625 2007 NULL);
873695b3
LB
2008
2009 __process_pages_contig(inode->i_mapping, locked_page,
2010 start >> PAGE_SHIFT, end >> PAGE_SHIFT,
da2c7009 2011 page_ops, NULL);
873695b3
LB
2012}
2013
d352ac68
CM
2014/*
2015 * count the number of bytes in the tree that have a given bit(s)
2016 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2017 * cached. The total number found is returned.
2018 */
d1310b2e
CM
2019u64 count_range_bits(struct extent_io_tree *tree,
2020 u64 *start, u64 search_end, u64 max_bytes,
9ee49a04 2021 unsigned bits, int contig)
d1310b2e
CM
2022{
2023 struct rb_node *node;
2024 struct extent_state *state;
2025 u64 cur_start = *start;
2026 u64 total_bytes = 0;
ec29ed5b 2027 u64 last = 0;
d1310b2e
CM
2028 int found = 0;
2029
fae7f21c 2030 if (WARN_ON(search_end <= cur_start))
d1310b2e 2031 return 0;
d1310b2e 2032
cad321ad 2033 spin_lock(&tree->lock);
d1310b2e
CM
2034 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2035 total_bytes = tree->dirty_bytes;
2036 goto out;
2037 }
2038 /*
2039 * this search will find all the extents that end after
2040 * our range starts.
2041 */
80ea96b1 2042 node = tree_search(tree, cur_start);
d397712b 2043 if (!node)
d1310b2e 2044 goto out;
d1310b2e 2045
d397712b 2046 while (1) {
d1310b2e
CM
2047 state = rb_entry(node, struct extent_state, rb_node);
2048 if (state->start > search_end)
2049 break;
ec29ed5b
CM
2050 if (contig && found && state->start > last + 1)
2051 break;
2052 if (state->end >= cur_start && (state->state & bits) == bits) {
d1310b2e
CM
2053 total_bytes += min(search_end, state->end) + 1 -
2054 max(cur_start, state->start);
2055 if (total_bytes >= max_bytes)
2056 break;
2057 if (!found) {
af60bed2 2058 *start = max(cur_start, state->start);
d1310b2e
CM
2059 found = 1;
2060 }
ec29ed5b
CM
2061 last = state->end;
2062 } else if (contig && found) {
2063 break;
d1310b2e
CM
2064 }
2065 node = rb_next(node);
2066 if (!node)
2067 break;
2068 }
2069out:
cad321ad 2070 spin_unlock(&tree->lock);
d1310b2e
CM
2071 return total_bytes;
2072}
b2950863 2073
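/*
 * Minimal sketch of the count_range_bits() contract above; the tree and the
 * byte range are hypothetical.  With contig == 1 counting stops at the first
 * gap between matching extents, with contig == 0 discontiguous ranges keep
 * adding up.  On success *start is advanced to the first matching byte.
 */
static u64 example_count_dirty(struct extent_io_tree *tree, u64 from, u64 to)
{
	u64 first_dirty = from;

	return count_range_bits(tree, &first_dirty, to, (u64)-1,
				EXTENT_DIRTY, 1);
}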
d352ac68
CM
2074/*
2075 * set the failure record for a given byte offset in the tree. If there isn't
2076 * an extent_state starting at that offset, -ENOENT is returned.
2077 */
b3f167aa
JB
2078int set_state_failrec(struct extent_io_tree *tree, u64 start,
2079 struct io_failure_record *failrec)
d1310b2e
CM
2080{
2081 struct rb_node *node;
2082 struct extent_state *state;
2083 int ret = 0;
2084
cad321ad 2085 spin_lock(&tree->lock);
d1310b2e
CM
2086 /*
2087 * this search will find all the extents that end after
2088 * our range starts.
2089 */
80ea96b1 2090 node = tree_search(tree, start);
2b114d1d 2091 if (!node) {
d1310b2e
CM
2092 ret = -ENOENT;
2093 goto out;
2094 }
2095 state = rb_entry(node, struct extent_state, rb_node);
2096 if (state->start != start) {
2097 ret = -ENOENT;
2098 goto out;
2099 }
47dc196a 2100 state->failrec = failrec;
d1310b2e 2101out:
cad321ad 2102 spin_unlock(&tree->lock);
d1310b2e
CM
2103 return ret;
2104}
2105
b3f167aa
JB
2106int get_state_failrec(struct extent_io_tree *tree, u64 start,
2107 struct io_failure_record **failrec)
d1310b2e
CM
2108{
2109 struct rb_node *node;
2110 struct extent_state *state;
2111 int ret = 0;
2112
cad321ad 2113 spin_lock(&tree->lock);
d1310b2e
CM
2114 /*
2115 * this search will find all the extents that end after
2116 * our range starts.
2117 */
80ea96b1 2118 node = tree_search(tree, start);
2b114d1d 2119 if (!node) {
d1310b2e
CM
2120 ret = -ENOENT;
2121 goto out;
2122 }
2123 state = rb_entry(node, struct extent_state, rb_node);
2124 if (state->start != start) {
2125 ret = -ENOENT;
2126 goto out;
2127 }
47dc196a 2128 *failrec = state->failrec;
d1310b2e 2129out:
cad321ad 2130 spin_unlock(&tree->lock);
d1310b2e
CM
2131 return ret;
2132}
2133
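/*
 * Sketch of the set/get contract above, assuming 'failure_tree' already
 * holds an extent_state that starts exactly at 'offset' (both helpers
 * return -ENOENT otherwise).  Purely illustrative; the real callers are
 * the IO-repair paths further down in this file.
 */
static int example_failrec_roundtrip(struct extent_io_tree *failure_tree,
				     u64 offset,
				     struct io_failure_record *rec)
{
	struct io_failure_record *found = NULL;
	int ret;

	ret = set_state_failrec(failure_tree, offset, rec);
	if (ret)
		return ret;	/* -ENOENT: no state starting at 'offset' */

	ret = get_state_failrec(failure_tree, offset, &found);
	if (ret)
		return ret;

	return found == rec ? 0 : -EUCLEAN;
}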
2134/*
2135 * searches a range in the state tree for a given mask.
70dec807 2136 * If 'filled' == 1, this returns 1 only if every extent in the range
d1310b2e
CM
2137 * has the bits set. Otherwise, 1 is returned if any bit in the
2138 * range is found set.
2139 */
2140int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
9ee49a04 2141 unsigned bits, int filled, struct extent_state *cached)
d1310b2e
CM
2142{
2143 struct extent_state *state = NULL;
2144 struct rb_node *node;
2145 int bitset = 0;
d1310b2e 2146
cad321ad 2147 spin_lock(&tree->lock);
27a3507d 2148 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
df98b6e2 2149 cached->end > start)
9655d298
CM
2150 node = &cached->rb_node;
2151 else
2152 node = tree_search(tree, start);
d1310b2e
CM
2153 while (node && start <= end) {
2154 state = rb_entry(node, struct extent_state, rb_node);
2155
2156 if (filled && state->start > start) {
2157 bitset = 0;
2158 break;
2159 }
2160
2161 if (state->start > end)
2162 break;
2163
2164 if (state->state & bits) {
2165 bitset = 1;
2166 if (!filled)
2167 break;
2168 } else if (filled) {
2169 bitset = 0;
2170 break;
2171 }
46562cec
CM
2172
2173 if (state->end == (u64)-1)
2174 break;
2175
d1310b2e
CM
2176 start = state->end + 1;
2177 if (start > end)
2178 break;
2179 node = rb_next(node);
2180 if (!node) {
2181 if (filled)
2182 bitset = 0;
2183 break;
2184 }
2185 }
cad321ad 2186 spin_unlock(&tree->lock);
d1310b2e
CM
2187 return bitset;
2188}
d1310b2e
CM
2189
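/*
 * Sketch of the 'filled' semantics documented above; the tree and the range
 * are hypothetical.  The same range can answer differently depending on
 * whether full coverage is required.
 */
static void example_test_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	/* 1 only if every byte in [start, end] has EXTENT_UPTODATE set */
	int fully_uptodate = test_range_bit(tree, start, end,
					    EXTENT_UPTODATE, 1, NULL);
	/* 1 if any part of [start, end] has EXTENT_DELALLOC set */
	int any_delalloc = test_range_bit(tree, start, end,
					  EXTENT_DELALLOC, 0, NULL);

	if (fully_uptodate && !any_delalloc) {
		/* e.g. the whole range can be served from cached pages */
	}
}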
2190/*
2191 * helper function to set a given page up to date if all the
2192 * extents in the tree for that page are up to date
2193 */
143bede5 2194static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
d1310b2e 2195{
4eee4fa4 2196 u64 start = page_offset(page);
09cbfeaf 2197 u64 end = start + PAGE_SIZE - 1;
9655d298 2198 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
d1310b2e 2199 SetPageUptodate(page);
d1310b2e
CM
2200}
2201
7870d082
JB
2202int free_io_failure(struct extent_io_tree *failure_tree,
2203 struct extent_io_tree *io_tree,
2204 struct io_failure_record *rec)
4a54c8c1
JS
2205{
2206 int ret;
2207 int err = 0;
4a54c8c1 2208
47dc196a 2209 set_state_failrec(failure_tree, rec->start, NULL);
4a54c8c1
JS
2210 ret = clear_extent_bits(failure_tree, rec->start,
2211 rec->start + rec->len - 1,
91166212 2212 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1
JS
2213 if (ret)
2214 err = ret;
2215
7870d082 2216 ret = clear_extent_bits(io_tree, rec->start,
53b381b3 2217 rec->start + rec->len - 1,
91166212 2218 EXTENT_DAMAGED);
53b381b3
DW
2219 if (ret && !err)
2220 err = ret;
4a54c8c1
JS
2221
2222 kfree(rec);
2223 return err;
2224}
2225
4a54c8c1
JS
2226/*
2227 * this bypasses the standard btrfs submit functions deliberately, as
2228 * the standard behavior is to write all copies in a raid setup. here we only
2229 * want to write the one bad copy. so we do the mapping for ourselves and issue
2230 * submit_bio directly.
3ec706c8 2231 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
2232 * actually prevents the read that triggered the error from finishing.
2233 * currently, there can be no more than two copies of every data bit. thus,
2234 * exactly one rewrite is required.
2235 */
6ec656bc
JB
2236int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2237 u64 length, u64 logical, struct page *page,
2238 unsigned int pg_offset, int mirror_num)
4a54c8c1
JS
2239{
2240 struct bio *bio;
2241 struct btrfs_device *dev;
4a54c8c1
JS
2242 u64 map_length = 0;
2243 u64 sector;
2244 struct btrfs_bio *bbio = NULL;
2245 int ret;
2246
1751e8a6 2247 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
2248 BUG_ON(!mirror_num);
2249
c5e4c3d7 2250 bio = btrfs_io_bio_alloc(1);
4f024f37 2251 bio->bi_iter.bi_size = 0;
4a54c8c1
JS
2252 map_length = length;
2253
b5de8d0d
FM
2254 /*
2255 * Avoid races with device replace and make sure our bbio has devices
2256 * associated to its stripes that don't go away while we are doing the
2257 * read repair operation.
2258 */
2259 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 2260 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
2261 /*
2262 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2263 * to update all raid stripes, but here we just want to correct
2264 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2265 * stripe's dev and sector.
2266 */
2267 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2268 &map_length, &bbio, 0);
2269 if (ret) {
2270 btrfs_bio_counter_dec(fs_info);
2271 bio_put(bio);
2272 return -EIO;
2273 }
2274 ASSERT(bbio->mirror_num == 1);
2275 } else {
2276 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2277 &map_length, &bbio, mirror_num);
2278 if (ret) {
2279 btrfs_bio_counter_dec(fs_info);
2280 bio_put(bio);
2281 return -EIO;
2282 }
2283 BUG_ON(mirror_num != bbio->mirror_num);
4a54c8c1 2284 }
c725328c
LB
2285
2286 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
4f024f37 2287 bio->bi_iter.bi_sector = sector;
c725328c 2288 dev = bbio->stripes[bbio->mirror_num - 1].dev;
6e9606d2 2289 btrfs_put_bbio(bbio);
ebbede42
AJ
2290 if (!dev || !dev->bdev ||
2291 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
b5de8d0d 2292 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2293 bio_put(bio);
2294 return -EIO;
2295 }
74d46992 2296 bio_set_dev(bio, dev->bdev);
70fd7614 2297 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ffdd2018 2298 bio_add_page(bio, page, length, pg_offset);
4a54c8c1 2299
4e49ea4a 2300 if (btrfsic_submit_bio_wait(bio)) {
4a54c8c1 2301 /* try to remap that extent elsewhere? */
b5de8d0d 2302 btrfs_bio_counter_dec(fs_info);
4a54c8c1 2303 bio_put(bio);
442a4f63 2304 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4a54c8c1
JS
2305 return -EIO;
2306 }
2307
b14af3b4
DS
2308 btrfs_info_rl_in_rcu(fs_info,
2309 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 2310 ino, start,
1203b681 2311 rcu_str_deref(dev->name), sector);
b5de8d0d 2312 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2313 bio_put(bio);
2314 return 0;
2315}
2316
20a1fbf9 2317int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
ea466794 2318{
20a1fbf9 2319 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 2320 u64 start = eb->start;
cc5e31a4 2321 int i, num_pages = num_extent_pages(eb);
d95603b2 2322 int ret = 0;
ea466794 2323
bc98a42c 2324 if (sb_rdonly(fs_info->sb))
908960c6
ID
2325 return -EROFS;
2326
ea466794 2327 for (i = 0; i < num_pages; i++) {
fb85fc9a 2328 struct page *p = eb->pages[i];
1203b681 2329
6ec656bc 2330 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 2331 start - page_offset(p), mirror_num);
ea466794
JB
2332 if (ret)
2333 break;
09cbfeaf 2334 start += PAGE_SIZE;
ea466794
JB
2335 }
2336
2337 return ret;
2338}
2339
4a54c8c1
JS
2340/*
2341 * each time an IO finishes, we do a fast check in the IO failure tree
2342 * to see if we need to process or clean up an io_failure_record
2343 */
7870d082
JB
2344int clean_io_failure(struct btrfs_fs_info *fs_info,
2345 struct extent_io_tree *failure_tree,
2346 struct extent_io_tree *io_tree, u64 start,
2347 struct page *page, u64 ino, unsigned int pg_offset)
4a54c8c1
JS
2348{
2349 u64 private;
4a54c8c1 2350 struct io_failure_record *failrec;
4a54c8c1
JS
2351 struct extent_state *state;
2352 int num_copies;
4a54c8c1 2353 int ret;
4a54c8c1
JS
2354
2355 private = 0;
7870d082
JB
2356 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2357 EXTENT_DIRTY, 0);
4a54c8c1
JS
2358 if (!ret)
2359 return 0;
2360
7870d082 2361 ret = get_state_failrec(failure_tree, start, &failrec);
4a54c8c1
JS
2362 if (ret)
2363 return 0;
2364
4a54c8c1
JS
2365 BUG_ON(!failrec->this_mirror);
2366
2367 if (failrec->in_validation) {
2368 /* there was no real error, just free the record */
ab8d0fc4
JM
2369 btrfs_debug(fs_info,
2370 "clean_io_failure: freeing dummy error at %llu",
2371 failrec->start);
4a54c8c1
JS
2372 goto out;
2373 }
bc98a42c 2374 if (sb_rdonly(fs_info->sb))
908960c6 2375 goto out;
4a54c8c1 2376
7870d082
JB
2377 spin_lock(&io_tree->lock);
2378 state = find_first_extent_bit_state(io_tree,
4a54c8c1
JS
2379 failrec->start,
2380 EXTENT_LOCKED);
7870d082 2381 spin_unlock(&io_tree->lock);
4a54c8c1 2382
883d0de4
MX
2383 if (state && state->start <= failrec->start &&
2384 state->end >= failrec->start + failrec->len - 1) {
3ec706c8
SB
2385 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2386 failrec->len);
4a54c8c1 2387 if (num_copies > 1) {
7870d082
JB
2388 repair_io_failure(fs_info, ino, start, failrec->len,
2389 failrec->logical, page, pg_offset,
2390 failrec->failed_mirror);
4a54c8c1
JS
2391 }
2392 }
2393
2394out:
7870d082 2395 free_io_failure(failure_tree, io_tree, failrec);
4a54c8c1 2396
454ff3de 2397 return 0;
4a54c8c1
JS
2398}
2399
f612496b
MX
2400/*
2401 * Can be called when
2402 * - holding the extent lock
2403 * - under an ordered extent
2404 * - the inode is being freed
2405 */
7ab7956e 2406void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 2407{
7ab7956e 2408 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
f612496b
MX
2409 struct io_failure_record *failrec;
2410 struct extent_state *state, *next;
2411
2412 if (RB_EMPTY_ROOT(&failure_tree->state))
2413 return;
2414
2415 spin_lock(&failure_tree->lock);
2416 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2417 while (state) {
2418 if (state->start > end)
2419 break;
2420
2421 ASSERT(state->end <= end);
2422
2423 next = next_state(state);
2424
47dc196a 2425 failrec = state->failrec;
f612496b
MX
2426 free_extent_state(state);
2427 kfree(failrec);
2428
2429 state = next;
2430 }
2431 spin_unlock(&failure_tree->lock);
2432}
2433
2fe6303e 2434int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
47dc196a 2435 struct io_failure_record **failrec_ret)
4a54c8c1 2436{
ab8d0fc4 2437 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2438 struct io_failure_record *failrec;
4a54c8c1 2439 struct extent_map *em;
4a54c8c1
JS
2440 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2441 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2442 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4a54c8c1 2443 int ret;
4a54c8c1
JS
2444 u64 logical;
2445
47dc196a 2446 ret = get_state_failrec(failure_tree, start, &failrec);
4a54c8c1
JS
2447 if (ret) {
2448 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2449 if (!failrec)
2450 return -ENOMEM;
2fe6303e 2451
4a54c8c1
JS
2452 failrec->start = start;
2453 failrec->len = end - start + 1;
2454 failrec->this_mirror = 0;
2455 failrec->bio_flags = 0;
2456 failrec->in_validation = 0;
2457
2458 read_lock(&em_tree->lock);
2459 em = lookup_extent_mapping(em_tree, start, failrec->len);
2460 if (!em) {
2461 read_unlock(&em_tree->lock);
2462 kfree(failrec);
2463 return -EIO;
2464 }
2465
68ba990f 2466 if (em->start > start || em->start + em->len <= start) {
4a54c8c1
JS
2467 free_extent_map(em);
2468 em = NULL;
2469 }
2470 read_unlock(&em_tree->lock);
7a2d6a64 2471 if (!em) {
4a54c8c1
JS
2472 kfree(failrec);
2473 return -EIO;
2474 }
2fe6303e 2475
4a54c8c1
JS
2476 logical = start - em->start;
2477 logical = em->block_start + logical;
2478 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2479 logical = em->block_start;
2480 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2481 extent_set_compress_type(&failrec->bio_flags,
2482 em->compress_type);
2483 }
2fe6303e 2484
ab8d0fc4
JM
2485 btrfs_debug(fs_info,
2486 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2487 logical, start, failrec->len);
2fe6303e 2488
4a54c8c1
JS
2489 failrec->logical = logical;
2490 free_extent_map(em);
2491
2492 /* set the bits in the private failure tree */
2493 ret = set_extent_bits(failure_tree, start, end,
ceeb0ae7 2494 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1 2495 if (ret >= 0)
47dc196a 2496 ret = set_state_failrec(failure_tree, start, failrec);
4a54c8c1
JS
2497 /* set the bits in the inode's tree */
2498 if (ret >= 0)
ceeb0ae7 2499 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
4a54c8c1
JS
2500 if (ret < 0) {
2501 kfree(failrec);
2502 return ret;
2503 }
2504 } else {
ab8d0fc4
JM
2505 btrfs_debug(fs_info,
2506 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2507 failrec->logical, failrec->start, failrec->len,
2508 failrec->in_validation);
4a54c8c1
JS
2509 /*
2510 * when data can be on disk more than twice, add to failrec here
2511 * (e.g. with a list for failed_mirror) to make
2512 * clean_io_failure() clean all those errors at once.
2513 */
2514 }
2fe6303e
MX
2515
2516 *failrec_ret = failrec;
2517
2518 return 0;
2519}
2520
a0b60d72 2521bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2fe6303e
MX
2522 struct io_failure_record *failrec, int failed_mirror)
2523{
ab8d0fc4 2524 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2525 int num_copies;
2526
ab8d0fc4 2527 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
4a54c8c1
JS
2528 if (num_copies == 1) {
2529 /*
2530 * we only have a single copy of the data, so don't bother with
2531 * all the retry and error correction code that follows. no
2532 * matter what the error is, it is very likely to persist.
2533 */
ab8d0fc4
JM
2534 btrfs_debug(fs_info,
2535 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2536 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2537 return false;
4a54c8c1
JS
2538 }
2539
4a54c8c1
JS
2540 /*
2541 * there are two premises:
2542 * a) deliver good data to the caller
2543 * b) correct the bad sectors on disk
2544 */
a0b60d72 2545 if (failed_bio_pages > 1) {
4a54c8c1
JS
2546 /*
2547 * to fulfill b), we need to know the exact failing sectors, as
2548 * we don't want to rewrite any more than the failed ones. thus,
2549 * we need separate read requests for the failed bio
2550 *
2551 * if the following BUG_ON triggers, our validation request got
2552 * merged. we need separate requests for our algorithm to work.
2553 */
2554 BUG_ON(failrec->in_validation);
2555 failrec->in_validation = 1;
2556 failrec->this_mirror = failed_mirror;
4a54c8c1
JS
2557 } else {
2558 /*
2559		 * we're ready to fulfill a) and b) together: get a good copy
2560		 * of the failed sector and if we succeed, we have set up
2561 * everything for repair_io_failure to do the rest for us.
2562 */
2563 if (failrec->in_validation) {
2564 BUG_ON(failrec->this_mirror != failed_mirror);
2565 failrec->in_validation = 0;
2566 failrec->this_mirror = 0;
2567 }
2568 failrec->failed_mirror = failed_mirror;
2569 failrec->this_mirror++;
2570 if (failrec->this_mirror == failed_mirror)
2571 failrec->this_mirror++;
4a54c8c1
JS
2572 }
2573
facc8a22 2574 if (failrec->this_mirror > num_copies) {
ab8d0fc4
JM
2575 btrfs_debug(fs_info,
2576 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2577 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2578 return false;
4a54c8c1
JS
2579 }
2580
c3cfb656 2581 return true;
2fe6303e
MX
2582}
2583
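/*
 * Worked example of the mirror rotation above (illustrative numbers): with
 * num_copies == 2 and the original read failing on mirror 1, the single-page
 * path advances this_mirror to 1 and then skips to 2 because it equals
 * failed_mirror, so the repair read goes to mirror 2.  If that read fails as
 * well, the next call advances this_mirror to 3, which exceeds num_copies,
 * and btrfs_check_repairable() returns false.
 */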
2584
2585struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2586 struct io_failure_record *failrec,
2587 struct page *page, int pg_offset, int icsum,
8b110e39 2588 bio_end_io_t *endio_func, void *data)
2fe6303e 2589{
0b246afa 2590 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2591 struct bio *bio;
2592 struct btrfs_io_bio *btrfs_failed_bio;
2593 struct btrfs_io_bio *btrfs_bio;
2594
c5e4c3d7 2595 bio = btrfs_io_bio_alloc(1);
2fe6303e 2596 bio->bi_end_io = endio_func;
4f024f37 2597 bio->bi_iter.bi_sector = failrec->logical >> 9;
4f024f37 2598 bio->bi_iter.bi_size = 0;
8b110e39 2599 bio->bi_private = data;
4a54c8c1 2600
facc8a22
MX
2601 btrfs_failed_bio = btrfs_io_bio(failed_bio);
2602 if (btrfs_failed_bio->csum) {
facc8a22
MX
2603 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2604
2605 btrfs_bio = btrfs_io_bio(bio);
2606 btrfs_bio->csum = btrfs_bio->csum_inline;
2fe6303e
MX
2607 icsum *= csum_size;
2608 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
facc8a22
MX
2609 csum_size);
2610 }
2611
2fe6303e
MX
2612 bio_add_page(bio, page, failrec->len, pg_offset);
2613
2614 return bio;
2615}
2616
2617/*
78e62c02
NB
2618 * This is a generic handler for readpage errors. If other copies exist, read
2619 * those and write back good data to the failed position. It does not attempt
2620 * to remap the failed extent elsewhere, hoping the device will be smart
2621 * enough to do this as needed.
2fe6303e 2622 */
2fe6303e
MX
2623static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2624 struct page *page, u64 start, u64 end,
2625 int failed_mirror)
2626{
2627 struct io_failure_record *failrec;
2628 struct inode *inode = page->mapping->host;
2629 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
7870d082 2630 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2fe6303e 2631 struct bio *bio;
70fd7614 2632 int read_mode = 0;
4e4cbee9 2633 blk_status_t status;
2fe6303e 2634 int ret;
8a2ee44a 2635 unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
2fe6303e 2636
1f7ad75b 2637 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e
MX
2638
2639 ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2640 if (ret)
2641 return ret;
2642
a0b60d72 2643 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
c3cfb656 2644 failed_mirror)) {
7870d082 2645 free_io_failure(failure_tree, tree, failrec);
2fe6303e
MX
2646 return -EIO;
2647 }
2648
a0b60d72 2649 if (failed_bio_pages > 1)
70fd7614 2650 read_mode |= REQ_FAILFAST_DEV;
2fe6303e
MX
2651
2652 phy_offset >>= inode->i_sb->s_blocksize_bits;
2653 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2654 start - page_offset(page),
8b110e39
MX
2655 (int)phy_offset, failed_bio->bi_end_io,
2656 NULL);
ebcc3263 2657 bio->bi_opf = REQ_OP_READ | read_mode;
4a54c8c1 2658
ab8d0fc4
JM
2659 btrfs_debug(btrfs_sb(inode->i_sb),
2660 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
2661 read_mode, failrec->this_mirror, failrec->in_validation);
4a54c8c1 2662
8c27cb35 2663 status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
50489a57 2664 failrec->bio_flags);
4e4cbee9 2665 if (status) {
7870d082 2666 free_io_failure(failure_tree, tree, failrec);
6c387ab2 2667 bio_put(bio);
4e4cbee9 2668 ret = blk_status_to_errno(status);
6c387ab2
MX
2669 }
2670
013bd4c3 2671 return ret;
4a54c8c1
JS
2672}
2673
d1310b2e
CM
2674/* lots and lots of room for performance fixes in the end_bio funcs */
2675
b5227c07 2676void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0
JM
2677{
2678 int uptodate = (err == 0);
3e2426bd 2679 int ret = 0;
87826df0 2680
c629732d 2681 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
87826df0 2682
87826df0 2683 if (!uptodate) {
87826df0
JM
2684 ClearPageUptodate(page);
2685 SetPageError(page);
bff5baf8 2686 ret = err < 0 ? err : -EIO;
5dca6eea 2687 mapping_set_error(page->mapping, ret);
87826df0 2688 }
87826df0
JM
2689}
2690
d1310b2e
CM
2691/*
2692 * after a writepage IO is done, we need to:
2693 * clear the uptodate bits on error
2694 * clear the writeback bits in the extent tree for this IO
2695 * end_page_writeback if the page has no more pending IO
2696 *
2697 * Scheduling is not allowed, so the extent state tree is expected
2698 * to have one and only one object corresponding to this IO.
2699 */
4246a0b6 2700static void end_bio_extent_writepage(struct bio *bio)
d1310b2e 2701{
4e4cbee9 2702 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 2703 struct bio_vec *bvec;
d1310b2e
CM
2704 u64 start;
2705 u64 end;
6dc4f100 2706 struct bvec_iter_all iter_all;
d1310b2e 2707
c09abff8 2708 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2709 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2710 struct page *page = bvec->bv_page;
0b246afa
JM
2711 struct inode *inode = page->mapping->host;
2712 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
902b22f3 2713
17a5adcc
AO
2714		/* We always issue full-page writes, but if some block
2715		 * in a page fails to write, blk_update_request() will
2716 * advance bv_offset and adjust bv_len to compensate.
2717 * Print a warning for nonzero offsets, and an error
2718 * if they don't add up to a full page. */
09cbfeaf
KS
2719 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2720 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
0b246afa 2721 btrfs_err(fs_info,
efe120a0
FH
2722 "partial page write in btrfs with offset %u and length %u",
2723 bvec->bv_offset, bvec->bv_len);
2724 else
0b246afa 2725 btrfs_info(fs_info,
5d163e0e 2726 "incomplete page write in btrfs with offset %u and length %u",
efe120a0
FH
2727 bvec->bv_offset, bvec->bv_len);
2728 }
d1310b2e 2729
17a5adcc
AO
2730 start = page_offset(page);
2731 end = start + bvec->bv_offset + bvec->bv_len - 1;
d1310b2e 2732
4e4cbee9 2733 end_extent_writepage(page, error, start, end);
17a5adcc 2734 end_page_writeback(page);
2c30c71b 2735 }
2b1f55b0 2736
d1310b2e 2737 bio_put(bio);
d1310b2e
CM
2738}
2739
883d0de4
MX
2740static void
2741endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2742 int uptodate)
2743{
2744 struct extent_state *cached = NULL;
2745 u64 end = start + len - 1;
2746
2747 if (uptodate && tree->track_uptodate)
2748 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
d810a4be 2749 unlock_extent_cached_atomic(tree, start, end, &cached);
883d0de4
MX
2750}
2751
d1310b2e
CM
2752/*
2753 * after a readpage IO is done, we need to:
2754 * clear the uptodate bits on error
2755 * set the uptodate bits if things worked
2756 * set the page up to date if all extents in the tree are uptodate
2757 * clear the lock bit in the extent tree
2758 * unlock the page if there are no other extents locked for it
2759 *
2760 * Scheduling is not allowed, so the extent state tree is expected
2761 * to have one and only one object corresponding to this IO.
2762 */
4246a0b6 2763static void end_bio_extent_readpage(struct bio *bio)
d1310b2e 2764{
2c30c71b 2765 struct bio_vec *bvec;
4e4cbee9 2766 int uptodate = !bio->bi_status;
facc8a22 2767 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7870d082 2768 struct extent_io_tree *tree, *failure_tree;
facc8a22 2769 u64 offset = 0;
d1310b2e
CM
2770 u64 start;
2771 u64 end;
facc8a22 2772 u64 len;
883d0de4
MX
2773 u64 extent_start = 0;
2774 u64 extent_len = 0;
5cf1ab56 2775 int mirror;
d1310b2e 2776 int ret;
6dc4f100 2777 struct bvec_iter_all iter_all;
d1310b2e 2778
c09abff8 2779 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2780 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2781 struct page *page = bvec->bv_page;
a71754fc 2782 struct inode *inode = page->mapping->host;
ab8d0fc4 2783 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
78e62c02
NB
2784 bool data_inode = btrfs_ino(BTRFS_I(inode))
2785 != BTRFS_BTREE_INODE_OBJECTID;
507903b8 2786
ab8d0fc4
JM
2787 btrfs_debug(fs_info,
2788 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
4e4cbee9 2789 (u64)bio->bi_iter.bi_sector, bio->bi_status,
ab8d0fc4 2790 io_bio->mirror_num);
a71754fc 2791 tree = &BTRFS_I(inode)->io_tree;
7870d082 2792 failure_tree = &BTRFS_I(inode)->io_failure_tree;
902b22f3 2793
17a5adcc
AO
2794 /* We always issue full-page reads, but if some block
2795 * in a page fails to read, blk_update_request() will
2796 * advance bv_offset and adjust bv_len to compensate.
2797 * Print a warning for nonzero offsets, and an error
2798 * if they don't add up to a full page. */
09cbfeaf
KS
2799 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2800 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
ab8d0fc4
JM
2801 btrfs_err(fs_info,
2802 "partial page read in btrfs with offset %u and length %u",
efe120a0
FH
2803 bvec->bv_offset, bvec->bv_len);
2804 else
ab8d0fc4
JM
2805 btrfs_info(fs_info,
2806 "incomplete page read in btrfs with offset %u and length %u",
efe120a0
FH
2807 bvec->bv_offset, bvec->bv_len);
2808 }
d1310b2e 2809
17a5adcc
AO
2810 start = page_offset(page);
2811 end = start + bvec->bv_offset + bvec->bv_len - 1;
facc8a22 2812 len = bvec->bv_len;
d1310b2e 2813
9be3395b 2814 mirror = io_bio->mirror_num;
78e62c02 2815 if (likely(uptodate)) {
facc8a22
MX
2816 ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2817 page, start, end,
2818 mirror);
5ee0844d 2819 if (ret)
d1310b2e 2820 uptodate = 0;
5ee0844d 2821 else
7870d082
JB
2822 clean_io_failure(BTRFS_I(inode)->root->fs_info,
2823 failure_tree, tree, start,
2824 page,
2825 btrfs_ino(BTRFS_I(inode)), 0);
d1310b2e 2826 }
ea466794 2827
f2a09da9
MX
2828 if (likely(uptodate))
2829 goto readpage_ok;
2830
78e62c02 2831 if (data_inode) {
9d0d1c8b 2832
f4a8e656 2833 /*
78e62c02
NB
2834 * The generic bio_readpage_error handles errors the
2835 * following way: If possible, new read requests are
2836 * created and submitted and will end up in
2837 * end_bio_extent_readpage as well (if we're lucky,
2838 * not in the !uptodate case). In that case it returns
2839 * 0 and we just go on with the next page in our bio.
2840 * If it can't handle the error it will return -EIO and
2841 * we remain responsible for that page.
f4a8e656 2842 */
78e62c02
NB
2843 ret = bio_readpage_error(bio, offset, page, start, end,
2844 mirror);
2845 if (ret == 0) {
2846 uptodate = !bio->bi_status;
2847 offset += len;
2848 continue;
2849 }
2850 } else {
2851 struct extent_buffer *eb;
2852
2853 eb = (struct extent_buffer *)page->private;
2854 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
2855 eb->read_mirror = mirror;
2856 atomic_dec(&eb->io_pages);
2857 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
2858 &eb->bflags))
2859 btree_readahead_hook(eb, -EIO);
7e38326f 2860 }
f2a09da9 2861readpage_ok:
883d0de4 2862 if (likely(uptodate)) {
a71754fc 2863 loff_t i_size = i_size_read(inode);
09cbfeaf 2864 pgoff_t end_index = i_size >> PAGE_SHIFT;
a583c026 2865 unsigned off;
a71754fc
JB
2866
2867 /* Zero out the end if this page straddles i_size */
7073017a 2868 off = offset_in_page(i_size);
a583c026 2869 if (page->index == end_index && off)
09cbfeaf 2870 zero_user_segment(page, off, PAGE_SIZE);
17a5adcc 2871 SetPageUptodate(page);
70dec807 2872 } else {
17a5adcc
AO
2873 ClearPageUptodate(page);
2874 SetPageError(page);
70dec807 2875 }
17a5adcc 2876 unlock_page(page);
facc8a22 2877 offset += len;
883d0de4
MX
2878
2879 if (unlikely(!uptodate)) {
2880 if (extent_len) {
2881 endio_readpage_release_extent(tree,
2882 extent_start,
2883 extent_len, 1);
2884 extent_start = 0;
2885 extent_len = 0;
2886 }
2887 endio_readpage_release_extent(tree, start,
2888 end - start + 1, 0);
2889 } else if (!extent_len) {
2890 extent_start = start;
2891 extent_len = end + 1 - start;
2892 } else if (extent_start + extent_len == start) {
2893 extent_len += end + 1 - start;
2894 } else {
2895 endio_readpage_release_extent(tree, extent_start,
2896 extent_len, uptodate);
2897 extent_start = start;
2898 extent_len = end + 1 - start;
2899 }
2c30c71b 2900 }
d1310b2e 2901
883d0de4
MX
2902 if (extent_len)
2903 endio_readpage_release_extent(tree, extent_start, extent_len,
2904 uptodate);
b3a0dd50 2905 btrfs_io_bio_free_csum(io_bio);
d1310b2e 2906 bio_put(bio);
d1310b2e
CM
2907}
2908
9be3395b 2909/*
184f999e
DS
2910 * Initialize the members up to but not including 'bio'. Use after allocating a
2911 * new bio with bio_alloc_bioset, as that does not initialize the bytes outside
2912 * of 'bio' (use of __GFP_ZERO is not supported).
9be3395b 2913 */
184f999e 2914static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
d1310b2e 2915{
184f999e
DS
2916 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
2917}
d1310b2e 2918
9be3395b 2919/*
6e707bcd
DS
2920 * The following helpers allocate a bio. As it's backed by a bioset, it'll
2921 * never fail. We're returning a bio right now but you can call btrfs_io_bio
2922 * for the appropriate container_of magic
9be3395b 2923 */
e749af44 2924struct bio *btrfs_bio_alloc(u64 first_byte)
d1310b2e
CM
2925{
2926 struct bio *bio;
d1310b2e 2927
8ac9f7c1 2928 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
c821e7f3 2929 bio->bi_iter.bi_sector = first_byte >> 9;
184f999e 2930 btrfs_io_bio_init(btrfs_io_bio(bio));
d1310b2e
CM
2931 return bio;
2932}
2933
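/*
 * Small sketch of the container_of access mentioned above: the bio returned
 * by btrfs_bio_alloc() is embedded in a struct btrfs_io_bio, so btrfs_io_bio()
 * exposes the extra members.  The disk byte offset is an arbitrary example.
 */
static struct bio *example_alloc_io_bio(u64 disk_bytenr)
{
	struct bio *bio = btrfs_bio_alloc(disk_bytenr);
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);

	/* mirror_num and the csum pointer start out zeroed by btrfs_io_bio_init() */
	io_bio->mirror_num = 0;
	return bio;
}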
8b6c1d56 2934struct bio *btrfs_bio_clone(struct bio *bio)
9be3395b 2935{
23ea8e5a
MX
2936 struct btrfs_io_bio *btrfs_bio;
2937 struct bio *new;
9be3395b 2938
6e707bcd 2939 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 2940 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
6e707bcd 2941 btrfs_bio = btrfs_io_bio(new);
184f999e 2942 btrfs_io_bio_init(btrfs_bio);
6e707bcd 2943 btrfs_bio->iter = bio->bi_iter;
23ea8e5a
MX
2944 return new;
2945}
9be3395b 2946
c5e4c3d7 2947struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
9be3395b 2948{
facc8a22
MX
2949 struct bio *bio;
2950
6e707bcd 2951 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 2952 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
184f999e 2953 btrfs_io_bio_init(btrfs_io_bio(bio));
facc8a22 2954 return bio;
9be3395b
CM
2955}
2956
e477094f 2957struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2f8e9140
LB
2958{
2959 struct bio *bio;
2960 struct btrfs_io_bio *btrfs_bio;
2961
2962 /* this will never fail when it's backed by a bioset */
8ac9f7c1 2963 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2f8e9140
LB
2964 ASSERT(bio);
2965
2966 btrfs_bio = btrfs_io_bio(bio);
184f999e 2967 btrfs_io_bio_init(btrfs_bio);
2f8e9140
LB
2968
2969 bio_trim(bio, offset >> 9, size >> 9);
17347cec 2970 btrfs_bio->iter = bio->bi_iter;
2f8e9140
LB
2971 return bio;
2972}
9be3395b 2973
4b81ba48
DS
2974/*
2975 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625
DS
2976 * @wbc: optional writeback control for io accounting
2977 * @page: page to add to the bio
2978 * @pg_offset:	offset within the page at which to add the data
2979 * @size:	portion of page that we want to write
2980 * @offset:	starting disk byte offset; used to check whether we are adding
2981 *		a contiguous page to the previous bio and to set the sector of a new one
5c2b1fd7 2982 * @bio_ret: must be valid pointer, newly allocated bio will be stored there
b8b3d625
DS
2983 * @end_io_func: end_io callback for new bio
2984 * @mirror_num: desired mirror to read/write
2985 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
2986 * @bio_flags: flags of the current bio to see if we can merge them
4b81ba48 2987 */
0ceb34bf 2988static int submit_extent_page(unsigned int opf,
da2f0f74 2989 struct writeback_control *wbc,
6273b7f8 2990 struct page *page, u64 offset,
6c5a4e2c 2991 size_t size, unsigned long pg_offset,
d1310b2e 2992 struct bio **bio_ret,
f188591e 2993 bio_end_io_t end_io_func,
c8b97818
CM
2994 int mirror_num,
2995 unsigned long prev_bio_flags,
005efedf
FM
2996 unsigned long bio_flags,
2997 bool force_bio_submit)
d1310b2e
CM
2998{
2999 int ret = 0;
3000 struct bio *bio;
09cbfeaf 3001 size_t page_size = min_t(size_t, size, PAGE_SIZE);
6273b7f8 3002 sector_t sector = offset >> 9;
0ceb34bf 3003 struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
d1310b2e 3004
5c2b1fd7
DS
3005 ASSERT(bio_ret);
3006
3007 if (*bio_ret) {
0c8508a6
DS
3008 bool contig;
3009 bool can_merge = true;
3010
d1310b2e 3011 bio = *bio_ret;
0c8508a6 3012 if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
4f024f37 3013 contig = bio->bi_iter.bi_sector == sector;
c8b97818 3014 else
f73a1c7d 3015 contig = bio_end_sector(bio) == sector;
c8b97818 3016
da12fe54
NB
3017 ASSERT(tree->ops);
3018 if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
0c8508a6
DS
3019 can_merge = false;
3020
3021 if (prev_bio_flags != bio_flags || !contig || !can_merge ||
005efedf 3022 force_bio_submit ||
6c5a4e2c 3023 bio_add_page(bio, page, page_size, pg_offset) < page_size) {
1f7ad75b 3024 ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
289454ad
NA
3025 if (ret < 0) {
3026 *bio_ret = NULL;
79787eaa 3027 return ret;
289454ad 3028 }
d1310b2e
CM
3029 bio = NULL;
3030 } else {
da2f0f74 3031 if (wbc)
34e51a5e 3032 wbc_account_cgroup_owner(wbc, page, page_size);
d1310b2e
CM
3033 return 0;
3034 }
3035 }
c8b97818 3036
e749af44 3037 bio = btrfs_bio_alloc(offset);
6c5a4e2c 3038 bio_add_page(bio, page, page_size, pg_offset);
d1310b2e
CM
3039 bio->bi_end_io = end_io_func;
3040 bio->bi_private = tree;
e6959b93 3041 bio->bi_write_hint = page->mapping->host->i_write_hint;
4b81ba48 3042 bio->bi_opf = opf;
da2f0f74 3043 if (wbc) {
429aebc0
DS
3044 struct block_device *bdev;
3045
3046 bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
3047 bio_set_dev(bio, bdev);
da2f0f74 3048 wbc_init_bio(wbc, bio);
34e51a5e 3049 wbc_account_cgroup_owner(wbc, page, page_size);
da2f0f74 3050 }
70dec807 3051
5c2b1fd7 3052 *bio_ret = bio;
d1310b2e
CM
3053
3054 return ret;
3055}
3056
48a3b636
ES
3057static void attach_extent_buffer_page(struct extent_buffer *eb,
3058 struct page *page)
d1310b2e
CM
3059{
3060 if (!PagePrivate(page)) {
3061 SetPagePrivate(page);
09cbfeaf 3062 get_page(page);
4f2de97a
JB
3063 set_page_private(page, (unsigned long)eb);
3064 } else {
3065 WARN_ON(page->private != (unsigned long)eb);
d1310b2e
CM
3066 }
3067}
3068
4f2de97a 3069void set_page_extent_mapped(struct page *page)
d1310b2e 3070{
4f2de97a
JB
3071 if (!PagePrivate(page)) {
3072 SetPagePrivate(page);
09cbfeaf 3073 get_page(page);
4f2de97a
JB
3074 set_page_private(page, EXTENT_PAGE_PRIVATE);
3075 }
d1310b2e
CM
3076}
3077
125bac01
MX
3078static struct extent_map *
3079__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3080 u64 start, u64 len, get_extent_t *get_extent,
3081 struct extent_map **em_cached)
3082{
3083 struct extent_map *em;
3084
3085 if (em_cached && *em_cached) {
3086 em = *em_cached;
cbc0e928 3087 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 3088 start < extent_map_end(em)) {
490b54d6 3089 refcount_inc(&em->refs);
125bac01
MX
3090 return em;
3091 }
3092
3093 free_extent_map(em);
3094 *em_cached = NULL;
3095 }
3096
39b07b5d 3097 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
125bac01
MX
3098 if (em_cached && !IS_ERR_OR_NULL(em)) {
3099 BUG_ON(*em_cached);
490b54d6 3100 refcount_inc(&em->refs);
125bac01
MX
3101 *em_cached = em;
3102 }
3103 return em;
3104}
d1310b2e
CM
3105/*
3106 * basic readpage implementation. Locked extent state structs are inserted
3107 * into the tree and are removed when the IO is done (by the end_io
3108 * handlers)
79787eaa 3109 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 3110 * return 0 on success, otherwise return error
d1310b2e 3111 */
9974090b
MX
3112static int __do_readpage(struct extent_io_tree *tree,
3113 struct page *page,
3114 get_extent_t *get_extent,
125bac01 3115 struct extent_map **em_cached,
9974090b 3116 struct bio **bio, int mirror_num,
f1c77c55 3117 unsigned long *bio_flags, unsigned int read_flags,
005efedf 3118 u64 *prev_em_start)
d1310b2e
CM
3119{
3120 struct inode *inode = page->mapping->host;
4eee4fa4 3121 u64 start = page_offset(page);
8eec8296 3122 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3123 u64 cur = start;
3124 u64 extent_offset;
3125 u64 last_byte = i_size_read(inode);
3126 u64 block_start;
3127 u64 cur_end;
d1310b2e 3128 struct extent_map *em;
baf863b9 3129 int ret = 0;
d1310b2e 3130 int nr = 0;
306e16ce 3131 size_t pg_offset = 0;
d1310b2e 3132 size_t iosize;
c8b97818 3133 size_t disk_io_size;
d1310b2e 3134 size_t blocksize = inode->i_sb->s_blocksize;
7f042a83 3135 unsigned long this_bio_flag = 0;
d1310b2e 3136
ae6957eb
DS
3137 ASSERT(tree == &BTRFS_I(inode)->io_tree);
3138
d1310b2e
CM
3139 set_page_extent_mapped(page);
3140
90a887c9
DM
3141 if (!PageUptodate(page)) {
3142 if (cleancache_get_page(page) == 0) {
3143 BUG_ON(blocksize != PAGE_SIZE);
9974090b 3144 unlock_extent(tree, start, end);
90a887c9
DM
3145 goto out;
3146 }
3147 }
3148
09cbfeaf 3149 if (page->index == last_byte >> PAGE_SHIFT) {
c8b97818 3150 char *userpage;
7073017a 3151 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
3152
3153 if (zero_offset) {
09cbfeaf 3154 iosize = PAGE_SIZE - zero_offset;
7ac687d9 3155 userpage = kmap_atomic(page);
c8b97818
CM
3156 memset(userpage + zero_offset, 0, iosize);
3157 flush_dcache_page(page);
7ac687d9 3158 kunmap_atomic(userpage);
c8b97818
CM
3159 }
3160 }
d1310b2e 3161 while (cur <= end) {
005efedf 3162 bool force_bio_submit = false;
6273b7f8 3163 u64 offset;
c8f2f24b 3164
d1310b2e
CM
3165 if (cur >= last_byte) {
3166 char *userpage;
507903b8
AJ
3167 struct extent_state *cached = NULL;
3168
09cbfeaf 3169 iosize = PAGE_SIZE - pg_offset;
7ac687d9 3170 userpage = kmap_atomic(page);
306e16ce 3171 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3172 flush_dcache_page(page);
7ac687d9 3173 kunmap_atomic(userpage);
d1310b2e 3174 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3175 &cached, GFP_NOFS);
7f042a83 3176 unlock_extent_cached(tree, cur,
e43bbe5e 3177 cur + iosize - 1, &cached);
d1310b2e
CM
3178 break;
3179 }
125bac01
MX
3180 em = __get_extent_map(inode, page, pg_offset, cur,
3181 end - cur + 1, get_extent, em_cached);
c704005d 3182 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3183 SetPageError(page);
7f042a83 3184 unlock_extent(tree, cur, end);
d1310b2e
CM
3185 break;
3186 }
d1310b2e
CM
3187 extent_offset = cur - em->start;
3188 BUG_ON(extent_map_end(em) <= cur);
3189 BUG_ON(end < cur);
3190
261507a0 3191 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4b384318 3192 this_bio_flag |= EXTENT_BIO_COMPRESSED;
261507a0
LZ
3193 extent_set_compress_type(&this_bio_flag,
3194 em->compress_type);
3195 }
c8b97818 3196
d1310b2e
CM
3197 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3198 cur_end = min(extent_map_end(em) - 1, end);
fda2832f 3199 iosize = ALIGN(iosize, blocksize);
c8b97818
CM
3200 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
3201 disk_io_size = em->block_len;
6273b7f8 3202 offset = em->block_start;
c8b97818 3203 } else {
6273b7f8 3204 offset = em->block_start + extent_offset;
c8b97818
CM
3205 disk_io_size = iosize;
3206 }
d1310b2e 3207 block_start = em->block_start;
d899e052
YZ
3208 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3209 block_start = EXTENT_MAP_HOLE;
005efedf
FM
3210
3211 /*
3212 * If we have a file range that points to a compressed extent
3213		 * the same compressed extent (possibly with a different
3214 * to the same compressed extent (possibly with a different
3215 * offset and/or length, so it either points to the whole extent
3216 * or only part of it), we must make sure we do not submit a
3217 * single bio to populate the pages for the 2 ranges because
3218 * this makes the compressed extent read zero out the pages
3219 * belonging to the 2nd range. Imagine the following scenario:
3220 *
3221 * File layout
3222 * [0 - 8K] [8K - 24K]
3223 * | |
3224 * | |
3225 * points to extent X, points to extent X,
3226 * offset 4K, length of 8K offset 0, length 16K
3227 *
3228 * [extent X, compressed length = 4K uncompressed length = 16K]
3229 *
3230 * If the bio to read the compressed extent covers both ranges,
3231 * it will decompress extent X into the pages belonging to the
3232 * first range and then it will stop, zeroing out the remaining
3233 * pages that belong to the other range that points to extent X.
3234 * So here we make sure we submit 2 bios, one for the first
3235		 * range and another one for the second range. Both will target
3236 * the same physical extent from disk, but we can't currently
3237 * make the compressed bio endio callback populate the pages
3238 * for both ranges because each compressed bio is tightly
3239 * coupled with a single extent map, and each range can have
3240 * an extent map with a different offset value relative to the
3241 * uncompressed data of our extent and different lengths. This
3242 * is a corner case so we prioritize correctness over
3243 * non-optimal behavior (submitting 2 bios for the same extent).
3244 */
3245 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3246 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 3247 *prev_em_start != em->start)
005efedf
FM
3248 force_bio_submit = true;
3249
3250 if (prev_em_start)
8e928218 3251 *prev_em_start = em->start;
005efedf 3252
d1310b2e
CM
3253 free_extent_map(em);
3254 em = NULL;
3255
3256 /* we've found a hole, just zero and go on */
3257 if (block_start == EXTENT_MAP_HOLE) {
3258 char *userpage;
507903b8
AJ
3259 struct extent_state *cached = NULL;
3260
7ac687d9 3261 userpage = kmap_atomic(page);
306e16ce 3262 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3263 flush_dcache_page(page);
7ac687d9 3264 kunmap_atomic(userpage);
d1310b2e
CM
3265
3266 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3267 &cached, GFP_NOFS);
7f042a83 3268 unlock_extent_cached(tree, cur,
e43bbe5e 3269 cur + iosize - 1, &cached);
d1310b2e 3270 cur = cur + iosize;
306e16ce 3271 pg_offset += iosize;
d1310b2e
CM
3272 continue;
3273 }
3274 /* the get_extent function already copied into the page */
9655d298
CM
3275 if (test_range_bit(tree, cur, cur_end,
3276 EXTENT_UPTODATE, 1, NULL)) {
a1b32a59 3277 check_page_uptodate(tree, page);
7f042a83 3278 unlock_extent(tree, cur, cur + iosize - 1);
d1310b2e 3279 cur = cur + iosize;
306e16ce 3280 pg_offset += iosize;
d1310b2e
CM
3281 continue;
3282 }
70dec807
CM
3283 /* we have an inline extent but it didn't get marked up
3284 * to date. Error out
3285 */
3286 if (block_start == EXTENT_MAP_INLINE) {
3287 SetPageError(page);
7f042a83 3288 unlock_extent(tree, cur, cur + iosize - 1);
70dec807 3289 cur = cur + iosize;
306e16ce 3290 pg_offset += iosize;
70dec807
CM
3291 continue;
3292 }
d1310b2e 3293
0ceb34bf 3294 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
6273b7f8 3295 page, offset, disk_io_size,
fa17ed06 3296 pg_offset, bio,
c8b97818
CM
3297 end_bio_extent_readpage, mirror_num,
3298 *bio_flags,
005efedf
FM
3299 this_bio_flag,
3300 force_bio_submit);
c8f2f24b
JB
3301 if (!ret) {
3302 nr++;
3303 *bio_flags = this_bio_flag;
3304 } else {
d1310b2e 3305 SetPageError(page);
7f042a83 3306 unlock_extent(tree, cur, cur + iosize - 1);
baf863b9 3307 goto out;
edd33c99 3308 }
d1310b2e 3309 cur = cur + iosize;
306e16ce 3310 pg_offset += iosize;
d1310b2e 3311 }
90a887c9 3312out:
d1310b2e
CM
3313 if (!nr) {
3314 if (!PageError(page))
3315 SetPageUptodate(page);
3316 unlock_page(page);
3317 }
baf863b9 3318 return ret;
d1310b2e
CM
3319}
3320
e65ef21e 3321static inline void contiguous_readpages(struct extent_io_tree *tree,
9974090b
MX
3322 struct page *pages[], int nr_pages,
3323 u64 start, u64 end,
125bac01 3324 struct extent_map **em_cached,
d3fac6ba 3325 struct bio **bio,
1f7ad75b 3326 unsigned long *bio_flags,
808f80b4 3327 u64 *prev_em_start)
9974090b 3328{
23d31bd4 3329 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
9974090b
MX
3330 int index;
3331
ae6957eb
DS
3332 ASSERT(tree == &inode->io_tree);
3333
b272ae22 3334 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
3335
3336 for (index = 0; index < nr_pages; index++) {
4ef77695 3337 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
5e9d3982 3338 bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
09cbfeaf 3339 put_page(pages[index]);
9974090b
MX
3340 }
3341}
3342
9974090b
MX
3343static int __extent_read_full_page(struct extent_io_tree *tree,
3344 struct page *page,
3345 get_extent_t *get_extent,
3346 struct bio **bio, int mirror_num,
f1c77c55
DS
3347 unsigned long *bio_flags,
3348 unsigned int read_flags)
9974090b 3349{
23d31bd4 3350 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
9974090b 3351 u64 start = page_offset(page);
09cbfeaf 3352 u64 end = start + PAGE_SIZE - 1;
9974090b
MX
3353 int ret;
3354
ae6957eb
DS
3355 ASSERT(tree == &inode->io_tree);
3356
b272ae22 3357 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b 3358
125bac01 3359 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
1f7ad75b 3360 bio_flags, read_flags, NULL);
9974090b
MX
3361 return ret;
3362}
3363
71ad38b4
DS
3364int extent_read_full_page(struct page *page, get_extent_t *get_extent,
3365 int mirror_num)
d1310b2e
CM
3366{
3367 struct bio *bio = NULL;
c8b97818 3368 unsigned long bio_flags = 0;
71ad38b4 3369 struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
d1310b2e
CM
3370 int ret;
3371
8ddc7d9c 3372 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
1f7ad75b 3373 &bio_flags, 0);
d1310b2e 3374 if (bio)
1f7ad75b 3375 ret = submit_one_bio(bio, mirror_num, bio_flags);
d1310b2e
CM
3376 return ret;
3377}
d1310b2e 3378
3d4b9496 3379static void update_nr_written(struct writeback_control *wbc,
a9132667 3380 unsigned long nr_written)
11c8349b
CM
3381{
3382 wbc->nr_to_write -= nr_written;
11c8349b
CM
3383}
3384
d1310b2e 3385/*
40f76580
CM
3386 * helper for __extent_writepage, doing all of the delayed allocation setup.
3387 *
5eaad97a 3388 * This returns 1 if the btrfs_run_delalloc_range function did all the work required
40f76580
CM
3389 * to write the page (copy into inline extent). In this case the IO has
3390 * been started and the page is already unlocked.
3391 *
3392 * This returns 0 if all went well (page still locked)
3393 * This returns < 0 if there were errors (page still locked)
d1310b2e 3394 */
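/*
 * Note on how the return values above are consumed: the caller,
 * __extent_writepage(), treats a return of 1 as "nothing more to do for
 * this page" and returns 0 itself, while a negative return falls through
 * to its error handling; see the call site further below in this file.
 */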
40f76580 3395static noinline_for_stack int writepage_delalloc(struct inode *inode,
8cc0237a
NB
3396 struct page *page, struct writeback_control *wbc,
3397 u64 delalloc_start, unsigned long *nr_written)
40f76580 3398{
09cbfeaf 3399 u64 page_end = delalloc_start + PAGE_SIZE - 1;
3522e903 3400 bool found;
40f76580
CM
3401 u64 delalloc_to_write = 0;
3402 u64 delalloc_end = 0;
3403 int ret;
3404 int page_started = 0;
3405
40f76580
CM
3406
3407 while (delalloc_end < page_end) {
9978059b 3408 found = find_lock_delalloc_range(inode, page,
40f76580 3409 &delalloc_start,
917aacec 3410 &delalloc_end);
3522e903 3411 if (!found) {
40f76580
CM
3412 delalloc_start = delalloc_end + 1;
3413 continue;
3414 }
5eaad97a
NB
3415 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3416 delalloc_end, &page_started, nr_written, wbc);
40f76580
CM
3417 if (ret) {
3418 SetPageError(page);
5eaad97a
NB
3419 /*
3420 * btrfs_run_delalloc_range should return < 0 for error
3421 * but just in case, we use > 0 here meaning the IO is
3422 * started, so we don't want to return > 0 unless
3423 * things are going well.
40f76580
CM
3424 */
3425 ret = ret < 0 ? ret : -EIO;
3426 goto done;
3427 }
3428 /*
ea1754a0
KS
3429 * delalloc_end is already one less than the total length, so
3430 * we don't subtract one from PAGE_SIZE
40f76580
CM
3431 */
3432 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 3433 PAGE_SIZE) >> PAGE_SHIFT;
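/*
 * Worked example of the page count above (assuming 4K pages): a
 * delalloc range [0, 12287] has delalloc_end == 12287, so
 * (12287 - 0 + 4096) >> 12 == 3 pages, i.e. the usual round-up
 * division without subtracting one from PAGE_SIZE here.
 */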
40f76580
CM
3434 delalloc_start = delalloc_end + 1;
3435 }
3436 if (wbc->nr_to_write < delalloc_to_write) {
3437 int thresh = 8192;
3438
3439 if (delalloc_to_write < thresh * 2)
3440 thresh = delalloc_to_write;
3441 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3442 thresh);
3443 }
3444
3445 /* did btrfs_run_delalloc_range already unlock and start
3446 * the IO?
3447 */
3448 if (page_started) {
3449 /*
3450 * we've unlocked the page, so we can't update
3451 * the mapping's writeback index, just update
3452 * nr_to_write.
3453 */
3454 wbc->nr_to_write -= *nr_written;
3455 return 1;
3456 }
3457
3458 ret = 0;
3459
3460done:
3461 return ret;
3462}
3463
3464/*
3465 * helper for __extent_writepage. This calls the writepage start hooks,
3466 * and does the loop to map the page into extents and bios.
3467 *
3468 * We return 1 if the IO is started and the page is unlocked,
3469 * 0 if all went well (page still locked)
3470 * < 0 if there were errors (page still locked)
3471 */
3472static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3473 struct page *page,
3474 struct writeback_control *wbc,
3475 struct extent_page_data *epd,
3476 loff_t i_size,
3477 unsigned long nr_written,
57e5ffeb 3478 int *nr_ret)
d1310b2e 3479{
45b08405 3480 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
4eee4fa4 3481 u64 start = page_offset(page);
09cbfeaf 3482 u64 page_end = start + PAGE_SIZE - 1;
d1310b2e
CM
3483 u64 end;
3484 u64 cur = start;
3485 u64 extent_offset;
d1310b2e
CM
3486 u64 block_start;
3487 u64 iosize;
d1310b2e 3488 struct extent_map *em;
7f3c74fb 3489 size_t pg_offset = 0;
d1310b2e 3490 size_t blocksize;
40f76580
CM
3491 int ret = 0;
3492 int nr = 0;
57e5ffeb 3493 const unsigned int write_flags = wbc_to_write_flags(wbc);
40f76580 3494 bool compressed;
c8b97818 3495
d75855b4
NB
3496 ret = btrfs_writepage_cow_fixup(page, start, page_end);
3497 if (ret) {
3498 /* Fixup worker will requeue */
5ab58055 3499 redirty_page_for_writepage(wbc, page);
d75855b4
NB
3500 update_nr_written(wbc, nr_written);
3501 unlock_page(page);
3502 return 1;
247e743c
CM
3503 }
3504
11c8349b
CM
3505 /*
3506 * we don't want to touch the inode after unlocking the page,
3507 * so we update the mapping writeback index now
3508 */
3d4b9496 3509 update_nr_written(wbc, nr_written + 1);
771ed689 3510
d1310b2e 3511 end = page_end;
d1310b2e
CM
3512 blocksize = inode->i_sb->s_blocksize;
3513
3514 while (cur <= end) {
40f76580 3515 u64 em_end;
6273b7f8 3516 u64 offset;
58409edd 3517
40f76580 3518 if (cur >= i_size) {
7087a9d8 3519 btrfs_writepage_endio_finish_ordered(page, cur,
c629732d 3520 page_end, 1);
d1310b2e
CM
3521 break;
3522 }
39b07b5d
OS
3523 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
3524 end - cur + 1);
c704005d 3525 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3526 SetPageError(page);
61391d56 3527 ret = PTR_ERR_OR_ZERO(em);
d1310b2e
CM
3528 break;
3529 }
3530
3531 extent_offset = cur - em->start;
40f76580
CM
3532 em_end = extent_map_end(em);
3533 BUG_ON(em_end <= cur);
d1310b2e 3534 BUG_ON(end < cur);
40f76580 3535 iosize = min(em_end - cur, end - cur + 1);
fda2832f 3536 iosize = ALIGN(iosize, blocksize);
6273b7f8 3537 offset = em->block_start + extent_offset;
d1310b2e 3538 block_start = em->block_start;
c8b97818 3539 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
d1310b2e
CM
3540 free_extent_map(em);
3541 em = NULL;
3542
c8b97818
CM
3543 /*
3544 * compressed and inline extents are written through other
3545 * paths in the FS
3546 */
3547 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 3548 block_start == EXTENT_MAP_INLINE) {
c8b04030 3549 if (compressed)
c8b97818 3550 nr++;
c8b04030
OS
3551 else
3552 btrfs_writepage_endio_finish_ordered(page, cur,
3553 cur + iosize - 1, 1);
c8b97818 3554 cur += iosize;
7f3c74fb 3555 pg_offset += iosize;
d1310b2e
CM
3556 continue;
3557 }
c8b97818 3558
5cdc84bf 3559 btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
58409edd
DS
3560 if (!PageWriteback(page)) {
3561 btrfs_err(BTRFS_I(inode)->root->fs_info,
3562 "page %lu not writeback, cur %llu end %llu",
3563 page->index, cur, end);
d1310b2e 3564 }
7f3c74fb 3565
0ceb34bf 3566 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
6273b7f8 3567 page, offset, iosize, pg_offset,
fa17ed06 3568 &epd->bio,
58409edd
DS
3569 end_bio_extent_writepage,
3570 0, 0, 0, false);
fe01aa65 3571 if (ret) {
58409edd 3572 SetPageError(page);
fe01aa65
TK
3573 if (PageWriteback(page))
3574 end_page_writeback(page);
3575 }
d1310b2e 3576
d1310b2e 3577 cur = cur + iosize;
7f3c74fb 3578 pg_offset += iosize;
d1310b2e
CM
3579 nr++;
3580 }
40f76580 3581 *nr_ret = nr;
40f76580
CM
3582 return ret;
3583}
3584
3585/*
3586 * the writepage semantics are similar to regular writepage. extent
3587 * records are inserted to lock ranges in the tree, and as dirty areas
3588 * are found, they are marked writeback. Then the lock bits are removed
3589 * and the end_io handler clears the writeback ranges
3065976b
QW
3590 *
3591 * Return 0 if everything goes well.
3592 * Return <0 for error.
40f76580
CM
3593 */
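/*
 * Within this file __extent_writepage() is reached from
 * extent_write_full_page(), extent_write_locked_range() and
 * extent_write_cache_pages(), each of which passes its
 * extent_page_data down via @epd.
 */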
3594static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 3595 struct extent_page_data *epd)
40f76580
CM
3596{
3597 struct inode *inode = page->mapping->host;
40f76580 3598 u64 start = page_offset(page);
09cbfeaf 3599 u64 page_end = start + PAGE_SIZE - 1;
40f76580
CM
3600 int ret;
3601 int nr = 0;
eb70d222 3602 size_t pg_offset;
40f76580 3603 loff_t i_size = i_size_read(inode);
09cbfeaf 3604 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580
CM
3605 unsigned long nr_written = 0;
3606
40f76580
CM
3607 trace___extent_writepage(page, inode, wbc);
3608
3609 WARN_ON(!PageLocked(page));
3610
3611 ClearPageError(page);
3612
7073017a 3613 pg_offset = offset_in_page(i_size);
40f76580
CM
3614 if (page->index > end_index ||
3615 (page->index == end_index && !pg_offset)) {
09cbfeaf 3616 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
40f76580
CM
3617 unlock_page(page);
3618 return 0;
3619 }
3620
3621 if (page->index == end_index) {
3622 char *userpage;
3623
3624 userpage = kmap_atomic(page);
3625 memset(userpage + pg_offset, 0,
09cbfeaf 3626 PAGE_SIZE - pg_offset);
40f76580
CM
3627 kunmap_atomic(userpage);
3628 flush_dcache_page(page);
3629 }
3630
40f76580
CM
3631 set_page_extent_mapped(page);
3632
7789a55a 3633 if (!epd->extent_locked) {
8cc0237a 3634 ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
7789a55a 3635 if (ret == 1)
169d2c87 3636 return 0;
7789a55a
NB
3637 if (ret)
3638 goto done;
3639 }
40f76580
CM
3640
3641 ret = __extent_writepage_io(inode, page, wbc, epd,
57e5ffeb 3642 i_size, nr_written, &nr);
40f76580 3643 if (ret == 1)
169d2c87 3644 return 0;
40f76580 3645
d1310b2e
CM
3646done:
3647 if (nr == 0) {
3648 /* make sure the mapping tag for page dirty gets cleared */
3649 set_page_writeback(page);
3650 end_page_writeback(page);
3651 }
61391d56
FM
3652 if (PageError(page)) {
3653 ret = ret < 0 ? ret : -EIO;
3654 end_extent_writepage(page, ret, start, page_end);
3655 }
d1310b2e 3656 unlock_page(page);
3065976b 3657 ASSERT(ret <= 0);
40f76580 3658 return ret;
d1310b2e
CM
3659}
3660
fd8b2b61 3661void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 3662{
74316201
N
3663 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3664 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
3665}
3666
18dfa711
FM
3667static void end_extent_buffer_writeback(struct extent_buffer *eb)
3668{
3669 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3670 smp_mb__after_atomic();
3671 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3672}
3673
2e3c2513
QW
3674/*
3675 * Lock eb pages and flush the bio if we can't get the locks
3676 *
3677 * Return 0 if nothing went wrong
3678 * Return >0 is the same as 0, except the bio is not submitted
3679 * Return <0 if something went wrong, no page is locked
3680 */
9df76fb5 3681static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 3682 struct extent_page_data *epd)
0b32f4bb 3683{
9df76fb5 3684 struct btrfs_fs_info *fs_info = eb->fs_info;
2e3c2513 3685 int i, num_pages, failed_page_nr;
0b32f4bb
JB
3686 int flush = 0;
3687 int ret = 0;
3688
3689 if (!btrfs_try_tree_write_lock(eb)) {
f4340622 3690 ret = flush_write_bio(epd);
2e3c2513
QW
3691 if (ret < 0)
3692 return ret;
3693 flush = 1;
0b32f4bb
JB
3694 btrfs_tree_lock(eb);
3695 }
3696
3697 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3698 btrfs_tree_unlock(eb);
3699 if (!epd->sync_io)
3700 return 0;
3701 if (!flush) {
f4340622 3702 ret = flush_write_bio(epd);
2e3c2513
QW
3703 if (ret < 0)
3704 return ret;
0b32f4bb
JB
3705 flush = 1;
3706 }
a098d8e8
CM
3707 while (1) {
3708 wait_on_extent_buffer_writeback(eb);
3709 btrfs_tree_lock(eb);
3710 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3711 break;
0b32f4bb 3712 btrfs_tree_unlock(eb);
0b32f4bb
JB
3713 }
3714 }
3715
51561ffe
JB
3716 /*
3717 * We need to do this to prevent races with anyone who checks if the eb is
3718 * under IO since we can end up having no IO bits set for a short period
3719 * of time.
3720 */
3721 spin_lock(&eb->refs_lock);
0b32f4bb
JB
3722 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3723 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 3724 spin_unlock(&eb->refs_lock);
0b32f4bb 3725 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
3726 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3727 -eb->len,
3728 fs_info->dirty_metadata_batch);
0b32f4bb 3729 ret = 1;
51561ffe
JB
3730 } else {
3731 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
3732 }
3733
3734 btrfs_tree_unlock(eb);
3735
3736 if (!ret)
3737 return ret;
3738
65ad0104 3739 num_pages = num_extent_pages(eb);
0b32f4bb 3740 for (i = 0; i < num_pages; i++) {
fb85fc9a 3741 struct page *p = eb->pages[i];
0b32f4bb
JB
3742
3743 if (!trylock_page(p)) {
3744 if (!flush) {
18dfa711
FM
3745 int err;
3746
3747 err = flush_write_bio(epd);
3748 if (err < 0) {
3749 ret = err;
2e3c2513
QW
3750 failed_page_nr = i;
3751 goto err_unlock;
3752 }
0b32f4bb
JB
3753 flush = 1;
3754 }
3755 lock_page(p);
3756 }
3757 }
3758
3759 return ret;
2e3c2513
QW
3760err_unlock:
3761 /* Unlock already locked pages */
3762 for (i = 0; i < failed_page_nr; i++)
3763 unlock_page(eb->pages[i]);
18dfa711
FM
3764 /*
3765 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
3766 * Also set back EXTENT_BUFFER_DIRTY so that future write attempts on
3767 * this eb can be made, and undo everything done before.
3768 */
3769 btrfs_tree_lock(eb);
3770 spin_lock(&eb->refs_lock);
3771 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3772 end_extent_buffer_writeback(eb);
3773 spin_unlock(&eb->refs_lock);
3774 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
3775 fs_info->dirty_metadata_batch);
3776 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3777 btrfs_tree_unlock(eb);
2e3c2513 3778 return ret;
0b32f4bb
JB
3779}
3780
656f30db
FM
3781static void set_btree_ioerr(struct page *page)
3782{
3783 struct extent_buffer *eb = (struct extent_buffer *)page->private;
eb5b64f1 3784 struct btrfs_fs_info *fs_info;
656f30db
FM
3785
3786 SetPageError(page);
3787 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3788 return;
3789
eb5b64f1
DZ
3790 /*
3791 * If we error out, we should add back the dirty_metadata_bytes
3792 * to make it consistent.
3793 */
3794 fs_info = eb->fs_info;
3795 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3796 eb->len, fs_info->dirty_metadata_batch);
3797
656f30db
FM
3798 /*
3799 * If writeback for a btree extent that doesn't belong to a log tree
3800 * failed, increment the counter transaction->eb_write_errors.
3801 * We do this because while the transaction is running and before it's
3802 * committing (when we call filemap_fdata[write|wait]_range against
3803 * the btree inode), we might have
3804 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3805 * returns an error or an error happens during writeback, when we're
3806 * committing the transaction we wouldn't know about it, since the pages
3807 * may no longer be dirty nor marked for writeback (if a
3808 * subsequent modification to the extent buffer didn't happen before the
3809 * transaction commit), which makes filemap_fdata[write|wait]_range not
3810 * able to find the pages tagged with SetPageError at transaction
3811 * commit time. So if this happens we must abort the transaction,
3812 * otherwise we commit a super block with btree roots that point to
3813 * btree nodes/leaves whose content on disk is invalid - either garbage
3814 * or the content of some node/leaf from a past generation that got
3815 * cowed or deleted and is no longer valid.
3816 *
3817 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3818 * not be enough - we need to distinguish between log tree extents vs
3819 * non-log tree extents, and the next filemap_fdatawait_range() call
3820 * will catch and clear such errors in the mapping - and that call might
3821 * be from a log sync and not from a transaction commit. Also, checking
3822 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3823 * not done and would not be reliable - the eb might have been released
3824 * from memory and reading it back again means that flag would not be
3825 * set (since it's a runtime flag, not persisted on disk).
3826 *
3827 * Using the flags below in the btree inode also makes us achieve the
3828 * goal of AS_EIO/AS_ENOSPC even when writepages() returns success: it
3829 * may have started writeback for all dirty pages, but by the time
3830 * filemap_fdatawait_range() is called that writeback has already
3831 * finished with errors - because we were not using AS_EIO/AS_ENOSPC,
3832 * filemap_fdatawait_range() would return success, as it could not know
3833 * that writeback errors happened (the pages were no longer tagged for
3834 * writeback).
3835 */
3836 switch (eb->log_index) {
3837 case -1:
afcdd129 3838 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
656f30db
FM
3839 break;
3840 case 0:
afcdd129 3841 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
656f30db
FM
3842 break;
3843 case 1:
afcdd129 3844 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
656f30db
FM
3845 break;
3846 default:
3847 BUG(); /* unexpected, logic error */
3848 }
3849}
3850
4246a0b6 3851static void end_bio_extent_buffer_writepage(struct bio *bio)
0b32f4bb 3852{
2c30c71b 3853 struct bio_vec *bvec;
0b32f4bb 3854 struct extent_buffer *eb;
2b070cfe 3855 int done;
6dc4f100 3856 struct bvec_iter_all iter_all;
0b32f4bb 3857
c09abff8 3858 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 3859 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
3860 struct page *page = bvec->bv_page;
3861
0b32f4bb
JB
3862 eb = (struct extent_buffer *)page->private;
3863 BUG_ON(!eb);
3864 done = atomic_dec_and_test(&eb->io_pages);
3865
4e4cbee9 3866 if (bio->bi_status ||
4246a0b6 3867 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 3868 ClearPageUptodate(page);
656f30db 3869 set_btree_ioerr(page);
0b32f4bb
JB
3870 }
3871
3872 end_page_writeback(page);
3873
3874 if (!done)
3875 continue;
3876
3877 end_extent_buffer_writeback(eb);
2c30c71b 3878 }
0b32f4bb
JB
3879
3880 bio_put(bio);
0b32f4bb
JB
3881}
3882
0e378df1 3883static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
3884 struct writeback_control *wbc,
3885 struct extent_page_data *epd)
3886{
0b32f4bb 3887 u64 offset = eb->start;
851cd173 3888 u32 nritems;
cc5e31a4 3889 int i, num_pages;
851cd173 3890 unsigned long start, end;
ff40adf7 3891 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
d7dbe9e7 3892 int ret = 0;
0b32f4bb 3893
656f30db 3894 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
65ad0104 3895 num_pages = num_extent_pages(eb);
0b32f4bb 3896 atomic_set(&eb->io_pages, num_pages);
de0022b9 3897
851cd173
LB
3898 /* set btree blocks beyond nritems with 0 to avoid stale content. */
3899 nritems = btrfs_header_nritems(eb);
3eb548ee 3900 if (btrfs_header_level(eb) > 0) {
3eb548ee
LB
3901 end = btrfs_node_key_ptr_offset(nritems);
3902
b159fa28 3903 memzero_extent_buffer(eb, end, eb->len - end);
851cd173
LB
3904 } else {
3905 /*
3906 * leaf:
3907 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3908 */
3909 start = btrfs_item_nr_offset(nritems);
8f881e8c 3910 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
b159fa28 3911 memzero_extent_buffer(eb, start, end - start);
3eb548ee
LB
3912 }
3913
0b32f4bb 3914 for (i = 0; i < num_pages; i++) {
fb85fc9a 3915 struct page *p = eb->pages[i];
0b32f4bb
JB
3916
3917 clear_page_dirty_for_io(p);
3918 set_page_writeback(p);
0ceb34bf 3919 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
fa17ed06 3920 p, offset, PAGE_SIZE, 0,
c2df8bb4 3921 &epd->bio,
1f7ad75b 3922 end_bio_extent_buffer_writepage,
18fdc679 3923 0, 0, 0, false);
0b32f4bb 3924 if (ret) {
656f30db 3925 set_btree_ioerr(p);
fe01aa65
TK
3926 if (PageWriteback(p))
3927 end_page_writeback(p);
0b32f4bb
JB
3928 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3929 end_extent_buffer_writeback(eb);
3930 ret = -EIO;
3931 break;
3932 }
09cbfeaf 3933 offset += PAGE_SIZE;
3d4b9496 3934 update_nr_written(wbc, 1);
0b32f4bb
JB
3935 unlock_page(p);
3936 }
3937
3938 if (unlikely(ret)) {
3939 for (; i < num_pages; i++) {
bbf65cf0 3940 struct page *p = eb->pages[i];
81465028 3941 clear_page_dirty_for_io(p);
0b32f4bb
JB
3942 unlock_page(p);
3943 }
3944 }
3945
3946 return ret;
3947}
3948
3949int btree_write_cache_pages(struct address_space *mapping,
3950 struct writeback_control *wbc)
3951{
0b32f4bb
JB
3952 struct extent_buffer *eb, *prev_eb = NULL;
3953 struct extent_page_data epd = {
3954 .bio = NULL,
0b32f4bb
JB
3955 .extent_locked = 0,
3956 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3957 };
3958 int ret = 0;
3959 int done = 0;
3960 int nr_to_write_done = 0;
3961 struct pagevec pvec;
3962 int nr_pages;
3963 pgoff_t index;
3964 pgoff_t end; /* Inclusive */
3965 int scanned = 0;
10bbd235 3966 xa_mark_t tag;
0b32f4bb 3967
86679820 3968 pagevec_init(&pvec);
0b32f4bb
JB
3969 if (wbc->range_cyclic) {
3970 index = mapping->writeback_index; /* Start from prev offset */
3971 end = -1;
556755a8
JB
3972 /*
3973 * Starting from the beginning does not require cycling over the
3974 * whole range again, so mark it as scanned.
3975 */
3976 scanned = (index == 0);
0b32f4bb 3977 } else {
09cbfeaf
KS
3978 index = wbc->range_start >> PAGE_SHIFT;
3979 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
3980 scanned = 1;
3981 }
3982 if (wbc->sync_mode == WB_SYNC_ALL)
3983 tag = PAGECACHE_TAG_TOWRITE;
3984 else
3985 tag = PAGECACHE_TAG_DIRTY;
3986retry:
3987 if (wbc->sync_mode == WB_SYNC_ALL)
3988 tag_pages_for_writeback(mapping, index, end);
3989 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 3990 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 3991 tag))) {
0b32f4bb
JB
3992 unsigned i;
3993
0b32f4bb
JB
3994 for (i = 0; i < nr_pages; i++) {
3995 struct page *page = pvec.pages[i];
3996
3997 if (!PagePrivate(page))
3998 continue;
3999
b5bae261
JB
4000 spin_lock(&mapping->private_lock);
4001 if (!PagePrivate(page)) {
4002 spin_unlock(&mapping->private_lock);
4003 continue;
4004 }
4005
0b32f4bb 4006 eb = (struct extent_buffer *)page->private;
b5bae261
JB
4007
4008 /*
4009 * Shouldn't happen and normally this would be a BUG_ON
4010 * but no sense in crashing the user's box for something
4011 * we can survive anyway.
4012 */
fae7f21c 4013 if (WARN_ON(!eb)) {
b5bae261 4014 spin_unlock(&mapping->private_lock);
0b32f4bb
JB
4015 continue;
4016 }
4017
b5bae261
JB
4018 if (eb == prev_eb) {
4019 spin_unlock(&mapping->private_lock);
0b32f4bb 4020 continue;
b5bae261 4021 }
0b32f4bb 4022
b5bae261
JB
4023 ret = atomic_inc_not_zero(&eb->refs);
4024 spin_unlock(&mapping->private_lock);
4025 if (!ret)
0b32f4bb 4026 continue;
0b32f4bb
JB
4027
4028 prev_eb = eb;
9df76fb5 4029 ret = lock_extent_buffer_for_io(eb, &epd);
0b32f4bb
JB
4030 if (!ret) {
4031 free_extent_buffer(eb);
4032 continue;
0607eb1d
FM
4033 } else if (ret < 0) {
4034 done = 1;
4035 free_extent_buffer(eb);
4036 break;
0b32f4bb
JB
4037 }
4038
0ab02063 4039 ret = write_one_eb(eb, wbc, &epd);
0b32f4bb
JB
4040 if (ret) {
4041 done = 1;
4042 free_extent_buffer(eb);
4043 break;
4044 }
4045 free_extent_buffer(eb);
4046
4047 /*
4048 * the filesystem may choose to bump up nr_to_write.
4049 * We have to make sure to honor the new nr_to_write
4050 * at any time
4051 */
4052 nr_to_write_done = wbc->nr_to_write <= 0;
4053 }
4054 pagevec_release(&pvec);
4055 cond_resched();
4056 }
4057 if (!scanned && !done) {
4058 /*
4059 * We hit the last page and there is more work to be done: wrap
4060 * back to the start of the file
4061 */
4062 scanned = 1;
4063 index = 0;
4064 goto retry;
4065 }
2b952eea
QW
4066 ASSERT(ret <= 0);
4067 if (ret < 0) {
4068 end_write_bio(&epd, ret);
4069 return ret;
4070 }
4071 ret = flush_write_bio(&epd);
0b32f4bb
JB
4072 return ret;
4073}
4074
d1310b2e 4075/**
4bef0848 4076 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
d1310b2e
CM
4077 * @mapping: address space structure to write
4078 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
935db853 4079 * @epd: extent_page_data context, passed down to __extent_writepage
d1310b2e
CM
4080 *
4081 * If a page is already under I/O, write_cache_pages() skips it, even
4082 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4083 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4084 * and msync() need to guarantee that all the data which was dirty at the time
4085 * the call was made get new I/O started against them. If wbc->sync_mode is
4086 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4087 * existing IO to complete.
4088 */
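/*
 * For example, a background flush from the flusher threads typically
 * arrives here with wbc->sync_mode == WB_SYNC_NONE, so pages already
 * under writeback are skipped, while fsync()/sync() style callers use
 * WB_SYNC_ALL, take the tag_pages_for_writeback()/PAGECACHE_TAG_TOWRITE
 * path below and wait for in-flight writeback to finish.
 */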
4242b64a 4089static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 4090 struct writeback_control *wbc,
aab6e9ed 4091 struct extent_page_data *epd)
d1310b2e 4092{
7fd1a3f7 4093 struct inode *inode = mapping->host;
d1310b2e
CM
4094 int ret = 0;
4095 int done = 0;
f85d7d6c 4096 int nr_to_write_done = 0;
d1310b2e
CM
4097 struct pagevec pvec;
4098 int nr_pages;
4099 pgoff_t index;
4100 pgoff_t end; /* Inclusive */
a9132667
LB
4101 pgoff_t done_index;
4102 int range_whole = 0;
d1310b2e 4103 int scanned = 0;
10bbd235 4104 xa_mark_t tag;
d1310b2e 4105
7fd1a3f7
JB
4106 /*
4107 * We have to hold onto the inode so that ordered extents can do their
4108 * work when the IO finishes. The alternative to this is failing to add
4109 * an ordered extent if the igrab() fails there and that is a huge pain
4110 * to deal with, so instead just hold onto the inode throughout the
4111 * writepages operation. If it fails here we are freeing up the inode
4112 * anyway and we'd rather not waste our time writing out stuff that is
4113 * going to be truncated anyway.
4114 */
4115 if (!igrab(inode))
4116 return 0;
4117
86679820 4118 pagevec_init(&pvec);
d1310b2e
CM
4119 if (wbc->range_cyclic) {
4120 index = mapping->writeback_index; /* Start from prev offset */
4121 end = -1;
556755a8
JB
4122 /*
4123 * Starting from the beginning does not require cycling over the
4124 * whole range again, so mark it as scanned.
4125 */
4126 scanned = (index == 0);
d1310b2e 4127 } else {
09cbfeaf
KS
4128 index = wbc->range_start >> PAGE_SHIFT;
4129 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
4130 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4131 range_whole = 1;
d1310b2e
CM
4132 scanned = 1;
4133 }
3cd24c69
EL
4134
4135 /*
4136 * We do the tagged writepage as long as the snapshot flush bit is set
4137 * and we are the first one to do the filemap_flush() on this inode.
4138 *
4139 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4140 * not race in and drop the bit.
4141 */
4142 if (range_whole && wbc->nr_to_write == LONG_MAX &&
4143 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4144 &BTRFS_I(inode)->runtime_flags))
4145 wbc->tagged_writepages = 1;
4146
4147 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
4148 tag = PAGECACHE_TAG_TOWRITE;
4149 else
4150 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 4151retry:
3cd24c69 4152 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 4153 tag_pages_for_writeback(mapping, index, end);
a9132667 4154 done_index = index;
f85d7d6c 4155 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
4156 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4157 &index, end, tag))) {
d1310b2e
CM
4158 unsigned i;
4159
d1310b2e
CM
4160 for (i = 0; i < nr_pages; i++) {
4161 struct page *page = pvec.pages[i];
4162
f7bddf1e 4163 done_index = page->index + 1;
d1310b2e 4164 /*
b93b0163
MW
4165 * At this point we hold neither the i_pages lock nor
4166 * the page lock: the page may be truncated or
4167 * invalidated (changing page->mapping to NULL),
4168 * or even swizzled back from swapper_space to
4169 * tmpfs file mapping
d1310b2e 4170 */
c8f2f24b 4171 if (!trylock_page(page)) {
f4340622
QW
4172 ret = flush_write_bio(epd);
4173 BUG_ON(ret < 0);
c8f2f24b 4174 lock_page(page);
01d658f2 4175 }
d1310b2e
CM
4176
4177 if (unlikely(page->mapping != mapping)) {
4178 unlock_page(page);
4179 continue;
4180 }
4181
d2c3f4f6 4182 if (wbc->sync_mode != WB_SYNC_NONE) {
f4340622
QW
4183 if (PageWriteback(page)) {
4184 ret = flush_write_bio(epd);
4185 BUG_ON(ret < 0);
4186 }
d1310b2e 4187 wait_on_page_writeback(page);
d2c3f4f6 4188 }
d1310b2e
CM
4189
4190 if (PageWriteback(page) ||
4191 !clear_page_dirty_for_io(page)) {
4192 unlock_page(page);
4193 continue;
4194 }
4195
aab6e9ed 4196 ret = __extent_writepage(page, wbc, epd);
a9132667 4197 if (ret < 0) {
a9132667
LB
4198 done = 1;
4199 break;
4200 }
f85d7d6c
CM
4201
4202 /*
4203 * the filesystem may choose to bump up nr_to_write.
4204 * We have to make sure to honor the new nr_to_write
4205 * at any time
4206 */
4207 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
4208 }
4209 pagevec_release(&pvec);
4210 cond_resched();
4211 }
894b36e3 4212 if (!scanned && !done) {
d1310b2e
CM
4213 /*
4214 * We hit the last page and there is more work to be done: wrap
4215 * back to the start of the file
4216 */
4217 scanned = 1;
4218 index = 0;
42ffb0bf
JB
4219
4220 /*
4221 * If we're looping we could run into a page that is locked by a
4222 * writer and that writer could be waiting on writeback for a
4223 * page in our current bio, and thus deadlock, so flush the
4224 * write bio here.
4225 */
4226 ret = flush_write_bio(epd);
4227 if (!ret)
4228 goto retry;
d1310b2e 4229 }
a9132667
LB
4230
4231 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4232 mapping->writeback_index = done_index;
4233
7fd1a3f7 4234 btrfs_add_delayed_iput(inode);
894b36e3 4235 return ret;
d1310b2e 4236}
d1310b2e 4237
0a9b0e53 4238int extent_write_full_page(struct page *page, struct writeback_control *wbc)
d1310b2e
CM
4239{
4240 int ret;
d1310b2e
CM
4241 struct extent_page_data epd = {
4242 .bio = NULL,
771ed689 4243 .extent_locked = 0,
ffbd517d 4244 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e 4245 };
d1310b2e 4246
d1310b2e 4247 ret = __extent_writepage(page, wbc, &epd);
3065976b
QW
4248 ASSERT(ret <= 0);
4249 if (ret < 0) {
4250 end_write_bio(&epd, ret);
4251 return ret;
4252 }
d1310b2e 4253
3065976b
QW
4254 ret = flush_write_bio(&epd);
4255 ASSERT(ret <= 0);
d1310b2e
CM
4256 return ret;
4257}
d1310b2e 4258
5e3ee236 4259int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
771ed689
CM
4260 int mode)
4261{
4262 int ret = 0;
4263 struct address_space *mapping = inode->i_mapping;
4264 struct page *page;
09cbfeaf
KS
4265 unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4266 PAGE_SHIFT;
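/*
 * Both start and end are inclusive byte offsets, so with 4K pages
 * start == 0 and end == 8191 gives (8191 + 4096) >> 12 == 2 pages.
 */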
771ed689
CM
4267
4268 struct extent_page_data epd = {
4269 .bio = NULL,
771ed689 4270 .extent_locked = 1,
ffbd517d 4271 .sync_io = mode == WB_SYNC_ALL,
771ed689
CM
4272 };
4273 struct writeback_control wbc_writepages = {
771ed689 4274 .sync_mode = mode,
771ed689
CM
4275 .nr_to_write = nr_pages * 2,
4276 .range_start = start,
4277 .range_end = end + 1,
ec39f769
CM
4278 /* We're called from an async helper function */
4279 .punt_to_cgroup = 1,
4280 .no_cgroup_owner = 1,
771ed689
CM
4281 };
4282
dbb70bec 4283 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
d397712b 4284 while (start <= end) {
09cbfeaf 4285 page = find_get_page(mapping, start >> PAGE_SHIFT);
771ed689
CM
4286 if (clear_page_dirty_for_io(page))
4287 ret = __extent_writepage(page, &wbc_writepages, &epd);
4288 else {
7087a9d8 4289 btrfs_writepage_endio_finish_ordered(page, start,
c629732d 4290 start + PAGE_SIZE - 1, 1);
771ed689
CM
4291 unlock_page(page);
4292 }
09cbfeaf
KS
4293 put_page(page);
4294 start += PAGE_SIZE;
771ed689
CM
4295 }
4296
02c6db4f 4297 ASSERT(ret <= 0);
dbb70bec
CM
4298 if (ret == 0)
4299 ret = flush_write_bio(&epd);
4300 else
02c6db4f 4301 end_write_bio(&epd, ret);
dbb70bec
CM
4302
4303 wbc_detach_inode(&wbc_writepages);
771ed689
CM
4304 return ret;
4305}
d1310b2e 4306
8ae225a8 4307int extent_writepages(struct address_space *mapping,
d1310b2e
CM
4308 struct writeback_control *wbc)
4309{
4310 int ret = 0;
4311 struct extent_page_data epd = {
4312 .bio = NULL,
771ed689 4313 .extent_locked = 0,
ffbd517d 4314 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
4315 };
4316
935db853 4317 ret = extent_write_cache_pages(mapping, wbc, &epd);
a2a72fbd
QW
4318 ASSERT(ret <= 0);
4319 if (ret < 0) {
4320 end_write_bio(&epd, ret);
4321 return ret;
4322 }
4323 ret = flush_write_bio(&epd);
d1310b2e
CM
4324 return ret;
4325}
d1310b2e 4326
2a3ff0ad
NB
4327int extent_readpages(struct address_space *mapping, struct list_head *pages,
4328 unsigned nr_pages)
d1310b2e
CM
4329{
4330 struct bio *bio = NULL;
c8b97818 4331 unsigned long bio_flags = 0;
67c9684f 4332 struct page *pagepool[16];
125bac01 4333 struct extent_map *em_cached = NULL;
2a3ff0ad 4334 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
67c9684f 4335 int nr = 0;
808f80b4 4336 u64 prev_em_start = (u64)-1;
d1310b2e 4337
61ed3a14 4338 while (!list_empty(pages)) {
e65ef21e
NB
4339 u64 contig_end = 0;
4340
61ed3a14 4341 for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
f86196ea 4342 struct page *page = lru_to_page(pages);
d1310b2e 4343
61ed3a14
NB
4344 prefetchw(&page->flags);
4345 list_del(&page->lru);
4346 if (add_to_page_cache_lru(page, mapping, page->index,
4347 readahead_gfp_mask(mapping))) {
4348 put_page(page);
e65ef21e 4349 break;
61ed3a14
NB
4350 }
4351
4352 pagepool[nr++] = page;
e65ef21e 4353 contig_end = page_offset(page) + PAGE_SIZE - 1;
d1310b2e 4354 }
67c9684f 4355
e65ef21e
NB
4356 if (nr) {
4357 u64 contig_start = page_offset(pagepool[0]);
4358
4359 ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
4360
4361 contiguous_readpages(tree, pagepool, nr, contig_start,
4362 contig_end, &em_cached, &bio, &bio_flags,
4363 &prev_em_start);
4364 }
d1310b2e 4365 }
67c9684f 4366
125bac01
MX
4367 if (em_cached)
4368 free_extent_map(em_cached);
4369
d1310b2e 4370 if (bio)
1f7ad75b 4371 return submit_one_bio(bio, 0, bio_flags);
d1310b2e
CM
4372 return 0;
4373}
d1310b2e
CM
4374
4375/*
4376 * basic invalidatepage code, this waits on any locked or writeback
4377 * ranges corresponding to the page, and then deletes any extent state
4378 * records from the tree
4379 */
4380int extent_invalidatepage(struct extent_io_tree *tree,
4381 struct page *page, unsigned long offset)
4382{
2ac55d41 4383 struct extent_state *cached_state = NULL;
4eee4fa4 4384 u64 start = page_offset(page);
09cbfeaf 4385 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
4386 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4387
fda2832f 4388 start += ALIGN(offset, blocksize);
d1310b2e
CM
4389 if (start > end)
4390 return 0;
4391
ff13db41 4392 lock_extent_bits(tree, start, end, &cached_state);
1edbb734 4393 wait_on_page_writeback(page);
e182163d
OS
4394 clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
4395 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
d1310b2e
CM
4396 return 0;
4397}
d1310b2e 4398
7b13b7b1
CM
4399/*
4400 * a helper for releasepage, this tests for areas of the page that
4401 * are locked or under IO and drops the related state bits if it is safe
4402 * to drop the page.
4403 */
29c68b2d 4404static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 4405 struct page *page, gfp_t mask)
7b13b7b1 4406{
4eee4fa4 4407 u64 start = page_offset(page);
09cbfeaf 4408 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
4409 int ret = 1;
4410
8882679e 4411 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 4412 ret = 0;
8882679e 4413 } else {
11ef160f
CM
4414 /*
4415 * at this point we can safely clear everything except the
4416 * locked bit and the nodatasum bit
4417 */
66b0c887 4418 ret = __clear_extent_bit(tree, start, end,
11ef160f 4419 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
66b0c887 4420 0, 0, NULL, mask, NULL);
e3f24cc5
CM
4421
4422 /* if clear_extent_bit failed for enomem reasons,
4423 * we can't allow the release to continue.
4424 */
4425 if (ret < 0)
4426 ret = 0;
4427 else
4428 ret = 1;
7b13b7b1
CM
4429 }
4430 return ret;
4431}
7b13b7b1 4432
d1310b2e
CM
4433/*
4434 * a helper for releasepage. As long as there are no locked extents
4435 * in the range corresponding to the page, both state records and extent
4436 * map records are removed
4437 */
477a30ba 4438int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
4439{
4440 struct extent_map *em;
4eee4fa4 4441 u64 start = page_offset(page);
09cbfeaf 4442 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
4443 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4444 struct extent_io_tree *tree = &btrfs_inode->io_tree;
4445 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 4446
d0164adc 4447 if (gfpflags_allow_blocking(mask) &&
ee22184b 4448 page->mapping->host->i_size > SZ_16M) {
39b5637f 4449 u64 len;
70dec807 4450 while (start <= end) {
39b5637f 4451 len = end - start + 1;
890871be 4452 write_lock(&map->lock);
39b5637f 4453 em = lookup_extent_mapping(map, start, len);
285190d9 4454 if (!em) {
890871be 4455 write_unlock(&map->lock);
70dec807
CM
4456 break;
4457 }
7f3c74fb
CM
4458 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4459 em->start != start) {
890871be 4460 write_unlock(&map->lock);
70dec807
CM
4461 free_extent_map(em);
4462 break;
4463 }
4464 if (!test_range_bit(tree, em->start,
4465 extent_map_end(em) - 1,
4e586ca3 4466 EXTENT_LOCKED, 0, NULL)) {
bd3599a0
FM
4467 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4468 &btrfs_inode->runtime_flags);
70dec807
CM
4469 remove_extent_mapping(map, em);
4470 /* once for the rb tree */
4471 free_extent_map(em);
4472 }
4473 start = extent_map_end(em);
890871be 4474 write_unlock(&map->lock);
70dec807
CM
4475
4476 /* once for us */
d1310b2e
CM
4477 free_extent_map(em);
4478 }
d1310b2e 4479 }
29c68b2d 4480 return try_release_extent_state(tree, page, mask);
d1310b2e 4481}
d1310b2e 4482
ec29ed5b
CM
4483/*
4484 * helper function for fiemap, which doesn't want to see any holes.
4485 * This maps until we find something past 'last'
4486 */
4487static struct extent_map *get_extent_skip_holes(struct inode *inode,
e3350e16 4488 u64 offset, u64 last)
ec29ed5b 4489{
da17066c 4490 u64 sectorsize = btrfs_inode_sectorsize(inode);
ec29ed5b
CM
4491 struct extent_map *em;
4492 u64 len;
4493
4494 if (offset >= last)
4495 return NULL;
4496
67871254 4497 while (1) {
ec29ed5b
CM
4498 len = last - offset;
4499 if (len == 0)
4500 break;
fda2832f 4501 len = ALIGN(len, sectorsize);
4ab47a8d 4502 em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
c704005d 4503 if (IS_ERR_OR_NULL(em))
ec29ed5b
CM
4504 return em;
4505
4506 /* if this isn't a hole return it */
4a2d25cd 4507 if (em->block_start != EXTENT_MAP_HOLE)
ec29ed5b 4508 return em;
ec29ed5b
CM
4509
4510 /* this is a hole, advance to the next extent */
4511 offset = extent_map_end(em);
4512 free_extent_map(em);
4513 if (offset >= last)
4514 break;
4515 }
4516 return NULL;
4517}
4518
4751832d
QW
4519/*
4520 * To cache previous fiemap extent
4521 *
4522 * Will be used for merging fiemap extent
4523 */
4524struct fiemap_cache {
4525 u64 offset;
4526 u64 phys;
4527 u64 len;
4528 u32 flags;
4529 bool cached;
4530};
4531
4532/*
4533 * Helper to submit fiemap extent.
4534 *
4535 * Will try to merge current fiemap extent specified by @offset, @phys,
4536 * @len and @flags with cached one.
4537 * Only when we fail to merge is the cached one submitted as a
4538 * fiemap extent.
4539 *
4540 * Return value is the same as fiemap_fill_next_extent().
4541 */
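/*
 * The expected call pattern, as used by extent_fiemap() below, is to
 * call this once per extent found and then flush any remaining cached
 * entry with emit_last_fiemap_cache() before returning.
 */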
4542static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4543 struct fiemap_cache *cache,
4544 u64 offset, u64 phys, u64 len, u32 flags)
4545{
4546 int ret = 0;
4547
4548 if (!cache->cached)
4549 goto assign;
4550
4551 /*
4552 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 4553 * fiemap extent won't overlap with cached one.
4751832d
QW
4554 * Not recoverable.
4555 *
4556 * NOTE: Physical address can overlap, due to compression
4557 */
4558 if (cache->offset + cache->len > offset) {
4559 WARN_ON(1);
4560 return -EINVAL;
4561 }
4562
4563 /*
4564 * Only merges fiemap extents if
4565 * 1) Their logical addresses are contiguous
4566 *
4567 * 2) Their physical addresses are contiguous
4568 * So truly compressed (physical size smaller than logical size)
4569 * extents won't get merged with each other
4570 *
4571 * 3) Share same flags except FIEMAP_EXTENT_LAST
4572 * So a regular extent won't get merged with a prealloc extent
4573 */
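/*
 * For example, a cached entry covering logical [0, 128K) at physical
 * 1M merges with a new extent at logical 128K, physical 1M + 128K and
 * identical flags, producing a single 256K fiemap record; a compressed
 * extent whose physical size is smaller than its logical size fails
 * condition 2) and is emitted separately.
 */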
4574 if (cache->offset + cache->len == offset &&
4575 cache->phys + cache->len == phys &&
4576 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4577 (flags & ~FIEMAP_EXTENT_LAST)) {
4578 cache->len += len;
4579 cache->flags |= flags;
4580 goto try_submit_last;
4581 }
4582
4583 /* Not mergeable, need to submit cached one */
4584 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4585 cache->len, cache->flags);
4586 cache->cached = false;
4587 if (ret)
4588 return ret;
4589assign:
4590 cache->cached = true;
4591 cache->offset = offset;
4592 cache->phys = phys;
4593 cache->len = len;
4594 cache->flags = flags;
4595try_submit_last:
4596 if (cache->flags & FIEMAP_EXTENT_LAST) {
4597 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4598 cache->phys, cache->len, cache->flags);
4599 cache->cached = false;
4600 }
4601 return ret;
4602}
4603
4604/*
848c23b7 4605 * Emit last fiemap cache
4751832d 4606 *
848c23b7
QW
4607 * The last fiemap cache may still be cached in the following case:
4608 * 0 4k 8k
4609 * |<- Fiemap range ->|
4610 * |<------------ First extent ----------->|
4611 *
4612 * In this case, the first extent range will be cached but not emitted.
4613 * So we must emit it before ending extent_fiemap().
4751832d 4614 */
5c5aff98 4615static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 4616 struct fiemap_cache *cache)
4751832d
QW
4617{
4618 int ret;
4619
4620 if (!cache->cached)
4621 return 0;
4622
4751832d
QW
4623 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4624 cache->len, cache->flags);
4625 cache->cached = false;
4626 if (ret > 0)
4627 ret = 0;
4628 return ret;
4629}
4630
1506fcc8 4631int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2135fb9b 4632 __u64 start, __u64 len)
1506fcc8 4633{
975f84fe 4634 int ret = 0;
1506fcc8
YS
4635 u64 off = start;
4636 u64 max = start + len;
4637 u32 flags = 0;
975f84fe
JB
4638 u32 found_type;
4639 u64 last;
ec29ed5b 4640 u64 last_for_get_extent = 0;
1506fcc8 4641 u64 disko = 0;
ec29ed5b 4642 u64 isize = i_size_read(inode);
975f84fe 4643 struct btrfs_key found_key;
1506fcc8 4644 struct extent_map *em = NULL;
2ac55d41 4645 struct extent_state *cached_state = NULL;
975f84fe 4646 struct btrfs_path *path;
dc046b10 4647 struct btrfs_root *root = BTRFS_I(inode)->root;
4751832d 4648 struct fiemap_cache cache = { 0 };
5911c8fe
DS
4649 struct ulist *roots;
4650 struct ulist *tmp_ulist;
1506fcc8 4651 int end = 0;
ec29ed5b
CM
4652 u64 em_start = 0;
4653 u64 em_len = 0;
4654 u64 em_end = 0;
1506fcc8
YS
4655
4656 if (len == 0)
4657 return -EINVAL;
4658
975f84fe
JB
4659 path = btrfs_alloc_path();
4660 if (!path)
4661 return -ENOMEM;
4662 path->leave_spinning = 1;
4663
5911c8fe
DS
4664 roots = ulist_alloc(GFP_KERNEL);
4665 tmp_ulist = ulist_alloc(GFP_KERNEL);
4666 if (!roots || !tmp_ulist) {
4667 ret = -ENOMEM;
4668 goto out_free_ulist;
4669 }
4670
da17066c
JM
4671 start = round_down(start, btrfs_inode_sectorsize(inode));
4672 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4d479cf0 4673
ec29ed5b
CM
4674 /*
4675 * lookup the last file extent. We're not using i_size here
4676 * because there might be preallocation past i_size
4677 */
f85b7379
DS
4678 ret = btrfs_lookup_file_extent(NULL, root, path,
4679 btrfs_ino(BTRFS_I(inode)), -1, 0);
975f84fe 4680 if (ret < 0) {
5911c8fe 4681 goto out_free_ulist;
2d324f59
LB
4682 } else {
4683 WARN_ON(!ret);
4684 if (ret == 1)
4685 ret = 0;
975f84fe 4686 }
2d324f59 4687
975f84fe 4688 path->slots[0]--;
975f84fe 4689 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
962a298f 4690 found_type = found_key.type;
975f84fe 4691
ec29ed5b 4692 /* No extents, but there might be delalloc bits */
4a0cc7ca 4693 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
975f84fe 4694 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
4695 /* have to trust i_size as the end */
4696 last = (u64)-1;
4697 last_for_get_extent = isize;
4698 } else {
4699 /*
4700 * remember the start of the last extent. There are a
4701 * bunch of different factors that go into the length of the
4702 * extent, so it's much less complex to remember where it started
4703 */
4704 last = found_key.offset;
4705 last_for_get_extent = last + 1;
975f84fe 4706 }
fe09e16c 4707 btrfs_release_path(path);
975f84fe 4708
ec29ed5b
CM
4709 /*
4710 * we might have some extents allocated but more delalloc past those
4711 * extents. so, we trust isize unless the start of the last extent is
4712 * beyond isize
4713 */
4714 if (last < isize) {
4715 last = (u64)-1;
4716 last_for_get_extent = isize;
4717 }
4718
ff13db41 4719 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
d0082371 4720 &cached_state);
ec29ed5b 4721
e3350e16 4722 em = get_extent_skip_holes(inode, start, last_for_get_extent);
1506fcc8
YS
4723 if (!em)
4724 goto out;
4725 if (IS_ERR(em)) {
4726 ret = PTR_ERR(em);
4727 goto out;
4728 }
975f84fe 4729
1506fcc8 4730 while (!end) {
b76bb701 4731 u64 offset_in_extent = 0;
ea8efc74
CM
4732
4733 /* break if the extent we found is outside the range */
4734 if (em->start >= max || extent_map_end(em) < off)
4735 break;
4736
4737 /*
4738 * get_extent may return an extent that starts before our
4739 * requested range. We have to make sure the ranges
4740 * we return to fiemap always move forward and don't
4741 * overlap, so adjust the offsets here
4742 */
4743 em_start = max(em->start, off);
1506fcc8 4744
ea8efc74
CM
4745 /*
4746 * record the offset from the start of the extent
b76bb701
JB
4747 * for adjusting the disk offset below. Only do this if the
4748 * extent isn't compressed since our in ram offset may be past
4749 * what we have actually allocated on disk.
ea8efc74 4750 */
b76bb701
JB
4751 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4752 offset_in_extent = em_start - em->start;
ec29ed5b 4753 em_end = extent_map_end(em);
ea8efc74 4754 em_len = em_end - em_start;
1506fcc8 4755 flags = 0;
f0986318
FM
4756 if (em->block_start < EXTENT_MAP_LAST_BYTE)
4757 disko = em->block_start + offset_in_extent;
4758 else
4759 disko = 0;
1506fcc8 4760
ea8efc74
CM
4761 /*
4762 * bump off for our next call to get_extent
4763 */
4764 off = extent_map_end(em);
4765 if (off >= max)
4766 end = 1;
4767
93dbfad7 4768 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
4769 end = 1;
4770 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 4771 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
4772 flags |= (FIEMAP_EXTENT_DATA_INLINE |
4773 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 4774 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
4775 flags |= (FIEMAP_EXTENT_DELALLOC |
4776 FIEMAP_EXTENT_UNKNOWN);
dc046b10
JB
4777 } else if (fieinfo->fi_extents_max) {
4778 u64 bytenr = em->block_start -
4779 (em->start - em->orig_start);
fe09e16c 4780
fe09e16c
LB
4781 /*
4782 * As btrfs supports shared space, this information
4783 * can be exported to userspace tools via
dc046b10
JB
4784 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
4785 * then we're just getting a count and we can skip the
4786 * lookup stuff.
fe09e16c 4787 */
bb739cf0
EN
4788 ret = btrfs_check_shared(root,
4789 btrfs_ino(BTRFS_I(inode)),
5911c8fe 4790 bytenr, roots, tmp_ulist);
dc046b10 4791 if (ret < 0)
fe09e16c 4792 goto out_free;
dc046b10 4793 if (ret)
fe09e16c 4794 flags |= FIEMAP_EXTENT_SHARED;
dc046b10 4795 ret = 0;
1506fcc8
YS
4796 }
4797 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4798 flags |= FIEMAP_EXTENT_ENCODED;
0d2b2372
JB
4799 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4800 flags |= FIEMAP_EXTENT_UNWRITTEN;
1506fcc8 4801
1506fcc8
YS
4802 free_extent_map(em);
4803 em = NULL;
ec29ed5b
CM
4804 if ((em_start >= last) || em_len == (u64)-1 ||
4805 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
4806 flags |= FIEMAP_EXTENT_LAST;
4807 end = 1;
4808 }
4809
ec29ed5b 4810 /* now scan forward to see if this is really the last extent. */
e3350e16 4811 em = get_extent_skip_holes(inode, off, last_for_get_extent);
ec29ed5b
CM
4812 if (IS_ERR(em)) {
4813 ret = PTR_ERR(em);
4814 goto out;
4815 }
4816 if (!em) {
975f84fe
JB
4817 flags |= FIEMAP_EXTENT_LAST;
4818 end = 1;
4819 }
4751832d
QW
4820 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
4821 em_len, flags);
26e726af
CS
4822 if (ret) {
4823 if (ret == 1)
4824 ret = 0;
ec29ed5b 4825 goto out_free;
26e726af 4826 }
1506fcc8
YS
4827 }
4828out_free:
4751832d 4829 if (!ret)
5c5aff98 4830 ret = emit_last_fiemap_cache(fieinfo, &cache);
1506fcc8
YS
4831 free_extent_map(em);
4832out:
a52f4cd2 4833 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
e43bbe5e 4834 &cached_state);
5911c8fe
DS
4835
4836out_free_ulist:
e02d48ea 4837 btrfs_free_path(path);
5911c8fe
DS
4838 ulist_free(roots);
4839 ulist_free(tmp_ulist);
1506fcc8
YS
4840 return ret;
4841}
4842
727011e0
CM
4843static void __free_extent_buffer(struct extent_buffer *eb)
4844{
6d49ba1b 4845 btrfs_leak_debug_del(&eb->leak_list);
727011e0
CM
4846 kmem_cache_free(extent_buffer_cache, eb);
4847}
4848
a26e8c9f 4849int extent_buffer_under_io(struct extent_buffer *eb)
db7f3436
JB
4850{
4851 return (atomic_read(&eb->io_pages) ||
4852 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4853 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4854}
4855
4856/*
55ac0139 4857 * Release all pages attached to the extent buffer.
db7f3436 4858 */
55ac0139 4859static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
db7f3436 4860{
d64766fd
NB
4861 int i;
4862 int num_pages;
b0132a3b 4863 int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
db7f3436
JB
4864
4865 BUG_ON(extent_buffer_under_io(eb));
4866
d64766fd
NB
4867 num_pages = num_extent_pages(eb);
4868 for (i = 0; i < num_pages; i++) {
4869 struct page *page = eb->pages[i];
db7f3436 4870
5d2361db
FL
4871 if (!page)
4872 continue;
4873 if (mapped)
db7f3436 4874 spin_lock(&page->mapping->private_lock);
5d2361db
FL
4875 /*
4876 * We do this since we'll remove the pages after we've
4877 * removed the eb from the radix tree, so we could race
4878 * and have this page now attached to the new eb. So
4879 * only clear page_private if it's still connected to
4880 * this eb.
4881 */
4882 if (PagePrivate(page) &&
4883 page->private == (unsigned long)eb) {
4884 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4885 BUG_ON(PageDirty(page));
4886 BUG_ON(PageWriteback(page));
db7f3436 4887 /*
5d2361db
FL
4888 * We need to make sure we haven't been attached
4889 * to a new eb.
db7f3436 4890 */
5d2361db
FL
4891 ClearPagePrivate(page);
4892 set_page_private(page, 0);
4893 /* One for the page private */
09cbfeaf 4894 put_page(page);
db7f3436 4895 }
5d2361db
FL
4896
4897 if (mapped)
4898 spin_unlock(&page->mapping->private_lock);
4899
01327610 4900 /* One for when we allocated the page */
09cbfeaf 4901 put_page(page);
d64766fd 4902 }
db7f3436
JB
4903}
4904
4905/*
4906 * Helper for releasing the extent buffer.
4907 */
4908static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4909{
55ac0139 4910 btrfs_release_extent_buffer_pages(eb);
db7f3436
JB
4911 __free_extent_buffer(eb);
4912}
4913
f28491e0
JB
4914static struct extent_buffer *
4915__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 4916 unsigned long len)
d1310b2e
CM
4917{
4918 struct extent_buffer *eb = NULL;
4919
d1b5c567 4920 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
4921 eb->start = start;
4922 eb->len = len;
f28491e0 4923 eb->fs_info = fs_info;
815a51c7 4924 eb->bflags = 0;
bd681513 4925 rwlock_init(&eb->lock);
bd681513 4926 atomic_set(&eb->blocking_readers, 0);
06297d8c 4927 eb->blocking_writers = 0;
ed1b4ed7 4928 eb->lock_nested = false;
bd681513
CM
4929 init_waitqueue_head(&eb->write_lock_wq);
4930 init_waitqueue_head(&eb->read_lock_wq);
b4ce94de 4931
6d49ba1b
ES
4932 btrfs_leak_debug_add(&eb->leak_list, &buffers);
4933
3083ee2e 4934 spin_lock_init(&eb->refs_lock);
d1310b2e 4935 atomic_set(&eb->refs, 1);
0b32f4bb 4936 atomic_set(&eb->io_pages, 0);
727011e0 4937
b8dae313
DS
4938 /*
4939 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4940 */
4941 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4942 > MAX_INLINE_EXTENT_BUFFER_SIZE);
4943 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
d1310b2e 4944
843ccf9f 4945#ifdef CONFIG_BTRFS_DEBUG
f3dc24c5 4946 eb->spinning_writers = 0;
afd495a8 4947 atomic_set(&eb->spinning_readers, 0);
5c9c799a 4948 atomic_set(&eb->read_locks, 0);
00801ae4 4949 eb->write_locks = 0;
843ccf9f
DS
4950#endif
4951
d1310b2e
CM
4952 return eb;
4953}
4954
815a51c7
JS
4955struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4956{
cc5e31a4 4957 int i;
815a51c7
JS
4958 struct page *p;
4959 struct extent_buffer *new;
cc5e31a4 4960 int num_pages = num_extent_pages(src);
815a51c7 4961
3f556f78 4962 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
4963 if (new == NULL)
4964 return NULL;
4965
4966 for (i = 0; i < num_pages; i++) {
9ec72677 4967 p = alloc_page(GFP_NOFS);
db7f3436
JB
4968 if (!p) {
4969 btrfs_release_extent_buffer(new);
4970 return NULL;
4971 }
815a51c7
JS
4972 attach_extent_buffer_page(new, p);
4973 WARN_ON(PageDirty(p));
4974 SetPageUptodate(p);
4975 new->pages[i] = p;
fba1acf9 4976 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7
JS
4977 }
4978
815a51c7 4979 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
b0132a3b 4980 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
815a51c7
JS
4981
4982 return new;
4983}
4984
0f331229
OS
4985struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4986 u64 start, unsigned long len)
815a51c7
JS
4987{
4988 struct extent_buffer *eb;
cc5e31a4
DS
4989 int num_pages;
4990 int i;
815a51c7 4991
3f556f78 4992 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
4993 if (!eb)
4994 return NULL;
4995
65ad0104 4996 num_pages = num_extent_pages(eb);
815a51c7 4997 for (i = 0; i < num_pages; i++) {
9ec72677 4998 eb->pages[i] = alloc_page(GFP_NOFS);
815a51c7
JS
4999 if (!eb->pages[i])
5000 goto err;
5001 }
5002 set_extent_buffer_uptodate(eb);
5003 btrfs_set_header_nritems(eb, 0);
b0132a3b 5004 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
5005
5006 return eb;
5007err:
84167d19
SB
5008 for (; i > 0; i--)
5009 __free_page(eb->pages[i - 1]);
815a51c7
JS
5010 __free_extent_buffer(eb);
5011 return NULL;
5012}
5013
0f331229 5014struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5015 u64 start)
0f331229 5016{
da17066c 5017 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
5018}
5019
0b32f4bb
JB
5020static void check_buffer_tree_ref(struct extent_buffer *eb)
5021{
242e18c7 5022 int refs;
0b32f4bb
JB
5023 /* the ref bit is tricky. We have to make sure it is set
5024 * if we have the buffer dirty. Otherwise the
5025 * code to free a buffer can end up dropping a dirty
5026 * page
5027 *
5028 * Once the ref bit is set, it won't go away while the
5029 * buffer is dirty or in writeback, and it also won't
5030 * go away while we have the reference count on the
5031 * eb bumped.
5032 *
5033 * We can't just set the ref bit without bumping the
5034 * ref on the eb because free_extent_buffer might
5035 * see the ref bit and try to clear it. If this happens
5036 * free_extent_buffer might end up dropping our original
5037 * ref by mistake and freeing the page before we are able
5038 * to add one more ref.
5039 *
5040 * So bump the ref count first, then set the bit. If someone
5041 * beat us to it, drop the ref we added.
5042 */
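/*
 * Roughly, the interleaving being avoided if the bit were set first:
 *   T1: set_bit(EXTENT_BUFFER_TREE_REF)
 *   T2: free_extent_buffer() sees TREE_REF, clears it and drops the
 *       reference it stands for, freeing the eb
 *   T1: atomic_inc(&eb->refs) on an already freed buffer
 */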
242e18c7
CM
5043 refs = atomic_read(&eb->refs);
5044 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5045 return;
5046
594831c4
JB
5047 spin_lock(&eb->refs_lock);
5048 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 5049 atomic_inc(&eb->refs);
594831c4 5050 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
5051}
5052
2457aec6
MG
5053static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5054 struct page *accessed)
5df4235e 5055{
cc5e31a4 5056 int num_pages, i;
5df4235e 5057
0b32f4bb
JB
5058 check_buffer_tree_ref(eb);
5059
65ad0104 5060 num_pages = num_extent_pages(eb);
5df4235e 5061 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
5062 struct page *p = eb->pages[i];
5063
2457aec6
MG
5064 if (p != accessed)
5065 mark_page_accessed(p);
5df4235e
JB
5066 }
5067}
5068
f28491e0
JB
5069struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5070 u64 start)
452c75c3
CS
5071{
5072 struct extent_buffer *eb;
5073
5074 rcu_read_lock();
f28491e0 5075 eb = radix_tree_lookup(&fs_info->buffer_radix,
09cbfeaf 5076 start >> PAGE_SHIFT);
452c75c3
CS
5077 if (eb && atomic_inc_not_zero(&eb->refs)) {
5078 rcu_read_unlock();
062c19e9
FM
5079 /*
5080 * Lock our eb's refs_lock to avoid races with
5081 * free_extent_buffer(). When we get our eb it might be flagged
5082 * with EXTENT_BUFFER_STALE, and another task running
5083 * free_extent_buffer() might have seen that flag set, seen
5084 * eb->refs == 2, seen that the buffer isn't under IO (dirty and
5085 * writeback flags not set) and seen that it's still in the tree
5086 * (EXTENT_BUFFER_TREE_REF set), and therefore be in the process
5087 * of decrementing the extent buffer's reference count twice.
5088 * So here we could race and increment the eb's reference count,
5089 * clear its stale flag, mark it as dirty and drop our reference
5090 * before the other task finishes executing free_extent_buffer(),
5091 * which would later result in an attempt to free an extent
5092 * buffer that is dirty.
5093 */
5094 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5095 spin_lock(&eb->refs_lock);
5096 spin_unlock(&eb->refs_lock);
5097 }
2457aec6 5098 mark_extent_buffer_accessed(eb, NULL);
452c75c3
CS
5099 return eb;
5100 }
5101 rcu_read_unlock();
5102
5103 return NULL;
5104}
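
/*
 * Editorial usage sketch, not part of the original source; fs_info and
 * start stand for whatever values the caller has at hand.  A buffer
 * returned by find_extent_buffer() carries the extra reference taken with
 * atomic_inc_not_zero() above, so the caller must drop it when done:
 *
 *     eb = find_extent_buffer(fs_info, start);
 *     if (eb) {
 *             ... use the buffer ...
 *             free_extent_buffer(eb);
 *     }
 *
 * The lookup-or-allocate paths below use exactly this pattern before
 * falling back to allocating a new buffer.
 */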
5105
faa2dbf0
JB
5106#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5107struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5108 u64 start)
faa2dbf0
JB
5109{
5110 struct extent_buffer *eb, *exists = NULL;
5111 int ret;
5112
5113 eb = find_extent_buffer(fs_info, start);
5114 if (eb)
5115 return eb;
da17066c 5116 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 5117 if (!eb)
b6293c82 5118 return ERR_PTR(-ENOMEM);
faa2dbf0
JB
5119 eb->fs_info = fs_info;
5120again:
e1860a77 5121 ret = radix_tree_preload(GFP_NOFS);
b6293c82
DC
5122 if (ret) {
5123 exists = ERR_PTR(ret);
faa2dbf0 5124 goto free_eb;
b6293c82 5125 }
faa2dbf0
JB
5126 spin_lock(&fs_info->buffer_lock);
5127 ret = radix_tree_insert(&fs_info->buffer_radix,
09cbfeaf 5128 start >> PAGE_SHIFT, eb);
faa2dbf0
JB
5129 spin_unlock(&fs_info->buffer_lock);
5130 radix_tree_preload_end();
5131 if (ret == -EEXIST) {
5132 exists = find_extent_buffer(fs_info, start);
5133 if (exists)
5134 goto free_eb;
5135 else
5136 goto again;
5137 }
5138 check_buffer_tree_ref(eb);
5139 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5140
faa2dbf0
JB
5141 return eb;
5142free_eb:
5143 btrfs_release_extent_buffer(eb);
5144 return exists;
5145}
5146#endif
5147
f28491e0 5148struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
ce3e6984 5149 u64 start)
d1310b2e 5150{
da17066c 5151 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
5152 int num_pages;
5153 int i;
09cbfeaf 5154 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 5155 struct extent_buffer *eb;
6af118ce 5156 struct extent_buffer *exists = NULL;
d1310b2e 5157 struct page *p;
f28491e0 5158 struct address_space *mapping = fs_info->btree_inode->i_mapping;
d1310b2e 5159 int uptodate = 1;
19fe0a8b 5160 int ret;
d1310b2e 5161
da17066c 5162 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
c871b0f2
LB
5163 btrfs_err(fs_info, "bad tree block start %llu", start);
5164 return ERR_PTR(-EINVAL);
5165 }
5166
f28491e0 5167 eb = find_extent_buffer(fs_info, start);
452c75c3 5168 if (eb)
6af118ce 5169 return eb;
6af118ce 5170
23d79d81 5171 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 5172 if (!eb)
c871b0f2 5173 return ERR_PTR(-ENOMEM);
d1310b2e 5174
65ad0104 5175 num_pages = num_extent_pages(eb);
727011e0 5176 for (i = 0; i < num_pages; i++, index++) {
d1b5c567 5177 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
5178 if (!p) {
5179 exists = ERR_PTR(-ENOMEM);
6af118ce 5180 goto free_eb;
c871b0f2 5181 }
4f2de97a
JB
5182
5183 spin_lock(&mapping->private_lock);
5184 if (PagePrivate(p)) {
5185 /*
5186 * We could have already allocated an eb for this page and
5187 * attached one, so let's see if we can get a ref on the
5188 * existing eb. If we can, we know it's good and we can just
5189 * return that one; otherwise we know we can safely overwrite
5190 * page->private.
5191 */
5192 exists = (struct extent_buffer *)p->private;
5193 if (atomic_inc_not_zero(&exists->refs)) {
5194 spin_unlock(&mapping->private_lock);
5195 unlock_page(p);
09cbfeaf 5196 put_page(p);
2457aec6 5197 mark_extent_buffer_accessed(exists, p);
4f2de97a
JB
5198 goto free_eb;
5199 }
5ca64f45 5200 exists = NULL;
4f2de97a 5201
0b32f4bb 5202 /*
4f2de97a
JB
5203 * Do this so attach_extent_buffer_page() doesn't complain,
5204 * and drop the page reference the previous owner held.
5205 */
5206 ClearPagePrivate(p);
0b32f4bb 5207 WARN_ON(PageDirty(p));
09cbfeaf 5208 put_page(p);
d1310b2e 5209 }
4f2de97a
JB
5210 attach_extent_buffer_page(eb, p);
5211 spin_unlock(&mapping->private_lock);
0b32f4bb 5212 WARN_ON(PageDirty(p));
727011e0 5213 eb->pages[i] = p;
d1310b2e
CM
5214 if (!PageUptodate(p))
5215 uptodate = 0;
eb14ab8e
CM
5216
5217 /*
b16d011e
NB
5218 * We can't unlock the pages just yet since the extent buffer
5219 * hasn't been properly inserted into the radix tree; that
5220 * would open a race with btree_releasepage(), which can free
5221 * a page while we are still filling in all pages for the
5222 * buffer and we could crash.
eb14ab8e 5223 */
d1310b2e
CM
5224 }
5225 if (uptodate)
b4ce94de 5226 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
115391d2 5227again:
e1860a77 5228 ret = radix_tree_preload(GFP_NOFS);
c871b0f2
LB
5229 if (ret) {
5230 exists = ERR_PTR(ret);
19fe0a8b 5231 goto free_eb;
c871b0f2 5232 }
19fe0a8b 5233
f28491e0
JB
5234 spin_lock(&fs_info->buffer_lock);
5235 ret = radix_tree_insert(&fs_info->buffer_radix,
09cbfeaf 5236 start >> PAGE_SHIFT, eb);
f28491e0 5237 spin_unlock(&fs_info->buffer_lock);
452c75c3 5238 radix_tree_preload_end();
19fe0a8b 5239 if (ret == -EEXIST) {
f28491e0 5240 exists = find_extent_buffer(fs_info, start);
452c75c3
CS
5241 if (exists)
5242 goto free_eb;
5243 else
115391d2 5244 goto again;
6af118ce 5245 }
6af118ce 5246 /* add one reference for the tree */
0b32f4bb 5247 check_buffer_tree_ref(eb);
34b41ace 5248 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
5249
5250 /*
b16d011e
NB
5251 * Now it's safe to unlock the pages because any calls to
5252 * btree_releasepage will correctly detect that a page belongs to a
5253 * live buffer and won't free them prematurely.
eb14ab8e 5254 */
28187ae5
NB
5255 for (i = 0; i < num_pages; i++)
5256 unlock_page(eb->pages[i]);
d1310b2e
CM
5257 return eb;
5258
6af118ce 5259free_eb:
5ca64f45 5260 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
5261 for (i = 0; i < num_pages; i++) {
5262 if (eb->pages[i])
5263 unlock_page(eb->pages[i]);
5264 }
eb14ab8e 5265
897ca6e9 5266 btrfs_release_extent_buffer(eb);
6af118ce 5267 return exists;
d1310b2e 5268}
d1310b2e 5269
3083ee2e
JB
5270static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5271{
5272 struct extent_buffer *eb =
5273 container_of(head, struct extent_buffer, rcu_head);
5274
5275 __free_extent_buffer(eb);
5276}
5277
f7a52a40 5278static int release_extent_buffer(struct extent_buffer *eb)
3083ee2e 5279{
07e21c4d
NB
5280 lockdep_assert_held(&eb->refs_lock);
5281
3083ee2e
JB
5282 WARN_ON(atomic_read(&eb->refs) == 0);
5283 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 5284 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 5285 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 5286
815a51c7 5287 spin_unlock(&eb->refs_lock);
3083ee2e 5288
f28491e0
JB
5289 spin_lock(&fs_info->buffer_lock);
5290 radix_tree_delete(&fs_info->buffer_radix,
09cbfeaf 5291 eb->start >> PAGE_SHIFT);
f28491e0 5292 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
5293 } else {
5294 spin_unlock(&eb->refs_lock);
815a51c7 5295 }
3083ee2e
JB
5296
5297 /* Should be safe to release our pages at this point */
55ac0139 5298 btrfs_release_extent_buffer_pages(eb);
bcb7e449 5299#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 5300 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
5301 __free_extent_buffer(eb);
5302 return 1;
5303 }
5304#endif
3083ee2e 5305 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 5306 return 1;
3083ee2e
JB
5307 }
5308 spin_unlock(&eb->refs_lock);
e64860aa
JB
5309
5310 return 0;
3083ee2e
JB
5311}
5312
d1310b2e
CM
5313void free_extent_buffer(struct extent_buffer *eb)
5314{
242e18c7
CM
5315 int refs;
5316 int old;
d1310b2e
CM
5317 if (!eb)
5318 return;
5319
242e18c7
CM
5320 while (1) {
5321 refs = atomic_read(&eb->refs);
46cc775e
NB
5322 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
5323 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
5324 refs == 1))
242e18c7
CM
5325 break;
5326 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5327 if (old == refs)
5328 return;
5329 }
5330
3083ee2e
JB
5331 spin_lock(&eb->refs_lock);
5332 if (atomic_read(&eb->refs) == 2 &&
5333 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 5334 !extent_buffer_under_io(eb) &&
3083ee2e
JB
5335 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5336 atomic_dec(&eb->refs);
5337
5338 /*
5339 * I know this is terrible, but it's temporary until we stop tracking
5340 * the uptodate bits and such for the extent buffers.
5341 */
f7a52a40 5342 release_extent_buffer(eb);
3083ee2e
JB
5343}
5344
5345void free_extent_buffer_stale(struct extent_buffer *eb)
5346{
5347 if (!eb)
d1310b2e
CM
5348 return;
5349
3083ee2e
JB
5350 spin_lock(&eb->refs_lock);
5351 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5352
0b32f4bb 5353 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
5354 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5355 atomic_dec(&eb->refs);
f7a52a40 5356 release_extent_buffer(eb);
d1310b2e 5357}
d1310b2e 5358
1d4284bd 5359void clear_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 5360{
cc5e31a4
DS
5361 int i;
5362 int num_pages;
d1310b2e
CM
5363 struct page *page;
5364
65ad0104 5365 num_pages = num_extent_pages(eb);
d1310b2e
CM
5366
5367 for (i = 0; i < num_pages; i++) {
fb85fc9a 5368 page = eb->pages[i];
b9473439 5369 if (!PageDirty(page))
d2c3f4f6
CM
5370 continue;
5371
a61e6f29 5372 lock_page(page);
eb14ab8e
CM
5373 WARN_ON(!PagePrivate(page));
5374
d1310b2e 5375 clear_page_dirty_for_io(page);
b93b0163 5376 xa_lock_irq(&page->mapping->i_pages);
0a943c65
MW
5377 if (!PageDirty(page))
5378 __xa_clear_mark(&page->mapping->i_pages,
5379 page_index(page), PAGECACHE_TAG_DIRTY);
b93b0163 5380 xa_unlock_irq(&page->mapping->i_pages);
bf0da8c1 5381 ClearPageError(page);
a61e6f29 5382 unlock_page(page);
d1310b2e 5383 }
0b32f4bb 5384 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 5385}
d1310b2e 5386
abb57ef3 5387bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 5388{
cc5e31a4
DS
5389 int i;
5390 int num_pages;
abb57ef3 5391 bool was_dirty;
d1310b2e 5392
0b32f4bb
JB
5393 check_buffer_tree_ref(eb);
5394
b9473439 5395 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 5396
65ad0104 5397 num_pages = num_extent_pages(eb);
3083ee2e 5398 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
5399 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5400
abb57ef3
LB
5401 if (!was_dirty)
5402 for (i = 0; i < num_pages; i++)
5403 set_page_dirty(eb->pages[i]);
51995c39
LB
5404
5405#ifdef CONFIG_BTRFS_DEBUG
5406 for (i = 0; i < num_pages; i++)
5407 ASSERT(PageDirty(eb->pages[i]));
5408#endif
5409
b9473439 5410 return was_dirty;
d1310b2e 5411}
d1310b2e 5412
69ba3927 5413void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 5414{
cc5e31a4 5415 int i;
1259ab75 5416 struct page *page;
cc5e31a4 5417 int num_pages;
1259ab75 5418
b4ce94de 5419 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5420 num_pages = num_extent_pages(eb);
1259ab75 5421 for (i = 0; i < num_pages; i++) {
fb85fc9a 5422 page = eb->pages[i];
33958dc6
CM
5423 if (page)
5424 ClearPageUptodate(page);
1259ab75 5425 }
1259ab75
CM
5426}
5427
09c25a8c 5428void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 5429{
cc5e31a4 5430 int i;
d1310b2e 5431 struct page *page;
cc5e31a4 5432 int num_pages;
d1310b2e 5433
0b32f4bb 5434 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5435 num_pages = num_extent_pages(eb);
d1310b2e 5436 for (i = 0; i < num_pages; i++) {
fb85fc9a 5437 page = eb->pages[i];
d1310b2e
CM
5438 SetPageUptodate(page);
5439 }
d1310b2e 5440}
d1310b2e 5441
c2ccfbc6 5442int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 5443{
cc5e31a4 5444 int i;
d1310b2e
CM
5445 struct page *page;
5446 int err;
5447 int ret = 0;
ce9adaa5
CM
5448 int locked_pages = 0;
5449 int all_uptodate = 1;
cc5e31a4 5450 int num_pages;
727011e0 5451 unsigned long num_reads = 0;
a86c12c7 5452 struct bio *bio = NULL;
c8b97818 5453 unsigned long bio_flags = 0;
c2ccfbc6 5454 struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
a86c12c7 5455
b4ce94de 5456 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
5457 return 0;
5458
ae6957eb
DS
5459 ASSERT(tree == &BTRFS_I(eb->pages[0]->mapping->host)->io_tree);
5460
65ad0104 5461 num_pages = num_extent_pages(eb);
8436ea91 5462 for (i = 0; i < num_pages; i++) {
fb85fc9a 5463 page = eb->pages[i];
bb82ab88 5464 if (wait == WAIT_NONE) {
2db04966 5465 if (!trylock_page(page))
ce9adaa5 5466 goto unlock_exit;
d1310b2e
CM
5467 } else {
5468 lock_page(page);
5469 }
ce9adaa5 5470 locked_pages++;
2571e739
LB
5471 }
5472 /*
5473 * We need to lock all pages first to make sure that the
5474 * uptodate bit of our pages won't be affected by
5475 * clear_extent_buffer_uptodate().
5476 */
8436ea91 5477 for (i = 0; i < num_pages; i++) {
2571e739 5478 page = eb->pages[i];
727011e0
CM
5479 if (!PageUptodate(page)) {
5480 num_reads++;
ce9adaa5 5481 all_uptodate = 0;
727011e0 5482 }
ce9adaa5 5483 }
2571e739 5484
ce9adaa5 5485 if (all_uptodate) {
8436ea91 5486 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
5487 goto unlock_exit;
5488 }
5489
656f30db 5490 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 5491 eb->read_mirror = 0;
0b32f4bb 5492 atomic_set(&eb->io_pages, num_reads);
8436ea91 5493 for (i = 0; i < num_pages; i++) {
fb85fc9a 5494 page = eb->pages[i];
baf863b9 5495
ce9adaa5 5496 if (!PageUptodate(page)) {
baf863b9
LB
5497 if (ret) {
5498 atomic_dec(&eb->io_pages);
5499 unlock_page(page);
5500 continue;
5501 }
5502
f188591e 5503 ClearPageError(page);
a86c12c7 5504 err = __extent_read_full_page(tree, page,
6af49dbd 5505 btree_get_extent, &bio,
d4c7ca86 5506 mirror_num, &bio_flags,
1f7ad75b 5507 REQ_META);
baf863b9 5508 if (err) {
d1310b2e 5509 ret = err;
baf863b9
LB
5510 /*
5511 * We pass &bio to __extent_read_full_page() above, so
5512 * if it returns an error the current page was not added
5513 * to the bio and has already been unlocked.
5514 *
5515 * In that case we must decrement io_pages ourselves,
5516 * because no bio completion will do it for this page.
5517 */
5518 atomic_dec(&eb->io_pages);
5519 }
d1310b2e
CM
5520 } else {
5521 unlock_page(page);
5522 }
5523 }
5524
355808c2 5525 if (bio) {
1f7ad75b 5526 err = submit_one_bio(bio, mirror_num, bio_flags);
79787eaa
JM
5527 if (err)
5528 return err;
355808c2 5529 }
a86c12c7 5530
bb82ab88 5531 if (ret || wait != WAIT_COMPLETE)
d1310b2e 5532 return ret;
d397712b 5533
8436ea91 5534 for (i = 0; i < num_pages; i++) {
fb85fc9a 5535 page = eb->pages[i];
d1310b2e 5536 wait_on_page_locked(page);
d397712b 5537 if (!PageUptodate(page))
d1310b2e 5538 ret = -EIO;
d1310b2e 5539 }
d397712b 5540
d1310b2e 5541 return ret;
ce9adaa5
CM
5542
5543unlock_exit:
d397712b 5544 while (locked_pages > 0) {
ce9adaa5 5545 locked_pages--;
8436ea91
JB
5546 page = eb->pages[locked_pages];
5547 unlock_page(page);
ce9adaa5
CM
5548 }
5549 return ret;
d1310b2e 5550}
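
/*
 * Editorial usage sketch, not part of the original source; eb and
 * mirror_num are assumed to be provided by the caller.  The common
 * synchronous pattern is:
 *
 *     ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
 *     if (ret)
 *             return ret;
 *     if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 *             return -EIO;
 *
 * With WAIT_NONE the function returns without waiting for the reads it
 * submitted, so the caller cannot assume the buffer is uptodate on return.
 */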
d1310b2e 5551
1cbb1f45
JM
5552void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5553 unsigned long start, unsigned long len)
d1310b2e
CM
5554{
5555 size_t cur;
5556 size_t offset;
5557 struct page *page;
5558 char *kaddr;
5559 char *dst = (char *)dstv;
7073017a 5560 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5561 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e 5562
f716abd5
LB
5563 if (start + len > eb->len) {
5564 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5565 eb->start, eb->len, start, len);
5566 memset(dst, 0, len);
5567 return;
5568 }
d1310b2e 5569
7073017a 5570 offset = offset_in_page(start_offset + start);
d1310b2e 5571
d397712b 5572 while (len > 0) {
fb85fc9a 5573 page = eb->pages[i];
d1310b2e 5574
09cbfeaf 5575 cur = min(len, (PAGE_SIZE - offset));
a6591715 5576 kaddr = page_address(page);
d1310b2e 5577 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
5578
5579 dst += cur;
5580 len -= cur;
5581 offset = 0;
5582 i++;
5583 }
5584}
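
/*
 * Editorial usage sketch, not part of the original source; eb and offset
 * are assumed to come from the caller.  A typical use is copying a small,
 * fixed-size on-disk structure out of a tree block into a stack variable:
 *
 *     struct btrfs_disk_key disk_key;
 *
 *     read_extent_buffer(eb, &disk_key, offset, sizeof(disk_key));
 *
 * The loop above handles copies that straddle page boundaries, so callers
 * never need to split the copy themselves.
 */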
d1310b2e 5585
1cbb1f45
JM
5586int read_extent_buffer_to_user(const struct extent_buffer *eb,
5587 void __user *dstv,
5588 unsigned long start, unsigned long len)
550ac1d8
GH
5589{
5590 size_t cur;
5591 size_t offset;
5592 struct page *page;
5593 char *kaddr;
5594 char __user *dst = (char __user *)dstv;
7073017a 5595 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5596 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
550ac1d8
GH
5597 int ret = 0;
5598
5599 WARN_ON(start > eb->len);
5600 WARN_ON(start + len > eb->start + eb->len);
5601
7073017a 5602 offset = offset_in_page(start_offset + start);
550ac1d8
GH
5603
5604 while (len > 0) {
fb85fc9a 5605 page = eb->pages[i];
550ac1d8 5606
09cbfeaf 5607 cur = min(len, (PAGE_SIZE - offset));
550ac1d8
GH
5608 kaddr = page_address(page);
5609 if (copy_to_user(dst, kaddr + offset, cur)) {
5610 ret = -EFAULT;
5611 break;
5612 }
5613
5614 dst += cur;
5615 len -= cur;
5616 offset = 0;
5617 i++;
5618 }
5619
5620 return ret;
5621}
5622
415b35a5
LB
5623/*
5624 * Return 0 if the item is found within a page.
5625 * Return 1 if the item spans two pages.
5626 * Return -EINVAL otherwise.
5627 */
1cbb1f45
JM
5628int map_private_extent_buffer(const struct extent_buffer *eb,
5629 unsigned long start, unsigned long min_len,
5630 char **map, unsigned long *map_start,
5631 unsigned long *map_len)
d1310b2e 5632{
cc2c39d6 5633 size_t offset;
d1310b2e
CM
5634 char *kaddr;
5635 struct page *p;
7073017a 5636 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5637 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e 5638 unsigned long end_i = (start_offset + start + min_len - 1) >>
09cbfeaf 5639 PAGE_SHIFT;
d1310b2e 5640
f716abd5
LB
5641 if (start + min_len > eb->len) {
5642 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5643 eb->start, eb->len, start, min_len);
5644 return -EINVAL;
5645 }
5646
d1310b2e 5647 if (i != end_i)
415b35a5 5648 return 1;
d1310b2e
CM
5649
5650 if (i == 0) {
5651 offset = start_offset;
5652 *map_start = 0;
5653 } else {
5654 offset = 0;
09cbfeaf 5655 *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
d1310b2e 5656 }
d397712b 5657
fb85fc9a 5658 p = eb->pages[i];
a6591715 5659 kaddr = page_address(p);
d1310b2e 5660 *map = kaddr + offset;
09cbfeaf 5661 *map_len = PAGE_SIZE - offset;
d1310b2e
CM
5662 return 0;
5663}
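
/*
 * Editorial caller sketch, not part of the original source; eb, start,
 * map, map_start, map_len, leval and val are assumed local variables.
 * Because the mapping only succeeds when the requested range fits in one
 * page, callers typically fall back to the copying helper when 1 is
 * returned:
 *
 *     err = map_private_extent_buffer(eb, start, sizeof(u64),
 *                                     &map, &map_start, &map_len);
 *     if (err == 0) {
 *             val = get_unaligned_le64(map + start - map_start);
 *     } else if (err == 1) {
 *             read_extent_buffer(eb, &leval, start, sizeof(leval));
 *             val = le64_to_cpu(leval);
 *     } else {
 *             return err;
 *     }
 */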
d1310b2e 5664
1cbb1f45
JM
5665int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5666 unsigned long start, unsigned long len)
d1310b2e
CM
5667{
5668 size_t cur;
5669 size_t offset;
5670 struct page *page;
5671 char *kaddr;
5672 char *ptr = (char *)ptrv;
7073017a 5673 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5674 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5675 int ret = 0;
5676
5677 WARN_ON(start > eb->len);
5678 WARN_ON(start + len > eb->start + eb->len);
5679
7073017a 5680 offset = offset_in_page(start_offset + start);
d1310b2e 5681
d397712b 5682 while (len > 0) {
fb85fc9a 5683 page = eb->pages[i];
d1310b2e 5684
09cbfeaf 5685 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 5686
a6591715 5687 kaddr = page_address(page);
d1310b2e 5688 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
5689 if (ret)
5690 break;
5691
5692 ptr += cur;
5693 len -= cur;
5694 offset = 0;
5695 i++;
5696 }
5697 return ret;
5698}
d1310b2e 5699
f157bf76
DS
5700void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
5701 const void *srcv)
5702{
5703 char *kaddr;
5704
5705 WARN_ON(!PageUptodate(eb->pages[0]));
5706 kaddr = page_address(eb->pages[0]);
5707 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5708 BTRFS_FSID_SIZE);
5709}
5710
5711void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
5712{
5713 char *kaddr;
5714
5715 WARN_ON(!PageUptodate(eb->pages[0]));
5716 kaddr = page_address(eb->pages[0]);
5717 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5718 BTRFS_FSID_SIZE);
5719}
5720
d1310b2e
CM
5721void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5722 unsigned long start, unsigned long len)
5723{
5724 size_t cur;
5725 size_t offset;
5726 struct page *page;
5727 char *kaddr;
5728 char *src = (char *)srcv;
7073017a 5729 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5730 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5731
5732 WARN_ON(start > eb->len);
5733 WARN_ON(start + len > eb->start + eb->len);
5734
7073017a 5735 offset = offset_in_page(start_offset + start);
d1310b2e 5736
d397712b 5737 while (len > 0) {
fb85fc9a 5738 page = eb->pages[i];
d1310b2e
CM
5739 WARN_ON(!PageUptodate(page));
5740
09cbfeaf 5741 cur = min(len, PAGE_SIZE - offset);
a6591715 5742 kaddr = page_address(page);
d1310b2e 5743 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
5744
5745 src += cur;
5746 len -= cur;
5747 offset = 0;
5748 i++;
5749 }
5750}
d1310b2e 5751
b159fa28
DS
5752void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
5753 unsigned long len)
d1310b2e
CM
5754{
5755 size_t cur;
5756 size_t offset;
5757 struct page *page;
5758 char *kaddr;
7073017a 5759 size_t start_offset = offset_in_page(eb->start);
09cbfeaf 5760 unsigned long i = (start_offset + start) >> PAGE_SHIFT;
d1310b2e
CM
5761
5762 WARN_ON(start > eb->len);
5763 WARN_ON(start + len > eb->start + eb->len);
5764
7073017a 5765 offset = offset_in_page(start_offset + start);
d1310b2e 5766
d397712b 5767 while (len > 0) {
fb85fc9a 5768 page = eb->pages[i];
d1310b2e
CM
5769 WARN_ON(!PageUptodate(page));
5770
09cbfeaf 5771 cur = min(len, PAGE_SIZE - offset);
a6591715 5772 kaddr = page_address(page);
b159fa28 5773 memset(kaddr + offset, 0, cur);
d1310b2e
CM
5774
5775 len -= cur;
5776 offset = 0;
5777 i++;
5778 }
5779}
d1310b2e 5780
58e8012c
DS
5781void copy_extent_buffer_full(struct extent_buffer *dst,
5782 struct extent_buffer *src)
5783{
5784 int i;
cc5e31a4 5785 int num_pages;
58e8012c
DS
5786
5787 ASSERT(dst->len == src->len);
5788
65ad0104 5789 num_pages = num_extent_pages(dst);
58e8012c
DS
5790 for (i = 0; i < num_pages; i++)
5791 copy_page(page_address(dst->pages[i]),
5792 page_address(src->pages[i]));
5793}
5794
d1310b2e
CM
5795void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5796 unsigned long dst_offset, unsigned long src_offset,
5797 unsigned long len)
5798{
5799 u64 dst_len = dst->len;
5800 size_t cur;
5801 size_t offset;
5802 struct page *page;
5803 char *kaddr;
7073017a 5804 size_t start_offset = offset_in_page(dst->start);
09cbfeaf 5805 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
d1310b2e
CM
5806
5807 WARN_ON(src->len != dst_len);
5808
7073017a 5809 offset = offset_in_page(start_offset + dst_offset);
d1310b2e 5810
d397712b 5811 while (len > 0) {
fb85fc9a 5812 page = dst->pages[i];
d1310b2e
CM
5813 WARN_ON(!PageUptodate(page));
5814
09cbfeaf 5815 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 5816
a6591715 5817 kaddr = page_address(page);
d1310b2e 5818 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
5819
5820 src_offset += cur;
5821 len -= cur;
5822 offset = 0;
5823 i++;
5824 }
5825}
d1310b2e 5826
3e1e8bb7
OS
5827/*
5828 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
5829 * given bit number
5830 * @eb: the extent buffer
5831 * @start: offset of the bitmap item in the extent buffer
5832 * @nr: bit number
5833 * @page_index: return index of the page in the extent buffer that contains the
5834 * given bit number
5835 * @page_offset: return offset into the page given by page_index
5836 *
5837 * This helper hides the ugliness of finding the byte in an extent buffer which
5838 * contains a given bit.
5839 */
5840static inline void eb_bitmap_offset(struct extent_buffer *eb,
5841 unsigned long start, unsigned long nr,
5842 unsigned long *page_index,
5843 size_t *page_offset)
5844{
7073017a 5845 size_t start_offset = offset_in_page(eb->start);
3e1e8bb7
OS
5846 size_t byte_offset = BIT_BYTE(nr);
5847 size_t offset;
5848
5849 /*
5850 * The byte we want is the offset of the extent buffer + the offset of
5851 * the bitmap item in the extent buffer + the offset of the byte in the
5852 * bitmap item.
5853 */
5854 offset = start_offset + start + byte_offset;
5855
09cbfeaf 5856 *page_index = offset >> PAGE_SHIFT;
7073017a 5857 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
5858}
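
/*
 * Editorial worked example, not part of the original source, assuming a
 * 4K page size and an extent buffer that starts on a page boundary
 * (start_offset == 0).  For a bitmap item at start == 100 and bit number
 * nr == 70:
 *
 *     byte_offset  = BIT_BYTE(70)         = 70 / 8 = 8
 *     offset       = 0 + 100 + 8          = 108
 *     *page_index  = 108 >> PAGE_SHIFT    = 0
 *     *page_offset = offset_in_page(108)  = 108
 *
 * i.e. bit 70 of that bitmap lives in byte 108 of the buffer's first page.
 */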
5859
5860/**
5861 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
5862 * @eb: the extent buffer
5863 * @start: offset of the bitmap item in the extent buffer
5864 * @nr: bit number to test
5865 */
5866int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
5867 unsigned long nr)
5868{
2fe1d551 5869 u8 *kaddr;
3e1e8bb7
OS
5870 struct page *page;
5871 unsigned long i;
5872 size_t offset;
5873
5874 eb_bitmap_offset(eb, start, nr, &i, &offset);
5875 page = eb->pages[i];
5876 WARN_ON(!PageUptodate(page));
5877 kaddr = page_address(page);
5878 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5879}
5880
5881/**
5882 * extent_buffer_bitmap_set - set an area of a bitmap
5883 * @eb: the extent buffer
5884 * @start: offset of the bitmap item in the extent buffer
5885 * @pos: bit number of the first bit
5886 * @len: number of bits to set
5887 */
5888void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
5889 unsigned long pos, unsigned long len)
5890{
2fe1d551 5891 u8 *kaddr;
3e1e8bb7
OS
5892 struct page *page;
5893 unsigned long i;
5894 size_t offset;
5895 const unsigned int size = pos + len;
5896 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5897 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5898
5899 eb_bitmap_offset(eb, start, pos, &i, &offset);
5900 page = eb->pages[i];
5901 WARN_ON(!PageUptodate(page));
5902 kaddr = page_address(page);
5903
5904 while (len >= bits_to_set) {
5905 kaddr[offset] |= mask_to_set;
5906 len -= bits_to_set;
5907 bits_to_set = BITS_PER_BYTE;
9c894696 5908 mask_to_set = ~0;
09cbfeaf 5909 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5910 offset = 0;
5911 page = eb->pages[++i];
5912 WARN_ON(!PageUptodate(page));
5913 kaddr = page_address(page);
5914 }
5915 }
5916 if (len) {
5917 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5918 kaddr[offset] |= mask_to_set;
5919 }
5920}
5921
5922
5923/**
5924 * extent_buffer_bitmap_clear - clear an area of a bitmap
5925 * @eb: the extent buffer
5926 * @start: offset of the bitmap item in the extent buffer
5927 * @pos: bit number of the first bit
5928 * @len: number of bits to clear
5929 */
5930void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
5931 unsigned long pos, unsigned long len)
5932{
2fe1d551 5933 u8 *kaddr;
3e1e8bb7
OS
5934 struct page *page;
5935 unsigned long i;
5936 size_t offset;
5937 const unsigned int size = pos + len;
5938 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 5939 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
5940
5941 eb_bitmap_offset(eb, start, pos, &i, &offset);
5942 page = eb->pages[i];
5943 WARN_ON(!PageUptodate(page));
5944 kaddr = page_address(page);
5945
5946 while (len >= bits_to_clear) {
5947 kaddr[offset] &= ~mask_to_clear;
5948 len -= bits_to_clear;
5949 bits_to_clear = BITS_PER_BYTE;
9c894696 5950 mask_to_clear = ~0;
09cbfeaf 5951 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
5952 offset = 0;
5953 page = eb->pages[++i];
5954 WARN_ON(!PageUptodate(page));
5955 kaddr = page_address(page);
5956 }
5957 }
5958 if (len) {
5959 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5960 kaddr[offset] &= ~mask_to_clear;
5961 }
5962}
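
/*
 * Editorial usage sketch, not part of the original source; leaf,
 * bitmap_start, first_bit and nbits stand for caller-provided values.
 * The three bitmap helpers operate on a bitmap item stored at byte
 * offset bitmap_start inside the extent buffer:
 *
 *     extent_buffer_bitmap_set(leaf, bitmap_start, first_bit, nbits);
 *     if (extent_buffer_test_bit(leaf, bitmap_start, first_bit))
 *             extent_buffer_bitmap_clear(leaf, bitmap_start,
 *                                        first_bit, nbits);
 *
 * This is the kind of manipulation done on bitmap items such as those
 * used by the free space tree.
 */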
5963
3387206f
ST
5964static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5965{
5966 unsigned long distance = (src > dst) ? src - dst : dst - src;
5967 return distance < len;
5968}
5969
d1310b2e
CM
5970static void copy_pages(struct page *dst_page, struct page *src_page,
5971 unsigned long dst_off, unsigned long src_off,
5972 unsigned long len)
5973{
a6591715 5974 char *dst_kaddr = page_address(dst_page);
d1310b2e 5975 char *src_kaddr;
727011e0 5976 int must_memmove = 0;
d1310b2e 5977
3387206f 5978 if (dst_page != src_page) {
a6591715 5979 src_kaddr = page_address(src_page);
3387206f 5980 } else {
d1310b2e 5981 src_kaddr = dst_kaddr;
727011e0
CM
5982 if (areas_overlap(src_off, dst_off, len))
5983 must_memmove = 1;
3387206f 5984 }
d1310b2e 5985
727011e0
CM
5986 if (must_memmove)
5987 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5988 else
5989 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
5990}
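
/*
 * Editorial worked example, not part of the original source: for a copy
 * within a single page with src_off == 100, dst_off == 150 and
 * len == 100, the ranges [100, 200) and [150, 250) overlap; the distance
 * is 50 < len, so areas_overlap() returns true and memmove() is used.
 * With len == 40 the ranges [100, 140) and [150, 190) are disjoint, the
 * distance (50) is not smaller than len, and the cheaper memcpy() is
 * used instead.
 */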
5991
5992void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5993 unsigned long src_offset, unsigned long len)
5994{
0b246afa 5995 struct btrfs_fs_info *fs_info = dst->fs_info;
d1310b2e
CM
5996 size_t cur;
5997 size_t dst_off_in_page;
5998 size_t src_off_in_page;
7073017a 5999 size_t start_offset = offset_in_page(dst->start);
d1310b2e
CM
6000 unsigned long dst_i;
6001 unsigned long src_i;
6002
6003 if (src_offset + len > dst->len) {
0b246afa 6004 btrfs_err(fs_info,
5d163e0e
JM
6005 "memmove bogus src_offset %lu move len %lu dst len %lu",
6006 src_offset, len, dst->len);
290342f6 6007 BUG();
d1310b2e
CM
6008 }
6009 if (dst_offset + len > dst->len) {
0b246afa 6010 btrfs_err(fs_info,
5d163e0e
JM
6011 "memmove bogus dst_offset %lu move len %lu dst len %lu",
6012 dst_offset, len, dst->len);
290342f6 6013 BUG();
d1310b2e
CM
6014 }
6015
d397712b 6016 while (len > 0) {
7073017a
JT
6017 dst_off_in_page = offset_in_page(start_offset + dst_offset);
6018 src_off_in_page = offset_in_page(start_offset + src_offset);
d1310b2e 6019
09cbfeaf
KS
6020 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
6021 src_i = (start_offset + src_offset) >> PAGE_SHIFT;
d1310b2e 6022
09cbfeaf 6023 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
6024 src_off_in_page));
6025 cur = min_t(unsigned long, cur,
09cbfeaf 6026 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 6027
fb85fc9a 6028 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6029 dst_off_in_page, src_off_in_page, cur);
6030
6031 src_offset += cur;
6032 dst_offset += cur;
6033 len -= cur;
6034 }
6035}
d1310b2e
CM
6036
6037void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
6038 unsigned long src_offset, unsigned long len)
6039{
0b246afa 6040 struct btrfs_fs_info *fs_info = dst->fs_info;
d1310b2e
CM
6041 size_t cur;
6042 size_t dst_off_in_page;
6043 size_t src_off_in_page;
6044 unsigned long dst_end = dst_offset + len - 1;
6045 unsigned long src_end = src_offset + len - 1;
7073017a 6046 size_t start_offset = offset_in_page(dst->start);
d1310b2e
CM
6047 unsigned long dst_i;
6048 unsigned long src_i;
6049
6050 if (src_offset + len > dst->len) {
0b246afa 6051 btrfs_err(fs_info,
5d163e0e
JM
6052 "memmove bogus src_offset %lu move len %lu len %lu",
6053 src_offset, len, dst->len);
290342f6 6054 BUG();
d1310b2e
CM
6055 }
6056 if (dst_offset + len > dst->len) {
0b246afa 6057 btrfs_err(fs_info,
5d163e0e
JM
6058 "memmove bogus dst_offset %lu move len %lu len %lu",
6059 dst_offset, len, dst->len);
290342f6 6060 BUG();
d1310b2e 6061 }
727011e0 6062 if (dst_offset < src_offset) {
d1310b2e
CM
6063 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6064 return;
6065 }
d397712b 6066 while (len > 0) {
09cbfeaf
KS
6067 dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
6068 src_i = (start_offset + src_end) >> PAGE_SHIFT;
d1310b2e 6069
7073017a
JT
6070 dst_off_in_page = offset_in_page(start_offset + dst_end);
6071 src_off_in_page = offset_in_page(start_offset + src_end);
d1310b2e
CM
6072
6073 cur = min_t(unsigned long, len, src_off_in_page + 1);
6074 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 6075 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6076 dst_off_in_page - cur + 1,
6077 src_off_in_page - cur + 1, cur);
6078
6079 dst_end -= cur;
6080 src_end -= cur;
6081 len -= cur;
6082 }
6083}
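
/*
 * Editorial note, not part of the original source: when dst_offset >=
 * src_offset and the two ranges overlap, copying front to back would
 * overwrite source bytes that have not been copied yet.  That is why the
 * loop above starts from dst_end/src_end and works backwards, with each
 * chunk bounded by the page boundaries of both the source and the
 * destination offsets.
 */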
6af118ce 6084
f7a52a40 6085int try_release_extent_buffer(struct page *page)
19fe0a8b 6086{
6af118ce 6087 struct extent_buffer *eb;
6af118ce 6088
3083ee2e 6089 /*
01327610 6090 * We need to make sure nobody is attaching this page to an eb right
3083ee2e
JB
6091 * now.
6092 */
6093 spin_lock(&page->mapping->private_lock);
6094 if (!PagePrivate(page)) {
6095 spin_unlock(&page->mapping->private_lock);
4f2de97a 6096 return 1;
45f49bce 6097 }
6af118ce 6098
3083ee2e
JB
6099 eb = (struct extent_buffer *)page->private;
6100 BUG_ON(!eb);
19fe0a8b
MX
6101
6102 /*
3083ee2e
JB
6103 * This is a little awful but should be OK: we need to make sure that
6104 * the eb doesn't disappear out from under us while we're looking at
6105 * this page.
19fe0a8b 6106 */
3083ee2e 6107 spin_lock(&eb->refs_lock);
0b32f4bb 6108 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
6109 spin_unlock(&eb->refs_lock);
6110 spin_unlock(&page->mapping->private_lock);
6111 return 0;
b9473439 6112 }
3083ee2e 6113 spin_unlock(&page->mapping->private_lock);
897ca6e9 6114
19fe0a8b 6115 /*
3083ee2e
JB
6116 * If tree ref isn't set then we know the ref on this eb is a real ref,
6117 * so just return, this page will likely be freed soon anyway.
19fe0a8b 6118 */
3083ee2e
JB
6119 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6120 spin_unlock(&eb->refs_lock);
6121 return 0;
b9473439 6122 }
19fe0a8b 6123
f7a52a40 6124 return release_extent_buffer(eb);
6af118ce 6125}