fs/btrfs/extent_io.c
b2441318 1// SPDX-License-Identifier: GPL-2.0
c1d7c514 2
d1310b2e
CM
3#include <linux/bitops.h>
4#include <linux/slab.h>
5#include <linux/bio.h>
6#include <linux/mm.h>
d1310b2e
CM
7#include <linux/pagemap.h>
8#include <linux/page-flags.h>
d1310b2e
CM
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
d1310b2e
CM
12#include <linux/writeback.h>
13#include <linux/pagevec.h>
268bb0ce 14#include <linux/prefetch.h>
90a887c9 15#include <linux/cleancache.h>
d1310b2e 16#include "extent_io.h"
9c7d3a54 17#include "extent-io-tree.h"
d1310b2e 18#include "extent_map.h"
902b22f3
DW
19#include "ctree.h"
20#include "btrfs_inode.h"
4a54c8c1 21#include "volumes.h"
21adbd5c 22#include "check-integrity.h"
0b32f4bb 23#include "locking.h"
606686ee 24#include "rcu-string.h"
fe09e16c 25#include "backref.h"
6af49dbd 26#include "disk-io.h"
760f991f 27#include "subpage.h"
d3575156 28#include "zoned.h"
d1310b2e 29
d1310b2e
CM
30static struct kmem_cache *extent_state_cache;
31static struct kmem_cache *extent_buffer_cache;
8ac9f7c1 32static struct bio_set btrfs_bioset;
d1310b2e 33
27a3507d
FM
34static inline bool extent_state_in_tree(const struct extent_state *state)
35{
36 return !RB_EMPTY_NODE(&state->rb_node);
37}
38
6d49ba1b 39#ifdef CONFIG_BTRFS_DEBUG
d1310b2e 40static LIST_HEAD(states);
d397712b 41static DEFINE_SPINLOCK(leak_lock);
6d49ba1b 42
3fd63727
JB
43static inline void btrfs_leak_debug_add(spinlock_t *lock,
44 struct list_head *new,
45 struct list_head *head)
6d49ba1b
ES
46{
47 unsigned long flags;
48
3fd63727 49 spin_lock_irqsave(lock, flags);
6d49ba1b 50 list_add(new, head);
3fd63727 51 spin_unlock_irqrestore(lock, flags);
6d49ba1b
ES
52}
53
3fd63727
JB
54static inline void btrfs_leak_debug_del(spinlock_t *lock,
55 struct list_head *entry)
6d49ba1b
ES
56{
57 unsigned long flags;
58
3fd63727 59 spin_lock_irqsave(lock, flags);
6d49ba1b 60 list_del(entry);
3fd63727 61 spin_unlock_irqrestore(lock, flags);
6d49ba1b
ES
62}
63
3fd63727 64void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
6d49ba1b 65{
6d49ba1b 66 struct extent_buffer *eb;
3fd63727 67 unsigned long flags;
6d49ba1b 68
8c38938c
JB
69 /*
70 * If we didn't get into open_ctree our allocated_ebs will not be
71 * initialized, so just skip this.
72 */
73 if (!fs_info->allocated_ebs.next)
74 return;
75
3fd63727
JB
76 spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
77 while (!list_empty(&fs_info->allocated_ebs)) {
78 eb = list_first_entry(&fs_info->allocated_ebs,
79 struct extent_buffer, leak_list);
8c38938c
JB
80 pr_err(
81 "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
82 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
83 btrfs_header_owner(eb));
33ca832f
JB
84 list_del(&eb->leak_list);
85 kmem_cache_free(extent_buffer_cache, eb);
86 }
3fd63727 87 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
33ca832f
JB
88}
89
90static inline void btrfs_extent_state_leak_debug_check(void)
91{
92 struct extent_state *state;
93
6d49ba1b
ES
94 while (!list_empty(&states)) {
95 state = list_entry(states.next, struct extent_state, leak_list);
9ee49a04 96 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
27a3507d
FM
97 state->start, state->end, state->state,
98 extent_state_in_tree(state),
b7ac31b7 99 refcount_read(&state->refs));
6d49ba1b
ES
100 list_del(&state->leak_list);
101 kmem_cache_free(extent_state_cache, state);
102 }
6d49ba1b 103}
8d599ae1 104
a5dee37d
JB
105#define btrfs_debug_check_extent_io_range(tree, start, end) \
106 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
8d599ae1 107static inline void __btrfs_debug_check_extent_io_range(const char *caller,
a5dee37d 108 struct extent_io_tree *tree, u64 start, u64 end)
8d599ae1 109{
65a680f6
NB
110 struct inode *inode = tree->private_data;
111 u64 isize;
112
113 if (!inode || !is_data_inode(inode))
114 return;
115
116 isize = i_size_read(inode);
117 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
118 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
119 "%s: ino %llu isize %llu odd range [%llu,%llu]",
120 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
121 }
8d599ae1 122}
6d49ba1b 123#else
3fd63727
JB
124#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
125#define btrfs_leak_debug_del(lock, entry) do {} while (0)
33ca832f 126#define btrfs_extent_state_leak_debug_check() do {} while (0)
8d599ae1 127#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
4bef0848 128#endif
d1310b2e 129
d1310b2e
CM
130struct tree_entry {
131 u64 start;
132 u64 end;
d1310b2e
CM
133 struct rb_node rb_node;
134};
135
136struct extent_page_data {
137 struct bio *bio;
771ed689
CM
138 /* tells writepage not to lock the state bits for this range;
139 * it still does the unlocking
140 */
ffbd517d
CM
141 unsigned int extent_locked:1;
142
70fd7614 143 /* tells the submit_bio code to use REQ_SYNC */
ffbd517d 144 unsigned int sync_io:1;
d1310b2e
CM
145};
146
f97e27e9 147static int add_extent_changeset(struct extent_state *state, u32 bits,
d38ed27f
QW
148 struct extent_changeset *changeset,
149 int set)
150{
151 int ret;
152
153 if (!changeset)
57599c7e 154 return 0;
d38ed27f 155 if (set && (state->state & bits) == bits)
57599c7e 156 return 0;
fefdc557 157 if (!set && (state->state & bits) == 0)
57599c7e 158 return 0;
d38ed27f 159 changeset->bytes_changed += state->end - state->start + 1;
53d32359 160 ret = ulist_add(&changeset->range_changed, state->start, state->end,
d38ed27f 161 GFP_ATOMIC);
57599c7e 162 return ret;
d38ed27f
QW
163}
164
c1be9c1a
NB
165int __must_check submit_one_bio(struct bio *bio, int mirror_num,
166 unsigned long bio_flags)
bb58eb9e
QW
167{
168 blk_status_t ret = 0;
bb58eb9e 169 struct extent_io_tree *tree = bio->bi_private;
bb58eb9e
QW
170
171 bio->bi_private = NULL;
172
908930f3
NB
173 if (is_data_inode(tree->private_data))
174 ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
175 bio_flags);
176 else
1b36294a
NB
177 ret = btrfs_submit_metadata_bio(tree->private_data, bio,
178 mirror_num, bio_flags);
bb58eb9e
QW
179
180 return blk_status_to_errno(ret);
181}
182
3065976b
QW
183/* Cleanup unsubmitted bios */
184static void end_write_bio(struct extent_page_data *epd, int ret)
185{
186 if (epd->bio) {
187 epd->bio->bi_status = errno_to_blk_status(ret);
188 bio_endio(epd->bio);
189 epd->bio = NULL;
190 }
191}
192
f4340622
QW
193/*
194 * Submit bio from extent page data via submit_one_bio
195 *
196 * Return 0 if everything is OK.
197 * Return <0 for error.
198 */
199static int __must_check flush_write_bio(struct extent_page_data *epd)
bb58eb9e 200{
f4340622 201 int ret = 0;
bb58eb9e 202
f4340622 203 if (epd->bio) {
bb58eb9e 204 ret = submit_one_bio(epd->bio, 0, 0);
f4340622
QW
205 /*
206 * Clean up of epd->bio is handled by its endio function.
207 * And endio is either triggered by successful bio execution
208 * or the error handler of submit bio hook.
209 * So at this point, no matter what happened, we don't need
210 * to clean up epd->bio.
211 */
bb58eb9e
QW
212 epd->bio = NULL;
213 }
f4340622 214 return ret;
bb58eb9e 215}
e2932ee0 216
6f0d04f8 217int __init extent_state_cache_init(void)
d1310b2e 218{
837e1972 219 extent_state_cache = kmem_cache_create("btrfs_extent_state",
9601e3f6 220 sizeof(struct extent_state), 0,
fba4b697 221 SLAB_MEM_SPREAD, NULL);
d1310b2e
CM
222 if (!extent_state_cache)
223 return -ENOMEM;
6f0d04f8
JB
224 return 0;
225}
d1310b2e 226
6f0d04f8
JB
227int __init extent_io_init(void)
228{
837e1972 229 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
9601e3f6 230 sizeof(struct extent_buffer), 0,
fba4b697 231 SLAB_MEM_SPREAD, NULL);
d1310b2e 232 if (!extent_buffer_cache)
6f0d04f8 233 return -ENOMEM;
9be3395b 234
8ac9f7c1
KO
235 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
236 offsetof(struct btrfs_io_bio, bio),
237 BIOSET_NEED_BVECS))
9be3395b 238 goto free_buffer_cache;
b208c2f7 239
8ac9f7c1 240 if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
b208c2f7
DW
241 goto free_bioset;
242
d1310b2e
CM
243 return 0;
244
b208c2f7 245free_bioset:
8ac9f7c1 246 bioset_exit(&btrfs_bioset);
b208c2f7 247
9be3395b
CM
248free_buffer_cache:
249 kmem_cache_destroy(extent_buffer_cache);
250 extent_buffer_cache = NULL;
6f0d04f8
JB
251 return -ENOMEM;
252}
9be3395b 253
6f0d04f8
JB
254void __cold extent_state_cache_exit(void)
255{
256 btrfs_extent_state_leak_debug_check();
d1310b2e 257 kmem_cache_destroy(extent_state_cache);
d1310b2e
CM
258}
259
e67c718b 260void __cold extent_io_exit(void)
d1310b2e 261{
8c0a8537
KS
262 /*
263 * Make sure all delayed rcu free are flushed before we
264 * destroy caches.
265 */
266 rcu_barrier();
5598e900 267 kmem_cache_destroy(extent_buffer_cache);
8ac9f7c1 268 bioset_exit(&btrfs_bioset);
d1310b2e
CM
269}
270
41a2ee75
JB
271/*
272 * For the file_extent_tree, we want to hold the inode lock when we lookup and
273 * update the disk_i_size, but lockdep will complain because for our io_tree we
274 * hold the tree lock and take the inode lock when setting delalloc. These two things
275 * are unrelated, so make a class for the file_extent_tree so we don't get the
276 * two locking patterns mixed up.
277 */
278static struct lock_class_key file_extent_tree_class;
279
c258d6e3 280void extent_io_tree_init(struct btrfs_fs_info *fs_info,
43eb5f29
QW
281 struct extent_io_tree *tree, unsigned int owner,
282 void *private_data)
d1310b2e 283{
c258d6e3 284 tree->fs_info = fs_info;
6bef4d31 285 tree->state = RB_ROOT;
d1310b2e 286 tree->dirty_bytes = 0;
70dec807 287 spin_lock_init(&tree->lock);
c6100a4b 288 tree->private_data = private_data;
43eb5f29 289 tree->owner = owner;
41a2ee75
JB
290 if (owner == IO_TREE_INODE_FILE_EXTENT)
291 lockdep_set_class(&tree->lock, &file_extent_tree_class);
d1310b2e 292}
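/*
 * Illustrative sketch (not from the kernel tree): the calling convention the
 * comment above describes.  A tree initialized with the
 * IO_TREE_INODE_FILE_EXTENT owner picks up the separate lockdep class
 * (file_extent_tree_class); the wrapper function itself is hypothetical.
 */
static void example_init_file_extent_tree(struct btrfs_fs_info *fs_info,
					  struct extent_io_tree *tree,
					  struct inode *inode)
{
	extent_io_tree_init(fs_info, tree, IO_TREE_INODE_FILE_EXTENT, inode);
}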
d1310b2e 293
41e7acd3
NB
294void extent_io_tree_release(struct extent_io_tree *tree)
295{
296 spin_lock(&tree->lock);
297 /*
298 * Do a single barrier for the waitqueue_active check here, the state
299 * of the waitqueue should not change once extent_io_tree_release is
300 * called.
301 */
302 smp_mb();
303 while (!RB_EMPTY_ROOT(&tree->state)) {
304 struct rb_node *node;
305 struct extent_state *state;
306
307 node = rb_first(&tree->state);
308 state = rb_entry(node, struct extent_state, rb_node);
309 rb_erase(&state->rb_node, &tree->state);
310 RB_CLEAR_NODE(&state->rb_node);
311 /*
312 * btree io trees aren't supposed to have tasks waiting for
313 * changes in the flags of extent states ever.
314 */
315 ASSERT(!waitqueue_active(&state->wq));
316 free_extent_state(state);
317
318 cond_resched_lock(&tree->lock);
319 }
320 spin_unlock(&tree->lock);
321}
322
b2950863 323static struct extent_state *alloc_extent_state(gfp_t mask)
d1310b2e
CM
324{
325 struct extent_state *state;
d1310b2e 326
3ba7ab22
MH
327 /*
328 * The given mask might be not appropriate for the slab allocator,
329 * drop the unsupported bits
330 */
331 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
d1310b2e 332 state = kmem_cache_alloc(extent_state_cache, mask);
2b114d1d 333 if (!state)
d1310b2e
CM
334 return state;
335 state->state = 0;
47dc196a 336 state->failrec = NULL;
27a3507d 337 RB_CLEAR_NODE(&state->rb_node);
3fd63727 338 btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
b7ac31b7 339 refcount_set(&state->refs, 1);
d1310b2e 340 init_waitqueue_head(&state->wq);
143bede5 341 trace_alloc_extent_state(state, mask, _RET_IP_);
d1310b2e
CM
342 return state;
343}
d1310b2e 344
4845e44f 345void free_extent_state(struct extent_state *state)
d1310b2e 346{
d1310b2e
CM
347 if (!state)
348 return;
b7ac31b7 349 if (refcount_dec_and_test(&state->refs)) {
27a3507d 350 WARN_ON(extent_state_in_tree(state));
3fd63727 351 btrfs_leak_debug_del(&leak_lock, &state->leak_list);
143bede5 352 trace_free_extent_state(state, _RET_IP_);
d1310b2e
CM
353 kmem_cache_free(extent_state_cache, state);
354 }
355}
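/*
 * Illustrative sketch (not from the kernel tree): the extent_state lifecycle
 * implemented above.  alloc_extent_state() hands back a state with refs == 1,
 * each additional user takes its own reference, and free_extent_state() drops
 * one reference, returning the object to the slab once the count reaches 0.
 */
static void example_extent_state_lifecycle(void)
{
	struct extent_state *state = alloc_extent_state(GFP_NOFS);

	if (!state)
		return;
	refcount_inc(&state->refs);	/* second user, e.g. a cached pointer */
	free_extent_state(state);	/* refs 2 -> 1, still alive */
	free_extent_state(state);	/* refs 1 -> 0, kmem_cache_free() */
}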
d1310b2e 356
f2071b21
FM
357static struct rb_node *tree_insert(struct rb_root *root,
358 struct rb_node *search_start,
359 u64 offset,
12cfbad9
FDBM
360 struct rb_node *node,
361 struct rb_node ***p_in,
362 struct rb_node **parent_in)
d1310b2e 363{
f2071b21 364 struct rb_node **p;
d397712b 365 struct rb_node *parent = NULL;
d1310b2e
CM
366 struct tree_entry *entry;
367
12cfbad9
FDBM
368 if (p_in && parent_in) {
369 p = *p_in;
370 parent = *parent_in;
371 goto do_insert;
372 }
373
f2071b21 374 p = search_start ? &search_start : &root->rb_node;
d397712b 375 while (*p) {
d1310b2e
CM
376 parent = *p;
377 entry = rb_entry(parent, struct tree_entry, rb_node);
378
379 if (offset < entry->start)
380 p = &(*p)->rb_left;
381 else if (offset > entry->end)
382 p = &(*p)->rb_right;
383 else
384 return parent;
385 }
386
12cfbad9 387do_insert:
d1310b2e
CM
388 rb_link_node(node, parent, p);
389 rb_insert_color(node, root);
390 return NULL;
391}
392
8666e638 393/**
3bed2da1
NB
394 * Search @tree for an entry that contains @offset. Such entry would have
395 * entry->start <= offset && entry->end >= offset.
8666e638 396 *
3bed2da1
NB
397 * @tree: the tree to search
398 * @offset: offset that should fall within an entry in @tree
399 * @next_ret: pointer to the first entry whose range ends after @offset
400 * @prev_ret: pointer to the first entry whose range begins before @offset
401 * @p_ret: pointer where new node should be anchored (used when inserting an
402 * entry in the tree)
403 * @parent_ret: points to entry which would have been the parent of the entry,
8666e638
NB
404 * containing @offset
405 *
406 * This function returns a pointer to the entry that contains @offset byte
407 * address. If no such entry exists, then NULL is returned and the other
408 * pointer arguments to the function are filled, otherwise the found entry is
409 * returned and other pointers are left untouched.
410 */
80ea96b1 411static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
12cfbad9 412 struct rb_node **next_ret,
352646c7 413 struct rb_node **prev_ret,
12cfbad9
FDBM
414 struct rb_node ***p_ret,
415 struct rb_node **parent_ret)
d1310b2e 416{
80ea96b1 417 struct rb_root *root = &tree->state;
12cfbad9 418 struct rb_node **n = &root->rb_node;
d1310b2e
CM
419 struct rb_node *prev = NULL;
420 struct rb_node *orig_prev = NULL;
421 struct tree_entry *entry;
422 struct tree_entry *prev_entry = NULL;
423
12cfbad9
FDBM
424 while (*n) {
425 prev = *n;
426 entry = rb_entry(prev, struct tree_entry, rb_node);
d1310b2e
CM
427 prev_entry = entry;
428
429 if (offset < entry->start)
12cfbad9 430 n = &(*n)->rb_left;
d1310b2e 431 else if (offset > entry->end)
12cfbad9 432 n = &(*n)->rb_right;
d397712b 433 else
12cfbad9 434 return *n;
d1310b2e
CM
435 }
436
12cfbad9
FDBM
437 if (p_ret)
438 *p_ret = n;
439 if (parent_ret)
440 *parent_ret = prev;
441
352646c7 442 if (next_ret) {
d1310b2e 443 orig_prev = prev;
d397712b 444 while (prev && offset > prev_entry->end) {
d1310b2e
CM
445 prev = rb_next(prev);
446 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
447 }
352646c7 448 *next_ret = prev;
d1310b2e
CM
449 prev = orig_prev;
450 }
451
352646c7 452 if (prev_ret) {
d1310b2e 453 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
d397712b 454 while (prev && offset < prev_entry->start) {
d1310b2e
CM
455 prev = rb_prev(prev);
456 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
457 }
352646c7 458 *prev_ret = prev;
d1310b2e
CM
459 }
460 return NULL;
461}
462
12cfbad9
FDBM
463static inline struct rb_node *
464tree_search_for_insert(struct extent_io_tree *tree,
465 u64 offset,
466 struct rb_node ***p_ret,
467 struct rb_node **parent_ret)
d1310b2e 468{
352646c7 469 struct rb_node *next = NULL;
d1310b2e 470 struct rb_node *ret;
70dec807 471
352646c7 472 ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
d397712b 473 if (!ret)
352646c7 474 return next;
d1310b2e
CM
475 return ret;
476}
477
12cfbad9
FDBM
478static inline struct rb_node *tree_search(struct extent_io_tree *tree,
479 u64 offset)
480{
481 return tree_search_for_insert(tree, offset, NULL, NULL);
482}
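/*
 * Illustrative worked example (not from the kernel tree) of the search
 * semantics above, for a tree holding the states [0, 4095] and [8192, 12287]:
 *
 *   tree_search(tree, 100)   returns the node for [0, 4095]     (contains 100)
 *   tree_search(tree, 5000)  returns the node for [8192, 12287] (first entry
 *                                                                ending after 5000)
 *   tree_search(tree, 20000) returns NULL           (no entry ends after 20000)
 *
 * Callers must hold tree->lock across the search and any use of the result.
 */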
483
d1310b2e
CM
484/*
485 * utility function to look for merge candidates inside a given range.
486 * Any extents with matching state are merged together into a single
487 * extent in the tree. Extents with EXTENT_LOCKED or EXTENT_BOUNDARY in their
488 * state field are not merged because the end_io handlers need to be able to do
489 * operations on them without sleeping (or doing allocations/splits).
490 *
491 * This should be called with the tree lock held.
492 */
1bf85046
JM
493static void merge_state(struct extent_io_tree *tree,
494 struct extent_state *state)
d1310b2e
CM
495{
496 struct extent_state *other;
497 struct rb_node *other_node;
498
8882679e 499 if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
1bf85046 500 return;
d1310b2e
CM
501
502 other_node = rb_prev(&state->rb_node);
503 if (other_node) {
504 other = rb_entry(other_node, struct extent_state, rb_node);
505 if (other->end == state->start - 1 &&
506 other->state == state->state) {
5c848198
NB
507 if (tree->private_data &&
508 is_data_inode(tree->private_data))
509 btrfs_merge_delalloc_extent(tree->private_data,
510 state, other);
d1310b2e 511 state->start = other->start;
d1310b2e 512 rb_erase(&other->rb_node, &tree->state);
27a3507d 513 RB_CLEAR_NODE(&other->rb_node);
d1310b2e
CM
514 free_extent_state(other);
515 }
516 }
517 other_node = rb_next(&state->rb_node);
518 if (other_node) {
519 other = rb_entry(other_node, struct extent_state, rb_node);
520 if (other->start == state->end + 1 &&
521 other->state == state->state) {
5c848198
NB
522 if (tree->private_data &&
523 is_data_inode(tree->private_data))
524 btrfs_merge_delalloc_extent(tree->private_data,
525 state, other);
df98b6e2 526 state->end = other->end;
df98b6e2 527 rb_erase(&other->rb_node, &tree->state);
27a3507d 528 RB_CLEAR_NODE(&other->rb_node);
df98b6e2 529 free_extent_state(other);
d1310b2e
CM
530 }
531 }
d1310b2e
CM
532}
533
3150b699 534static void set_state_bits(struct extent_io_tree *tree,
f97e27e9 535 struct extent_state *state, u32 *bits,
d38ed27f 536 struct extent_changeset *changeset);
3150b699 537
d1310b2e
CM
538/*
539 * insert an extent_state struct into the tree. 'bits' are set on the
540 * struct before it is inserted.
541 *
542 * This may return -EEXIST if the extent is already there, in which case the
543 * state struct is freed.
544 *
545 * The tree lock is not taken internally. This is a utility function and
546 * probably isn't what you want to call (see set/clear_extent_bit).
547 */
548static int insert_state(struct extent_io_tree *tree,
549 struct extent_state *state, u64 start, u64 end,
12cfbad9
FDBM
550 struct rb_node ***p,
551 struct rb_node **parent,
f97e27e9 552 u32 *bits, struct extent_changeset *changeset)
d1310b2e
CM
553{
554 struct rb_node *node;
555
2792237d
DS
556 if (end < start) {
557 btrfs_err(tree->fs_info,
558 "insert state: end < start %llu %llu", end, start);
559 WARN_ON(1);
560 }
d1310b2e
CM
561 state->start = start;
562 state->end = end;
9ed74f2d 563
d38ed27f 564 set_state_bits(tree, state, bits, changeset);
3150b699 565
f2071b21 566 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
d1310b2e
CM
567 if (node) {
568 struct extent_state *found;
569 found = rb_entry(node, struct extent_state, rb_node);
2792237d
DS
570 btrfs_err(tree->fs_info,
571 "found node %llu %llu on insert of %llu %llu",
c1c9ff7c 572 found->start, found->end, start, end);
d1310b2e
CM
573 return -EEXIST;
574 }
575 merge_state(tree, state);
576 return 0;
577}
578
579/*
580 * split a given extent state struct in two, inserting the preallocated
581 * struct 'prealloc' as the newly created second half. 'split' indicates an
582 * offset inside 'orig' where it should be split.
583 *
584 * Before calling,
585 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
586 * are two extent state structs in the tree:
587 * prealloc: [orig->start, split - 1]
588 * orig: [ split, orig->end ]
589 *
590 * The tree locks are not taken by this function. They need to be held
591 * by the caller.
592 */
593static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
594 struct extent_state *prealloc, u64 split)
595{
596 struct rb_node *node;
9ed74f2d 597
abbb55f4
NB
598 if (tree->private_data && is_data_inode(tree->private_data))
599 btrfs_split_delalloc_extent(tree->private_data, orig, split);
9ed74f2d 600
d1310b2e
CM
601 prealloc->start = orig->start;
602 prealloc->end = split - 1;
603 prealloc->state = orig->state;
604 orig->start = split;
605
f2071b21
FM
606 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
607 &prealloc->rb_node, NULL, NULL);
d1310b2e 608 if (node) {
d1310b2e
CM
609 free_extent_state(prealloc);
610 return -EEXIST;
611 }
612 return 0;
613}
614
cdc6a395
LZ
615static struct extent_state *next_state(struct extent_state *state)
616{
617 struct rb_node *next = rb_next(&state->rb_node);
618 if (next)
619 return rb_entry(next, struct extent_state, rb_node);
620 else
621 return NULL;
622}
623
d1310b2e
CM
624/*
625 * utility function to clear some bits in an extent state struct.
52042d8e 626 * it will optionally wake up anyone waiting on this state (wake == 1).
d1310b2e
CM
627 *
628 * If no bits are set on the state struct after clearing things, the
629 * struct is freed and removed from the tree
630 */
cdc6a395
LZ
631static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
632 struct extent_state *state,
f97e27e9 633 u32 *bits, int wake,
fefdc557 634 struct extent_changeset *changeset)
d1310b2e 635{
cdc6a395 636 struct extent_state *next;
f97e27e9 637 u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
57599c7e 638 int ret;
d1310b2e 639
0ca1f7ce 640 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
d1310b2e
CM
641 u64 range = state->end - state->start + 1;
642 WARN_ON(range > tree->dirty_bytes);
643 tree->dirty_bytes -= range;
644 }
a36bb5f9
NB
645
646 if (tree->private_data && is_data_inode(tree->private_data))
647 btrfs_clear_delalloc_extent(tree->private_data, state, bits);
648
57599c7e
DS
649 ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
650 BUG_ON(ret < 0);
32c00aff 651 state->state &= ~bits_to_clear;
d1310b2e
CM
652 if (wake)
653 wake_up(&state->wq);
0ca1f7ce 654 if (state->state == 0) {
cdc6a395 655 next = next_state(state);
27a3507d 656 if (extent_state_in_tree(state)) {
d1310b2e 657 rb_erase(&state->rb_node, &tree->state);
27a3507d 658 RB_CLEAR_NODE(&state->rb_node);
d1310b2e
CM
659 free_extent_state(state);
660 } else {
661 WARN_ON(1);
662 }
663 } else {
664 merge_state(tree, state);
cdc6a395 665 next = next_state(state);
d1310b2e 666 }
cdc6a395 667 return next;
d1310b2e
CM
668}
669
8233767a
XG
670static struct extent_state *
671alloc_extent_state_atomic(struct extent_state *prealloc)
672{
673 if (!prealloc)
674 prealloc = alloc_extent_state(GFP_ATOMIC);
675
676 return prealloc;
677}
678
48a3b636 679static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
c2d904e0 680{
29b665cc 681 btrfs_panic(tree->fs_info, err,
05912a3c 682 "locking error: extent tree was modified by another thread while locked");
c2d904e0
JM
683}
684
d1310b2e
CM
685/*
686 * clear some bits on a range in the tree. This may require splitting
687 * or inserting elements in the tree, so the gfp mask is used to
688 * indicate which allocations or sleeping are allowed.
689 *
690 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
691 * the given range from the tree regardless of state (ie for truncate).
692 *
693 * the range [start, end] is inclusive.
694 *
6763af84 695 * This takes the tree lock, and returns 0 on success and < 0 on error.
d1310b2e 696 */
66b0c887 697int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9
QW
698 u32 bits, int wake, int delete,
699 struct extent_state **cached_state,
700 gfp_t mask, struct extent_changeset *changeset)
d1310b2e
CM
701{
702 struct extent_state *state;
2c64c53d 703 struct extent_state *cached;
d1310b2e
CM
704 struct extent_state *prealloc = NULL;
705 struct rb_node *node;
5c939df5 706 u64 last_end;
d1310b2e 707 int err;
2ac55d41 708 int clear = 0;
d1310b2e 709
a5dee37d 710 btrfs_debug_check_extent_io_range(tree, start, end);
a1d19847 711 trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
8d599ae1 712
7ee9e440
JB
713 if (bits & EXTENT_DELALLOC)
714 bits |= EXTENT_NORESERVE;
715
0ca1f7ce
YZ
716 if (delete)
717 bits |= ~EXTENT_CTLBITS;
0ca1f7ce 718
8882679e 719 if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
2ac55d41 720 clear = 1;
d1310b2e 721again:
d0164adc 722 if (!prealloc && gfpflags_allow_blocking(mask)) {
c7bc6319
FM
723 /*
724 * Don't care for allocation failure here because we might end
725 * up not needing the pre-allocated extent state at all, which
726 * is the case if we only have in the tree extent states that
727 * cover our input range and don't cover any other range.
728 * If we end up needing a new extent state we allocate it later.
729 */
d1310b2e 730 prealloc = alloc_extent_state(mask);
d1310b2e
CM
731 }
732
cad321ad 733 spin_lock(&tree->lock);
2c64c53d
CM
734 if (cached_state) {
735 cached = *cached_state;
2ac55d41
JB
736
737 if (clear) {
738 *cached_state = NULL;
739 cached_state = NULL;
740 }
741
27a3507d
FM
742 if (cached && extent_state_in_tree(cached) &&
743 cached->start <= start && cached->end > start) {
2ac55d41 744 if (clear)
b7ac31b7 745 refcount_dec(&cached->refs);
2c64c53d 746 state = cached;
42daec29 747 goto hit_next;
2c64c53d 748 }
2ac55d41
JB
749 if (clear)
750 free_extent_state(cached);
2c64c53d 751 }
d1310b2e
CM
752 /*
753 * this search will find the extents that end after
754 * our range starts
755 */
80ea96b1 756 node = tree_search(tree, start);
d1310b2e
CM
757 if (!node)
758 goto out;
759 state = rb_entry(node, struct extent_state, rb_node);
2c64c53d 760hit_next:
d1310b2e
CM
761 if (state->start > end)
762 goto out;
763 WARN_ON(state->end < start);
5c939df5 764 last_end = state->end;
d1310b2e 765
0449314a 766 /* the state doesn't have the wanted bits, go ahead */
cdc6a395
LZ
767 if (!(state->state & bits)) {
768 state = next_state(state);
0449314a 769 goto next;
cdc6a395 770 }
0449314a 771
d1310b2e
CM
772 /*
773 * | ---- desired range ---- |
774 * | state | or
775 * | ------------- state -------------- |
776 *
777 * We need to split the extent we found, and may flip
778 * bits on second half.
779 *
780 * If the extent we found extends past our range, we
781 * just split and search again. It'll get split again
782 * the next time though.
783 *
784 * If the extent we found is inside our range, we clear
785 * the desired bit on it.
786 */
787
788 if (state->start < start) {
8233767a
XG
789 prealloc = alloc_extent_state_atomic(prealloc);
790 BUG_ON(!prealloc);
d1310b2e 791 err = split_state(tree, state, prealloc, start);
c2d904e0
JM
792 if (err)
793 extent_io_tree_panic(tree, err);
794
d1310b2e
CM
795 prealloc = NULL;
796 if (err)
797 goto out;
798 if (state->end <= end) {
fefdc557
QW
799 state = clear_state_bit(tree, state, &bits, wake,
800 changeset);
d1ac6e41 801 goto next;
d1310b2e
CM
802 }
803 goto search_again;
804 }
805 /*
806 * | ---- desired range ---- |
807 * | state |
808 * We need to split the extent, and clear the bit
809 * on the first half
810 */
811 if (state->start <= end && state->end > end) {
8233767a
XG
812 prealloc = alloc_extent_state_atomic(prealloc);
813 BUG_ON(!prealloc);
d1310b2e 814 err = split_state(tree, state, prealloc, end + 1);
c2d904e0
JM
815 if (err)
816 extent_io_tree_panic(tree, err);
817
d1310b2e
CM
818 if (wake)
819 wake_up(&state->wq);
42daec29 820
fefdc557 821 clear_state_bit(tree, prealloc, &bits, wake, changeset);
9ed74f2d 822
d1310b2e
CM
823 prealloc = NULL;
824 goto out;
825 }
42daec29 826
fefdc557 827 state = clear_state_bit(tree, state, &bits, wake, changeset);
0449314a 828next:
5c939df5
YZ
829 if (last_end == (u64)-1)
830 goto out;
831 start = last_end + 1;
cdc6a395 832 if (start <= end && state && !need_resched())
692e5759 833 goto hit_next;
d1310b2e
CM
834
835search_again:
836 if (start > end)
837 goto out;
cad321ad 838 spin_unlock(&tree->lock);
d0164adc 839 if (gfpflags_allow_blocking(mask))
d1310b2e
CM
840 cond_resched();
841 goto again;
7ab5cb2a
DS
842
843out:
844 spin_unlock(&tree->lock);
845 if (prealloc)
846 free_extent_state(prealloc);
847
848 return 0;
849
d1310b2e 850}
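/*
 * Illustrative sketch (not from the kernel tree): the truncate-style usage
 * mentioned in the comment above.  'delete' == 1 widens the clear mask to
 * every bit except the internal control bits, so the states covering the
 * range are dropped regardless of what they had set, and 'wake' == 1 kicks
 * any sleepers waiting on them.
 */
static void example_truncate_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	__clear_extent_bit(tree, start, end, 0, 1, 1, NULL, GFP_NOFS, NULL);
}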
d1310b2e 851
143bede5
JM
852static void wait_on_state(struct extent_io_tree *tree,
853 struct extent_state *state)
641f5219
CH
854 __releases(tree->lock)
855 __acquires(tree->lock)
d1310b2e
CM
856{
857 DEFINE_WAIT(wait);
858 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
cad321ad 859 spin_unlock(&tree->lock);
d1310b2e 860 schedule();
cad321ad 861 spin_lock(&tree->lock);
d1310b2e 862 finish_wait(&state->wq, &wait);
d1310b2e
CM
863}
864
865/*
866 * waits for one or more bits to clear on a range in the state tree.
867 * The range [start, end] is inclusive.
868 * The tree lock is taken by this function
869 */
41074888 870static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 871 u32 bits)
d1310b2e
CM
872{
873 struct extent_state *state;
874 struct rb_node *node;
875
a5dee37d 876 btrfs_debug_check_extent_io_range(tree, start, end);
8d599ae1 877
cad321ad 878 spin_lock(&tree->lock);
d1310b2e
CM
879again:
880 while (1) {
881 /*
882 * this search will find all the extents that end after
883 * our range starts
884 */
80ea96b1 885 node = tree_search(tree, start);
c50d3e71 886process_node:
d1310b2e
CM
887 if (!node)
888 break;
889
890 state = rb_entry(node, struct extent_state, rb_node);
891
892 if (state->start > end)
893 goto out;
894
895 if (state->state & bits) {
896 start = state->start;
b7ac31b7 897 refcount_inc(&state->refs);
d1310b2e
CM
898 wait_on_state(tree, state);
899 free_extent_state(state);
900 goto again;
901 }
902 start = state->end + 1;
903
904 if (start > end)
905 break;
906
c50d3e71
FM
907 if (!cond_resched_lock(&tree->lock)) {
908 node = rb_next(node);
909 goto process_node;
910 }
d1310b2e
CM
911 }
912out:
cad321ad 913 spin_unlock(&tree->lock);
d1310b2e 914}
d1310b2e 915
1bf85046 916static void set_state_bits(struct extent_io_tree *tree,
d1310b2e 917 struct extent_state *state,
f97e27e9 918 u32 *bits, struct extent_changeset *changeset)
d1310b2e 919{
f97e27e9 920 u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
57599c7e 921 int ret;
9ed74f2d 922
e06a1fc9
NB
923 if (tree->private_data && is_data_inode(tree->private_data))
924 btrfs_set_delalloc_extent(tree->private_data, state, bits);
925
0ca1f7ce 926 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
d1310b2e
CM
927 u64 range = state->end - state->start + 1;
928 tree->dirty_bytes += range;
929 }
57599c7e
DS
930 ret = add_extent_changeset(state, bits_to_set, changeset, 1);
931 BUG_ON(ret < 0);
0ca1f7ce 932 state->state |= bits_to_set;
d1310b2e
CM
933}
934
e38e2ed7
FM
935static void cache_state_if_flags(struct extent_state *state,
936 struct extent_state **cached_ptr,
9ee49a04 937 unsigned flags)
2c64c53d
CM
938{
939 if (cached_ptr && !(*cached_ptr)) {
e38e2ed7 940 if (!flags || (state->state & flags)) {
2c64c53d 941 *cached_ptr = state;
b7ac31b7 942 refcount_inc(&state->refs);
2c64c53d
CM
943 }
944 }
945}
946
e38e2ed7
FM
947static void cache_state(struct extent_state *state,
948 struct extent_state **cached_ptr)
949{
950 return cache_state_if_flags(state, cached_ptr,
8882679e 951 EXTENT_LOCKED | EXTENT_BOUNDARY);
e38e2ed7
FM
952}
953
d1310b2e 954/*
1edbb734
CM
955 * set some bits on a range in the tree. This may require allocations or
956 * sleeping, so the gfp mask is used to indicate what is allowed.
d1310b2e 957 *
1edbb734
CM
958 * If any of the exclusive bits are set, this will fail with -EEXIST if some
959 * part of the range already has the desired bits set. The start of the
960 * existing range is returned in failed_start in this case.
d1310b2e 961 *
1edbb734 962 * [start, end] is inclusive This takes the tree lock.
d1310b2e 963 */
f97e27e9
QW
964int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
965 u32 exclusive_bits, u64 *failed_start,
1cab5e72
NB
966 struct extent_state **cached_state, gfp_t mask,
967 struct extent_changeset *changeset)
d1310b2e
CM
968{
969 struct extent_state *state;
970 struct extent_state *prealloc = NULL;
971 struct rb_node *node;
12cfbad9
FDBM
972 struct rb_node **p;
973 struct rb_node *parent;
d1310b2e 974 int err = 0;
d1310b2e
CM
975 u64 last_start;
976 u64 last_end;
42daec29 977
a5dee37d 978 btrfs_debug_check_extent_io_range(tree, start, end);
a1d19847 979 trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
8d599ae1 980
3f6bb4ae
QW
981 if (exclusive_bits)
982 ASSERT(failed_start);
983 else
984 ASSERT(failed_start == NULL);
d1310b2e 985again:
d0164adc 986 if (!prealloc && gfpflags_allow_blocking(mask)) {
059f791c
DS
987 /*
988 * Don't care for allocation failure here because we might end
989 * up not needing the pre-allocated extent state at all, which
990 * is the case if we only have in the tree extent states that
991 * cover our input range and don't cover any other range.
992 * If we end up needing a new extent state we allocate it later.
993 */
d1310b2e 994 prealloc = alloc_extent_state(mask);
d1310b2e
CM
995 }
996
cad321ad 997 spin_lock(&tree->lock);
9655d298
CM
998 if (cached_state && *cached_state) {
999 state = *cached_state;
df98b6e2 1000 if (state->start <= start && state->end > start &&
27a3507d 1001 extent_state_in_tree(state)) {
9655d298
CM
1002 node = &state->rb_node;
1003 goto hit_next;
1004 }
1005 }
d1310b2e
CM
1006 /*
1007 * this search will find all the extents that end after
1008 * our range starts.
1009 */
12cfbad9 1010 node = tree_search_for_insert(tree, start, &p, &parent);
d1310b2e 1011 if (!node) {
8233767a
XG
1012 prealloc = alloc_extent_state_atomic(prealloc);
1013 BUG_ON(!prealloc);
12cfbad9 1014 err = insert_state(tree, prealloc, start, end,
d38ed27f 1015 &p, &parent, &bits, changeset);
c2d904e0
JM
1016 if (err)
1017 extent_io_tree_panic(tree, err);
1018
c42ac0bc 1019 cache_state(prealloc, cached_state);
d1310b2e 1020 prealloc = NULL;
d1310b2e
CM
1021 goto out;
1022 }
d1310b2e 1023 state = rb_entry(node, struct extent_state, rb_node);
40431d6c 1024hit_next:
d1310b2e
CM
1025 last_start = state->start;
1026 last_end = state->end;
1027
1028 /*
1029 * | ---- desired range ---- |
1030 * | state |
1031 *
1032 * Just lock what we found and keep going
1033 */
1034 if (state->start == start && state->end <= end) {
1edbb734 1035 if (state->state & exclusive_bits) {
d1310b2e
CM
1036 *failed_start = state->start;
1037 err = -EEXIST;
1038 goto out;
1039 }
42daec29 1040
d38ed27f 1041 set_state_bits(tree, state, &bits, changeset);
2c64c53d 1042 cache_state(state, cached_state);
d1310b2e 1043 merge_state(tree, state);
5c939df5
YZ
1044 if (last_end == (u64)-1)
1045 goto out;
1046 start = last_end + 1;
d1ac6e41
LB
1047 state = next_state(state);
1048 if (start < end && state && state->start == start &&
1049 !need_resched())
1050 goto hit_next;
d1310b2e
CM
1051 goto search_again;
1052 }
1053
1054 /*
1055 * | ---- desired range ---- |
1056 * | state |
1057 * or
1058 * | ------------- state -------------- |
1059 *
1060 * We need to split the extent we found, and may flip bits on
1061 * second half.
1062 *
1063 * If the extent we found extends past our
1064 * range, we just split and search again. It'll get split
1065 * again the next time though.
1066 *
1067 * If the extent we found is inside our range, we set the
1068 * desired bit on it.
1069 */
1070 if (state->start < start) {
1edbb734 1071 if (state->state & exclusive_bits) {
d1310b2e
CM
1072 *failed_start = start;
1073 err = -EEXIST;
1074 goto out;
1075 }
8233767a 1076
55ffaabe
FM
1077 /*
1078 * If this extent already has all the bits we want set, then
1079 * skip it, not necessary to split it or do anything with it.
1080 */
1081 if ((state->state & bits) == bits) {
1082 start = state->end + 1;
1083 cache_state(state, cached_state);
1084 goto search_again;
1085 }
1086
8233767a
XG
1087 prealloc = alloc_extent_state_atomic(prealloc);
1088 BUG_ON(!prealloc);
d1310b2e 1089 err = split_state(tree, state, prealloc, start);
c2d904e0
JM
1090 if (err)
1091 extent_io_tree_panic(tree, err);
1092
d1310b2e
CM
1093 prealloc = NULL;
1094 if (err)
1095 goto out;
1096 if (state->end <= end) {
d38ed27f 1097 set_state_bits(tree, state, &bits, changeset);
2c64c53d 1098 cache_state(state, cached_state);
d1310b2e 1099 merge_state(tree, state);
5c939df5
YZ
1100 if (last_end == (u64)-1)
1101 goto out;
1102 start = last_end + 1;
d1ac6e41
LB
1103 state = next_state(state);
1104 if (start < end && state && state->start == start &&
1105 !need_resched())
1106 goto hit_next;
d1310b2e
CM
1107 }
1108 goto search_again;
1109 }
1110 /*
1111 * | ---- desired range ---- |
1112 * | state | or | state |
1113 *
1114 * There's a hole, we need to insert something in it and
1115 * ignore the extent we found.
1116 */
1117 if (state->start > start) {
1118 u64 this_end;
1119 if (end < last_start)
1120 this_end = end;
1121 else
d397712b 1122 this_end = last_start - 1;
8233767a
XG
1123
1124 prealloc = alloc_extent_state_atomic(prealloc);
1125 BUG_ON(!prealloc);
c7f895a2
XG
1126
1127 /*
1128 * Avoid to free 'prealloc' if it can be merged with
1129 * the later extent.
1130 */
d1310b2e 1131 err = insert_state(tree, prealloc, start, this_end,
d38ed27f 1132 NULL, NULL, &bits, changeset);
c2d904e0
JM
1133 if (err)
1134 extent_io_tree_panic(tree, err);
1135
9ed74f2d
JB
1136 cache_state(prealloc, cached_state);
1137 prealloc = NULL;
d1310b2e
CM
1138 start = this_end + 1;
1139 goto search_again;
1140 }
1141 /*
1142 * | ---- desired range ---- |
1143 * | state |
1144 * We need to split the extent, and set the bit
1145 * on the first half
1146 */
1147 if (state->start <= end && state->end > end) {
1edbb734 1148 if (state->state & exclusive_bits) {
d1310b2e
CM
1149 *failed_start = start;
1150 err = -EEXIST;
1151 goto out;
1152 }
8233767a
XG
1153
1154 prealloc = alloc_extent_state_atomic(prealloc);
1155 BUG_ON(!prealloc);
d1310b2e 1156 err = split_state(tree, state, prealloc, end + 1);
c2d904e0
JM
1157 if (err)
1158 extent_io_tree_panic(tree, err);
d1310b2e 1159
d38ed27f 1160 set_state_bits(tree, prealloc, &bits, changeset);
2c64c53d 1161 cache_state(prealloc, cached_state);
d1310b2e
CM
1162 merge_state(tree, prealloc);
1163 prealloc = NULL;
1164 goto out;
1165 }
1166
b5a4ba14
DS
1167search_again:
1168 if (start > end)
1169 goto out;
1170 spin_unlock(&tree->lock);
1171 if (gfpflags_allow_blocking(mask))
1172 cond_resched();
1173 goto again;
d1310b2e
CM
1174
1175out:
cad321ad 1176 spin_unlock(&tree->lock);
d1310b2e
CM
1177 if (prealloc)
1178 free_extent_state(prealloc);
1179
1180 return err;
1181
d1310b2e 1182}
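/*
 * Illustrative sketch (not from the kernel tree): a plain, non-exclusive
 * set_extent_bit() call.  With exclusive_bits == 0 the failed_start pointer
 * must be NULL and the call cannot return -EEXIST; it merges into, splits,
 * or inserts states as needed.  The helper itself is hypothetical.
 */
static int example_mark_range_dirty(struct extent_io_tree *tree, u64 start,
				    u64 end, struct extent_state **cached)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      cached, GFP_NOFS, NULL);
}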
d1310b2e 1183
462d6fac 1184/**
10983f2e
LB
1185 * convert_extent_bit - convert all bits in a given range from one bit to
1186 * another
462d6fac
JB
1187 * @tree: the io tree to search
1188 * @start: the start offset in bytes
1189 * @end: the end offset in bytes (inclusive)
1190 * @bits: the bits to set in this range
1191 * @clear_bits: the bits to clear in this range
e6138876 1192 * @cached_state: state that we're going to cache
462d6fac
JB
1193 *
1194 * This will go through and set bits for the given range. If any states exist
1195 * already in this range they are set with the given bit and cleared of the
1196 * clear_bits. This is only meant to be used by things that are mergeable, ie
1197 * converting from say DELALLOC to DIRTY. This is not meant to be used with
1198 * boundary bits like LOCK.
210aa277
DS
1199 *
1200 * All allocations are done with GFP_NOFS.
462d6fac
JB
1201 */
1202int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 1203 u32 bits, u32 clear_bits,
210aa277 1204 struct extent_state **cached_state)
462d6fac
JB
1205{
1206 struct extent_state *state;
1207 struct extent_state *prealloc = NULL;
1208 struct rb_node *node;
12cfbad9
FDBM
1209 struct rb_node **p;
1210 struct rb_node *parent;
462d6fac
JB
1211 int err = 0;
1212 u64 last_start;
1213 u64 last_end;
c8fd3de7 1214 bool first_iteration = true;
462d6fac 1215
a5dee37d 1216 btrfs_debug_check_extent_io_range(tree, start, end);
a1d19847
QW
1217 trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1218 clear_bits);
8d599ae1 1219
462d6fac 1220again:
210aa277 1221 if (!prealloc) {
c8fd3de7
FM
1222 /*
1223 * Best effort, don't worry if extent state allocation fails
1224 * here for the first iteration. We might have a cached state
1225 * that matches exactly the target range, in which case no
1226 * extent state allocations are needed. We'll only know this
1227 * after locking the tree.
1228 */
210aa277 1229 prealloc = alloc_extent_state(GFP_NOFS);
c8fd3de7 1230 if (!prealloc && !first_iteration)
462d6fac
JB
1231 return -ENOMEM;
1232 }
1233
1234 spin_lock(&tree->lock);
e6138876
JB
1235 if (cached_state && *cached_state) {
1236 state = *cached_state;
1237 if (state->start <= start && state->end > start &&
27a3507d 1238 extent_state_in_tree(state)) {
e6138876
JB
1239 node = &state->rb_node;
1240 goto hit_next;
1241 }
1242 }
1243
462d6fac
JB
1244 /*
1245 * this search will find all the extents that end after
1246 * our range starts.
1247 */
12cfbad9 1248 node = tree_search_for_insert(tree, start, &p, &parent);
462d6fac
JB
1249 if (!node) {
1250 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1251 if (!prealloc) {
1252 err = -ENOMEM;
1253 goto out;
1254 }
12cfbad9 1255 err = insert_state(tree, prealloc, start, end,
d38ed27f 1256 &p, &parent, &bits, NULL);
c2d904e0
JM
1257 if (err)
1258 extent_io_tree_panic(tree, err);
c42ac0bc
FDBM
1259 cache_state(prealloc, cached_state);
1260 prealloc = NULL;
462d6fac
JB
1261 goto out;
1262 }
1263 state = rb_entry(node, struct extent_state, rb_node);
1264hit_next:
1265 last_start = state->start;
1266 last_end = state->end;
1267
1268 /*
1269 * | ---- desired range ---- |
1270 * | state |
1271 *
1272 * Just lock what we found and keep going
1273 */
1274 if (state->start == start && state->end <= end) {
d38ed27f 1275 set_state_bits(tree, state, &bits, NULL);
e6138876 1276 cache_state(state, cached_state);
fefdc557 1277 state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
462d6fac
JB
1278 if (last_end == (u64)-1)
1279 goto out;
462d6fac 1280 start = last_end + 1;
d1ac6e41
LB
1281 if (start < end && state && state->start == start &&
1282 !need_resched())
1283 goto hit_next;
462d6fac
JB
1284 goto search_again;
1285 }
1286
1287 /*
1288 * | ---- desired range ---- |
1289 * | state |
1290 * or
1291 * | ------------- state -------------- |
1292 *
1293 * We need to split the extent we found, and may flip bits on
1294 * second half.
1295 *
1296 * If the extent we found extends past our
1297 * range, we just split and search again. It'll get split
1298 * again the next time though.
1299 *
1300 * If the extent we found is inside our range, we set the
1301 * desired bit on it.
1302 */
1303 if (state->start < start) {
1304 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1305 if (!prealloc) {
1306 err = -ENOMEM;
1307 goto out;
1308 }
462d6fac 1309 err = split_state(tree, state, prealloc, start);
c2d904e0
JM
1310 if (err)
1311 extent_io_tree_panic(tree, err);
462d6fac
JB
1312 prealloc = NULL;
1313 if (err)
1314 goto out;
1315 if (state->end <= end) {
d38ed27f 1316 set_state_bits(tree, state, &bits, NULL);
e6138876 1317 cache_state(state, cached_state);
fefdc557
QW
1318 state = clear_state_bit(tree, state, &clear_bits, 0,
1319 NULL);
462d6fac
JB
1320 if (last_end == (u64)-1)
1321 goto out;
1322 start = last_end + 1;
d1ac6e41
LB
1323 if (start < end && state && state->start == start &&
1324 !need_resched())
1325 goto hit_next;
462d6fac
JB
1326 }
1327 goto search_again;
1328 }
1329 /*
1330 * | ---- desired range ---- |
1331 * | state | or | state |
1332 *
1333 * There's a hole, we need to insert something in it and
1334 * ignore the extent we found.
1335 */
1336 if (state->start > start) {
1337 u64 this_end;
1338 if (end < last_start)
1339 this_end = end;
1340 else
1341 this_end = last_start - 1;
1342
1343 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1344 if (!prealloc) {
1345 err = -ENOMEM;
1346 goto out;
1347 }
462d6fac
JB
1348
1349 /*
1350 * Avoid to free 'prealloc' if it can be merged with
1351 * the later extent.
1352 */
1353 err = insert_state(tree, prealloc, start, this_end,
d38ed27f 1354 NULL, NULL, &bits, NULL);
c2d904e0
JM
1355 if (err)
1356 extent_io_tree_panic(tree, err);
e6138876 1357 cache_state(prealloc, cached_state);
462d6fac
JB
1358 prealloc = NULL;
1359 start = this_end + 1;
1360 goto search_again;
1361 }
1362 /*
1363 * | ---- desired range ---- |
1364 * | state |
1365 * We need to split the extent, and set the bit
1366 * on the first half
1367 */
1368 if (state->start <= end && state->end > end) {
1369 prealloc = alloc_extent_state_atomic(prealloc);
1cf4ffdb
LB
1370 if (!prealloc) {
1371 err = -ENOMEM;
1372 goto out;
1373 }
462d6fac
JB
1374
1375 err = split_state(tree, state, prealloc, end + 1);
c2d904e0
JM
1376 if (err)
1377 extent_io_tree_panic(tree, err);
462d6fac 1378
d38ed27f 1379 set_state_bits(tree, prealloc, &bits, NULL);
e6138876 1380 cache_state(prealloc, cached_state);
fefdc557 1381 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
462d6fac
JB
1382 prealloc = NULL;
1383 goto out;
1384 }
1385
462d6fac
JB
1386search_again:
1387 if (start > end)
1388 goto out;
1389 spin_unlock(&tree->lock);
210aa277 1390 cond_resched();
c8fd3de7 1391 first_iteration = false;
462d6fac 1392 goto again;
462d6fac
JB
1393
1394out:
1395 spin_unlock(&tree->lock);
1396 if (prealloc)
1397 free_extent_state(prealloc);
1398
1399 return err;
462d6fac
JB
1400}
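/*
 * Illustrative sketch (not from the kernel tree), following the example in
 * the comment above: atomically trade one mergeable bit for another over a
 * range, here setting EXTENT_DIRTY while clearing EXTENT_DELALLOC.  The
 * helper itself is hypothetical.
 */
static int example_convert_delalloc_to_dirty(struct extent_io_tree *tree,
					     u64 start, u64 end,
					     struct extent_state **cached)
{
	return convert_extent_bit(tree, start, end, EXTENT_DIRTY,
				  EXTENT_DELALLOC, cached);
}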
1401
d1310b2e 1402/* wrappers around set/clear extent bit */
d38ed27f 1403int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 1404 u32 bits, struct extent_changeset *changeset)
d38ed27f
QW
1405{
1406 /*
1407 * We don't support EXTENT_LOCKED yet, as current changeset will
1408 * record any bits changed, so for EXTENT_LOCKED case, it will
1409 * either fail with -EEXIST or changeset will record the whole
1410 * range.
1411 */
1412 BUG_ON(bits & EXTENT_LOCKED);
1413
1cab5e72
NB
1414 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1415 changeset);
d38ed27f
QW
1416}
1417
4ca73656 1418int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 1419 u32 bits)
4ca73656 1420{
1cab5e72
NB
1421 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1422 GFP_NOWAIT, NULL);
4ca73656
NB
1423}
1424
fefdc557 1425int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 1426 u32 bits, int wake, int delete,
ae0f1625 1427 struct extent_state **cached)
fefdc557
QW
1428{
1429 return __clear_extent_bit(tree, start, end, bits, wake, delete,
ae0f1625 1430 cached, GFP_NOFS, NULL);
fefdc557
QW
1431}
1432
fefdc557 1433int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 1434 u32 bits, struct extent_changeset *changeset)
fefdc557
QW
1435{
1436 /*
1437 * Don't support EXTENT_LOCKED case, same reason as
1438 * set_record_extent_bits().
1439 */
1440 BUG_ON(bits & EXTENT_LOCKED);
1441
f734c44a 1442 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
fefdc557
QW
1443 changeset);
1444}
1445
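/*
 * Illustrative sketch (not from the kernel tree): using an extent_changeset
 * with the *_record_* wrappers above to learn exactly how many bytes really
 * changed, the way qgroup reservation accounting does.  The
 * extent_changeset_alloc()/extent_changeset_free() helpers are assumed to
 * come from extent_io.h; the function itself is hypothetical.
 */
static int example_record_reserved(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct extent_changeset *changeset = extent_changeset_alloc();
	int ret;

	if (!changeset)
		return -ENOMEM;
	ret = set_record_extent_bits(tree, start, end, EXTENT_QGROUP_RESERVED,
				     changeset);
	if (!ret)
		pr_debug("newly set on %llu bytes\n", changeset->bytes_changed);
	extent_changeset_free(changeset);
	return ret;
}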
d352ac68
CM
1446/*
1447 * either insert or lock state struct between start and end use mask to tell
1448 * us if waiting is desired.
1449 */
1edbb734 1450int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
ff13db41 1451 struct extent_state **cached_state)
d1310b2e
CM
1452{
1453 int err;
1454 u64 failed_start;
9ee49a04 1455
d1310b2e 1456 while (1) {
1cab5e72
NB
1457 err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1458 EXTENT_LOCKED, &failed_start,
1459 cached_state, GFP_NOFS, NULL);
d0082371 1460 if (err == -EEXIST) {
d1310b2e
CM
1461 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1462 start = failed_start;
d0082371 1463 } else
d1310b2e 1464 break;
d1310b2e
CM
1465 WARN_ON(start > end);
1466 }
1467 return err;
1468}
d1310b2e 1469
d0082371 1470int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
25179201
JB
1471{
1472 int err;
1473 u64 failed_start;
1474
1cab5e72
NB
1475 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1476 &failed_start, NULL, GFP_NOFS, NULL);
6643558d
YZ
1477 if (err == -EEXIST) {
1478 if (failed_start > start)
1479 clear_extent_bit(tree, start, failed_start - 1,
ae0f1625 1480 EXTENT_LOCKED, 1, 0, NULL);
25179201 1481 return 0;
6643558d 1482 }
25179201
JB
1483 return 1;
1484}
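/*
 * Illustrative sketch (not from the kernel tree): the usual pairing of
 * lock_extent_bits() with a later clear of EXTENT_LOCKED on the same range,
 * which is what the unlock_extent*() helpers in extent-io-tree.h boil down to.
 */
static void example_locked_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	lock_extent_bits(tree, start, end, &cached);
	/* ... operate on [start, end] while nobody else can lock it ... */
	clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, &cached);
}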
25179201 1485
bd1fa4f0 1486void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
4adaa611 1487{
09cbfeaf
KS
1488 unsigned long index = start >> PAGE_SHIFT;
1489 unsigned long end_index = end >> PAGE_SHIFT;
4adaa611
CM
1490 struct page *page;
1491
1492 while (index <= end_index) {
1493 page = find_get_page(inode->i_mapping, index);
1494 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1495 clear_page_dirty_for_io(page);
09cbfeaf 1496 put_page(page);
4adaa611
CM
1497 index++;
1498 }
4adaa611
CM
1499}
1500
f6311572 1501void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
4adaa611 1502{
09cbfeaf
KS
1503 unsigned long index = start >> PAGE_SHIFT;
1504 unsigned long end_index = end >> PAGE_SHIFT;
4adaa611
CM
1505 struct page *page;
1506
1507 while (index <= end_index) {
1508 page = find_get_page(inode->i_mapping, index);
1509 BUG_ON(!page); /* Pages should be in the extent_io_tree */
4adaa611 1510 __set_page_dirty_nobuffers(page);
8d38633c 1511 account_page_redirty(page);
09cbfeaf 1512 put_page(page);
4adaa611
CM
1513 index++;
1514 }
4adaa611
CM
1515}
1516
d352ac68
CM
1517/* find the first state struct with 'bits' set after 'start', and
1518 * return it. tree->lock must be held. NULL will returned if
1519 * nothing was found after 'start'
1520 */
48a3b636 1521static struct extent_state *
f97e27e9 1522find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
d7fc640e
CM
1523{
1524 struct rb_node *node;
1525 struct extent_state *state;
1526
1527 /*
1528 * this search will find all the extents that end after
1529 * our range starts.
1530 */
1531 node = tree_search(tree, start);
d397712b 1532 if (!node)
d7fc640e 1533 goto out;
d7fc640e 1534
d397712b 1535 while (1) {
d7fc640e 1536 state = rb_entry(node, struct extent_state, rb_node);
d397712b 1537 if (state->end >= start && (state->state & bits))
d7fc640e 1538 return state;
d397712b 1539
d7fc640e
CM
1540 node = rb_next(node);
1541 if (!node)
1542 break;
1543 }
1544out:
1545 return NULL;
1546}
d7fc640e 1547
69261c4b 1548/*
03509b78 1549 * Find the first offset in the io tree with one or more @bits set.
69261c4b 1550 *
03509b78
QW
1551 * Note: If there are multiple bits set in @bits, any of them will match.
1552 *
1553 * Return 0 if we find something, and update @start_ret and @end_ret.
1554 * Return 1 if we found nothing.
69261c4b
XG
1555 */
1556int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
f97e27e9 1557 u64 *start_ret, u64 *end_ret, u32 bits,
e6138876 1558 struct extent_state **cached_state)
69261c4b
XG
1559{
1560 struct extent_state *state;
1561 int ret = 1;
1562
1563 spin_lock(&tree->lock);
e6138876
JB
1564 if (cached_state && *cached_state) {
1565 state = *cached_state;
27a3507d 1566 if (state->end == start - 1 && extent_state_in_tree(state)) {
9688e9a9 1567 while ((state = next_state(state)) != NULL) {
e6138876
JB
1568 if (state->state & bits)
1569 goto got_it;
e6138876
JB
1570 }
1571 free_extent_state(*cached_state);
1572 *cached_state = NULL;
1573 goto out;
1574 }
1575 free_extent_state(*cached_state);
1576 *cached_state = NULL;
1577 }
1578
69261c4b 1579 state = find_first_extent_bit_state(tree, start, bits);
e6138876 1580got_it:
69261c4b 1581 if (state) {
e38e2ed7 1582 cache_state_if_flags(state, cached_state, 0);
69261c4b
XG
1583 *start_ret = state->start;
1584 *end_ret = state->end;
1585 ret = 0;
1586 }
e6138876 1587out:
69261c4b
XG
1588 spin_unlock(&tree->lock);
1589 return ret;
1590}
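/*
 * Illustrative sketch (not from the kernel tree): walking every range that
 * has EXTENT_DIRTY set.  The cached state lets a lookup resume from the
 * previous hit instead of searching from the root; on the final, failed
 * lookup the function leaves *cached NULL, so the trailing free is a
 * NULL-safe no-op kept for clarity.
 */
static void example_walk_dirty_ranges(struct extent_io_tree *tree)
{
	struct extent_state *cached = NULL;
	u64 start = 0;
	u64 found_start;
	u64 found_end;

	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      EXTENT_DIRTY, &cached)) {
		/* [found_start, found_end] has EXTENT_DIRTY set */
		start = found_end + 1;
	}
	free_extent_state(cached);
}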
1591
41a2ee75 1592/**
3bed2da1
NB
1593 * Find a contiguous area of bits
1594 *
1595 * @tree: io tree to check
1596 * @start: offset to start the search from
1597 * @start_ret: the first offset we found with the bits set
1598 * @end_ret: the final contiguous range of the bits that were set
1599 * @bits: bits to look for
41a2ee75
JB
1600 *
1601 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1602 * to set bits appropriately, and then merge them again. During this time it
1603 * will drop the tree->lock, so use this helper if you want to find the actual
1604 * contiguous area for given bits. We will search to the first bit we find, and
1605 * then walk down the tree until we find a non-contiguous area. The area
1606 * returned will be the full contiguous area with the bits set.
1607 */
1608int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
f97e27e9 1609 u64 *start_ret, u64 *end_ret, u32 bits)
41a2ee75
JB
1610{
1611 struct extent_state *state;
1612 int ret = 1;
1613
1614 spin_lock(&tree->lock);
1615 state = find_first_extent_bit_state(tree, start, bits);
1616 if (state) {
1617 *start_ret = state->start;
1618 *end_ret = state->end;
1619 while ((state = next_state(state)) != NULL) {
1620 if (state->start > (*end_ret + 1))
1621 break;
1622 *end_ret = state->end;
1623 }
1624 ret = 0;
1625 }
1626 spin_unlock(&tree->lock);
1627 return ret;
1628}
1629
45bfcfc1 1630/**
3bed2da1
NB
1631 * Find the first range that has @bits not set. This range could start before
1632 * @start.
45bfcfc1 1633 *
3bed2da1
NB
1634 * @tree: the tree to search
1635 * @start: offset at/after which the found extent should start
1636 * @start_ret: records the beginning of the range
1637 * @end_ret: records the end of the range (inclusive)
1638 * @bits: the set of bits which must be unset
45bfcfc1
NB
1639 *
1640 * Since unallocated range is also considered one which doesn't have the bits
1641 * set it's possible that @end_ret contains -1, this happens in case the range
1642 * spans (last_range_end, end of device]. In this case it's up to the caller to
1643 * trim @end_ret to the appropriate size.
1644 */
1645void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
f97e27e9 1646 u64 *start_ret, u64 *end_ret, u32 bits)
45bfcfc1
NB
1647{
1648 struct extent_state *state;
1649 struct rb_node *node, *prev = NULL, *next;
1650
1651 spin_lock(&tree->lock);
1652
1653 /* Find first extent with bits cleared */
1654 while (1) {
1655 node = __etree_search(tree, start, &next, &prev, NULL, NULL);
5750c375
NB
1656 if (!node && !next && !prev) {
1657 /*
1658 * Tree is completely empty, send full range and let
1659 * caller deal with it
1660 */
1661 *start_ret = 0;
1662 *end_ret = -1;
1663 goto out;
1664 } else if (!node && !next) {
1665 /*
1666 * We are past the last allocated chunk, set start at
1667 * the end of the last extent.
1668 */
1669 state = rb_entry(prev, struct extent_state, rb_node);
1670 *start_ret = state->end + 1;
1671 *end_ret = -1;
1672 goto out;
1673 } else if (!node) {
45bfcfc1 1674 node = next;
45bfcfc1 1675 }
1eaebb34
NB
1676 /*
1677 * At this point 'node' either contains 'start' or start is
1678 * before 'node'
1679 */
45bfcfc1 1680 state = rb_entry(node, struct extent_state, rb_node);
1eaebb34
NB
1681
1682 if (in_range(start, state->start, state->end - state->start + 1)) {
1683 if (state->state & bits) {
1684 /*
1685 * |--range with bits sets--|
1686 * |
1687 * start
1688 */
1689 start = state->end + 1;
1690 } else {
1691 /*
1692 * 'start' falls within a range that doesn't
1693 * have the bits set, so take its start as
1694 * the beginning of the desired range
1695 *
1696 * |--range with bits cleared----|
1697 * |
1698 * start
1699 */
1700 *start_ret = state->start;
1701 break;
1702 }
45bfcfc1 1703 } else {
1eaebb34
NB
1704 /*
1705 * |---prev range---|---hole/unset---|---node range---|
1706 * |
1707 * start
1708 *
1709 * or
1710 *
1711 * |---hole/unset--||--first node--|
1712 * 0 |
1713 * start
1714 */
1715 if (prev) {
1716 state = rb_entry(prev, struct extent_state,
1717 rb_node);
1718 *start_ret = state->end + 1;
1719 } else {
1720 *start_ret = 0;
1721 }
45bfcfc1
NB
1722 break;
1723 }
1724 }
1725
1726 /*
1727 * Find the longest stretch from start until an entry which has the
1728 * bits set
1729 */
1730 while (1) {
1731 state = rb_entry(node, struct extent_state, rb_node);
1732 if (state->end >= start && !(state->state & bits)) {
1733 *end_ret = state->end;
1734 } else {
1735 *end_ret = state->start - 1;
1736 break;
1737 }
1738
1739 node = rb_next(node);
1740 if (!node)
1741 break;
1742 }
1743out:
1744 spin_unlock(&tree->lock);
1745}
1746
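/*
 * Illustrative sketch, not part of the original file: how a caller might
 * clamp the open-ended result described above.  The helper name, the
 * 'total_bytes' limit and the bit choice are assumptions for the example.
 */
static void example_find_unset_range(struct extent_io_tree *tree,
				     u64 total_bytes)
{
	u64 found_start;
	u64 found_end;

	find_first_clear_extent_bit(tree, 0, &found_start, &found_end,
				    EXTENT_DIRTY);
	/* An unbounded tail is reported as -1, so trim it ourselves */
	if (found_end == (u64)-1)
		found_end = total_bytes - 1;
}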
d352ac68
CM
1747/*
 1748 * find a contiguous range of bytes in the file marked as delalloc, no
 1749 * more than 'max_bytes'. 'start' and 'end' are used to return the range.
1750 *
3522e903 1751 * true is returned if we find something, false if nothing was in the tree
d352ac68 1752 */
083e75e7
JB
1753bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1754 u64 *end, u64 max_bytes,
1755 struct extent_state **cached_state)
d1310b2e
CM
1756{
1757 struct rb_node *node;
1758 struct extent_state *state;
1759 u64 cur_start = *start;
3522e903 1760 bool found = false;
d1310b2e
CM
1761 u64 total_bytes = 0;
1762
cad321ad 1763 spin_lock(&tree->lock);
c8b97818 1764
d1310b2e
CM
1765 /*
1766 * this search will find all the extents that end after
1767 * our range starts.
1768 */
80ea96b1 1769 node = tree_search(tree, cur_start);
2b114d1d 1770 if (!node) {
3522e903 1771 *end = (u64)-1;
d1310b2e
CM
1772 goto out;
1773 }
1774
d397712b 1775 while (1) {
d1310b2e 1776 state = rb_entry(node, struct extent_state, rb_node);
5b21f2ed
ZY
1777 if (found && (state->start != cur_start ||
1778 (state->state & EXTENT_BOUNDARY))) {
d1310b2e
CM
1779 goto out;
1780 }
1781 if (!(state->state & EXTENT_DELALLOC)) {
1782 if (!found)
1783 *end = state->end;
1784 goto out;
1785 }
c2a128d2 1786 if (!found) {
d1310b2e 1787 *start = state->start;
c2a128d2 1788 *cached_state = state;
b7ac31b7 1789 refcount_inc(&state->refs);
c2a128d2 1790 }
3522e903 1791 found = true;
d1310b2e
CM
1792 *end = state->end;
1793 cur_start = state->end + 1;
1794 node = rb_next(node);
d1310b2e 1795 total_bytes += state->end - state->start + 1;
7bf811a5 1796 if (total_bytes >= max_bytes)
573aecaf 1797 break;
573aecaf 1798 if (!node)
d1310b2e
CM
1799 break;
1800 }
1801out:
cad321ad 1802 spin_unlock(&tree->lock);
d1310b2e
CM
1803 return found;
1804}
1805
da2c7009
LB
1806static int __process_pages_contig(struct address_space *mapping,
1807 struct page *locked_page,
1808 pgoff_t start_index, pgoff_t end_index,
1809 unsigned long page_ops, pgoff_t *index_ret);
1810
143bede5
JM
1811static noinline void __unlock_for_delalloc(struct inode *inode,
1812 struct page *locked_page,
1813 u64 start, u64 end)
c8b97818 1814{
09cbfeaf
KS
1815 unsigned long index = start >> PAGE_SHIFT;
1816 unsigned long end_index = end >> PAGE_SHIFT;
c8b97818 1817
76c0021d 1818 ASSERT(locked_page);
c8b97818 1819 if (index == locked_page->index && end_index == index)
143bede5 1820 return;
c8b97818 1821
76c0021d
LB
1822 __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1823 PAGE_UNLOCK, NULL);
c8b97818
CM
1824}
1825
1826static noinline int lock_delalloc_pages(struct inode *inode,
1827 struct page *locked_page,
1828 u64 delalloc_start,
1829 u64 delalloc_end)
1830{
09cbfeaf 1831 unsigned long index = delalloc_start >> PAGE_SHIFT;
76c0021d 1832 unsigned long index_ret = index;
09cbfeaf 1833 unsigned long end_index = delalloc_end >> PAGE_SHIFT;
c8b97818 1834 int ret;
c8b97818 1835
76c0021d 1836 ASSERT(locked_page);
c8b97818
CM
1837 if (index == locked_page->index && index == end_index)
1838 return 0;
1839
76c0021d
LB
1840 ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1841 end_index, PAGE_LOCK, &index_ret);
1842 if (ret == -EAGAIN)
1843 __unlock_for_delalloc(inode, locked_page, delalloc_start,
1844 (u64)index_ret << PAGE_SHIFT);
c8b97818
CM
1845 return ret;
1846}
1847
1848/*
3522e903
LF
1849 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 1850 * more than @max_bytes. @start and @end are used to return the range.
c8b97818 1851 *
3522e903
LF
1852 * Return: true if we find something
1853 * false if nothing was in the tree
c8b97818 1854 */
ce9f967f 1855EXPORT_FOR_TESTS
3522e903 1856noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
294e30fe 1857 struct page *locked_page, u64 *start,
917aacec 1858 u64 *end)
c8b97818 1859{
9978059b 1860 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
917aacec 1861 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
c8b97818
CM
1862 u64 delalloc_start;
1863 u64 delalloc_end;
3522e903 1864 bool found;
9655d298 1865 struct extent_state *cached_state = NULL;
c8b97818
CM
1866 int ret;
1867 int loops = 0;
1868
1869again:
1870 /* step one, find a bunch of delalloc bytes starting at start */
1871 delalloc_start = *start;
1872 delalloc_end = 0;
083e75e7
JB
1873 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1874 max_bytes, &cached_state);
70b99e69 1875 if (!found || delalloc_end <= *start) {
c8b97818
CM
1876 *start = delalloc_start;
1877 *end = delalloc_end;
c2a128d2 1878 free_extent_state(cached_state);
3522e903 1879 return false;
c8b97818
CM
1880 }
1881
70b99e69
CM
1882 /*
1883 * start comes from the offset of locked_page. We have to lock
1884 * pages in order, so we can't process delalloc bytes before
1885 * locked_page
1886 */
d397712b 1887 if (delalloc_start < *start)
70b99e69 1888 delalloc_start = *start;
70b99e69 1889
c8b97818
CM
1890 /*
1891 * make sure to limit the number of pages we try to lock down
c8b97818 1892 */
7bf811a5
JB
1893 if (delalloc_end + 1 - delalloc_start > max_bytes)
1894 delalloc_end = delalloc_start + max_bytes - 1;
d397712b 1895
c8b97818
CM
1896 /* step two, lock all the pages after the page that has start */
1897 ret = lock_delalloc_pages(inode, locked_page,
1898 delalloc_start, delalloc_end);
9bfd61d9 1899 ASSERT(!ret || ret == -EAGAIN);
c8b97818
CM
1900 if (ret == -EAGAIN) {
 1901 /* some of the pages are gone, let's avoid looping by
1902 * shortening the size of the delalloc range we're searching
1903 */
9655d298 1904 free_extent_state(cached_state);
7d788742 1905 cached_state = NULL;
c8b97818 1906 if (!loops) {
09cbfeaf 1907 max_bytes = PAGE_SIZE;
c8b97818
CM
1908 loops = 1;
1909 goto again;
1910 } else {
3522e903 1911 found = false;
c8b97818
CM
1912 goto out_failed;
1913 }
1914 }
c8b97818
CM
1915
1916 /* step three, lock the state bits for the whole range */
ff13db41 1917 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
c8b97818
CM
1918
1919 /* then test to make sure it is all still delalloc */
1920 ret = test_range_bit(tree, delalloc_start, delalloc_end,
9655d298 1921 EXTENT_DELALLOC, 1, cached_state);
c8b97818 1922 if (!ret) {
9655d298 1923 unlock_extent_cached(tree, delalloc_start, delalloc_end,
e43bbe5e 1924 &cached_state);
c8b97818
CM
1925 __unlock_for_delalloc(inode, locked_page,
1926 delalloc_start, delalloc_end);
1927 cond_resched();
1928 goto again;
1929 }
9655d298 1930 free_extent_state(cached_state);
c8b97818
CM
1931 *start = delalloc_start;
1932 *end = delalloc_end;
1933out_failed:
1934 return found;
1935}
1936
da2c7009
LB
1937static int __process_pages_contig(struct address_space *mapping,
1938 struct page *locked_page,
1939 pgoff_t start_index, pgoff_t end_index,
1940 unsigned long page_ops, pgoff_t *index_ret)
c8b97818 1941{
873695b3 1942 unsigned long nr_pages = end_index - start_index + 1;
12e3360f 1943 unsigned long pages_processed = 0;
873695b3 1944 pgoff_t index = start_index;
c8b97818 1945 struct page *pages[16];
873695b3 1946 unsigned ret;
da2c7009 1947 int err = 0;
c8b97818 1948 int i;
771ed689 1949
da2c7009
LB
1950 if (page_ops & PAGE_LOCK) {
1951 ASSERT(page_ops == PAGE_LOCK);
1952 ASSERT(index_ret && *index_ret == start_index);
1953 }
1954
704de49d 1955 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
873695b3 1956 mapping_set_error(mapping, -EIO);
704de49d 1957
d397712b 1958 while (nr_pages > 0) {
873695b3 1959 ret = find_get_pages_contig(mapping, index,
5b050f04
CM
1960 min_t(unsigned long,
1961 nr_pages, ARRAY_SIZE(pages)), pages);
da2c7009
LB
1962 if (ret == 0) {
1963 /*
 1964 * We can only find nothing at @index if we are
 1965 * going to lock these pages.
1966 */
1967 ASSERT(page_ops & PAGE_LOCK);
49d4a334
LB
1968 err = -EAGAIN;
1969 goto out;
da2c7009 1970 }
8b62b72b 1971
da2c7009 1972 for (i = 0; i < ret; i++) {
c2790a2e 1973 if (page_ops & PAGE_SET_PRIVATE2)
8b62b72b
CM
1974 SetPagePrivate2(pages[i]);
1975
1d53c9e6 1976 if (locked_page && pages[i] == locked_page) {
09cbfeaf 1977 put_page(pages[i]);
12e3360f 1978 pages_processed++;
c8b97818
CM
1979 continue;
1980 }
6869b0a8 1981 if (page_ops & PAGE_START_WRITEBACK) {
c8b97818 1982 clear_page_dirty_for_io(pages[i]);
c8b97818 1983 set_page_writeback(pages[i]);
6869b0a8 1984 }
704de49d
FM
1985 if (page_ops & PAGE_SET_ERROR)
1986 SetPageError(pages[i]);
c2790a2e 1987 if (page_ops & PAGE_END_WRITEBACK)
c8b97818 1988 end_page_writeback(pages[i]);
c2790a2e 1989 if (page_ops & PAGE_UNLOCK)
771ed689 1990 unlock_page(pages[i]);
da2c7009
LB
1991 if (page_ops & PAGE_LOCK) {
1992 lock_page(pages[i]);
1993 if (!PageDirty(pages[i]) ||
1994 pages[i]->mapping != mapping) {
1995 unlock_page(pages[i]);
5909ca11
RK
1996 for (; i < ret; i++)
1997 put_page(pages[i]);
da2c7009
LB
1998 err = -EAGAIN;
1999 goto out;
2000 }
2001 }
09cbfeaf 2002 put_page(pages[i]);
12e3360f 2003 pages_processed++;
c8b97818
CM
2004 }
2005 nr_pages -= ret;
2006 index += ret;
2007 cond_resched();
2008 }
da2c7009
LB
2009out:
2010 if (err && index_ret)
12e3360f 2011 *index_ret = start_index + pages_processed - 1;
da2c7009 2012 return err;
c8b97818 2013}
c8b97818 2014
ad7ff17b 2015void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
74e9194a 2016 struct page *locked_page,
f97e27e9 2017 u32 clear_bits, unsigned long page_ops)
873695b3 2018{
ad7ff17b 2019 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
873695b3 2020
ad7ff17b 2021 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
873695b3 2022 start >> PAGE_SHIFT, end >> PAGE_SHIFT,
da2c7009 2023 page_ops, NULL);
873695b3
LB
2024}
2025
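/*
 * Illustrative call, not from the original file: a typical error-path
 * combination of clear_bits and page_ops for a delalloc range.  The helper
 * name and the exact flag combination are assumptions for the example.
 */
static void example_cleanup_delalloc(struct btrfs_inode *inode,
				     struct page *locked_page,
				     u64 start, u64 end)
{
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC,
				     PAGE_UNLOCK | PAGE_SET_ERROR |
				     PAGE_END_WRITEBACK);
}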
d352ac68
CM
2026/*
 2027 * count the number of bytes in the tree that have the given bit(s)
2028 * set. This can be fairly slow, except for EXTENT_DIRTY which is
2029 * cached. The total number found is returned.
2030 */
d1310b2e
CM
2031u64 count_range_bits(struct extent_io_tree *tree,
2032 u64 *start, u64 search_end, u64 max_bytes,
f97e27e9 2033 u32 bits, int contig)
d1310b2e
CM
2034{
2035 struct rb_node *node;
2036 struct extent_state *state;
2037 u64 cur_start = *start;
2038 u64 total_bytes = 0;
ec29ed5b 2039 u64 last = 0;
d1310b2e
CM
2040 int found = 0;
2041
fae7f21c 2042 if (WARN_ON(search_end <= cur_start))
d1310b2e 2043 return 0;
d1310b2e 2044
cad321ad 2045 spin_lock(&tree->lock);
d1310b2e
CM
2046 if (cur_start == 0 && bits == EXTENT_DIRTY) {
2047 total_bytes = tree->dirty_bytes;
2048 goto out;
2049 }
2050 /*
2051 * this search will find all the extents that end after
2052 * our range starts.
2053 */
80ea96b1 2054 node = tree_search(tree, cur_start);
d397712b 2055 if (!node)
d1310b2e 2056 goto out;
d1310b2e 2057
d397712b 2058 while (1) {
d1310b2e
CM
2059 state = rb_entry(node, struct extent_state, rb_node);
2060 if (state->start > search_end)
2061 break;
ec29ed5b
CM
2062 if (contig && found && state->start > last + 1)
2063 break;
2064 if (state->end >= cur_start && (state->state & bits) == bits) {
d1310b2e
CM
2065 total_bytes += min(search_end, state->end) + 1 -
2066 max(cur_start, state->start);
2067 if (total_bytes >= max_bytes)
2068 break;
2069 if (!found) {
af60bed2 2070 *start = max(cur_start, state->start);
d1310b2e
CM
2071 found = 1;
2072 }
ec29ed5b
CM
2073 last = state->end;
2074 } else if (contig && found) {
2075 break;
d1310b2e
CM
2076 }
2077 node = rb_next(node);
2078 if (!node)
2079 break;
2080 }
2081out:
cad321ad 2082 spin_unlock(&tree->lock);
d1310b2e
CM
2083 return total_bytes;
2084}
b2950863 2085
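/*
 * Minimal sketch, not part of the original file: total the EXTENT_DIRTY
 * bytes in a whole tree with count_range_bits().  The helper name is an
 * assumption for the example.
 */
static u64 example_count_dirty_bytes(struct extent_io_tree *tree)
{
	u64 first = 0;

	/* contig == 0: the counted ranges do not have to be contiguous */
	return count_range_bits(tree, &first, (u64)-1, (u64)-1,
				EXTENT_DIRTY, 0);
}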
d352ac68
CM
2086/*
2087 * set the private field for a given byte offset in the tree. If there isn't
 2088 * an extent_state there already, this does nothing and -ENOENT is returned.
2089 */
b3f167aa
JB
2090int set_state_failrec(struct extent_io_tree *tree, u64 start,
2091 struct io_failure_record *failrec)
d1310b2e
CM
2092{
2093 struct rb_node *node;
2094 struct extent_state *state;
2095 int ret = 0;
2096
cad321ad 2097 spin_lock(&tree->lock);
d1310b2e
CM
2098 /*
2099 * this search will find all the extents that end after
2100 * our range starts.
2101 */
80ea96b1 2102 node = tree_search(tree, start);
2b114d1d 2103 if (!node) {
d1310b2e
CM
2104 ret = -ENOENT;
2105 goto out;
2106 }
2107 state = rb_entry(node, struct extent_state, rb_node);
2108 if (state->start != start) {
2109 ret = -ENOENT;
2110 goto out;
2111 }
47dc196a 2112 state->failrec = failrec;
d1310b2e 2113out:
cad321ad 2114 spin_unlock(&tree->lock);
d1310b2e
CM
2115 return ret;
2116}
2117
2279a270 2118struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
d1310b2e
CM
2119{
2120 struct rb_node *node;
2121 struct extent_state *state;
2279a270 2122 struct io_failure_record *failrec;
d1310b2e 2123
cad321ad 2124 spin_lock(&tree->lock);
d1310b2e
CM
2125 /*
2126 * this search will find all the extents that end after
2127 * our range starts.
2128 */
80ea96b1 2129 node = tree_search(tree, start);
2b114d1d 2130 if (!node) {
2279a270 2131 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2132 goto out;
2133 }
2134 state = rb_entry(node, struct extent_state, rb_node);
2135 if (state->start != start) {
2279a270 2136 failrec = ERR_PTR(-ENOENT);
d1310b2e
CM
2137 goto out;
2138 }
2279a270
NB
2139
2140 failrec = state->failrec;
d1310b2e 2141out:
cad321ad 2142 spin_unlock(&tree->lock);
2279a270 2143 return failrec;
d1310b2e
CM
2144}
2145
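/*
 * Minimal sketch, not part of the original file, pairing the two helpers
 * above: store a failure record at @start, then look it back up.  The
 * helper name is an assumption for the example.
 */
static int example_failrec_roundtrip(struct extent_io_tree *tree, u64 start,
				     struct io_failure_record *rec)
{
	struct io_failure_record *found;
	int ret;

	ret = set_state_failrec(tree, start, rec);
	if (ret)
		return ret;

	found = get_state_failrec(tree, start);
	if (IS_ERR(found))
		return PTR_ERR(found);
	return 0;
}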
2146/*
2147 * searches a range in the state tree for a given mask.
70dec807 2148 * If 'filled' == 1, this returns 1 only if every extent in the range
d1310b2e
CM
2149 * has the bits set. Otherwise, 1 is returned if any bit in the
2150 * range is found set.
2151 */
2152int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
f97e27e9 2153 u32 bits, int filled, struct extent_state *cached)
d1310b2e
CM
2154{
2155 struct extent_state *state = NULL;
2156 struct rb_node *node;
2157 int bitset = 0;
d1310b2e 2158
cad321ad 2159 spin_lock(&tree->lock);
27a3507d 2160 if (cached && extent_state_in_tree(cached) && cached->start <= start &&
df98b6e2 2161 cached->end > start)
9655d298
CM
2162 node = &cached->rb_node;
2163 else
2164 node = tree_search(tree, start);
d1310b2e
CM
2165 while (node && start <= end) {
2166 state = rb_entry(node, struct extent_state, rb_node);
2167
2168 if (filled && state->start > start) {
2169 bitset = 0;
2170 break;
2171 }
2172
2173 if (state->start > end)
2174 break;
2175
2176 if (state->state & bits) {
2177 bitset = 1;
2178 if (!filled)
2179 break;
2180 } else if (filled) {
2181 bitset = 0;
2182 break;
2183 }
46562cec
CM
2184
2185 if (state->end == (u64)-1)
2186 break;
2187
d1310b2e
CM
2188 start = state->end + 1;
2189 if (start > end)
2190 break;
2191 node = rb_next(node);
2192 if (!node) {
2193 if (filled)
2194 bitset = 0;
2195 break;
2196 }
2197 }
cad321ad 2198 spin_unlock(&tree->lock);
d1310b2e
CM
2199 return bitset;
2200}
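/*
 * Minimal sketch, not part of the original file, contrasting the two
 * 'filled' modes documented above.  The helper names are assumptions.
 */
static bool example_range_fully_delalloc(struct extent_io_tree *tree,
					 u64 start, u64 end)
{
	/* filled == 1: every byte of [start, end] must have the bit set */
	return test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, NULL);
}

static bool example_range_any_locked(struct extent_io_tree *tree,
				     u64 start, u64 end)
{
	/* filled == 0: a single locked byte in [start, end] is enough */
	return test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL);
}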
d1310b2e
CM
2201
2202/*
2203 * helper function to set a given page up to date if all the
2204 * extents in the tree for that page are up to date
2205 */
143bede5 2206static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
d1310b2e 2207{
4eee4fa4 2208 u64 start = page_offset(page);
09cbfeaf 2209 u64 end = start + PAGE_SIZE - 1;
9655d298 2210 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
d1310b2e 2211 SetPageUptodate(page);
d1310b2e
CM
2212}
2213
7870d082
JB
2214int free_io_failure(struct extent_io_tree *failure_tree,
2215 struct extent_io_tree *io_tree,
2216 struct io_failure_record *rec)
4a54c8c1
JS
2217{
2218 int ret;
2219 int err = 0;
4a54c8c1 2220
47dc196a 2221 set_state_failrec(failure_tree, rec->start, NULL);
4a54c8c1
JS
2222 ret = clear_extent_bits(failure_tree, rec->start,
2223 rec->start + rec->len - 1,
91166212 2224 EXTENT_LOCKED | EXTENT_DIRTY);
4a54c8c1
JS
2225 if (ret)
2226 err = ret;
2227
7870d082 2228 ret = clear_extent_bits(io_tree, rec->start,
53b381b3 2229 rec->start + rec->len - 1,
91166212 2230 EXTENT_DAMAGED);
53b381b3
DW
2231 if (ret && !err)
2232 err = ret;
4a54c8c1
JS
2233
2234 kfree(rec);
2235 return err;
2236}
2237
4a54c8c1
JS
2238/*
2239 * this bypasses the standard btrfs submit functions deliberately, as
2240 * the standard behavior is to write all copies in a raid setup. here we only
2241 * want to write the one bad copy. so we do the mapping for ourselves and issue
2242 * submit_bio directly.
3ec706c8 2243 * to avoid any synchronization issues, wait for the data after writing, which
4a54c8c1
JS
2244 * actually prevents the read that triggered the error from finishing.
2245 * currently, there can be no more than two copies of every data bit. thus,
2246 * exactly one rewrite is required.
2247 */
6ec656bc
JB
2248int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2249 u64 length, u64 logical, struct page *page,
2250 unsigned int pg_offset, int mirror_num)
4a54c8c1
JS
2251{
2252 struct bio *bio;
2253 struct btrfs_device *dev;
4a54c8c1
JS
2254 u64 map_length = 0;
2255 u64 sector;
2256 struct btrfs_bio *bbio = NULL;
2257 int ret;
2258
1751e8a6 2259 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
4a54c8c1
JS
2260 BUG_ON(!mirror_num);
2261
c5e4c3d7 2262 bio = btrfs_io_bio_alloc(1);
4f024f37 2263 bio->bi_iter.bi_size = 0;
4a54c8c1
JS
2264 map_length = length;
2265
b5de8d0d
FM
2266 /*
2267 * Avoid races with device replace and make sure our bbio has devices
2268 * associated to its stripes that don't go away while we are doing the
2269 * read repair operation.
2270 */
2271 btrfs_bio_counter_inc_blocked(fs_info);
e4ff5fb5 2272 if (btrfs_is_parity_mirror(fs_info, logical, length)) {
c725328c
LB
2273 /*
2274 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2275 * to update all raid stripes, but here we just want to correct
2276 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2277 * stripe's dev and sector.
2278 */
2279 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2280 &map_length, &bbio, 0);
2281 if (ret) {
2282 btrfs_bio_counter_dec(fs_info);
2283 bio_put(bio);
2284 return -EIO;
2285 }
2286 ASSERT(bbio->mirror_num == 1);
2287 } else {
2288 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2289 &map_length, &bbio, mirror_num);
2290 if (ret) {
2291 btrfs_bio_counter_dec(fs_info);
2292 bio_put(bio);
2293 return -EIO;
2294 }
2295 BUG_ON(mirror_num != bbio->mirror_num);
4a54c8c1 2296 }
c725328c
LB
2297
2298 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
4f024f37 2299 bio->bi_iter.bi_sector = sector;
c725328c 2300 dev = bbio->stripes[bbio->mirror_num - 1].dev;
6e9606d2 2301 btrfs_put_bbio(bbio);
ebbede42
AJ
2302 if (!dev || !dev->bdev ||
2303 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
b5de8d0d 2304 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2305 bio_put(bio);
2306 return -EIO;
2307 }
74d46992 2308 bio_set_dev(bio, dev->bdev);
70fd7614 2309 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ffdd2018 2310 bio_add_page(bio, page, length, pg_offset);
4a54c8c1 2311
4e49ea4a 2312 if (btrfsic_submit_bio_wait(bio)) {
4a54c8c1 2313 /* try to remap that extent elsewhere? */
b5de8d0d 2314 btrfs_bio_counter_dec(fs_info);
4a54c8c1 2315 bio_put(bio);
442a4f63 2316 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4a54c8c1
JS
2317 return -EIO;
2318 }
2319
b14af3b4
DS
2320 btrfs_info_rl_in_rcu(fs_info,
2321 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
6ec656bc 2322 ino, start,
1203b681 2323 rcu_str_deref(dev->name), sector);
b5de8d0d 2324 btrfs_bio_counter_dec(fs_info);
4a54c8c1
JS
2325 bio_put(bio);
2326 return 0;
2327}
2328
2b48966a 2329int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
ea466794 2330{
20a1fbf9 2331 struct btrfs_fs_info *fs_info = eb->fs_info;
ea466794 2332 u64 start = eb->start;
cc5e31a4 2333 int i, num_pages = num_extent_pages(eb);
d95603b2 2334 int ret = 0;
ea466794 2335
bc98a42c 2336 if (sb_rdonly(fs_info->sb))
908960c6
ID
2337 return -EROFS;
2338
ea466794 2339 for (i = 0; i < num_pages; i++) {
fb85fc9a 2340 struct page *p = eb->pages[i];
1203b681 2341
6ec656bc 2342 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
1203b681 2343 start - page_offset(p), mirror_num);
ea466794
JB
2344 if (ret)
2345 break;
09cbfeaf 2346 start += PAGE_SIZE;
ea466794
JB
2347 }
2348
2349 return ret;
2350}
2351
4a54c8c1
JS
2352/*
2353 * each time an IO finishes, we do a fast check in the IO failure tree
2354 * to see if we need to process or clean up an io_failure_record
2355 */
7870d082
JB
2356int clean_io_failure(struct btrfs_fs_info *fs_info,
2357 struct extent_io_tree *failure_tree,
2358 struct extent_io_tree *io_tree, u64 start,
2359 struct page *page, u64 ino, unsigned int pg_offset)
4a54c8c1
JS
2360{
2361 u64 private;
4a54c8c1 2362 struct io_failure_record *failrec;
4a54c8c1
JS
2363 struct extent_state *state;
2364 int num_copies;
4a54c8c1 2365 int ret;
4a54c8c1
JS
2366
2367 private = 0;
7870d082
JB
2368 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2369 EXTENT_DIRTY, 0);
4a54c8c1
JS
2370 if (!ret)
2371 return 0;
2372
2279a270
NB
2373 failrec = get_state_failrec(failure_tree, start);
2374 if (IS_ERR(failrec))
4a54c8c1
JS
2375 return 0;
2376
4a54c8c1
JS
2377 BUG_ON(!failrec->this_mirror);
2378
2379 if (failrec->in_validation) {
2380 /* there was no real error, just free the record */
ab8d0fc4
JM
2381 btrfs_debug(fs_info,
2382 "clean_io_failure: freeing dummy error at %llu",
2383 failrec->start);
4a54c8c1
JS
2384 goto out;
2385 }
bc98a42c 2386 if (sb_rdonly(fs_info->sb))
908960c6 2387 goto out;
4a54c8c1 2388
7870d082
JB
2389 spin_lock(&io_tree->lock);
2390 state = find_first_extent_bit_state(io_tree,
4a54c8c1
JS
2391 failrec->start,
2392 EXTENT_LOCKED);
7870d082 2393 spin_unlock(&io_tree->lock);
4a54c8c1 2394
883d0de4
MX
2395 if (state && state->start <= failrec->start &&
2396 state->end >= failrec->start + failrec->len - 1) {
3ec706c8
SB
2397 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2398 failrec->len);
4a54c8c1 2399 if (num_copies > 1) {
7870d082
JB
2400 repair_io_failure(fs_info, ino, start, failrec->len,
2401 failrec->logical, page, pg_offset,
2402 failrec->failed_mirror);
4a54c8c1
JS
2403 }
2404 }
2405
2406out:
7870d082 2407 free_io_failure(failure_tree, io_tree, failrec);
4a54c8c1 2408
454ff3de 2409 return 0;
4a54c8c1
JS
2410}
2411
f612496b
MX
2412/*
 2413 * Can be called when
 2414 * - holding the extent lock
 2415 * - under an ordered extent
 2416 * - the inode is being freed
2417 */
7ab7956e 2418void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
f612496b 2419{
7ab7956e 2420 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
f612496b
MX
2421 struct io_failure_record *failrec;
2422 struct extent_state *state, *next;
2423
2424 if (RB_EMPTY_ROOT(&failure_tree->state))
2425 return;
2426
2427 spin_lock(&failure_tree->lock);
2428 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2429 while (state) {
2430 if (state->start > end)
2431 break;
2432
2433 ASSERT(state->end <= end);
2434
2435 next = next_state(state);
2436
47dc196a 2437 failrec = state->failrec;
f612496b
MX
2438 free_extent_state(state);
2439 kfree(failrec);
2440
2441 state = next;
2442 }
2443 spin_unlock(&failure_tree->lock);
2444}
2445
3526302f
NB
2446static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2447 u64 start, u64 end)
4a54c8c1 2448{
ab8d0fc4 2449 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2450 struct io_failure_record *failrec;
4a54c8c1 2451 struct extent_map *em;
4a54c8c1
JS
2452 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2453 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2454 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4a54c8c1 2455 int ret;
4a54c8c1
JS
2456 u64 logical;
2457
2279a270 2458 failrec = get_state_failrec(failure_tree, start);
3526302f 2459 if (!IS_ERR(failrec)) {
ab8d0fc4
JM
2460 btrfs_debug(fs_info,
2461 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2462 failrec->logical, failrec->start, failrec->len,
2463 failrec->in_validation);
4a54c8c1
JS
2464 /*
2465 * when data can be on disk more than twice, add to failrec here
2466 * (e.g. with a list for failed_mirror) to make
2467 * clean_io_failure() clean all those errors at once.
2468 */
3526302f
NB
2469
2470 return failrec;
4a54c8c1 2471 }
2fe6303e 2472
3526302f
NB
2473 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2474 if (!failrec)
2475 return ERR_PTR(-ENOMEM);
2fe6303e 2476
3526302f
NB
2477 failrec->start = start;
2478 failrec->len = end - start + 1;
2479 failrec->this_mirror = 0;
2480 failrec->bio_flags = 0;
2481 failrec->in_validation = 0;
2482
2483 read_lock(&em_tree->lock);
2484 em = lookup_extent_mapping(em_tree, start, failrec->len);
2485 if (!em) {
2486 read_unlock(&em_tree->lock);
2487 kfree(failrec);
2488 return ERR_PTR(-EIO);
2489 }
2490
2491 if (em->start > start || em->start + em->len <= start) {
2492 free_extent_map(em);
2493 em = NULL;
2494 }
2495 read_unlock(&em_tree->lock);
2496 if (!em) {
2497 kfree(failrec);
2498 return ERR_PTR(-EIO);
2499 }
2500
2501 logical = start - em->start;
2502 logical = em->block_start + logical;
2503 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2504 logical = em->block_start;
2505 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2506 extent_set_compress_type(&failrec->bio_flags, em->compress_type);
2507 }
2508
2509 btrfs_debug(fs_info,
2510 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2511 logical, start, failrec->len);
2512
2513 failrec->logical = logical;
2514 free_extent_map(em);
2515
2516 /* Set the bits in the private failure tree */
2517 ret = set_extent_bits(failure_tree, start, end,
2518 EXTENT_LOCKED | EXTENT_DIRTY);
2519 if (ret >= 0) {
2520 ret = set_state_failrec(failure_tree, start, failrec);
2521 /* Set the bits in the inode's tree */
2522 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
2523 } else if (ret < 0) {
2524 kfree(failrec);
2525 return ERR_PTR(ret);
2526 }
2527
2528 return failrec;
2fe6303e
MX
2529}
2530
ce06d3ec
OS
2531static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
2532 struct io_failure_record *failrec,
2533 int failed_mirror)
2fe6303e 2534{
ab8d0fc4 2535 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e
MX
2536 int num_copies;
2537
ab8d0fc4 2538 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
4a54c8c1
JS
2539 if (num_copies == 1) {
2540 /*
2541 * we only have a single copy of the data, so don't bother with
2542 * all the retry and error correction code that follows. no
2543 * matter what the error is, it is very likely to persist.
2544 */
ab8d0fc4
JM
2545 btrfs_debug(fs_info,
2546 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2547 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2548 return false;
4a54c8c1
JS
2549 }
2550
4a54c8c1
JS
2551 /*
2552 * there are two premises:
2553 * a) deliver good data to the caller
2554 * b) correct the bad sectors on disk
2555 */
c7333972 2556 if (needs_validation) {
4a54c8c1
JS
2557 /*
2558 * to fulfill b), we need to know the exact failing sectors, as
2559 * we don't want to rewrite any more than the failed ones. thus,
2560 * we need separate read requests for the failed bio
2561 *
2562 * if the following BUG_ON triggers, our validation request got
2563 * merged. we need separate requests for our algorithm to work.
2564 */
2565 BUG_ON(failrec->in_validation);
2566 failrec->in_validation = 1;
2567 failrec->this_mirror = failed_mirror;
4a54c8c1
JS
2568 } else {
2569 /*
2570 * we're ready to fulfill a) and b) alongside. get a good copy
2571 * of the failed sector and if we succeed, we have setup
2572 * everything for repair_io_failure to do the rest for us.
2573 */
2574 if (failrec->in_validation) {
2575 BUG_ON(failrec->this_mirror != failed_mirror);
2576 failrec->in_validation = 0;
2577 failrec->this_mirror = 0;
2578 }
2579 failrec->failed_mirror = failed_mirror;
2580 failrec->this_mirror++;
2581 if (failrec->this_mirror == failed_mirror)
2582 failrec->this_mirror++;
4a54c8c1
JS
2583 }
2584
facc8a22 2585 if (failrec->this_mirror > num_copies) {
ab8d0fc4
JM
2586 btrfs_debug(fs_info,
2587 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2588 num_copies, failrec->this_mirror, failed_mirror);
c3cfb656 2589 return false;
4a54c8c1
JS
2590 }
2591
c3cfb656 2592 return true;
2fe6303e
MX
2593}
2594
c7333972 2595static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
2fe6303e 2596{
c7333972 2597 u64 len = 0;
77d5d689 2598 const u32 blocksize = inode->i_sb->s_blocksize;
2fe6303e 2599
f337bd74
OS
2600 /*
2601 * If bi_status is BLK_STS_OK, then this was a checksum error, not an
2602 * I/O error. In this case, we already know exactly which sector was
2603 * bad, so we don't need to validate.
2604 */
2605 if (bio->bi_status == BLK_STS_OK)
2606 return false;
4a54c8c1 2607
c7333972
OS
2608 /*
2609 * We need to validate each sector individually if the failed I/O was
2610 * for multiple sectors.
77d5d689
OS
2611 *
2612 * There are a few possible bios that can end up here:
2613 * 1. A buffered read bio, which is not cloned.
2614 * 2. A direct I/O read bio, which is cloned.
2615 * 3. A (buffered or direct) repair bio, which is not cloned.
2616 *
2617 * For cloned bios (case 2), we can get the size from
2618 * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
2619 * it from the bvecs.
c7333972 2620 */
77d5d689
OS
2621 if (bio_flagged(bio, BIO_CLONED)) {
2622 if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
c7333972 2623 return true;
77d5d689
OS
2624 } else {
2625 struct bio_vec *bvec;
2626 int i;
facc8a22 2627
77d5d689
OS
2628 bio_for_each_bvec_all(bvec, bio, i) {
2629 len += bvec->bv_len;
2630 if (len > blocksize)
2631 return true;
2632 }
facc8a22 2633 }
c7333972 2634 return false;
2fe6303e
MX
2635}
2636
77d5d689 2637blk_status_t btrfs_submit_read_repair(struct inode *inode,
7ffd27e3 2638 struct bio *failed_bio, u32 bio_offset,
77d5d689
OS
2639 struct page *page, unsigned int pgoff,
2640 u64 start, u64 end, int failed_mirror,
2641 submit_bio_hook_t *submit_bio_hook)
2fe6303e
MX
2642{
2643 struct io_failure_record *failrec;
77d5d689 2644 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2fe6303e 2645 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
7870d082 2646 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
77d5d689 2647 struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
7ffd27e3 2648 const int icsum = bio_offset >> fs_info->sectorsize_bits;
c7333972 2649 bool need_validation;
77d5d689
OS
2650 struct bio *repair_bio;
2651 struct btrfs_io_bio *repair_io_bio;
4e4cbee9 2652 blk_status_t status;
2fe6303e 2653
77d5d689
OS
2654 btrfs_debug(fs_info,
2655 "repair read error: read error at %llu", start);
2fe6303e 2656
1f7ad75b 2657 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2fe6303e 2658
3526302f
NB
2659 failrec = btrfs_get_io_failure_record(inode, start, end);
2660 if (IS_ERR(failrec))
2661 return errno_to_blk_status(PTR_ERR(failrec));
2fe6303e 2662
c7333972 2663 need_validation = btrfs_io_needs_validation(inode, failed_bio);
2fe6303e 2664
c7333972 2665 if (!btrfs_check_repairable(inode, need_validation, failrec,
c3cfb656 2666 failed_mirror)) {
7870d082 2667 free_io_failure(failure_tree, tree, failrec);
77d5d689 2668 return BLK_STS_IOERR;
2fe6303e
MX
2669 }
2670
77d5d689
OS
2671 repair_bio = btrfs_io_bio_alloc(1);
2672 repair_io_bio = btrfs_io_bio(repair_bio);
2673 repair_bio->bi_opf = REQ_OP_READ;
c7333972 2674 if (need_validation)
77d5d689
OS
2675 repair_bio->bi_opf |= REQ_FAILFAST_DEV;
2676 repair_bio->bi_end_io = failed_bio->bi_end_io;
2677 repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2678 repair_bio->bi_private = failed_bio->bi_private;
2fe6303e 2679
77d5d689 2680 if (failed_io_bio->csum) {
223486c2 2681 const u32 csum_size = fs_info->csum_size;
77d5d689
OS
2682
2683 repair_io_bio->csum = repair_io_bio->csum_inline;
2684 memcpy(repair_io_bio->csum,
2685 failed_io_bio->csum + csum_size * icsum, csum_size);
2686 }
2fe6303e 2687
77d5d689
OS
2688 bio_add_page(repair_bio, page, failrec->len, pgoff);
2689 repair_io_bio->logical = failrec->start;
2690 repair_io_bio->iter = repair_bio->bi_iter;
4a54c8c1 2691
ab8d0fc4 2692 btrfs_debug(btrfs_sb(inode->i_sb),
77d5d689
OS
2693"repair read error: submitting new read to mirror %d, in_validation=%d",
2694 failrec->this_mirror, failrec->in_validation);
4a54c8c1 2695
77d5d689
OS
2696 status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
2697 failrec->bio_flags);
4e4cbee9 2698 if (status) {
7870d082 2699 free_io_failure(failure_tree, tree, failrec);
77d5d689 2700 bio_put(repair_bio);
6c387ab2 2701 }
77d5d689 2702 return status;
4a54c8c1
JS
2703}
2704
d1310b2e
CM
2705/* lots and lots of room for performance fixes in the end_bio funcs */
2706
b5227c07 2707void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
87826df0
JM
2708{
2709 int uptodate = (err == 0);
3e2426bd 2710 int ret = 0;
87826df0 2711
c629732d 2712 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
87826df0 2713
87826df0 2714 if (!uptodate) {
87826df0
JM
2715 ClearPageUptodate(page);
2716 SetPageError(page);
bff5baf8 2717 ret = err < 0 ? err : -EIO;
5dca6eea 2718 mapping_set_error(page->mapping, ret);
87826df0 2719 }
87826df0
JM
2720}
2721
d1310b2e
CM
2722/*
2723 * after a writepage IO is done, we need to:
2724 * clear the uptodate bits on error
2725 * clear the writeback bits in the extent tree for this IO
2726 * end_page_writeback if the page has no more pending IO
2727 *
2728 * Scheduling is not allowed, so the extent state tree is expected
2729 * to have one and only one object corresponding to this IO.
2730 */
4246a0b6 2731static void end_bio_extent_writepage(struct bio *bio)
d1310b2e 2732{
4e4cbee9 2733 int error = blk_status_to_errno(bio->bi_status);
2c30c71b 2734 struct bio_vec *bvec;
d1310b2e
CM
2735 u64 start;
2736 u64 end;
6dc4f100 2737 struct bvec_iter_all iter_all;
d1310b2e 2738
c09abff8 2739 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2740 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2741 struct page *page = bvec->bv_page;
0b246afa
JM
2742 struct inode *inode = page->mapping->host;
2743 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
902b22f3 2744
17a5adcc
AO
2745 /* We always issue full-page reads, but if some block
2746 * in a page fails to read, blk_update_request() will
2747 * advance bv_offset and adjust bv_len to compensate.
2748 * Print a warning for nonzero offsets, and an error
2749 * if they don't add up to a full page. */
09cbfeaf
KS
2750 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2751 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
0b246afa 2752 btrfs_err(fs_info,
efe120a0
FH
2753 "partial page write in btrfs with offset %u and length %u",
2754 bvec->bv_offset, bvec->bv_len);
2755 else
0b246afa 2756 btrfs_info(fs_info,
5d163e0e 2757 "incomplete page write in btrfs with offset %u and length %u",
efe120a0
FH
2758 bvec->bv_offset, bvec->bv_len);
2759 }
d1310b2e 2760
17a5adcc
AO
2761 start = page_offset(page);
2762 end = start + bvec->bv_offset + bvec->bv_len - 1;
d1310b2e 2763
4e4cbee9 2764 end_extent_writepage(page, error, start, end);
17a5adcc 2765 end_page_writeback(page);
2c30c71b 2766 }
2b1f55b0 2767
d1310b2e 2768 bio_put(bio);
d1310b2e
CM
2769}
2770
94e8c95c
QW
2771/*
2772 * Record previously processed extent range
2773 *
 2774 * This lets endio_readpage_release_extent() handle a full extent range at
 2775 * once, reducing the number of extent io operations.
2776 */
2777struct processed_extent {
2778 struct btrfs_inode *inode;
2779 /* Start of the range in @inode */
2780 u64 start;
2e626e56 2781 /* End of the range in @inode */
94e8c95c
QW
2782 u64 end;
2783 bool uptodate;
2784};
2785
2786/*
2787 * Try to release processed extent range
2788 *
2789 * May not release the extent range right now if the current range is
2790 * contiguous to processed extent.
2791 *
 2792 * Will release the processed extent when @inode or @uptodate changes, or when
 2793 * the range is no longer contiguous to the processed range.
2794 *
2795 * Passing @inode == NULL will force processed extent to be released.
2796 */
2797static void endio_readpage_release_extent(struct processed_extent *processed,
2798 struct btrfs_inode *inode, u64 start, u64 end,
2799 bool uptodate)
883d0de4
MX
2800{
2801 struct extent_state *cached = NULL;
94e8c95c
QW
2802 struct extent_io_tree *tree;
2803
2804 /* The first extent, initialize @processed */
2805 if (!processed->inode)
2806 goto update;
883d0de4 2807
94e8c95c
QW
2808 /*
 2809 * Contiguous to the processed extent, just update the end.
2810 *
2811 * Several things to notice:
2812 *
2813 * - bio can be merged as long as on-disk bytenr is contiguous
 2814 *   This means we can have pages belonging to other inodes, so we need to
 2815 *   check if the inode still matches.
 2816 * - a bvec can contain a range beyond the current page for a multi-page bvec
 2817 *   Thus we need the processed->end + 1 >= start check
2818 */
2819 if (processed->inode == inode && processed->uptodate == uptodate &&
2820 processed->end + 1 >= start && end >= processed->end) {
2821 processed->end = end;
2822 return;
2823 }
2824
2825 tree = &processed->inode->io_tree;
2826 /*
 2827 * The current range is not contiguous to the processed range, so release
 2828 * the processed range now.
2829 */
2830 if (processed->uptodate && tree->track_uptodate)
2831 set_extent_uptodate(tree, processed->start, processed->end,
2832 &cached, GFP_ATOMIC);
2833 unlock_extent_cached_atomic(tree, processed->start, processed->end,
2834 &cached);
2835
2836update:
2837 /* Update processed to current range */
2838 processed->inode = inode;
2839 processed->start = start;
2840 processed->end = end;
2841 processed->uptodate = uptodate;
883d0de4
MX
2842}
2843
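/*
 * Minimal sketch, not part of the original file, of the calling pattern
 * endio_readpage_release_extent() expects: accumulate contiguous ranges,
 * then flush with inode == NULL.  The helper name and the 4K range sizes
 * are assumptions for the example.
 */
static void example_release_pattern(struct btrfs_inode *inode)
{
	struct processed_extent processed = { 0 };

	/* Two contiguous, uptodate ranges of the same inode get merged */
	endio_readpage_release_extent(&processed, inode, 0, 4095, true);
	endio_readpage_release_extent(&processed, inode, 4096, 8191, true);
	/* Passing a NULL inode forces the accumulated range to be released */
	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
}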
92082d40
QW
2844static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2845{
2846 ASSERT(PageLocked(page));
2847 if (fs_info->sectorsize == PAGE_SIZE)
2848 return;
2849
2850 ASSERT(PagePrivate(page));
2851 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2852}
2853
2854static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
e09caaf9 2855{
4325cb22
QW
2856 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2857
2858 ASSERT(page_offset(page) <= start &&
2859 start + len <= page_offset(page) + PAGE_SIZE);
2860
e09caaf9 2861 if (uptodate) {
4325cb22 2862 btrfs_page_set_uptodate(fs_info, page, start, len);
e09caaf9 2863 } else {
4325cb22
QW
2864 btrfs_page_clear_uptodate(fs_info, page, start, len);
2865 btrfs_page_set_error(fs_info, page, start, len);
e09caaf9 2866 }
4325cb22
QW
2867
2868 if (fs_info->sectorsize == PAGE_SIZE)
2869 unlock_page(page);
92082d40
QW
2870 else if (is_data_inode(page->mapping->host))
2871 /*
2872 * For subpage data, unlock the page if we're the last reader.
2873 * For subpage metadata, page lock is not utilized for read.
2874 */
2875 btrfs_subpage_end_reader(fs_info, page, start, len);
e09caaf9
QW
2876}
2877
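/*
 * Minimal sketch, not part of the original file, of the pairing the two
 * helpers above expect on the read path.  The helper name is an assumption
 * for the example.
 */
static void example_page_read_bookkeeping(struct btrfs_fs_info *fs_info,
					  struct page *page)
{
	begin_page_read(fs_info, page);
	/* ... submit the read and wait for it to complete ... */
	end_page_read(page, true, page_offset(page), PAGE_SIZE);
}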
d1310b2e
CM
2878/*
2879 * after a readpage IO is done, we need to:
2880 * clear the uptodate bits on error
2881 * set the uptodate bits if things worked
2882 * set the page up to date if all extents in the tree are uptodate
2883 * clear the lock bit in the extent tree
2884 * unlock the page if there are no other extents locked for it
2885 *
2886 * Scheduling is not allowed, so the extent state tree is expected
2887 * to have one and only one object corresponding to this IO.
2888 */
4246a0b6 2889static void end_bio_extent_readpage(struct bio *bio)
d1310b2e 2890{
2c30c71b 2891 struct bio_vec *bvec;
4e4cbee9 2892 int uptodate = !bio->bi_status;
facc8a22 2893 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7870d082 2894 struct extent_io_tree *tree, *failure_tree;
94e8c95c 2895 struct processed_extent processed = { 0 };
7ffd27e3
QW
2896 /*
 2897 * The offset from the beginning of the bio; since one bio can never be
 2898 * larger than UINT_MAX, a u32 is enough here.
2899 */
2900 u32 bio_offset = 0;
5cf1ab56 2901 int mirror;
d1310b2e 2902 int ret;
6dc4f100 2903 struct bvec_iter_all iter_all;
d1310b2e 2904
c09abff8 2905 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 2906 bio_for_each_segment_all(bvec, bio, iter_all) {
d1310b2e 2907 struct page *page = bvec->bv_page;
a71754fc 2908 struct inode *inode = page->mapping->host;
ab8d0fc4 2909 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7ffd27e3
QW
2910 const u32 sectorsize = fs_info->sectorsize;
2911 u64 start;
2912 u64 end;
2913 u32 len;
507903b8 2914
ab8d0fc4
JM
2915 btrfs_debug(fs_info,
2916 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
1201b58b 2917 bio->bi_iter.bi_sector, bio->bi_status,
ab8d0fc4 2918 io_bio->mirror_num);
a71754fc 2919 tree = &BTRFS_I(inode)->io_tree;
7870d082 2920 failure_tree = &BTRFS_I(inode)->io_failure_tree;
902b22f3 2921
8b8bbd46
QW
2922 /*
2923 * We always issue full-sector reads, but if some block in a
2924 * page fails to read, blk_update_request() will advance
 2925 * bv_offset and adjust bv_len to compensate. Print an error
 2926 * for unaligned offsets, and an info message if they don't add up to
2927 * a full sector.
2928 */
2929 if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2930 btrfs_err(fs_info,
2931 "partial page read in btrfs with offset %u and length %u",
2932 bvec->bv_offset, bvec->bv_len);
2933 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
2934 sectorsize))
2935 btrfs_info(fs_info,
2936 "incomplete page read with offset %u and length %u",
2937 bvec->bv_offset, bvec->bv_len);
2938
2939 start = page_offset(page) + bvec->bv_offset;
2940 end = start + bvec->bv_len - 1;
facc8a22 2941 len = bvec->bv_len;
d1310b2e 2942
9be3395b 2943 mirror = io_bio->mirror_num;
78e62c02 2944 if (likely(uptodate)) {
be17b3af 2945 if (is_data_inode(inode))
7ffd27e3
QW
2946 ret = btrfs_verify_data_csum(io_bio,
2947 bio_offset, page, start, end,
2948 mirror);
9a446d6a
NB
2949 else
2950 ret = btrfs_validate_metadata_buffer(io_bio,
8e1dc982 2951 page, start, end, mirror);
5ee0844d 2952 if (ret)
d1310b2e 2953 uptodate = 0;
5ee0844d 2954 else
7870d082
JB
2955 clean_io_failure(BTRFS_I(inode)->root->fs_info,
2956 failure_tree, tree, start,
2957 page,
2958 btrfs_ino(BTRFS_I(inode)), 0);
d1310b2e 2959 }
ea466794 2960
f2a09da9
MX
2961 if (likely(uptodate))
2962 goto readpage_ok;
2963
be17b3af 2964 if (is_data_inode(inode)) {
9d0d1c8b 2965
f4a8e656 2966 /*
78e62c02
NB
2967 * The generic bio_readpage_error handles errors the
2968 * following way: If possible, new read requests are
2969 * created and submitted and will end up in
2970 * end_bio_extent_readpage as well (if we're lucky,
2971 * not in the !uptodate case). In that case it returns
2972 * 0 and we just go on with the next page in our bio.
2973 * If it can't handle the error it will return -EIO and
2974 * we remain responsible for that page.
f4a8e656 2975 */
7ffd27e3
QW
2976 if (!btrfs_submit_read_repair(inode, bio, bio_offset,
2977 page,
77d5d689
OS
2978 start - page_offset(page),
2979 start, end, mirror,
908930f3 2980 btrfs_submit_data_bio)) {
78e62c02 2981 uptodate = !bio->bi_status;
7ffd27e3
QW
2982 ASSERT(bio_offset + len > bio_offset);
2983 bio_offset += len;
78e62c02
NB
2984 continue;
2985 }
2986 } else {
2987 struct extent_buffer *eb;
2988
2989 eb = (struct extent_buffer *)page->private;
2990 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
2991 eb->read_mirror = mirror;
2992 atomic_dec(&eb->io_pages);
2993 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
2994 &eb->bflags))
2995 btree_readahead_hook(eb, -EIO);
7e38326f 2996 }
f2a09da9 2997readpage_ok:
883d0de4 2998 if (likely(uptodate)) {
a71754fc 2999 loff_t i_size = i_size_read(inode);
09cbfeaf 3000 pgoff_t end_index = i_size >> PAGE_SHIFT;
a583c026 3001 unsigned off;
a71754fc
JB
3002
3003 /* Zero out the end if this page straddles i_size */
7073017a 3004 off = offset_in_page(i_size);
a583c026 3005 if (page->index == end_index && off)
09cbfeaf 3006 zero_user_segment(page, off, PAGE_SIZE);
70dec807 3007 }
7ffd27e3
QW
3008 ASSERT(bio_offset + len > bio_offset);
3009 bio_offset += len;
883d0de4 3010
e09caaf9 3011 /* Update page status and unlock */
92082d40 3012 end_page_read(page, uptodate, start, len);
94e8c95c
QW
3013 endio_readpage_release_extent(&processed, BTRFS_I(inode),
3014 start, end, uptodate);
2c30c71b 3015 }
94e8c95c
QW
3016 /* Release the last extent */
3017 endio_readpage_release_extent(&processed, NULL, 0, 0, false);
b3a0dd50 3018 btrfs_io_bio_free_csum(io_bio);
d1310b2e 3019 bio_put(bio);
d1310b2e
CM
3020}
3021
9be3395b 3022/*
184f999e
DS
3023 * Initialize the members up to but not including 'bio'. Use after allocating a
3024 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
3025 * 'bio' because use of __GFP_ZERO is not supported.
9be3395b 3026 */
184f999e 3027static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
d1310b2e 3028{
184f999e
DS
3029 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
3030}
d1310b2e 3031
9be3395b 3032/*
6e707bcd
DS
3033 * The following helpers allocate a bio. As it's backed by a bioset, it'll
3034 * never fail. We're returning a bio right now but you can call btrfs_io_bio
3035 * for the appropriate container_of magic
9be3395b 3036 */
e749af44 3037struct bio *btrfs_bio_alloc(u64 first_byte)
d1310b2e
CM
3038{
3039 struct bio *bio;
d1310b2e 3040
8ac9f7c1 3041 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
c821e7f3 3042 bio->bi_iter.bi_sector = first_byte >> 9;
184f999e 3043 btrfs_io_bio_init(btrfs_io_bio(bio));
d1310b2e
CM
3044 return bio;
3045}
3046
8b6c1d56 3047struct bio *btrfs_bio_clone(struct bio *bio)
9be3395b 3048{
23ea8e5a
MX
3049 struct btrfs_io_bio *btrfs_bio;
3050 struct bio *new;
9be3395b 3051
6e707bcd 3052 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 3053 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
6e707bcd 3054 btrfs_bio = btrfs_io_bio(new);
184f999e 3055 btrfs_io_bio_init(btrfs_bio);
6e707bcd 3056 btrfs_bio->iter = bio->bi_iter;
23ea8e5a
MX
3057 return new;
3058}
9be3395b 3059
c5e4c3d7 3060struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
9be3395b 3061{
facc8a22
MX
3062 struct bio *bio;
3063
6e707bcd 3064 /* Bio allocation backed by a bioset does not fail */
8ac9f7c1 3065 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
184f999e 3066 btrfs_io_bio_init(btrfs_io_bio(bio));
facc8a22 3067 return bio;
9be3395b
CM
3068}
3069
e477094f 3070struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2f8e9140
LB
3071{
3072 struct bio *bio;
3073 struct btrfs_io_bio *btrfs_bio;
3074
3075 /* this will never fail when it's backed by a bioset */
8ac9f7c1 3076 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2f8e9140
LB
3077 ASSERT(bio);
3078
3079 btrfs_bio = btrfs_io_bio(bio);
184f999e 3080 btrfs_io_bio_init(btrfs_bio);
2f8e9140
LB
3081
3082 bio_trim(bio, offset >> 9, size >> 9);
17347cec 3083 btrfs_bio->iter = bio->bi_iter;
2f8e9140
LB
3084 return bio;
3085}
9be3395b 3086
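/*
 * Minimal sketch, not part of the original file: allocate a bio at a given
 * disk byte and reach its btrfs_io_bio wrapper via the container_of helper
 * mentioned above.  The helper name is an assumption for the example.
 */
static void example_alloc_and_wrap(u64 disk_bytenr)
{
	struct bio *bio = btrfs_bio_alloc(disk_bytenr);
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);

	/* Members outside the embedded bio start out zeroed */
	io_bio->mirror_num = 1;
	bio_put(bio);
}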
953651eb
NA
3087/**
3088 * Attempt to add a page to bio
3089 *
3090 * @bio: destination bio
3091 * @page: page to add to the bio
3092 * @disk_bytenr: offset of the new bio or to check whether we are adding
3093 * a contiguous page to the previous one
3094 * @pg_offset: starting offset in the page
3095 * @size: portion of page that we want to write
3096 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
3097 * @bio_flags: flags of the current bio to see if we can merge them
3098 * @return: true if page was added, false otherwise
3099 *
3100 * Attempt to add a page to bio considering stripe alignment etc.
3101 *
 3102 * Return true if the page was successfully added. Otherwise, return false.
3103 */
3104static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
3105 u64 disk_bytenr, unsigned int size,
3106 unsigned int pg_offset,
3107 unsigned long prev_bio_flags,
3108 unsigned long bio_flags)
3109{
3110 const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3111 bool contig;
e1326f03 3112 int ret;
953651eb
NA
3113
3114 if (prev_bio_flags != bio_flags)
3115 return false;
3116
3117 if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
3118 contig = bio->bi_iter.bi_sector == sector;
3119 else
3120 contig = bio_end_sector(bio) == sector;
3121 if (!contig)
3122 return false;
3123
3124 if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags))
3125 return false;
3126
cacb2cea
JT
3127 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3128 struct page *first_page = bio_first_bvec_all(bio)->bv_page;
3129
3130 if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size))
3131 return false;
e1326f03 3132 ret = bio_add_zone_append_page(bio, page, size, pg_offset);
cacb2cea 3133 } else {
e1326f03 3134 ret = bio_add_page(bio, page, size, pg_offset);
cacb2cea 3135 }
e1326f03
NA
3136
3137 return ret == size;
953651eb
NA
3138}
3139
4b81ba48
DS
3140/*
3141 * @opf: bio REQ_OP_* and REQ_* flags as one value
b8b3d625
DS
3142 * @wbc: optional writeback control for io accounting
3143 * @page: page to add to the bio
0c64c33c
QW
3144 * @disk_bytenr: logical bytenr where the write will be
3145 * @size: portion of page that we want to write to
b8b3d625
DS
 3146 * @pg_offset: starting offset within @page of the range we want to add
 3147 * to the bio
5c2b1fd7 3148 * @bio_ret: must be valid pointer, newly allocated bio will be stored there
b8b3d625
DS
3149 * @end_io_func: end_io callback for new bio
3150 * @mirror_num: desired mirror to read/write
3151 * @prev_bio_flags: flags of previous bio to see if we can merge the current one
3152 * @bio_flags: flags of the current bio to see if we can merge them
4b81ba48 3153 */
0ceb34bf 3154static int submit_extent_page(unsigned int opf,
da2f0f74 3155 struct writeback_control *wbc,
0c64c33c 3156 struct page *page, u64 disk_bytenr,
6c5a4e2c 3157 size_t size, unsigned long pg_offset,
d1310b2e 3158 struct bio **bio_ret,
f188591e 3159 bio_end_io_t end_io_func,
c8b97818
CM
3160 int mirror_num,
3161 unsigned long prev_bio_flags,
005efedf
FM
3162 unsigned long bio_flags,
3163 bool force_bio_submit)
d1310b2e
CM
3164{
3165 int ret = 0;
3166 struct bio *bio;
e940e9a7 3167 size_t io_size = min_t(size_t, size, PAGE_SIZE);
e1326f03
NA
3168 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3169 struct extent_io_tree *tree = &inode->io_tree;
3170 struct btrfs_fs_info *fs_info = inode->root->fs_info;
d1310b2e 3171
5c2b1fd7
DS
3172 ASSERT(bio_ret);
3173
3174 if (*bio_ret) {
d1310b2e 3175 bio = *bio_ret;
953651eb
NA
3176 if (force_bio_submit ||
3177 !btrfs_bio_add_page(bio, page, disk_bytenr, io_size,
3178 pg_offset, prev_bio_flags, bio_flags)) {
1f7ad75b 3179 ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
289454ad
NA
3180 if (ret < 0) {
3181 *bio_ret = NULL;
79787eaa 3182 return ret;
289454ad 3183 }
d1310b2e
CM
3184 bio = NULL;
3185 } else {
da2f0f74 3186 if (wbc)
e940e9a7 3187 wbc_account_cgroup_owner(wbc, page, io_size);
d1310b2e
CM
3188 return 0;
3189 }
3190 }
c8b97818 3191
0c64c33c 3192 bio = btrfs_bio_alloc(disk_bytenr);
e940e9a7 3193 bio_add_page(bio, page, io_size, pg_offset);
d1310b2e
CM
3194 bio->bi_end_io = end_io_func;
3195 bio->bi_private = tree;
e6959b93 3196 bio->bi_write_hint = page->mapping->host->i_write_hint;
4b81ba48 3197 bio->bi_opf = opf;
da2f0f74 3198 if (wbc) {
429aebc0
DS
3199 struct block_device *bdev;
3200
e1326f03 3201 bdev = fs_info->fs_devices->latest_bdev;
429aebc0 3202 bio_set_dev(bio, bdev);
da2f0f74 3203 wbc_init_bio(wbc, bio);
e940e9a7 3204 wbc_account_cgroup_owner(wbc, page, io_size);
da2f0f74 3205 }
e1326f03
NA
3206 if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
3207 struct extent_map *em;
3208 struct map_lookup *map;
3209
3210 em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size);
3211 if (IS_ERR(em))
3212 return PTR_ERR(em);
3213
3214 map = em->map_lookup;
3215 /* We only support single profile for now */
3216 ASSERT(map->num_stripes == 1);
3217 btrfs_io_bio(bio)->device = map->stripes[0].dev;
3218
3219 free_extent_map(em);
3220 }
70dec807 3221
5c2b1fd7 3222 *bio_ret = bio;
d1310b2e
CM
3223
3224 return ret;
3225}
3226
760f991f
QW
3227static int attach_extent_buffer_page(struct extent_buffer *eb,
3228 struct page *page,
3229 struct btrfs_subpage *prealloc)
d1310b2e 3230{
760f991f
QW
3231 struct btrfs_fs_info *fs_info = eb->fs_info;
3232 int ret = 0;
3233
0d01e247
QW
3234 /*
 3235 * If the page is mapped to the btree inode, we should hold the private
 3236 * lock to prevent races.
3237 * For cloned or dummy extent buffers, their pages are not mapped and
3238 * will not race with any other ebs.
3239 */
3240 if (page->mapping)
3241 lockdep_assert_held(&page->mapping->private_lock);
3242
760f991f
QW
3243 if (fs_info->sectorsize == PAGE_SIZE) {
3244 if (!PagePrivate(page))
3245 attach_page_private(page, eb);
3246 else
3247 WARN_ON(page->private != (unsigned long)eb);
3248 return 0;
3249 }
3250
3251 /* Already mapped, just free prealloc */
3252 if (PagePrivate(page)) {
3253 btrfs_free_subpage(prealloc);
3254 return 0;
3255 }
3256
3257 if (prealloc)
3258 /* Has preallocated memory for subpage */
3259 attach_page_private(page, prealloc);
d1b89bc0 3260 else
760f991f
QW
3261 /* Do new allocation to attach subpage */
3262 ret = btrfs_attach_subpage(fs_info, page,
3263 BTRFS_SUBPAGE_METADATA);
3264 return ret;
d1310b2e
CM
3265}
3266
32443de3 3267int set_page_extent_mapped(struct page *page)
d1310b2e 3268{
32443de3
QW
3269 struct btrfs_fs_info *fs_info;
3270
3271 ASSERT(page->mapping);
3272
3273 if (PagePrivate(page))
3274 return 0;
3275
3276 fs_info = btrfs_sb(page->mapping->host->i_sb);
3277
3278 if (fs_info->sectorsize < PAGE_SIZE)
3279 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3280
3281 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3282 return 0;
3283}
3284
3285void clear_page_extent_mapped(struct page *page)
3286{
3287 struct btrfs_fs_info *fs_info;
3288
3289 ASSERT(page->mapping);
3290
d1b89bc0 3291 if (!PagePrivate(page))
32443de3
QW
3292 return;
3293
3294 fs_info = btrfs_sb(page->mapping->host->i_sb);
3295 if (fs_info->sectorsize < PAGE_SIZE)
3296 return btrfs_detach_subpage(fs_info, page);
3297
3298 detach_page_private(page);
d1310b2e
CM
3299}
3300
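/*
 * Minimal sketch, not part of the original file, of the expected pairing:
 * attach the page's private state before doing IO on it, detach it when
 * the page is released.  The helper name is an assumption for the example.
 */
static int example_prepare_page(struct page *page)
{
	int ret;

	ret = set_page_extent_mapped(page);
	if (ret < 0)
		return ret;
	/* ... read or write the page ... */
	clear_page_extent_mapped(page);
	return 0;
}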
125bac01
MX
3301static struct extent_map *
3302__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
1a5ee1e6 3303 u64 start, u64 len, struct extent_map **em_cached)
125bac01
MX
3304{
3305 struct extent_map *em;
3306
3307 if (em_cached && *em_cached) {
3308 em = *em_cached;
cbc0e928 3309 if (extent_map_in_tree(em) && start >= em->start &&
125bac01 3310 start < extent_map_end(em)) {
490b54d6 3311 refcount_inc(&em->refs);
125bac01
MX
3312 return em;
3313 }
3314
3315 free_extent_map(em);
3316 *em_cached = NULL;
3317 }
3318
1a5ee1e6 3319 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
125bac01
MX
3320 if (em_cached && !IS_ERR_OR_NULL(em)) {
3321 BUG_ON(*em_cached);
490b54d6 3322 refcount_inc(&em->refs);
125bac01
MX
3323 *em_cached = em;
3324 }
3325 return em;
3326}
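/*
 * Illustrative loop, not from the original file, showing how the
 * *em_cached parameter lets consecutive lookups inside one extent reuse a
 * single extent_map reference instead of re-searching the tree.  The
 * helper name is an assumption for the example.
 */
static void example_cached_lookups(struct inode *inode, struct page *page,
				   u64 start, u64 len)
{
	struct extent_map *em_cached = NULL;
	struct extent_map *em;
	u64 cur;

	for (cur = start; cur < start + len; cur += PAGE_SIZE) {
		em = __get_extent_map(inode, page, 0, cur, PAGE_SIZE,
				      &em_cached);
		if (IS_ERR_OR_NULL(em))
			break;
		free_extent_map(em);
	}
	if (em_cached)
		free_extent_map(em_cached);
}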
d1310b2e
CM
3327/*
3328 * basic readpage implementation. Locked extent state structs are inserted
 3329 * into the tree and are removed when the IO is done (by the end_io
3330 * handlers)
79787eaa 3331 * XXX JDM: This needs looking at to ensure proper page locking
baf863b9 3332 * return 0 on success, otherwise return error
d1310b2e 3333 */
0f208812
NB
3334int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3335 struct bio **bio, unsigned long *bio_flags,
3336 unsigned int read_flags, u64 *prev_em_start)
d1310b2e
CM
3337{
3338 struct inode *inode = page->mapping->host;
92082d40 3339 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4eee4fa4 3340 u64 start = page_offset(page);
8eec8296 3341 const u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3342 u64 cur = start;
3343 u64 extent_offset;
3344 u64 last_byte = i_size_read(inode);
3345 u64 block_start;
3346 u64 cur_end;
d1310b2e 3347 struct extent_map *em;
baf863b9 3348 int ret = 0;
d1310b2e 3349 int nr = 0;
306e16ce 3350 size_t pg_offset = 0;
d1310b2e
CM
3351 size_t iosize;
3352 size_t blocksize = inode->i_sb->s_blocksize;
7f042a83 3353 unsigned long this_bio_flag = 0;
f657a31c 3354 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ae6957eb 3355
32443de3
QW
3356 ret = set_page_extent_mapped(page);
3357 if (ret < 0) {
3358 unlock_extent(tree, start, end);
92082d40
QW
3359 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3360 unlock_page(page);
32443de3
QW
3361 goto out;
3362 }
d1310b2e 3363
90a887c9
DM
3364 if (!PageUptodate(page)) {
3365 if (cleancache_get_page(page) == 0) {
3366 BUG_ON(blocksize != PAGE_SIZE);
9974090b 3367 unlock_extent(tree, start, end);
92082d40 3368 unlock_page(page);
90a887c9
DM
3369 goto out;
3370 }
3371 }
3372
09cbfeaf 3373 if (page->index == last_byte >> PAGE_SHIFT) {
c8b97818 3374 char *userpage;
7073017a 3375 size_t zero_offset = offset_in_page(last_byte);
c8b97818
CM
3376
3377 if (zero_offset) {
09cbfeaf 3378 iosize = PAGE_SIZE - zero_offset;
7ac687d9 3379 userpage = kmap_atomic(page);
c8b97818
CM
3380 memset(userpage + zero_offset, 0, iosize);
3381 flush_dcache_page(page);
7ac687d9 3382 kunmap_atomic(userpage);
c8b97818
CM
3383 }
3384 }
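/*
 * Editor's worked example (assumed numbers): with PAGE_SIZE = 4096 and
 * i_size = 0x1100, the page at index 1 covers [0x1000, 0x2000) and
 * zero_offset = offset_in_page(0x1100) = 0x100, so the file bytes in
 * [0x1100, 0x2000) are zeroed above before any read is issued, keeping
 * the tail beyond i_size from exposing stale page contents.
 */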
92082d40 3385 begin_page_read(fs_info, page);
d1310b2e 3386 while (cur <= end) {
005efedf 3387 bool force_bio_submit = false;
0c64c33c 3388 u64 disk_bytenr;
c8f2f24b 3389
d1310b2e
CM
3390 if (cur >= last_byte) {
3391 char *userpage;
507903b8
AJ
3392 struct extent_state *cached = NULL;
3393
09cbfeaf 3394 iosize = PAGE_SIZE - pg_offset;
7ac687d9 3395 userpage = kmap_atomic(page);
306e16ce 3396 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3397 flush_dcache_page(page);
7ac687d9 3398 kunmap_atomic(userpage);
d1310b2e 3399 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3400 &cached, GFP_NOFS);
7f042a83 3401 unlock_extent_cached(tree, cur,
e43bbe5e 3402 cur + iosize - 1, &cached);
92082d40 3403 end_page_read(page, true, cur, iosize);
d1310b2e
CM
3404 break;
3405 }
125bac01 3406 em = __get_extent_map(inode, page, pg_offset, cur,
1a5ee1e6 3407 end - cur + 1, em_cached);
c704005d 3408 if (IS_ERR_OR_NULL(em)) {
7f042a83 3409 unlock_extent(tree, cur, end);
92082d40 3410 end_page_read(page, false, cur, end + 1 - cur);
d1310b2e
CM
3411 break;
3412 }
d1310b2e
CM
3413 extent_offset = cur - em->start;
3414 BUG_ON(extent_map_end(em) <= cur);
3415 BUG_ON(end < cur);
3416
261507a0 3417 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4b384318 3418 this_bio_flag |= EXTENT_BIO_COMPRESSED;
261507a0
LZ
3419 extent_set_compress_type(&this_bio_flag,
3420 em->compress_type);
3421 }
c8b97818 3422
d1310b2e
CM
3423 iosize = min(extent_map_end(em) - cur, end - cur + 1);
3424 cur_end = min(extent_map_end(em) - 1, end);
fda2832f 3425 iosize = ALIGN(iosize, blocksize);
949b3273 3426 if (this_bio_flag & EXTENT_BIO_COMPRESSED)
0c64c33c 3427 disk_bytenr = em->block_start;
949b3273 3428 else
0c64c33c 3429 disk_bytenr = em->block_start + extent_offset;
d1310b2e 3430 block_start = em->block_start;
d899e052
YZ
3431 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3432 block_start = EXTENT_MAP_HOLE;
005efedf
FM
3433
3434 /*
3435 * If we have a file range that points to a compressed extent
260db43c 3436 * and it's followed by a consecutive file range that points
005efedf
FM
3437 * to the same compressed extent (possibly with a different
3438 * offset and/or length, so it either points to the whole extent
3439 * or only part of it), we must make sure we do not submit a
3440 * single bio to populate the pages for the 2 ranges because
3441 * this makes the compressed extent read zero out the pages
3442 * belonging to the 2nd range. Imagine the following scenario:
3443 *
3444 * File layout
3445 * [0 - 8K] [8K - 24K]
3446 * | |
3447 * | |
3448 * points to extent X, points to extent X,
3449 * offset 4K, length of 8K offset 0, length 16K
3450 *
3451 * [extent X, compressed length = 4K uncompressed length = 16K]
3452 *
3453 * If the bio to read the compressed extent covers both ranges,
3454 * it will decompress extent X into the pages belonging to the
3455 * first range and then it will stop, zeroing out the remaining
3456 * pages that belong to the other range that points to extent X.
3457 * So here we make sure we submit 2 bios, one for the first
3458 * range and another one for the second range. Both will target
3459 * the same physical extent from disk, but we can't currently
3460 * make the compressed bio endio callback populate the pages
3461 * for both ranges because each compressed bio is tightly
3462 * coupled with a single extent map, and each range can have
3463 * an extent map with a different offset value relative to the
3464 * uncompressed data of our extent and different lengths. This
3465 * is a corner case so we prioritize correctness over
3466 * non-optimal behavior (submitting 2 bios for the same extent).
3467 */
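/*
 * Editor's note tying this to the check below (illustrative): for the
 * layout above, the range [0, 8K) has an extent map with em->start == 0
 * and the range [8K, 24K) has one with em->start == 8K, both compressed.
 * Once the first range has set *prev_em_start = 0, the second range sees
 * a different em->start and therefore sets force_bio_submit, producing
 * the two separate bios described above.
 */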
3468 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3469 prev_em_start && *prev_em_start != (u64)-1 &&
8e928218 3470 *prev_em_start != em->start)
005efedf
FM
3471 force_bio_submit = true;
3472
3473 if (prev_em_start)
8e928218 3474 *prev_em_start = em->start;
005efedf 3475
d1310b2e
CM
3476 free_extent_map(em);
3477 em = NULL;
3478
3479 /* we've found a hole, just zero and go on */
3480 if (block_start == EXTENT_MAP_HOLE) {
3481 char *userpage;
507903b8
AJ
3482 struct extent_state *cached = NULL;
3483
7ac687d9 3484 userpage = kmap_atomic(page);
306e16ce 3485 memset(userpage + pg_offset, 0, iosize);
d1310b2e 3486 flush_dcache_page(page);
7ac687d9 3487 kunmap_atomic(userpage);
d1310b2e
CM
3488
3489 set_extent_uptodate(tree, cur, cur + iosize - 1,
507903b8 3490 &cached, GFP_NOFS);
7f042a83 3491 unlock_extent_cached(tree, cur,
e43bbe5e 3492 cur + iosize - 1, &cached);
92082d40 3493 end_page_read(page, true, cur, iosize);
d1310b2e 3494 cur = cur + iosize;
306e16ce 3495 pg_offset += iosize;
d1310b2e
CM
3496 continue;
3497 }
3498 /* the get_extent function already copied into the page */
9655d298
CM
3499 if (test_range_bit(tree, cur, cur_end,
3500 EXTENT_UPTODATE, 1, NULL)) {
a1b32a59 3501 check_page_uptodate(tree, page);
7f042a83 3502 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3503 end_page_read(page, true, cur, iosize);
d1310b2e 3504 cur = cur + iosize;
306e16ce 3505 pg_offset += iosize;
d1310b2e
CM
3506 continue;
3507 }
70dec807
CM
3508 /* we have an inline extent but it didn't get marked
3509 * up-to-date. Error out
3510 */
3511 if (block_start == EXTENT_MAP_INLINE) {
7f042a83 3512 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3513 end_page_read(page, false, cur, iosize);
70dec807 3514 cur = cur + iosize;
306e16ce 3515 pg_offset += iosize;
70dec807
CM
3516 continue;
3517 }
d1310b2e 3518
0ceb34bf 3519 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
0c64c33c 3520 page, disk_bytenr, iosize,
fa17ed06 3521 pg_offset, bio,
fd513000 3522 end_bio_extent_readpage, 0,
c8b97818 3523 *bio_flags,
005efedf
FM
3524 this_bio_flag,
3525 force_bio_submit);
c8f2f24b
JB
3526 if (!ret) {
3527 nr++;
3528 *bio_flags = this_bio_flag;
3529 } else {
7f042a83 3530 unlock_extent(tree, cur, cur + iosize - 1);
92082d40 3531 end_page_read(page, false, cur, iosize);
baf863b9 3532 goto out;
edd33c99 3533 }
d1310b2e 3534 cur = cur + iosize;
306e16ce 3535 pg_offset += iosize;
d1310b2e 3536 }
90a887c9 3537out:
baf863b9 3538 return ret;
d1310b2e
CM
3539}
3540
b6660e80 3541static inline void contiguous_readpages(struct page *pages[], int nr_pages,
9974090b 3542 u64 start, u64 end,
125bac01 3543 struct extent_map **em_cached,
d3fac6ba 3544 struct bio **bio,
1f7ad75b 3545 unsigned long *bio_flags,
808f80b4 3546 u64 *prev_em_start)
9974090b 3547{
23d31bd4 3548 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
9974090b
MX
3549 int index;
3550
b272ae22 3551 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
9974090b
MX
3552
3553 for (index = 0; index < nr_pages; index++) {
0f208812
NB
3554 btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
3555 REQ_RAHEAD, prev_em_start);
09cbfeaf 3556 put_page(pages[index]);
9974090b
MX
3557 }
3558}
3559
3d4b9496 3560static void update_nr_written(struct writeback_control *wbc,
a9132667 3561 unsigned long nr_written)
11c8349b
CM
3562{
3563 wbc->nr_to_write -= nr_written;
11c8349b
CM
3564}
3565
d1310b2e 3566/*
40f76580
CM
3567 * helper for __extent_writepage, doing all of the delayed allocation setup.
3568 *
5eaad97a 3569 * This returns 1 if the btrfs_run_delalloc_range function did all the work required
40f76580
CM
3570 * to write the page (copy into inline extent). In this case the IO has
3571 * been started and the page is already unlocked.
3572 *
3573 * This returns 0 if all went well (page still locked)
3574 * This returns < 0 if there were errors (page still locked)
d1310b2e 3575 */
cd4c0bf9 3576static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
8cc0237a
NB
3577 struct page *page, struct writeback_control *wbc,
3578 u64 delalloc_start, unsigned long *nr_written)
40f76580 3579{
09cbfeaf 3580 u64 page_end = delalloc_start + PAGE_SIZE - 1;
3522e903 3581 bool found;
40f76580
CM
3582 u64 delalloc_to_write = 0;
3583 u64 delalloc_end = 0;
3584 int ret;
3585 int page_started = 0;
3586
40f76580
CM
3587
3588 while (delalloc_end < page_end) {
cd4c0bf9 3589 found = find_lock_delalloc_range(&inode->vfs_inode, page,
40f76580 3590 &delalloc_start,
917aacec 3591 &delalloc_end);
3522e903 3592 if (!found) {
40f76580
CM
3593 delalloc_start = delalloc_end + 1;
3594 continue;
3595 }
cd4c0bf9 3596 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
5eaad97a 3597 delalloc_end, &page_started, nr_written, wbc);
40f76580
CM
3598 if (ret) {
3599 SetPageError(page);
5eaad97a
NB
3600 /*
3601 * btrfs_run_delalloc_range should return < 0 for error
3602 * but just in case, we use > 0 here meaning the IO is
3603 * started, so we don't want to return > 0 unless
3604 * things are going well.
40f76580 3605 */
b69d1ee9 3606 return ret < 0 ? ret : -EIO;
40f76580
CM
3607 }
3608 /*
ea1754a0
KS
3609 * delalloc_end is already one less than the total length, so
3610 * we don't subtract one from PAGE_SIZE
40f76580
CM
3611 */
3612 delalloc_to_write += (delalloc_end - delalloc_start +
ea1754a0 3613 PAGE_SIZE) >> PAGE_SHIFT;
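/*
 * Editor's worked example (assumed 4K pages): for a delalloc range
 * covering two pages, delalloc_start = 0 and delalloc_end = 8191, so
 * (8191 - 0 + 4096) >> PAGE_SHIFT = 12287 >> 12 = 2 pages are added to
 * delalloc_to_write, matching the inclusive-end convention noted above.
 */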
40f76580
CM
3614 delalloc_start = delalloc_end + 1;
3615 }
3616 if (wbc->nr_to_write < delalloc_to_write) {
3617 int thresh = 8192;
3618
3619 if (delalloc_to_write < thresh * 2)
3620 thresh = delalloc_to_write;
3621 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3622 thresh);
3623 }
3624
3625 /* did the fill delalloc function already unlock and start
3626 * the IO?
3627 */
3628 if (page_started) {
3629 /*
3630 * we've unlocked the page, so we can't update
3631 * the mapping's writeback index, just update
3632 * nr_to_write.
3633 */
3634 wbc->nr_to_write -= *nr_written;
3635 return 1;
3636 }
3637
b69d1ee9 3638 return 0;
40f76580
CM
3639}
3640
3641/*
3642 * helper for __extent_writepage. This calls the writepage start hooks,
3643 * and does the loop to map the page into extents and bios.
3644 *
3645 * We return 1 if the IO is started and the page is unlocked,
3646 * 0 if all went well (page still locked)
3647 * < 0 if there were errors (page still locked)
3648 */
d4580fe2 3649static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
40f76580
CM
3650 struct page *page,
3651 struct writeback_control *wbc,
3652 struct extent_page_data *epd,
3653 loff_t i_size,
3654 unsigned long nr_written,
57e5ffeb 3655 int *nr_ret)
d1310b2e 3656{
6bc5636a 3657 struct btrfs_fs_info *fs_info = inode->root->fs_info;
d4580fe2 3658 struct extent_io_tree *tree = &inode->io_tree;
4eee4fa4 3659 u64 start = page_offset(page);
6bc5636a 3660 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
3661 u64 cur = start;
3662 u64 extent_offset;
d1310b2e 3663 u64 block_start;
d1310b2e 3664 struct extent_map *em;
40f76580
CM
3665 int ret = 0;
3666 int nr = 0;
57e5ffeb 3667 const unsigned int write_flags = wbc_to_write_flags(wbc);
40f76580 3668 bool compressed;
c8b97818 3669
6bc5636a 3670 ret = btrfs_writepage_cow_fixup(page, start, end);
d75855b4
NB
3671 if (ret) {
3672 /* Fixup worker will requeue */
5ab58055 3673 redirty_page_for_writepage(wbc, page);
d75855b4
NB
3674 update_nr_written(wbc, nr_written);
3675 unlock_page(page);
3676 return 1;
247e743c
CM
3677 }
3678
11c8349b
CM
3679 /*
3680 * we don't want to touch the inode after unlocking the page,
3681 * so we update the mapping writeback index now
3682 */
3d4b9496 3683 update_nr_written(wbc, nr_written + 1);
771ed689 3684
d1310b2e 3685 while (cur <= end) {
0c64c33c 3686 u64 disk_bytenr;
40f76580 3687 u64 em_end;
6bc5636a 3688 u32 iosize;
58409edd 3689
40f76580 3690 if (cur >= i_size) {
6bc5636a 3691 btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
d1310b2e
CM
3692 break;
3693 }
d4580fe2 3694 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
c704005d 3695 if (IS_ERR_OR_NULL(em)) {
d1310b2e 3696 SetPageError(page);
61391d56 3697 ret = PTR_ERR_OR_ZERO(em);
d1310b2e
CM
3698 break;
3699 }
3700
3701 extent_offset = cur - em->start;
40f76580 3702 em_end = extent_map_end(em);
6bc5636a
QW
3703 ASSERT(cur <= em_end);
3704 ASSERT(cur < end);
3705 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
3706 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
d1310b2e 3707 block_start = em->block_start;
c8b97818 3708 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6bc5636a
QW
3709 disk_bytenr = em->block_start + extent_offset;
3710
3711 /* Note that em_end from extent_map_end() is exclusive */
3712 iosize = min(em_end, end + 1) - cur;
d1310b2e
CM
3713 free_extent_map(em);
3714 em = NULL;
3715
c8b97818
CM
3716 /*
3717 * compressed and inline extents are written through other
3718 * paths in the FS
3719 */
3720 if (compressed || block_start == EXTENT_MAP_HOLE ||
d1310b2e 3721 block_start == EXTENT_MAP_INLINE) {
c8b04030 3722 if (compressed)
c8b97818 3723 nr++;
c8b04030
OS
3724 else
3725 btrfs_writepage_endio_finish_ordered(page, cur,
3726 cur + iosize - 1, 1);
c8b97818 3727 cur += iosize;
d1310b2e
CM
3728 continue;
3729 }
c8b97818 3730
5cdc84bf 3731 btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
58409edd 3732 if (!PageWriteback(page)) {
d4580fe2 3733 btrfs_err(inode->root->fs_info,
58409edd
DS
3734 "page %lu not writeback, cur %llu end %llu",
3735 page->index, cur, end);
d1310b2e 3736 }
7f3c74fb 3737
0ceb34bf 3738 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
6bc5636a
QW
3739 page, disk_bytenr, iosize,
3740 cur - page_offset(page), &epd->bio,
58409edd
DS
3741 end_bio_extent_writepage,
3742 0, 0, 0, false);
fe01aa65 3743 if (ret) {
58409edd 3744 SetPageError(page);
fe01aa65
TK
3745 if (PageWriteback(page))
3746 end_page_writeback(page);
3747 }
d1310b2e 3748
6bc5636a 3749 cur += iosize;
d1310b2e
CM
3750 nr++;
3751 }
40f76580 3752 *nr_ret = nr;
40f76580
CM
3753 return ret;
3754}
3755
3756/*
3757 * the writepage semantics are similar to regular writepage. extent
3758 * records are inserted to lock ranges in the tree, and as dirty areas
3759 * are found, they are marked writeback. Then the lock bits are removed
3760 * and the end_io handler clears the writeback ranges
3065976b
QW
3761 *
3762 * Return 0 if everything goes well.
3763 * Return <0 for error.
40f76580
CM
3764 */
3765static int __extent_writepage(struct page *page, struct writeback_control *wbc,
aab6e9ed 3766 struct extent_page_data *epd)
40f76580
CM
3767{
3768 struct inode *inode = page->mapping->host;
40f76580 3769 u64 start = page_offset(page);
09cbfeaf 3770 u64 page_end = start + PAGE_SIZE - 1;
40f76580
CM
3771 int ret;
3772 int nr = 0;
eb70d222 3773 size_t pg_offset;
40f76580 3774 loff_t i_size = i_size_read(inode);
09cbfeaf 3775 unsigned long end_index = i_size >> PAGE_SHIFT;
40f76580
CM
3776 unsigned long nr_written = 0;
3777
40f76580
CM
3778 trace___extent_writepage(page, inode, wbc);
3779
3780 WARN_ON(!PageLocked(page));
3781
3782 ClearPageError(page);
3783
7073017a 3784 pg_offset = offset_in_page(i_size);
40f76580
CM
3785 if (page->index > end_index ||
3786 (page->index == end_index && !pg_offset)) {
09cbfeaf 3787 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
40f76580
CM
3788 unlock_page(page);
3789 return 0;
3790 }
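/*
 * Editor's worked example (assumed 4K pages): with i_size = 8192,
 * end_index = 2 and pg_offset = 0, so a page at index 2 (file range
 * [8192, 12288)) lies entirely beyond i_size and is invalidated and
 * unlocked above instead of being written.
 */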
3791
3792 if (page->index == end_index) {
3793 char *userpage;
3794
3795 userpage = kmap_atomic(page);
3796 memset(userpage + pg_offset, 0,
09cbfeaf 3797 PAGE_SIZE - pg_offset);
40f76580
CM
3798 kunmap_atomic(userpage);
3799 flush_dcache_page(page);
3800 }
3801
32443de3
QW
3802 ret = set_page_extent_mapped(page);
3803 if (ret < 0) {
3804 SetPageError(page);
3805 goto done;
3806 }
40f76580 3807
7789a55a 3808 if (!epd->extent_locked) {
cd4c0bf9
NB
3809 ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
3810 &nr_written);
7789a55a 3811 if (ret == 1)
169d2c87 3812 return 0;
7789a55a
NB
3813 if (ret)
3814 goto done;
3815 }
40f76580 3816
d4580fe2
NB
3817 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
3818 nr_written, &nr);
40f76580 3819 if (ret == 1)
169d2c87 3820 return 0;
40f76580 3821
d1310b2e
CM
3822done:
3823 if (nr == 0) {
3824 /* make sure the mapping tag for page dirty gets cleared */
3825 set_page_writeback(page);
3826 end_page_writeback(page);
3827 }
61391d56
FM
3828 if (PageError(page)) {
3829 ret = ret < 0 ? ret : -EIO;
3830 end_extent_writepage(page, ret, start, page_end);
3831 }
d1310b2e 3832 unlock_page(page);
3065976b 3833 ASSERT(ret <= 0);
40f76580 3834 return ret;
d1310b2e
CM
3835}
3836
fd8b2b61 3837void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
0b32f4bb 3838{
74316201
N
3839 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3840 TASK_UNINTERRUPTIBLE);
0b32f4bb
JB
3841}
3842
18dfa711
FM
3843static void end_extent_buffer_writeback(struct extent_buffer *eb)
3844{
3845 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3846 smp_mb__after_atomic();
3847 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3848}
3849
2e3c2513 3850/*
a3efb2f0 3851 * Lock extent buffer status and pages for writeback.
2e3c2513 3852 *
a3efb2f0
QW
3853 * May try to flush write bio if we can't get the lock.
3854 *
3855 * Return 0 if the extent buffer doesn't need to be submitted.
3856 * (E.g. the extent buffer is not dirty)
3857 * Return >0 if the extent buffer is submitted to bio.
3858 * Return <0 if something went wrong, no page is locked.
2e3c2513 3859 */
9df76fb5 3860static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
0e378df1 3861 struct extent_page_data *epd)
0b32f4bb 3862{
9df76fb5 3863 struct btrfs_fs_info *fs_info = eb->fs_info;
2e3c2513 3864 int i, num_pages, failed_page_nr;
0b32f4bb
JB
3865 int flush = 0;
3866 int ret = 0;
3867
3868 if (!btrfs_try_tree_write_lock(eb)) {
f4340622 3869 ret = flush_write_bio(epd);
2e3c2513
QW
3870 if (ret < 0)
3871 return ret;
3872 flush = 1;
0b32f4bb
JB
3873 btrfs_tree_lock(eb);
3874 }
3875
3876 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3877 btrfs_tree_unlock(eb);
3878 if (!epd->sync_io)
3879 return 0;
3880 if (!flush) {
f4340622 3881 ret = flush_write_bio(epd);
2e3c2513
QW
3882 if (ret < 0)
3883 return ret;
0b32f4bb
JB
3884 flush = 1;
3885 }
a098d8e8
CM
3886 while (1) {
3887 wait_on_extent_buffer_writeback(eb);
3888 btrfs_tree_lock(eb);
3889 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3890 break;
0b32f4bb 3891 btrfs_tree_unlock(eb);
0b32f4bb
JB
3892 }
3893 }
3894
51561ffe
JB
3895 /*
3896 * We need to do this to prevent races in people who check if the eb is
3897 * under IO since we can end up having no IO bits set for a short period
3898 * of time.
3899 */
3900 spin_lock(&eb->refs_lock);
0b32f4bb
JB
3901 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3902 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
51561ffe 3903 spin_unlock(&eb->refs_lock);
0b32f4bb 3904 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
104b4e51
NB
3905 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3906 -eb->len,
3907 fs_info->dirty_metadata_batch);
0b32f4bb 3908 ret = 1;
51561ffe
JB
3909 } else {
3910 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
3911 }
3912
3913 btrfs_tree_unlock(eb);
3914
3915 if (!ret)
3916 return ret;
3917
65ad0104 3918 num_pages = num_extent_pages(eb);
0b32f4bb 3919 for (i = 0; i < num_pages; i++) {
fb85fc9a 3920 struct page *p = eb->pages[i];
0b32f4bb
JB
3921
3922 if (!trylock_page(p)) {
3923 if (!flush) {
18dfa711
FM
3924 int err;
3925
3926 err = flush_write_bio(epd);
3927 if (err < 0) {
3928 ret = err;
2e3c2513
QW
3929 failed_page_nr = i;
3930 goto err_unlock;
3931 }
0b32f4bb
JB
3932 flush = 1;
3933 }
3934 lock_page(p);
3935 }
3936 }
3937
3938 return ret;
2e3c2513
QW
3939err_unlock:
3940 /* Unlock already locked pages */
3941 for (i = 0; i < failed_page_nr; i++)
3942 unlock_page(eb->pages[i]);
18dfa711
FM
3943 /*
3944 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
3945 * Also set EXTENT_BUFFER_DIRTY back so a future attempt to write this eb
3946 * can be made, and undo everything done before.
3947 */
3948 btrfs_tree_lock(eb);
3949 spin_lock(&eb->refs_lock);
3950 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3951 end_extent_buffer_writeback(eb);
3952 spin_unlock(&eb->refs_lock);
3953 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
3954 fs_info->dirty_metadata_batch);
3955 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3956 btrfs_tree_unlock(eb);
2e3c2513 3957 return ret;
0b32f4bb
JB
3958}
3959
656f30db
FM
3960static void set_btree_ioerr(struct page *page)
3961{
3962 struct extent_buffer *eb = (struct extent_buffer *)page->private;
eb5b64f1 3963 struct btrfs_fs_info *fs_info;
656f30db
FM
3964
3965 SetPageError(page);
3966 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3967 return;
3968
eb5b64f1
DZ
3969 /*
3970 * If we error out, we should add back the dirty_metadata_bytes
3971 * to make it consistent.
3972 */
3973 fs_info = eb->fs_info;
3974 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3975 eb->len, fs_info->dirty_metadata_batch);
3976
656f30db
FM
3977 /*
3978 * If writeback for a btree extent that doesn't belong to a log tree
3979 * failed, increment the counter transaction->eb_write_errors.
3980 * We do this because while the transaction is running and before it's
3981 * committing (when we call filemap_fdata[write|wait]_range against
3982 * the btree inode), we might have
3983 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3984 * returns an error or an error happens during writeback, when we're
3985 * committing the transaction we wouldn't know about it, since the pages
3986 * may no longer be dirty nor marked for writeback (if a
3987 * subsequent modification to the extent buffer didn't happen before the
3988 * transaction commit), which makes filemap_fdata[write|wait]_range not
3989 * able to find the pages tagged with SetPageError at transaction
3990 * commit time. So if this happens we must abort the transaction,
3991 * otherwise we commit a super block with btree roots that point to
3992 * btree nodes/leafs whose content on disk is invalid - either garbage
3993 * or the content of some node/leaf from a past generation that got
3994 * cowed or deleted and is no longer valid.
3995 *
3996 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3997 * not be enough - we need to distinguish between log tree extents vs
3998 * non-log tree extents, and the next filemap_fdatawait_range() call
3999 * will catch and clear such errors in the mapping - and that call might
4000 * be from a log sync and not from a transaction commit. Also, checking
4001 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4002 * not done and would not be reliable - the eb might have been released
4003 * from memory and reading it back again means that flag would not be
4004 * set (since it's a runtime flag, not persisted on disk).
4005 *
4006 * Using the flags below in the btree inode also makes us achieve the
4007 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4008 * writeback for all dirty pages and before filemap_fdatawait_range()
4009 * is called, the writeback for all dirty pages had already finished
4010 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4011 * filemap_fdatawait_range() would return success, as it could not know
4012 * that writeback errors happened (the pages were no longer tagged for
4013 * writeback).
4014 */
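/*
 * Editor's note (hedged): log_index appears to be -1 for tree blocks that
 * do not belong to a log tree, and 0 or 1 for blocks of one of the two
 * log contexts that can exist at a time, which is why three distinct
 * error bits are set below.
 */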
4015 switch (eb->log_index) {
4016 case -1:
afcdd129 4017 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
656f30db
FM
4018 break;
4019 case 0:
afcdd129 4020 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
656f30db
FM
4021 break;
4022 case 1:
afcdd129 4023 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
656f30db
FM
4024 break;
4025 default:
4026 BUG(); /* unexpected, logic error */
4027 }
4028}
4029
4246a0b6 4030static void end_bio_extent_buffer_writepage(struct bio *bio)
0b32f4bb 4031{
2c30c71b 4032 struct bio_vec *bvec;
0b32f4bb 4033 struct extent_buffer *eb;
2b070cfe 4034 int done;
6dc4f100 4035 struct bvec_iter_all iter_all;
0b32f4bb 4036
c09abff8 4037 ASSERT(!bio_flagged(bio, BIO_CLONED));
2b070cfe 4038 bio_for_each_segment_all(bvec, bio, iter_all) {
0b32f4bb
JB
4039 struct page *page = bvec->bv_page;
4040
0b32f4bb
JB
4041 eb = (struct extent_buffer *)page->private;
4042 BUG_ON(!eb);
4043 done = atomic_dec_and_test(&eb->io_pages);
4044
4e4cbee9 4045 if (bio->bi_status ||
4246a0b6 4046 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
0b32f4bb 4047 ClearPageUptodate(page);
656f30db 4048 set_btree_ioerr(page);
0b32f4bb
JB
4049 }
4050
4051 end_page_writeback(page);
4052
4053 if (!done)
4054 continue;
4055
4056 end_extent_buffer_writeback(eb);
2c30c71b 4057 }
0b32f4bb
JB
4058
4059 bio_put(bio);
0b32f4bb
JB
4060}
4061
0e378df1 4062static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0b32f4bb
JB
4063 struct writeback_control *wbc,
4064 struct extent_page_data *epd)
4065{
0c64c33c 4066 u64 disk_bytenr = eb->start;
851cd173 4067 u32 nritems;
cc5e31a4 4068 int i, num_pages;
851cd173 4069 unsigned long start, end;
ff40adf7 4070 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
d7dbe9e7 4071 int ret = 0;
0b32f4bb 4072
656f30db 4073 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
65ad0104 4074 num_pages = num_extent_pages(eb);
0b32f4bb 4075 atomic_set(&eb->io_pages, num_pages);
de0022b9 4076
851cd173
LB
4077 /* set btree blocks beyond nritems with 0 to avoid stale content. */
4078 nritems = btrfs_header_nritems(eb);
3eb548ee 4079 if (btrfs_header_level(eb) > 0) {
3eb548ee
LB
4080 end = btrfs_node_key_ptr_offset(nritems);
4081
b159fa28 4082 memzero_extent_buffer(eb, end, eb->len - end);
851cd173
LB
4083 } else {
4084 /*
4085 * leaf:
4086 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4087 */
4088 start = btrfs_item_nr_offset(nritems);
8f881e8c 4089 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
b159fa28 4090 memzero_extent_buffer(eb, start, end - start);
3eb548ee
LB
4091 }
4092
0b32f4bb 4093 for (i = 0; i < num_pages; i++) {
fb85fc9a 4094 struct page *p = eb->pages[i];
0b32f4bb
JB
4095
4096 clear_page_dirty_for_io(p);
4097 set_page_writeback(p);
0ceb34bf 4098 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
0c64c33c 4099 p, disk_bytenr, PAGE_SIZE, 0,
c2df8bb4 4100 &epd->bio,
1f7ad75b 4101 end_bio_extent_buffer_writepage,
18fdc679 4102 0, 0, 0, false);
0b32f4bb 4103 if (ret) {
656f30db 4104 set_btree_ioerr(p);
fe01aa65
TK
4105 if (PageWriteback(p))
4106 end_page_writeback(p);
0b32f4bb
JB
4107 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4108 end_extent_buffer_writeback(eb);
4109 ret = -EIO;
4110 break;
4111 }
0c64c33c 4112 disk_bytenr += PAGE_SIZE;
3d4b9496 4113 update_nr_written(wbc, 1);
0b32f4bb
JB
4114 unlock_page(p);
4115 }
4116
4117 if (unlikely(ret)) {
4118 for (; i < num_pages; i++) {
bbf65cf0 4119 struct page *p = eb->pages[i];
81465028 4120 clear_page_dirty_for_io(p);
0b32f4bb
JB
4121 unlock_page(p);
4122 }
4123 }
4124
4125 return ret;
4126}
4127
f91e0d0c
QW
4128/*
4129 * Submit all page(s) of one extent buffer.
4130 *
4131 * @page: the page of one extent buffer
4132 * @eb_context: to determine if we need to submit this page; if the current
4133 * page already belongs to this eb, we don't need to submit it
4134 *
4135 * The caller should pass each page in their bytenr order, and here we use
4136 * @eb_context to determine if we have submitted pages of one extent buffer.
4137 *
4138 * If we have, we just skip until we hit a new page that doesn't belong to
4139 * current @eb_context.
4140 *
4141 * If not, we submit all the page(s) of the extent buffer.
4142 *
4143 * Return >0 if we have submitted the extent buffer successfully.
4144 * Return 0 if we don't need to submit the page, as it's already submitted by
4145 * a previous call.
4146 * Return <0 for fatal error.
4147 */
4148static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4149 struct extent_page_data *epd,
4150 struct extent_buffer **eb_context)
4151{
4152 struct address_space *mapping = page->mapping;
4153 struct extent_buffer *eb;
4154 int ret;
4155
4156 if (!PagePrivate(page))
4157 return 0;
4158
4159 spin_lock(&mapping->private_lock);
4160 if (!PagePrivate(page)) {
4161 spin_unlock(&mapping->private_lock);
4162 return 0;
4163 }
4164
4165 eb = (struct extent_buffer *)page->private;
4166
4167 /*
4168 * Shouldn't happen and normally this would be a BUG_ON but no point
4169 * crashing the machine for something we can survive anyway.
4170 */
4171 if (WARN_ON(!eb)) {
4172 spin_unlock(&mapping->private_lock);
4173 return 0;
4174 }
4175
4176 if (eb == *eb_context) {
4177 spin_unlock(&mapping->private_lock);
4178 return 0;
4179 }
4180 ret = atomic_inc_not_zero(&eb->refs);
4181 spin_unlock(&mapping->private_lock);
4182 if (!ret)
4183 return 0;
4184
4185 *eb_context = eb;
4186
4187 ret = lock_extent_buffer_for_io(eb, epd);
4188 if (ret <= 0) {
4189 free_extent_buffer(eb);
4190 return ret;
4191 }
4192 ret = write_one_eb(eb, wbc, epd);
4193 free_extent_buffer(eb);
4194 if (ret < 0)
4195 return ret;
4196 return 1;
4197}
4198
0b32f4bb
JB
4199int btree_write_cache_pages(struct address_space *mapping,
4200 struct writeback_control *wbc)
4201{
f91e0d0c 4202 struct extent_buffer *eb_context = NULL;
0b32f4bb
JB
4203 struct extent_page_data epd = {
4204 .bio = NULL,
0b32f4bb
JB
4205 .extent_locked = 0,
4206 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4207 };
b3ff8f1d 4208 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
0b32f4bb
JB
4209 int ret = 0;
4210 int done = 0;
4211 int nr_to_write_done = 0;
4212 struct pagevec pvec;
4213 int nr_pages;
4214 pgoff_t index;
4215 pgoff_t end; /* Inclusive */
4216 int scanned = 0;
10bbd235 4217 xa_mark_t tag;
0b32f4bb 4218
86679820 4219 pagevec_init(&pvec);
0b32f4bb
JB
4220 if (wbc->range_cyclic) {
4221 index = mapping->writeback_index; /* Start from prev offset */
4222 end = -1;
556755a8
JB
4223 /*
4224 * Start from the beginning does not need to cycle over the
4225 * range, mark it as scanned.
4226 */
4227 scanned = (index == 0);
0b32f4bb 4228 } else {
09cbfeaf
KS
4229 index = wbc->range_start >> PAGE_SHIFT;
4230 end = wbc->range_end >> PAGE_SHIFT;
0b32f4bb
JB
4231 scanned = 1;
4232 }
4233 if (wbc->sync_mode == WB_SYNC_ALL)
4234 tag = PAGECACHE_TAG_TOWRITE;
4235 else
4236 tag = PAGECACHE_TAG_DIRTY;
4237retry:
4238 if (wbc->sync_mode == WB_SYNC_ALL)
4239 tag_pages_for_writeback(mapping, index, end);
4240 while (!done && !nr_to_write_done && (index <= end) &&
4006f437 4241 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
67fd707f 4242 tag))) {
0b32f4bb
JB
4243 unsigned i;
4244
0b32f4bb
JB
4245 for (i = 0; i < nr_pages; i++) {
4246 struct page *page = pvec.pages[i];
4247
f91e0d0c
QW
4248 ret = submit_eb_page(page, wbc, &epd, &eb_context);
4249 if (ret == 0)
0b32f4bb 4250 continue;
f91e0d0c 4251 if (ret < 0) {
0b32f4bb 4252 done = 1;
0b32f4bb
JB
4253 break;
4254 }
0b32f4bb
JB
4255
4256 /*
4257 * the filesystem may choose to bump up nr_to_write.
4258 * We have to make sure to honor the new nr_to_write
4259 * at any time
4260 */
4261 nr_to_write_done = wbc->nr_to_write <= 0;
4262 }
4263 pagevec_release(&pvec);
4264 cond_resched();
4265 }
4266 if (!scanned && !done) {
4267 /*
4268 * We hit the last page and there is more work to be done: wrap
4269 * back to the start of the file
4270 */
4271 scanned = 1;
4272 index = 0;
4273 goto retry;
4274 }
2b952eea
QW
4275 if (ret < 0) {
4276 end_write_bio(&epd, ret);
4277 return ret;
4278 }
b3ff8f1d
QW
4279 /*
4280 * If something went wrong, don't allow any metadata write bio to be
4281 * submitted.
4282 *
4283 * This would prevent use-after-free if we had dirty pages not
4284 * cleaned up, which can still happen by fuzzed images.
4285 *
4286 * - Bad extent tree
4287 * Allowing existing tree block to be allocated for other trees.
4288 *
4289 * - Log tree operations
4290 * Existing tree blocks get allocated to the log tree, which bumps their
4291 * generation; they then get cleaned in tree re-balance.
4292 * Such tree block will not be written back, since it's clean,
4293 * thus no WRITTEN flag set.
4294 * And after log writes back, this tree block is not traced by
4295 * any dirty extent_io_tree.
4296 *
4297 * - Offending tree block gets re-dirtied from its original owner
4298 * Since it has bumped generation, no WRITTEN flag, it can be
4299 * reused without COWing. This tree block will not be traced
4300 * by btrfs_transaction::dirty_pages.
4301 *
4302 * Now such dirty tree block will not be cleaned by any dirty
4303 * extent io tree. Thus we don't want to submit such wild eb
4304 * if the fs already has error.
4305 */
4306 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4307 ret = flush_write_bio(&epd);
4308 } else {
fbabd4a3 4309 ret = -EROFS;
b3ff8f1d
QW
4310 end_write_bio(&epd, ret);
4311 }
0b32f4bb
JB
4312 return ret;
4313}
4314
d1310b2e 4315/**
3bed2da1
NB
4316 * Walk the list of dirty pages of the given address space and write all of them.
4317 *
d1310b2e 4318 * @mapping: address space structure to write
3bed2da1
NB
4319 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4320 * @epd: holds context for the write, namely the bio
d1310b2e
CM
4321 *
4322 * If a page is already under I/O, write_cache_pages() skips it, even
4323 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4324 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4325 * and msync() need to guarantee that all the data which was dirty at the time
4326 * the call was made get new I/O started against them. If wbc->sync_mode is
4327 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4328 * existing IO to complete.
4329 */
4242b64a 4330static int extent_write_cache_pages(struct address_space *mapping,
4bef0848 4331 struct writeback_control *wbc,
aab6e9ed 4332 struct extent_page_data *epd)
d1310b2e 4333{
7fd1a3f7 4334 struct inode *inode = mapping->host;
d1310b2e
CM
4335 int ret = 0;
4336 int done = 0;
f85d7d6c 4337 int nr_to_write_done = 0;
d1310b2e
CM
4338 struct pagevec pvec;
4339 int nr_pages;
4340 pgoff_t index;
4341 pgoff_t end; /* Inclusive */
a9132667
LB
4342 pgoff_t done_index;
4343 int range_whole = 0;
d1310b2e 4344 int scanned = 0;
10bbd235 4345 xa_mark_t tag;
d1310b2e 4346
7fd1a3f7
JB
4347 /*
4348 * We have to hold onto the inode so that ordered extents can do their
4349 * work when the IO finishes. The alternative to this is failing to add
4350 * an ordered extent if the igrab() fails there and that is a huge pain
4351 * to deal with, so instead just hold onto the inode throughout the
4352 * writepages operation. If it fails here we are freeing up the inode
4353 * anyway and we'd rather not waste our time writing out stuff that is
4354 * going to be truncated anyway.
4355 */
4356 if (!igrab(inode))
4357 return 0;
4358
86679820 4359 pagevec_init(&pvec);
d1310b2e
CM
4360 if (wbc->range_cyclic) {
4361 index = mapping->writeback_index; /* Start from prev offset */
4362 end = -1;
556755a8
JB
4363 /*
4364 * Start from the beginning does not need to cycle over the
4365 * range, mark it as scanned.
4366 */
4367 scanned = (index == 0);
d1310b2e 4368 } else {
09cbfeaf
KS
4369 index = wbc->range_start >> PAGE_SHIFT;
4370 end = wbc->range_end >> PAGE_SHIFT;
a9132667
LB
4371 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4372 range_whole = 1;
d1310b2e
CM
4373 scanned = 1;
4374 }
3cd24c69
EL
4375
4376 /*
4377 * We do the tagged writepage as long as the snapshot flush bit is set
4378 * and we are the first one who do the filemap_flush() on this inode.
4379 *
4380 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4381 * not race in and drop the bit.
4382 */
4383 if (range_whole && wbc->nr_to_write == LONG_MAX &&
4384 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4385 &BTRFS_I(inode)->runtime_flags))
4386 wbc->tagged_writepages = 1;
4387
4388 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b
JB
4389 tag = PAGECACHE_TAG_TOWRITE;
4390 else
4391 tag = PAGECACHE_TAG_DIRTY;
d1310b2e 4392retry:
3cd24c69 4393 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
f7aaa06b 4394 tag_pages_for_writeback(mapping, index, end);
a9132667 4395 done_index = index;
f85d7d6c 4396 while (!done && !nr_to_write_done && (index <= end) &&
67fd707f
JK
4397 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4398 &index, end, tag))) {
d1310b2e
CM
4399 unsigned i;
4400
d1310b2e
CM
4401 for (i = 0; i < nr_pages; i++) {
4402 struct page *page = pvec.pages[i];
4403
f7bddf1e 4404 done_index = page->index + 1;
d1310b2e 4405 /*
b93b0163
MW
4406 * At this point we hold neither the i_pages lock nor
4407 * the page lock: the page may be truncated or
4408 * invalidated (changing page->mapping to NULL),
4409 * or even swizzled back from swapper_space to
4410 * tmpfs file mapping
d1310b2e 4411 */
c8f2f24b 4412 if (!trylock_page(page)) {
f4340622
QW
4413 ret = flush_write_bio(epd);
4414 BUG_ON(ret < 0);
c8f2f24b 4415 lock_page(page);
01d658f2 4416 }
d1310b2e
CM
4417
4418 if (unlikely(page->mapping != mapping)) {
4419 unlock_page(page);
4420 continue;
4421 }
4422
d2c3f4f6 4423 if (wbc->sync_mode != WB_SYNC_NONE) {
f4340622
QW
4424 if (PageWriteback(page)) {
4425 ret = flush_write_bio(epd);
4426 BUG_ON(ret < 0);
4427 }
d1310b2e 4428 wait_on_page_writeback(page);
d2c3f4f6 4429 }
d1310b2e
CM
4430
4431 if (PageWriteback(page) ||
4432 !clear_page_dirty_for_io(page)) {
4433 unlock_page(page);
4434 continue;
4435 }
4436
aab6e9ed 4437 ret = __extent_writepage(page, wbc, epd);
a9132667 4438 if (ret < 0) {
a9132667
LB
4439 done = 1;
4440 break;
4441 }
f85d7d6c
CM
4442
4443 /*
4444 * the filesystem may choose to bump up nr_to_write.
4445 * We have to make sure to honor the new nr_to_write
4446 * at any time
4447 */
4448 nr_to_write_done = wbc->nr_to_write <= 0;
d1310b2e
CM
4449 }
4450 pagevec_release(&pvec);
4451 cond_resched();
4452 }
894b36e3 4453 if (!scanned && !done) {
d1310b2e
CM
4454 /*
4455 * We hit the last page and there is more work to be done: wrap
4456 * back to the start of the file
4457 */
4458 scanned = 1;
4459 index = 0;
42ffb0bf
JB
4460
4461 /*
4462 * If we're looping we could run into a page that is locked by a
4463 * writer and that writer could be waiting on writeback for a
4464 * page in our current bio, and thus deadlock, so flush the
4465 * write bio here.
4466 */
4467 ret = flush_write_bio(epd);
4468 if (!ret)
4469 goto retry;
d1310b2e 4470 }
a9132667
LB
4471
4472 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4473 mapping->writeback_index = done_index;
4474
7fd1a3f7 4475 btrfs_add_delayed_iput(inode);
894b36e3 4476 return ret;
d1310b2e 4477}
d1310b2e 4478
0a9b0e53 4479int extent_write_full_page(struct page *page, struct writeback_control *wbc)
d1310b2e
CM
4480{
4481 int ret;
d1310b2e
CM
4482 struct extent_page_data epd = {
4483 .bio = NULL,
771ed689 4484 .extent_locked = 0,
ffbd517d 4485 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e 4486 };
d1310b2e 4487
d1310b2e 4488 ret = __extent_writepage(page, wbc, &epd);
3065976b
QW
4489 ASSERT(ret <= 0);
4490 if (ret < 0) {
4491 end_write_bio(&epd, ret);
4492 return ret;
4493 }
d1310b2e 4494
3065976b
QW
4495 ret = flush_write_bio(&epd);
4496 ASSERT(ret <= 0);
d1310b2e
CM
4497 return ret;
4498}
d1310b2e 4499
5e3ee236 4500int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
771ed689
CM
4501 int mode)
4502{
4503 int ret = 0;
4504 struct address_space *mapping = inode->i_mapping;
4505 struct page *page;
09cbfeaf
KS
4506 unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4507 PAGE_SHIFT;
771ed689
CM
4508
4509 struct extent_page_data epd = {
4510 .bio = NULL,
771ed689 4511 .extent_locked = 1,
ffbd517d 4512 .sync_io = mode == WB_SYNC_ALL,
771ed689
CM
4513 };
4514 struct writeback_control wbc_writepages = {
771ed689 4515 .sync_mode = mode,
771ed689
CM
4516 .nr_to_write = nr_pages * 2,
4517 .range_start = start,
4518 .range_end = end + 1,
ec39f769
CM
4519 /* We're called from an async helper function */
4520 .punt_to_cgroup = 1,
4521 .no_cgroup_owner = 1,
771ed689
CM
4522 };
4523
dbb70bec 4524 wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
d397712b 4525 while (start <= end) {
09cbfeaf 4526 page = find_get_page(mapping, start >> PAGE_SHIFT);
771ed689
CM
4527 if (clear_page_dirty_for_io(page))
4528 ret = __extent_writepage(page, &wbc_writepages, &epd);
4529 else {
7087a9d8 4530 btrfs_writepage_endio_finish_ordered(page, start,
c629732d 4531 start + PAGE_SIZE - 1, 1);
771ed689
CM
4532 unlock_page(page);
4533 }
09cbfeaf
KS
4534 put_page(page);
4535 start += PAGE_SIZE;
771ed689
CM
4536 }
4537
02c6db4f 4538 ASSERT(ret <= 0);
dbb70bec
CM
4539 if (ret == 0)
4540 ret = flush_write_bio(&epd);
4541 else
02c6db4f 4542 end_write_bio(&epd, ret);
dbb70bec
CM
4543
4544 wbc_detach_inode(&wbc_writepages);
771ed689
CM
4545 return ret;
4546}
d1310b2e 4547
8ae225a8 4548int extent_writepages(struct address_space *mapping,
d1310b2e
CM
4549 struct writeback_control *wbc)
4550{
4551 int ret = 0;
4552 struct extent_page_data epd = {
4553 .bio = NULL,
771ed689 4554 .extent_locked = 0,
ffbd517d 4555 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
d1310b2e
CM
4556 };
4557
935db853 4558 ret = extent_write_cache_pages(mapping, wbc, &epd);
a2a72fbd
QW
4559 ASSERT(ret <= 0);
4560 if (ret < 0) {
4561 end_write_bio(&epd, ret);
4562 return ret;
4563 }
4564 ret = flush_write_bio(&epd);
d1310b2e
CM
4565 return ret;
4566}
d1310b2e 4567
ba206a02 4568void extent_readahead(struct readahead_control *rac)
d1310b2e
CM
4569{
4570 struct bio *bio = NULL;
c8b97818 4571 unsigned long bio_flags = 0;
67c9684f 4572 struct page *pagepool[16];
125bac01 4573 struct extent_map *em_cached = NULL;
808f80b4 4574 u64 prev_em_start = (u64)-1;
ba206a02 4575 int nr;
d1310b2e 4576
ba206a02
MWO
4577 while ((nr = readahead_page_batch(rac, pagepool))) {
4578 u64 contig_start = page_offset(pagepool[0]);
4579 u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
e65ef21e 4580
ba206a02 4581 ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
e65ef21e 4582
ba206a02
MWO
4583 contiguous_readpages(pagepool, nr, contig_start, contig_end,
4584 &em_cached, &bio, &bio_flags, &prev_em_start);
d1310b2e 4585 }
67c9684f 4586
125bac01
MX
4587 if (em_cached)
4588 free_extent_map(em_cached);
4589
ba206a02
MWO
4590 if (bio) {
4591 if (submit_one_bio(bio, 0, bio_flags))
4592 return;
4593 }
d1310b2e 4594}
d1310b2e
CM
4595
4596/*
4597 * basic invalidatepage code, this waits on any locked or writeback
4598 * ranges corresponding to the page, and then deletes any extent state
4599 * records from the tree
4600 */
4601int extent_invalidatepage(struct extent_io_tree *tree,
4602 struct page *page, unsigned long offset)
4603{
2ac55d41 4604 struct extent_state *cached_state = NULL;
4eee4fa4 4605 u64 start = page_offset(page);
09cbfeaf 4606 u64 end = start + PAGE_SIZE - 1;
d1310b2e
CM
4607 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4608
829ddec9
QW
4609 /* This function is only called for the btree inode */
4610 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
4611
fda2832f 4612 start += ALIGN(offset, blocksize);
d1310b2e
CM
4613 if (start > end)
4614 return 0;
4615
ff13db41 4616 lock_extent_bits(tree, start, end, &cached_state);
1edbb734 4617 wait_on_page_writeback(page);
829ddec9
QW
4618
4619 /*
4620 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
4621 * so here we only need to unlock the extent range to free any
4622 * existing extent state.
4623 */
4624 unlock_extent_cached(tree, start, end, &cached_state);
d1310b2e
CM
4625 return 0;
4626}
d1310b2e 4627
7b13b7b1
CM
4628/*
4629 * a helper for releasepage, this tests for areas of the page that
4630 * are locked or under IO and drops the related state bits if it is safe
4631 * to drop the page.
4632 */
29c68b2d 4633static int try_release_extent_state(struct extent_io_tree *tree,
48a3b636 4634 struct page *page, gfp_t mask)
7b13b7b1 4635{
4eee4fa4 4636 u64 start = page_offset(page);
09cbfeaf 4637 u64 end = start + PAGE_SIZE - 1;
7b13b7b1
CM
4638 int ret = 1;
4639
8882679e 4640 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
7b13b7b1 4641 ret = 0;
8882679e 4642 } else {
11ef160f 4643 /*
2766ff61
FM
4644 * At this point we can safely clear everything except the
4645 * locked bit, the nodatasum bit and the delalloc new bit.
4646 * The delalloc new bit will be cleared by ordered extent
4647 * completion.
11ef160f 4648 */
66b0c887 4649 ret = __clear_extent_bit(tree, start, end,
2766ff61
FM
4650 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
4651 0, 0, NULL, mask, NULL);
e3f24cc5
CM
4652
4653 /* if clear_extent_bit failed for enomem reasons,
4654 * we can't allow the release to continue.
4655 */
4656 if (ret < 0)
4657 ret = 0;
4658 else
4659 ret = 1;
7b13b7b1
CM
4660 }
4661 return ret;
4662}
7b13b7b1 4663
d1310b2e
CM
4664/*
4665 * a helper for releasepage. As long as there are no locked extents
4666 * in the range corresponding to the page, both state records and extent
4667 * map records are removed
4668 */
477a30ba 4669int try_release_extent_mapping(struct page *page, gfp_t mask)
d1310b2e
CM
4670{
4671 struct extent_map *em;
4eee4fa4 4672 u64 start = page_offset(page);
09cbfeaf 4673 u64 end = start + PAGE_SIZE - 1;
bd3599a0
FM
4674 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4675 struct extent_io_tree *tree = &btrfs_inode->io_tree;
4676 struct extent_map_tree *map = &btrfs_inode->extent_tree;
7b13b7b1 4677
d0164adc 4678 if (gfpflags_allow_blocking(mask) &&
ee22184b 4679 page->mapping->host->i_size > SZ_16M) {
39b5637f 4680 u64 len;
70dec807 4681 while (start <= end) {
fbc2bd7e
FM
4682 struct btrfs_fs_info *fs_info;
4683 u64 cur_gen;
4684
39b5637f 4685 len = end - start + 1;
890871be 4686 write_lock(&map->lock);
39b5637f 4687 em = lookup_extent_mapping(map, start, len);
285190d9 4688 if (!em) {
890871be 4689 write_unlock(&map->lock);
70dec807
CM
4690 break;
4691 }
7f3c74fb
CM
4692 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4693 em->start != start) {
890871be 4694 write_unlock(&map->lock);
70dec807
CM
4695 free_extent_map(em);
4696 break;
4697 }
3d6448e6
FM
4698 if (test_range_bit(tree, em->start,
4699 extent_map_end(em) - 1,
4700 EXTENT_LOCKED, 0, NULL))
4701 goto next;
4702 /*
4703 * If it's not in the list of modified extents, used
4704 * by a fast fsync, we can remove it. If it's being
4705 * logged we can safely remove it since fsync took an
4706 * extra reference on the em.
4707 */
4708 if (list_empty(&em->list) ||
fbc2bd7e
FM
4709 test_bit(EXTENT_FLAG_LOGGING, &em->flags))
4710 goto remove_em;
4711 /*
4712 * If it's in the list of modified extents, remove it
4713 * only if its generation is older than the current one,
4714 * in which case we don't need it for a fast fsync.
4715 * Otherwise don't remove it, we could be racing with an
4716 * ongoing fast fsync that could miss the new extent.
4717 */
4718 fs_info = btrfs_inode->root->fs_info;
4719 spin_lock(&fs_info->trans_lock);
4720 cur_gen = fs_info->generation;
4721 spin_unlock(&fs_info->trans_lock);
4722 if (em->generation >= cur_gen)
4723 goto next;
4724remove_em:
5e548b32
FM
4725 /*
4726 * We only remove extent maps that are not in the list of
4727 * modified extents or that are in the list but with a
4728 * generation lower than the current generation, so there
4729 * is no need to set the full fsync flag on the inode (it
4730 * hurts the fsync performance for workloads with a data
4731 * size that exceeds or is close to the system's memory).
4732 */
fbc2bd7e
FM
4733 remove_extent_mapping(map, em);
4734 /* once for the rb tree */
4735 free_extent_map(em);
3d6448e6 4736next:
70dec807 4737 start = extent_map_end(em);
890871be 4738 write_unlock(&map->lock);
70dec807
CM
4739
4740 /* once for us */
d1310b2e 4741 free_extent_map(em);
9f47eb54
PM
4742
4743 cond_resched(); /* Allow large-extent preemption. */
d1310b2e 4744 }
d1310b2e 4745 }
29c68b2d 4746 return try_release_extent_state(tree, page, mask);
d1310b2e 4747}
d1310b2e 4748
ec29ed5b
CM
4749/*
4750 * helper function for fiemap, which doesn't want to see any holes.
4751 * This maps until we find something past 'last'
4752 */
f1bbde8d 4753static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
e3350e16 4754 u64 offset, u64 last)
ec29ed5b 4755{
f1bbde8d 4756 u64 sectorsize = btrfs_inode_sectorsize(inode);
ec29ed5b
CM
4757 struct extent_map *em;
4758 u64 len;
4759
4760 if (offset >= last)
4761 return NULL;
4762
67871254 4763 while (1) {
ec29ed5b
CM
4764 len = last - offset;
4765 if (len == 0)
4766 break;
fda2832f 4767 len = ALIGN(len, sectorsize);
f1bbde8d 4768 em = btrfs_get_extent_fiemap(inode, offset, len);
c704005d 4769 if (IS_ERR_OR_NULL(em))
ec29ed5b
CM
4770 return em;
4771
4772 /* if this isn't a hole return it */
4a2d25cd 4773 if (em->block_start != EXTENT_MAP_HOLE)
ec29ed5b 4774 return em;
ec29ed5b
CM
4775
4776 /* this is a hole, advance to the next extent */
4777 offset = extent_map_end(em);
4778 free_extent_map(em);
4779 if (offset >= last)
4780 break;
4781 }
4782 return NULL;
4783}
4784
4751832d
QW
4785/*
4786 * To cache previous fiemap extent
4787 *
4788 * Will be used for merging fiemap extent
4789 */
4790struct fiemap_cache {
4791 u64 offset;
4792 u64 phys;
4793 u64 len;
4794 u32 flags;
4795 bool cached;
4796};
4797
4798/*
4799 * Helper to submit fiemap extent.
4800 *
4801 * Will try to merge current fiemap extent specified by @offset, @phys,
4802 * @len and @flags with cached one.
4803 * Only when we fail to merge will the cached one be submitted as a
4804 * fiemap extent.
4805 *
4806 * Return value is the same as fiemap_fill_next_extent().
4807 */
4808static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4809 struct fiemap_cache *cache,
4810 u64 offset, u64 phys, u64 len, u32 flags)
4811{
4812 int ret = 0;
4813
4814 if (!cache->cached)
4815 goto assign;
4816
4817 /*
4818 * Sanity check, extent_fiemap() should have ensured that new
52042d8e 4819 * fiemap extent won't overlap with cached one.
4751832d
QW
4820 * Not recoverable.
4821 *
4822 * NOTE: Physical address can overlap, due to compression
4823 */
4824 if (cache->offset + cache->len > offset) {
4825 WARN_ON(1);
4826 return -EINVAL;
4827 }
4828
4829 /*
4830 * Only merges fiemap extents if
4831 * 1) Their logical addresses are continuous
4832 *
4833 * 2) Their physical addresses are continuous
4834 * So truly compressed (physical size smaller than logical size)
4835 * extents won't get merged with each other
4836 *
4837 * 3) Share same flags except FIEMAP_EXTENT_LAST
4838 * So regular extent won't get merged with prealloc extent
4839 */
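/*
 * Editor's example (assumed values): two 4K extents at file offsets 0 and
 * 4K, physically at P and P + 4K with identical flags, merge into a
 * single 8K fiemap extent below; a compressed extent whose physical size
 * is smaller than its logical size breaks condition 2) and is emitted on
 * its own.
 */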
4840 if (cache->offset + cache->len == offset &&
4841 cache->phys + cache->len == phys &&
4842 (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4843 (flags & ~FIEMAP_EXTENT_LAST)) {
4844 cache->len += len;
4845 cache->flags |= flags;
4846 goto try_submit_last;
4847 }
4848
4849 /* Not mergeable, need to submit cached one */
4850 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4851 cache->len, cache->flags);
4852 cache->cached = false;
4853 if (ret)
4854 return ret;
4855assign:
4856 cache->cached = true;
4857 cache->offset = offset;
4858 cache->phys = phys;
4859 cache->len = len;
4860 cache->flags = flags;
4861try_submit_last:
4862 if (cache->flags & FIEMAP_EXTENT_LAST) {
4863 ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4864 cache->phys, cache->len, cache->flags);
4865 cache->cached = false;
4866 }
4867 return ret;
4868}
4869
4870/*
848c23b7 4871 * Emit last fiemap cache
4751832d 4872 *
848c23b7
QW
4873 * The last fiemap cache may still be cached in the following case:
4874 * 0 4k 8k
4875 * |<- Fiemap range ->|
4876 * |<------------ First extent ----------->|
4877 *
4878 * In this case, the first extent range will be cached but not emitted.
4879 * So we must emit it before ending extent_fiemap().
4751832d 4880 */
5c5aff98 4881static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
848c23b7 4882 struct fiemap_cache *cache)
4751832d
QW
4883{
4884 int ret;
4885
4886 if (!cache->cached)
4887 return 0;
4888
4751832d
QW
4889 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4890 cache->len, cache->flags);
4891 cache->cached = false;
4892 if (ret > 0)
4893 ret = 0;
4894 return ret;
4895}
4896
facee0a0 4897int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
bab16e21 4898 u64 start, u64 len)
1506fcc8 4899{
975f84fe 4900 int ret = 0;
1506fcc8
YS
4901 u64 off = start;
4902 u64 max = start + len;
4903 u32 flags = 0;
975f84fe
JB
4904 u32 found_type;
4905 u64 last;
ec29ed5b 4906 u64 last_for_get_extent = 0;
1506fcc8 4907 u64 disko = 0;
facee0a0 4908 u64 isize = i_size_read(&inode->vfs_inode);
975f84fe 4909 struct btrfs_key found_key;
1506fcc8 4910 struct extent_map *em = NULL;
2ac55d41 4911 struct extent_state *cached_state = NULL;
975f84fe 4912 struct btrfs_path *path;
facee0a0 4913 struct btrfs_root *root = inode->root;
4751832d 4914 struct fiemap_cache cache = { 0 };
5911c8fe
DS
4915 struct ulist *roots;
4916 struct ulist *tmp_ulist;
1506fcc8 4917 int end = 0;
ec29ed5b
CM
4918 u64 em_start = 0;
4919 u64 em_len = 0;
4920 u64 em_end = 0;
1506fcc8
YS
4921
4922 if (len == 0)
4923 return -EINVAL;
4924
975f84fe
JB
4925 path = btrfs_alloc_path();
4926 if (!path)
4927 return -ENOMEM;
975f84fe 4928
5911c8fe
DS
4929 roots = ulist_alloc(GFP_KERNEL);
4930 tmp_ulist = ulist_alloc(GFP_KERNEL);
4931 if (!roots || !tmp_ulist) {
4932 ret = -ENOMEM;
4933 goto out_free_ulist;
4934 }
4935
facee0a0
NB
4936 start = round_down(start, btrfs_inode_sectorsize(inode));
4937 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4d479cf0 4938
ec29ed5b
CM
4939 /*
4940 * lookup the last file extent. We're not using i_size here
4941 * because there might be preallocation past i_size
4942 */
facee0a0
NB
4943 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
4944 0);
975f84fe 4945 if (ret < 0) {
5911c8fe 4946 goto out_free_ulist;
2d324f59
LB
4947 } else {
4948 WARN_ON(!ret);
4949 if (ret == 1)
4950 ret = 0;
975f84fe 4951 }
2d324f59 4952
975f84fe 4953 path->slots[0]--;
975f84fe 4954 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
962a298f 4955 found_type = found_key.type;
975f84fe 4956
ec29ed5b 4957 /* No extents, but there might be delalloc bits */
facee0a0 4958 if (found_key.objectid != btrfs_ino(inode) ||
975f84fe 4959 found_type != BTRFS_EXTENT_DATA_KEY) {
ec29ed5b
CM
4960 /* have to trust i_size as the end */
4961 last = (u64)-1;
4962 last_for_get_extent = isize;
4963 } else {
4964 /*
4965 * remember the start of the last extent. There are a
4966 * bunch of different factors that go into the length of the
4967 * extent, so it's much less complex to remember where it started
4968 */
4969 last = found_key.offset;
4970 last_for_get_extent = last + 1;
975f84fe 4971 }
fe09e16c 4972 btrfs_release_path(path);
975f84fe 4973
ec29ed5b
CM
4974 /*
4975 * we might have some extents allocated but more delalloc past those
4976 * extents. so, we trust isize unless the start of the last extent is
4977 * beyond isize
4978 */
4979 if (last < isize) {
4980 last = (u64)-1;
4981 last_for_get_extent = isize;
4982 }
4983
facee0a0 4984 lock_extent_bits(&inode->io_tree, start, start + len - 1,
d0082371 4985 &cached_state);
ec29ed5b 4986
facee0a0 4987 em = get_extent_skip_holes(inode, start, last_for_get_extent);
1506fcc8
YS
4988 if (!em)
4989 goto out;
4990 if (IS_ERR(em)) {
4991 ret = PTR_ERR(em);
4992 goto out;
4993 }
975f84fe 4994
1506fcc8 4995 while (!end) {
b76bb701 4996 u64 offset_in_extent = 0;
ea8efc74
CM
4997
4998 /* break if the extent we found is outside the range */
4999 if (em->start >= max || extent_map_end(em) < off)
5000 break;
5001
5002 /*
5003 * get_extent may return an extent that starts before our
5004 * requested range. We have to make sure the ranges
5005 * we return to fiemap always move forward and don't
5006 * overlap, so adjust the offsets here
5007 */
5008 em_start = max(em->start, off);
1506fcc8 5009
ea8efc74
CM
5010 /*
5011 * record the offset from the start of the extent
b76bb701
JB
5012 * for adjusting the disk offset below. Only do this if the
5013 * extent isn't compressed since our in ram offset may be past
5014 * what we have actually allocated on disk.
ea8efc74 5015 */
b76bb701
JB
5016 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5017 offset_in_extent = em_start - em->start;
ec29ed5b 5018 em_end = extent_map_end(em);
ea8efc74 5019 em_len = em_end - em_start;
1506fcc8 5020 flags = 0;
f0986318
FM
5021 if (em->block_start < EXTENT_MAP_LAST_BYTE)
5022 disko = em->block_start + offset_in_extent;
5023 else
5024 disko = 0;
1506fcc8 5025
ea8efc74
CM
5026 /*
5027 * bump off for our next call to get_extent
5028 */
5029 off = extent_map_end(em);
5030 if (off >= max)
5031 end = 1;
5032
93dbfad7 5033 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
1506fcc8
YS
5034 end = 1;
5035 flags |= FIEMAP_EXTENT_LAST;
93dbfad7 5036 } else if (em->block_start == EXTENT_MAP_INLINE) {
1506fcc8
YS
5037 flags |= (FIEMAP_EXTENT_DATA_INLINE |
5038 FIEMAP_EXTENT_NOT_ALIGNED);
93dbfad7 5039 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
1506fcc8
YS
5040 flags |= (FIEMAP_EXTENT_DELALLOC |
5041 FIEMAP_EXTENT_UNKNOWN);
dc046b10
JB
5042 } else if (fieinfo->fi_extents_max) {
5043 u64 bytenr = em->block_start -
5044 (em->start - em->orig_start);
fe09e16c 5045
fe09e16c
LB
5046 /*
5047 * As btrfs supports shared space, this information
5048 * can be exported to userspace tools via
dc046b10
JB
5049 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
5050 * then we're just getting a count and we can skip the
5051 * lookup stuff.
fe09e16c 5052 */
facee0a0 5053 ret = btrfs_check_shared(root, btrfs_ino(inode),
5911c8fe 5054 bytenr, roots, tmp_ulist);
dc046b10 5055 if (ret < 0)
fe09e16c 5056 goto out_free;
dc046b10 5057 if (ret)
fe09e16c 5058 flags |= FIEMAP_EXTENT_SHARED;
dc046b10 5059 ret = 0;
1506fcc8
YS
5060 }
5061 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5062 flags |= FIEMAP_EXTENT_ENCODED;
0d2b2372
JB
5063 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5064 flags |= FIEMAP_EXTENT_UNWRITTEN;
1506fcc8 5065
1506fcc8
YS
5066 free_extent_map(em);
5067 em = NULL;
ec29ed5b
CM
5068 if ((em_start >= last) || em_len == (u64)-1 ||
5069 (last == (u64)-1 && isize <= em_end)) {
1506fcc8
YS
5070 flags |= FIEMAP_EXTENT_LAST;
5071 end = 1;
5072 }
5073
ec29ed5b 5074 /* now scan forward to see if this is really the last extent. */
facee0a0 5075 em = get_extent_skip_holes(inode, off, last_for_get_extent);
ec29ed5b
CM
5076 if (IS_ERR(em)) {
5077 ret = PTR_ERR(em);
5078 goto out;
5079 }
5080 if (!em) {
975f84fe
JB
5081 flags |= FIEMAP_EXTENT_LAST;
5082 end = 1;
5083 }
4751832d
QW
5084 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5085 em_len, flags);
26e726af
CS
5086 if (ret) {
5087 if (ret == 1)
5088 ret = 0;
ec29ed5b 5089 goto out_free;
26e726af 5090 }
1506fcc8
YS
5091 }
5092out_free:
4751832d 5093 if (!ret)
5c5aff98 5094 ret = emit_last_fiemap_cache(fieinfo, &cache);
1506fcc8
YS
5095 free_extent_map(em);
5096out:
facee0a0 5097 unlock_extent_cached(&inode->io_tree, start, start + len - 1,
e43bbe5e 5098 &cached_state);
5911c8fe
DS
5099
5100out_free_ulist:
e02d48ea 5101 btrfs_free_path(path);
5911c8fe
DS
5102 ulist_free(roots);
5103 ulist_free(tmp_ulist);
1506fcc8
YS
5104 return ret;
5105}
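
/*
 * Illustrative only, not part of extent_io.c: a minimal userspace sketch of
 * how the fiemap path above is normally exercised, via the FS_IOC_FIEMAP
 * ioctl. The extent count of 32, the lack of retry logic and the helper
 * name dump_fiemap() are arbitrary simplifications for this sketch.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

static int dump_fiemap(const char *path)
{
	struct fiemap *fm;
	unsigned int i;
	int fd, ret;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	/* Room for up to 32 extents in a single call. */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm) {
		close(fd);
		return -1;
	}
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_extent_count = 32;
	ret = ioctl(fd, FS_IOC_FIEMAP, fm);
	if (ret == 0) {
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("logical %llu physical %llu len %llu flags 0x%x\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);
	}
	free(fm);
	close(fd);
	return ret;
}
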
5106
727011e0
CM
5107static void __free_extent_buffer(struct extent_buffer *eb)
5108{
727011e0
CM
5109 kmem_cache_free(extent_buffer_cache, eb);
5110}
5111
2b48966a 5112int extent_buffer_under_io(const struct extent_buffer *eb)
db7f3436
JB
5113{
5114 return (atomic_read(&eb->io_pages) ||
5115 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5116 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5117}
5118
8ff8466d 5119static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
db7f3436 5120{
8ff8466d 5121 struct btrfs_subpage *subpage;
db7f3436 5122
8ff8466d 5123 lockdep_assert_held(&page->mapping->private_lock);
db7f3436 5124
8ff8466d
QW
5125 if (PagePrivate(page)) {
5126 subpage = (struct btrfs_subpage *)page->private;
5127 if (atomic_read(&subpage->eb_refs))
5128 return true;
5129 }
5130 return false;
5131}
db7f3436 5132
8ff8466d
QW
5133static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5134{
5135 struct btrfs_fs_info *fs_info = eb->fs_info;
5136 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5137
5138 /*
5139 * For mapped eb, we're going to change the page private, which should
5140 * be done under the private_lock.
5141 */
5142 if (mapped)
5143 spin_lock(&page->mapping->private_lock);
5144
5145 if (!PagePrivate(page)) {
5d2361db 5146 if (mapped)
8ff8466d
QW
5147 spin_unlock(&page->mapping->private_lock);
5148 return;
5149 }
5150
5151 if (fs_info->sectorsize == PAGE_SIZE) {
5d2361db
FL
5152 /*
5153 * We do this since we'll remove the pages after we've
5154 * removed the eb from the radix tree, so we could race
5155 * and have this page now attached to the new eb. So
5156 * only clear page_private if it's still connected to
5157 * this eb.
5158 */
5159 if (PagePrivate(page) &&
5160 page->private == (unsigned long)eb) {
5161 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5162 BUG_ON(PageDirty(page));
5163 BUG_ON(PageWriteback(page));
db7f3436 5164 /*
5d2361db
FL
5165			 * We need to make sure we haven't been attached
5166 * to a new eb.
db7f3436 5167 */
d1b89bc0 5168 detach_page_private(page);
db7f3436 5169 }
5d2361db
FL
5170 if (mapped)
5171 spin_unlock(&page->mapping->private_lock);
8ff8466d
QW
5172 return;
5173 }
5174
5175 /*
5176	 * For subpage, we can have a dummy eb with page private. In this case,
5177	 * we can directly detach the private, as such a page is only attached
5178	 * to one dummy eb, with no sharing.
5179 */
5180 if (!mapped) {
5181 btrfs_detach_subpage(fs_info, page);
5182 return;
5183 }
5184
5185 btrfs_page_dec_eb_refs(fs_info, page);
5186
5187 /*
5188 * We can only detach the page private if there are no other ebs in the
5189 * page range.
5190 */
5191 if (!page_range_has_eb(fs_info, page))
5192 btrfs_detach_subpage(fs_info, page);
5193
5194 spin_unlock(&page->mapping->private_lock);
5195}
5196
5197/* Release all pages attached to the extent buffer */
5198static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5199{
5200 int i;
5201 int num_pages;
5202
5203 ASSERT(!extent_buffer_under_io(eb));
5204
5205 num_pages = num_extent_pages(eb);
5206 for (i = 0; i < num_pages; i++) {
5207 struct page *page = eb->pages[i];
5208
5209 if (!page)
5210 continue;
5211
5212 detach_extent_buffer_page(eb, page);
5d2361db 5213
01327610 5214 /* One for when we allocated the page */
09cbfeaf 5215 put_page(page);
d64766fd 5216 }
db7f3436
JB
5217}
5218
5219/*
5220 * Helper for releasing the extent buffer.
5221 */
5222static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5223{
55ac0139 5224 btrfs_release_extent_buffer_pages(eb);
8c38938c 5225 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
db7f3436
JB
5226 __free_extent_buffer(eb);
5227}
5228
f28491e0
JB
5229static struct extent_buffer *
5230__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
23d79d81 5231 unsigned long len)
d1310b2e
CM
5232{
5233 struct extent_buffer *eb = NULL;
5234
d1b5c567 5235 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
d1310b2e
CM
5236 eb->start = start;
5237 eb->len = len;
f28491e0 5238 eb->fs_info = fs_info;
815a51c7 5239 eb->bflags = 0;
196d59ab 5240 init_rwsem(&eb->lock);
b4ce94de 5241
3fd63727
JB
5242 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5243 &fs_info->allocated_ebs);
d3575156 5244 INIT_LIST_HEAD(&eb->release_list);
6d49ba1b 5245
3083ee2e 5246 spin_lock_init(&eb->refs_lock);
d1310b2e 5247 atomic_set(&eb->refs, 1);
0b32f4bb 5248 atomic_set(&eb->io_pages, 0);
727011e0 5249
deb67895 5250 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
d1310b2e
CM
5251
5252 return eb;
5253}
5254
2b48966a 5255struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
815a51c7 5256{
cc5e31a4 5257 int i;
815a51c7
JS
5258 struct page *p;
5259 struct extent_buffer *new;
cc5e31a4 5260 int num_pages = num_extent_pages(src);
815a51c7 5261
3f556f78 5262 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
815a51c7
JS
5263 if (new == NULL)
5264 return NULL;
5265
62c053fb
QW
5266 /*
5267 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5268	 * btrfs_release_extent_buffer() has different behavior for
5269	 * UNMAPPED subpage extent buffers.
5270 */
5271 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5272
815a51c7 5273 for (i = 0; i < num_pages; i++) {
760f991f
QW
5274 int ret;
5275
9ec72677 5276 p = alloc_page(GFP_NOFS);
db7f3436
JB
5277 if (!p) {
5278 btrfs_release_extent_buffer(new);
5279 return NULL;
5280 }
760f991f
QW
5281 ret = attach_extent_buffer_page(new, p, NULL);
5282 if (ret < 0) {
5283 put_page(p);
5284 btrfs_release_extent_buffer(new);
5285 return NULL;
5286 }
815a51c7 5287 WARN_ON(PageDirty(p));
815a51c7 5288 new->pages[i] = p;
fba1acf9 5289 copy_page(page_address(p), page_address(src->pages[i]));
815a51c7 5290 }
92d83e94 5291 set_extent_buffer_uptodate(new);
815a51c7
JS
5292
5293 return new;
5294}
5295
0f331229
OS
5296struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5297 u64 start, unsigned long len)
815a51c7
JS
5298{
5299 struct extent_buffer *eb;
cc5e31a4
DS
5300 int num_pages;
5301 int i;
815a51c7 5302
3f556f78 5303 eb = __alloc_extent_buffer(fs_info, start, len);
815a51c7
JS
5304 if (!eb)
5305 return NULL;
5306
65ad0104 5307 num_pages = num_extent_pages(eb);
815a51c7 5308 for (i = 0; i < num_pages; i++) {
09bc1f0f
QW
5309 int ret;
5310
9ec72677 5311 eb->pages[i] = alloc_page(GFP_NOFS);
815a51c7
JS
5312 if (!eb->pages[i])
5313 goto err;
09bc1f0f
QW
5314 ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
5315 if (ret < 0)
5316 goto err;
815a51c7
JS
5317 }
5318 set_extent_buffer_uptodate(eb);
5319 btrfs_set_header_nritems(eb, 0);
b0132a3b 5320 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
815a51c7
JS
5321
5322 return eb;
5323err:
09bc1f0f
QW
5324 for (; i > 0; i--) {
5325 detach_extent_buffer_page(eb, eb->pages[i - 1]);
84167d19 5326 __free_page(eb->pages[i - 1]);
09bc1f0f 5327 }
815a51c7
JS
5328 __free_extent_buffer(eb);
5329 return NULL;
5330}
5331
0f331229 5332struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5333 u64 start)
0f331229 5334{
da17066c 5335 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
0f331229
OS
5336}
5337
0b32f4bb
JB
5338static void check_buffer_tree_ref(struct extent_buffer *eb)
5339{
242e18c7 5340 int refs;
6bf9cd2e
BB
5341 /*
5342 * The TREE_REF bit is first set when the extent_buffer is added
5343	 * to the radix tree. It is also set again, if cleared, when a new reference
5344 * is created by find_extent_buffer.
0b32f4bb 5345 *
6bf9cd2e
BB
5346 * It is only cleared in two cases: freeing the last non-tree
5347 * reference to the extent_buffer when its STALE bit is set or
5348 * calling releasepage when the tree reference is the only reference.
0b32f4bb 5349 *
6bf9cd2e
BB
5350 * In both cases, care is taken to ensure that the extent_buffer's
5351	 * pages are not under IO. However, releasepage can run concurrently
5352	 * with the creation of new references, which is prone to race
5353 * conditions between the calls to check_buffer_tree_ref in those
5354 * codepaths and clearing TREE_REF in try_release_extent_buffer.
0b32f4bb 5355 *
6bf9cd2e
BB
5356 * The actual lifetime of the extent_buffer in the radix tree is
5357 * adequately protected by the refcount, but the TREE_REF bit and
5358 * its corresponding reference are not. To protect against this
5359 * class of races, we call check_buffer_tree_ref from the codepaths
5360 * which trigger io after they set eb->io_pages. Note that once io is
5361 * initiated, TREE_REF can no longer be cleared, so that is the
5362 * moment at which any such race is best fixed.
0b32f4bb 5363 */
242e18c7
CM
5364 refs = atomic_read(&eb->refs);
5365 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5366 return;
5367
594831c4
JB
5368 spin_lock(&eb->refs_lock);
5369 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
0b32f4bb 5370 atomic_inc(&eb->refs);
594831c4 5371 spin_unlock(&eb->refs_lock);
0b32f4bb
JB
5372}
5373
2457aec6
MG
5374static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5375 struct page *accessed)
5df4235e 5376{
cc5e31a4 5377 int num_pages, i;
5df4235e 5378
0b32f4bb
JB
5379 check_buffer_tree_ref(eb);
5380
65ad0104 5381 num_pages = num_extent_pages(eb);
5df4235e 5382 for (i = 0; i < num_pages; i++) {
fb85fc9a
DS
5383 struct page *p = eb->pages[i];
5384
2457aec6
MG
5385 if (p != accessed)
5386 mark_page_accessed(p);
5df4235e
JB
5387 }
5388}
5389
f28491e0
JB
5390struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5391 u64 start)
452c75c3
CS
5392{
5393 struct extent_buffer *eb;
5394
5395 rcu_read_lock();
f28491e0 5396 eb = radix_tree_lookup(&fs_info->buffer_radix,
478ef886 5397 start >> fs_info->sectorsize_bits);
452c75c3
CS
5398 if (eb && atomic_inc_not_zero(&eb->refs)) {
5399 rcu_read_unlock();
062c19e9
FM
5400 /*
5401 * Lock our eb's refs_lock to avoid races with
5402 * free_extent_buffer. When we get our eb it might be flagged
5403 * with EXTENT_BUFFER_STALE and another task running
5404 * free_extent_buffer might have seen that flag set,
5405 * eb->refs == 2, that the buffer isn't under IO (dirty and
5406 * writeback flags not set) and it's still in the tree (flag
5407 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
5408 * of decrementing the extent buffer's reference count twice.
5409 * So here we could race and increment the eb's reference count,
5410 * clear its stale flag, mark it as dirty and drop our reference
5411 * before the other task finishes executing free_extent_buffer,
5412 * which would later result in an attempt to free an extent
5413 * buffer that is dirty.
5414 */
5415 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5416 spin_lock(&eb->refs_lock);
5417 spin_unlock(&eb->refs_lock);
5418 }
2457aec6 5419 mark_extent_buffer_accessed(eb, NULL);
452c75c3
CS
5420 return eb;
5421 }
5422 rcu_read_unlock();
5423
5424 return NULL;
5425}
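
/*
 * Illustrative only: the usual caller pattern for the lookup above. A hit
 * returns the eb with an elevated refcount, so every successful
 * find_extent_buffer() must be paired with a free_extent_buffer(). The
 * helper name below is made up for this sketch.
 */
static bool example_eb_is_cached_uptodate(struct btrfs_fs_info *fs_info,
					  u64 start)
{
	struct extent_buffer *eb;
	bool uptodate;

	eb = find_extent_buffer(fs_info, start);
	if (!eb)
		return false;
	uptodate = test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	/* Drop the reference taken by the lookup. */
	free_extent_buffer(eb);
	return uptodate;
}
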
5426
faa2dbf0
JB
5427#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5428struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
da17066c 5429 u64 start)
faa2dbf0
JB
5430{
5431 struct extent_buffer *eb, *exists = NULL;
5432 int ret;
5433
5434 eb = find_extent_buffer(fs_info, start);
5435 if (eb)
5436 return eb;
da17066c 5437 eb = alloc_dummy_extent_buffer(fs_info, start);
faa2dbf0 5438 if (!eb)
b6293c82 5439 return ERR_PTR(-ENOMEM);
faa2dbf0
JB
5440 eb->fs_info = fs_info;
5441again:
e1860a77 5442 ret = radix_tree_preload(GFP_NOFS);
b6293c82
DC
5443 if (ret) {
5444 exists = ERR_PTR(ret);
faa2dbf0 5445 goto free_eb;
b6293c82 5446 }
faa2dbf0
JB
5447 spin_lock(&fs_info->buffer_lock);
5448 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 5449 start >> fs_info->sectorsize_bits, eb);
faa2dbf0
JB
5450 spin_unlock(&fs_info->buffer_lock);
5451 radix_tree_preload_end();
5452 if (ret == -EEXIST) {
5453 exists = find_extent_buffer(fs_info, start);
5454 if (exists)
5455 goto free_eb;
5456 else
5457 goto again;
5458 }
5459 check_buffer_tree_ref(eb);
5460 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5461
faa2dbf0
JB
5462 return eb;
5463free_eb:
5464 btrfs_release_extent_buffer(eb);
5465 return exists;
5466}
5467#endif
5468
81982210
QW
5469static struct extent_buffer *grab_extent_buffer(
5470 struct btrfs_fs_info *fs_info, struct page *page)
c0f0a9e7
QW
5471{
5472 struct extent_buffer *exists;
5473
81982210
QW
5474 /*
5475 * For subpage case, we completely rely on radix tree to ensure we
5476 * don't try to insert two ebs for the same bytenr. So here we always
5477 * return NULL and just continue.
5478 */
5479 if (fs_info->sectorsize < PAGE_SIZE)
5480 return NULL;
5481
c0f0a9e7
QW
5482 /* Page not yet attached to an extent buffer */
5483 if (!PagePrivate(page))
5484 return NULL;
5485
5486 /*
5487	 * We could have already allocated an eb for this page and attached one,
5488	 * so let's see if we can get a ref on the existing eb. If we can, we
5489	 * know it's good and we can just return that one; otherwise we know we
5490	 * can just overwrite page->private.
5491 */
5492 exists = (struct extent_buffer *)page->private;
5493 if (atomic_inc_not_zero(&exists->refs))
5494 return exists;
5495
5496 WARN_ON(PageDirty(page));
5497 detach_page_private(page);
5498 return NULL;
5499}
5500
f28491e0 5501struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
3fbaf258 5502 u64 start, u64 owner_root, int level)
d1310b2e 5503{
da17066c 5504 unsigned long len = fs_info->nodesize;
cc5e31a4
DS
5505 int num_pages;
5506 int i;
09cbfeaf 5507 unsigned long index = start >> PAGE_SHIFT;
d1310b2e 5508 struct extent_buffer *eb;
6af118ce 5509 struct extent_buffer *exists = NULL;
d1310b2e 5510 struct page *p;
f28491e0 5511 struct address_space *mapping = fs_info->btree_inode->i_mapping;
d1310b2e 5512 int uptodate = 1;
19fe0a8b 5513 int ret;
d1310b2e 5514
da17066c 5515 if (!IS_ALIGNED(start, fs_info->sectorsize)) {
c871b0f2
LB
5516 btrfs_err(fs_info, "bad tree block start %llu", start);
5517 return ERR_PTR(-EINVAL);
5518 }
5519
1aaac38c
QW
5520 if (fs_info->sectorsize < PAGE_SIZE &&
5521 offset_in_page(start) + len > PAGE_SIZE) {
5522 btrfs_err(fs_info,
5523 "tree block crosses page boundary, start %llu nodesize %lu",
5524 start, len);
5525 return ERR_PTR(-EINVAL);
5526 }
5527
f28491e0 5528 eb = find_extent_buffer(fs_info, start);
452c75c3 5529 if (eb)
6af118ce 5530 return eb;
6af118ce 5531
23d79d81 5532 eb = __alloc_extent_buffer(fs_info, start, len);
2b114d1d 5533 if (!eb)
c871b0f2 5534 return ERR_PTR(-ENOMEM);
e114c545 5535 btrfs_set_buffer_lockdep_class(owner_root, eb, level);
d1310b2e 5536
65ad0104 5537 num_pages = num_extent_pages(eb);
727011e0 5538 for (i = 0; i < num_pages; i++, index++) {
760f991f
QW
5539 struct btrfs_subpage *prealloc = NULL;
5540
d1b5c567 5541 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
c871b0f2
LB
5542 if (!p) {
5543 exists = ERR_PTR(-ENOMEM);
6af118ce 5544 goto free_eb;
c871b0f2 5545 }
4f2de97a 5546
760f991f
QW
5547 /*
5548 * Preallocate page->private for subpage case, so that we won't
5549		 * allocate memory with private_lock held. The memory will be
5550 * freed by attach_extent_buffer_page() or freed manually if
5551 * we exit earlier.
5552 *
5553		 * Although we have ensured one subpage eb can only have one
5554		 * page, it may change in the future for 16K page size
5555 * support, so we still preallocate the memory in the loop.
5556 */
5557 ret = btrfs_alloc_subpage(fs_info, &prealloc,
5558 BTRFS_SUBPAGE_METADATA);
5559 if (ret < 0) {
5560 unlock_page(p);
5561 put_page(p);
5562 exists = ERR_PTR(ret);
5563 goto free_eb;
5564 }
5565
4f2de97a 5566 spin_lock(&mapping->private_lock);
81982210 5567 exists = grab_extent_buffer(fs_info, p);
c0f0a9e7
QW
5568 if (exists) {
5569 spin_unlock(&mapping->private_lock);
5570 unlock_page(p);
5571 put_page(p);
5572 mark_extent_buffer_accessed(exists, p);
760f991f 5573 btrfs_free_subpage(prealloc);
c0f0a9e7 5574 goto free_eb;
d1310b2e 5575 }
760f991f
QW
5576 /* Should not fail, as we have preallocated the memory */
5577 ret = attach_extent_buffer_page(eb, p, prealloc);
5578 ASSERT(!ret);
8ff8466d
QW
5579 /*
5580		 * To inform that we have an extra eb under allocation, so that
5581		 * detach_extent_buffer_page() won't release the page private
5582		 * when the eb hasn't yet been inserted into the radix tree.
5583		 *
5584		 * The ref will be decreased when the eb releases the page, in
5585		 * detach_extent_buffer_page().
5586		 * Thus it needs no special handling in the error path.
5587 */
5588 btrfs_page_inc_eb_refs(fs_info, p);
4f2de97a 5589 spin_unlock(&mapping->private_lock);
760f991f 5590
0b32f4bb 5591 WARN_ON(PageDirty(p));
727011e0 5592 eb->pages[i] = p;
d1310b2e
CM
5593 if (!PageUptodate(p))
5594 uptodate = 0;
eb14ab8e
CM
5595
5596 /*
b16d011e
NB
5597 * We can't unlock the pages just yet since the extent buffer
5598		 * hasn't been properly inserted in the radix tree; this
5599 * opens a race with btree_releasepage which can free a page
5600 * while we are still filling in all pages for the buffer and
5601 * we could crash.
eb14ab8e 5602 */
d1310b2e
CM
5603 }
5604 if (uptodate)
b4ce94de 5605 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
115391d2 5606again:
e1860a77 5607 ret = radix_tree_preload(GFP_NOFS);
c871b0f2
LB
5608 if (ret) {
5609 exists = ERR_PTR(ret);
19fe0a8b 5610 goto free_eb;
c871b0f2 5611 }
19fe0a8b 5612
f28491e0
JB
5613 spin_lock(&fs_info->buffer_lock);
5614 ret = radix_tree_insert(&fs_info->buffer_radix,
478ef886 5615 start >> fs_info->sectorsize_bits, eb);
f28491e0 5616 spin_unlock(&fs_info->buffer_lock);
452c75c3 5617 radix_tree_preload_end();
19fe0a8b 5618 if (ret == -EEXIST) {
f28491e0 5619 exists = find_extent_buffer(fs_info, start);
452c75c3
CS
5620 if (exists)
5621 goto free_eb;
5622 else
115391d2 5623 goto again;
6af118ce 5624 }
6af118ce 5625 /* add one reference for the tree */
0b32f4bb 5626 check_buffer_tree_ref(eb);
34b41ace 5627 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
eb14ab8e
CM
5628
5629 /*
b16d011e
NB
5630 * Now it's safe to unlock the pages because any calls to
5631 * btree_releasepage will correctly detect that a page belongs to a
5632 * live buffer and won't free them prematurely.
eb14ab8e 5633 */
28187ae5
NB
5634 for (i = 0; i < num_pages; i++)
5635 unlock_page(eb->pages[i]);
d1310b2e
CM
5636 return eb;
5637
6af118ce 5638free_eb:
5ca64f45 5639 WARN_ON(!atomic_dec_and_test(&eb->refs));
727011e0
CM
5640 for (i = 0; i < num_pages; i++) {
5641 if (eb->pages[i])
5642 unlock_page(eb->pages[i]);
5643 }
eb14ab8e 5644
897ca6e9 5645 btrfs_release_extent_buffer(eb);
6af118ce 5646 return exists;
d1310b2e 5647}
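
/*
 * Illustrative only: a minimal sketch of how the allocation above is
 * usually combined with read_extent_buffer_pages() (defined further below)
 * to obtain a cached, uptodate tree block. The real callers live in
 * disk-io.c; the helper name here is made up.
 */
static struct extent_buffer *example_read_tree_block(struct btrfs_fs_info *fs_info,
						     u64 bytenr, u64 owner_root,
						     int level)
{
	struct extent_buffer *eb;
	int ret;

	eb = alloc_extent_buffer(fs_info, bytenr, owner_root, level);
	if (IS_ERR(eb))
		return eb;
	ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, 0);
	if (ret) {
		/* Mark it stale so a cached but bad copy is not reused. */
		free_extent_buffer_stale(eb);
		return ERR_PTR(ret);
	}
	return eb;
}
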
d1310b2e 5648
3083ee2e
JB
5649static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5650{
5651 struct extent_buffer *eb =
5652 container_of(head, struct extent_buffer, rcu_head);
5653
5654 __free_extent_buffer(eb);
5655}
5656
f7a52a40 5657static int release_extent_buffer(struct extent_buffer *eb)
5ce48d0f 5658 __releases(&eb->refs_lock)
3083ee2e 5659{
07e21c4d
NB
5660 lockdep_assert_held(&eb->refs_lock);
5661
3083ee2e
JB
5662 WARN_ON(atomic_read(&eb->refs) == 0);
5663 if (atomic_dec_and_test(&eb->refs)) {
34b41ace 5664 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
f28491e0 5665 struct btrfs_fs_info *fs_info = eb->fs_info;
3083ee2e 5666
815a51c7 5667 spin_unlock(&eb->refs_lock);
3083ee2e 5668
f28491e0
JB
5669 spin_lock(&fs_info->buffer_lock);
5670 radix_tree_delete(&fs_info->buffer_radix,
478ef886 5671 eb->start >> fs_info->sectorsize_bits);
f28491e0 5672 spin_unlock(&fs_info->buffer_lock);
34b41ace
JB
5673 } else {
5674 spin_unlock(&eb->refs_lock);
815a51c7 5675 }
3083ee2e 5676
8c38938c 5677 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
3083ee2e 5678 /* Should be safe to release our pages at this point */
55ac0139 5679 btrfs_release_extent_buffer_pages(eb);
bcb7e449 5680#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
b0132a3b 5681 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
bcb7e449
JB
5682 __free_extent_buffer(eb);
5683 return 1;
5684 }
5685#endif
3083ee2e 5686 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
e64860aa 5687 return 1;
3083ee2e
JB
5688 }
5689 spin_unlock(&eb->refs_lock);
e64860aa
JB
5690
5691 return 0;
3083ee2e
JB
5692}
5693
d1310b2e
CM
5694void free_extent_buffer(struct extent_buffer *eb)
5695{
242e18c7
CM
5696 int refs;
5697 int old;
d1310b2e
CM
5698 if (!eb)
5699 return;
5700
242e18c7
CM
5701 while (1) {
5702 refs = atomic_read(&eb->refs);
46cc775e
NB
5703 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
5704 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
5705 refs == 1))
242e18c7
CM
5706 break;
5707 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5708 if (old == refs)
5709 return;
5710 }
5711
3083ee2e
JB
5712 spin_lock(&eb->refs_lock);
5713 if (atomic_read(&eb->refs) == 2 &&
5714 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
0b32f4bb 5715 !extent_buffer_under_io(eb) &&
3083ee2e
JB
5716 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5717 atomic_dec(&eb->refs);
5718
5719 /*
5720 * I know this is terrible, but it's temporary until we stop tracking
5721 * the uptodate bits and such for the extent buffers.
5722 */
f7a52a40 5723 release_extent_buffer(eb);
3083ee2e
JB
5724}
5725
5726void free_extent_buffer_stale(struct extent_buffer *eb)
5727{
5728 if (!eb)
d1310b2e
CM
5729 return;
5730
3083ee2e
JB
5731 spin_lock(&eb->refs_lock);
5732 set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5733
0b32f4bb 5734 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
3083ee2e
JB
5735 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5736 atomic_dec(&eb->refs);
f7a52a40 5737 release_extent_buffer(eb);
d1310b2e 5738}
d1310b2e 5739
2b48966a 5740void clear_extent_buffer_dirty(const struct extent_buffer *eb)
d1310b2e 5741{
cc5e31a4
DS
5742 int i;
5743 int num_pages;
d1310b2e
CM
5744 struct page *page;
5745
65ad0104 5746 num_pages = num_extent_pages(eb);
d1310b2e
CM
5747
5748 for (i = 0; i < num_pages; i++) {
fb85fc9a 5749 page = eb->pages[i];
b9473439 5750 if (!PageDirty(page))
d2c3f4f6
CM
5751 continue;
5752
a61e6f29 5753 lock_page(page);
eb14ab8e
CM
5754 WARN_ON(!PagePrivate(page));
5755
d1310b2e 5756 clear_page_dirty_for_io(page);
b93b0163 5757 xa_lock_irq(&page->mapping->i_pages);
0a943c65
MW
5758 if (!PageDirty(page))
5759 __xa_clear_mark(&page->mapping->i_pages,
5760 page_index(page), PAGECACHE_TAG_DIRTY);
b93b0163 5761 xa_unlock_irq(&page->mapping->i_pages);
bf0da8c1 5762 ClearPageError(page);
a61e6f29 5763 unlock_page(page);
d1310b2e 5764 }
0b32f4bb 5765 WARN_ON(atomic_read(&eb->refs) == 0);
d1310b2e 5766}
d1310b2e 5767
abb57ef3 5768bool set_extent_buffer_dirty(struct extent_buffer *eb)
d1310b2e 5769{
cc5e31a4
DS
5770 int i;
5771 int num_pages;
abb57ef3 5772 bool was_dirty;
d1310b2e 5773
0b32f4bb
JB
5774 check_buffer_tree_ref(eb);
5775
b9473439 5776 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
0b32f4bb 5777
65ad0104 5778 num_pages = num_extent_pages(eb);
3083ee2e 5779 WARN_ON(atomic_read(&eb->refs) == 0);
0b32f4bb
JB
5780 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5781
abb57ef3
LB
5782 if (!was_dirty)
5783 for (i = 0; i < num_pages; i++)
5784 set_page_dirty(eb->pages[i]);
51995c39
LB
5785
5786#ifdef CONFIG_BTRFS_DEBUG
5787 for (i = 0; i < num_pages; i++)
5788 ASSERT(PageDirty(eb->pages[i]));
5789#endif
5790
b9473439 5791 return was_dirty;
d1310b2e 5792}
d1310b2e 5793
69ba3927 5794void clear_extent_buffer_uptodate(struct extent_buffer *eb)
1259ab75 5795{
251f2acc 5796 struct btrfs_fs_info *fs_info = eb->fs_info;
1259ab75 5797 struct page *page;
cc5e31a4 5798 int num_pages;
251f2acc 5799 int i;
1259ab75 5800
b4ce94de 5801 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5802 num_pages = num_extent_pages(eb);
1259ab75 5803 for (i = 0; i < num_pages; i++) {
fb85fc9a 5804 page = eb->pages[i];
33958dc6 5805 if (page)
251f2acc
QW
5806 btrfs_page_clear_uptodate(fs_info, page,
5807 eb->start, eb->len);
1259ab75 5808 }
1259ab75
CM
5809}
5810
09c25a8c 5811void set_extent_buffer_uptodate(struct extent_buffer *eb)
d1310b2e 5812{
251f2acc 5813 struct btrfs_fs_info *fs_info = eb->fs_info;
d1310b2e 5814 struct page *page;
cc5e31a4 5815 int num_pages;
251f2acc 5816 int i;
d1310b2e 5817
0b32f4bb 5818 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
65ad0104 5819 num_pages = num_extent_pages(eb);
d1310b2e 5820 for (i = 0; i < num_pages; i++) {
fb85fc9a 5821 page = eb->pages[i];
251f2acc 5822 btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
d1310b2e 5823 }
d1310b2e 5824}
d1310b2e 5825
4012daf7
QW
5826static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
5827 int mirror_num)
5828{
5829 struct btrfs_fs_info *fs_info = eb->fs_info;
5830 struct extent_io_tree *io_tree;
5831 struct page *page = eb->pages[0];
5832 struct bio *bio = NULL;
5833 int ret = 0;
5834
5835 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
5836 ASSERT(PagePrivate(page));
5837 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
5838
5839 if (wait == WAIT_NONE) {
5840 ret = try_lock_extent(io_tree, eb->start,
5841 eb->start + eb->len - 1);
5842 if (ret <= 0)
5843 return ret;
5844 } else {
5845 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
5846 if (ret < 0)
5847 return ret;
5848 }
5849
5850 ret = 0;
5851 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
5852 PageUptodate(page) ||
5853 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
5854 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5855 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
5856 return ret;
5857 }
5858
5859 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5860 eb->read_mirror = 0;
5861 atomic_set(&eb->io_pages, 1);
5862 check_buffer_tree_ref(eb);
5863 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
5864
5865 ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start,
5866 eb->len, eb->start - page_offset(page), &bio,
5867 end_bio_extent_readpage, mirror_num, 0, 0,
5868 true);
5869 if (ret) {
5870 /*
5871 * In the endio function, if we hit something wrong we will
5872 * increase the io_pages, so here we need to decrease it for
5873 * error path.
5874 */
5875 atomic_dec(&eb->io_pages);
5876 }
5877 if (bio) {
5878 int tmp;
5879
5880 tmp = submit_one_bio(bio, mirror_num, 0);
5881 if (tmp < 0)
5882 return tmp;
5883 }
5884 if (ret || wait != WAIT_COMPLETE)
5885 return ret;
5886
5887 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
5888 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5889 ret = -EIO;
5890 return ret;
5891}
5892
c2ccfbc6 5893int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
d1310b2e 5894{
cc5e31a4 5895 int i;
d1310b2e
CM
5896 struct page *page;
5897 int err;
5898 int ret = 0;
ce9adaa5
CM
5899 int locked_pages = 0;
5900 int all_uptodate = 1;
cc5e31a4 5901 int num_pages;
727011e0 5902 unsigned long num_reads = 0;
a86c12c7 5903 struct bio *bio = NULL;
c8b97818 5904 unsigned long bio_flags = 0;
a86c12c7 5905
b4ce94de 5906 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
d1310b2e
CM
5907 return 0;
5908
4012daf7
QW
5909 if (eb->fs_info->sectorsize < PAGE_SIZE)
5910 return read_extent_buffer_subpage(eb, wait, mirror_num);
5911
65ad0104 5912 num_pages = num_extent_pages(eb);
8436ea91 5913 for (i = 0; i < num_pages; i++) {
fb85fc9a 5914 page = eb->pages[i];
bb82ab88 5915 if (wait == WAIT_NONE) {
2c4d8cb7
QW
5916 /*
5917 * WAIT_NONE is only utilized by readahead. If we can't
5918 * acquire the lock atomically it means either the eb
5919 * is being read out or under modification.
5920			 * Either way the eb will be or has been cached,
5921			 * so readahead can exit safely.
5922 */
2db04966 5923 if (!trylock_page(page))
ce9adaa5 5924 goto unlock_exit;
d1310b2e
CM
5925 } else {
5926 lock_page(page);
5927 }
ce9adaa5 5928 locked_pages++;
2571e739
LB
5929 }
5930 /*
5931	 * We need to first lock all pages to make sure that
5932 * the uptodate bit of our pages won't be affected by
5933 * clear_extent_buffer_uptodate().
5934 */
8436ea91 5935 for (i = 0; i < num_pages; i++) {
2571e739 5936 page = eb->pages[i];
727011e0
CM
5937 if (!PageUptodate(page)) {
5938 num_reads++;
ce9adaa5 5939 all_uptodate = 0;
727011e0 5940 }
ce9adaa5 5941 }
2571e739 5942
ce9adaa5 5943 if (all_uptodate) {
8436ea91 5944 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
ce9adaa5
CM
5945 goto unlock_exit;
5946 }
5947
656f30db 5948 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5cf1ab56 5949 eb->read_mirror = 0;
0b32f4bb 5950 atomic_set(&eb->io_pages, num_reads);
6bf9cd2e
BB
5951 /*
5952 * It is possible for releasepage to clear the TREE_REF bit before we
5953 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
5954 */
5955 check_buffer_tree_ref(eb);
8436ea91 5956 for (i = 0; i < num_pages; i++) {
fb85fc9a 5957 page = eb->pages[i];
baf863b9 5958
ce9adaa5 5959 if (!PageUptodate(page)) {
baf863b9
LB
5960 if (ret) {
5961 atomic_dec(&eb->io_pages);
5962 unlock_page(page);
5963 continue;
5964 }
5965
f188591e 5966 ClearPageError(page);
0420177c
NB
5967 err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
5968 page, page_offset(page), PAGE_SIZE, 0,
5969 &bio, end_bio_extent_readpage,
5970 mirror_num, 0, 0, false);
baf863b9 5971 if (err) {
baf863b9 5972 /*
0420177c
NB
5973 * We failed to submit the bio so it's the
5974 * caller's responsibility to perform cleanup
5975				 * i.e. unlock the page and set the error bit.
baf863b9 5976 */
0420177c
NB
5977 ret = err;
5978 SetPageError(page);
5979 unlock_page(page);
baf863b9
LB
5980 atomic_dec(&eb->io_pages);
5981 }
d1310b2e
CM
5982 } else {
5983 unlock_page(page);
5984 }
5985 }
5986
355808c2 5987 if (bio) {
1f7ad75b 5988 err = submit_one_bio(bio, mirror_num, bio_flags);
79787eaa
JM
5989 if (err)
5990 return err;
355808c2 5991 }
a86c12c7 5992
bb82ab88 5993 if (ret || wait != WAIT_COMPLETE)
d1310b2e 5994 return ret;
d397712b 5995
8436ea91 5996 for (i = 0; i < num_pages; i++) {
fb85fc9a 5997 page = eb->pages[i];
d1310b2e 5998 wait_on_page_locked(page);
d397712b 5999 if (!PageUptodate(page))
d1310b2e 6000 ret = -EIO;
d1310b2e 6001 }
d397712b 6002
d1310b2e 6003 return ret;
ce9adaa5
CM
6004
6005unlock_exit:
d397712b 6006 while (locked_pages > 0) {
ce9adaa5 6007 locked_pages--;
8436ea91
JB
6008 page = eb->pages[locked_pages];
6009 unlock_page(page);
ce9adaa5
CM
6010 }
6011 return ret;
d1310b2e 6012}
d1310b2e 6013
f98b6215
QW
6014static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6015 unsigned long len)
6016{
6017 btrfs_warn(eb->fs_info,
6018 "access to eb bytenr %llu len %lu out of range start %lu len %lu",
6019 eb->start, eb->len, start, len);
6020 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6021
6022 return true;
6023}
6024
6025/*
6026 * Check if the [start, start + len) range is valid before reading/writing
6027 * the eb.
6028 * NOTE: @start and @len are offsets inside the eb, not logical addresses.
6029 *
6030 * The caller should not touch the dst/src memory if this function returns an error.
6031 */
6032static inline int check_eb_range(const struct extent_buffer *eb,
6033 unsigned long start, unsigned long len)
6034{
6035 unsigned long offset;
6036
6037 /* start, start + len should not go beyond eb->len nor overflow */
6038 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6039 return report_eb_range(eb, start, len);
6040
6041 return false;
6042}
6043
1cbb1f45
JM
6044void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6045 unsigned long start, unsigned long len)
d1310b2e
CM
6046{
6047 size_t cur;
6048 size_t offset;
6049 struct page *page;
6050 char *kaddr;
6051 char *dst = (char *)dstv;
884b07d0 6052 unsigned long i = get_eb_page_index(start);
d1310b2e 6053
f98b6215 6054 if (check_eb_range(eb, start, len))
f716abd5 6055 return;
d1310b2e 6056
884b07d0 6057 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6058
d397712b 6059 while (len > 0) {
fb85fc9a 6060 page = eb->pages[i];
d1310b2e 6061
09cbfeaf 6062 cur = min(len, (PAGE_SIZE - offset));
a6591715 6063 kaddr = page_address(page);
d1310b2e 6064 memcpy(dst, kaddr + offset, cur);
d1310b2e
CM
6065
6066 dst += cur;
6067 len -= cur;
6068 offset = 0;
6069 i++;
6070 }
6071}
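
/*
 * Illustrative only: read_extent_buffer() is the generic way to copy raw
 * bytes out of a tree block, for example pulling the fsid out of an eb's
 * header (the write-side counterpart, write_extent_buffer_fsid(), appears
 * further below). The helper name is made up for this sketch.
 */
static void example_read_header_fsid(const struct extent_buffer *eb, u8 *fsid)
{
	read_extent_buffer(eb, fsid,
			   offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE);
}
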
d1310b2e 6072
a48b73ec
JB
6073int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6074 void __user *dstv,
6075 unsigned long start, unsigned long len)
550ac1d8
GH
6076{
6077 size_t cur;
6078 size_t offset;
6079 struct page *page;
6080 char *kaddr;
6081 char __user *dst = (char __user *)dstv;
884b07d0 6082 unsigned long i = get_eb_page_index(start);
550ac1d8
GH
6083 int ret = 0;
6084
6085 WARN_ON(start > eb->len);
6086 WARN_ON(start + len > eb->start + eb->len);
6087
884b07d0 6088 offset = get_eb_offset_in_page(eb, start);
550ac1d8
GH
6089
6090 while (len > 0) {
fb85fc9a 6091 page = eb->pages[i];
550ac1d8 6092
09cbfeaf 6093 cur = min(len, (PAGE_SIZE - offset));
550ac1d8 6094 kaddr = page_address(page);
a48b73ec 6095 if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
550ac1d8
GH
6096 ret = -EFAULT;
6097 break;
6098 }
6099
6100 dst += cur;
6101 len -= cur;
6102 offset = 0;
6103 i++;
6104 }
6105
6106 return ret;
6107}
6108
1cbb1f45
JM
6109int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6110 unsigned long start, unsigned long len)
d1310b2e
CM
6111{
6112 size_t cur;
6113 size_t offset;
6114 struct page *page;
6115 char *kaddr;
6116 char *ptr = (char *)ptrv;
884b07d0 6117 unsigned long i = get_eb_page_index(start);
d1310b2e
CM
6118 int ret = 0;
6119
f98b6215
QW
6120 if (check_eb_range(eb, start, len))
6121 return -EINVAL;
d1310b2e 6122
884b07d0 6123 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6124
d397712b 6125 while (len > 0) {
fb85fc9a 6126 page = eb->pages[i];
d1310b2e 6127
09cbfeaf 6128 cur = min(len, (PAGE_SIZE - offset));
d1310b2e 6129
a6591715 6130 kaddr = page_address(page);
d1310b2e 6131 ret = memcmp(ptr, kaddr + offset, cur);
d1310b2e
CM
6132 if (ret)
6133 break;
6134
6135 ptr += cur;
6136 len -= cur;
6137 offset = 0;
6138 i++;
6139 }
6140 return ret;
6141}
d1310b2e 6142
2b48966a 6143void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
f157bf76
DS
6144 const void *srcv)
6145{
6146 char *kaddr;
6147
6148 WARN_ON(!PageUptodate(eb->pages[0]));
884b07d0 6149 kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
f157bf76
DS
6150 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
6151 BTRFS_FSID_SIZE);
6152}
6153
2b48966a 6154void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
f157bf76
DS
6155{
6156 char *kaddr;
6157
6158 WARN_ON(!PageUptodate(eb->pages[0]));
884b07d0 6159 kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
f157bf76
DS
6160 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
6161 BTRFS_FSID_SIZE);
6162}
6163
2b48966a 6164void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
d1310b2e
CM
6165 unsigned long start, unsigned long len)
6166{
6167 size_t cur;
6168 size_t offset;
6169 struct page *page;
6170 char *kaddr;
6171 char *src = (char *)srcv;
884b07d0 6172 unsigned long i = get_eb_page_index(start);
d1310b2e 6173
d3575156
NA
6174 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6175
f98b6215
QW
6176 if (check_eb_range(eb, start, len))
6177 return;
d1310b2e 6178
884b07d0 6179 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6180
d397712b 6181 while (len > 0) {
fb85fc9a 6182 page = eb->pages[i];
d1310b2e
CM
6183 WARN_ON(!PageUptodate(page));
6184
09cbfeaf 6185 cur = min(len, PAGE_SIZE - offset);
a6591715 6186 kaddr = page_address(page);
d1310b2e 6187 memcpy(kaddr + offset, src, cur);
d1310b2e
CM
6188
6189 src += cur;
6190 len -= cur;
6191 offset = 0;
6192 i++;
6193 }
6194}
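
/*
 * Illustrative only: write_extent_buffer() pairs naturally with
 * memcmp_extent_buffer() above, e.g. to copy a blob into an eb and then
 * verify that it landed. The helper name is made up for this sketch.
 */
static int example_write_and_verify(const struct extent_buffer *eb,
				    const void *data, unsigned long offset,
				    unsigned long len)
{
	write_extent_buffer(eb, data, offset, len);
	/* Returns 0 when the eb contents match @data. */
	return memcmp_extent_buffer(eb, data, offset, len);
}
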
d1310b2e 6195
2b48966a 6196void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
b159fa28 6197 unsigned long len)
d1310b2e
CM
6198{
6199 size_t cur;
6200 size_t offset;
6201 struct page *page;
6202 char *kaddr;
884b07d0 6203 unsigned long i = get_eb_page_index(start);
d1310b2e 6204
f98b6215
QW
6205 if (check_eb_range(eb, start, len))
6206 return;
d1310b2e 6207
884b07d0 6208 offset = get_eb_offset_in_page(eb, start);
d1310b2e 6209
d397712b 6210 while (len > 0) {
fb85fc9a 6211 page = eb->pages[i];
d1310b2e
CM
6212 WARN_ON(!PageUptodate(page));
6213
09cbfeaf 6214 cur = min(len, PAGE_SIZE - offset);
a6591715 6215 kaddr = page_address(page);
b159fa28 6216 memset(kaddr + offset, 0, cur);
d1310b2e
CM
6217
6218 len -= cur;
6219 offset = 0;
6220 i++;
6221 }
6222}
d1310b2e 6223
2b48966a
DS
6224void copy_extent_buffer_full(const struct extent_buffer *dst,
6225 const struct extent_buffer *src)
58e8012c
DS
6226{
6227 int i;
cc5e31a4 6228 int num_pages;
58e8012c
DS
6229
6230 ASSERT(dst->len == src->len);
6231
884b07d0
QW
6232 if (dst->fs_info->sectorsize == PAGE_SIZE) {
6233 num_pages = num_extent_pages(dst);
6234 for (i = 0; i < num_pages; i++)
6235 copy_page(page_address(dst->pages[i]),
6236 page_address(src->pages[i]));
6237 } else {
6238 size_t src_offset = get_eb_offset_in_page(src, 0);
6239 size_t dst_offset = get_eb_offset_in_page(dst, 0);
6240
6241 ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
6242 memcpy(page_address(dst->pages[0]) + dst_offset,
6243 page_address(src->pages[0]) + src_offset,
6244 src->len);
6245 }
58e8012c
DS
6246}
6247
2b48966a
DS
6248void copy_extent_buffer(const struct extent_buffer *dst,
6249 const struct extent_buffer *src,
d1310b2e
CM
6250 unsigned long dst_offset, unsigned long src_offset,
6251 unsigned long len)
6252{
6253 u64 dst_len = dst->len;
6254 size_t cur;
6255 size_t offset;
6256 struct page *page;
6257 char *kaddr;
884b07d0 6258 unsigned long i = get_eb_page_index(dst_offset);
d1310b2e 6259
f98b6215
QW
6260 if (check_eb_range(dst, dst_offset, len) ||
6261 check_eb_range(src, src_offset, len))
6262 return;
6263
d1310b2e
CM
6264 WARN_ON(src->len != dst_len);
6265
884b07d0 6266 offset = get_eb_offset_in_page(dst, dst_offset);
d1310b2e 6267
d397712b 6268 while (len > 0) {
fb85fc9a 6269 page = dst->pages[i];
d1310b2e
CM
6270 WARN_ON(!PageUptodate(page));
6271
09cbfeaf 6272 cur = min(len, (unsigned long)(PAGE_SIZE - offset));
d1310b2e 6273
a6591715 6274 kaddr = page_address(page);
d1310b2e 6275 read_extent_buffer(src, kaddr + offset, src_offset, cur);
d1310b2e
CM
6276
6277 src_offset += cur;
6278 len -= cur;
6279 offset = 0;
6280 i++;
6281 }
6282}
d1310b2e 6283
3e1e8bb7
OS
6284/*
6285 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
6286 * given bit number
6287 * @eb: the extent buffer
6288 * @start: offset of the bitmap item in the extent buffer
6289 * @nr: bit number
6290 * @page_index: return index of the page in the extent buffer that contains the
6291 * given bit number
6292 * @page_offset: return offset into the page given by page_index
6293 *
6294 * This helper hides the ugliness of finding the byte in an extent buffer which
6295 * contains a given bit.
6296 */
2b48966a 6297static inline void eb_bitmap_offset(const struct extent_buffer *eb,
3e1e8bb7
OS
6298 unsigned long start, unsigned long nr,
6299 unsigned long *page_index,
6300 size_t *page_offset)
6301{
3e1e8bb7
OS
6302 size_t byte_offset = BIT_BYTE(nr);
6303 size_t offset;
6304
6305 /*
6306 * The byte we want is the offset of the extent buffer + the offset of
6307 * the bitmap item in the extent buffer + the offset of the byte in the
6308 * bitmap item.
6309 */
884b07d0 6310 offset = start + offset_in_page(eb->start) + byte_offset;
3e1e8bb7 6311
09cbfeaf 6312 *page_index = offset >> PAGE_SHIFT;
7073017a 6313 *page_offset = offset_in_page(offset);
3e1e8bb7
OS
6314}
6315
6316/**
6317 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
6318 * @eb: the extent buffer
6319 * @start: offset of the bitmap item in the extent buffer
6320 * @nr: bit number to test
6321 */
2b48966a 6322int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
6323 unsigned long nr)
6324{
2fe1d551 6325 u8 *kaddr;
3e1e8bb7
OS
6326 struct page *page;
6327 unsigned long i;
6328 size_t offset;
6329
6330 eb_bitmap_offset(eb, start, nr, &i, &offset);
6331 page = eb->pages[i];
6332 WARN_ON(!PageUptodate(page));
6333 kaddr = page_address(page);
6334 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
6335}
6336
6337/**
6338 * extent_buffer_bitmap_set - set an area of a bitmap
6339 * @eb: the extent buffer
6340 * @start: offset of the bitmap item in the extent buffer
6341 * @pos: bit number of the first bit
6342 * @len: number of bits to set
6343 */
2b48966a 6344void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
3e1e8bb7
OS
6345 unsigned long pos, unsigned long len)
6346{
2fe1d551 6347 u8 *kaddr;
3e1e8bb7
OS
6348 struct page *page;
6349 unsigned long i;
6350 size_t offset;
6351 const unsigned int size = pos + len;
6352 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 6353 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
6354
6355 eb_bitmap_offset(eb, start, pos, &i, &offset);
6356 page = eb->pages[i];
6357 WARN_ON(!PageUptodate(page));
6358 kaddr = page_address(page);
6359
6360 while (len >= bits_to_set) {
6361 kaddr[offset] |= mask_to_set;
6362 len -= bits_to_set;
6363 bits_to_set = BITS_PER_BYTE;
9c894696 6364 mask_to_set = ~0;
09cbfeaf 6365 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
6366 offset = 0;
6367 page = eb->pages[++i];
6368 WARN_ON(!PageUptodate(page));
6369 kaddr = page_address(page);
6370 }
6371 }
6372 if (len) {
6373 mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
6374 kaddr[offset] |= mask_to_set;
6375 }
6376}
6377
6378
6379/**
6380 * extent_buffer_bitmap_clear - clear an area of a bitmap
6381 * @eb: the extent buffer
6382 * @start: offset of the bitmap item in the extent buffer
6383 * @pos: bit number of the first bit
6384 * @len: number of bits to clear
6385 */
2b48966a
DS
6386void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
6387 unsigned long start, unsigned long pos,
6388 unsigned long len)
3e1e8bb7 6389{
2fe1d551 6390 u8 *kaddr;
3e1e8bb7
OS
6391 struct page *page;
6392 unsigned long i;
6393 size_t offset;
6394 const unsigned int size = pos + len;
6395 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
2fe1d551 6396 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
3e1e8bb7
OS
6397
6398 eb_bitmap_offset(eb, start, pos, &i, &offset);
6399 page = eb->pages[i];
6400 WARN_ON(!PageUptodate(page));
6401 kaddr = page_address(page);
6402
6403 while (len >= bits_to_clear) {
6404 kaddr[offset] &= ~mask_to_clear;
6405 len -= bits_to_clear;
6406 bits_to_clear = BITS_PER_BYTE;
9c894696 6407 mask_to_clear = ~0;
09cbfeaf 6408 if (++offset >= PAGE_SIZE && len > 0) {
3e1e8bb7
OS
6409 offset = 0;
6410 page = eb->pages[++i];
6411 WARN_ON(!PageUptodate(page));
6412 kaddr = page_address(page);
6413 }
6414 }
6415 if (len) {
6416 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
6417 kaddr[offset] &= ~mask_to_clear;
6418 }
6419}
6420
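
/*
 * Illustrative only: the bitmap helpers above operate on a bitmap item
 * stored at byte offset @start inside an eb, e.g. for the free space tree.
 * The bit positions below are arbitrary; the helper name is made up for
 * this sketch.
 */
static void example_toggle_bitmap_range(const struct extent_buffer *leaf,
					unsigned long bitmap_start)
{
	/* Set bits 3..10 (8 bits) of the bitmap item. */
	extent_buffer_bitmap_set(leaf, bitmap_start, 3, 8);
	WARN_ON(!extent_buffer_test_bit(leaf, bitmap_start, 3));
	/* ... and clear them again. */
	extent_buffer_bitmap_clear(leaf, bitmap_start, 3, 8);
	WARN_ON(extent_buffer_test_bit(leaf, bitmap_start, 3));
}
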
3387206f
ST
6421static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
6422{
6423 unsigned long distance = (src > dst) ? src - dst : dst - src;
6424 return distance < len;
6425}
6426
d1310b2e
CM
6427static void copy_pages(struct page *dst_page, struct page *src_page,
6428 unsigned long dst_off, unsigned long src_off,
6429 unsigned long len)
6430{
a6591715 6431 char *dst_kaddr = page_address(dst_page);
d1310b2e 6432 char *src_kaddr;
727011e0 6433 int must_memmove = 0;
d1310b2e 6434
3387206f 6435 if (dst_page != src_page) {
a6591715 6436 src_kaddr = page_address(src_page);
3387206f 6437 } else {
d1310b2e 6438 src_kaddr = dst_kaddr;
727011e0
CM
6439 if (areas_overlap(src_off, dst_off, len))
6440 must_memmove = 1;
3387206f 6441 }
d1310b2e 6442
727011e0
CM
6443 if (must_memmove)
6444 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
6445 else
6446 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
d1310b2e
CM
6447}
6448
2b48966a
DS
6449void memcpy_extent_buffer(const struct extent_buffer *dst,
6450 unsigned long dst_offset, unsigned long src_offset,
6451 unsigned long len)
d1310b2e
CM
6452{
6453 size_t cur;
6454 size_t dst_off_in_page;
6455 size_t src_off_in_page;
d1310b2e
CM
6456 unsigned long dst_i;
6457 unsigned long src_i;
6458
f98b6215
QW
6459 if (check_eb_range(dst, dst_offset, len) ||
6460 check_eb_range(dst, src_offset, len))
6461 return;
d1310b2e 6462
d397712b 6463 while (len > 0) {
884b07d0
QW
6464 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
6465 src_off_in_page = get_eb_offset_in_page(dst, src_offset);
d1310b2e 6466
884b07d0
QW
6467 dst_i = get_eb_page_index(dst_offset);
6468 src_i = get_eb_page_index(src_offset);
d1310b2e 6469
09cbfeaf 6470 cur = min(len, (unsigned long)(PAGE_SIZE -
d1310b2e
CM
6471 src_off_in_page));
6472 cur = min_t(unsigned long, cur,
09cbfeaf 6473 (unsigned long)(PAGE_SIZE - dst_off_in_page));
d1310b2e 6474
fb85fc9a 6475 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6476 dst_off_in_page, src_off_in_page, cur);
6477
6478 src_offset += cur;
6479 dst_offset += cur;
6480 len -= cur;
6481 }
6482}
d1310b2e 6483
2b48966a
DS
6484void memmove_extent_buffer(const struct extent_buffer *dst,
6485 unsigned long dst_offset, unsigned long src_offset,
6486 unsigned long len)
d1310b2e
CM
6487{
6488 size_t cur;
6489 size_t dst_off_in_page;
6490 size_t src_off_in_page;
6491 unsigned long dst_end = dst_offset + len - 1;
6492 unsigned long src_end = src_offset + len - 1;
d1310b2e
CM
6493 unsigned long dst_i;
6494 unsigned long src_i;
6495
f98b6215
QW
6496 if (check_eb_range(dst, dst_offset, len) ||
6497 check_eb_range(dst, src_offset, len))
6498 return;
727011e0 6499 if (dst_offset < src_offset) {
d1310b2e
CM
6500 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6501 return;
6502 }
d397712b 6503 while (len > 0) {
884b07d0
QW
6504 dst_i = get_eb_page_index(dst_end);
6505 src_i = get_eb_page_index(src_end);
d1310b2e 6506
884b07d0
QW
6507 dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
6508 src_off_in_page = get_eb_offset_in_page(dst, src_end);
d1310b2e
CM
6509
6510 cur = min_t(unsigned long, len, src_off_in_page + 1);
6511 cur = min(cur, dst_off_in_page + 1);
fb85fc9a 6512 copy_pages(dst->pages[dst_i], dst->pages[src_i],
d1310b2e
CM
6513 dst_off_in_page - cur + 1,
6514 src_off_in_page - cur + 1, cur);
6515
6516 dst_end -= cur;
6517 src_end -= cur;
6518 len -= cur;
6519 }
6520}
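
/*
 * Illustrative only: unlike memcpy_extent_buffer(), the memmove variant
 * above copes with overlapping source and destination ranges within one
 * eb, e.g. sliding a region forward by a few bytes. The helper name and
 * the 8-byte shift are made up for this sketch.
 */
static void example_slide_region(const struct extent_buffer *eb,
				 unsigned long start, unsigned long len)
{
	/* Destination overlaps the source; memmove semantics apply. */
	memmove_extent_buffer(eb, start + 8, start, len);
}
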
6af118ce 6521
d1e86e3f
QW
6522static struct extent_buffer *get_next_extent_buffer(
6523 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
6524{
6525 struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
6526 struct extent_buffer *found = NULL;
6527 u64 page_start = page_offset(page);
6528 int ret;
6529 int i;
6530
6531 ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
6532 ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
6533 lockdep_assert_held(&fs_info->buffer_lock);
6534
6535 ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
6536 bytenr >> fs_info->sectorsize_bits,
6537 PAGE_SIZE / fs_info->nodesize);
6538 for (i = 0; i < ret; i++) {
6539 /* Already beyond page end */
6540 if (gang[i]->start >= page_start + PAGE_SIZE)
6541 break;
6542 /* Found one */
6543 if (gang[i]->start >= bytenr) {
6544 found = gang[i];
6545 break;
6546 }
6547 }
6548 return found;
6549}
6550
6551static int try_release_subpage_extent_buffer(struct page *page)
6552{
6553 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
6554 u64 cur = page_offset(page);
6555 const u64 end = page_offset(page) + PAGE_SIZE;
6556 int ret;
6557
6558 while (cur < end) {
6559 struct extent_buffer *eb = NULL;
6560
6561 /*
6562		 * Unlike try_release_extent_buffer(), which uses page->private to grab
6563		 * the buffer, for the subpage case we rely on the radix tree, thus we
6564		 * need to ensure radix tree consistency.
6565 *
6566 * We also want an atomic snapshot of the radix tree, thus go
6567 * with spinlock rather than RCU.
6568 */
6569 spin_lock(&fs_info->buffer_lock);
6570 eb = get_next_extent_buffer(fs_info, page, cur);
6571 if (!eb) {
6572 /* No more eb in the page range after or at cur */
6573 spin_unlock(&fs_info->buffer_lock);
6574 break;
6575 }
6576 cur = eb->start + eb->len;
6577
6578 /*
6579 * The same as try_release_extent_buffer(), to ensure the eb
6580 * won't disappear out from under us.
6581 */
6582 spin_lock(&eb->refs_lock);
6583 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
6584 spin_unlock(&eb->refs_lock);
6585 spin_unlock(&fs_info->buffer_lock);
6586 break;
6587 }
6588 spin_unlock(&fs_info->buffer_lock);
6589
6590 /*
6591 * If tree ref isn't set then we know the ref on this eb is a
6592 * real ref, so just return, this eb will likely be freed soon
6593 * anyway.
6594 */
6595 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6596 spin_unlock(&eb->refs_lock);
6597 break;
6598 }
6599
6600 /*
6601 * Here we don't care about the return value, we will always
6602 * check the page private at the end. And
6603 * release_extent_buffer() will release the refs_lock.
6604 */
6605 release_extent_buffer(eb);
6606 }
6607 /*
6608	 * Finally, check whether we have cleared page private; if we have
6609	 * released all ebs in the page, the page private should be cleared by now.
6610 */
6611 spin_lock(&page->mapping->private_lock);
6612 if (!PagePrivate(page))
6613 ret = 1;
6614 else
6615 ret = 0;
6616 spin_unlock(&page->mapping->private_lock);
6617 return ret;
6618
6619}
6620
f7a52a40 6621int try_release_extent_buffer(struct page *page)
19fe0a8b 6622{
6af118ce 6623 struct extent_buffer *eb;
6af118ce 6624
d1e86e3f
QW
6625 if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
6626 return try_release_subpage_extent_buffer(page);
6627
3083ee2e 6628 /*
d1e86e3f
QW
6629 * We need to make sure nobody is changing page->private, as we rely on
6630 * page->private as the pointer to extent buffer.
3083ee2e
JB
6631 */
6632 spin_lock(&page->mapping->private_lock);
6633 if (!PagePrivate(page)) {
6634 spin_unlock(&page->mapping->private_lock);
4f2de97a 6635 return 1;
45f49bce 6636 }
6af118ce 6637
3083ee2e
JB
6638 eb = (struct extent_buffer *)page->private;
6639 BUG_ON(!eb);
19fe0a8b
MX
6640
6641 /*
3083ee2e
JB
6642 * This is a little awful but should be ok, we need to make sure that
6643 * the eb doesn't disappear out from under us while we're looking at
6644 * this page.
19fe0a8b 6645 */
3083ee2e 6646 spin_lock(&eb->refs_lock);
0b32f4bb 6647 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
3083ee2e
JB
6648 spin_unlock(&eb->refs_lock);
6649 spin_unlock(&page->mapping->private_lock);
6650 return 0;
b9473439 6651 }
3083ee2e 6652 spin_unlock(&page->mapping->private_lock);
897ca6e9 6653
19fe0a8b 6654 /*
3083ee2e
JB
6655 * If tree ref isn't set then we know the ref on this eb is a real ref,
6656 * so just return, this page will likely be freed soon anyway.
19fe0a8b 6657 */
3083ee2e
JB
6658 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6659 spin_unlock(&eb->refs_lock);
6660 return 0;
b9473439 6661 }
19fe0a8b 6662
f7a52a40 6663 return release_extent_buffer(eb);
6af118ce 6664}
bfb484d9
JB
6665
6666/*
6667 * btrfs_readahead_tree_block - attempt to readahead a child block
6668 * @fs_info: the fs_info
6669 * @bytenr: bytenr to read
3fbaf258 6670 * @owner_root: objectid of the root that owns this eb
bfb484d9 6671 * @gen: generation for the uptodate check, can be 0
3fbaf258 6672 * @level: level for the eb
bfb484d9
JB
6673 *
6674 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
6675 * normal uptodate check of the eb, without checking the generation. If we have
6676 * to read the block we will not block on anything.
6677 */
6678void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
3fbaf258 6679 u64 bytenr, u64 owner_root, u64 gen, int level)
bfb484d9
JB
6680{
6681 struct extent_buffer *eb;
6682 int ret;
6683
3fbaf258 6684 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
bfb484d9
JB
6685 if (IS_ERR(eb))
6686 return;
6687
6688 if (btrfs_buffer_uptodate(eb, gen, 1)) {
6689 free_extent_buffer(eb);
6690 return;
6691 }
6692
6693 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
6694 if (ret < 0)
6695 free_extent_buffer_stale(eb);
6696 else
6697 free_extent_buffer(eb);
6698}
6699
6700/*
6701 * btrfs_readahead_node_child - readahead a node's child block
6702 * @node: parent node we're reading from
6703 * @slot: slot in the parent node for the child we want to read
6704 *
6705 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed
6706 * at by the slot in the node provided.
6707 */
6708void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
6709{
6710 btrfs_readahead_tree_block(node->fs_info,
6711 btrfs_node_blockptr(node, slot),
3fbaf258
JB
6712 btrfs_header_owner(node),
6713 btrfs_node_ptr_generation(node, slot),
6714 btrfs_header_level(node) - 1);
bfb484d9 6715}
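
/*
 * Illustrative only: a typical way to drive the readahead helper above is
 * to walk every slot of an interior node before descending into it. The
 * helper name is made up for this sketch.
 */
static void example_readahead_all_children(struct extent_buffer *node)
{
	int nritems = btrfs_header_nritems(node);
	int slot;

	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}
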