// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

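/*
 * Return the total space accounted in @s_info: bytes used, reserved,
 * pinned and readonly, plus bytes_may_use when @may_use_included is set.
 */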
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
				 bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * After adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

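/*
 * Allocate and initialize a new space_info for the given block group type
 * flags, register it in sysfs and add it to fs_info->space_info.
 */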
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

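/*
 * Create the initial space_infos at mount time: SYSTEM, plus either a
 * combined METADATA|DATA space_info on mixed filesystems or separate
 * METADATA and DATA ones.
 */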
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

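/*
 * Account a block group's bytes in the matching space_info, then retry
 * pending tickets, since the added space may allow them to be satisfied.
 */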
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

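/*
 * Look up the space_info whose type flags overlap @flags, or NULL if none
 * exists.  Walks the RCU-protected fs_info->space_info list.
 */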
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

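/*
 * Decide whether a metadata/system reservation of @bytes may exceed the
 * space_info's total_bytes by dipping into unallocated chunk space.  Data
 * reservations never overcommit.
 */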
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush)
{
	u64 profile;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, true);
	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space.  If we can flush, don't let us overcommit
	 * too much; let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

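/*
 * Dump the counters of @info and of every block reservation to the log.
 * The caller must already hold info->lock.
 */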
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{
	lockdep_assert_held(&info->lock);

	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group *cache;
	int index = 0;

	spin_lock(&info->lock);
	__btrfs_dump_space_info(fs_info, info);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->start, cache->length, cache->used, cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

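/*
 * Kick writeback of up to @nr_pages of dirty pages.  If we can't take
 * ->s_umount we fall back to starting delalloc on @nr_items items via
 * btrfs_start_delalloc_roots().
 */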
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire the ->s_umount mutex, because
		 * the filesystem should guarantee that the delalloc inode
		 * list is empty once the filesystem is read-only (all dirty
		 * pages have been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

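/*
 * Convert a byte count into a number of metadata items to flush, based on
 * the worst-case cost of inserting one item; always returns at least 1.
 */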
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calculate the number of items we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages.  This will
		 * invoke the ->writepages callback and trigger delalloc
		 * filling (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue.  I.e. if there are more async pages
		 * than we require, wait_event will wait until nr_pages are
		 * written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction to reclaim pinned
 * space for the first pending ticket
 * @fs_info - the fs_info for this fs
 * @space_info - the space_info we are trying to reserve from
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;
	u64 cur_free_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;

	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

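/*
 * Decide whether background reclaim is worth kicking: skip it if the
 * space_info is nearly full of real allocations, there is nothing to
 * reclaim, or the filesystem is shutting down or being remounted.
 */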
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets. The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets. This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding.  However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisfied by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create
		 * an underutilized metadata chunk.  So if this is our first
		 * run through the flushing state machine skip ALLOC_CHUNK_FORCE
		 * and commit the transaction.  If nothing has changed the next
		 * go around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

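/*
 * Walk the given list of flush states for a priority (or eviction) ticket,
 * flushing after each step and stopping early once the ticket has been
 * granted (ticket->bytes reaches 0).
 */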
0d9764f6 844static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
9ce2f423
JB
845 struct btrfs_space_info *space_info,
846 struct reserve_ticket *ticket,
847 const enum btrfs_flush_state *states,
848 int states_nr)
0d9764f6
JB
849{
850 u64 to_reclaim;
851 int flush_state;
852
853 spin_lock(&space_info->lock);
9f246926 854 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
0d9764f6
JB
855 if (!to_reclaim) {
856 spin_unlock(&space_info->lock);
857 return;
858 }
859 spin_unlock(&space_info->lock);
860
861 flush_state = 0;
862 do {
9ce2f423 863 flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
0d9764f6
JB
864 flush_state++;
865 spin_lock(&space_info->lock);
866 if (ticket->bytes == 0) {
867 spin_unlock(&space_info->lock);
868 return;
869 }
870 spin_unlock(&space_info->lock);
9ce2f423 871 } while (flush_state < states_nr);
0d9764f6
JB
872}
873
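/*
 * Sleep until the ticket is granted (bytes == 0) or fails.  A fatal signal
 * removes the ticket from the list and fails it with -EINTR so that the
 * async reclaim job cannot grant it space afterwards.
 */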
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list.  After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket.  If that would happen, then
			 * the ticket's task would not know that space was
			 * reserved despite getting an error, resulting in a
			 * space leak (bytes_may_use counter of our space_info).
			 */
			list_del_init(&ticket->list);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		/*
		 * Need to delete here for priority tickets.  For regular
		 * tickets either the async reclaim job deletes the ticket
		 * from the list or we delete it ourselves at
		 * wait_reserve_ticket().
		 */
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket->list));
	/*
	 * Check that we can't have an error set if the reservation succeeded,
	 * as that would confuse tasks and lead them to error out without
	 * releasing reserved space (if an error happens the expectation is that
	 * space wasn't reserved at all).
	 */
	ASSERT(!(ticket->bytes == 0 && ticket->error));
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @fs_info - the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info, used) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}