// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * After adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

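	/*
	 * The factor converts logical block group bytes into raw bytes on
	 * disk: profiles that keep redundant copies (DUP, RAID1, RAID10)
	 * consume a multiple of the logical size, which is what disk_total
	 * and disk_used track below.
	 */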
	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
			 struct btrfs_space_info *space_info, u64 bytes,
			 enum btrfs_reserve_flush_enum flush)
{
	u64 profile;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, true);
	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable. For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * half of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

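	/*
	 * Worked example with illustrative numbers: 16GiB of
	 * free_chunk_space under a RAID1 profile (factor 2) leaves
	 * avail = 8GiB; a FLUSH_ALL reservation may then overcommit by
	 * at most 1GiB (8GiB >> 3), any other flush mode by up to 4GiB
	 * (8GiB >> 1).
	 */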
	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
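	/*
	 * Priority tickets are served first and are checked against the
	 * more generous BTRFS_RESERVE_NO_FLUSH overcommit allowance; once
	 * they are drained we fall through to the regular ticket list
	 * below with the stricter BTRFS_RESERVE_FLUSH_ALL check.
	 */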
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
					 flush)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

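/*
 * DUMP_BLOCK_RSV() takes the rsv member name as a bare token so the same
 * name can both be stringified for the log message and used to reach the
 * member on fs_info; the do/while (0) wrapper keeps the multi-statement
 * body safe inside an unbraced if/else.
 */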
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{
	lockdep_assert_held(&info->lock);

	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		   info->total_bytes, info->bytes_used, info->bytes_pinned,
		   info->bytes_reserved, info->bytes_may_use,
		   info->bytes_readonly);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group *cache;
	int index = 0;

	spin_lock(&info->lock);
	__btrfs_dump_space_info(fs_info, info);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->start, cache->length, cache->used, cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire ->s_umount, because the
		 * filesystem must guarantee that the delalloc inode list is
		 * empty once it is read-only (all dirty pages have been
		 * written back to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

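/*
 * Example with illustrative numbers: with a 16KiB nodesize,
 * btrfs_calc_insert_metadata_size(fs_info, 1) works out to 256KiB
 * (nodesize * BTRFS_MAX_LEVEL * 2), so a 1MiB reclaim target maps to
 * four items below, and anything smaller rounds up to a single item.
 */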
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calculate the number of pages we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Trigger inode writeback for up to nr_pages. This will invoke
		 * the ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue. I.e. if there are more async pages than
		 * we require, wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
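		/*
		 * If there are no reserve tickets left, every waiter has
		 * already been satisfied while we were flushing, so there
		 * is no reason to keep shrinking delalloc.
		 */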
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's OK to
 * @fs_info - the fs_info for our filesystem
 * @space_info - the space_info we were flushing for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;
	u64 cur_free_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;

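	/*
	 * Trim bytes_needed by whatever is already free in this space_info:
	 * the commit only has to reclaim the shortfall, not the ticket's
	 * full size.
	 */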
	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
		       struct btrfs_space_info *space_info, u64 num_bytes,
		       int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
}

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
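	/*
	 * The baseline target above is 1MiB per online CPU, capped at
	 * 16MiB; if even that amount can still be overcommitted (checked
	 * below), there is nothing that needs reclaiming.
	 */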
	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
				 BTRFS_RESERVE_FLUSH_ALL))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
				 BTRFS_RESERVE_FLUSH_ALL))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

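	/*
	 * thresh is 98% of the space_info's total bytes. Background reclaim
	 * only helps when we are near the limit because of reservations
	 * that flushing could shake loose, not when the space is genuinely
	 * consumed by allocations.
	 */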
	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets. The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets. This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding. However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisfied by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

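	/*
	 * Walk the flush states in order (FLUSH_DELAYED_ITEMS_NR through
	 * COMMIT_TRANS): each pass that fails to grant any ticket moves to
	 * the next, more expensive state, while any progress resets us
	 * back to the cheapest one.
	 */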
	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space. Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim. We would rather use that than possibly create an
		 * underutilized metadata chunk. So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction. If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

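/*
 * Which flush states a reservation may walk depends on the caller:
 * priority (FLUSH_LIMIT) reservations only try the cheap delayed-item
 * flushing plus a chunk allocation, while eviction may additionally
 * flush delayed refs and delalloc and, as a last resort, commit the
 * transaction.
 */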
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list. After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket. If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			list_del_init(&ticket->list);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		/*
		 * Need to delete here for priority tickets. For regular tickets
		 * either the async reclaim job deletes the ticket from the list
		 * or we delete it ourselves at wait_reserve_ticket().
		 */
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket->list));
	/*
	 * Check that we can't have an error set if the reservation succeeded,
	 * as that would confuse tasks and lead them to error out without
	 * releasing reserved space (if an error happens the expectation is that
	 * space wasn't reserved at all).
	 */
	ASSERT(!(ticket->bytes == 0 && ticket->error));
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from the space_info
 * @fs_info - the filesystem we are allocating for
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then set up our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
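		/*
		 * The ticket lives on our stack; putting it on the
		 * space_info lists is safe because we do not return until
		 * it has either been granted or removed again in
		 * handle_reserve_ticket()/wait_reserve_ticket().
		 */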
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info, used) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush);
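	/*
	 * During orphan cleanup we may dip into the global reserve as a
	 * last resort rather than returning -ENOSPC right away.
	 */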
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}