1 // SPDX-License-Identifier: GPL-2.0
5 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
6 * http://www.samsung.com/
9 #include <linux/module.h>
10 #include <linux/backing-dev.h>
11 #include <linux/init.h>
12 #include <linux/f2fs_fs.h>
13 #include <linux/kthread.h>
14 #include <linux/delay.h>
15 #include <linux/freezer.h>
16 #include <linux/sched/signal.h>
22 #include <trace/events/f2fs.h>
24 static struct kmem_cache
*victim_entry_slab
;
26 static unsigned int count_bits(const unsigned long *addr
,
27 unsigned int offset
, unsigned int len
);
29 static int gc_thread_func(void *data
)
31 struct f2fs_sb_info
*sbi
= data
;
32 struct f2fs_gc_kthread
*gc_th
= sbi
->gc_thread
;
33 wait_queue_head_t
*wq
= &sbi
->gc_thread
->gc_wait_queue_head
;
36 wait_ms
= gc_th
->min_sleep_time
;
42 wait_event_interruptible_timeout(*wq
,
43 kthread_should_stop() || freezing(current
) ||
45 msecs_to_jiffies(wait_ms
));
47 /* give it a try one time */
51 if (try_to_freeze()) {
52 stat_other_skip_bggc_count(sbi
);
55 if (kthread_should_stop())
58 if (sbi
->sb
->s_writers
.frozen
>= SB_FREEZE_WRITE
) {
59 increase_sleep_time(gc_th
, &wait_ms
);
60 stat_other_skip_bggc_count(sbi
);
64 if (time_to_inject(sbi
, FAULT_CHECKPOINT
)) {
65 f2fs_show_injection_info(sbi
, FAULT_CHECKPOINT
);
66 f2fs_stop_checkpoint(sbi
, false);
69 if (!sb_start_write_trylock(sbi
->sb
)) {
70 stat_other_skip_bggc_count(sbi
);
75 * [GC triggering condition]
76 * 0. GC is not conducted currently.
77 * 1. There are enough dirty segments.
78 * 2. IO subsystem is idle by checking the # of writeback pages.
79 * 3. IO subsystem is idle by checking the # of requests in
80 * bdev's request list.
82 * Note) We have to avoid triggering GCs frequently.
83 * Because it is possible that some segments can be
84 * invalidated soon after by user update or deletion.
85 * So, I'd like to wait some time to collect dirty segments.
87 if (sbi
->gc_mode
== GC_URGENT_HIGH
) {
88 wait_ms
= gc_th
->urgent_sleep_time
;
89 down_write(&sbi
->gc_lock
);
93 if (!down_write_trylock(&sbi
->gc_lock
)) {
94 stat_other_skip_bggc_count(sbi
);
98 if (!is_idle(sbi
, GC_TIME
)) {
99 increase_sleep_time(gc_th
, &wait_ms
);
100 up_write(&sbi
->gc_lock
);
101 stat_io_skip_bggc_count(sbi
);
105 if (has_enough_invalid_blocks(sbi
))
106 decrease_sleep_time(gc_th
, &wait_ms
);
108 increase_sleep_time(gc_th
, &wait_ms
);
110 stat_inc_bggc_count(sbi
->stat_info
);
112 sync_mode
= F2FS_OPTION(sbi
).bggc_mode
== BGGC_MODE_SYNC
;
114 /* if return value is not zero, no victim was selected */
115 if (f2fs_gc(sbi
, sync_mode
, true, NULL_SEGNO
))
116 wait_ms
= gc_th
->no_gc_sleep_time
;
118 trace_f2fs_background_gc(sbi
->sb
, wait_ms
,
119 prefree_segments(sbi
), free_segments(sbi
));
121 /* balancing f2fs's metadata periodically */
122 f2fs_balance_fs_bg(sbi
, true);
124 sb_end_write(sbi
->sb
);
126 } while (!kthread_should_stop());
130 int f2fs_start_gc_thread(struct f2fs_sb_info
*sbi
)
132 struct f2fs_gc_kthread
*gc_th
;
133 dev_t dev
= sbi
->sb
->s_bdev
->bd_dev
;
136 gc_th
= f2fs_kmalloc(sbi
, sizeof(struct f2fs_gc_kthread
), GFP_KERNEL
);
142 gc_th
->urgent_sleep_time
= DEF_GC_THREAD_URGENT_SLEEP_TIME
;
143 gc_th
->min_sleep_time
= DEF_GC_THREAD_MIN_SLEEP_TIME
;
144 gc_th
->max_sleep_time
= DEF_GC_THREAD_MAX_SLEEP_TIME
;
145 gc_th
->no_gc_sleep_time
= DEF_GC_THREAD_NOGC_SLEEP_TIME
;
149 sbi
->gc_thread
= gc_th
;
150 init_waitqueue_head(&sbi
->gc_thread
->gc_wait_queue_head
);
151 sbi
->gc_thread
->f2fs_gc_task
= kthread_run(gc_thread_func
, sbi
,
152 "f2fs_gc-%u:%u", MAJOR(dev
), MINOR(dev
));
153 if (IS_ERR(gc_th
->f2fs_gc_task
)) {
154 err
= PTR_ERR(gc_th
->f2fs_gc_task
);
156 sbi
->gc_thread
= NULL
;
162 void f2fs_stop_gc_thread(struct f2fs_sb_info
*sbi
)
164 struct f2fs_gc_kthread
*gc_th
= sbi
->gc_thread
;
167 kthread_stop(gc_th
->f2fs_gc_task
);
169 sbi
->gc_thread
= NULL
;
172 static int select_gc_type(struct f2fs_sb_info
*sbi
, int gc_type
)
176 if (gc_type
== BG_GC
) {
177 if (sbi
->am
.atgc_enabled
)
185 switch (sbi
->gc_mode
) {
201 static void select_policy(struct f2fs_sb_info
*sbi
, int gc_type
,
202 int type
, struct victim_sel_policy
*p
)
204 struct dirty_seglist_info
*dirty_i
= DIRTY_I(sbi
);
206 if (p
->alloc_mode
== SSR
) {
207 p
->gc_mode
= GC_GREEDY
;
208 p
->dirty_bitmap
= dirty_i
->dirty_segmap
[type
];
209 p
->max_search
= dirty_i
->nr_dirty
[type
];
211 } else if (p
->alloc_mode
== AT_SSR
) {
212 p
->gc_mode
= GC_GREEDY
;
213 p
->dirty_bitmap
= dirty_i
->dirty_segmap
[type
];
214 p
->max_search
= dirty_i
->nr_dirty
[type
];
217 p
->gc_mode
= select_gc_type(sbi
, gc_type
);
218 p
->ofs_unit
= sbi
->segs_per_sec
;
219 if (__is_large_section(sbi
)) {
220 p
->dirty_bitmap
= dirty_i
->dirty_secmap
;
221 p
->max_search
= count_bits(p
->dirty_bitmap
,
224 p
->dirty_bitmap
= dirty_i
->dirty_segmap
[DIRTY
];
225 p
->max_search
= dirty_i
->nr_dirty
[DIRTY
];
230 * adjust candidates range, should select all dirty segments for
231 * foreground GC and urgent GC cases.
233 if (gc_type
!= FG_GC
&&
234 (sbi
->gc_mode
!= GC_URGENT_HIGH
) &&
235 (p
->gc_mode
!= GC_AT
&& p
->alloc_mode
!= AT_SSR
) &&
236 p
->max_search
> sbi
->max_victim_search
)
237 p
->max_search
= sbi
->max_victim_search
;
239 /* let's select beginning hot/small space first in no_heap mode*/
240 if (test_opt(sbi
, NOHEAP
) &&
241 (type
== CURSEG_HOT_DATA
|| IS_NODESEG(type
)))
244 p
->offset
= SIT_I(sbi
)->last_victim
[p
->gc_mode
];
247 static unsigned int get_max_cost(struct f2fs_sb_info
*sbi
,
248 struct victim_sel_policy
*p
)
250 /* SSR allocates in a segment unit */
251 if (p
->alloc_mode
== SSR
)
252 return sbi
->blocks_per_seg
;
253 else if (p
->alloc_mode
== AT_SSR
)
257 if (p
->gc_mode
== GC_GREEDY
)
258 return 2 * sbi
->blocks_per_seg
* p
->ofs_unit
;
259 else if (p
->gc_mode
== GC_CB
)
261 else if (p
->gc_mode
== GC_AT
)
263 else /* No other gc_mode */
267 static unsigned int check_bg_victims(struct f2fs_sb_info
*sbi
)
269 struct dirty_seglist_info
*dirty_i
= DIRTY_I(sbi
);
273 * If the gc_type is FG_GC, we can select victim segments
274 * selected by background GC before.
275 * Those segments guarantee they have small valid blocks.
277 for_each_set_bit(secno
, dirty_i
->victim_secmap
, MAIN_SECS(sbi
)) {
278 if (sec_usage_check(sbi
, secno
))
280 clear_bit(secno
, dirty_i
->victim_secmap
);
281 return GET_SEG_FROM_SEC(sbi
, secno
);
286 static unsigned int get_cb_cost(struct f2fs_sb_info
*sbi
, unsigned int segno
)
288 struct sit_info
*sit_i
= SIT_I(sbi
);
289 unsigned int secno
= GET_SEC_FROM_SEG(sbi
, segno
);
290 unsigned int start
= GET_SEG_FROM_SEC(sbi
, secno
);
291 unsigned long long mtime
= 0;
292 unsigned int vblocks
;
293 unsigned char age
= 0;
296 unsigned int usable_segs_per_sec
= f2fs_usable_segs_in_sec(sbi
, segno
);
298 for (i
= 0; i
< usable_segs_per_sec
; i
++)
299 mtime
+= get_seg_entry(sbi
, start
+ i
)->mtime
;
300 vblocks
= get_valid_blocks(sbi
, segno
, true);
302 mtime
= div_u64(mtime
, usable_segs_per_sec
);
303 vblocks
= div_u64(vblocks
, usable_segs_per_sec
);
305 u
= (vblocks
* 100) >> sbi
->log_blocks_per_seg
;
307 /* Handle if the system time has changed by the user */
308 if (mtime
< sit_i
->min_mtime
)
309 sit_i
->min_mtime
= mtime
;
310 if (mtime
> sit_i
->max_mtime
)
311 sit_i
->max_mtime
= mtime
;
312 if (sit_i
->max_mtime
!= sit_i
->min_mtime
)
313 age
= 100 - div64_u64(100 * (mtime
- sit_i
->min_mtime
),
314 sit_i
->max_mtime
- sit_i
->min_mtime
);
316 return UINT_MAX
- ((100 * (100 - u
) * age
) / (100 + u
));
319 static inline unsigned int get_gc_cost(struct f2fs_sb_info
*sbi
,
320 unsigned int segno
, struct victim_sel_policy
*p
)
322 if (p
->alloc_mode
== SSR
)
323 return get_seg_entry(sbi
, segno
)->ckpt_valid_blocks
;
325 /* alloc_mode == LFS */
326 if (p
->gc_mode
== GC_GREEDY
)
327 return get_valid_blocks(sbi
, segno
, true);
328 else if (p
->gc_mode
== GC_CB
)
329 return get_cb_cost(sbi
, segno
);
335 static unsigned int count_bits(const unsigned long *addr
,
336 unsigned int offset
, unsigned int len
)
338 unsigned int end
= offset
+ len
, sum
= 0;
340 while (offset
< end
) {
341 if (test_bit(offset
++, addr
))
347 static struct victim_entry
*attach_victim_entry(struct f2fs_sb_info
*sbi
,
348 unsigned long long mtime
, unsigned int segno
,
349 struct rb_node
*parent
, struct rb_node
**p
,
352 struct atgc_management
*am
= &sbi
->am
;
353 struct victim_entry
*ve
;
355 ve
= f2fs_kmem_cache_alloc(victim_entry_slab
, GFP_NOFS
);
360 rb_link_node(&ve
->rb_node
, parent
, p
);
361 rb_insert_color_cached(&ve
->rb_node
, &am
->root
, left_most
);
363 list_add_tail(&ve
->list
, &am
->victim_list
);
370 static void insert_victim_entry(struct f2fs_sb_info
*sbi
,
371 unsigned long long mtime
, unsigned int segno
)
373 struct atgc_management
*am
= &sbi
->am
;
375 struct rb_node
*parent
= NULL
;
376 bool left_most
= true;
378 p
= f2fs_lookup_rb_tree_ext(sbi
, &am
->root
, &parent
, mtime
, &left_most
);
379 attach_victim_entry(sbi
, mtime
, segno
, parent
, p
, left_most
);
382 static void add_victim_entry(struct f2fs_sb_info
*sbi
,
383 struct victim_sel_policy
*p
, unsigned int segno
)
385 struct sit_info
*sit_i
= SIT_I(sbi
);
386 unsigned int secno
= GET_SEC_FROM_SEG(sbi
, segno
);
387 unsigned int start
= GET_SEG_FROM_SEC(sbi
, secno
);
388 unsigned long long mtime
= 0;
391 if (unlikely(is_sbi_flag_set(sbi
, SBI_CP_DISABLED
))) {
392 if (p
->gc_mode
== GC_AT
&&
393 get_valid_blocks(sbi
, segno
, true) == 0)
396 if (p
->alloc_mode
== AT_SSR
&&
397 get_seg_entry(sbi
, segno
)->ckpt_valid_blocks
== 0)
401 for (i
= 0; i
< sbi
->segs_per_sec
; i
++)
402 mtime
+= get_seg_entry(sbi
, start
+ i
)->mtime
;
403 mtime
= div_u64(mtime
, sbi
->segs_per_sec
);
405 /* Handle if the system time has changed by the user */
406 if (mtime
< sit_i
->min_mtime
)
407 sit_i
->min_mtime
= mtime
;
408 if (mtime
> sit_i
->max_mtime
)
409 sit_i
->max_mtime
= mtime
;
410 if (mtime
< sit_i
->dirty_min_mtime
)
411 sit_i
->dirty_min_mtime
= mtime
;
412 if (mtime
> sit_i
->dirty_max_mtime
)
413 sit_i
->dirty_max_mtime
= mtime
;
415 /* don't choose young section as candidate */
416 if (sit_i
->dirty_max_mtime
- mtime
< p
->age_threshold
)
419 insert_victim_entry(sbi
, mtime
, segno
);
422 static struct rb_node
*lookup_central_victim(struct f2fs_sb_info
*sbi
,
423 struct victim_sel_policy
*p
)
425 struct atgc_management
*am
= &sbi
->am
;
426 struct rb_node
*parent
= NULL
;
429 f2fs_lookup_rb_tree_ext(sbi
, &am
->root
, &parent
, p
->age
, &left_most
);
434 static void atgc_lookup_victim(struct f2fs_sb_info
*sbi
,
435 struct victim_sel_policy
*p
)
437 struct sit_info
*sit_i
= SIT_I(sbi
);
438 struct atgc_management
*am
= &sbi
->am
;
439 struct rb_root_cached
*root
= &am
->root
;
440 struct rb_node
*node
;
442 struct victim_entry
*ve
;
443 unsigned long long total_time
;
444 unsigned long long age
, u
, accu
;
445 unsigned long long max_mtime
= sit_i
->dirty_max_mtime
;
446 unsigned long long min_mtime
= sit_i
->dirty_min_mtime
;
447 unsigned int sec_blocks
= BLKS_PER_SEC(sbi
);
448 unsigned int vblocks
;
449 unsigned int dirty_threshold
= max(am
->max_candidate_count
,
450 am
->candidate_ratio
*
451 am
->victim_count
/ 100);
452 unsigned int age_weight
= am
->age_weight
;
454 unsigned int iter
= 0;
456 if (max_mtime
< min_mtime
)
460 total_time
= max_mtime
- min_mtime
;
462 accu
= div64_u64(ULLONG_MAX
, total_time
);
463 accu
= min_t(unsigned long long, div_u64(accu
, 100),
464 DEFAULT_ACCURACY_CLASS
);
466 node
= rb_first_cached(root
);
468 re
= rb_entry_safe(node
, struct rb_entry
, rb_node
);
472 ve
= (struct victim_entry
*)re
;
474 if (ve
->mtime
>= max_mtime
|| ve
->mtime
< min_mtime
)
477 /* age = 10000 * x% * 60 */
478 age
= div64_u64(accu
* (max_mtime
- ve
->mtime
), total_time
) *
481 vblocks
= get_valid_blocks(sbi
, ve
->segno
, true);
482 f2fs_bug_on(sbi
, !vblocks
|| vblocks
== sec_blocks
);
484 /* u = 10000 * x% * 40 */
485 u
= div64_u64(accu
* (sec_blocks
- vblocks
), sec_blocks
) *
488 f2fs_bug_on(sbi
, age
+ u
>= UINT_MAX
);
490 cost
= UINT_MAX
- (age
+ u
);
493 if (cost
< p
->min_cost
||
494 (cost
== p
->min_cost
&& age
> p
->oldest_age
)) {
497 p
->min_segno
= ve
->segno
;
500 if (iter
< dirty_threshold
) {
501 node
= rb_next(node
);
507 * select candidates around source section in range of
508 * [target - dirty_threshold, target + dirty_threshold]
510 static void atssr_lookup_victim(struct f2fs_sb_info
*sbi
,
511 struct victim_sel_policy
*p
)
513 struct sit_info
*sit_i
= SIT_I(sbi
);
514 struct atgc_management
*am
= &sbi
->am
;
515 struct rb_node
*node
;
517 struct victim_entry
*ve
;
518 unsigned long long age
;
519 unsigned long long max_mtime
= sit_i
->dirty_max_mtime
;
520 unsigned long long min_mtime
= sit_i
->dirty_min_mtime
;
521 unsigned int seg_blocks
= sbi
->blocks_per_seg
;
522 unsigned int vblocks
;
523 unsigned int dirty_threshold
= max(am
->max_candidate_count
,
524 am
->candidate_ratio
*
525 am
->victim_count
/ 100);
527 unsigned int iter
= 0;
530 if (max_mtime
< min_mtime
)
534 node
= lookup_central_victim(sbi
, p
);
536 re
= rb_entry_safe(node
, struct rb_entry
, rb_node
);
543 ve
= (struct victim_entry
*)re
;
545 if (ve
->mtime
>= max_mtime
|| ve
->mtime
< min_mtime
)
548 age
= max_mtime
- ve
->mtime
;
550 vblocks
= get_seg_entry(sbi
, ve
->segno
)->ckpt_valid_blocks
;
551 f2fs_bug_on(sbi
, !vblocks
);
554 if (vblocks
== seg_blocks
)
559 age
= max_mtime
- abs(p
->age
- age
);
560 cost
= UINT_MAX
- vblocks
;
562 if (cost
< p
->min_cost
||
563 (cost
== p
->min_cost
&& age
> p
->oldest_age
)) {
566 p
->min_segno
= ve
->segno
;
569 if (iter
< dirty_threshold
) {
571 node
= rb_prev(node
);
573 node
= rb_next(node
);
583 static void lookup_victim_by_age(struct f2fs_sb_info
*sbi
,
584 struct victim_sel_policy
*p
)
586 f2fs_bug_on(sbi
, !f2fs_check_rb_tree_consistence(sbi
,
587 &sbi
->am
.root
, true));
589 if (p
->gc_mode
== GC_AT
)
590 atgc_lookup_victim(sbi
, p
);
591 else if (p
->alloc_mode
== AT_SSR
)
592 atssr_lookup_victim(sbi
, p
);
597 static void release_victim_entry(struct f2fs_sb_info
*sbi
)
599 struct atgc_management
*am
= &sbi
->am
;
600 struct victim_entry
*ve
, *tmp
;
602 list_for_each_entry_safe(ve
, tmp
, &am
->victim_list
, list
) {
604 kmem_cache_free(victim_entry_slab
, ve
);
608 am
->root
= RB_ROOT_CACHED
;
610 f2fs_bug_on(sbi
, am
->victim_count
);
611 f2fs_bug_on(sbi
, !list_empty(&am
->victim_list
));
615 * This function is called from two paths.
616 * One is garbage collection and the other is SSR segment selection.
617 * When it is called during GC, it just gets a victim segment
618 * and it does not remove it from dirty seglist.
619 * When it is called from SSR segment selection, it finds a segment
620 * which has minimum valid blocks and removes it from dirty seglist.
622 static int get_victim_by_default(struct f2fs_sb_info
*sbi
,
623 unsigned int *result
, int gc_type
, int type
,
624 char alloc_mode
, unsigned long long age
)
626 struct dirty_seglist_info
*dirty_i
= DIRTY_I(sbi
);
627 struct sit_info
*sm
= SIT_I(sbi
);
628 struct victim_sel_policy p
;
629 unsigned int secno
, last_victim
;
630 unsigned int last_segment
;
631 unsigned int nsearched
;
635 mutex_lock(&dirty_i
->seglist_lock
);
636 last_segment
= MAIN_SECS(sbi
) * sbi
->segs_per_sec
;
638 p
.alloc_mode
= alloc_mode
;
640 p
.age_threshold
= sbi
->am
.age_threshold
;
643 select_policy(sbi
, gc_type
, type
, &p
);
644 p
.min_segno
= NULL_SEGNO
;
646 p
.min_cost
= get_max_cost(sbi
, &p
);
648 is_atgc
= (p
.gc_mode
== GC_AT
|| p
.alloc_mode
== AT_SSR
);
652 SIT_I(sbi
)->dirty_min_mtime
= ULLONG_MAX
;
654 if (*result
!= NULL_SEGNO
) {
655 if (!get_valid_blocks(sbi
, *result
, false)) {
660 if (sec_usage_check(sbi
, GET_SEC_FROM_SEG(sbi
, *result
)))
663 p
.min_segno
= *result
;
668 if (p
.max_search
== 0)
671 if (__is_large_section(sbi
) && p
.alloc_mode
== LFS
) {
672 if (sbi
->next_victim_seg
[BG_GC
] != NULL_SEGNO
) {
673 p
.min_segno
= sbi
->next_victim_seg
[BG_GC
];
674 *result
= p
.min_segno
;
675 sbi
->next_victim_seg
[BG_GC
] = NULL_SEGNO
;
678 if (gc_type
== FG_GC
&&
679 sbi
->next_victim_seg
[FG_GC
] != NULL_SEGNO
) {
680 p
.min_segno
= sbi
->next_victim_seg
[FG_GC
];
681 *result
= p
.min_segno
;
682 sbi
->next_victim_seg
[FG_GC
] = NULL_SEGNO
;
687 last_victim
= sm
->last_victim
[p
.gc_mode
];
688 if (p
.alloc_mode
== LFS
&& gc_type
== FG_GC
) {
689 p
.min_segno
= check_bg_victims(sbi
);
690 if (p
.min_segno
!= NULL_SEGNO
)
695 unsigned long cost
, *dirty_bitmap
;
696 unsigned int unit_no
, segno
;
698 dirty_bitmap
= p
.dirty_bitmap
;
699 unit_no
= find_next_bit(dirty_bitmap
,
700 last_segment
/ p
.ofs_unit
,
701 p
.offset
/ p
.ofs_unit
);
702 segno
= unit_no
* p
.ofs_unit
;
703 if (segno
>= last_segment
) {
704 if (sm
->last_victim
[p
.gc_mode
]) {
706 sm
->last_victim
[p
.gc_mode
];
707 sm
->last_victim
[p
.gc_mode
] = 0;
714 p
.offset
= segno
+ p
.ofs_unit
;
717 #ifdef CONFIG_F2FS_CHECK_FS
719 * skip selecting the invalid segno (that is failed due to block
720 * validity check failure during GC) to avoid endless GC loop in
723 if (test_bit(segno
, sm
->invalid_segmap
))
727 secno
= GET_SEC_FROM_SEG(sbi
, segno
);
729 if (sec_usage_check(sbi
, secno
))
731 /* Don't touch checkpointed data */
732 if (unlikely(is_sbi_flag_set(sbi
, SBI_CP_DISABLED
) &&
733 get_ckpt_valid_blocks(sbi
, segno
) &&
734 p
.alloc_mode
== LFS
))
736 if (gc_type
== BG_GC
&& test_bit(secno
, dirty_i
->victim_secmap
))
740 add_victim_entry(sbi
, &p
, segno
);
744 cost
= get_gc_cost(sbi
, segno
, &p
);
746 if (p
.min_cost
> cost
) {
751 if (nsearched
>= p
.max_search
) {
752 if (!sm
->last_victim
[p
.gc_mode
] && segno
<= last_victim
)
753 sm
->last_victim
[p
.gc_mode
] =
754 last_victim
+ p
.ofs_unit
;
756 sm
->last_victim
[p
.gc_mode
] = segno
+ p
.ofs_unit
;
757 sm
->last_victim
[p
.gc_mode
] %=
758 (MAIN_SECS(sbi
) * sbi
->segs_per_sec
);
763 /* get victim for GC_AT/AT_SSR */
765 lookup_victim_by_age(sbi
, &p
);
766 release_victim_entry(sbi
);
769 if (is_atgc
&& p
.min_segno
== NULL_SEGNO
&&
770 sm
->elapsed_time
< p
.age_threshold
) {
775 if (p
.min_segno
!= NULL_SEGNO
) {
777 *result
= (p
.min_segno
/ p
.ofs_unit
) * p
.ofs_unit
;
779 if (p
.alloc_mode
== LFS
) {
780 secno
= GET_SEC_FROM_SEG(sbi
, p
.min_segno
);
781 if (gc_type
== FG_GC
)
782 sbi
->cur_victim_sec
= secno
;
784 set_bit(secno
, dirty_i
->victim_secmap
);
790 if (p
.min_segno
!= NULL_SEGNO
)
791 trace_f2fs_get_victim(sbi
->sb
, type
, gc_type
, &p
,
793 prefree_segments(sbi
), free_segments(sbi
));
794 mutex_unlock(&dirty_i
->seglist_lock
);
799 static const struct victim_selection default_v_ops
= {
800 .get_victim
= get_victim_by_default
,
803 static struct inode
*find_gc_inode(struct gc_inode_list
*gc_list
, nid_t ino
)
805 struct inode_entry
*ie
;
807 ie
= radix_tree_lookup(&gc_list
->iroot
, ino
);
813 static void add_gc_inode(struct gc_inode_list
*gc_list
, struct inode
*inode
)
815 struct inode_entry
*new_ie
;
817 if (inode
== find_gc_inode(gc_list
, inode
->i_ino
)) {
821 new_ie
= f2fs_kmem_cache_alloc(f2fs_inode_entry_slab
, GFP_NOFS
);
822 new_ie
->inode
= inode
;
824 f2fs_radix_tree_insert(&gc_list
->iroot
, inode
->i_ino
, new_ie
);
825 list_add_tail(&new_ie
->list
, &gc_list
->ilist
);
828 static void put_gc_inode(struct gc_inode_list
*gc_list
)
830 struct inode_entry
*ie
, *next_ie
;
831 list_for_each_entry_safe(ie
, next_ie
, &gc_list
->ilist
, list
) {
832 radix_tree_delete(&gc_list
->iroot
, ie
->inode
->i_ino
);
835 kmem_cache_free(f2fs_inode_entry_slab
, ie
);
839 static int check_valid_map(struct f2fs_sb_info
*sbi
,
840 unsigned int segno
, int offset
)
842 struct sit_info
*sit_i
= SIT_I(sbi
);
843 struct seg_entry
*sentry
;
846 down_read(&sit_i
->sentry_lock
);
847 sentry
= get_seg_entry(sbi
, segno
);
848 ret
= f2fs_test_bit(offset
, sentry
->cur_valid_map
);
849 up_read(&sit_i
->sentry_lock
);
854 * This function compares node address got in summary with that in NAT.
855 * On validity, copy that node with cold status, otherwise (invalid node)
858 static int gc_node_segment(struct f2fs_sb_info
*sbi
,
859 struct f2fs_summary
*sum
, unsigned int segno
, int gc_type
)
861 struct f2fs_summary
*entry
;
865 bool fggc
= (gc_type
== FG_GC
);
867 unsigned int usable_blks_in_seg
= f2fs_usable_blks_in_seg(sbi
, segno
);
869 start_addr
= START_BLOCK(sbi
, segno
);
874 if (fggc
&& phase
== 2)
875 atomic_inc(&sbi
->wb_sync_req
[NODE
]);
877 for (off
= 0; off
< usable_blks_in_seg
; off
++, entry
++) {
878 nid_t nid
= le32_to_cpu(entry
->nid
);
879 struct page
*node_page
;
883 /* stop BG_GC if there is not enough free sections. */
884 if (gc_type
== BG_GC
&& has_not_enough_free_secs(sbi
, 0, 0))
887 if (check_valid_map(sbi
, segno
, off
) == 0)
891 f2fs_ra_meta_pages(sbi
, NAT_BLOCK_OFFSET(nid
), 1,
897 f2fs_ra_node_page(sbi
, nid
);
902 node_page
= f2fs_get_node_page(sbi
, nid
);
903 if (IS_ERR(node_page
))
906 /* block may become invalid during f2fs_get_node_page */
907 if (check_valid_map(sbi
, segno
, off
) == 0) {
908 f2fs_put_page(node_page
, 1);
912 if (f2fs_get_node_info(sbi
, nid
, &ni
)) {
913 f2fs_put_page(node_page
, 1);
917 if (ni
.blk_addr
!= start_addr
+ off
) {
918 f2fs_put_page(node_page
, 1);
922 err
= f2fs_move_node_page(node_page
, gc_type
);
923 if (!err
&& gc_type
== FG_GC
)
925 stat_inc_node_blk_count(sbi
, 1, gc_type
);
932 atomic_dec(&sbi
->wb_sync_req
[NODE
]);
937 * Calculate start block index indicating the given node offset.
938 * Be careful, caller should give this node offset only indicating direct node
939 * blocks. If any node offsets, which point the other types of node blocks such
940 * as indirect or double indirect node blocks, are given, it must be a caller's
943 block_t
f2fs_start_bidx_of_node(unsigned int node_ofs
, struct inode
*inode
)
945 unsigned int indirect_blks
= 2 * NIDS_PER_BLOCK
+ 4;
953 } else if (node_ofs
<= indirect_blks
) {
954 int dec
= (node_ofs
- 4) / (NIDS_PER_BLOCK
+ 1);
955 bidx
= node_ofs
- 2 - dec
;
957 int dec
= (node_ofs
- indirect_blks
- 3) / (NIDS_PER_BLOCK
+ 1);
958 bidx
= node_ofs
- 5 - dec
;
960 return bidx
* ADDRS_PER_BLOCK(inode
) + ADDRS_PER_INODE(inode
);
963 static bool is_alive(struct f2fs_sb_info
*sbi
, struct f2fs_summary
*sum
,
964 struct node_info
*dni
, block_t blkaddr
, unsigned int *nofs
)
966 struct page
*node_page
;
968 unsigned int ofs_in_node
;
969 block_t source_blkaddr
;
971 nid
= le32_to_cpu(sum
->nid
);
972 ofs_in_node
= le16_to_cpu(sum
->ofs_in_node
);
974 node_page
= f2fs_get_node_page(sbi
, nid
);
975 if (IS_ERR(node_page
))
978 if (f2fs_get_node_info(sbi
, nid
, dni
)) {
979 f2fs_put_page(node_page
, 1);
983 if (sum
->version
!= dni
->version
) {
984 f2fs_warn(sbi
, "%s: valid data with mismatched node version.",
986 set_sbi_flag(sbi
, SBI_NEED_FSCK
);
989 *nofs
= ofs_of_node(node_page
);
990 source_blkaddr
= data_blkaddr(NULL
, node_page
, ofs_in_node
);
991 f2fs_put_page(node_page
, 1);
993 if (source_blkaddr
!= blkaddr
) {
994 #ifdef CONFIG_F2FS_CHECK_FS
995 unsigned int segno
= GET_SEGNO(sbi
, blkaddr
);
996 unsigned long offset
= GET_BLKOFF_FROM_SEG0(sbi
, blkaddr
);
998 if (unlikely(check_valid_map(sbi
, segno
, offset
))) {
999 if (!test_and_set_bit(segno
, SIT_I(sbi
)->invalid_segmap
)) {
1000 f2fs_err(sbi
, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n",
1001 blkaddr
, source_blkaddr
, segno
);
1002 f2fs_bug_on(sbi
, 1);
1011 static int ra_data_block(struct inode
*inode
, pgoff_t index
)
1013 struct f2fs_sb_info
*sbi
= F2FS_I_SB(inode
);
1014 struct address_space
*mapping
= inode
->i_mapping
;
1015 struct dnode_of_data dn
;
1017 struct extent_info ei
= {0, 0, 0};
1018 struct f2fs_io_info fio
= {
1020 .ino
= inode
->i_ino
,
1025 .encrypted_page
= NULL
,
1031 page
= f2fs_grab_cache_page(mapping
, index
, true);
1035 if (f2fs_lookup_extent_cache(inode
, index
, &ei
)) {
1036 dn
.data_blkaddr
= ei
.blk
+ index
- ei
.fofs
;
1037 if (unlikely(!f2fs_is_valid_blkaddr(sbi
, dn
.data_blkaddr
,
1038 DATA_GENERIC_ENHANCE_READ
))) {
1039 err
= -EFSCORRUPTED
;
1045 set_new_dnode(&dn
, inode
, NULL
, NULL
, 0);
1046 err
= f2fs_get_dnode_of_data(&dn
, index
, LOOKUP_NODE
);
1049 f2fs_put_dnode(&dn
);
1051 if (!__is_valid_data_blkaddr(dn
.data_blkaddr
)) {
1055 if (unlikely(!f2fs_is_valid_blkaddr(sbi
, dn
.data_blkaddr
,
1056 DATA_GENERIC_ENHANCE
))) {
1057 err
= -EFSCORRUPTED
;
1063 fio
.new_blkaddr
= fio
.old_blkaddr
= dn
.data_blkaddr
;
1066 * don't cache encrypted data into meta inode until previous dirty
1067 * data were writebacked to avoid racing between GC and flush.
1069 f2fs_wait_on_page_writeback(page
, DATA
, true, true);
1071 f2fs_wait_on_block_writeback(inode
, dn
.data_blkaddr
);
1073 fio
.encrypted_page
= f2fs_pagecache_get_page(META_MAPPING(sbi
),
1075 FGP_LOCK
| FGP_CREAT
, GFP_NOFS
);
1076 if (!fio
.encrypted_page
) {
1081 err
= f2fs_submit_page_bio(&fio
);
1083 goto put_encrypted_page
;
1084 f2fs_put_page(fio
.encrypted_page
, 0);
1085 f2fs_put_page(page
, 1);
1087 f2fs_update_iostat(sbi
, FS_DATA_READ_IO
, F2FS_BLKSIZE
);
1088 f2fs_update_iostat(sbi
, FS_GDATA_READ_IO
, F2FS_BLKSIZE
);
1092 f2fs_put_page(fio
.encrypted_page
, 1);
1094 f2fs_put_page(page
, 1);
1099 * Move data block via META_MAPPING while keeping locked data page.
1100 * This can be used to move blocks, aka LBAs, directly on disk.
1102 static int move_data_block(struct inode
*inode
, block_t bidx
,
1103 int gc_type
, unsigned int segno
, int off
)
1105 struct f2fs_io_info fio
= {
1106 .sbi
= F2FS_I_SB(inode
),
1107 .ino
= inode
->i_ino
,
1112 .encrypted_page
= NULL
,
1116 struct dnode_of_data dn
;
1117 struct f2fs_summary sum
;
1118 struct node_info ni
;
1119 struct page
*page
, *mpage
;
1122 bool lfs_mode
= f2fs_lfs_mode(fio
.sbi
);
1123 int type
= fio
.sbi
->am
.atgc_enabled
?
1124 CURSEG_ALL_DATA_ATGC
: CURSEG_COLD_DATA
;
1126 /* do not read out */
1127 page
= f2fs_grab_cache_page(inode
->i_mapping
, bidx
, false);
1131 if (!check_valid_map(F2FS_I_SB(inode
), segno
, off
)) {
1136 if (f2fs_is_atomic_file(inode
)) {
1137 F2FS_I(inode
)->i_gc_failures
[GC_FAILURE_ATOMIC
]++;
1138 F2FS_I_SB(inode
)->skipped_atomic_files
[gc_type
]++;
1143 if (f2fs_is_pinned_file(inode
)) {
1144 f2fs_pin_file_control(inode
, true);
1149 set_new_dnode(&dn
, inode
, NULL
, NULL
, 0);
1150 err
= f2fs_get_dnode_of_data(&dn
, bidx
, LOOKUP_NODE
);
1154 if (unlikely(dn
.data_blkaddr
== NULL_ADDR
)) {
1155 ClearPageUptodate(page
);
1161 * don't cache encrypted data into meta inode until previous dirty
1162 * data were writebacked to avoid racing between GC and flush.
1164 f2fs_wait_on_page_writeback(page
, DATA
, true, true);
1166 f2fs_wait_on_block_writeback(inode
, dn
.data_blkaddr
);
1168 err
= f2fs_get_node_info(fio
.sbi
, dn
.nid
, &ni
);
1174 fio
.new_blkaddr
= fio
.old_blkaddr
= dn
.data_blkaddr
;
1177 down_write(&fio
.sbi
->io_order_lock
);
1179 mpage
= f2fs_grab_cache_page(META_MAPPING(fio
.sbi
),
1180 fio
.old_blkaddr
, false);
1186 fio
.encrypted_page
= mpage
;
1188 /* read source block in mpage */
1189 if (!PageUptodate(mpage
)) {
1190 err
= f2fs_submit_page_bio(&fio
);
1192 f2fs_put_page(mpage
, 1);
1196 f2fs_update_iostat(fio
.sbi
, FS_DATA_READ_IO
, F2FS_BLKSIZE
);
1197 f2fs_update_iostat(fio
.sbi
, FS_GDATA_READ_IO
, F2FS_BLKSIZE
);
1200 if (unlikely(mpage
->mapping
!= META_MAPPING(fio
.sbi
) ||
1201 !PageUptodate(mpage
))) {
1203 f2fs_put_page(mpage
, 1);
1208 set_summary(&sum
, dn
.nid
, dn
.ofs_in_node
, ni
.version
);
1210 /* allocate block address */
1211 f2fs_allocate_data_block(fio
.sbi
, NULL
, fio
.old_blkaddr
, &newaddr
,
1214 fio
.encrypted_page
= f2fs_pagecache_get_page(META_MAPPING(fio
.sbi
),
1215 newaddr
, FGP_LOCK
| FGP_CREAT
, GFP_NOFS
);
1216 if (!fio
.encrypted_page
) {
1218 f2fs_put_page(mpage
, 1);
1222 /* write target block */
1223 f2fs_wait_on_page_writeback(fio
.encrypted_page
, DATA
, true, true);
1224 memcpy(page_address(fio
.encrypted_page
),
1225 page_address(mpage
), PAGE_SIZE
);
1226 f2fs_put_page(mpage
, 1);
1227 invalidate_mapping_pages(META_MAPPING(fio
.sbi
),
1228 fio
.old_blkaddr
, fio
.old_blkaddr
);
1230 set_page_dirty(fio
.encrypted_page
);
1231 if (clear_page_dirty_for_io(fio
.encrypted_page
))
1232 dec_page_count(fio
.sbi
, F2FS_DIRTY_META
);
1234 set_page_writeback(fio
.encrypted_page
);
1235 ClearPageError(page
);
1237 fio
.op
= REQ_OP_WRITE
;
1238 fio
.op_flags
= REQ_SYNC
;
1239 fio
.new_blkaddr
= newaddr
;
1240 f2fs_submit_page_write(&fio
);
1243 if (PageWriteback(fio
.encrypted_page
))
1244 end_page_writeback(fio
.encrypted_page
);
1248 f2fs_update_iostat(fio
.sbi
, FS_GC_DATA_IO
, F2FS_BLKSIZE
);
1250 f2fs_update_data_blkaddr(&dn
, newaddr
);
1251 set_inode_flag(inode
, FI_APPEND_WRITE
);
1252 if (page
->index
== 0)
1253 set_inode_flag(inode
, FI_FIRST_BLOCK_WRITTEN
);
1255 f2fs_put_page(fio
.encrypted_page
, 1);
1258 f2fs_do_replace_block(fio
.sbi
, &sum
, newaddr
, fio
.old_blkaddr
,
1262 up_write(&fio
.sbi
->io_order_lock
);
1264 f2fs_put_dnode(&dn
);
1266 f2fs_put_page(page
, 1);
1270 static int move_data_page(struct inode
*inode
, block_t bidx
, int gc_type
,
1271 unsigned int segno
, int off
)
1276 page
= f2fs_get_lock_data_page(inode
, bidx
, true);
1278 return PTR_ERR(page
);
1280 if (!check_valid_map(F2FS_I_SB(inode
), segno
, off
)) {
1285 if (f2fs_is_atomic_file(inode
)) {
1286 F2FS_I(inode
)->i_gc_failures
[GC_FAILURE_ATOMIC
]++;
1287 F2FS_I_SB(inode
)->skipped_atomic_files
[gc_type
]++;
1291 if (f2fs_is_pinned_file(inode
)) {
1292 if (gc_type
== FG_GC
)
1293 f2fs_pin_file_control(inode
, true);
1298 if (gc_type
== BG_GC
) {
1299 if (PageWriteback(page
)) {
1303 set_page_dirty(page
);
1304 set_cold_data(page
);
1306 struct f2fs_io_info fio
= {
1307 .sbi
= F2FS_I_SB(inode
),
1308 .ino
= inode
->i_ino
,
1312 .op_flags
= REQ_SYNC
,
1313 .old_blkaddr
= NULL_ADDR
,
1315 .encrypted_page
= NULL
,
1316 .need_lock
= LOCK_REQ
,
1317 .io_type
= FS_GC_DATA_IO
,
1319 bool is_dirty
= PageDirty(page
);
1322 f2fs_wait_on_page_writeback(page
, DATA
, true, true);
1324 set_page_dirty(page
);
1325 if (clear_page_dirty_for_io(page
)) {
1326 inode_dec_dirty_pages(inode
);
1327 f2fs_remove_dirty_inode(inode
);
1330 set_cold_data(page
);
1332 err
= f2fs_do_write_data_page(&fio
);
1334 clear_cold_data(page
);
1335 if (err
== -ENOMEM
) {
1336 congestion_wait(BLK_RW_ASYNC
,
1337 DEFAULT_IO_TIMEOUT
);
1341 set_page_dirty(page
);
1345 f2fs_put_page(page
, 1);
1350 * This function tries to get parent node of victim data block, and identifies
1351 * data block validity. If the block is valid, copy that with cold status and
1352 * modify parent node.
1353 * If the parent node is not valid or the data block address is different,
1354 * the victim data block is ignored.
1356 static int gc_data_segment(struct f2fs_sb_info
*sbi
, struct f2fs_summary
*sum
,
1357 struct gc_inode_list
*gc_list
, unsigned int segno
, int gc_type
)
1359 struct super_block
*sb
= sbi
->sb
;
1360 struct f2fs_summary
*entry
;
1365 unsigned int usable_blks_in_seg
= f2fs_usable_blks_in_seg(sbi
, segno
);
1367 start_addr
= START_BLOCK(sbi
, segno
);
1372 for (off
= 0; off
< usable_blks_in_seg
; off
++, entry
++) {
1373 struct page
*data_page
;
1374 struct inode
*inode
;
1375 struct node_info dni
; /* dnode info for the data */
1376 unsigned int ofs_in_node
, nofs
;
1378 nid_t nid
= le32_to_cpu(entry
->nid
);
1381 * stop BG_GC if there is not enough free sections.
1382 * Or, stop GC if the segment becomes fully valid caused by
1383 * race condition along with SSR block allocation.
1385 if ((gc_type
== BG_GC
&& has_not_enough_free_secs(sbi
, 0, 0)) ||
1386 get_valid_blocks(sbi
, segno
, true) ==
1390 if (check_valid_map(sbi
, segno
, off
) == 0)
1394 f2fs_ra_meta_pages(sbi
, NAT_BLOCK_OFFSET(nid
), 1,
1400 f2fs_ra_node_page(sbi
, nid
);
1404 /* Get an inode by ino with checking validity */
1405 if (!is_alive(sbi
, entry
, &dni
, start_addr
+ off
, &nofs
))
1409 f2fs_ra_node_page(sbi
, dni
.ino
);
1413 ofs_in_node
= le16_to_cpu(entry
->ofs_in_node
);
1416 inode
= f2fs_iget(sb
, dni
.ino
);
1417 if (IS_ERR(inode
) || is_bad_inode(inode
)) {
1418 set_sbi_flag(sbi
, SBI_NEED_FSCK
);
1422 if (!down_write_trylock(
1423 &F2FS_I(inode
)->i_gc_rwsem
[WRITE
])) {
1425 sbi
->skipped_gc_rwsem
++;
1429 start_bidx
= f2fs_start_bidx_of_node(nofs
, inode
) +
1432 if (f2fs_post_read_required(inode
)) {
1433 int err
= ra_data_block(inode
, start_bidx
);
1435 up_write(&F2FS_I(inode
)->i_gc_rwsem
[WRITE
]);
1440 add_gc_inode(gc_list
, inode
);
1444 data_page
= f2fs_get_read_data_page(inode
,
1445 start_bidx
, REQ_RAHEAD
, true);
1446 up_write(&F2FS_I(inode
)->i_gc_rwsem
[WRITE
]);
1447 if (IS_ERR(data_page
)) {
1452 f2fs_put_page(data_page
, 0);
1453 add_gc_inode(gc_list
, inode
);
1458 inode
= find_gc_inode(gc_list
, dni
.ino
);
1460 struct f2fs_inode_info
*fi
= F2FS_I(inode
);
1461 bool locked
= false;
1464 if (S_ISREG(inode
->i_mode
)) {
1465 if (!down_write_trylock(&fi
->i_gc_rwsem
[READ
]))
1467 if (!down_write_trylock(
1468 &fi
->i_gc_rwsem
[WRITE
])) {
1469 sbi
->skipped_gc_rwsem
++;
1470 up_write(&fi
->i_gc_rwsem
[READ
]);
1475 /* wait for all inflight aio data */
1476 inode_dio_wait(inode
);
1479 start_bidx
= f2fs_start_bidx_of_node(nofs
, inode
)
1481 if (f2fs_post_read_required(inode
))
1482 err
= move_data_block(inode
, start_bidx
,
1483 gc_type
, segno
, off
);
1485 err
= move_data_page(inode
, start_bidx
, gc_type
,
1488 if (!err
&& (gc_type
== FG_GC
||
1489 f2fs_post_read_required(inode
)))
1493 up_write(&fi
->i_gc_rwsem
[WRITE
]);
1494 up_write(&fi
->i_gc_rwsem
[READ
]);
1497 stat_inc_data_blk_count(sbi
, 1, gc_type
);
1507 static int __get_victim(struct f2fs_sb_info
*sbi
, unsigned int *victim
,
1510 struct sit_info
*sit_i
= SIT_I(sbi
);
1513 down_write(&sit_i
->sentry_lock
);
1514 ret
= DIRTY_I(sbi
)->v_ops
->get_victim(sbi
, victim
, gc_type
,
1515 NO_CHECK_TYPE
, LFS
, 0);
1516 up_write(&sit_i
->sentry_lock
);
1520 static int do_garbage_collect(struct f2fs_sb_info
*sbi
,
1521 unsigned int start_segno
,
1522 struct gc_inode_list
*gc_list
, int gc_type
)
1524 struct page
*sum_page
;
1525 struct f2fs_summary_block
*sum
;
1526 struct blk_plug plug
;
1527 unsigned int segno
= start_segno
;
1528 unsigned int end_segno
= start_segno
+ sbi
->segs_per_sec
;
1529 int seg_freed
= 0, migrated
= 0;
1530 unsigned char type
= IS_DATASEG(get_seg_entry(sbi
, segno
)->type
) ?
1531 SUM_TYPE_DATA
: SUM_TYPE_NODE
;
1534 if (__is_large_section(sbi
))
1535 end_segno
= rounddown(end_segno
, sbi
->segs_per_sec
);
1538 * zone-capacity can be less than zone-size in zoned devices,
1539 * resulting in less than expected usable segments in the zone,
1540 * calculate the end segno in the zone which can be garbage collected
1542 if (f2fs_sb_has_blkzoned(sbi
))
1543 end_segno
-= sbi
->segs_per_sec
-
1544 f2fs_usable_segs_in_sec(sbi
, segno
);
1546 sanity_check_seg_type(sbi
, get_seg_entry(sbi
, segno
)->type
);
1548 /* readahead multi ssa blocks those have contiguous address */
1549 if (__is_large_section(sbi
))
1550 f2fs_ra_meta_pages(sbi
, GET_SUM_BLOCK(sbi
, segno
),
1551 end_segno
- segno
, META_SSA
, true);
1553 /* reference all summary page */
1554 while (segno
< end_segno
) {
1555 sum_page
= f2fs_get_sum_page(sbi
, segno
++);
1556 if (IS_ERR(sum_page
)) {
1557 int err
= PTR_ERR(sum_page
);
1559 end_segno
= segno
- 1;
1560 for (segno
= start_segno
; segno
< end_segno
; segno
++) {
1561 sum_page
= find_get_page(META_MAPPING(sbi
),
1562 GET_SUM_BLOCK(sbi
, segno
));
1563 f2fs_put_page(sum_page
, 0);
1564 f2fs_put_page(sum_page
, 0);
1568 unlock_page(sum_page
);
1571 blk_start_plug(&plug
);
1573 for (segno
= start_segno
; segno
< end_segno
; segno
++) {
1575 /* find segment summary of victim */
1576 sum_page
= find_get_page(META_MAPPING(sbi
),
1577 GET_SUM_BLOCK(sbi
, segno
));
1578 f2fs_put_page(sum_page
, 0);
1580 if (get_valid_blocks(sbi
, segno
, false) == 0)
1582 if (gc_type
== BG_GC
&& __is_large_section(sbi
) &&
1583 migrated
>= sbi
->migration_granularity
)
1585 if (!PageUptodate(sum_page
) || unlikely(f2fs_cp_error(sbi
)))
1588 sum
= page_address(sum_page
);
1589 if (type
!= GET_SUM_TYPE((&sum
->footer
))) {
1590 f2fs_err(sbi
, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
1591 segno
, type
, GET_SUM_TYPE((&sum
->footer
)));
1592 set_sbi_flag(sbi
, SBI_NEED_FSCK
);
1593 f2fs_stop_checkpoint(sbi
, false);
1598 * this is to avoid deadlock:
1599 * - lock_page(sum_page) - f2fs_replace_block
1600 * - check_valid_map() - down_write(sentry_lock)
1601 * - down_read(sentry_lock) - change_curseg()
1602 * - lock_page(sum_page)
1604 if (type
== SUM_TYPE_NODE
)
1605 submitted
+= gc_node_segment(sbi
, sum
->entries
, segno
,
1608 submitted
+= gc_data_segment(sbi
, sum
->entries
, gc_list
,
1611 stat_inc_seg_count(sbi
, type
, gc_type
);
1615 if (gc_type
== FG_GC
&&
1616 get_valid_blocks(sbi
, segno
, false) == 0)
1619 if (__is_large_section(sbi
) && segno
+ 1 < end_segno
)
1620 sbi
->next_victim_seg
[gc_type
] = segno
+ 1;
1622 f2fs_put_page(sum_page
, 0);
1626 f2fs_submit_merged_write(sbi
,
1627 (type
== SUM_TYPE_NODE
) ? NODE
: DATA
);
1629 blk_finish_plug(&plug
);
1631 stat_inc_call_count(sbi
->stat_info
);
1636 int f2fs_gc(struct f2fs_sb_info
*sbi
, bool sync
,
1637 bool background
, unsigned int segno
)
1639 int gc_type
= sync
? FG_GC
: BG_GC
;
1640 int sec_freed
= 0, seg_freed
= 0, total_freed
= 0;
1642 struct cp_control cpc
;
1643 unsigned int init_segno
= segno
;
1644 struct gc_inode_list gc_list
= {
1645 .ilist
= LIST_HEAD_INIT(gc_list
.ilist
),
1646 .iroot
= RADIX_TREE_INIT(gc_list
.iroot
, GFP_NOFS
),
1648 unsigned long long last_skipped
= sbi
->skipped_atomic_files
[FG_GC
];
1649 unsigned long long first_skipped
;
1650 unsigned int skipped_round
= 0, round
= 0;
1652 trace_f2fs_gc_begin(sbi
->sb
, sync
, background
,
1653 get_pages(sbi
, F2FS_DIRTY_NODES
),
1654 get_pages(sbi
, F2FS_DIRTY_DENTS
),
1655 get_pages(sbi
, F2FS_DIRTY_IMETA
),
1658 reserved_segments(sbi
),
1659 prefree_segments(sbi
));
1661 cpc
.reason
= __get_cp_reason(sbi
);
1662 sbi
->skipped_gc_rwsem
= 0;
1663 first_skipped
= last_skipped
;
1665 if (unlikely(!(sbi
->sb
->s_flags
& SB_ACTIVE
))) {
1669 if (unlikely(f2fs_cp_error(sbi
))) {
1674 if (gc_type
== BG_GC
&& has_not_enough_free_secs(sbi
, 0, 0)) {
1676 * For example, if there are many prefree_segments below given
1677 * threshold, we can make them free by checkpoint. Then, we
1678 * secure free segments which doesn't need fggc any more.
1680 if (prefree_segments(sbi
) &&
1681 !is_sbi_flag_set(sbi
, SBI_CP_DISABLED
)) {
1682 ret
= f2fs_write_checkpoint(sbi
, &cpc
);
1686 if (has_not_enough_free_secs(sbi
, 0, 0))
1690 /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
1691 if (gc_type
== BG_GC
&& !background
) {
1695 ret
= __get_victim(sbi
, &segno
, gc_type
);
1699 seg_freed
= do_garbage_collect(sbi
, segno
, &gc_list
, gc_type
);
1700 if (gc_type
== FG_GC
&&
1701 seg_freed
== f2fs_usable_segs_in_sec(sbi
, segno
))
1703 total_freed
+= seg_freed
;
1705 if (gc_type
== FG_GC
) {
1706 if (sbi
->skipped_atomic_files
[FG_GC
] > last_skipped
||
1707 sbi
->skipped_gc_rwsem
)
1709 last_skipped
= sbi
->skipped_atomic_files
[FG_GC
];
1713 if (gc_type
== FG_GC
&& seg_freed
)
1714 sbi
->cur_victim_sec
= NULL_SEGNO
;
1719 if (has_not_enough_free_secs(sbi
, sec_freed
, 0)) {
1720 if (skipped_round
<= MAX_SKIP_GC_COUNT
||
1721 skipped_round
* 2 < round
) {
1726 if (first_skipped
< last_skipped
&&
1727 (last_skipped
- first_skipped
) >
1728 sbi
->skipped_gc_rwsem
) {
1729 f2fs_drop_inmem_pages_all(sbi
, true);
1733 if (gc_type
== FG_GC
&& !is_sbi_flag_set(sbi
, SBI_CP_DISABLED
))
1734 ret
= f2fs_write_checkpoint(sbi
, &cpc
);
1737 SIT_I(sbi
)->last_victim
[ALLOC_NEXT
] = 0;
1738 SIT_I(sbi
)->last_victim
[FLUSH_DEVICE
] = init_segno
;
1740 trace_f2fs_gc_end(sbi
->sb
, ret
, total_freed
, sec_freed
,
1741 get_pages(sbi
, F2FS_DIRTY_NODES
),
1742 get_pages(sbi
, F2FS_DIRTY_DENTS
),
1743 get_pages(sbi
, F2FS_DIRTY_IMETA
),
1746 reserved_segments(sbi
),
1747 prefree_segments(sbi
));
1749 up_write(&sbi
->gc_lock
);
1751 put_gc_inode(&gc_list
);
1754 ret
= sec_freed
? 0 : -EAGAIN
;
1758 int __init
f2fs_create_garbage_collection_cache(void)
1760 victim_entry_slab
= f2fs_kmem_cache_create("f2fs_victim_entry",
1761 sizeof(struct victim_entry
));
1762 if (!victim_entry_slab
)
1767 void f2fs_destroy_garbage_collection_cache(void)
1769 kmem_cache_destroy(victim_entry_slab
);
1772 static void init_atgc_management(struct f2fs_sb_info
*sbi
)
1774 struct atgc_management
*am
= &sbi
->am
;
1776 if (test_opt(sbi
, ATGC
) &&
1777 SIT_I(sbi
)->elapsed_time
>= DEF_GC_THREAD_AGE_THRESHOLD
)
1778 am
->atgc_enabled
= true;
1780 am
->root
= RB_ROOT_CACHED
;
1781 INIT_LIST_HEAD(&am
->victim_list
);
1782 am
->victim_count
= 0;
1784 am
->candidate_ratio
= DEF_GC_THREAD_CANDIDATE_RATIO
;
1785 am
->max_candidate_count
= DEF_GC_THREAD_MAX_CANDIDATE_COUNT
;
1786 am
->age_weight
= DEF_GC_THREAD_AGE_WEIGHT
;
1789 void f2fs_build_gc_manager(struct f2fs_sb_info
*sbi
)
1791 DIRTY_I(sbi
)->v_ops
= &default_v_ops
;
1793 sbi
->gc_pin_file_threshold
= DEF_GC_FAILED_PINNED_FILES
;
1795 /* give warm/cold data area from slower device */
1796 if (f2fs_is_multi_device(sbi
) && !__is_large_section(sbi
))
1797 SIT_I(sbi
)->last_victim
[ALLOC_NEXT
] =
1798 GET_SEGNO(sbi
, FDEV(0).end_blk
) + 1;
1800 init_atgc_management(sbi
);
1803 static int free_segment_range(struct f2fs_sb_info
*sbi
,
1804 unsigned int secs
, bool gc_only
)
1806 unsigned int segno
, next_inuse
, start
, end
;
1807 struct cp_control cpc
= { CP_RESIZE
, 0, 0, 0 };
1808 int gc_mode
, gc_type
;
1812 /* Force block allocation for GC */
1813 MAIN_SECS(sbi
) -= secs
;
1814 start
= MAIN_SECS(sbi
) * sbi
->segs_per_sec
;
1815 end
= MAIN_SEGS(sbi
) - 1;
1817 mutex_lock(&DIRTY_I(sbi
)->seglist_lock
);
1818 for (gc_mode
= 0; gc_mode
< MAX_GC_POLICY
; gc_mode
++)
1819 if (SIT_I(sbi
)->last_victim
[gc_mode
] >= start
)
1820 SIT_I(sbi
)->last_victim
[gc_mode
] = 0;
1822 for (gc_type
= BG_GC
; gc_type
<= FG_GC
; gc_type
++)
1823 if (sbi
->next_victim_seg
[gc_type
] >= start
)
1824 sbi
->next_victim_seg
[gc_type
] = NULL_SEGNO
;
1825 mutex_unlock(&DIRTY_I(sbi
)->seglist_lock
);
1827 /* Move out cursegs from the target range */
1828 for (type
= CURSEG_HOT_DATA
; type
< NR_CURSEG_PERSIST_TYPE
; type
++)
1829 f2fs_allocate_segment_for_resize(sbi
, type
, start
, end
);
1831 /* do GC to move out valid blocks in the range */
1832 for (segno
= start
; segno
<= end
; segno
+= sbi
->segs_per_sec
) {
1833 struct gc_inode_list gc_list
= {
1834 .ilist
= LIST_HEAD_INIT(gc_list
.ilist
),
1835 .iroot
= RADIX_TREE_INIT(gc_list
.iroot
, GFP_NOFS
),
1838 do_garbage_collect(sbi
, segno
, &gc_list
, FG_GC
);
1839 put_gc_inode(&gc_list
);
1841 if (!gc_only
&& get_valid_blocks(sbi
, segno
, true)) {
1845 if (fatal_signal_pending(current
)) {
1853 err
= f2fs_write_checkpoint(sbi
, &cpc
);
1857 next_inuse
= find_next_inuse(FREE_I(sbi
), end
+ 1, start
);
1858 if (next_inuse
<= end
) {
1859 f2fs_err(sbi
, "segno %u should be free but still inuse!",
1861 f2fs_bug_on(sbi
, 1);
1864 MAIN_SECS(sbi
) += secs
;
1868 static void update_sb_metadata(struct f2fs_sb_info
*sbi
, int secs
)
1870 struct f2fs_super_block
*raw_sb
= F2FS_RAW_SUPER(sbi
);
1873 int segment_count_main
;
1874 long long block_count
;
1875 int segs
= secs
* sbi
->segs_per_sec
;
1877 down_write(&sbi
->sb_lock
);
1879 section_count
= le32_to_cpu(raw_sb
->section_count
);
1880 segment_count
= le32_to_cpu(raw_sb
->segment_count
);
1881 segment_count_main
= le32_to_cpu(raw_sb
->segment_count_main
);
1882 block_count
= le64_to_cpu(raw_sb
->block_count
);
1884 raw_sb
->section_count
= cpu_to_le32(section_count
+ secs
);
1885 raw_sb
->segment_count
= cpu_to_le32(segment_count
+ segs
);
1886 raw_sb
->segment_count_main
= cpu_to_le32(segment_count_main
+ segs
);
1887 raw_sb
->block_count
= cpu_to_le64(block_count
+
1888 (long long)segs
* sbi
->blocks_per_seg
);
1889 if (f2fs_is_multi_device(sbi
)) {
1890 int last_dev
= sbi
->s_ndevs
- 1;
1892 le32_to_cpu(raw_sb
->devs
[last_dev
].total_segments
);
1894 raw_sb
->devs
[last_dev
].total_segments
=
1895 cpu_to_le32(dev_segs
+ segs
);
1898 up_write(&sbi
->sb_lock
);
1901 static void update_fs_metadata(struct f2fs_sb_info
*sbi
, int secs
)
1903 int segs
= secs
* sbi
->segs_per_sec
;
1904 long long blks
= (long long)segs
* sbi
->blocks_per_seg
;
1905 long long user_block_count
=
1906 le64_to_cpu(F2FS_CKPT(sbi
)->user_block_count
);
1908 SM_I(sbi
)->segment_count
= (int)SM_I(sbi
)->segment_count
+ segs
;
1909 MAIN_SEGS(sbi
) = (int)MAIN_SEGS(sbi
) + segs
;
1910 MAIN_SECS(sbi
) += secs
;
1911 FREE_I(sbi
)->free_sections
= (int)FREE_I(sbi
)->free_sections
+ secs
;
1912 FREE_I(sbi
)->free_segments
= (int)FREE_I(sbi
)->free_segments
+ segs
;
1913 F2FS_CKPT(sbi
)->user_block_count
= cpu_to_le64(user_block_count
+ blks
);
1915 if (f2fs_is_multi_device(sbi
)) {
1916 int last_dev
= sbi
->s_ndevs
- 1;
1918 FDEV(last_dev
).total_segments
=
1919 (int)FDEV(last_dev
).total_segments
+ segs
;
1920 FDEV(last_dev
).end_blk
=
1921 (long long)FDEV(last_dev
).end_blk
+ blks
;
1922 #ifdef CONFIG_BLK_DEV_ZONED
1923 FDEV(last_dev
).nr_blkz
= (int)FDEV(last_dev
).nr_blkz
+
1924 (int)(blks
>> sbi
->log_blocks_per_blkz
);
1929 int f2fs_resize_fs(struct f2fs_sb_info
*sbi
, __u64 block_count
)
1931 __u64 old_block_count
, shrunk_blocks
;
1932 struct cp_control cpc
= { CP_RESIZE
, 0, 0, 0 };
1937 old_block_count
= le64_to_cpu(F2FS_RAW_SUPER(sbi
)->block_count
);
1938 if (block_count
> old_block_count
)
1941 if (f2fs_is_multi_device(sbi
)) {
1942 int last_dev
= sbi
->s_ndevs
- 1;
1943 __u64 last_segs
= FDEV(last_dev
).total_segments
;
1945 if (block_count
+ last_segs
* sbi
->blocks_per_seg
<=
1950 /* new fs size should align to section size */
1951 div_u64_rem(block_count
, BLKS_PER_SEC(sbi
), &rem
);
1955 if (block_count
== old_block_count
)
1958 if (is_sbi_flag_set(sbi
, SBI_NEED_FSCK
)) {
1959 f2fs_err(sbi
, "Should run fsck to repair first.");
1960 return -EFSCORRUPTED
;
1963 if (test_opt(sbi
, DISABLE_CHECKPOINT
)) {
1964 f2fs_err(sbi
, "Checkpoint should be enabled.");
1968 shrunk_blocks
= old_block_count
- block_count
;
1969 secs
= div_u64(shrunk_blocks
, BLKS_PER_SEC(sbi
));
1972 if (!down_write_trylock(&sbi
->gc_lock
))
1975 /* stop CP to protect MAIN_SEC in free_segment_range */
1977 err
= free_segment_range(sbi
, secs
, true);
1978 f2fs_unlock_op(sbi
);
1979 up_write(&sbi
->gc_lock
);
1983 set_sbi_flag(sbi
, SBI_IS_RESIZEFS
);
1985 freeze_super(sbi
->sb
);
1986 down_write(&sbi
->gc_lock
);
1987 down_write(&sbi
->cp_global_sem
);
1989 spin_lock(&sbi
->stat_lock
);
1990 if (shrunk_blocks
+ valid_user_blocks(sbi
) +
1991 sbi
->current_reserved_blocks
+ sbi
->unusable_block_count
+
1992 F2FS_OPTION(sbi
).root_reserved_blocks
> sbi
->user_block_count
)
1995 sbi
->user_block_count
-= shrunk_blocks
;
1996 spin_unlock(&sbi
->stat_lock
);
2000 err
= free_segment_range(sbi
, secs
, false);
2004 update_sb_metadata(sbi
, -secs
);
2006 err
= f2fs_commit_super(sbi
, false);
2008 update_sb_metadata(sbi
, secs
);
2012 update_fs_metadata(sbi
, -secs
);
2013 clear_sbi_flag(sbi
, SBI_IS_RESIZEFS
);
2014 set_sbi_flag(sbi
, SBI_IS_DIRTY
);
2016 err
= f2fs_write_checkpoint(sbi
, &cpc
);
2018 update_fs_metadata(sbi
, secs
);
2019 update_sb_metadata(sbi
, secs
);
2020 f2fs_commit_super(sbi
, false);
2024 set_sbi_flag(sbi
, SBI_NEED_FSCK
);
2025 f2fs_err(sbi
, "resize_fs failed, should run fsck to repair!");
2027 spin_lock(&sbi
->stat_lock
);
2028 sbi
->user_block_count
+= shrunk_blocks
;
2029 spin_unlock(&sbi
->stat_lock
);
2032 up_write(&sbi
->cp_global_sem
);
2033 up_write(&sbi
->gc_lock
);
2034 thaw_super(sbi
->sb
);
2035 clear_sbi_flag(sbi
, SBI_IS_RESIZEFS
);