]>
Commit | Line | Data |
---|---|---|
b0643e59 DZ |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include <linux/jiffies.h> | |
4 | #include <linux/kernel.h> | |
5 | #include <linux/ktime.h> | |
6 | #include <linux/list.h> | |
e93591bb | 7 | #include <linux/math64.h> |
b0643e59 DZ |
8 | #include <linux/sizes.h> |
9 | #include <linux/workqueue.h> | |
10 | #include "ctree.h" | |
11 | #include "block-group.h" | |
12 | #include "discard.h" | |
13 | #include "free-space-cache.h" | |
14 | ||
dbc2a8c9 DZ |
15 | /* |
16 | * This contains the logic to handle async discard. | |
17 | * | |
18 | * Async discard manages trimming of free space outside of transaction commit. | |
19 | * Discarding is done by managing the block_groups on a LRU list based on free | |
20 | * space recency. Two passes are used to first prioritize discarding extents | |
21 | * and then allow for trimming in the bitmap the best opportunity to coalesce. | |
22 | * The block_groups are maintained on multiple lists to allow for multiple | |
23 | * passes with different discard filter requirements. A delayed work item is | |
24 | * used to manage discarding with timeout determined by a max of the delay | |
25 | * incurred by the iops rate limit, the byte rate limit, and the max delay of | |
26 | * BTRFS_DISCARD_MAX_DELAY_MSEC. | |
27 | * | |
28 | * Note, this only keeps track of block_groups that are explicitly for data. | |
29 | * Mixed block_groups are not supported. | |
30 | * | |
31 | * The first list is special to manage discarding of fully free block groups. | |
32 | * This is necessary because we issue a final trim for a full free block group | |
33 | * after forgetting it. When a block group becomes unused, instead of directly | |
34 | * being added to the unused_bgs list, we add it to this first list. Then | |
35 | * from there, if it becomes fully discarded, we place it onto the unused_bgs | |
36 | * list. | |
37 | * | |
38 | * The in-memory free space cache serves as the backing state for discard. | |
39 | * Consequently this means there is no persistence. We opt to load all the | |
40 | * block groups in as not discarded, so the mount case degenerates to the | |
41 | * crashing case. | |
42 | * | |
43 | * As the free space cache uses bitmaps, there exists a tradeoff between | |
44 | * ease/efficiency for find_free_extent() and the accuracy of discard state. | |
45 | * Here we opt to let untrimmed regions merge with everything while only letting | |
46 | * trimmed regions merge with other trimmed regions. This can cause | |
47 | * overtrimming, but the coalescing benefit seems to be worth it. Additionally, | |
48 | * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, | |
49 | * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, | |
50 | * this resets the state and we will retry trimming the whole bitmap. This is a | |
51 | * tradeoff between discard state accuracy and the cost of accounting. | |
52 | */ | |
53 | ||
b0643e59 DZ |
54 | /* This is an initial delay to give some chance for block reuse */ |
55 | #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) | |
6e80d4f8 | 56 | #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) |
b0643e59 | 57 | |
a2309300 DZ |
58 | /* Target completion latency of discarding all discardable extents */ |
59 | #define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC) | |
60 | #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) | |
61 | #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) | |
62 | #define BTRFS_DISCARD_MAX_IOPS (10U) | |
63 | ||
7fe6d45e DZ |
64 | /* Montonically decreasing minimum length filters after index 0 */ |
65 | static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { | |
66 | 0, | |
67 | BTRFS_ASYNC_DISCARD_MAX_FILTER, | |
68 | BTRFS_ASYNC_DISCARD_MIN_FILTER | |
69 | }; | |
70 | ||
b0643e59 DZ |
71 | static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, |
72 | struct btrfs_block_group *block_group) | |
73 | { | |
74 | return &discard_ctl->discard_list[block_group->discard_index]; | |
75 | } | |
76 | ||
2bee7eb8 DZ |
77 | static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, |
78 | struct btrfs_block_group *block_group) | |
b0643e59 | 79 | { |
2bee7eb8 | 80 | if (!btrfs_run_discard_work(discard_ctl)) |
b0643e59 | 81 | return; |
b0643e59 | 82 | |
6e80d4f8 DZ |
83 | if (list_empty(&block_group->discard_list) || |
84 | block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { | |
85 | if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) | |
86 | block_group->discard_index = BTRFS_DISCARD_INDEX_START; | |
b0643e59 DZ |
87 | block_group->discard_eligible_time = (ktime_get_ns() + |
88 | BTRFS_DISCARD_DELAY); | |
2bee7eb8 | 89 | block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; |
6e80d4f8 | 90 | } |
b0643e59 DZ |
91 | |
92 | list_move_tail(&block_group->discard_list, | |
93 | get_discard_list(discard_ctl, block_group)); | |
2bee7eb8 | 94 | } |
b0643e59 | 95 | |
2bee7eb8 DZ |
96 | static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, |
97 | struct btrfs_block_group *block_group) | |
98 | { | |
5cb0724e DZ |
99 | if (!btrfs_is_block_group_data_only(block_group)) |
100 | return; | |
101 | ||
2bee7eb8 DZ |
102 | spin_lock(&discard_ctl->lock); |
103 | __add_to_discard_list(discard_ctl, block_group); | |
b0643e59 DZ |
104 | spin_unlock(&discard_ctl->lock); |
105 | } | |
106 | ||
6e80d4f8 DZ |
107 | static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, |
108 | struct btrfs_block_group *block_group) | |
109 | { | |
110 | spin_lock(&discard_ctl->lock); | |
111 | ||
112 | if (!btrfs_run_discard_work(discard_ctl)) { | |
113 | spin_unlock(&discard_ctl->lock); | |
114 | return; | |
115 | } | |
116 | ||
117 | list_del_init(&block_group->discard_list); | |
118 | ||
119 | block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED; | |
120 | block_group->discard_eligible_time = (ktime_get_ns() + | |
121 | BTRFS_DISCARD_UNUSED_DELAY); | |
2bee7eb8 | 122 | block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; |
6e80d4f8 DZ |
123 | list_add_tail(&block_group->discard_list, |
124 | &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); | |
125 | ||
126 | spin_unlock(&discard_ctl->lock); | |
127 | } | |
128 | ||
b0643e59 DZ |
129 | static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, |
130 | struct btrfs_block_group *block_group) | |
131 | { | |
132 | bool running = false; | |
133 | ||
134 | spin_lock(&discard_ctl->lock); | |
135 | ||
136 | if (block_group == discard_ctl->block_group) { | |
137 | running = true; | |
138 | discard_ctl->block_group = NULL; | |
139 | } | |
140 | ||
141 | block_group->discard_eligible_time = 0; | |
142 | list_del_init(&block_group->discard_list); | |
143 | ||
144 | spin_unlock(&discard_ctl->lock); | |
145 | ||
146 | return running; | |
147 | } | |
148 | ||
149 | /** | |
150 | * find_next_block_group - find block_group that's up next for discarding | |
151 | * @discard_ctl: discard control | |
152 | * @now: current time | |
153 | * | |
154 | * Iterate over the discard lists to find the next block_group up for | |
155 | * discarding checking the discard_eligible_time of block_group. | |
156 | */ | |
157 | static struct btrfs_block_group *find_next_block_group( | |
158 | struct btrfs_discard_ctl *discard_ctl, | |
159 | u64 now) | |
160 | { | |
161 | struct btrfs_block_group *ret_block_group = NULL, *block_group; | |
162 | int i; | |
163 | ||
164 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { | |
165 | struct list_head *discard_list = &discard_ctl->discard_list[i]; | |
166 | ||
167 | if (!list_empty(discard_list)) { | |
168 | block_group = list_first_entry(discard_list, | |
169 | struct btrfs_block_group, | |
170 | discard_list); | |
171 | ||
172 | if (!ret_block_group) | |
173 | ret_block_group = block_group; | |
174 | ||
175 | if (ret_block_group->discard_eligible_time < now) | |
176 | break; | |
177 | ||
178 | if (ret_block_group->discard_eligible_time > | |
179 | block_group->discard_eligible_time) | |
180 | ret_block_group = block_group; | |
181 | } | |
182 | } | |
183 | ||
184 | return ret_block_group; | |
185 | } | |
186 | ||
187 | /** | |
188 | * peek_discard_list - wrap find_next_block_group() | |
189 | * @discard_ctl: discard control | |
2bee7eb8 | 190 | * @discard_state: the discard_state of the block_group after state management |
7fe6d45e | 191 | * @discard_index: the discard_index of the block_group after state management |
b0643e59 DZ |
192 | * |
193 | * This wraps find_next_block_group() and sets the block_group to be in use. | |
2bee7eb8 | 194 | * discard_state's control flow is managed here. Variables related to |
7fe6d45e DZ |
195 | * discard_state are reset here as needed (eg discard_cursor). @discard_state |
196 | * and @discard_index are remembered as it may change while we're discarding, | |
197 | * but we want the discard to execute in the context determined here. | |
b0643e59 DZ |
198 | */ |
199 | static struct btrfs_block_group *peek_discard_list( | |
2bee7eb8 | 200 | struct btrfs_discard_ctl *discard_ctl, |
7fe6d45e DZ |
201 | enum btrfs_discard_state *discard_state, |
202 | int *discard_index) | |
b0643e59 DZ |
203 | { |
204 | struct btrfs_block_group *block_group; | |
205 | const u64 now = ktime_get_ns(); | |
206 | ||
207 | spin_lock(&discard_ctl->lock); | |
2bee7eb8 | 208 | again: |
b0643e59 DZ |
209 | block_group = find_next_block_group(discard_ctl, now); |
210 | ||
2bee7eb8 DZ |
211 | if (block_group && now > block_group->discard_eligible_time) { |
212 | if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && | |
213 | block_group->used != 0) { | |
5cb0724e DZ |
214 | if (btrfs_is_block_group_data_only(block_group)) |
215 | __add_to_discard_list(discard_ctl, block_group); | |
216 | else | |
217 | list_del_init(&block_group->discard_list); | |
2bee7eb8 DZ |
218 | goto again; |
219 | } | |
220 | if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { | |
221 | block_group->discard_cursor = block_group->start; | |
222 | block_group->discard_state = BTRFS_DISCARD_EXTENTS; | |
223 | } | |
224 | discard_ctl->block_group = block_group; | |
225 | *discard_state = block_group->discard_state; | |
7fe6d45e | 226 | *discard_index = block_group->discard_index; |
2bee7eb8 | 227 | } else { |
b0643e59 | 228 | block_group = NULL; |
2bee7eb8 | 229 | } |
b0643e59 DZ |
230 | |
231 | spin_unlock(&discard_ctl->lock); | |
232 | ||
233 | return block_group; | |
234 | } | |
235 | ||
7fe6d45e DZ |
236 | /** |
237 | * btrfs_discard_check_filter - updates a block groups filters | |
238 | * @block_group: block group of interest | |
239 | * @bytes: recently freed region size after coalescing | |
240 | * | |
241 | * Async discard maintains multiple lists with progressively smaller filters | |
242 | * to prioritize discarding based on size. Should a free space that matches | |
243 | * a larger filter be returned to the free_space_cache, prioritize that discard | |
244 | * by moving @block_group to the proper filter. | |
245 | */ | |
246 | void btrfs_discard_check_filter(struct btrfs_block_group *block_group, | |
247 | u64 bytes) | |
248 | { | |
249 | struct btrfs_discard_ctl *discard_ctl; | |
250 | ||
251 | if (!block_group || | |
252 | !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) | |
253 | return; | |
254 | ||
255 | discard_ctl = &block_group->fs_info->discard_ctl; | |
256 | ||
257 | if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && | |
258 | bytes >= discard_minlen[block_group->discard_index - 1]) { | |
259 | int i; | |
260 | ||
261 | remove_from_discard_list(discard_ctl, block_group); | |
262 | ||
263 | for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; | |
264 | i++) { | |
265 | if (bytes >= discard_minlen[i]) { | |
266 | block_group->discard_index = i; | |
267 | add_to_discard_list(discard_ctl, block_group); | |
268 | break; | |
269 | } | |
270 | } | |
271 | } | |
272 | } | |
273 | ||
274 | /** | |
275 | * btrfs_update_discard_index - moves a block group along the discard lists | |
276 | * @discard_ctl: discard control | |
277 | * @block_group: block_group of interest | |
278 | * | |
279 | * Increment @block_group's discard_index. If it falls of the list, let it be. | |
280 | * Otherwise add it back to the appropriate list. | |
281 | */ | |
282 | static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, | |
283 | struct btrfs_block_group *block_group) | |
284 | { | |
285 | block_group->discard_index++; | |
286 | if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { | |
287 | block_group->discard_index = 1; | |
288 | return; | |
289 | } | |
290 | ||
291 | add_to_discard_list(discard_ctl, block_group); | |
292 | } | |
293 | ||
b0643e59 DZ |
294 | /** |
295 | * btrfs_discard_cancel_work - remove a block_group from the discard lists | |
296 | * @discard_ctl: discard control | |
297 | * @block_group: block_group of interest | |
298 | * | |
299 | * This removes @block_group from the discard lists. If necessary, it waits on | |
300 | * the current work and then reschedules the delayed work. | |
301 | */ | |
302 | void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, | |
303 | struct btrfs_block_group *block_group) | |
304 | { | |
305 | if (remove_from_discard_list(discard_ctl, block_group)) { | |
306 | cancel_delayed_work_sync(&discard_ctl->work); | |
307 | btrfs_discard_schedule_work(discard_ctl, true); | |
308 | } | |
309 | } | |
310 | ||
311 | /** | |
312 | * btrfs_discard_queue_work - handles queuing the block_groups | |
313 | * @discard_ctl: discard control | |
314 | * @block_group: block_group of interest | |
315 | * | |
316 | * This maintains the LRU order of the discard lists. | |
317 | */ | |
318 | void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, | |
319 | struct btrfs_block_group *block_group) | |
320 | { | |
321 | if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) | |
322 | return; | |
323 | ||
6e80d4f8 DZ |
324 | if (block_group->used == 0) |
325 | add_to_discard_unused_list(discard_ctl, block_group); | |
326 | else | |
327 | add_to_discard_list(discard_ctl, block_group); | |
b0643e59 DZ |
328 | |
329 | if (!delayed_work_pending(&discard_ctl->work)) | |
330 | btrfs_discard_schedule_work(discard_ctl, false); | |
331 | } | |
332 | ||
333 | /** | |
334 | * btrfs_discard_schedule_work - responsible for scheduling the discard work | |
335 | * @discard_ctl: discard control | |
336 | * @override: override the current timer | |
337 | * | |
338 | * Discards are issued by a delayed workqueue item. @override is used to | |
e93591bb DZ |
339 | * update the current delay as the baseline delay interval is reevaluated on |
340 | * transaction commit. This is also maxed with any other rate limit. | |
b0643e59 DZ |
341 | */ |
342 | void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, | |
343 | bool override) | |
344 | { | |
345 | struct btrfs_block_group *block_group; | |
346 | const u64 now = ktime_get_ns(); | |
347 | ||
348 | spin_lock(&discard_ctl->lock); | |
349 | ||
350 | if (!btrfs_run_discard_work(discard_ctl)) | |
351 | goto out; | |
352 | ||
353 | if (!override && delayed_work_pending(&discard_ctl->work)) | |
354 | goto out; | |
355 | ||
356 | block_group = find_next_block_group(discard_ctl, now); | |
357 | if (block_group) { | |
a2309300 | 358 | unsigned long delay = discard_ctl->delay; |
e93591bb DZ |
359 | u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); |
360 | ||
361 | /* | |
362 | * A single delayed workqueue item is responsible for | |
363 | * discarding, so we can manage the bytes rate limit by keeping | |
364 | * track of the previous discard. | |
365 | */ | |
366 | if (kbps_limit && discard_ctl->prev_discard) { | |
367 | u64 bps_limit = ((u64)kbps_limit) * SZ_1K; | |
368 | u64 bps_delay = div64_u64(discard_ctl->prev_discard * | |
369 | MSEC_PER_SEC, bps_limit); | |
370 | ||
371 | delay = max(delay, msecs_to_jiffies(bps_delay)); | |
372 | } | |
a2309300 DZ |
373 | |
374 | /* | |
375 | * This timeout is to hopefully prevent immediate discarding | |
376 | * in a recently allocated block group. | |
377 | */ | |
378 | if (now < block_group->discard_eligible_time) { | |
379 | u64 bg_timeout = block_group->discard_eligible_time - now; | |
b0643e59 | 380 | |
a2309300 DZ |
381 | delay = max(delay, nsecs_to_jiffies(bg_timeout)); |
382 | } | |
b0643e59 DZ |
383 | |
384 | mod_delayed_work(discard_ctl->discard_workers, | |
385 | &discard_ctl->work, delay); | |
386 | } | |
387 | out: | |
388 | spin_unlock(&discard_ctl->lock); | |
389 | } | |
390 | ||
6e80d4f8 DZ |
391 | /** |
392 | * btrfs_finish_discard_pass - determine next step of a block_group | |
393 | * @discard_ctl: discard control | |
394 | * @block_group: block_group of interest | |
395 | * | |
396 | * This determines the next step for a block group after it's finished going | |
397 | * through a pass on a discard list. If it is unused and fully trimmed, we can | |
398 | * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto | |
399 | * the appropriate filter list or let it fall off. | |
400 | */ | |
401 | static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, | |
402 | struct btrfs_block_group *block_group) | |
403 | { | |
404 | remove_from_discard_list(discard_ctl, block_group); | |
405 | ||
406 | if (block_group->used == 0) { | |
407 | if (btrfs_is_free_space_trimmed(block_group)) | |
408 | btrfs_mark_bg_unused(block_group); | |
409 | else | |
410 | add_to_discard_unused_list(discard_ctl, block_group); | |
7fe6d45e DZ |
411 | } else { |
412 | btrfs_update_discard_index(discard_ctl, block_group); | |
6e80d4f8 DZ |
413 | } |
414 | } | |
415 | ||
b0643e59 DZ |
416 | /** |
417 | * btrfs_discard_workfn - discard work function | |
418 | * @work: work | |
419 | * | |
2bee7eb8 DZ |
420 | * This finds the next block_group to start discarding and then discards a |
421 | * single region. It does this in a two-pass fashion: first extents and second | |
422 | * bitmaps. Completely discarded block groups are sent to the unused_bgs path. | |
b0643e59 DZ |
423 | */ |
424 | static void btrfs_discard_workfn(struct work_struct *work) | |
425 | { | |
426 | struct btrfs_discard_ctl *discard_ctl; | |
427 | struct btrfs_block_group *block_group; | |
2bee7eb8 | 428 | enum btrfs_discard_state discard_state; |
7fe6d45e | 429 | int discard_index = 0; |
b0643e59 | 430 | u64 trimmed = 0; |
7fe6d45e | 431 | u64 minlen = 0; |
b0643e59 DZ |
432 | |
433 | discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); | |
434 | ||
7fe6d45e DZ |
435 | block_group = peek_discard_list(discard_ctl, &discard_state, |
436 | &discard_index); | |
b0643e59 DZ |
437 | if (!block_group || !btrfs_run_discard_work(discard_ctl)) |
438 | return; | |
439 | ||
2bee7eb8 | 440 | /* Perform discarding */ |
7fe6d45e DZ |
441 | minlen = discard_minlen[discard_index]; |
442 | ||
443 | if (discard_state == BTRFS_DISCARD_BITMAPS) { | |
444 | u64 maxlen = 0; | |
445 | ||
446 | /* | |
447 | * Use the previous levels minimum discard length as the max | |
448 | * length filter. In the case something is added to make a | |
449 | * region go beyond the max filter, the entire bitmap is set | |
450 | * back to BTRFS_TRIM_STATE_UNTRIMMED. | |
451 | */ | |
452 | if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) | |
453 | maxlen = discard_minlen[discard_index - 1]; | |
454 | ||
2bee7eb8 DZ |
455 | btrfs_trim_block_group_bitmaps(block_group, &trimmed, |
456 | block_group->discard_cursor, | |
457 | btrfs_block_group_end(block_group), | |
7fe6d45e | 458 | minlen, maxlen, true); |
9ddf648f | 459 | discard_ctl->discard_bitmap_bytes += trimmed; |
7fe6d45e | 460 | } else { |
2bee7eb8 DZ |
461 | btrfs_trim_block_group_extents(block_group, &trimmed, |
462 | block_group->discard_cursor, | |
463 | btrfs_block_group_end(block_group), | |
7fe6d45e | 464 | minlen, true); |
9ddf648f | 465 | discard_ctl->discard_extent_bytes += trimmed; |
7fe6d45e | 466 | } |
2bee7eb8 | 467 | |
e93591bb DZ |
468 | discard_ctl->prev_discard = trimmed; |
469 | ||
2bee7eb8 DZ |
470 | /* Determine next steps for a block_group */ |
471 | if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { | |
472 | if (discard_state == BTRFS_DISCARD_BITMAPS) { | |
473 | btrfs_finish_discard_pass(discard_ctl, block_group); | |
474 | } else { | |
475 | block_group->discard_cursor = block_group->start; | |
476 | spin_lock(&discard_ctl->lock); | |
477 | if (block_group->discard_state != | |
478 | BTRFS_DISCARD_RESET_CURSOR) | |
479 | block_group->discard_state = | |
480 | BTRFS_DISCARD_BITMAPS; | |
481 | spin_unlock(&discard_ctl->lock); | |
482 | } | |
483 | } | |
484 | ||
485 | spin_lock(&discard_ctl->lock); | |
486 | discard_ctl->block_group = NULL; | |
487 | spin_unlock(&discard_ctl->lock); | |
b0643e59 | 488 | |
b0643e59 DZ |
489 | btrfs_discard_schedule_work(discard_ctl, false); |
490 | } | |
491 | ||
492 | /** | |
493 | * btrfs_run_discard_work - determines if async discard should be running | |
494 | * @discard_ctl: discard control | |
495 | * | |
496 | * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. | |
497 | */ | |
498 | bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) | |
499 | { | |
500 | struct btrfs_fs_info *fs_info = container_of(discard_ctl, | |
501 | struct btrfs_fs_info, | |
502 | discard_ctl); | |
503 | ||
504 | return (!(fs_info->sb->s_flags & SB_RDONLY) && | |
505 | test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); | |
506 | } | |
507 | ||
a2309300 DZ |
508 | /** |
509 | * btrfs_discard_calc_delay - recalculate the base delay | |
510 | * @discard_ctl: discard control | |
511 | * | |
512 | * Recalculate the base delay which is based off the total number of | |
513 | * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms) | |
514 | * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC). | |
515 | */ | |
516 | void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) | |
517 | { | |
518 | s32 discardable_extents; | |
81b29a3b | 519 | s64 discardable_bytes; |
a2309300 DZ |
520 | u32 iops_limit; |
521 | unsigned long delay; | |
522 | unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC; | |
523 | ||
524 | discardable_extents = atomic_read(&discard_ctl->discardable_extents); | |
525 | if (!discardable_extents) | |
526 | return; | |
527 | ||
528 | spin_lock(&discard_ctl->lock); | |
529 | ||
81b29a3b DZ |
530 | /* |
531 | * The following is to fix a potential -1 discrepenancy that we're not | |
532 | * sure how to reproduce. But given that this is the only place that | |
533 | * utilizes these numbers and this is only called by from | |
534 | * btrfs_finish_extent_commit() which is synchronized, we can correct | |
535 | * here. | |
536 | */ | |
537 | if (discardable_extents < 0) | |
538 | atomic_add(-discardable_extents, | |
539 | &discard_ctl->discardable_extents); | |
540 | ||
541 | discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes); | |
542 | if (discardable_bytes < 0) | |
543 | atomic64_add(-discardable_bytes, | |
544 | &discard_ctl->discardable_bytes); | |
545 | ||
546 | if (discardable_extents <= 0) { | |
547 | spin_unlock(&discard_ctl->lock); | |
548 | return; | |
549 | } | |
550 | ||
a2309300 DZ |
551 | iops_limit = READ_ONCE(discard_ctl->iops_limit); |
552 | if (iops_limit) | |
553 | lower_limit = max_t(unsigned long, lower_limit, | |
554 | MSEC_PER_SEC / iops_limit); | |
555 | ||
556 | delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; | |
557 | delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC); | |
558 | discard_ctl->delay = msecs_to_jiffies(delay); | |
559 | ||
560 | spin_unlock(&discard_ctl->lock); | |
561 | } | |
562 | ||
dfb79ddb DZ |
563 | /** |
564 | * btrfs_discard_update_discardable - propagate discard counters | |
565 | * @block_group: block_group of interest | |
566 | * @ctl: free_space_ctl of @block_group | |
567 | * | |
568 | * This propagates deltas of counters up to the discard_ctl. It maintains a | |
569 | * current counter and a previous counter passing the delta up to the global | |
570 | * stat. Then the current counter value becomes the previous counter value. | |
571 | */ | |
572 | void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, | |
573 | struct btrfs_free_space_ctl *ctl) | |
574 | { | |
575 | struct btrfs_discard_ctl *discard_ctl; | |
576 | s32 extents_delta; | |
5dc7c10b | 577 | s64 bytes_delta; |
dfb79ddb | 578 | |
5cb0724e DZ |
579 | if (!block_group || |
580 | !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || | |
581 | !btrfs_is_block_group_data_only(block_group)) | |
dfb79ddb DZ |
582 | return; |
583 | ||
584 | discard_ctl = &block_group->fs_info->discard_ctl; | |
585 | ||
586 | extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - | |
587 | ctl->discardable_extents[BTRFS_STAT_PREV]; | |
588 | if (extents_delta) { | |
589 | atomic_add(extents_delta, &discard_ctl->discardable_extents); | |
590 | ctl->discardable_extents[BTRFS_STAT_PREV] = | |
591 | ctl->discardable_extents[BTRFS_STAT_CURR]; | |
592 | } | |
5dc7c10b DZ |
593 | |
594 | bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - | |
595 | ctl->discardable_bytes[BTRFS_STAT_PREV]; | |
596 | if (bytes_delta) { | |
597 | atomic64_add(bytes_delta, &discard_ctl->discardable_bytes); | |
598 | ctl->discardable_bytes[BTRFS_STAT_PREV] = | |
599 | ctl->discardable_bytes[BTRFS_STAT_CURR]; | |
600 | } | |
dfb79ddb DZ |
601 | } |
602 | ||
6e80d4f8 DZ |
603 | /** |
604 | * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists | |
605 | * @fs_info: fs_info of interest | |
606 | * | |
607 | * The unused_bgs list needs to be punted to the discard lists because the | |
608 | * order of operations is changed. In the normal sychronous discard path, the | |
609 | * block groups are trimmed via a single large trim in transaction commit. This | |
610 | * is ultimately what we are trying to avoid with asynchronous discard. Thus, | |
611 | * it must be done before going down the unused_bgs path. | |
612 | */ | |
613 | void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) | |
614 | { | |
615 | struct btrfs_block_group *block_group, *next; | |
616 | ||
617 | spin_lock(&fs_info->unused_bgs_lock); | |
618 | /* We enabled async discard, so punt all to the queue */ | |
619 | list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, | |
620 | bg_list) { | |
621 | list_del_init(&block_group->bg_list); | |
04e484c5 | 622 | btrfs_put_block_group(block_group); |
6e80d4f8 DZ |
623 | btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); |
624 | } | |
625 | spin_unlock(&fs_info->unused_bgs_lock); | |
626 | } | |
627 | ||
628 | /** | |
629 | * btrfs_discard_purge_list - purge discard lists | |
630 | * @discard_ctl: discard control | |
631 | * | |
632 | * If we are disabling async discard, we may have intercepted block groups that | |
633 | * are completely free and ready for the unused_bgs path. As discarding will | |
634 | * now happen in transaction commit or not at all, we can safely mark the | |
635 | * corresponding block groups as unused and they will be sent on their merry | |
636 | * way to the unused_bgs list. | |
637 | */ | |
638 | static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) | |
639 | { | |
640 | struct btrfs_block_group *block_group, *next; | |
641 | int i; | |
642 | ||
643 | spin_lock(&discard_ctl->lock); | |
644 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { | |
645 | list_for_each_entry_safe(block_group, next, | |
646 | &discard_ctl->discard_list[i], | |
647 | discard_list) { | |
648 | list_del_init(&block_group->discard_list); | |
649 | spin_unlock(&discard_ctl->lock); | |
650 | if (block_group->used == 0) | |
651 | btrfs_mark_bg_unused(block_group); | |
652 | spin_lock(&discard_ctl->lock); | |
653 | } | |
654 | } | |
655 | spin_unlock(&discard_ctl->lock); | |
656 | } | |
657 | ||
b0643e59 DZ |
658 | void btrfs_discard_resume(struct btrfs_fs_info *fs_info) |
659 | { | |
660 | if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { | |
661 | btrfs_discard_cleanup(fs_info); | |
662 | return; | |
663 | } | |
664 | ||
6e80d4f8 DZ |
665 | btrfs_discard_punt_unused_bgs_list(fs_info); |
666 | ||
b0643e59 DZ |
667 | set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); |
668 | } | |
669 | ||
670 | void btrfs_discard_stop(struct btrfs_fs_info *fs_info) | |
671 | { | |
672 | clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); | |
673 | } | |
674 | ||
675 | void btrfs_discard_init(struct btrfs_fs_info *fs_info) | |
676 | { | |
677 | struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; | |
678 | int i; | |
679 | ||
680 | spin_lock_init(&discard_ctl->lock); | |
681 | INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); | |
682 | ||
683 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) | |
684 | INIT_LIST_HEAD(&discard_ctl->discard_list[i]); | |
dfb79ddb | 685 | |
e93591bb | 686 | discard_ctl->prev_discard = 0; |
dfb79ddb | 687 | atomic_set(&discard_ctl->discardable_extents, 0); |
5dc7c10b | 688 | atomic64_set(&discard_ctl->discardable_bytes, 0); |
19b2a2c7 | 689 | discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; |
a2309300 DZ |
690 | discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC; |
691 | discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; | |
e93591bb | 692 | discard_ctl->kbps_limit = 0; |
9ddf648f DZ |
693 | discard_ctl->discard_extent_bytes = 0; |
694 | discard_ctl->discard_bitmap_bytes = 0; | |
695 | atomic64_set(&discard_ctl->discard_bytes_saved, 0); | |
b0643e59 DZ |
696 | } |
697 | ||
698 | void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) | |
699 | { | |
700 | btrfs_discard_stop(fs_info); | |
701 | cancel_delayed_work_sync(&fs_info->discard_ctl.work); | |
6e80d4f8 | 702 | btrfs_discard_purge_list(&fs_info->discard_ctl); |
b0643e59 | 703 | } |