/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
 * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
 * Copyright 2023 RackTop Systems, Inc.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc_impl.h>

/*
 * TRIM is a feature which is used to notify an SSD that some previously
 * written space is no longer allocated by the pool. This is useful because
 * writes to an SSD must be performed to blocks which have first been erased.
 * Ensuring the SSD always has a supply of erased blocks for new writes
 * helps prevent performance from deteriorating.
 *
 * There are two supported TRIM methods: manual and automatic.
 *
 * Manual TRIM:
 *
 * A manual TRIM is initiated by running the 'zpool trim' command. A single
 * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
 * managing that vdev's TRIM process. This involves iterating over all the
 * metaslabs, calculating the unallocated space ranges, and then issuing the
 * required TRIM I/Os.
 *
 * While a metaslab is being actively trimmed it is not eligible to perform
 * new allocations. After traversing all of the metaslabs the thread is
 * terminated. Finally, both the requested options and current progress of
 * the TRIM are regularly written to the pool. This allows the TRIM to be
 * suspended and resumed as needed.
 *
 * Automatic TRIM:
 *
 * An automatic TRIM is enabled by setting the 'autotrim' pool property
 * to 'on'. When enabled, a 'vdev_autotrim' thread is created for each
 * top-level (not leaf) vdev in the pool. These threads perform the same
 * core TRIM process as a manual TRIM, but with a few key differences.
 *
 * 1) Automatic TRIM happens continuously in the background and operates
 *    solely on recently freed blocks (ms_trim not ms_allocatable).
 *
 * 2) Each thread is associated with a top-level (not leaf) vdev. This has
 *    the benefit of simplifying the threading model, it makes it easier
 *    to coordinate administrative commands, and it ensures only a single
 *    metaslab is disabled at a time. Unlike manual TRIM, this means each
 *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
 *    children.
 *
 * 3) There is no automatic TRIM progress information stored on disk, nor
 *    is it reported by 'zpool status'.
 *
 * While the automatic TRIM process is highly effective it is more likely
 * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to
 * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
 * TRIM and are skipped. This means small amounts of freed space may not
 * be automatically trimmed.
 *
 * Furthermore, devices with attached hot spares and devices being actively
 * replaced are skipped. This is done to avoid adding additional stress to
 * a potentially unhealthy device and to minimize the required rebuild time.
 *
 * For this reason it may be beneficial to occasionally manually TRIM a pool
 * even when automatic TRIM is enabled.
 */
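
/*
 * Example usage (illustrative only; the pool name "tank" is arbitrary):
 *
 *	zpool trim tank			- start a manual TRIM
 *	zpool trim -s tank		- suspend an in-progress manual TRIM
 *	zpool set autotrim=on tank	- enable automatic TRIM
 */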

/*
 * Maximum size of TRIM I/O, ranges will be chunked into 128MiB lengths.
 */
static unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;

/*
 * Minimum size of TRIM I/O, extents smaller than 32KiB will be skipped.
 */
static unsigned int zfs_trim_extent_bytes_min = 32 * 1024;

/*
 * Skip uninitialized metaslabs during the TRIM process. This option is
 * useful for pools constructed from large thinly-provisioned devices where
 * TRIM operations are slow. As a pool ages, an increasing fraction of
 * the pool's metaslabs will be initialized, progressively degrading the
 * usefulness of this option. This setting is stored when starting a
 * manual TRIM and will persist for the duration of the requested TRIM.
 */
unsigned int zfs_trim_metaslab_skip = 0;

/*
 * Maximum number of queued TRIM I/Os per leaf vdev. The number of
 * concurrent TRIM I/Os issued to the device is controlled by the
 * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
 */
static unsigned int zfs_trim_queue_limit = 10;

/*
 * The minimum number of transaction groups between automatic trims of a
 * metaslab. This setting represents a trade-off between issuing more
 * efficient TRIM operations, by allowing them to be aggregated longer,
 * and issuing them promptly so the trimmed space is available. Note
 * that this value is a minimum; metaslabs can be trimmed less frequently
 * when there are a large number of ranges which need to be trimmed.
 *
 * Increasing this value will allow frees to be aggregated for a longer
 * time. This can result in larger TRIM operations, and increased memory
 * usage in order to track the ranges to be trimmed. Decreasing this value
 * has the opposite effect. The default value of 32 was determined through
 * testing to be a reasonable compromise.
 */
static unsigned int zfs_trim_txg_batch = 32;
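
/*
 * Note: the tunables above are exposed as 'zfs' module parameters of the
 * same name on most platforms (for example, on Linux they typically appear
 * under /sys/module/zfs/parameters/). The exact registration mechanism is
 * platform specific.
 */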

/*
 * trim_args is a control structure which describes how a leaf vdev
 * should be trimmed. The core elements are the vdev, the metaslab being
 * trimmed and a range tree containing the extents to TRIM. All provided
 * ranges must be within the metaslab.
 */
typedef struct trim_args {
	/*
	 * These fields are set by the caller of vdev_trim_ranges().
	 */
	vdev_t		*trim_vdev;		/* Leaf vdev to TRIM */
	metaslab_t	*trim_msp;		/* Disabled metaslab */
	range_tree_t	*trim_tree;		/* TRIM ranges (in metaslab) */
	trim_type_t	trim_type;		/* Manual or auto TRIM */
	uint64_t	trim_extent_bytes_max;	/* Maximum TRIM I/O size */
	uint64_t	trim_extent_bytes_min;	/* Minimum TRIM I/O size */
	enum trim_flag	trim_flags;		/* TRIM flags (secure) */

	/*
	 * These fields are updated by vdev_trim_ranges().
	 */
	hrtime_t	trim_start_time;	/* Start time */
	uint64_t	trim_bytes_done;	/* Bytes trimmed */
} trim_args_t;
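
/*
 * See vdev_trim_thread() and vdev_autotrim_thread() below for how a
 * trim_args_t is populated before being handed to vdev_trim_ranges().
 */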

/*
 * Determines whether a vdev_trim_thread() should be stopped.
 */
static boolean_t
vdev_trim_should_stop(vdev_t *vd)
{
	return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing ||
	    vd->vdev_top->vdev_rz_expanding);
}

/*
 * Determines whether a vdev_autotrim_thread() should be stopped.
 */
static boolean_t
vdev_autotrim_should_stop(vdev_t *tvd)
{
	return (tvd->vdev_autotrim_exit_wanted ||
	    !vdev_writeable(tvd) || tvd->vdev_removing ||
	    tvd->vdev_rz_expanding ||
	    spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
}

/*
 * Wait for the given number of kicks and return B_TRUE if the wait was
 * aborted due to vdev_autotrim_exit_wanted.
 */
static boolean_t
vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick)
{
	mutex_enter(&vd->vdev_autotrim_lock);
	for (int i = 0; i < num_of_kick; i++) {
		if (vd->vdev_autotrim_exit_wanted)
			break;
		cv_wait_idle(&vd->vdev_autotrim_kick_cv,
		    &vd->vdev_autotrim_lock);
	}
	boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted;
	mutex_exit(&vd->vdev_autotrim_lock);

	return (exit_wanted);
}

/*
 * The sync task for updating the on-disk state of a manual TRIM. This
 * is scheduled by vdev_trim_change_state().
 */
static void
vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the trimming thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing ||
	    !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
		return;

	uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
	vd->vdev_trim_offset[txg & TXG_MASK] = 0;

	VERIFY3U(vd->vdev_leaf_zap, !=, 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {

		if (vd->vdev_trim_last_offset == UINT64_MAX)
			last_offset = 0;

		vd->vdev_trim_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}

	if (vd->vdev_trim_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_trim_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	if (vd->vdev_trim_rate > 0) {
		uint64_t rate = (uint64_t)vd->vdev_trim_rate;

		if (rate == UINT64_MAX)
			rate = 0;

		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
	}

	uint64_t partial = vd->vdev_trim_partial;
	if (partial == UINT64_MAX)
		partial = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
	    sizeof (partial), 1, &partial, tx));

	uint64_t secure = vd->vdev_trim_secure;
	if (secure == UINT64_MAX)
		secure = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
	    sizeof (secure), 1, &secure, tx));

	uint64_t trim_state = vd->vdev_trim_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
	    sizeof (trim_state), 1, &trim_state, tx));
}

/*
 * Update the on-disk state of a manual TRIM. This is called to request
 * that a TRIM be started/suspended/canceled, or to change one of the
 * TRIM options (partial, secure, rate).
 */
static void
vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
    uint64_t rate, boolean_t partial, boolean_t secure)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_trim_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're suspending, then preserve the original start time.
	 */
	if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
		vd->vdev_trim_action_time = gethrestime_sec();
	}

	/*
	 * If we're activating, then preserve the requested rate and trim
	 * method. Setting the last offset and rate to UINT64_MAX is used
	 * as a sentinel to indicate they should be reset to default values.
	 */
	if (new_state == VDEV_TRIM_ACTIVE) {
		if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
		    vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
			vd->vdev_trim_last_offset = UINT64_MAX;
			vd->vdev_trim_rate = UINT64_MAX;
			vd->vdev_trim_partial = UINT64_MAX;
			vd->vdev_trim_secure = UINT64_MAX;
		}

		if (rate != 0)
			vd->vdev_trim_rate = rate;

		if (partial != 0)
			vd->vdev_trim_partial = partial;

		if (secure != 0)
			vd->vdev_trim_secure = secure;
	}

	vdev_trim_state_t old_state = vd->vdev_trim_state;
	boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
	vd->vdev_trim_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
	    guid, tx);

	switch (new_state) {
	case VDEV_TRIM_ACTIVE:
		spa_event_notify(spa, vd, NULL,
		    resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_TRIM_SUSPENDED:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_TRIM_CANCELED:
		if (old_state == VDEV_TRIM_ACTIVE ||
		    old_state == VDEV_TRIM_SUSPENDED) {
			spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
			spa_history_log_internal(spa, "trim", tx,
			    "vdev=%s canceled", vd->vdev_path);
		}
		break;
	case VDEV_TRIM_COMPLETE:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);

	if (new_state != VDEV_TRIM_ACTIVE)
		spa_notify_waiters(spa);
}

/*
 * The zio_done_func_t done callback for each manual TRIM issued. It is
 * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
 * and limiting the number of in flight TRIM I/Os.
 */
static void
vdev_trim_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *offset =
		    &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
		*offset = MIN(*offset, zio->io_offset);
	} else {
		if (zio->io_error != 0) {
			vd->vdev_stat.vs_trim_errors++;
			spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
			    0, 0, 0, 0, 1, zio->io_orig_size);
		} else {
			spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
			    1, zio->io_orig_size, 0, 0, 0, 0);
		}

		vd->vdev_trim_bytes_done += zio->io_orig_size;
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * The zio_done_func_t done callback for each automatic TRIM issued. It
 * is responsible for updating the TRIM stats and limiting the number of
 * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are
 * never reissued on failure.
 */
static void
vdev_autotrim_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);

	if (zio->io_error != 0) {
		vd->vdev_stat.vs_trim_errors++;
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
		    0, 0, 0, 0, 1, zio->io_orig_size);
	} else {
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
		    1, zio->io_orig_size, 0, 0, 0, 0);
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * The zio_done_func_t done callback for each TRIM issued via
 * vdev_trim_simple(). It is responsible for updating the TRIM stats and
 * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best
 * effort and are never reissued on failure.
 */
static void
vdev_trim_simple_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);

	if (zio->io_error != 0) {
		vd->vdev_stat.vs_trim_errors++;
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
		    0, 0, 0, 0, 1, zio->io_orig_size);
	} else {
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
		    1, zio->io_orig_size, 0, 0, 0, 0);
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * Returns the average trim rate in bytes/sec for the ta->trim_vdev.
 */
static uint64_t
vdev_trim_calculate_rate(trim_args_t *ta)
{
	return (ta->trim_bytes_done * 1000 /
	    (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
}

/*
 * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
 * and number of concurrent TRIM I/Os.
 */
static int
vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
{
	vdev_t *vd = ta->trim_vdev;
	spa_t *spa = vd->vdev_spa;
	void *cb;

	mutex_enter(&vd->vdev_trim_io_lock);

	/*
	 * Limit manual TRIM I/Os to the requested rate. This does not
	 * apply to automatic TRIM since no per vdev rate can be specified.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {
		while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
		    vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
			cv_timedwait_idle(&vd->vdev_trim_io_cv,
			    &vd->vdev_trim_io_lock, ddi_get_lbolt() +
			    MSEC_TO_TICK(10));
		}
	}
	ta->trim_bytes_done += size;

	/* Limit in flight trimming I/Os */
	while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
	    vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	vd->vdev_trim_inflight[ta->trim_type]++;
	mutex_exit(&vd->vdev_trim_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_trim_lock);

	if (ta->trim_type == TRIM_TYPE_MANUAL &&
	    vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_trim_zap_update_sync, guid, tx);
	}

	/*
	 * We know the vdev_t will still be around since all consumers of
	 * vdev_free must stop the trimming first.
	 */
	if ((ta->trim_type == TRIM_TYPE_MANUAL &&
	    vdev_trim_should_stop(vd)) ||
	    (ta->trim_type == TRIM_TYPE_AUTO &&
	    vdev_autotrim_should_stop(vd->vdev_top))) {
		mutex_enter(&vd->vdev_trim_io_lock);
		vd->vdev_trim_inflight[ta->trim_type]--;
		mutex_exit(&vd->vdev_trim_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_trim_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_trim_lock);

	if (ta->trim_type == TRIM_TYPE_MANUAL)
		vd->vdev_trim_offset[txg & TXG_MASK] = start + size;

	if (ta->trim_type == TRIM_TYPE_MANUAL) {
		cb = vdev_trim_cb;
	} else if (ta->trim_type == TRIM_TYPE_AUTO) {
		cb = vdev_autotrim_cb;
	} else {
		cb = vdev_trim_simple_cb;
	}

	zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
	    start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
	    ta->trim_flags));
	/* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
 * Additional parameters describing how the TRIM should be performed must
 * be set in the trim_args structure. See the trim_args definition for
 * additional information.
 */
static int
vdev_trim_ranges(trim_args_t *ta)
{
	vdev_t *vd = ta->trim_vdev;
	zfs_btree_t *t = &ta->trim_tree->rt_root;
	zfs_btree_index_t idx;
	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	ta->trim_start_time = gethrtime();
	ta->trim_bytes_done = 0;

	for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
	    rs = zfs_btree_next(t, &idx, &idx)) {
		uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
		    ta->trim_tree);

		if (extent_bytes_min && size < extent_bytes_min) {
			spa_iostats_trim_add(spa, ta->trim_type,
			    0, 0, 1, size, 0, 0);
			continue;
		}

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
			    rs_get_start(rs, ta->trim_tree) +
			    (w * extent_bytes_max), MIN(size -
			    (w * extent_bytes_max), extent_bytes_max));
			if (error != 0) {
				goto done;
			}
		}
	}

done:
	/*
	 * Make sure all TRIMs for this metaslab have completed before
	 * returning. TRIM zios have a lower priority than regular or syncing
	 * zios, so all TRIM zios for this metaslab must complete before the
	 * metaslab is re-enabled. Otherwise it's possible for write zios to
	 * this metaslab to cut ahead of still-queued TRIM zios, causing
	 * corruption if the ranges overlap.
	 */
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[0] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	return (error);
}

static void
vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
{
	uint64_t *last_rs_end = (uint64_t *)arg;

	if (physical_rs->rs_end > *last_rs_end)
		*last_rs_end = physical_rs->rs_end;
}

static void
vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
{
	vdev_t *vd = (vdev_t *)arg;

	uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
	vd->vdev_trim_bytes_est += size;

	if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
		vd->vdev_trim_bytes_done += size;
	} else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
	    vd->vdev_trim_last_offset <= physical_rs->rs_end) {
		vd->vdev_trim_bytes_done +=
		    vd->vdev_trim_last_offset - physical_rs->rs_start;
	}
}

/*
 * Calculates the completion percentage of a manual TRIM.
 */
static void
vdev_trim_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_trim_bytes_est = 0;
	vd->vdev_trim_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = (msp->ms_size -
		    metaslab_allocated_space(msp)) /
		    vdev_get_ndisks(vd->vdev_top);

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg64_t logical_rs, physical_rs, remain_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;

		/* Metaslab space after this offset has not been trimmed. */
		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
		if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/* Metaslab space before this offset has been trimmed */
		uint64_t last_rs_end = physical_rs.rs_end;
		if (!vdev_xlate_is_empty(&remain_rs)) {
			vdev_xlate_walk(vd, &remain_rs,
			    vdev_trim_xlate_last_rs_end, &last_rs_end);
		}

		if (vd->vdev_trim_last_offset > last_rs_end) {
			vd->vdev_trim_bytes_done += ms_free;
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of trimming this
		 * metaslab. Load it and walk the free tree for more
		 * accurate progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		range_tree_t *rt = msp->ms_allocatable;
		zfs_btree_t *bt = &rt->rt_root;
		zfs_btree_index_t idx;
		for (range_seg_t *rs = zfs_btree_first(bt, &idx);
		    rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
			logical_rs.rs_start = rs_get_start(rs, rt);
			logical_rs.rs_end = rs_get_end(rs, rt);

			vdev_xlate_walk(vd, &logical_rs,
			    vdev_trim_xlate_progress, vd);
		}
		mutex_exit(&msp->ms_lock);
	}
}

/*
 * Load from disk the vdev's manual TRIM information. This includes the
 * state, progress, and options provided when initiating the manual TRIM.
 */
static int
vdev_trim_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
	    vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (vd->vdev_trim_last_offset), 1,
		    &vd->vdev_trim_last_offset);
		if (err == ENOENT) {
			vd->vdev_trim_last_offset = 0;
			err = 0;
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
			    sizeof (vd->vdev_trim_rate), 1,
			    &vd->vdev_trim_rate);
			if (err == ENOENT) {
				vd->vdev_trim_rate = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
			    sizeof (vd->vdev_trim_partial), 1,
			    &vd->vdev_trim_partial);
			if (err == ENOENT) {
				vd->vdev_trim_partial = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
			    sizeof (vd->vdev_trim_secure), 1,
			    &vd->vdev_trim_secure);
			if (err == ENOENT) {
				vd->vdev_trim_secure = 0;
				err = 0;
			}
		}
	}

	vdev_trim_calculate_progress(vd);

	return (err);
}

static void
vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
{
	trim_args_t *ta = arg;
	vdev_t *vd = ta->trim_vdev;

	/*
	 * Only a manual trim will be traversing the vdev sequentially.
	 * For an auto trim all valid ranges should be added.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {

		/* Only add segments that we have not visited yet */
		if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
			return;

		/* Pick up where we left off mid-range. */
		if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
			ASSERT3U(physical_rs->rs_end, >,
			    vd->vdev_trim_last_offset);
			physical_rs->rs_start = vd->vdev_trim_last_offset;
		}
	}

	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);

	range_tree_add(ta->trim_tree, physical_rs->rs_start,
	    physical_rs->rs_end - physical_rs->rs_start);
}

/*
 * Convert the logical range into physical ranges and add them to the
 * range tree passed in the trim_args_t.
 */
static void
vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
	trim_args_t *ta = arg;
	vdev_t *vd = ta->trim_vdev;
	range_seg64_t logical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	/*
	 * Every range to be trimmed must be part of ms_allocatable.
	 * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
	 * is always the case.
	 */
	if (zfs_flags & ZFS_DEBUG_TRIM) {
		metaslab_t *msp = ta->trim_msp;
		VERIFY0(metaslab_load(msp));
		VERIFY3B(msp->ms_loaded, ==, B_TRUE);
		VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
	}

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
}

/*
 * Each manual TRIM thread is responsible for trimming the unallocated
 * space for each leaf vdev. This is accomplished by sequentially iterating
 * over its top-level metaslabs and issuing TRIM I/O for the space described
 * by its ms_allocatable. While a metaslab is undergoing trimming it is
 * not eligible for new allocations.
 */
static __attribute__((noreturn)) void
vdev_trim_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	trim_args_t ta;
	int error = 0;

	/*
	 * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
	 * vdev_trim(). Wait for the updated values to be reflected
	 * in the zap in order to start with the requested settings.
	 */
	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_trim_last_offset = 0;
	vd->vdev_trim_rate = 0;
	vd->vdev_trim_partial = 0;
	vd->vdev_trim_secure = 0;

	VERIFY0(vdev_trim_load(vd));

	ta.trim_vdev = vd;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_MANUAL;
	ta.trim_flags = 0;

	/*
	 * When a secure TRIM has been requested infer that the intent
	 * is that everything must be trimmed. Override the default
	 * minimum TRIM size to prevent ranges from being skipped.
	 */
	if (vd->vdev_trim_secure) {
		ta.trim_flags |= ZIO_TRIM_SECURE;
		ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	}

	uint64_t ms_count = 0;
	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_trim_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));

		/*
		 * If a partial TRIM was requested skip metaslabs which have
		 * never been initialized and thus have never been written.
		 */
		if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			vdev_trim_calculate_progress(vd);
			continue;
		}

		ta.trim_msp = msp;
		range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta);
		range_tree_vacate(msp->ms_trim, NULL, NULL);
		mutex_exit(&msp->ms_lock);

		error = vdev_trim_ranges(&ta);
		metaslab_enable(msp, B_TRUE, B_FALSE);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(ta.trim_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	range_tree_destroy(ta.trim_tree);

	mutex_enter(&vd->vdev_trim_lock);
	if (!vd->vdev_trim_exit_wanted) {
		if (vdev_writeable(vd)) {
			vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
			    vd->vdev_trim_rate, vd->vdev_trim_partial,
			    vd->vdev_trim_secure);
		} else if (vd->vdev_faulted) {
			vdev_trim_change_state(vd, VDEV_TRIM_CANCELED,
			    vd->vdev_trim_rate, vd->vdev_trim_partial,
			    vd->vdev_trim_secure);
		}
	}
	ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);

	/*
	 * Drop the vdev_trim_lock while we sync out the txg since it's
	 * possible that a device might be trying to come online and must
	 * check to see if it needs to restart a trim. That thread will be
	 * holding the spa_config_lock which would prevent the txg_wait_synced
	 * from completing.
	 */
	mutex_exit(&vd->vdev_trim_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_trim_lock);

	vd->vdev_trim_thread = NULL;
	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);

	thread_exit();
}

/*
 * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock,
 * the vdev_t must be a leaf and cannot already be manually trimming.
 */
void
vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_trim_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);
	ASSERT(!vd->vdev_rz_expanding);

	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
	vd->vdev_trim_thread = thread_create(NULL, 0,
	    vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}
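
/*
 * Note: callers are expected to hold vdev_trim_lock as asserted above.
 * This is normally reached from the 'zpool trim' command path (for
 * example via spa_vdev_trim(), which is defined outside this file).
 */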

/*
 * Wait for the trimming thread to be terminated (canceled or stopped).
 */
static void
vdev_trim_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));

	while (vd->vdev_trim_thread != NULL)
		cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);

	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	vd->vdev_trim_exit_wanted = B_FALSE;
}

/*
 * Wait for vdev trim threads which were listed to cleanly exit.
 */
void
vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
{
	(void) spa;
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_trim_lock);
		vdev_trim_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_trim_lock);
	}
}

/*
 * Stop trimming a device, with the resultant trimming state being tgt_state.
 * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is
 * provided the stopping vdev is inserted into the list. Callers are then
 * required to call vdev_trim_stop_wait() to block for all the trim threads
 * to exit. The caller must hold vdev_trim_lock and must not be writing to
 * the spa config, as the trimming thread may try to enter the config as a
 * reader before exiting.
 */
void
vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the trim thread has
	 * stopped.
	 */
	if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
		return;

	vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
	vd->vdev_trim_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_trim_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}

/*
 * Requests that all listed vdevs stop trimming.
 */
static void
vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_trim_lock);
		vdev_trim_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_trim_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop trimming of a vdev tree and set all trim
 * thread pointers to NULL.
 */
void
vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;
	vdev_t *vd_l2cache;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_trim_node));

	vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);

	/*
	 * Iterate over cache devices and request stop trimming the
	 * whole device in case we export the pool or remove the cache
	 * device prematurely.
	 */
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vd_l2cache = spa->spa_l2cache.sav_vdevs[i];
		vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list);
	}

	vdev_trim_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

/*
 * Conditionally restarts a manual TRIM given its on-disk state.
 */
void
vdev_trim_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_trim_lock);
		uint64_t trim_state = VDEV_TRIM_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
		    sizeof (trim_state), 1, &trim_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_trim_state = trim_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_trim_action_time = timestamp;

		if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
		    vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_trim_load(vd));
		} else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
		    vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
		    !vd->vdev_top->vdev_rz_expanding &&
		    vd->vdev_trim_thread == NULL) {
			VERIFY0(vdev_trim_load(vd));
			vdev_trim(vd, vd->vdev_trim_rate,
			    vd->vdev_trim_partial, vd->vdev_trim_secure);
		}

		mutex_exit(&vd->vdev_trim_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_trim_restart(vd->vdev_child[i]);
	}
}

/*
 * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
 * every TRIM range is contained within ms_allocatable.
 */
static void
vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
{
	trim_args_t *ta = arg;
	metaslab_t *msp = ta->trim_msp;

	VERIFY3B(msp->ms_loaded, ==, B_TRUE);
	VERIFY3U(msp->ms_disabled, >, 0);
	VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
}
1206 | /* | |
1207 | * Each automatic TRIM thread is responsible for managing the trimming of a | |
1208 | * top-level vdev in the pool. No automatic TRIM state is maintained on-disk. | |
1209 | * | |
1210 | * N.B. This behavior is different from a manual TRIM where a thread | |
1211 | * is created for each leaf vdev, instead of each top-level vdev. | |
1212 | */ | |
460748d4 | 1213 | static __attribute__((noreturn)) void |
1b939560 BB |
1214 | vdev_autotrim_thread(void *arg) |
1215 | { | |
1216 | vdev_t *vd = arg; | |
1217 | spa_t *spa = vd->vdev_spa; | |
1218 | int shift = 0; | |
1219 | ||
1220 | mutex_enter(&vd->vdev_autotrim_lock); | |
1221 | ASSERT3P(vd->vdev_top, ==, vd); | |
1222 | ASSERT3P(vd->vdev_autotrim_thread, !=, NULL); | |
1223 | mutex_exit(&vd->vdev_autotrim_lock); | |
1224 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
1225 | ||
1b939560 BB |
1226 | while (!vdev_autotrim_should_stop(vd)) { |
1227 | int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); | |
7822b50f VS |
1228 | uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; |
1229 | uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; | |
1b939560 BB |
1230 | |
1231 | /* | |
1232 | * All of the metaslabs are divided in to groups of size | |
1233 | * num_metaslabs / zfs_trim_txg_batch. Each of these groups | |
1234 | * is composed of metaslabs which are spread evenly over the | |
1235 | * device. | |
1236 | * | |
1237 | * For example, when zfs_trim_txg_batch = 32 (default) then | |
1238 | * group 0 will contain metaslabs 0, 32, 64, ...; | |
1239 | * group 1 will contain metaslabs 1, 33, 65, ...; | |
1240 | * group 2 will contain metaslabs 2, 34, 66, ...; and so on. | |
1241 | * | |
1242 | * On each pass through the while() loop one of these groups | |
1243 | * is selected. This is accomplished by using a shift value | |
1244 | * to select the starting metaslab, then striding over the | |
1245 | * metaslabs using the zfs_trim_txg_batch size. This is | |
1246 | * done to accomplish two things. | |
1247 | * | |
1248 | * 1) By dividing the metaslabs in to groups, and making sure | |
1249 | * that each group takes a minimum of one txg to process. | |
1250 | * Then zfs_trim_txg_batch controls the minimum number of | |
1251 | * txgs which must occur before a metaslab is revisited. | |
1252 | * | |
1253 | * 2) Selecting non-consecutive metaslabs distributes the | |
1254 | * TRIM commands for a group evenly over the entire device. | |
1255 | * This can be advantageous for certain types of devices. | |
1256 | */ | |
1257 | for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count; | |
1258 | i += txgs_per_trim) { | |
1259 | metaslab_t *msp = vd->vdev_ms[i]; | |
1260 | range_tree_t *trim_tree; | |
65d10bd8 KJ |
1261 | boolean_t issued_trim = B_FALSE; |
1262 | boolean_t wait_aborted = B_FALSE; | |
1b939560 BB |
1263 | |
1264 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
1265 | metaslab_disable(msp); | |
1266 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
1267 | ||
1268 | mutex_enter(&msp->ms_lock); | |
1269 | ||
1270 | /* | |
1271 | * Skip the metaslab when it has never been allocated | |
1272 | * or when there are no recent frees to trim. | |
1273 | */ | |
1274 | if (msp->ms_sm == NULL || | |
1275 | range_tree_is_empty(msp->ms_trim)) { | |
1276 | mutex_exit(&msp->ms_lock); | |
f09fda50 | 1277 | metaslab_enable(msp, B_FALSE, B_FALSE); |
1b939560 BB |
1278 | continue; |
1279 | } | |
1280 | ||
1281 | /* | |
1282 | * Skip the metaslab when it has already been disabled. | |
1283 | * This may happen when a manual TRIM or initialize | |
1284 | * operation is running concurrently. In the case | |
1285 | * of a manual TRIM, the ms_trim tree will have been | |
1286 | * vacated. Only ranges added after the manual TRIM | |
1287 | * disabled the metaslab will be included in the tree. | |
1288 | * These will be processed when the automatic TRIM | |
1289 | * next revisits this metaslab. | |
1290 | */ | |
1291 | if (msp->ms_disabled > 1) { | |
1292 | mutex_exit(&msp->ms_lock); | |
f09fda50 | 1293 | metaslab_enable(msp, B_FALSE, B_FALSE); |
1b939560 BB |
1294 | continue; |
1295 | } | |
1296 | ||
1297 | /* | |
1298 | * Allocate an empty range tree which is swapped in | |
1299 | * for the existing ms_trim tree while it is processed. | |
1300 | */ | |
ca577779 PD |
1301 | trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, |
1302 | 0, 0); | |
1b939560 BB |
1303 | range_tree_swap(&msp->ms_trim, &trim_tree); |
1304 | ASSERT(range_tree_is_empty(msp->ms_trim)); | |
1305 | ||
1306 | /* | |
1307 | * There are two cases when constructing the per-vdev | |
1308 | * trim trees for a metaslab. If the top-level vdev | |
1309 | * has no children then it is also a leaf and should | |
1310 | * be trimmed. Otherwise our children are the leaves | |
1311 | * and a trim tree should be constructed for each. | |
1312 | */ | |
1313 | trim_args_t *tap; | |
1314 | uint64_t children = vd->vdev_children; | |
1315 | if (children == 0) { | |
1316 | children = 1; | |
1317 | tap = kmem_zalloc(sizeof (trim_args_t) * | |
1318 | children, KM_SLEEP); | |
1319 | tap[0].trim_vdev = vd; | |
1320 | } else { | |
1321 | tap = kmem_zalloc(sizeof (trim_args_t) * | |
1322 | children, KM_SLEEP); | |
1323 | ||
1324 | for (uint64_t c = 0; c < children; c++) { | |
1325 | tap[c].trim_vdev = vd->vdev_child[c]; | |
1326 | } | |
1327 | } | |
1328 | ||
1329 | for (uint64_t c = 0; c < children; c++) { | |
1330 | trim_args_t *ta = &tap[c]; | |
1331 | vdev_t *cvd = ta->trim_vdev; | |
1332 | ||
1333 | ta->trim_msp = msp; | |
1334 | ta->trim_extent_bytes_max = extent_bytes_max; | |
1335 | ta->trim_extent_bytes_min = extent_bytes_min; | |
1336 | ta->trim_type = TRIM_TYPE_AUTO; | |
1337 | ta->trim_flags = 0; | |
1338 | ||
1339 | if (cvd->vdev_detached || | |
1340 | !vdev_writeable(cvd) || | |
1341 | !cvd->vdev_has_trim || | |
1342 | cvd->vdev_trim_thread != NULL) { | |
1343 | continue; | |
1344 | } | |
1345 | ||
1346 | /* | |
1347 | * When a device has an attached hot spare, or | |
1348 | * is being replaced it will not be trimmed. | |
1349 | * This is done to avoid adding additional | |
1350 | * stress to a potentially unhealthy device, | |
1351 | * and to minimize the required rebuild time. | |
1352 | */ | |
1353 | if (!cvd->vdev_ops->vdev_op_leaf) | |
1354 | continue; | |
1355 | ||
ca577779 PD |
1356 | ta->trim_tree = range_tree_create(NULL, |
1357 | RANGE_SEG64, NULL, 0, 0); | |
1b939560 BB |
1358 | range_tree_walk(trim_tree, |
1359 | vdev_trim_range_add, ta); | |
1360 | } | |
1361 | ||
1362 | mutex_exit(&msp->ms_lock); | |
1363 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
1364 | ||
1365 | /* | |
1366 | * Issue the TRIM I/Os for all ranges covered by the | |
1367 | * TRIM trees. These ranges are safe to TRIM because | |
1368 | * no new allocations will be performed until the call | |
1369 | * to metaslab_enabled() below. | |
1370 | */ | |
1371 | for (uint64_t c = 0; c < children; c++) { | |
1372 | trim_args_t *ta = &tap[c]; | |
1373 | ||
1374 | /* | |
1375 | * Always yield to a manual TRIM if one has | |
1376 | * been started for the child vdev. | |
1377 | */ | |
1378 | if (ta->trim_tree == NULL || | |
1379 | ta->trim_vdev->vdev_trim_thread != NULL) { | |
1380 | continue; | |
1381 | } | |
1382 | ||
1383 | /* | |
1384 | * After this point metaslab_enable() must be | |
1385 | * called with the sync flag set. This is done | |
1386 | * here because vdev_trim_ranges() is allowed | |
1387 | * to be interrupted (EINTR) before issuing all | |
1388 | * of the required TRIM I/Os. | |
1389 | */ | |
1390 | issued_trim = B_TRUE; | |
1391 | ||
1392 | int error = vdev_trim_ranges(ta); | |
1393 | if (error) | |
1394 | break; | |
1395 | } | |
1396 | ||
1397 | /* | |
1398 | * Verify every range which was trimmed is still | |
1399 | * contained within the ms_allocatable tree. | |
1400 | */ | |
1401 | if (zfs_flags & ZFS_DEBUG_TRIM) { | |
1402 | mutex_enter(&msp->ms_lock); | |
1403 | VERIFY0(metaslab_load(msp)); | |
1404 | VERIFY3P(tap[0].trim_msp, ==, msp); | |
1405 | range_tree_walk(trim_tree, | |
1406 | vdev_trim_range_verify, &tap[0]); | |
1407 | mutex_exit(&msp->ms_lock); | |
1408 | } | |
1409 | ||
			range_tree_vacate(trim_tree, NULL, NULL);
			range_tree_destroy(trim_tree);

			/*
			 * Wait for a couple of kicks to ensure the TRIM I/O
			 * has been synced. If the wait is aborted due to
			 * vdev_autotrim_exit_wanted, we need to signal
			 * metaslab_enable() to wait for sync.
			 */
			if (issued_trim) {
				wait_aborted = vdev_autotrim_wait_kick(vd,
				    TXG_CONCURRENT_STATES + TXG_DEFER_SIZE);
			}

			metaslab_enable(msp, wait_aborted, B_FALSE);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

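			/*
			 * Release the per-child trim trees and the argument
			 * array for this metaslab.
			 */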
			for (uint64_t c = 0; c < children; c++) {
				trim_args_t *ta = &tap[c];

				if (ta->trim_tree == NULL)
					continue;

				range_tree_vacate(ta->trim_tree, NULL, NULL);
				range_tree_destroy(ta->trim_tree);
			}

			kmem_free(tap, sizeof (trim_args_t) * children);

			if (vdev_autotrim_should_stop(vd))
				break;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		vdev_autotrim_wait_kick(vd, 1);

		shift++;
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	}

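	/*
	 * Wait for any outstanding automatic TRIM I/Os on the children to
	 * complete before dropping the config lock.
	 */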
	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		mutex_enter(&cvd->vdev_trim_io_lock);

		while (cvd->vdev_trim_inflight[1] > 0) {
			cv_wait(&cvd->vdev_trim_io_cv,
			    &cvd->vdev_trim_io_lock);
		}
		mutex_exit(&cvd->vdev_trim_io_lock);
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * When exiting because the autotrim property was set to off, abandon
	 * any unprocessed ms_trim ranges to reclaim the memory.
	 */
	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
		for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
			metaslab_t *msp = vd->vdev_ms[i];

			mutex_enter(&msp->ms_lock);
			range_tree_vacate(msp->ms_trim, NULL, NULL);
			mutex_exit(&msp->ms_lock);
		}
	}

	mutex_enter(&vd->vdev_autotrim_lock);
	ASSERT(vd->vdev_autotrim_thread != NULL);
	vd->vdev_autotrim_thread = NULL;
	cv_broadcast(&vd->vdev_autotrim_cv);
	mutex_exit(&vd->vdev_autotrim_lock);

	thread_exit();
}

/*
 * Starts an autotrim thread, if needed, for each top-level vdev which can be
 * trimmed. A top-level vdev which has been evacuated will never be trimmed.
 */
void
vdev_autotrim(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
		vdev_t *tvd = root_vd->vdev_child[i];

		mutex_enter(&tvd->vdev_autotrim_lock);
		if (vdev_writeable(tvd) && !tvd->vdev_removing &&
		    tvd->vdev_autotrim_thread == NULL &&
		    !tvd->vdev_rz_expanding) {
			ASSERT3P(tvd->vdev_top, ==, tvd);

			tvd->vdev_autotrim_thread = thread_create(NULL, 0,
			    vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
			    maxclsyspri);
			ASSERT(tvd->vdev_autotrim_thread != NULL);
		}
		mutex_exit(&tvd->vdev_autotrim_lock);
	}
}

/*
 * Wait for the vdev_autotrim_thread associated with the passed top-level
 * vdev to be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_wait(vdev_t *tvd)
{
	mutex_enter(&tvd->vdev_autotrim_lock);
	if (tvd->vdev_autotrim_thread != NULL) {
		tvd->vdev_autotrim_exit_wanted = B_TRUE;
		cv_broadcast(&tvd->vdev_autotrim_kick_cv);
		cv_wait(&tvd->vdev_autotrim_cv,
		    &tvd->vdev_autotrim_lock);

		ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
		tvd->vdev_autotrim_exit_wanted = B_FALSE;
	}
	mutex_exit(&tvd->vdev_autotrim_lock);
}

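/*
 * Wake any running autotrim threads for the pool's top-level vdevs by
 * broadcasting their per-vdev kick condition variable.
 */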
void
vdev_autotrim_kick(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	vdev_t *root_vd = spa->spa_root_vdev;
	vdev_t *tvd;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
		tvd = root_vd->vdev_child[i];

		mutex_enter(&tvd->vdev_autotrim_lock);
		if (tvd->vdev_autotrim_thread != NULL)
			cv_broadcast(&tvd->vdev_autotrim_kick_cv);
		mutex_exit(&tvd->vdev_autotrim_lock);
	}
}

/*
 * Wait for all of the vdev_autotrim_threads associated with the pool to
 * be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_all(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++)
		vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
}

/*
 * Conditionally restart all of the vdev_autotrim_threads for the pool.
 */
void
vdev_autotrim_restart(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (spa->spa_autotrim)
		vdev_autotrim(spa);
}

static __attribute__((noreturn)) void
vdev_trim_l2arc_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	l2arc_dev_t *dev = l2arc_vdev_get(vd);
	trim_args_t ta = {0};
	range_seg64_t physical_rs;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

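	/*
	 * Reset the TRIM progress and options before trimming the whole
	 * cache device.
	 */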
	vd->vdev_trim_last_offset = 0;
	vd->vdev_trim_rate = 0;
	vd->vdev_trim_partial = 0;
	vd->vdev_trim_secure = 0;

	ta.trim_vdev = vd;
	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_MANUAL;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	ta.trim_flags = 0;

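	/*
	 * A single range covering the device, from offset 0 up to its
	 * minimum asize, is added to the tree and trimmed.
	 */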
	physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
	physical_rs.rs_end = vd->vdev_trim_bytes_est =
	    vdev_get_min_asize(vd);

	range_tree_add(ta.trim_tree, physical_rs.rs_start,
	    physical_rs.rs_end - physical_rs.rs_start);

	mutex_enter(&vd->vdev_trim_lock);
	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
	mutex_exit(&vd->vdev_trim_lock);

	(void) vdev_trim_ranges(&ta);

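	/*
	 * Release the config lock and wait for any manual TRIM I/Os issued
	 * above to complete before tearing down the range tree.
	 */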
	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	range_tree_vacate(ta.trim_tree, NULL, NULL);
	range_tree_destroy(ta.trim_tree);

	mutex_enter(&vd->vdev_trim_lock);
	if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
		vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
		    vd->vdev_trim_rate, vd->vdev_trim_partial,
		    vd->vdev_trim_secure);
	}
	ASSERT(vd->vdev_trim_thread != NULL ||
	    vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);

	/*
	 * Drop the vdev_trim_lock while we sync out the txg since it's
	 * possible that a device might be trying to come online and
	 * must check to see if it needs to restart a trim. That thread
	 * will be holding the spa_config_lock which would prevent the
	 * txg_wait_synced from completing. Same strategy as in
	 * vdev_trim_thread().
	 */
	mutex_exit(&vd->vdev_trim_lock);
	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	mutex_enter(&vd->vdev_trim_lock);

	/*
	 * Update the header of the cache device here, before
	 * broadcasting vdev_trim_cv which may lead to the removal
	 * of the device. The same applies for setting l2ad_trim_all to
	 * false.
	 */
	spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
	    RW_READER);
	memset(dev->l2ad_dev_hdr, 0, dev->l2ad_dev_hdr_asize);
	l2arc_dev_hdr_update(dev);
	spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);

	vd->vdev_trim_thread = NULL;
	if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
		dev->l2ad_trim_all = B_FALSE;

	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);

	thread_exit();
}

/*
 * Starts TRIM threads for the L2ARC devices in a spa and assigns them to
 * vd->vdev_trim_thread. This facilitates trimming the whole cache device
 * using TRIM_TYPE_MANUAL upon addition to a pool, at pool creation, or
 * when the header of the device is invalid.
 */
void
vdev_trim_l2arc(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Locate the spa's l2arc devices and kick off TRIM threads.
	 */
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_t *vd = spa->spa_l2cache.sav_vdevs[i];
		l2arc_dev_t *dev = l2arc_vdev_get(vd);

		if (dev == NULL || !dev->l2ad_trim_all) {
			/*
			 * Don't attempt TRIM if the vdev is UNAVAIL or if the
			 * cache device was not marked for whole device TRIM
			 * (i.e. l2arc_trim_ahead = 0, or the L2ARC device
			 * header is valid with trim_state = VDEV_TRIM_COMPLETE
			 * and l2ad_log_entries > 0).
			 */
			continue;
		}

		mutex_enter(&vd->vdev_trim_lock);
		ASSERT(vd->vdev_ops->vdev_op_leaf);
		ASSERT(vdev_is_concrete(vd));
		ASSERT3P(vd->vdev_trim_thread, ==, NULL);
		ASSERT(!vd->vdev_detached);
		ASSERT(!vd->vdev_trim_exit_wanted);
		ASSERT(!vd->vdev_top->vdev_removing);
		vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
		vd->vdev_trim_thread = thread_create(NULL, 0,
		    vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
		mutex_exit(&vd->vdev_trim_lock);
	}
}

/*
 * A wrapper which calls vdev_trim_ranges(). It is intended to be called
 * on leaf vdevs.
 */
int
vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
{
	trim_args_t ta = {0};
	range_seg64_t physical_rs;
	int error;
	physical_rs.rs_start = start;
	physical_rs.rs_end = start + size;

	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_top->vdev_removing);
	ASSERT(!vd->vdev_top->vdev_rz_expanding);

	ta.trim_vdev = vd;
	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_SIMPLE;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	ta.trim_flags = 0;

	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

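	/*
	 * Only add a non-empty range to the tree; a zero-length request
	 * results in no TRIM I/O being issued.
	 */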
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(ta.trim_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}

	error = vdev_trim_ranges(&ta);

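	/*
	 * Wait for any TRIM_TYPE_SIMPLE I/Os issued above to complete
	 * before releasing the range tree.
	 */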
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	range_tree_vacate(ta.trim_tree, NULL, NULL);
	range_tree_destroy(ta.trim_tree);

	return (error);
}

EXPORT_SYMBOL(vdev_trim);
EXPORT_SYMBOL(vdev_trim_stop);
EXPORT_SYMBOL(vdev_trim_stop_all);
EXPORT_SYMBOL(vdev_trim_stop_wait);
EXPORT_SYMBOL(vdev_trim_restart);
EXPORT_SYMBOL(vdev_autotrim);
EXPORT_SYMBOL(vdev_autotrim_stop_all);
EXPORT_SYMBOL(vdev_autotrim_stop_wait);
EXPORT_SYMBOL(vdev_autotrim_restart);
EXPORT_SYMBOL(vdev_trim_l2arc);
EXPORT_SYMBOL(vdev_trim_simple);

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
	"Max size of TRIM commands, larger will be split");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW,
	"Min size of TRIM commands, smaller will be skipped");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW,
	"Skip metaslabs which have never been initialized");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW,
	"Min number of txgs to aggregate frees before issuing TRIM");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW,
	"Max queued TRIMs outstanding per leaf vdev");