/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/refcount.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>

/*
 * TRIM is a feature which is used to notify an SSD that some previously
 * written space is no longer allocated by the pool. This is useful because
 * writes to an SSD must be performed to blocks which have first been erased.
 * Ensuring the SSD always has a supply of erased blocks for new writes
 * helps prevent performance from deteriorating.
 *
 * There are two supported TRIM methods: manual and automatic.
 *
 * Manual TRIM:
 *
 * A manual TRIM is initiated by running the 'zpool trim' command. A single
 * 'vdev_trim' thread is created for each leaf vdev, and it is responsible
 * for managing that vdev's TRIM process. This involves iterating over all
 * the metaslabs, calculating the unallocated space ranges, and then issuing
 * the required TRIM I/Os.
 *
 * While a metaslab is being actively trimmed it is not eligible to perform
 * new allocations. After traversing all of the metaslabs the thread is
 * terminated. Finally, both the requested options and current progress of
 * the TRIM are regularly written to the pool. This allows the TRIM to be
 * suspended and resumed as needed.
 *
 * Automatic TRIM:
 *
 * An automatic TRIM is enabled by setting the 'autotrim' pool property
 * to 'on'. When enabled, a 'vdev_autotrim' thread is created for each
 * top-level (not leaf) vdev in the pool. These threads perform the same
 * core TRIM process as a manual TRIM, but with a few key differences.
 *
 * 1) Automatic TRIM happens continuously in the background and operates
 *    solely on recently freed blocks (ms_trim not ms_allocatable).
 *
 * 2) Each thread is associated with a top-level (not leaf) vdev. This
 *    simplifies the threading model, makes it easier to coordinate
 *    administrative commands, and ensures only a single metaslab is
 *    disabled at a time. Unlike manual TRIM, this means each
 *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for
 *    its children.
 *
 * 3) There is no automatic TRIM progress information stored on disk, nor
 *    is it reported by 'zpool status'.
 *
 * While the automatic TRIM process is highly effective it is more likely
 * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to
 * 'zfs_trim_extent_bytes_min' (32 KiB) are considered too small to
 * efficiently TRIM and are skipped. This means small amounts of freed
 * space may not be automatically trimmed.
 *
 * Furthermore, devices with attached hot spares and devices being actively
 * replaced are skipped. This is done to avoid adding additional stress to
 * a potentially unhealthy device and to minimize the required rebuild time.
 *
 * For this reason it may be beneficial to occasionally manually TRIM a pool
 * even when automatic TRIM is enabled.
 */
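
/*
 * For reference, both methods are driven from userspace. As an
 * illustrative (not exhaustive) set of commands, with "tank" as a
 * placeholder pool name:
 *
 *   zpool trim tank                  start a manual TRIM
 *   zpool trim -s tank               suspend an in-progress manual TRIM
 *   zpool trim -c tank               cancel an in-progress manual TRIM
 *   zpool set autotrim=on tank       enable the automatic TRIM threads
 *
 * Manual TRIM progress is reported by 'zpool status -t'.
 */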

/*
 * Maximum size of a TRIM I/O; ranges will be chunked into 128 MiB lengths.
 */
unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
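
/*
 * For example (hypothetical extent), with the default 128 MiB maximum a
 * single 300 MiB range is issued as three TRIM I/Os of 128, 128, and
 * 44 MiB. This mirrors the calculation in vdev_trim_ranges():
 *
 *   writes_required = ((size - 1) / extent_bytes_max) + 1
 *                   = ((300 MiB - 1) / 128 MiB) + 1 = 3
 */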

/*
 * Minimum size of a TRIM I/O; extents smaller than 32 KiB will be skipped.
 */
unsigned int zfs_trim_extent_bytes_min = 32 * 1024;

/*
 * Skip uninitialized metaslabs during the TRIM process. This option is
 * useful for pools constructed from large thinly-provisioned devices where
 * TRIM operations are slow. As a pool ages, an increasing fraction of
 * the pool's metaslabs will be initialized, progressively degrading the
 * usefulness of this option. This setting is stored when starting a
 * manual TRIM and will persist for the duration of the requested TRIM.
 */
unsigned int zfs_trim_metaslab_skip = 0;

/*
 * Maximum number of queued TRIM I/Os per leaf vdev. The number of
 * concurrent TRIM I/Os issued to the device is controlled by the
 * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
 */
unsigned int zfs_trim_queue_limit = 10;

/*
 * The minimum number of transaction groups between automatic trims of a
 * metaslab. This setting represents a trade-off between issuing more
 * efficient TRIM operations, by allowing them to be aggregated longer,
 * and issuing them promptly so the trimmed space is available. Note
 * that this value is a minimum; metaslabs can be trimmed less frequently
 * when there are a large number of ranges which need to be trimmed.
 *
 * Increasing this value will allow frees to be aggregated for a longer
 * time. This can result in larger TRIM operations, and increased memory
 * usage in order to track the ranges to be trimmed. Decreasing this value
 * has the opposite effect. The default value of 32 was determined through
 * testing to be a reasonable compromise.
 */
unsigned int zfs_trim_txg_batch = 32;
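
/*
 * As a worked example with hypothetical numbers: a top-level vdev with
 * 224 metaslabs and the default batch size of 32 is processed as 32
 * groups of 7 metaslabs each. One group is handled per pass of the
 * autotrim thread and at least one txg elapses between passes, so any
 * individual metaslab is revisited no more often than every 32 txgs.
 */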

/*
 * The trim_args structure describes how a leaf vdev should be trimmed.
 * The core elements are the vdev, the metaslab being trimmed and a range
 * tree containing the extents to TRIM. All provided ranges must be within
 * the metaslab.
 */
typedef struct trim_args {
        /*
         * These fields are set by the caller of vdev_trim_ranges().
         */
        vdev_t          *trim_vdev;             /* Leaf vdev to TRIM */
        metaslab_t      *trim_msp;              /* Disabled metaslab */
        range_tree_t    *trim_tree;             /* TRIM ranges (in metaslab) */
        trim_type_t     trim_type;              /* Manual or auto TRIM */
        uint64_t        trim_extent_bytes_max;  /* Maximum TRIM I/O size */
        uint64_t        trim_extent_bytes_min;  /* Minimum TRIM I/O size */
        enum trim_flag  trim_flags;             /* TRIM flags (secure) */

        /*
         * These fields are updated by vdev_trim_ranges().
         */
        hrtime_t        trim_start_time;        /* Start time */
        uint64_t        trim_bytes_done;        /* Bytes trimmed */
} trim_args_t;
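
/*
 * A minimal sketch of how a caller fills in trim_args before invoking
 * vdev_trim_ranges(), mirroring vdev_trim_thread() below (the values
 * shown are the defaults, not requirements):
 *
 *   trim_args_t ta = { 0 };
 *   ta.trim_vdev = vd;                          leaf vdev to TRIM
 *   ta.trim_tree = range_tree_create(NULL, NULL);
 *   ta.trim_type = TRIM_TYPE_MANUAL;
 *   ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 *   ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
 *   ta.trim_flags = 0;
 */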

/*
 * Determines whether a vdev_trim_thread() should be stopped.
 */
static boolean_t
vdev_trim_should_stop(vdev_t *vd)
{
        return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
            vd->vdev_detached || vd->vdev_top->vdev_removing);
}

/*
 * Determines whether a vdev_autotrim_thread() should be stopped.
 */
static boolean_t
vdev_autotrim_should_stop(vdev_t *tvd)
{
        return (tvd->vdev_autotrim_exit_wanted ||
            !vdev_writeable(tvd) || tvd->vdev_removing ||
            spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
}

/*
 * The sync task for updating the on-disk state of a manual TRIM. This
 * is scheduled by vdev_trim_change_state().
 */
static void
vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
{
        /*
         * We pass in the guid instead of the vdev_t since the vdev may
         * have been freed prior to the sync task being processed. This
         * happens when a vdev is detached as we call spa_config_vdev_exit(),
         * stop the trimming thread, schedule the sync task, and free
         * the vdev. Later when the scheduled sync task is invoked, it would
         * find that the vdev has been freed.
         */
        uint64_t guid = *(uint64_t *)arg;
        uint64_t txg = dmu_tx_get_txg(tx);
        kmem_free(arg, sizeof (uint64_t));

        vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
        if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
                return;

        uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
        vd->vdev_trim_offset[txg & TXG_MASK] = 0;

        VERIFY3U(vd->vdev_leaf_zap, !=, 0);

        objset_t *mos = vd->vdev_spa->spa_meta_objset;

        if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {
                if (vd->vdev_trim_last_offset == UINT64_MAX)
                        last_offset = 0;

                vd->vdev_trim_last_offset = last_offset;
                VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
                    VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
                    sizeof (last_offset), 1, &last_offset, tx));
        }

        if (vd->vdev_trim_action_time > 0) {
                uint64_t val = (uint64_t)vd->vdev_trim_action_time;
                VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
                    VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
                    1, &val, tx));
        }

        if (vd->vdev_trim_rate > 0) {
                uint64_t rate = (uint64_t)vd->vdev_trim_rate;

                if (rate == UINT64_MAX)
                        rate = 0;

                VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
                    VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
        }

        uint64_t partial = vd->vdev_trim_partial;
        if (partial == UINT64_MAX)
                partial = 0;

        VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
            sizeof (partial), 1, &partial, tx));

        uint64_t secure = vd->vdev_trim_secure;
        if (secure == UINT64_MAX)
                secure = 0;

        VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
            sizeof (secure), 1, &secure, tx));

        uint64_t trim_state = vd->vdev_trim_state;
        VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
            sizeof (trim_state), 1, &trim_state, tx));
}

/*
 * Update the on-disk state of a manual TRIM. This is called to request
 * that a TRIM be started/suspended/canceled, or to change one of the
 * TRIM options (partial, secure, rate).
 */
static void
vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
    uint64_t rate, boolean_t partial, boolean_t secure)
{
        ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
        spa_t *spa = vd->vdev_spa;

        if (new_state == vd->vdev_trim_state)
                return;

        /*
         * Copy the vd's guid; it will be freed by the sync task.
         */
        uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
        *guid = vd->vdev_guid;

        /*
         * If we're suspending, then preserve the original start time.
         */
        if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
                vd->vdev_trim_action_time = gethrestime_sec();
        }

        /*
         * If we're activating, then preserve the requested rate and trim
         * method. Setting the last offset and rate to UINT64_MAX is used
         * as a sentinel to indicate they should be reset to default values.
         */
        if (new_state == VDEV_TRIM_ACTIVE) {
                if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
                    vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
                        vd->vdev_trim_last_offset = UINT64_MAX;
                        vd->vdev_trim_rate = UINT64_MAX;
                        vd->vdev_trim_partial = UINT64_MAX;
                        vd->vdev_trim_secure = UINT64_MAX;
                }

                if (rate != 0)
                        vd->vdev_trim_rate = rate;

                if (partial != 0)
                        vd->vdev_trim_partial = partial;

                if (secure != 0)
                        vd->vdev_trim_secure = secure;
        }

        boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
        vd->vdev_trim_state = new_state;

        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
            guid, 2, ZFS_SPACE_CHECK_NONE, tx);

        switch (new_state) {
        case VDEV_TRIM_ACTIVE:
                spa_event_notify(spa, vd, NULL,
                    resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
                spa_history_log_internal(spa, "trim", tx,
                    "vdev=%s activated", vd->vdev_path);
                break;
        case VDEV_TRIM_SUSPENDED:
                spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
                spa_history_log_internal(spa, "trim", tx,
                    "vdev=%s suspended", vd->vdev_path);
                break;
        case VDEV_TRIM_CANCELED:
                spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
                spa_history_log_internal(spa, "trim", tx,
                    "vdev=%s canceled", vd->vdev_path);
                break;
        case VDEV_TRIM_COMPLETE:
                spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
                spa_history_log_internal(spa, "trim", tx,
                    "vdev=%s complete", vd->vdev_path);
                break;
        default:
                panic("invalid state %llu", (unsigned long long)new_state);
        }

        dmu_tx_commit(tx);
}

/*
 * The zio_done_func_t done callback for each manual TRIM issued. It is
 * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
 * and limiting the number of in flight TRIM I/Os.
 */
static void
vdev_trim_cb(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;

        mutex_enter(&vd->vdev_trim_io_lock);
        if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
                /*
                 * The I/O failed because the vdev was unavailable; roll the
                 * last offset back. (This works because spa_sync waits on
                 * spa_txg_zio before it runs sync tasks.)
                 */
                uint64_t *offset =
                    &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
                *offset = MIN(*offset, zio->io_offset);
        } else {
                if (zio->io_error != 0) {
                        vd->vdev_stat.vs_trim_errors++;
                        spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
                            0, 0, 0, 0, 1, zio->io_orig_size);
                } else {
                        spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
                            1, zio->io_orig_size, 0, 0, 0, 0);
                }

                vd->vdev_trim_bytes_done += zio->io_orig_size;
        }

        ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
        vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
        cv_broadcast(&vd->vdev_trim_io_cv);
        mutex_exit(&vd->vdev_trim_io_lock);

        spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * The zio_done_func_t done callback for each automatic TRIM issued. It
 * is responsible for updating the TRIM stats and limiting the number of
 * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are
 * never reissued on failure.
 */
static void
vdev_autotrim_cb(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;

        mutex_enter(&vd->vdev_trim_io_lock);

        if (zio->io_error != 0) {
                vd->vdev_stat.vs_trim_errors++;
                spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
                    0, 0, 0, 0, 1, zio->io_orig_size);
        } else {
                spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
                    1, zio->io_orig_size, 0, 0, 0, 0);
        }

        ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
        vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--;
        cv_broadcast(&vd->vdev_trim_io_cv);
        mutex_exit(&vd->vdev_trim_io_lock);

        spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * Returns the average trim rate in bytes/sec for the ta->trim_vdev.
 */
static uint64_t
vdev_trim_calculate_rate(trim_args_t *ta)
{
        return (ta->trim_bytes_done * 1000 /
            (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
}
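
/*
 * For example (hypothetical numbers), trimming 512 MiB in 4 seconds
 * yields 512 MiB * 1000 / (4000 ms + 1), or roughly 128 MiB/s. The +1
 * in the divisor only guards against division by zero when the elapsed
 * time rounds down to 0 ms.
 */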

/*
 * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
 * and number of concurrent TRIM I/Os.
 */
static int
vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
{
        vdev_t *vd = ta->trim_vdev;
        spa_t *spa = vd->vdev_spa;

        mutex_enter(&vd->vdev_trim_io_lock);

        /*
         * Limit manual TRIM I/Os to the requested rate. This does not
         * apply to automatic TRIM since no per-vdev rate can be specified.
         */
        if (ta->trim_type == TRIM_TYPE_MANUAL) {
                while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
                    vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
                        cv_timedwait_sig(&vd->vdev_trim_io_cv,
                            &vd->vdev_trim_io_lock, ddi_get_lbolt() +
                            MSEC_TO_TICK(10));
                }
        }
        ta->trim_bytes_done += size;

        /* Limit in flight trimming I/Os */
        while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >=
            zfs_trim_queue_limit) {
                cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
        }
        vd->vdev_trim_inflight[ta->trim_type]++;
        mutex_exit(&vd->vdev_trim_io_lock);

        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        uint64_t txg = dmu_tx_get_txg(tx);

        spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
        mutex_enter(&vd->vdev_trim_lock);

        if (ta->trim_type == TRIM_TYPE_MANUAL &&
            vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
                uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
                *guid = vd->vdev_guid;

                /* This is the first write of this txg. */
                dsl_sync_task_nowait(spa_get_dsl(spa),
                    vdev_trim_zap_update_sync, guid, 2,
                    ZFS_SPACE_CHECK_RESERVED, tx);
        }

        /*
         * We know the vdev_t will still be around since all consumers of
         * vdev_free must stop the trimming first.
         */
        if ((ta->trim_type == TRIM_TYPE_MANUAL &&
            vdev_trim_should_stop(vd)) ||
            (ta->trim_type == TRIM_TYPE_AUTO &&
            vdev_autotrim_should_stop(vd->vdev_top))) {
                mutex_enter(&vd->vdev_trim_io_lock);
                vd->vdev_trim_inflight[ta->trim_type]--;
                mutex_exit(&vd->vdev_trim_io_lock);
                spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
                mutex_exit(&vd->vdev_trim_lock);
                dmu_tx_commit(tx);
                return (SET_ERROR(EINTR));
        }
        mutex_exit(&vd->vdev_trim_lock);

        if (ta->trim_type == TRIM_TYPE_MANUAL)
                vd->vdev_trim_offset[txg & TXG_MASK] = start + size;

        zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
            start, size, ta->trim_type == TRIM_TYPE_MANUAL ?
            vdev_trim_cb : vdev_autotrim_cb, NULL,
            ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags));
        /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */

        dmu_tx_commit(tx);

        return (0);
}

/*
 * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
 * Additional parameters describing how the TRIM should be performed must
 * be set in the trim_args structure. See the trim_args definition for
 * additional information.
 */
static int
vdev_trim_ranges(trim_args_t *ta)
{
        vdev_t *vd = ta->trim_vdev;
        avl_tree_t *rt = &ta->trim_tree->rt_root;
        uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
        uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
        spa_t *spa = vd->vdev_spa;

        ta->trim_start_time = gethrtime();
        ta->trim_bytes_done = 0;

        for (range_seg_t *rs = avl_first(rt); rs != NULL;
            rs = AVL_NEXT(rt, rs)) {
                uint64_t size = rs->rs_end - rs->rs_start;

                if (extent_bytes_min && size < extent_bytes_min) {
                        spa_iostats_trim_add(spa, ta->trim_type,
                            0, 0, 1, size, 0, 0);
                        continue;
                }

                /* Split range into legally-sized physical chunks */
                uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;

                for (uint64_t w = 0; w < writes_required; w++) {
                        int error;

                        error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
                            rs->rs_start + (w * extent_bytes_max),
                            MIN(size - (w * extent_bytes_max),
                            extent_bytes_max));
                        if (error != 0) {
                                return (error);
                        }
                }
        }

        return (0);
}

/*
 * Calculates the completion percentage of a manual TRIM.
 */
static void
vdev_trim_calculate_progress(vdev_t *vd)
{
        ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
            spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
        ASSERT(vd->vdev_leaf_zap != 0);

        vd->vdev_trim_bytes_est = 0;
        vd->vdev_trim_bytes_done = 0;

        for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
                metaslab_t *msp = vd->vdev_top->vdev_ms[i];
                mutex_enter(&msp->ms_lock);

                uint64_t ms_free = msp->ms_size -
                    metaslab_allocated_space(msp);

                if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
                        ms_free /= vd->vdev_top->vdev_children;

                /*
                 * Convert the metaslab range to a physical range
                 * on our vdev. We use this to determine if we are
                 * in the middle of this metaslab range.
                 */
                range_seg_t logical_rs, physical_rs;
                logical_rs.rs_start = msp->ms_start;
                logical_rs.rs_end = msp->ms_start + msp->ms_size;
                vdev_xlate(vd, &logical_rs, &physical_rs);

                if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
                        vd->vdev_trim_bytes_est += ms_free;
                        mutex_exit(&msp->ms_lock);
                        continue;
                } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
                        vd->vdev_trim_bytes_done += ms_free;
                        vd->vdev_trim_bytes_est += ms_free;
                        mutex_exit(&msp->ms_lock);
                        continue;
                }

                /*
                 * If we get here, we're in the middle of trimming this
                 * metaslab. Load it and walk the free tree for more
                 * accurate progress estimation.
                 */
                VERIFY0(metaslab_load(msp));

                for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
                    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
                        logical_rs.rs_start = rs->rs_start;
                        logical_rs.rs_end = rs->rs_end;
                        vdev_xlate(vd, &logical_rs, &physical_rs);

                        uint64_t size = physical_rs.rs_end -
                            physical_rs.rs_start;
                        vd->vdev_trim_bytes_est += size;
                        if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
                                vd->vdev_trim_bytes_done += size;
                        } else if (vd->vdev_trim_last_offset >
                            physical_rs.rs_start &&
                            vd->vdev_trim_last_offset <=
                            physical_rs.rs_end) {
                                vd->vdev_trim_bytes_done +=
                                    vd->vdev_trim_last_offset -
                                    physical_rs.rs_start;
                        }
                }
                mutex_exit(&msp->ms_lock);
        }
}

/*
 * Load from disk the vdev's manual TRIM information. This includes the
 * state, progress, and options provided when initiating the manual TRIM.
 */
static int
vdev_trim_load(vdev_t *vd)
{
        int err = 0;
        ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
            spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
        ASSERT(vd->vdev_leaf_zap != 0);

        if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
            vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
                err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
                    sizeof (vd->vdev_trim_last_offset), 1,
                    &vd->vdev_trim_last_offset);
                if (err == ENOENT) {
                        vd->vdev_trim_last_offset = 0;
                        err = 0;
                }

                if (err == 0) {
                        err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                            vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
                            sizeof (vd->vdev_trim_rate), 1,
                            &vd->vdev_trim_rate);
                        if (err == ENOENT) {
                                vd->vdev_trim_rate = 0;
                                err = 0;
                        }
                }

                if (err == 0) {
                        err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                            vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
                            sizeof (vd->vdev_trim_partial), 1,
                            &vd->vdev_trim_partial);
                        if (err == ENOENT) {
                                vd->vdev_trim_partial = 0;
                                err = 0;
                        }
                }

                if (err == 0) {
                        err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                            vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
                            sizeof (vd->vdev_trim_secure), 1,
                            &vd->vdev_trim_secure);
                        if (err == ENOENT) {
                                vd->vdev_trim_secure = 0;
                                err = 0;
                        }
                }
        }

        vdev_trim_calculate_progress(vd);

        return (err);
}

/*
 * Convert the logical range into a physical range and add it to the
 * range tree passed in the trim_args_t.
 */
static void
vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
        trim_args_t *ta = arg;
        vdev_t *vd = ta->trim_vdev;
        range_seg_t logical_rs, physical_rs;
        logical_rs.rs_start = start;
        logical_rs.rs_end = start + size;

        /*
         * Every range to be trimmed must be part of ms_allocatable.
         * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
         * is always the case.
         */
        if (zfs_flags & ZFS_DEBUG_TRIM) {
                metaslab_t *msp = ta->trim_msp;
                VERIFY0(metaslab_load(msp));
                VERIFY3B(msp->ms_loaded, ==, B_TRUE);
                VERIFY(range_tree_find(msp->ms_allocatable, start, size));
        }

        ASSERT(vd->vdev_ops->vdev_op_leaf);
        vdev_xlate(vd, &logical_rs, &physical_rs);

        IMPLY(vd->vdev_top == vd,
            logical_rs.rs_start == physical_rs.rs_start);
        IMPLY(vd->vdev_top == vd,
            logical_rs.rs_end == physical_rs.rs_end);

        /*
         * Only a manual trim will be traversing the vdev sequentially.
         * For an auto trim all valid ranges should be added.
         */
        if (ta->trim_type == TRIM_TYPE_MANUAL) {

                /* Only add segments that we have not visited yet */
                if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
                        return;

                /* Pick up where we left off mid-range. */
                if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
                        ASSERT3U(physical_rs.rs_end, >,
                            vd->vdev_trim_last_offset);
                        physical_rs.rs_start = vd->vdev_trim_last_offset;
                }
        }

        ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

        /*
         * With raidz, it's possible that the logical range does not live on
         * this leaf vdev. We only add the physical range to this vdev's tree
         * if it has a length greater than 0.
         */
        if (physical_rs.rs_end > physical_rs.rs_start) {
                range_tree_add(ta->trim_tree, physical_rs.rs_start,
                    physical_rs.rs_end - physical_rs.rs_start);
        } else {
                ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
        }
}

/*
 * Each manual TRIM thread is responsible for trimming the unallocated
 * space of a single leaf vdev. This is accomplished by sequentially
 * iterating over its top-level metaslabs and issuing TRIM I/O for the
 * space described by its ms_allocatable. While a metaslab is undergoing
 * trimming it is not eligible for new allocations.
 */
static void
vdev_trim_thread(void *arg)
{
        vdev_t *vd = arg;
        spa_t *spa = vd->vdev_spa;
        trim_args_t ta;
        int error = 0;

        /*
         * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
         * vdev_trim(). Wait for the updated values to be reflected
         * in the zap in order to start with the requested settings.
         */
        txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);

        ASSERT(vdev_is_concrete(vd));
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

        vd->vdev_trim_last_offset = 0;
        vd->vdev_trim_rate = 0;
        vd->vdev_trim_partial = 0;
        vd->vdev_trim_secure = 0;

        VERIFY0(vdev_trim_load(vd));

        ta.trim_vdev = vd;
        ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
        ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
        ta.trim_tree = range_tree_create(NULL, NULL);
        ta.trim_type = TRIM_TYPE_MANUAL;
        ta.trim_flags = 0;

        /*
         * When a secure TRIM has been requested, infer that the intent
         * is that everything must be trimmed. Override the default
         * minimum TRIM size to prevent ranges from being skipped.
         */
        if (vd->vdev_trim_secure) {
                ta.trim_flags |= ZIO_TRIM_SECURE;
                ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
        }

        uint64_t ms_count = 0;
        for (uint64_t i = 0; !vd->vdev_detached &&
            i < vd->vdev_top->vdev_ms_count; i++) {
                metaslab_t *msp = vd->vdev_top->vdev_ms[i];

                /*
                 * If we've expanded the top-level vdev or it's our
                 * first pass, calculate our progress.
                 */
                if (vd->vdev_top->vdev_ms_count != ms_count) {
                        vdev_trim_calculate_progress(vd);
                        ms_count = vd->vdev_top->vdev_ms_count;
                }

                spa_config_exit(spa, SCL_CONFIG, FTAG);
                metaslab_disable(msp);
                mutex_enter(&msp->ms_lock);
                VERIFY0(metaslab_load(msp));

                /*
                 * If a partial TRIM was requested, skip metaslabs which have
                 * never been initialized and thus have never been written.
                 */
                if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
                        mutex_exit(&msp->ms_lock);
                        metaslab_enable(msp, B_FALSE);
                        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
                        vdev_trim_calculate_progress(vd);
                        continue;
                }

                ta.trim_msp = msp;
                range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta);
                range_tree_vacate(msp->ms_trim, NULL, NULL);
                mutex_exit(&msp->ms_lock);

                error = vdev_trim_ranges(&ta);
                metaslab_enable(msp, B_TRUE);
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

                range_tree_vacate(ta.trim_tree, NULL, NULL);
                if (error != 0)
                        break;
        }

        spa_config_exit(spa, SCL_CONFIG, FTAG);
        mutex_enter(&vd->vdev_trim_io_lock);
        while (vd->vdev_trim_inflight[0] > 0) {
                cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
        }
        mutex_exit(&vd->vdev_trim_io_lock);

        range_tree_destroy(ta.trim_tree);

        mutex_enter(&vd->vdev_trim_lock);
        if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
                vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
                    vd->vdev_trim_rate, vd->vdev_trim_partial,
                    vd->vdev_trim_secure);
        }
        ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);

        /*
         * Drop the vdev_trim_lock while we sync out the txg since it's
         * possible that a device might be trying to come online and must
         * check to see if it needs to restart a trim. That thread will be
         * holding the spa_config_lock which would prevent the txg_wait_synced
         * from completing.
         */
        mutex_exit(&vd->vdev_trim_lock);
        txg_wait_synced(spa_get_dsl(spa), 0);
        mutex_enter(&vd->vdev_trim_lock);

        vd->vdev_trim_thread = NULL;
        cv_broadcast(&vd->vdev_trim_cv);
        mutex_exit(&vd->vdev_trim_lock);
}

/*
 * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock;
 * the vdev_t must be a leaf and must not already be manually trimming.
 */
void
vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
{
        ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
        ASSERT(vd->vdev_ops->vdev_op_leaf);
        ASSERT(vdev_is_concrete(vd));
        ASSERT3P(vd->vdev_trim_thread, ==, NULL);
        ASSERT(!vd->vdev_detached);
        ASSERT(!vd->vdev_trim_exit_wanted);
        ASSERT(!vd->vdev_top->vdev_removing);

        vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
        vd->vdev_trim_thread = thread_create(NULL, 0,
            vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}
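
/*
 * A minimal sketch of the expected calling convention (error handling
 * omitted; the argument values are placeholders):
 *
 *   mutex_enter(&vd->vdev_trim_lock);
 *   vdev_trim(vd, 0, B_FALSE, B_FALSE);   unchanged rate, full, non-secure
 *   mutex_exit(&vd->vdev_trim_lock);
 */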

/*
 * Wait for the trimming thread to be terminated (canceled or stopped).
 */
static void
vdev_trim_stop_wait_impl(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));

        while (vd->vdev_trim_thread != NULL)
                cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);

        ASSERT3P(vd->vdev_trim_thread, ==, NULL);
        vd->vdev_trim_exit_wanted = B_FALSE;
}

/*
 * Wait for the vdev trim threads on the provided list to cleanly exit.
 */
void
vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
{
        vdev_t *vd;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        while ((vd = list_remove_head(vd_list)) != NULL) {
                mutex_enter(&vd->vdev_trim_lock);
                vdev_trim_stop_wait_impl(vd);
                mutex_exit(&vd->vdev_trim_lock);
        }
}

/*
 * Stop trimming a device, with the resultant trimming state being tgt_state.
 * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is
 * provided the stopping vdev is inserted into the list. Callers are then
 * required to call vdev_trim_stop_wait() to block for all the trim threads
 * to exit. The caller must hold vdev_trim_lock and must not be writing to
 * the spa config, as the trimming thread may try to enter the config as a
 * reader before exiting.
 */
void
vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
{
        ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
        ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
        ASSERT(vd->vdev_ops->vdev_op_leaf);
        ASSERT(vdev_is_concrete(vd));

        /*
         * Allow cancel requests to proceed even if the trim thread has
         * stopped.
         */
        if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
                return;

        vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
        vd->vdev_trim_exit_wanted = B_TRUE;

        if (vd_list == NULL) {
                vdev_trim_stop_wait_impl(vd);
        } else {
                ASSERT(MUTEX_HELD(&spa_namespace_lock));
                list_insert_tail(vd_list, vd);
        }
}

/*
 * Requests that all leaf vdevs below the given vdev stop trimming and
 * adds them to the provided list for the caller to wait on.
 */
static void
vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
    list_t *vd_list)
{
        if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
                mutex_enter(&vd->vdev_trim_lock);
                vdev_trim_stop(vd, tgt_state, vd_list);
                mutex_exit(&vd->vdev_trim_lock);
                return;
        }

        for (uint64_t i = 0; i < vd->vdev_children; i++) {
                vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state,
                    vd_list);
        }
}

/*
 * Convenience function to stop trimming of a vdev tree and set all trim
 * thread pointers to NULL.
 */
void
vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
{
        spa_t *spa = vd->vdev_spa;
        list_t vd_list;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        list_create(&vd_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_trim_node));

        vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
        vdev_trim_stop_wait(spa, &vd_list);

        if (vd->vdev_spa->spa_sync_on) {
                /* Make sure that our state has been synced to disk */
                txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
        }

        list_destroy(&vd_list);
}

/*
 * Conditionally restarts a manual TRIM given its on-disk state.
 */
void
vdev_trim_restart(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

        if (vd->vdev_leaf_zap != 0) {
                mutex_enter(&vd->vdev_trim_lock);
                uint64_t trim_state = VDEV_TRIM_NONE;
                int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
                    sizeof (trim_state), 1, &trim_state);
                ASSERT(err == 0 || err == ENOENT);
                vd->vdev_trim_state = trim_state;

                uint64_t timestamp = 0;
                err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
                    sizeof (timestamp), 1, &timestamp);
                ASSERT(err == 0 || err == ENOENT);
                vd->vdev_trim_action_time = (time_t)timestamp;

                if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
                    vd->vdev_offline) {
                        /* load progress for reporting, but don't resume */
                        VERIFY0(vdev_trim_load(vd));
                } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
                    vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
                    vd->vdev_trim_thread == NULL) {
                        VERIFY0(vdev_trim_load(vd));
                        vdev_trim(vd, vd->vdev_trim_rate,
                            vd->vdev_trim_partial, vd->vdev_trim_secure);
                }

                mutex_exit(&vd->vdev_trim_lock);
        }

        for (uint64_t i = 0; i < vd->vdev_children; i++) {
                vdev_trim_restart(vd->vdev_child[i]);
        }
}

/*
 * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
 * every TRIM range is contained within ms_allocatable.
 */
static void
vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
{
        trim_args_t *ta = arg;
        metaslab_t *msp = ta->trim_msp;

        VERIFY3B(msp->ms_loaded, ==, B_TRUE);
        VERIFY3U(msp->ms_disabled, >, 0);
        VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL);
}

/*
 * Each automatic TRIM thread is responsible for managing the trimming of a
 * top-level vdev in the pool. No automatic TRIM state is maintained on-disk.
 *
 * N.B. This behavior is different from a manual TRIM where a thread
 * is created for each leaf vdev, instead of each top-level vdev.
 */
static void
vdev_autotrim_thread(void *arg)
{
        vdev_t *vd = arg;
        spa_t *spa = vd->vdev_spa;
        int shift = 0;

        mutex_enter(&vd->vdev_autotrim_lock);
        ASSERT3P(vd->vdev_top, ==, vd);
        ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
        mutex_exit(&vd->vdev_autotrim_lock);
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

        uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
        uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;

        while (!vdev_autotrim_should_stop(vd)) {
                int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
                boolean_t issued_trim = B_FALSE;

                /*
                 * All of the metaslabs are divided into groups of size
                 * num_metaslabs / zfs_trim_txg_batch. Each of these groups
                 * is composed of metaslabs which are spread evenly over the
                 * device.
                 *
                 * For example, when zfs_trim_txg_batch = 32 (default) then
                 * group 0 will contain metaslabs 0, 32, 64, ...;
                 * group 1 will contain metaslabs 1, 33, 65, ...;
                 * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
                 *
                 * On each pass through the while() loop one of these groups
                 * is selected. This is accomplished by using a shift value
                 * to select the starting metaslab, then striding over the
                 * metaslabs using the zfs_trim_txg_batch size. This is
                 * done to accomplish two things.
                 *
                 * 1) By dividing the metaslabs into groups, and making sure
                 *    that each group takes a minimum of one txg to process,
                 *    zfs_trim_txg_batch controls the minimum number of
                 *    txgs which must occur before a metaslab is revisited.
                 *
                 * 2) Selecting non-consecutive metaslabs distributes the
                 *    TRIM commands for a group evenly over the entire device.
                 *    This can be advantageous for certain types of devices.
                 */
                for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
                    i += txgs_per_trim) {
                        metaslab_t *msp = vd->vdev_ms[i];
                        range_tree_t *trim_tree;

                        spa_config_exit(spa, SCL_CONFIG, FTAG);
                        metaslab_disable(msp);
                        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

                        mutex_enter(&msp->ms_lock);

                        /*
                         * Skip the metaslab when it has never been allocated
                         * or when there are no recent frees to trim.
                         */
                        if (msp->ms_sm == NULL ||
                            range_tree_is_empty(msp->ms_trim)) {
                                mutex_exit(&msp->ms_lock);
                                metaslab_enable(msp, B_FALSE);
                                continue;
                        }

                        /*
                         * Skip the metaslab when it has already been disabled.
                         * This may happen when a manual TRIM or initialize
                         * operation is running concurrently. In the case
                         * of a manual TRIM, the ms_trim tree will have been
                         * vacated. Only ranges added after the manual TRIM
                         * disabled the metaslab will be included in the tree.
                         * These will be processed when the automatic TRIM
                         * next revisits this metaslab.
                         */
                        if (msp->ms_disabled > 1) {
                                mutex_exit(&msp->ms_lock);
                                metaslab_enable(msp, B_FALSE);
                                continue;
                        }

                        /*
                         * Allocate an empty range tree which is swapped in
                         * for the existing ms_trim tree while it is processed.
                         */
                        trim_tree = range_tree_create(NULL, NULL);
                        range_tree_swap(&msp->ms_trim, &trim_tree);
                        ASSERT(range_tree_is_empty(msp->ms_trim));

                        /*
                         * There are two cases when constructing the per-vdev
                         * trim trees for a metaslab. If the top-level vdev
                         * has no children then it is also a leaf and should
                         * be trimmed. Otherwise our children are the leaves
                         * and a trim tree should be constructed for each.
                         */
                        trim_args_t *tap;
                        uint64_t children = vd->vdev_children;
                        if (children == 0) {
                                children = 1;
                                tap = kmem_zalloc(sizeof (trim_args_t) *
                                    children, KM_SLEEP);
                                tap[0].trim_vdev = vd;
                        } else {
                                tap = kmem_zalloc(sizeof (trim_args_t) *
                                    children, KM_SLEEP);

                                for (uint64_t c = 0; c < children; c++) {
                                        tap[c].trim_vdev = vd->vdev_child[c];
                                }
                        }

                        for (uint64_t c = 0; c < children; c++) {
                                trim_args_t *ta = &tap[c];
                                vdev_t *cvd = ta->trim_vdev;

                                ta->trim_msp = msp;
                                ta->trim_extent_bytes_max = extent_bytes_max;
                                ta->trim_extent_bytes_min = extent_bytes_min;
                                ta->trim_type = TRIM_TYPE_AUTO;
                                ta->trim_flags = 0;

                                if (cvd->vdev_detached ||
                                    !vdev_writeable(cvd) ||
                                    !cvd->vdev_has_trim ||
                                    cvd->vdev_trim_thread != NULL) {
                                        continue;
                                }

                                /*
                                 * When a device has an attached hot spare, or
                                 * is being replaced it will not be trimmed.
                                 * This is done to avoid adding additional
                                 * stress to a potentially unhealthy device,
                                 * and to minimize the required rebuild time.
                                 */
                                if (!cvd->vdev_ops->vdev_op_leaf)
                                        continue;

                                ta->trim_tree = range_tree_create(NULL, NULL);
                                range_tree_walk(trim_tree,
                                    vdev_trim_range_add, ta);
                        }

                        mutex_exit(&msp->ms_lock);
                        spa_config_exit(spa, SCL_CONFIG, FTAG);

                        /*
                         * Issue the TRIM I/Os for all ranges covered by the
                         * TRIM trees. These ranges are safe to TRIM because
                         * no new allocations will be performed until the call
                         * to metaslab_enable() below.
                         */
                        for (uint64_t c = 0; c < children; c++) {
                                trim_args_t *ta = &tap[c];

                                /*
                                 * Always yield to a manual TRIM if one has
                                 * been started for the child vdev.
                                 */
                                if (ta->trim_tree == NULL ||
                                    ta->trim_vdev->vdev_trim_thread != NULL) {
                                        continue;
                                }

                                /*
                                 * After this point metaslab_enable() must be
                                 * called with the sync flag set. This is done
                                 * here because vdev_trim_ranges() is allowed
                                 * to be interrupted (EINTR) before issuing all
                                 * of the required TRIM I/Os.
                                 */
                                issued_trim = B_TRUE;

                                int error = vdev_trim_ranges(ta);
                                if (error)
                                        break;
                        }

                        /*
                         * Verify every range which was trimmed is still
                         * contained within the ms_allocatable tree.
                         */
                        if (zfs_flags & ZFS_DEBUG_TRIM) {
                                mutex_enter(&msp->ms_lock);
                                VERIFY0(metaslab_load(msp));
                                VERIFY3P(tap[0].trim_msp, ==, msp);
                                range_tree_walk(trim_tree,
                                    vdev_trim_range_verify, &tap[0]);
                                mutex_exit(&msp->ms_lock);
                        }

                        range_tree_vacate(trim_tree, NULL, NULL);
                        range_tree_destroy(trim_tree);

                        metaslab_enable(msp, issued_trim);
                        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

                        for (uint64_t c = 0; c < children; c++) {
                                trim_args_t *ta = &tap[c];

                                if (ta->trim_tree == NULL)
                                        continue;

                                range_tree_vacate(ta->trim_tree, NULL, NULL);
                                range_tree_destroy(ta->trim_tree);
                        }

                        kmem_free(tap, sizeof (trim_args_t) * children);
                }

                spa_config_exit(spa, SCL_CONFIG, FTAG);

                /*
                 * After completing the group of metaslabs wait for the next
                 * open txg. This is done to make sure that a minimum of
                 * zfs_trim_txg_batch txgs will occur before these metaslabs
                 * are trimmed again.
                 */
                txg_wait_open(spa_get_dsl(spa), 0, issued_trim);

                shift++;
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
        }

        for (uint64_t c = 0; c < vd->vdev_children; c++) {
                vdev_t *cvd = vd->vdev_child[c];
                mutex_enter(&cvd->vdev_trim_io_lock);

                while (cvd->vdev_trim_inflight[1] > 0) {
                        cv_wait(&cvd->vdev_trim_io_cv,
                            &cvd->vdev_trim_io_lock);
                }
                mutex_exit(&cvd->vdev_trim_io_lock);
        }

        spa_config_exit(spa, SCL_CONFIG, FTAG);

        /*
         * When exiting because the autotrim property was set to off, abandon
         * any unprocessed ms_trim ranges to reclaim the memory.
         */
        if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
                for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
                        metaslab_t *msp = vd->vdev_ms[i];

                        mutex_enter(&msp->ms_lock);
                        range_tree_vacate(msp->ms_trim, NULL, NULL);
                        mutex_exit(&msp->ms_lock);
                }
        }

        mutex_enter(&vd->vdev_autotrim_lock);
        ASSERT(vd->vdev_autotrim_thread != NULL);
        vd->vdev_autotrim_thread = NULL;
        cv_broadcast(&vd->vdev_autotrim_cv);
        mutex_exit(&vd->vdev_autotrim_lock);
}

/*
 * Starts an autotrim thread, if needed, for each top-level vdev which can be
 * trimmed. A top-level vdev which has been evacuated will never be trimmed.
 */
void
vdev_autotrim(spa_t *spa)
{
        vdev_t *root_vd = spa->spa_root_vdev;

        for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
                vdev_t *tvd = root_vd->vdev_child[i];

                mutex_enter(&tvd->vdev_autotrim_lock);
                if (vdev_writeable(tvd) && !tvd->vdev_removing &&
                    tvd->vdev_autotrim_thread == NULL) {
                        ASSERT3P(tvd->vdev_top, ==, tvd);

                        tvd->vdev_autotrim_thread = thread_create(NULL, 0,
                            vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
                            maxclsyspri);
                        ASSERT(tvd->vdev_autotrim_thread != NULL);
                }
                mutex_exit(&tvd->vdev_autotrim_lock);
        }
}

/*
 * Wait for the vdev_autotrim_thread associated with the passed top-level
 * vdev to be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_wait(vdev_t *tvd)
{
        mutex_enter(&tvd->vdev_autotrim_lock);
        if (tvd->vdev_autotrim_thread != NULL) {
                tvd->vdev_autotrim_exit_wanted = B_TRUE;

                while (tvd->vdev_autotrim_thread != NULL) {
                        cv_wait(&tvd->vdev_autotrim_cv,
                            &tvd->vdev_autotrim_lock);
                }

                ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
                tvd->vdev_autotrim_exit_wanted = B_FALSE;
        }
        mutex_exit(&tvd->vdev_autotrim_lock);
}

/*
 * Wait for all of the vdev_autotrim_threads associated with the pool to
 * be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_all(spa_t *spa)
{
        vdev_t *root_vd = spa->spa_root_vdev;

        for (uint64_t i = 0; i < root_vd->vdev_children; i++)
                vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
}

/*
 * Conditionally restart all of the vdev_autotrim_threads for the pool.
 */
void
vdev_autotrim_restart(spa_t *spa)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        if (spa->spa_autotrim)
                vdev_autotrim(spa);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_trim);
EXPORT_SYMBOL(vdev_trim_stop);
EXPORT_SYMBOL(vdev_trim_stop_all);
EXPORT_SYMBOL(vdev_trim_stop_wait);
EXPORT_SYMBOL(vdev_trim_restart);
EXPORT_SYMBOL(vdev_autotrim);
EXPORT_SYMBOL(vdev_autotrim_stop_all);
EXPORT_SYMBOL(vdev_autotrim_stop_wait);
EXPORT_SYMBOL(vdev_autotrim_restart);

/* BEGIN CSTYLED */
module_param(zfs_trim_extent_bytes_max, uint, 0644);
MODULE_PARM_DESC(zfs_trim_extent_bytes_max,
    "Max size of TRIM commands, larger will be split");

module_param(zfs_trim_extent_bytes_min, uint, 0644);
MODULE_PARM_DESC(zfs_trim_extent_bytes_min,
    "Min size of TRIM commands, smaller will be skipped");

module_param(zfs_trim_metaslab_skip, uint, 0644);
MODULE_PARM_DESC(zfs_trim_metaslab_skip,
    "Skip metaslabs which have never been initialized");

module_param(zfs_trim_txg_batch, uint, 0644);
MODULE_PARM_DESC(zfs_trim_txg_batch,
    "Min number of txgs to aggregate frees before issuing TRIM");

module_param(zfs_trim_queue_limit, uint, 0644);
MODULE_PARM_DESC(zfs_trim_queue_limit,
    "Max queued TRIMs outstanding per leaf vdev");
/* END CSTYLED */
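
/*
 * On Linux these module options can be inspected and adjusted at runtime
 * through sysfs, e.g. (hypothetical value):
 *
 *   echo 67108864 > /sys/module/zfs/parameters/zfs_trim_extent_bytes_max
 */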
#endif