]>
Commit | Line | Data |
---|---|---|
1b939560 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright (c) 2016 by Delphix. All rights reserved. | |
24 | * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. | |
25 | */ | |
26 | ||
27 | #include <sys/spa.h> | |
28 | #include <sys/spa_impl.h> | |
29 | #include <sys/txg.h> | |
30 | #include <sys/vdev_impl.h> | |
31 | #include <sys/vdev_trim.h> | |
32 | #include <sys/refcount.h> | |
33 | #include <sys/metaslab_impl.h> | |
34 | #include <sys/dsl_synctask.h> | |
35 | #include <sys/zap.h> | |
36 | #include <sys/dmu_tx.h> | |
37 | ||
38 | /* | |
39 | * TRIM is a feature which is used to notify a SSD that some previously | |
40 | * written space is no longer allocated by the pool. This is useful because | |
41 | * writes to a SSD must be performed to blocks which have first been erased. | |
42 | * Ensuring the SSD always has a supply of erased blocks for new writes | |
43 | * helps prevent the performance from deteriorating. | |
44 | * | |
45 | * There are two supported TRIM methods; manual and automatic. | |
46 | * | |
47 | * Manual TRIM: | |
48 | * | |
49 | * A manual TRIM is initiated by running the 'zpool trim' command. A single | |
50 | * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for | |
51 | * managing that vdev TRIM process. This involves iterating over all the | |
52 | * metaslabs, calculating the unallocated space ranges, and then issuing the | |
53 | * required TRIM I/Os. | |
54 | * | |
55 | * While a metaslab is being actively trimmed it is not eligible to perform | |
56 | * new allocations. After traversing all of the metaslabs the thread is | |
57 | * terminated. Finally, both the requested options and current progress of | |
58 | * the TRIM are regularly written to the pool. This allows the TRIM to be | |
59 | * suspended and resumed as needed. | |
60 | * | |
61 | * Automatic TRIM: | |
62 | * | |
63 | * An automatic TRIM is enabled by setting the 'autotrim' pool property | |
64 | * to 'on'. When enabled, a `vdev_autotrim' thread is created for each | |
65 | * top-level (not leaf) vdev in the pool. These threads perform the same | |
66 | * core TRIM process as a manual TRIM, but with a few key differences. | |
67 | * | |
68 | * 1) Automatic TRIM happens continuously in the background and operates | |
69 | * solely on recently freed blocks (ms_trim not ms_allocatable). | |
70 | * | |
71 | * 2) Each thread is associated with a top-level (not leaf) vdev. This has | |
72 | * the benefit of simplifying the threading model, it makes it easier | |
73 | * to coordinate administrative commands, and it ensures only a single | |
74 | * metaslab is disabled at a time. Unlike manual TRIM, this means each | |
75 | * 'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its | |
76 | * children. | |
77 | * | |
78 | * 3) There is no automatic TRIM progress information stored on disk, nor | |
79 | * is it reported by 'zpool status'. | |
80 | * | |
81 | * While the automatic TRIM process is highly effective it is more likely | |
82 | * than a manual TRIM to encounter tiny ranges. Ranges smaller than | |
83 | * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently | |
84 | * TRIM and are skipped. This means small amounts of freed space may not | |
85 | * be automatically trimmed. | |
86 | * | |
87 | * Furthermore, devices with attached hot spares and devices being actively | |
88 | * replaced are skipped. This is done to avoid adding additional stress to | |
89 | * a potentially unhealthy device and to minimize the required rebuild time. | |
90 | * | |
91 | * For this reason it may be beneficial to occasionally manually TRIM a pool | |
92 | * even when automatic TRIM is enabled. | |
93 | */ | |
94 | ||
/*
 * Upper bound on the size of a single TRIM I/O.  Ranges larger than this
 * are split into chunks of at most 128MiB.
 */
unsigned int zfs_trim_extent_bytes_max = 128U * 1024U * 1024U;

/*
 * Lower bound on the size of a TRIM I/O.  Extents smaller than 32KiB are
 * skipped instead of being trimmed.
 */
unsigned int zfs_trim_extent_bytes_min = 32U * 1024U;

/*
 * When set, uninitialized metaslabs are skipped during the TRIM process.
 * This helps for pools built from large thinly-provisioned devices where
 * TRIM operations are slow.  As a pool ages a growing fraction of the
 * pool's metaslabs will have been initialized, which progressively reduces
 * the usefulness of this option.  The setting is stored when a manual TRIM
 * is started and persists for the duration of that TRIM.
 */
unsigned int zfs_trim_metaslab_skip = 0U;

/*
 * Maximum number of TRIM I/Os that may be queued per leaf vdev.  How many
 * of them are issued to the device concurrently is governed by the
 * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
 */
unsigned int zfs_trim_queue_limit = 10U;

/*
 * Minimum number of transaction groups between automatic trims of a
 * metaslab.  The value is a trade-off: waiting longer lets frees aggregate
 * into more efficient TRIM operations, while issuing them promptly makes
 * the trimmed space available sooner.  This is only a minimum; metaslabs
 * can be trimmed less frequently when a large number of ranges need to be
 * trimmed.
 *
 * Raising the value lets frees aggregate for a longer time, which can
 * result in larger TRIM operations and in increased memory usage for
 * tracking the ranges to be trimmed.  Lowering it has the opposite effect.
 * The default of 32 was determined through testing to be a reasonable
 * compromise.
 */
unsigned int zfs_trim_txg_batch = 32U;
137 | ||
138 | /* | |
139 | * The trim_args are a control structure which describe how a leaf vdev | |
140 | * should be trimmed. The core elements are the vdev, the metaslab being | |
141 | * trimmed and a range tree containing the extents to TRIM. All provided | |
142 | * ranges must be within the metaslab. | |
143 | */ | |
144 | typedef struct trim_args { | |
145 | /* | |
146 | * These fields are set by the caller of vdev_trim_ranges(). | |
147 | */ | |
148 | vdev_t *trim_vdev; /* Leaf vdev to TRIM */ | |
149 | metaslab_t *trim_msp; /* Disabled metaslab */ | |
150 | range_tree_t *trim_tree; /* TRIM ranges (in metaslab) */ | |
151 | trim_type_t trim_type; /* Manual or auto TRIM */ | |
152 | uint64_t trim_extent_bytes_max; /* Maximum TRIM I/O size */ | |
153 | uint64_t trim_extent_bytes_min; /* Minimum TRIM I/O size */ | |
154 | enum trim_flag trim_flags; /* TRIM flags (secure) */ | |
155 | ||
156 | /* | |
157 | * These fields are updated by vdev_trim_ranges(). | |
158 | */ | |
159 | hrtime_t trim_start_time; /* Start time */ | |
160 | uint64_t trim_bytes_done; /* Bytes trimmed */ | |
161 | } trim_args_t; | |
162 | ||
163 | /* | |
164 | * Determines whether a vdev_trim_thread() should be stopped. | |
165 | */ | |
166 | static boolean_t | |
167 | vdev_trim_should_stop(vdev_t *vd) | |
168 | { | |
169 | return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || | |
170 | vd->vdev_detached || vd->vdev_top->vdev_removing); | |
171 | } | |
172 | ||
173 | /* | |
174 | * Determines whether a vdev_autotrim_thread() should be stopped. | |
175 | */ | |
176 | static boolean_t | |
177 | vdev_autotrim_should_stop(vdev_t *tvd) | |
178 | { | |
179 | return (tvd->vdev_autotrim_exit_wanted || | |
180 | !vdev_writeable(tvd) || tvd->vdev_removing || | |
181 | spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); | |
182 | } | |
183 | ||
/*
 * The sync task for updating the on-disk state of a manual TRIM. This
 * is scheduled by vdev_trim_change_state() and, for the first TRIM issued
 * in a txg, by vdev_trim_range().
 *
 * arg - heap-allocated uint64_t holding the guid of the leaf vdev; it is
 *       freed here.
 * tx  - the transaction the ZAP updates are performed in.
 *
 * The task is a no-op when the vdev can no longer be found, its top-level
 * vdev is being removed, or it is not concrete.
 */
static void
vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the trimming thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	/* Consume the offset recorded for this txg and reset its slot. */
	uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
	vd->vdev_trim_offset[txg & TXG_MASK] = 0;

	VERIFY3U(vd->vdev_leaf_zap, !=, 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	/*
	 * Persist the last offset when progress was recorded in this txg,
	 * or when the in-core value holds the UINT64_MAX sentinel, in which
	 * case it is reset to 0 both in core and on disk.
	 */
	if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {

		if (vd->vdev_trim_last_offset == UINT64_MAX)
			last_offset = 0;

		vd->vdev_trim_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}

	/* The action time is only written once it has been set. */
	if (vd->vdev_trim_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_trim_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	/*
	 * For rate, partial and secure the UINT64_MAX sentinel is written
	 * to disk as 0. Only the local copy is normalized; the in-core
	 * field is left untouched here.
	 */
	if (vd->vdev_trim_rate > 0) {
		uint64_t rate = (uint64_t)vd->vdev_trim_rate;

		if (rate == UINT64_MAX)
			rate = 0;

		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
	}

	uint64_t partial = vd->vdev_trim_partial;
	if (partial == UINT64_MAX)
		partial = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
	    sizeof (partial), 1, &partial, tx));

	uint64_t secure = vd->vdev_trim_secure;
	if (secure == UINT64_MAX)
		secure = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
	    sizeof (secure), 1, &secure, tx));


	/* The current TRIM state is always written. */
	uint64_t trim_state = vd->vdev_trim_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
	    sizeof (trim_state), 1, &trim_state, tx));
}
261 | ||
/*
 * Update the on-disk state of a manual TRIM. This is called to request
 * that a TRIM be started/suspended/canceled, or to change one of the
 * TRIM options (partial, secure, rate).
 *
 * vd        - vdev whose TRIM state is changed; the caller must hold
 *             vd->vdev_trim_lock.
 * new_state - requested state. When it equals the current state the
 *             function returns without changing anything, options included.
 * rate, partial, secure - only applied when new_state is VDEV_TRIM_ACTIVE,
 *             and only when non-zero.
 *
 * The in-core fields are updated immediately; the on-disk copy is written
 * by the vdev_trim_zap_update_sync() sync task scheduled here.
 */
static void
vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
    uint64_t rate, boolean_t partial, boolean_t secure)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_trim_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * Record the time of this state change, unless the current state
	 * is SUSPENDED. When leaving the suspended state the previously
	 * recorded action time is preserved.
	 */
	if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
		vd->vdev_trim_action_time = gethrestime_sec();
	}

	/*
	 * If we're activating, then preserve the requested rate and trim
	 * method. When the previous TRIM completed or was canceled, the
	 * last offset, rate, partial and secure fields are first set to
	 * UINT64_MAX, which is used as a sentinel to indicate they should
	 * be reset to default values.
	 */
	if (new_state == VDEV_TRIM_ACTIVE) {
		if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
		    vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
			vd->vdev_trim_last_offset = UINT64_MAX;
			vd->vdev_trim_rate = UINT64_MAX;
			vd->vdev_trim_partial = UINT64_MAX;
			vd->vdev_trim_secure = UINT64_MAX;
		}

		if (rate != 0)
			vd->vdev_trim_rate = rate;

		if (partial != 0)
			vd->vdev_trim_partial = partial;

		if (secure != 0)
			vd->vdev_trim_secure = secure;
	}

	/* Remember whether we are leaving SUSPENDED before overwriting it. */
	boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
	vd->vdev_trim_state = new_state;

	/* Schedule the sync task that writes the new state to the leaf ZAP. */
	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);

	/* Post an event and a history record matching the new state. */
	switch (new_state) {
	case VDEV_TRIM_ACTIVE:
		spa_event_notify(spa, vd, NULL,
		    resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_TRIM_SUSPENDED:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_TRIM_CANCELED:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_TRIM_COMPLETE:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);
}
350 | ||
351 | /* | |
352 | * The zio_done_func_t done callback for each manual TRIM issued. It is | |
353 | * responsible for updating the TRIM stats, reissuing failed TRIM I/Os, | |
354 | * and limiting the number of in flight TRIM I/Os. | |
355 | */ | |
356 | static void | |
357 | vdev_trim_cb(zio_t *zio) | |
358 | { | |
359 | vdev_t *vd = zio->io_vd; | |
360 | ||
361 | mutex_enter(&vd->vdev_trim_io_lock); | |
362 | if (zio->io_error == ENXIO && !vdev_writeable(vd)) { | |
363 | /* | |
364 | * The I/O failed because the vdev was unavailable; roll the | |
365 | * last offset back. (This works because spa_sync waits on | |
366 | * spa_txg_zio before it runs sync tasks.) | |
367 | */ | |
368 | uint64_t *offset = | |
369 | &vd->vdev_trim_offset[zio->io_txg & TXG_MASK]; | |
370 | *offset = MIN(*offset, zio->io_offset); | |
371 | } else { | |
372 | if (zio->io_error != 0) { | |
373 | vd->vdev_stat.vs_trim_errors++; | |
374 | spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, | |
375 | 0, 0, 0, 0, 1, zio->io_orig_size); | |
376 | } else { | |
377 | spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, | |
378 | 1, zio->io_orig_size, 0, 0, 0, 0); | |
379 | } | |
380 | ||
381 | vd->vdev_trim_bytes_done += zio->io_orig_size; | |
382 | } | |
383 | ||
384 | ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0); | |
385 | vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--; | |
386 | cv_broadcast(&vd->vdev_trim_io_cv); | |
387 | mutex_exit(&vd->vdev_trim_io_lock); | |
388 | ||
389 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); | |
390 | } | |
391 | ||
392 | /* | |
393 | * The zio_done_func_t done callback for each automatic TRIM issued. It | |
394 | * is responsible for updating the TRIM stats and limiting the number of | |
395 | * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are | |
396 | * never reissued on failure. | |
397 | */ | |
398 | static void | |
399 | vdev_autotrim_cb(zio_t *zio) | |
400 | { | |
401 | vdev_t *vd = zio->io_vd; | |
402 | ||
403 | mutex_enter(&vd->vdev_trim_io_lock); | |
404 | ||
405 | if (zio->io_error != 0) { | |
406 | vd->vdev_stat.vs_trim_errors++; | |
407 | spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, | |
408 | 0, 0, 0, 0, 1, zio->io_orig_size); | |
409 | } else { | |
410 | spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, | |
411 | 1, zio->io_orig_size, 0, 0, 0, 0); | |
412 | } | |
413 | ||
414 | ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0); | |
415 | vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--; | |
416 | cv_broadcast(&vd->vdev_trim_io_cv); | |
417 | mutex_exit(&vd->vdev_trim_io_lock); | |
418 | ||
419 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); | |
420 | } | |
421 | ||
422 | /* | |
423 | * Returns the average trim rate in bytes/sec for the ta->trim_vdev. | |
424 | */ | |
425 | static uint64_t | |
426 | vdev_trim_calculate_rate(trim_args_t *ta) | |
427 | { | |
428 | return (ta->trim_bytes_done * 1000 / | |
429 | (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1)); | |
430 | } | |
431 | ||
432 | /* | |
433 | * Issues a physical TRIM and takes care of rate limiting (bytes/sec) | |
434 | * and number of concurrent TRIM I/Os. | |
435 | */ | |
436 | static int | |
437 | vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) | |
438 | { | |
439 | vdev_t *vd = ta->trim_vdev; | |
440 | spa_t *spa = vd->vdev_spa; | |
441 | ||
442 | mutex_enter(&vd->vdev_trim_io_lock); | |
443 | ||
444 | /* | |
445 | * Limit manual TRIM I/Os to the requested rate. This does not | |
446 | * apply to automatic TRIM since no per vdev rate can be specified. | |
447 | */ | |
448 | if (ta->trim_type == TRIM_TYPE_MANUAL) { | |
449 | while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) && | |
450 | vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) { | |
451 | cv_timedwait_sig(&vd->vdev_trim_io_cv, | |
452 | &vd->vdev_trim_io_lock, ddi_get_lbolt() + | |
453 | MSEC_TO_TICK(10)); | |
454 | } | |
455 | } | |
456 | ta->trim_bytes_done += size; | |
457 | ||
458 | /* Limit in flight trimming I/Os */ | |
459 | while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >= | |
460 | zfs_trim_queue_limit) { | |
461 | cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); | |
462 | } | |
463 | vd->vdev_trim_inflight[ta->trim_type]++; | |
464 | mutex_exit(&vd->vdev_trim_io_lock); | |
465 | ||
466 | dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
467 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
468 | uint64_t txg = dmu_tx_get_txg(tx); | |
469 | ||
470 | spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); | |
471 | mutex_enter(&vd->vdev_trim_lock); | |
472 | ||
473 | if (ta->trim_type == TRIM_TYPE_MANUAL && | |
474 | vd->vdev_trim_offset[txg & TXG_MASK] == 0) { | |
475 | uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); | |
476 | *guid = vd->vdev_guid; | |
477 | ||
478 | /* This is the first write of this txg. */ | |
479 | dsl_sync_task_nowait(spa_get_dsl(spa), | |
480 | vdev_trim_zap_update_sync, guid, 2, | |
481 | ZFS_SPACE_CHECK_RESERVED, tx); | |
482 | } | |
483 | ||
484 | /* | |
485 | * We know the vdev_t will still be around since all consumers of | |
486 | * vdev_free must stop the trimming first. | |
487 | */ | |
488 | if ((ta->trim_type == TRIM_TYPE_MANUAL && | |
489 | vdev_trim_should_stop(vd)) || | |
490 | (ta->trim_type == TRIM_TYPE_AUTO && | |
491 | vdev_autotrim_should_stop(vd->vdev_top))) { | |
492 | mutex_enter(&vd->vdev_trim_io_lock); | |
493 | vd->vdev_trim_inflight[ta->trim_type]--; | |
494 | mutex_exit(&vd->vdev_trim_io_lock); | |
495 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); | |
496 | mutex_exit(&vd->vdev_trim_lock); | |
497 | dmu_tx_commit(tx); | |
498 | return (SET_ERROR(EINTR)); | |
499 | } | |
500 | mutex_exit(&vd->vdev_trim_lock); | |
501 | ||
502 | if (ta->trim_type == TRIM_TYPE_MANUAL) | |
503 | vd->vdev_trim_offset[txg & TXG_MASK] = start + size; | |
504 | ||
505 | zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd, | |
506 | start, size, ta->trim_type == TRIM_TYPE_MANUAL ? | |
507 | vdev_trim_cb : vdev_autotrim_cb, NULL, | |
508 | ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags)); | |
509 | /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */ | |
510 | ||
511 | dmu_tx_commit(tx); | |
512 | ||
513 | return (0); | |
514 | } | |
515 | ||
516 | /* | |
517 | * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree. | |
518 | * Additional parameters describing how the TRIM should be performed must | |
519 | * be set in the trim_args structure. See the trim_args definition for | |
520 | * additional information. | |
521 | */ | |
522 | static int | |
523 | vdev_trim_ranges(trim_args_t *ta) | |
524 | { | |
525 | vdev_t *vd = ta->trim_vdev; | |
526 | avl_tree_t *rt = &ta->trim_tree->rt_root; | |
527 | uint64_t extent_bytes_max = ta->trim_extent_bytes_max; | |
528 | uint64_t extent_bytes_min = ta->trim_extent_bytes_min; | |
529 | spa_t *spa = vd->vdev_spa; | |
530 | ||
531 | ta->trim_start_time = gethrtime(); | |
532 | ta->trim_bytes_done = 0; | |
533 | ||
534 | for (range_seg_t *rs = avl_first(rt); rs != NULL; | |
535 | rs = AVL_NEXT(rt, rs)) { | |
536 | uint64_t size = rs->rs_end - rs->rs_start; | |
537 | ||
538 | if (extent_bytes_min && size < extent_bytes_min) { | |
539 | spa_iostats_trim_add(spa, ta->trim_type, | |
540 | 0, 0, 1, size, 0, 0); | |
541 | continue; | |
542 | } | |
543 | ||
544 | /* Split range into legally-sized physical chunks */ | |
545 | uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; | |
546 | ||
547 | for (uint64_t w = 0; w < writes_required; w++) { | |
548 | int error; | |
549 | ||
550 | error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + | |
551 | rs->rs_start + (w * extent_bytes_max), | |
552 | MIN(size - (w * extent_bytes_max), | |
553 | extent_bytes_max)); | |
554 | if (error != 0) { | |
555 | return (error); | |
556 | } | |
557 | } | |
558 | } | |
559 | ||
560 | return (0); | |
561 | } | |
562 | ||
/*
 * Calculates the completion percentage of a manual TRIM.
 *
 * Recomputes vd->vdev_trim_bytes_est and vd->vdev_trim_bytes_done from
 * scratch by visiting every metaslab of the top-level vdev and comparing
 * its physical range on this vdev against vd->vdev_trim_last_offset.
 * The caller must hold SCL_CONFIG as reader or writer.
 */
static void
vdev_trim_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_trim_bytes_est = 0;
	vd->vdev_trim_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    metaslab_allocated_space(msp);

		/* On raidz the free space is divided by the child count. */
		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
			/* Not reached yet: counts towards the estimate only. */
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
			/* Entirely behind the last offset: counts as done. */
			vd->vdev_trim_bytes_done += ms_free;
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of trimming this
		 * metaslab. Load it and walk the free tree for more
		 * accurate progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
			logical_rs.rs_start = rs->rs_start;
			logical_rs.rs_end = rs->rs_end;
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_trim_bytes_est += size;
			if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
				/* Whole segment lies behind the last offset. */
				vd->vdev_trim_bytes_done += size;
			} else if (vd->vdev_trim_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_trim_last_offset <=
			    physical_rs.rs_end) {
				/* Last offset falls inside this segment. */
				vd->vdev_trim_bytes_done +=
				    vd->vdev_trim_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}
637 | ||
638 | /* | |
639 | * Load from disk the vdev's manual TRIM information. This includes the | |
640 | * state, progress, and options provided when initiating the manual TRIM. | |
641 | */ | |
642 | static int | |
643 | vdev_trim_load(vdev_t *vd) | |
644 | { | |
645 | int err = 0; | |
646 | ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || | |
647 | spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); | |
648 | ASSERT(vd->vdev_leaf_zap != 0); | |
649 | ||
650 | if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE || | |
651 | vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) { | |
652 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
653 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET, | |
654 | sizeof (vd->vdev_trim_last_offset), 1, | |
655 | &vd->vdev_trim_last_offset); | |
656 | if (err == ENOENT) { | |
657 | vd->vdev_trim_last_offset = 0; | |
658 | err = 0; | |
659 | } | |
660 | ||
661 | if (err == 0) { | |
662 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
663 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE, | |
664 | sizeof (vd->vdev_trim_rate), 1, | |
665 | &vd->vdev_trim_rate); | |
666 | if (err == ENOENT) { | |
667 | vd->vdev_trim_rate = 0; | |
668 | err = 0; | |
669 | } | |
670 | } | |
671 | ||
672 | if (err == 0) { | |
673 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
674 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL, | |
675 | sizeof (vd->vdev_trim_partial), 1, | |
676 | &vd->vdev_trim_partial); | |
677 | if (err == ENOENT) { | |
678 | vd->vdev_trim_partial = 0; | |
679 | err = 0; | |
680 | } | |
681 | } | |
682 | ||
683 | if (err == 0) { | |
684 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
685 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE, | |
686 | sizeof (vd->vdev_trim_secure), 1, | |
687 | &vd->vdev_trim_secure); | |
688 | if (err == ENOENT) { | |
689 | vd->vdev_trim_secure = 0; | |
690 | err = 0; | |
691 | } | |
692 | } | |
693 | } | |
694 | ||
695 | vdev_trim_calculate_progress(vd); | |
696 | ||
697 | return (err); | |
698 | } | |
699 | ||
700 | /* | |
701 | * Convert the logical range into a physical range and add it to the | |
702 | * range tree passed in the trim_args_t. | |
703 | */ | |
704 | static void | |
705 | vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) | |
706 | { | |
707 | trim_args_t *ta = arg; | |
708 | vdev_t *vd = ta->trim_vdev; | |
709 | range_seg_t logical_rs, physical_rs; | |
710 | logical_rs.rs_start = start; | |
711 | logical_rs.rs_end = start + size; | |
712 | ||
713 | /* | |
714 | * Every range to be trimmed must be part of ms_allocatable. | |
715 | * When ZFS_DEBUG_TRIM is set load the metaslab to verify this | |
716 | * is always the case. | |
717 | */ | |
718 | if (zfs_flags & ZFS_DEBUG_TRIM) { | |
719 | metaslab_t *msp = ta->trim_msp; | |
720 | VERIFY0(metaslab_load(msp)); | |
721 | VERIFY3B(msp->ms_loaded, ==, B_TRUE); | |
722 | VERIFY(range_tree_find(msp->ms_allocatable, start, size)); | |
723 | } | |
724 | ||
725 | ASSERT(vd->vdev_ops->vdev_op_leaf); | |
726 | vdev_xlate(vd, &logical_rs, &physical_rs); | |
727 | ||
728 | IMPLY(vd->vdev_top == vd, | |
729 | logical_rs.rs_start == physical_rs.rs_start); | |
730 | IMPLY(vd->vdev_top == vd, | |
731 | logical_rs.rs_end == physical_rs.rs_end); | |
732 | ||
733 | /* | |
734 | * Only a manual trim will be traversing the vdev sequentially. | |
735 | * For an auto trim all valid ranges should be added. | |
736 | */ | |
737 | if (ta->trim_type == TRIM_TYPE_MANUAL) { | |
738 | ||
739 | /* Only add segments that we have not visited yet */ | |
740 | if (physical_rs.rs_end <= vd->vdev_trim_last_offset) | |
741 | return; | |
742 | ||
743 | /* Pick up where we left off mid-range. */ | |
744 | if (vd->vdev_trim_last_offset > physical_rs.rs_start) { | |
745 | ASSERT3U(physical_rs.rs_end, >, | |
746 | vd->vdev_trim_last_offset); | |
747 | physical_rs.rs_start = vd->vdev_trim_last_offset; | |
748 | } | |
749 | } | |
750 | ||
751 | ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); | |
752 | ||
753 | /* | |
754 | * With raidz, it's possible that the logical range does not live on | |
755 | * this leaf vdev. We only add the physical range to this vdev's if it | |
756 | * has a length greater than 0. | |
757 | */ | |
758 | if (physical_rs.rs_end > physical_rs.rs_start) { | |
759 | range_tree_add(ta->trim_tree, physical_rs.rs_start, | |
760 | physical_rs.rs_end - physical_rs.rs_start); | |
761 | } else { | |
762 | ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); | |
763 | } | |
764 | } | |
765 | ||
766 | /* | |
767 | * Each manual TRIM thread is responsible for trimming the unallocated | |
768 | * space for each leaf vdev. This is accomplished by sequentially iterating | |
769 | * over its top-level metaslabs and issuing TRIM I/O for the space described | |
770 | * by its ms_allocatable. While a metaslab is undergoing trimming it is | |
771 | * not eligible for new allocations. | |
772 | */ | |
773 | static void | |
774 | vdev_trim_thread(void *arg) | |
775 | { | |
776 | vdev_t *vd = arg; | |
777 | spa_t *spa = vd->vdev_spa; | |
778 | trim_args_t ta; | |
779 | int error = 0; | |
780 | ||
781 | /* | |
782 | * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by | |
783 | * vdev_trim(). Wait for the updated values to be reflected | |
784 | * in the zap in order to start with the requested settings. | |
785 | */ | |
786 | txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); | |
787 | ||
788 | ASSERT(vdev_is_concrete(vd)); | |
789 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
790 | ||
791 | vd->vdev_trim_last_offset = 0; | |
792 | vd->vdev_trim_rate = 0; | |
793 | vd->vdev_trim_partial = 0; | |
794 | vd->vdev_trim_secure = 0; | |
795 | ||
796 | VERIFY0(vdev_trim_load(vd)); | |
797 | ||
798 | ta.trim_vdev = vd; | |
799 | ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; | |
800 | ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; | |
801 | ta.trim_tree = range_tree_create(NULL, NULL); | |
802 | ta.trim_type = TRIM_TYPE_MANUAL; | |
803 | ta.trim_flags = 0; | |
804 | ||
805 | /* | |
806 | * When a secure TRIM has been requested infer that the intent | |
807 | * is that everything must be trimmed. Override the default | |
808 | * minimum TRIM size to prevent ranges from being skipped. | |
809 | */ | |
810 | if (vd->vdev_trim_secure) { | |
811 | ta.trim_flags |= ZIO_TRIM_SECURE; | |
812 | ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; | |
813 | } | |
814 | ||
815 | uint64_t ms_count = 0; | |
816 | for (uint64_t i = 0; !vd->vdev_detached && | |
817 | i < vd->vdev_top->vdev_ms_count; i++) { | |
818 | metaslab_t *msp = vd->vdev_top->vdev_ms[i]; | |
819 | ||
820 | /* | |
821 | * If we've expanded the top-level vdev or it's our | |
822 | * first pass, calculate our progress. | |
823 | */ | |
824 | if (vd->vdev_top->vdev_ms_count != ms_count) { | |
825 | vdev_trim_calculate_progress(vd); | |
826 | ms_count = vd->vdev_top->vdev_ms_count; | |
827 | } | |
828 | ||
829 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
830 | metaslab_disable(msp); | |
831 | mutex_enter(&msp->ms_lock); | |
832 | VERIFY0(metaslab_load(msp)); | |
833 | ||
834 | /* | |
835 | * If a partial TRIM was requested skip metaslabs which have | |
836 | * never been initialized and thus have never been written. | |
837 | */ | |
838 | if (msp->ms_sm == NULL && vd->vdev_trim_partial) { | |
839 | mutex_exit(&msp->ms_lock); | |
f09fda50 | 840 | metaslab_enable(msp, B_FALSE, B_FALSE); |
1b939560 BB |
841 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); |
842 | vdev_trim_calculate_progress(vd); | |
843 | continue; | |
844 | } | |
845 | ||
846 | ta.trim_msp = msp; | |
847 | range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta); | |
848 | range_tree_vacate(msp->ms_trim, NULL, NULL); | |
849 | mutex_exit(&msp->ms_lock); | |
850 | ||
851 | error = vdev_trim_ranges(&ta); | |
f09fda50 | 852 | metaslab_enable(msp, B_TRUE, B_FALSE); |
1b939560 BB |
853 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); |
854 | ||
855 | range_tree_vacate(ta.trim_tree, NULL, NULL); | |
856 | if (error != 0) | |
857 | break; | |
858 | } | |
859 | ||
860 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
861 | mutex_enter(&vd->vdev_trim_io_lock); | |
862 | while (vd->vdev_trim_inflight[0] > 0) { | |
863 | cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); | |
864 | } | |
865 | mutex_exit(&vd->vdev_trim_io_lock); | |
866 | ||
867 | range_tree_destroy(ta.trim_tree); | |
868 | ||
869 | mutex_enter(&vd->vdev_trim_lock); | |
870 | if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { | |
871 | vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, | |
872 | vd->vdev_trim_rate, vd->vdev_trim_partial, | |
873 | vd->vdev_trim_secure); | |
874 | } | |
875 | ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0); | |
876 | ||
877 | /* | |
878 | * Drop the vdev_trim_lock while we sync out the txg since it's | |
879 | * possible that a device might be trying to come online and must | |
880 | * check to see if it needs to restart a trim. That thread will be | |
881 | * holding the spa_config_lock which would prevent the txg_wait_synced | |
882 | * from completing. | |
883 | */ | |
884 | mutex_exit(&vd->vdev_trim_lock); | |
885 | txg_wait_synced(spa_get_dsl(spa), 0); | |
886 | mutex_enter(&vd->vdev_trim_lock); | |
887 | ||
888 | vd->vdev_trim_thread = NULL; | |
889 | cv_broadcast(&vd->vdev_trim_cv); | |
890 | mutex_exit(&vd->vdev_trim_lock); | |
891 | } | |
892 | ||
893 | /* | |
894 | * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock, | |
895 | * the vdev_t must be a leaf and cannot already be manually trimming. | |
896 | */ | |
897 | void | |
898 | vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) | |
899 | { | |
900 | ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); | |
901 | ASSERT(vd->vdev_ops->vdev_op_leaf); | |
902 | ASSERT(vdev_is_concrete(vd)); | |
903 | ASSERT3P(vd->vdev_trim_thread, ==, NULL); | |
904 | ASSERT(!vd->vdev_detached); | |
905 | ASSERT(!vd->vdev_trim_exit_wanted); | |
906 | ASSERT(!vd->vdev_top->vdev_removing); | |
907 | ||
908 | vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); | |
909 | vd->vdev_trim_thread = thread_create(NULL, 0, | |
910 | vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri); | |
911 | } | |
912 | ||
913 | /* | |
914 | * Wait for the trimming thread to be terminated (canceled or stopped). | |
915 | */ | |
916 | static void | |
917 | vdev_trim_stop_wait_impl(vdev_t *vd) | |
918 | { | |
919 | ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); | |
920 | ||
921 | while (vd->vdev_trim_thread != NULL) | |
922 | cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock); | |
923 | ||
924 | ASSERT3P(vd->vdev_trim_thread, ==, NULL); | |
925 | vd->vdev_trim_exit_wanted = B_FALSE; | |
926 | } | |
927 | ||
928 | /* | |
929 | * Wait for vdev trim threads which were listed to cleanly exit. | |
930 | */ | |
931 | void | |
932 | vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) | |
933 | { | |
934 | vdev_t *vd; | |
935 | ||
936 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
937 | ||
938 | while ((vd = list_remove_head(vd_list)) != NULL) { | |
939 | mutex_enter(&vd->vdev_trim_lock); | |
940 | vdev_trim_stop_wait_impl(vd); | |
941 | mutex_exit(&vd->vdev_trim_lock); | |
942 | } | |
943 | } | |
944 | ||
945 | /* | |
946 | * Stop trimming a device, with the resultant trimming state being tgt_state. | |
947 | * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is | |
948 | * provided the stopping vdev is inserted in to the list. Callers are then | |
949 | * required to call vdev_trim_stop_wait() to block for all the trim threads | |
950 | * to exit. The caller must hold vdev_trim_lock and must not be writing to | |
951 | * the spa config, as the trimming thread may try to enter the config as a | |
952 | * reader before exiting. | |
953 | */ | |
954 | void | |
955 | vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) | |
956 | { | |
957 | ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); | |
958 | ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); | |
959 | ASSERT(vd->vdev_ops->vdev_op_leaf); | |
960 | ASSERT(vdev_is_concrete(vd)); | |
961 | ||
962 | /* | |
963 | * Allow cancel requests to proceed even if the trim thread has | |
964 | * stopped. | |
965 | */ | |
966 | if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED) | |
967 | return; | |
968 | ||
969 | vdev_trim_change_state(vd, tgt_state, 0, 0, 0); | |
970 | vd->vdev_trim_exit_wanted = B_TRUE; | |
971 | ||
972 | if (vd_list == NULL) { | |
973 | vdev_trim_stop_wait_impl(vd); | |
974 | } else { | |
975 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
976 | list_insert_tail(vd_list, vd); | |
977 | } | |
978 | } | |
979 | ||
980 | /* | |
981 | * Requests that all listed vdevs stop trimming. | |
982 | */ | |
983 | static void | |
984 | vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state, | |
985 | list_t *vd_list) | |
986 | { | |
987 | if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { | |
988 | mutex_enter(&vd->vdev_trim_lock); | |
989 | vdev_trim_stop(vd, tgt_state, vd_list); | |
990 | mutex_exit(&vd->vdev_trim_lock); | |
991 | return; | |
992 | } | |
993 | ||
994 | for (uint64_t i = 0; i < vd->vdev_children; i++) { | |
995 | vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state, | |
996 | vd_list); | |
997 | } | |
998 | } | |
999 | ||
1000 | /* | |
1001 | * Convenience function to stop trimming of a vdev tree and set all trim | |
1002 | * thread pointers to NULL. | |
1003 | */ | |
1004 | void | |
1005 | vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) | |
1006 | { | |
1007 | spa_t *spa = vd->vdev_spa; | |
1008 | list_t vd_list; | |
1009 | ||
1010 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
1011 | ||
1012 | list_create(&vd_list, sizeof (vdev_t), | |
1013 | offsetof(vdev_t, vdev_trim_node)); | |
1014 | ||
1015 | vdev_trim_stop_all_impl(vd, tgt_state, &vd_list); | |
1016 | vdev_trim_stop_wait(spa, &vd_list); | |
1017 | ||
1018 | if (vd->vdev_spa->spa_sync_on) { | |
1019 | /* Make sure that our state has been synced to disk */ | |
1020 | txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); | |
1021 | } | |
1022 | ||
1023 | list_destroy(&vd_list); | |
1024 | } | |
1025 | ||
1026 | /* | |
1027 | * Conditionally restarts a manual TRIM given its on-disk state. | |
1028 | */ | |
1029 | void | |
1030 | vdev_trim_restart(vdev_t *vd) | |
1031 | { | |
1032 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
1033 | ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); | |
1034 | ||
1035 | if (vd->vdev_leaf_zap != 0) { | |
1036 | mutex_enter(&vd->vdev_trim_lock); | |
1037 | uint64_t trim_state = VDEV_TRIM_NONE; | |
1038 | int err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
1039 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE, | |
1040 | sizeof (trim_state), 1, &trim_state); | |
1041 | ASSERT(err == 0 || err == ENOENT); | |
1042 | vd->vdev_trim_state = trim_state; | |
1043 | ||
1044 | uint64_t timestamp = 0; | |
1045 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
1046 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME, | |
1047 | sizeof (timestamp), 1, ×tamp); | |
1048 | ASSERT(err == 0 || err == ENOENT); | |
1049 | vd->vdev_trim_action_time = (time_t)timestamp; | |
1050 | ||
1051 | if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || | |
1052 | vd->vdev_offline) { | |
1053 | /* load progress for reporting, but don't resume */ | |
1054 | VERIFY0(vdev_trim_load(vd)); | |
1055 | } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && | |
1056 | vdev_writeable(vd) && !vd->vdev_top->vdev_removing && | |
1057 | vd->vdev_trim_thread == NULL) { | |
1058 | VERIFY0(vdev_trim_load(vd)); | |
1059 | vdev_trim(vd, vd->vdev_trim_rate, | |
1060 | vd->vdev_trim_partial, vd->vdev_trim_secure); | |
1061 | } | |
1062 | ||
1063 | mutex_exit(&vd->vdev_trim_lock); | |
1064 | } | |
1065 | ||
1066 | for (uint64_t i = 0; i < vd->vdev_children; i++) { | |
1067 | vdev_trim_restart(vd->vdev_child[i]); | |
1068 | } | |
1069 | } | |
1070 | ||
1071 | /* | |
1072 | * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that | |
1073 | * every TRIM range is contained within ms_allocatable. | |
1074 | */ | |
1075 | static void | |
1076 | vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) | |
1077 | { | |
1078 | trim_args_t *ta = arg; | |
1079 | metaslab_t *msp = ta->trim_msp; | |
1080 | ||
1081 | VERIFY3B(msp->ms_loaded, ==, B_TRUE); | |
1082 | VERIFY3U(msp->ms_disabled, >, 0); | |
1083 | VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL); | |
1084 | } | |
1085 | ||
1086 | /* | |
1087 | * Each automatic TRIM thread is responsible for managing the trimming of a | |
1088 | * top-level vdev in the pool. No automatic TRIM state is maintained on-disk. | |
1089 | * | |
1090 | * N.B. This behavior is different from a manual TRIM where a thread | |
1091 | * is created for each leaf vdev, instead of each top-level vdev. | |
1092 | */ | |
1093 | static void | |
1094 | vdev_autotrim_thread(void *arg) | |
1095 | { | |
1096 | vdev_t *vd = arg; | |
1097 | spa_t *spa = vd->vdev_spa; | |
1098 | int shift = 0; | |
1099 | ||
1100 | mutex_enter(&vd->vdev_autotrim_lock); | |
1101 | ASSERT3P(vd->vdev_top, ==, vd); | |
1102 | ASSERT3P(vd->vdev_autotrim_thread, !=, NULL); | |
1103 | mutex_exit(&vd->vdev_autotrim_lock); | |
1104 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
1105 | ||
1106 | uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; | |
1107 | uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; | |
1108 | ||
1109 | while (!vdev_autotrim_should_stop(vd)) { | |
1110 | int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); | |
1111 | boolean_t issued_trim = B_FALSE; | |
1112 | ||
1113 | /* | |
1114 | * All of the metaslabs are divided in to groups of size | |
1115 | * num_metaslabs / zfs_trim_txg_batch. Each of these groups | |
1116 | * is composed of metaslabs which are spread evenly over the | |
1117 | * device. | |
1118 | * | |
1119 | * For example, when zfs_trim_txg_batch = 32 (default) then | |
1120 | * group 0 will contain metaslabs 0, 32, 64, ...; | |
1121 | * group 1 will contain metaslabs 1, 33, 65, ...; | |
1122 | * group 2 will contain metaslabs 2, 34, 66, ...; and so on. | |
1123 | * | |
1124 | * On each pass through the while() loop one of these groups | |
1125 | * is selected. This is accomplished by using a shift value | |
1126 | * to select the starting metaslab, then striding over the | |
1127 | * metaslabs using the zfs_trim_txg_batch size. This is | |
1128 | * done to accomplish two things. | |
1129 | * | |
1130 | * 1) By dividing the metaslabs in to groups, and making sure | |
1131 | * that each group takes a minimum of one txg to process. | |
1132 | * Then zfs_trim_txg_batch controls the minimum number of | |
1133 | * txgs which must occur before a metaslab is revisited. | |
1134 | * | |
1135 | * 2) Selecting non-consecutive metaslabs distributes the | |
1136 | * TRIM commands for a group evenly over the entire device. | |
1137 | * This can be advantageous for certain types of devices. | |
1138 | */ | |
1139 | for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count; | |
1140 | i += txgs_per_trim) { | |
1141 | metaslab_t *msp = vd->vdev_ms[i]; | |
1142 | range_tree_t *trim_tree; | |
1143 | ||
1144 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
1145 | metaslab_disable(msp); | |
1146 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
1147 | ||
1148 | mutex_enter(&msp->ms_lock); | |
1149 | ||
1150 | /* | |
1151 | * Skip the metaslab when it has never been allocated | |
1152 | * or when there are no recent frees to trim. | |
1153 | */ | |
1154 | if (msp->ms_sm == NULL || | |
1155 | range_tree_is_empty(msp->ms_trim)) { | |
1156 | mutex_exit(&msp->ms_lock); | |
f09fda50 | 1157 | metaslab_enable(msp, B_FALSE, B_FALSE); |
1b939560 BB |
1158 | continue; |
1159 | } | |
1160 | ||
1161 | /* | |
1162 | * Skip the metaslab when it has already been disabled. | |
1163 | * This may happen when a manual TRIM or initialize | |
1164 | * operation is running concurrently. In the case | |
1165 | * of a manual TRIM, the ms_trim tree will have been | |
1166 | * vacated. Only ranges added after the manual TRIM | |
1167 | * disabled the metaslab will be included in the tree. | |
1168 | * These will be processed when the automatic TRIM | |
1169 | * next revisits this metaslab. | |
1170 | */ | |
1171 | if (msp->ms_disabled > 1) { | |
1172 | mutex_exit(&msp->ms_lock); | |
f09fda50 | 1173 | metaslab_enable(msp, B_FALSE, B_FALSE); |
1b939560 BB |
1174 | continue; |
1175 | } | |
1176 | ||
1177 | /* | |
1178 | * Allocate an empty range tree which is swapped in | |
1179 | * for the existing ms_trim tree while it is processed. | |
1180 | */ | |
1181 | trim_tree = range_tree_create(NULL, NULL); | |
1182 | range_tree_swap(&msp->ms_trim, &trim_tree); | |
1183 | ASSERT(range_tree_is_empty(msp->ms_trim)); | |
1184 | ||
1185 | /* | |
1186 | * There are two cases when constructing the per-vdev | |
1187 | * trim trees for a metaslab. If the top-level vdev | |
1188 | * has no children then it is also a leaf and should | |
1189 | * be trimmed. Otherwise our children are the leaves | |
1190 | * and a trim tree should be constructed for each. | |
1191 | */ | |
1192 | trim_args_t *tap; | |
1193 | uint64_t children = vd->vdev_children; | |
1194 | if (children == 0) { | |
1195 | children = 1; | |
1196 | tap = kmem_zalloc(sizeof (trim_args_t) * | |
1197 | children, KM_SLEEP); | |
1198 | tap[0].trim_vdev = vd; | |
1199 | } else { | |
1200 | tap = kmem_zalloc(sizeof (trim_args_t) * | |
1201 | children, KM_SLEEP); | |
1202 | ||
1203 | for (uint64_t c = 0; c < children; c++) { | |
1204 | tap[c].trim_vdev = vd->vdev_child[c]; | |
1205 | } | |
1206 | } | |
1207 | ||
1208 | for (uint64_t c = 0; c < children; c++) { | |
1209 | trim_args_t *ta = &tap[c]; | |
1210 | vdev_t *cvd = ta->trim_vdev; | |
1211 | ||
1212 | ta->trim_msp = msp; | |
1213 | ta->trim_extent_bytes_max = extent_bytes_max; | |
1214 | ta->trim_extent_bytes_min = extent_bytes_min; | |
1215 | ta->trim_type = TRIM_TYPE_AUTO; | |
1216 | ta->trim_flags = 0; | |
1217 | ||
1218 | if (cvd->vdev_detached || | |
1219 | !vdev_writeable(cvd) || | |
1220 | !cvd->vdev_has_trim || | |
1221 | cvd->vdev_trim_thread != NULL) { | |
1222 | continue; | |
1223 | } | |
1224 | ||
1225 | /* | |
1226 | * When a device has an attached hot spare, or | |
1227 | * is being replaced it will not be trimmed. | |
1228 | * This is done to avoid adding additional | |
1229 | * stress to a potentially unhealthy device, | |
1230 | * and to minimize the required rebuild time. | |
1231 | */ | |
1232 | if (!cvd->vdev_ops->vdev_op_leaf) | |
1233 | continue; | |
1234 | ||
1235 | ta->trim_tree = range_tree_create(NULL, NULL); | |
1236 | range_tree_walk(trim_tree, | |
1237 | vdev_trim_range_add, ta); | |
1238 | } | |
1239 | ||
1240 | mutex_exit(&msp->ms_lock); | |
1241 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
1242 | ||
1243 | /* | |
1244 | * Issue the TRIM I/Os for all ranges covered by the | |
1245 | * TRIM trees. These ranges are safe to TRIM because | |
1246 | * no new allocations will be performed until the call | |
1247 | * to metaslab_enabled() below. | |
1248 | */ | |
1249 | for (uint64_t c = 0; c < children; c++) { | |
1250 | trim_args_t *ta = &tap[c]; | |
1251 | ||
1252 | /* | |
1253 | * Always yield to a manual TRIM if one has | |
1254 | * been started for the child vdev. | |
1255 | */ | |
1256 | if (ta->trim_tree == NULL || | |
1257 | ta->trim_vdev->vdev_trim_thread != NULL) { | |
1258 | continue; | |
1259 | } | |
1260 | ||
1261 | /* | |
1262 | * After this point metaslab_enable() must be | |
1263 | * called with the sync flag set. This is done | |
1264 | * here because vdev_trim_ranges() is allowed | |
1265 | * to be interrupted (EINTR) before issuing all | |
1266 | * of the required TRIM I/Os. | |
1267 | */ | |
1268 | issued_trim = B_TRUE; | |
1269 | ||
1270 | int error = vdev_trim_ranges(ta); | |
1271 | if (error) | |
1272 | break; | |
1273 | } | |
1274 | ||
1275 | /* | |
1276 | * Verify every range which was trimmed is still | |
1277 | * contained within the ms_allocatable tree. | |
1278 | */ | |
1279 | if (zfs_flags & ZFS_DEBUG_TRIM) { | |
1280 | mutex_enter(&msp->ms_lock); | |
1281 | VERIFY0(metaslab_load(msp)); | |
1282 | VERIFY3P(tap[0].trim_msp, ==, msp); | |
1283 | range_tree_walk(trim_tree, | |
1284 | vdev_trim_range_verify, &tap[0]); | |
1285 | mutex_exit(&msp->ms_lock); | |
1286 | } | |
1287 | ||
1288 | range_tree_vacate(trim_tree, NULL, NULL); | |
1289 | range_tree_destroy(trim_tree); | |
1290 | ||
f09fda50 | 1291 | metaslab_enable(msp, issued_trim, B_FALSE); |
1b939560 BB |
1292 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); |
1293 | ||
1294 | for (uint64_t c = 0; c < children; c++) { | |
1295 | trim_args_t *ta = &tap[c]; | |
1296 | ||
1297 | if (ta->trim_tree == NULL) | |
1298 | continue; | |
1299 | ||
1300 | range_tree_vacate(ta->trim_tree, NULL, NULL); | |
1301 | range_tree_destroy(ta->trim_tree); | |
1302 | } | |
1303 | ||
1304 | kmem_free(tap, sizeof (trim_args_t) * children); | |
1305 | } | |
1306 | ||
1307 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
1308 | ||
1309 | /* | |
1310 | * After completing the group of metaslabs wait for the next | |
1311 | * open txg. This is done to make sure that a minimum of | |
1312 | * zfs_trim_txg_batch txgs will occur before these metaslabs | |
1313 | * are trimmed again. | |
1314 | */ | |
1315 | txg_wait_open(spa_get_dsl(spa), 0, issued_trim); | |
1316 | ||
1317 | shift++; | |
1318 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
1319 | } | |
1320 | ||
1321 | for (uint64_t c = 0; c < vd->vdev_children; c++) { | |
1322 | vdev_t *cvd = vd->vdev_child[c]; | |
1323 | mutex_enter(&cvd->vdev_trim_io_lock); | |
1324 | ||
1325 | while (cvd->vdev_trim_inflight[1] > 0) { | |
1326 | cv_wait(&cvd->vdev_trim_io_cv, | |
1327 | &cvd->vdev_trim_io_lock); | |
1328 | } | |
1329 | mutex_exit(&cvd->vdev_trim_io_lock); | |
1330 | } | |
1331 | ||
1332 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
1333 | ||
1334 | /* | |
1335 | * When exiting because the autotrim property was set to off, then | |
1336 | * abandon any unprocessed ms_trim ranges to reclaim the memory. | |
1337 | */ | |
1338 | if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) { | |
1339 | for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { | |
1340 | metaslab_t *msp = vd->vdev_ms[i]; | |
1341 | ||
1342 | mutex_enter(&msp->ms_lock); | |
1343 | range_tree_vacate(msp->ms_trim, NULL, NULL); | |
1344 | mutex_exit(&msp->ms_lock); | |
1345 | } | |
1346 | } | |
1347 | ||
1348 | mutex_enter(&vd->vdev_autotrim_lock); | |
1349 | ASSERT(vd->vdev_autotrim_thread != NULL); | |
1350 | vd->vdev_autotrim_thread = NULL; | |
1351 | cv_broadcast(&vd->vdev_autotrim_cv); | |
1352 | mutex_exit(&vd->vdev_autotrim_lock); | |
1353 | } | |
1354 | ||
1355 | /* | |
1356 | * Starts an autotrim thread, if needed, for each top-level vdev which can be | |
1357 | * trimmed. A top-level vdev which has been evacuated will never be trimmed. | |
1358 | */ | |
1359 | void | |
1360 | vdev_autotrim(spa_t *spa) | |
1361 | { | |
1362 | vdev_t *root_vd = spa->spa_root_vdev; | |
1363 | ||
1364 | for (uint64_t i = 0; i < root_vd->vdev_children; i++) { | |
1365 | vdev_t *tvd = root_vd->vdev_child[i]; | |
1366 | ||
1367 | mutex_enter(&tvd->vdev_autotrim_lock); | |
1368 | if (vdev_writeable(tvd) && !tvd->vdev_removing && | |
1369 | tvd->vdev_autotrim_thread == NULL) { | |
1370 | ASSERT3P(tvd->vdev_top, ==, tvd); | |
1371 | ||
1372 | tvd->vdev_autotrim_thread = thread_create(NULL, 0, | |
1373 | vdev_autotrim_thread, tvd, 0, &p0, TS_RUN, | |
1374 | maxclsyspri); | |
1375 | ASSERT(tvd->vdev_autotrim_thread != NULL); | |
1376 | } | |
1377 | mutex_exit(&tvd->vdev_autotrim_lock); | |
1378 | } | |
1379 | } | |
1380 | ||
1381 | /* | |
1382 | * Wait for the vdev_autotrim_thread associated with the passed top-level | |
1383 | * vdev to be terminated (canceled or stopped). | |
1384 | */ | |
1385 | void | |
1386 | vdev_autotrim_stop_wait(vdev_t *tvd) | |
1387 | { | |
1388 | mutex_enter(&tvd->vdev_autotrim_lock); | |
1389 | if (tvd->vdev_autotrim_thread != NULL) { | |
1390 | tvd->vdev_autotrim_exit_wanted = B_TRUE; | |
1391 | ||
1392 | while (tvd->vdev_autotrim_thread != NULL) { | |
1393 | cv_wait(&tvd->vdev_autotrim_cv, | |
1394 | &tvd->vdev_autotrim_lock); | |
1395 | } | |
1396 | ||
1397 | ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); | |
1398 | tvd->vdev_autotrim_exit_wanted = B_FALSE; | |
1399 | } | |
1400 | mutex_exit(&tvd->vdev_autotrim_lock); | |
1401 | } | |
1402 | ||
1403 | /* | |
1404 | * Wait for all of the vdev_autotrim_thread associated with the pool to | |
1405 | * be terminated (canceled or stopped). | |
1406 | */ | |
1407 | void | |
1408 | vdev_autotrim_stop_all(spa_t *spa) | |
1409 | { | |
1410 | vdev_t *root_vd = spa->spa_root_vdev; | |
1411 | ||
1412 | for (uint64_t i = 0; i < root_vd->vdev_children; i++) | |
1413 | vdev_autotrim_stop_wait(root_vd->vdev_child[i]); | |
1414 | } | |
1415 | ||
1416 | /* | |
1417 | * Conditionally restart all of the vdev_autotrim_thread's for the pool. | |
1418 | */ | |
1419 | void | |
1420 | vdev_autotrim_restart(spa_t *spa) | |
1421 | { | |
1422 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
1423 | ||
1424 | if (spa->spa_autotrim) | |
1425 | vdev_autotrim(spa); | |
1426 | } | |
1427 | ||
1b939560 BB |
/* Manual TRIM entry points. */
EXPORT_SYMBOL(vdev_trim);
EXPORT_SYMBOL(vdev_trim_stop);
EXPORT_SYMBOL(vdev_trim_stop_all);
EXPORT_SYMBOL(vdev_trim_stop_wait);
EXPORT_SYMBOL(vdev_trim_restart);

/* Automatic TRIM entry points. */
EXPORT_SYMBOL(vdev_autotrim);
EXPORT_SYMBOL(vdev_autotrim_stop_all);
EXPORT_SYMBOL(vdev_autotrim_stop_wait);
EXPORT_SYMBOL(vdev_autotrim_restart);
1437 | ||
1438 | /* BEGIN CSTYLED */ | |
03fdcb9a | 1439 | ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, |
1b939560 BB |
1440 | "Max size of TRIM commands, larger will be split"); |
1441 | ||
03fdcb9a | 1442 | ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW, |
1b939560 BB |
1443 | "Min size of TRIM commands, smaller will be skipped"); |
1444 | ||
03fdcb9a | 1445 | ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW, |
1b939560 BB |
1446 | "Skip metaslabs which have never been initialized"); |
1447 | ||
03fdcb9a | 1448 | ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW, |
1b939560 BB |
1449 | "Min number of txgs to aggregate frees before issuing TRIM"); |
1450 | ||
03fdcb9a | 1451 | ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW, |
1b939560 BB |
1452 | "Max queued TRIMs outstanding per leaf vdev"); |
1453 | /* END CSTYLED */ |