34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
893a6d62 23 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
2e528b49 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
cc99f275 25 * Copyright (c) 2017, Intel Corporation.
34dc7c2f
BB
26 */
27
34dc7c2f 28#include <sys/zfs_context.h>
34dc7c2f
BB
29#include <sys/dmu.h>
30#include <sys/dmu_tx.h>
31#include <sys/space_map.h>
32#include <sys/metaslab_impl.h>
33#include <sys/vdev_impl.h>
34#include <sys/zio.h>
93cf2076 35#include <sys/spa_impl.h>
f3a7f661 36#include <sys/zfeature.h>
a1d477c2 37#include <sys/vdev_indirect_mapping.h>
d2734cce 38#include <sys/zap.h>
34dc7c2f 39
d1d7e268 40#define WITH_DF_BLOCK_ALLOCATOR
6d974228 41
3dfb57a3
DB
42#define GANG_ALLOCATION(flags) \
43 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
22c81dd8 44
e8fe6684
ED
45/*
46 * Metaslab granularity, in bytes. This is roughly similar to what would be
47 * referred to as the "stripe size" in traditional RAID arrays. In normal
48 * operation, we will try to write this amount of data to a top-level vdev
49 * before moving on to the next one.
50 */
99b14de4 51unsigned long metaslab_aliquot = 512 << 10;
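/*
 * Editor's illustration (not in the original source): with the default of
 * 512K, roughly 512K worth of allocations are directed at one top-level
 * vdev before the rotor advances to the next one. Note that
 * metaslab_group_activate() scales this per group by the number of
 * children in the top-level vdev when it sets mg_aliquot.
 */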
e8fe6684 52
d830d479
MA
53/*
54 * For testing, make some blocks above a certain size be gang blocks.
55 */
56unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
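/*
 * Editor's note: the default of SPA_MAXBLOCKSIZE + 1 effectively disables
 * forced ganging, since no allocation can exceed SPA_MAXBLOCKSIZE. As an
 * illustration, lowering this tunable to 128K would force some allocations
 * larger than 128K through the gang-block path, which is mainly useful for
 * exercising that code in testing.
 */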
34dc7c2f 57
d2734cce 58/*
93e28d66
SD
59 * In pools where the log space map feature is not enabled we touch
60 * multiple metaslabs (and their respective space maps) with each
61 * transaction group. Thus, we benefit from having a small space map
d2734cce 62 * block size since it allows us to issue more I/O operations scattered
93e28d66
SD
63 * around the disk. So a sane default for the space map block size
64 * is 8K-16K.
d2734cce 65 */
93e28d66
SD
66int zfs_metaslab_sm_blksz_no_log = (1 << 14);
67
68/*
69 * When the log space map feature is enabled, we accumulate a lot of
70 * changes per metaslab that are flushed once in a while so we benefit
71 * from a bigger block size like 128K for the metaslab space maps.
72 */
73int zfs_metaslab_sm_blksz_with_log = (1 << 17);
d2734cce 74
e51be066
GW
75/*
76 * The in-core space map representation is more compact than its on-disk form.
77 * The zfs_condense_pct determines how much more compact the in-core
4e21fd06 78 * space map representation must be before we compact it on-disk.
e51be066
GW
79 * Values should be greater than or equal to 100.
80 */
81int zfs_condense_pct = 200;
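/*
 * Editor's illustration: with the default of 200, a space map is a
 * candidate for condensing only once its on-disk representation is
 * roughly twice the size of its in-core (condensed) form, e.g. a 1MB
 * on-disk space map whose condensed form would fit in about 512K.
 */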
82
b02fe35d
AR
83/*
84 * Condensing a metaslab is not guaranteed to actually reduce the amount of
85 * space used on disk. In particular, a space map uses data in increments of
96358617 86 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
b02fe35d
AR
87 * same number of blocks after condensing. Since the goal of condensing is to
88 * reduce the number of IOPs required to read the space map, we only want to
89 * condense when we can be sure we will reduce the number of blocks used by the
90 * space map. Unfortunately, we cannot precisely compute whether or not this is
91 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
92 * we apply the following heuristic: do not condense a space map unless the
93 * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
94 * blocks.
95 */
96int zfs_metaslab_condense_block_threshold = 4;
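/*
 * Editor's illustration: with the 128K block size used when the log
 * space map feature is enabled (zfs_metaslab_sm_blksz_with_log), this
 * default of 4 means a space map occupying 512K or less on disk is
 * never condensed, no matter how compressible it is.
 */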
97
ac72fac3
GW
98/*
99 * The zfs_mg_noalloc_threshold defines which metaslab groups should
100 * be eligible for allocation. The value is defined as a percentage of
f3a7f661 101 * free space. Metaslab groups that have more free space than
ac72fac3
GW
102 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
103 * a metaslab group's free space is less than or equal to the
104 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
105 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
106 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
107 * groups are allowed to accept allocations. Gang blocks are always
108 * eligible to allocate on any metaslab group. The default value of 0 means
109 * no metaslab group will be excluded based on this criterion.
110 */
111int zfs_mg_noalloc_threshold = 0;
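/*
 * Editor's illustration: setting this to e.g. 10 would make the allocator
 * skip any metaslab group (top-level vdev) whose free capacity has dropped
 * to 10% or less, as long as some other group in the pool is still above
 * 10%; once every group is at or below the threshold, all groups accept
 * allocations again.
 */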
6d974228 112
f3a7f661
GW
113/*
114 * Metaslab groups are considered eligible for allocations if their
e1cfd73f 115 * fragmentation metric (measured as a percentage) is less than or
cb020f0d
SD
116 * equal to zfs_mg_fragmentation_threshold. If a metaslab group
117 * exceeds this threshold then it will be skipped unless all metaslab
118 * groups within the metaslab class have also crossed this threshold.
119 *
120 * This tunable was introduced to avoid edge cases where we continue
121 * allocating from very fragmented disks in our pool while other, less
122 * fragmented disks exist. On the other hand, if all disks in the
123 * pool are uniformly approaching the threshold, the threshold can
124 * be a speed bump in performance, where we keep switching the disks
125 * that we allocate from (e.g. we allocate some segments from disk A,
126 * pushing it past the threshold, while freeing segments from disk
127 * B, bringing its fragmentation below the threshold).
128 *
129 * Empirically, we've seen that our vdev selection for allocations is
130 * good enough that fragmentation increases uniformly across all vdevs
131 * the majority of the time. Thus we set the threshold percentage high
132 * enough to avoid hitting the speed bump on pools that are being pushed
133 * to the edge.
f3a7f661 134 */
cb020f0d 135int zfs_mg_fragmentation_threshold = 95;
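/*
 * Editor's illustration: with the default of 95, a metaslab group is only
 * skipped when its fragmentation metric exceeds 95% and at least one other
 * group in the same class is still at or below 95%.
 */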
f3a7f661
GW
136
137/*
138 * Allow metaslabs to keep their active state as long as their fragmentation
139 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
140 * active metaslab that exceeds this threshold will no longer keep its active
141 * status, allowing better metaslabs to be selected.
142 */
143int zfs_metaslab_fragmentation_threshold = 70;
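/*
 * Editor's illustration: an active metaslab whose fragmentation metric
 * rises above 70% loses its active status, so a less fragmented metaslab
 * can be selected and activated in its place.
 */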
144
428870ff 145/*
aa7d06a9 146 * When set will load all metaslabs when pool is first opened.
428870ff 147 */
aa7d06a9
GW
148int metaslab_debug_load = 0;
149
150/*
151 * When set will prevent metaslabs from being unloaded.
152 */
153int metaslab_debug_unload = 0;
428870ff 154
9babb374
BB
155/*
156 * Minimum size which forces the dynamic allocator to change
428870ff 157 * its allocation strategy. Once the space map cannot satisfy
9babb374
BB
158 * an allocation of this size then it switches to using a more
159 * aggressive strategy (i.e. search by size rather than offset).
160 */
4e21fd06 161uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
9babb374
BB
162
163/*
164 * The minimum free space, in percent, which must be available
165 * in a space map to continue allocations in a first-fit fashion.
4e21fd06 166 * Once the space map's free space drops below this level we dynamically
9babb374
BB
167 * switch to using best-fit allocations.
168 */
428870ff
BB
169int metaslab_df_free_pct = 4;
170
d3230d76
MA
171/*
172 * Maximum distance to search forward from the last offset. Without this
173 * limit, fragmented pools can see >100,000 iterations and
174 * metaslab_block_picker() becomes the performance limiting factor on
175 * high-performance storage.
176 *
177 * With the default setting of 16MB, we typically see less than 500
178 * iterations, even with very fragmented, ashift=9 pools. The maximum number
179 * of iterations possible is:
180 * metaslab_df_max_search / (2 * (1<<ashift))
181 * With the default setting of 16MB this is 16*1024 (with ashift=9) or
182 * 2048 (with ashift=12).
183 */
184int metaslab_df_max_search = 16 * 1024 * 1024;
185
186/*
187 * If we are not searching forward (due to metaslab_df_max_search,
188 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
189 * controls what segment is used. If it is set, we will use the largest free
190 * segment. If it is not set, we will use a segment of exactly the requested
191 * size (or larger).
192 */
193int metaslab_df_use_largest_segment = B_FALSE;
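/*
 * Editor's summary of the resulting policy (see metaslab_df_alloc() below):
 * first walk forward from the per-alignment cursor for up to
 * metaslab_df_max_search bytes; if that fails, or the metaslab is low on
 * space (metaslab_df_alloc_threshold / metaslab_df_free_pct), fall back to
 * a by-size lookup that returns either the largest free segment or the
 * smallest segment of at least the requested size, depending on this
 * tunable.
 */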
194
428870ff 195/*
93cf2076 196 * Percentage of all cpus that can be used by the metaslab taskq.
428870ff 197 */
93cf2076 198int metaslab_load_pct = 50;
428870ff
BB
199
200/*
eef0f4d8
PD
201 * These tunables control how long a metaslab will remain loaded after the
202 * last allocation from it. A metaslab can't be unloaded until at least
203 * metaslab_unload_delay TXGs and metaslab_unload_delay_ms milliseconds
204 * have elapsed. However, zfs_metaslab_mem_limit may cause it to be
205 * unloaded sooner. These settings are intended to be generous -- to keep
206 * metaslabs loaded for a long time, reducing the rate of metaslab loading.
428870ff 207 */
eef0f4d8
PD
208int metaslab_unload_delay = 32;
209int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
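/*
 * Editor's illustration: a metaslab last selected at txg T and time t only
 * becomes eligible for unloading once the pool has passed txg T + 32 and
 * at least ten minutes have elapsed since t, although zfs_metaslab_mem_limit
 * pressure can still evict it earlier (see metaslab_class_evict_old() and
 * metaslab_evict()).
 */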
9babb374 210
93cf2076
GW
211/*
212 * Max number of metaslabs per group to preload.
213 */
eef0f4d8 214int metaslab_preload_limit = 10;
93cf2076
GW
215
216/*
217 * Enable/disable preloading of metaslabs.
218 */
f3a7f661 219int metaslab_preload_enabled = B_TRUE;
93cf2076
GW
220
221/*
f3a7f661 222 * Enable/disable fragmentation weighting on metaslabs.
93cf2076 223 */
f3a7f661 224int metaslab_fragmentation_factor_enabled = B_TRUE;
93cf2076 225
f3a7f661
GW
226/*
227 * Enable/disable lba weighting (i.e. outer tracks are given preference).
228 */
229int metaslab_lba_weighting_enabled = B_TRUE;
230
231/*
232 * Enable/disable metaslab group biasing.
233 */
234int metaslab_bias_enabled = B_TRUE;
235
a1d477c2
MA
236/*
237 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
238 */
239boolean_t zfs_remap_blkptr_enable = B_TRUE;
240
4e21fd06
DB
241/*
242 * Enable/disable segment-based metaslab selection.
243 */
244int zfs_metaslab_segment_weight_enabled = B_TRUE;
245
246/*
247 * When using segment-based metaslab selection, we will continue
248 * allocating from the active metaslab until we have exhausted
249 * zfs_metaslab_switch_threshold of its buckets.
250 */
251int zfs_metaslab_switch_threshold = 2;
252
253/*
254 * Internal switch to enable/disable the metaslab allocation tracing
255 * facility.
256 */
257#ifdef _METASLAB_TRACING
258boolean_t metaslab_trace_enabled = B_TRUE;
259#endif
260
261/*
262 * Maximum entries that the metaslab allocation tracing facility will keep
263 * in a given list when running in non-debug mode. We limit the number
264 * of entries in non-debug mode to prevent us from using up too much memory.
265 * The limit should be sufficiently large that we don't expect any allocation
266 * to ever exceed this value. In debug mode, the system will panic if this
267 * limit is ever reached, allowing for further investigation.
268 */
269#ifdef _METASLAB_TRACING
270uint64_t metaslab_trace_max_entries = 5000;
271#endif
272
1b939560
BB
273/*
274 * Maximum number of metaslabs per group that can be disabled
275 * simultaneously.
276 */
277int max_disabled_ms = 3;
278
f09fda50
PD
279/*
280 * Maximum percentage of memory to use on storing loaded metaslabs. If loading
281 * a metaslab would take it over this percentage, the oldest selected metaslab
282 * is automatically unloaded.
283 */
eef0f4d8
PD
284int zfs_metaslab_mem_limit = 25;
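/*
 * Editor's illustration: with the default of 25, if loading one more
 * metaslab would push the memory used by loaded metaslabs past 25% of
 * memory, the oldest selected metaslab is unloaded first instead of
 * letting the footprint keep growing.
 */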
285
286/*
287 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
288 * To avoid 64-bit overflow, don't set above UINT32_MAX.
289 */
290unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
f09fda50 291
65a91b16
SD
292static uint64_t metaslab_weight(metaslab_t *, boolean_t);
293static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
d2734cce 294static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
a1d477c2 295static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
4e21fd06 296
492f64e9
PD
297static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
298static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
93e28d66 299static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
f09fda50
PD
300static unsigned int metaslab_idx_func(multilist_t *, void *);
301static void metaslab_evict(metaslab_t *, uint64_t);
4e21fd06
DB
302#ifdef _METASLAB_TRACING
303kmem_cache_t *metaslab_alloc_trace_cache;
304#endif
93cf2076 305
34dc7c2f
BB
306/*
307 * ==========================================================================
308 * Metaslab classes
309 * ==========================================================================
310 */
311metaslab_class_t *
93cf2076 312metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
34dc7c2f
BB
313{
314 metaslab_class_t *mc;
315
79c76d5b 316 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
34dc7c2f 317
428870ff 318 mc->mc_spa = spa;
34dc7c2f 319 mc->mc_rotor = NULL;
9babb374 320 mc->mc_ops = ops;
3dfb57a3 321 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
f09fda50
PD
322 mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
323 offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
492f64e9 324 mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
c13060e4 325 sizeof (zfs_refcount_t), KM_SLEEP);
492f64e9
PD
326 mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
327 sizeof (uint64_t), KM_SLEEP);
328 for (int i = 0; i < spa->spa_alloc_count; i++)
424fd7c3 329 zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
34dc7c2f
BB
330
331 return (mc);
332}
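/*
 * Editor's note on the structure built above: mc_alloc_slots holds one
 * tracked reference count per allocator (spa->spa_alloc_count of them) and
 * mc_alloc_max_slots the matching per-allocator queue-depth limits; both
 * are used by the allocation throttle and are torn down again in
 * metaslab_class_destroy() below.
 */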
333
334void
335metaslab_class_destroy(metaslab_class_t *mc)
336{
428870ff
BB
337 ASSERT(mc->mc_rotor == NULL);
338 ASSERT(mc->mc_alloc == 0);
339 ASSERT(mc->mc_deferred == 0);
340 ASSERT(mc->mc_space == 0);
341 ASSERT(mc->mc_dspace == 0);
34dc7c2f 342
492f64e9 343 for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
424fd7c3 344 zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
492f64e9 345 kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
c13060e4 346 sizeof (zfs_refcount_t));
492f64e9
PD
347 kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
348 sizeof (uint64_t));
3dfb57a3 349 mutex_destroy(&mc->mc_lock);
f09fda50 350 multilist_destroy(mc->mc_metaslab_txg_list);
34dc7c2f
BB
351 kmem_free(mc, sizeof (metaslab_class_t));
352}
353
428870ff
BB
354int
355metaslab_class_validate(metaslab_class_t *mc)
34dc7c2f 356{
428870ff
BB
357 metaslab_group_t *mg;
358 vdev_t *vd;
34dc7c2f 359
428870ff
BB
360 /*
361 * Must hold one of the spa_config locks.
362 */
363 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
364 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
34dc7c2f 365
428870ff
BB
366 if ((mg = mc->mc_rotor) == NULL)
367 return (0);
368
369 do {
370 vd = mg->mg_vd;
371 ASSERT(vd->vdev_mg != NULL);
372 ASSERT3P(vd->vdev_top, ==, vd);
373 ASSERT3P(mg->mg_class, ==, mc);
374 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
375 } while ((mg = mg->mg_next) != mc->mc_rotor);
376
377 return (0);
34dc7c2f
BB
378}
379
cc99f275 380static void
428870ff
BB
381metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
382 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
34dc7c2f 383{
428870ff
BB
384 atomic_add_64(&mc->mc_alloc, alloc_delta);
385 atomic_add_64(&mc->mc_deferred, defer_delta);
386 atomic_add_64(&mc->mc_space, space_delta);
387 atomic_add_64(&mc->mc_dspace, dspace_delta);
388}
34dc7c2f 389
428870ff
BB
390uint64_t
391metaslab_class_get_alloc(metaslab_class_t *mc)
392{
393 return (mc->mc_alloc);
394}
34dc7c2f 395
428870ff
BB
396uint64_t
397metaslab_class_get_deferred(metaslab_class_t *mc)
398{
399 return (mc->mc_deferred);
400}
34dc7c2f 401
428870ff
BB
402uint64_t
403metaslab_class_get_space(metaslab_class_t *mc)
404{
405 return (mc->mc_space);
406}
34dc7c2f 407
428870ff
BB
408uint64_t
409metaslab_class_get_dspace(metaslab_class_t *mc)
410{
411 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
34dc7c2f
BB
412}
413
f3a7f661
GW
414void
415metaslab_class_histogram_verify(metaslab_class_t *mc)
416{
cc99f275
DB
417 spa_t *spa = mc->mc_spa;
418 vdev_t *rvd = spa->spa_root_vdev;
f3a7f661 419 uint64_t *mc_hist;
1c27024e 420 int i;
f3a7f661
GW
421
422 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
423 return;
424
425 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
79c76d5b 426 KM_SLEEP);
f3a7f661 427
1c27024e 428 for (int c = 0; c < rvd->vdev_children; c++) {
f3a7f661
GW
429 vdev_t *tvd = rvd->vdev_child[c];
430 metaslab_group_t *mg = tvd->vdev_mg;
431
432 /*
433 * Skip any holes, uninitialized top-levels, or
434 * vdevs that are not in this metaslab class.
435 */
a1d477c2 436 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
f3a7f661
GW
437 mg->mg_class != mc) {
438 continue;
439 }
440
441 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
442 mc_hist[i] += mg->mg_histogram[i];
443 }
444
445 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
446 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
447
448 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
449}
450
451/*
452 * Calculate the metaslab class's fragmentation metric. The metric
453 * is weighted based on the space contribution of each metaslab group.
454 * The return value will be a number between 0 and 100 (inclusive), or
455 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
456 * zfs_frag_table for more information about the metric.
457 */
458uint64_t
459metaslab_class_fragmentation(metaslab_class_t *mc)
460{
461 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
462 uint64_t fragmentation = 0;
f3a7f661
GW
463
464 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
465
1c27024e 466 for (int c = 0; c < rvd->vdev_children; c++) {
f3a7f661
GW
467 vdev_t *tvd = rvd->vdev_child[c];
468 metaslab_group_t *mg = tvd->vdev_mg;
469
470 /*
a1d477c2
MA
471 * Skip any holes, uninitialized top-levels,
472 * or vdevs that are not in this metaslab class.
f3a7f661 473 */
a1d477c2 474 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
f3a7f661
GW
475 mg->mg_class != mc) {
476 continue;
477 }
478
479 /*
480 * If a metaslab group does not contain a fragmentation
481 * metric then just bail out.
482 */
483 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
484 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
485 return (ZFS_FRAG_INVALID);
486 }
487
488 /*
489 * Determine how much this metaslab_group is contributing
490 * to the overall pool fragmentation metric.
491 */
492 fragmentation += mg->mg_fragmentation *
493 metaslab_group_get_space(mg);
494 }
495 fragmentation /= metaslab_class_get_space(mc);
496
497 ASSERT3U(fragmentation, <=, 100);
498 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
499 return (fragmentation);
500}
501
502/*
503 * Calculate the amount of expandable space that is available in
504 * this metaslab class. If a device is expanded then its expandable
505 * space will be the amount of allocatable space that is currently not
506 * part of this metaslab class.
507 */
508uint64_t
509metaslab_class_expandable_space(metaslab_class_t *mc)
510{
511 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
512 uint64_t space = 0;
f3a7f661
GW
513
514 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
1c27024e 515 for (int c = 0; c < rvd->vdev_children; c++) {
f3a7f661
GW
516 vdev_t *tvd = rvd->vdev_child[c];
517 metaslab_group_t *mg = tvd->vdev_mg;
518
a1d477c2 519 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
f3a7f661
GW
520 mg->mg_class != mc) {
521 continue;
522 }
523
0f676dc2
GM
524 /*
525 * Calculate if we have enough space to add additional
526 * metaslabs. We report the expandable space in terms
527 * of the metaslab size since that's the unit of expansion.
528 */
529 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
530 1ULL << tvd->vdev_ms_shift);
f3a7f661
GW
531 }
532 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
533 return (space);
534}
535
f09fda50
PD
536void
537metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
538{
539 multilist_t *ml = mc->mc_metaslab_txg_list;
540 for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
541 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
542 metaslab_t *msp = multilist_sublist_head(mls);
543 multilist_sublist_unlock(mls);
544 while (msp != NULL) {
545 mutex_enter(&msp->ms_lock);
f09fda50
PD
546
547 /*
548 * If the metaslab has been removed from the list
549 * (which could happen if we were at the memory limit
550 * and it was evicted during this loop), then we can't
551 * proceed and we should restart the sublist.
552 */
553 if (!multilist_link_active(&msp->ms_class_txg_node)) {
554 mutex_exit(&msp->ms_lock);
555 i--;
556 break;
557 }
558 mls = multilist_sublist_lock(ml, i);
559 metaslab_t *next_msp = multilist_sublist_next(mls, msp);
560 multilist_sublist_unlock(mls);
eef0f4d8
PD
561 if (txg >
562 msp->ms_selected_txg + metaslab_unload_delay &&
563 gethrtime() > msp->ms_selected_time +
564 (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
565 metaslab_evict(msp, txg);
566 } else {
567 /*
568 * Once we've hit a metaslab selected too
569 * recently to evict, we're done evicting for
570 * now.
571 */
572 mutex_exit(&msp->ms_lock);
573 break;
574 }
f09fda50
PD
575 mutex_exit(&msp->ms_lock);
576 msp = next_msp;
577 }
578 }
579}
580
34dc7c2f
BB
581static int
582metaslab_compare(const void *x1, const void *x2)
583{
ee36c709
GN
584 const metaslab_t *m1 = (const metaslab_t *)x1;
585 const metaslab_t *m2 = (const metaslab_t *)x2;
34dc7c2f 586
492f64e9
PD
587 int sort1 = 0;
588 int sort2 = 0;
589 if (m1->ms_allocator != -1 && m1->ms_primary)
590 sort1 = 1;
591 else if (m1->ms_allocator != -1 && !m1->ms_primary)
592 sort1 = 2;
593 if (m2->ms_allocator != -1 && m2->ms_primary)
594 sort2 = 1;
595 else if (m2->ms_allocator != -1 && !m2->ms_primary)
596 sort2 = 2;
597
598 /*
599 * Sort inactive metaslabs first, then primaries, then secondaries. When
600 * selecting a metaslab to allocate from, an allocator first tries its
601 * primary, then secondary active metaslab. If it doesn't have active
602 * metaslabs, or can't allocate from them, it searches for an inactive
603 * metaslab to activate. If it can't find a suitable one, it will steal
604 * a primary or secondary metaslab from another allocator.
605 */
606 if (sort1 < sort2)
607 return (-1);
608 if (sort1 > sort2)
609 return (1);
610
ee36c709
GN
611 int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
612 if (likely(cmp))
613 return (cmp);
34dc7c2f 614
ee36c709 615 IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
34dc7c2f 616
ee36c709 617 return (AVL_CMP(m1->ms_start, m2->ms_start));
34dc7c2f
BB
618}
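/*
 * Editor's illustration of the resulting AVL order: all inactive metaslabs
 * sort before primaries, which sort before secondaries; within each of
 * those classes, higher ms_weight sorts earlier, and ms_start is the final
 * tie-breaker so the comparison stays a total order.
 */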
619
4e21fd06
DB
620/*
621 * ==========================================================================
622 * Metaslab groups
623 * ==========================================================================
624 */
ac72fac3
GW
625/*
626 * Update the allocatable flag and the metaslab group's capacity.
627 * The allocatable flag is set to true if the capacity is below
3dfb57a3
DB
628 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
629 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
630 * transitions from allocatable to non-allocatable or vice versa then the
631 * metaslab group's class is updated to reflect the transition.
ac72fac3
GW
632 */
633static void
634metaslab_group_alloc_update(metaslab_group_t *mg)
635{
636 vdev_t *vd = mg->mg_vd;
637 metaslab_class_t *mc = mg->mg_class;
638 vdev_stat_t *vs = &vd->vdev_stat;
639 boolean_t was_allocatable;
3dfb57a3 640 boolean_t was_initialized;
ac72fac3
GW
641
642 ASSERT(vd == vd->vdev_top);
a1d477c2
MA
643 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
644 SCL_ALLOC);
ac72fac3
GW
645
646 mutex_enter(&mg->mg_lock);
647 was_allocatable = mg->mg_allocatable;
3dfb57a3 648 was_initialized = mg->mg_initialized;
ac72fac3
GW
649
650 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
651 (vs->vs_space + 1);
652
3dfb57a3
DB
653 mutex_enter(&mc->mc_lock);
654
655 /*
656 * If the metaslab group was just added then it won't
657 * have any space until we finish syncing out this txg.
658 * At that point we will consider it initialized and available
659 * for allocations. We also don't consider non-activated
660 * metaslab groups (e.g. vdevs that are in the middle of being removed)
661 * to be initialized, because they can't be used for allocation.
662 */
663 mg->mg_initialized = metaslab_group_initialized(mg);
664 if (!was_initialized && mg->mg_initialized) {
665 mc->mc_groups++;
666 } else if (was_initialized && !mg->mg_initialized) {
667 ASSERT3U(mc->mc_groups, >, 0);
668 mc->mc_groups--;
669 }
670 if (mg->mg_initialized)
671 mg->mg_no_free_space = B_FALSE;
672
f3a7f661
GW
673 /*
674 * A metaslab group is considered allocatable if it has plenty
675 * of free space or is not heavily fragmented. We only take
676 * fragmentation into account if the metaslab group has a valid
677 * fragmentation metric (i.e. a value between 0 and 100).
678 */
3dfb57a3
DB
679 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
680 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
f3a7f661
GW
681 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
682 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
ac72fac3
GW
683
684 /*
685 * The mc_alloc_groups maintains a count of the number of
686 * groups in this metaslab class that are still above the
687 * zfs_mg_noalloc_threshold. This is used by the allocating
688 * threads to determine if they should avoid allocations to
689 * a given group. The allocator will avoid allocations to a group
690 * if that group has reached or is below the zfs_mg_noalloc_threshold
691 * and there are still other groups that are above the threshold.
692 * When a group transitions from allocatable to non-allocatable or
693 * vice versa we update the metaslab class to reflect that change.
694 * When the mc_alloc_groups value drops to 0 that means that all
695 * groups have reached the zfs_mg_noalloc_threshold making all groups
696 * eligible for allocations. This effectively means that all devices
697 * are balanced again.
698 */
699 if (was_allocatable && !mg->mg_allocatable)
700 mc->mc_alloc_groups--;
701 else if (!was_allocatable && mg->mg_allocatable)
702 mc->mc_alloc_groups++;
3dfb57a3 703 mutex_exit(&mc->mc_lock);
f3a7f661 704
ac72fac3
GW
705 mutex_exit(&mg->mg_lock);
706}
707
93e28d66
SD
708int
709metaslab_sort_by_flushed(const void *va, const void *vb)
710{
711 const metaslab_t *a = va;
712 const metaslab_t *b = vb;
713
714 int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
715 if (likely(cmp))
716 return (cmp);
717
718 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
719 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
720 cmp = AVL_CMP(a_vdev_id, b_vdev_id);
721 if (cmp)
722 return (cmp);
723
724 return (AVL_CMP(a->ms_id, b->ms_id));
725}
726
34dc7c2f 727metaslab_group_t *
492f64e9 728metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
34dc7c2f
BB
729{
730 metaslab_group_t *mg;
731
79c76d5b 732 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
34dc7c2f 733 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
1b939560
BB
734 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
735 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
492f64e9
PD
736 mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
737 KM_SLEEP);
738 mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
739 KM_SLEEP);
34dc7c2f 740 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
93e28d66 741 sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
34dc7c2f 742 mg->mg_vd = vd;
428870ff
BB
743 mg->mg_class = mc;
744 mg->mg_activation_count = 0;
3dfb57a3
DB
745 mg->mg_initialized = B_FALSE;
746 mg->mg_no_free_space = B_TRUE;
492f64e9
PD
747 mg->mg_allocators = allocators;
748
c13060e4
TS
749 mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
750 sizeof (zfs_refcount_t), KM_SLEEP);
492f64e9
PD
751 mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
752 sizeof (uint64_t), KM_SLEEP);
753 for (int i = 0; i < allocators; i++) {
424fd7c3 754 zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
492f64e9
PD
755 mg->mg_cur_max_alloc_queue_depth[i] = 0;
756 }
34dc7c2f 757
3c51c5cb 758 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
1229323d 759 maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
93cf2076 760
34dc7c2f
BB
761 return (mg);
762}
763
764void
765metaslab_group_destroy(metaslab_group_t *mg)
766{
428870ff
BB
767 ASSERT(mg->mg_prev == NULL);
768 ASSERT(mg->mg_next == NULL);
769 /*
770 * We may have gone below zero with the activation count
771 * either because we never activated in the first place or
772 * because we're done, and possibly removing the vdev.
773 */
774 ASSERT(mg->mg_activation_count <= 0);
775
3c51c5cb 776 taskq_destroy(mg->mg_taskq);
34dc7c2f 777 avl_destroy(&mg->mg_metaslab_tree);
492f64e9
PD
778 kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
779 kmem_free(mg->mg_secondaries, mg->mg_allocators *
780 sizeof (metaslab_t *));
34dc7c2f 781 mutex_destroy(&mg->mg_lock);
1b939560
BB
782 mutex_destroy(&mg->mg_ms_disabled_lock);
783 cv_destroy(&mg->mg_ms_disabled_cv);
492f64e9
PD
784
785 for (int i = 0; i < mg->mg_allocators; i++) {
424fd7c3 786 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
492f64e9
PD
787 mg->mg_cur_max_alloc_queue_depth[i] = 0;
788 }
789 kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
c13060e4 790 sizeof (zfs_refcount_t));
492f64e9
PD
791 kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
792 sizeof (uint64_t));
793
34dc7c2f
BB
794 kmem_free(mg, sizeof (metaslab_group_t));
795}
796
428870ff
BB
797void
798metaslab_group_activate(metaslab_group_t *mg)
799{
800 metaslab_class_t *mc = mg->mg_class;
801 metaslab_group_t *mgprev, *mgnext;
802
a1d477c2 803 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
428870ff
BB
804
805 ASSERT(mc->mc_rotor != mg);
806 ASSERT(mg->mg_prev == NULL);
807 ASSERT(mg->mg_next == NULL);
808 ASSERT(mg->mg_activation_count <= 0);
809
810 if (++mg->mg_activation_count <= 0)
811 return;
812
813 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
ac72fac3 814 metaslab_group_alloc_update(mg);
428870ff
BB
815
816 if ((mgprev = mc->mc_rotor) == NULL) {
817 mg->mg_prev = mg;
818 mg->mg_next = mg;
819 } else {
820 mgnext = mgprev->mg_next;
821 mg->mg_prev = mgprev;
822 mg->mg_next = mgnext;
823 mgprev->mg_next = mg;
824 mgnext->mg_prev = mg;
825 }
826 mc->mc_rotor = mg;
827}
828
a1d477c2
MA
829/*
830 * Passivate a metaslab group and remove it from the allocation rotor.
831 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
832 * a metaslab group. This function will momentarily drop spa_config_locks
833 * that are lower than the SCL_ALLOC lock (see comment below).
834 */
428870ff
BB
835void
836metaslab_group_passivate(metaslab_group_t *mg)
837{
838 metaslab_class_t *mc = mg->mg_class;
a1d477c2 839 spa_t *spa = mc->mc_spa;
428870ff 840 metaslab_group_t *mgprev, *mgnext;
a1d477c2 841 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
428870ff 842
a1d477c2
MA
843 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
844 (SCL_ALLOC | SCL_ZIO));
428870ff
BB
845
846 if (--mg->mg_activation_count != 0) {
847 ASSERT(mc->mc_rotor != mg);
848 ASSERT(mg->mg_prev == NULL);
849 ASSERT(mg->mg_next == NULL);
850 ASSERT(mg->mg_activation_count < 0);
851 return;
852 }
853
a1d477c2
MA
854 /*
855 * The spa_config_lock is an array of rwlocks, ordered as
856 * follows (from highest to lowest):
857 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
858 * SCL_ZIO > SCL_FREE > SCL_VDEV
859 * (For more information about the spa_config_lock see spa_misc.c)
860 * The higher the lock, the broader its coverage. When we passivate
861 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
862 * config locks. However, the metaslab group's taskq might be trying
863 * to preload metaslabs so we must drop the SCL_ZIO lock and any
864 * lower locks to allow the I/O to complete. At a minimum,
865 * we continue to hold the SCL_ALLOC lock, which prevents any future
866 * allocations from taking place and any changes to the vdev tree.
867 */
868 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
c5528b9b 869 taskq_wait_outstanding(mg->mg_taskq, 0);
a1d477c2 870 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
f3a7f661 871 metaslab_group_alloc_update(mg);
492f64e9
PD
872 for (int i = 0; i < mg->mg_allocators; i++) {
873 metaslab_t *msp = mg->mg_primaries[i];
874 if (msp != NULL) {
875 mutex_enter(&msp->ms_lock);
876 metaslab_passivate(msp,
877 metaslab_weight_from_range_tree(msp));
878 mutex_exit(&msp->ms_lock);
879 }
880 msp = mg->mg_secondaries[i];
881 if (msp != NULL) {
882 mutex_enter(&msp->ms_lock);
883 metaslab_passivate(msp,
884 metaslab_weight_from_range_tree(msp));
885 mutex_exit(&msp->ms_lock);
886 }
887 }
93cf2076 888
428870ff
BB
889 mgprev = mg->mg_prev;
890 mgnext = mg->mg_next;
891
892 if (mg == mgnext) {
893 mc->mc_rotor = NULL;
894 } else {
895 mc->mc_rotor = mgnext;
896 mgprev->mg_next = mgnext;
897 mgnext->mg_prev = mgprev;
898 }
899
900 mg->mg_prev = NULL;
901 mg->mg_next = NULL;
902}
903
3dfb57a3
DB
904boolean_t
905metaslab_group_initialized(metaslab_group_t *mg)
906{
907 vdev_t *vd = mg->mg_vd;
908 vdev_stat_t *vs = &vd->vdev_stat;
909
910 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
911}
912
f3a7f661
GW
913uint64_t
914metaslab_group_get_space(metaslab_group_t *mg)
915{
916 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
917}
918
919void
920metaslab_group_histogram_verify(metaslab_group_t *mg)
921{
922 uint64_t *mg_hist;
923 vdev_t *vd = mg->mg_vd;
924 uint64_t ashift = vd->vdev_ashift;
1c27024e 925 int i;
f3a7f661
GW
926
927 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
928 return;
929
930 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
79c76d5b 931 KM_SLEEP);
f3a7f661
GW
932
933 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
934 SPACE_MAP_HISTOGRAM_SIZE + ashift);
935
1c27024e 936 for (int m = 0; m < vd->vdev_ms_count; m++) {
f3a7f661
GW
937 metaslab_t *msp = vd->vdev_ms[m];
938
cc99f275
DB
939 /* skip if not active or not a member */
940 if (msp->ms_sm == NULL || msp->ms_group != mg)
f3a7f661
GW
941 continue;
942
943 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
944 mg_hist[i + ashift] +=
945 msp->ms_sm->sm_phys->smp_histogram[i];
946 }
947
948 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
949 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
950
951 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
952}
953
34dc7c2f 954static void
f3a7f661 955metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
34dc7c2f 956{
f3a7f661
GW
957 metaslab_class_t *mc = mg->mg_class;
958 uint64_t ashift = mg->mg_vd->vdev_ashift;
f3a7f661
GW
959
960 ASSERT(MUTEX_HELD(&msp->ms_lock));
961 if (msp->ms_sm == NULL)
962 return;
963
34dc7c2f 964 mutex_enter(&mg->mg_lock);
1c27024e 965 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
f3a7f661
GW
966 mg->mg_histogram[i + ashift] +=
967 msp->ms_sm->sm_phys->smp_histogram[i];
968 mc->mc_histogram[i + ashift] +=
969 msp->ms_sm->sm_phys->smp_histogram[i];
970 }
971 mutex_exit(&mg->mg_lock);
972}
973
974void
975metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
976{
977 metaslab_class_t *mc = mg->mg_class;
978 uint64_t ashift = mg->mg_vd->vdev_ashift;
f3a7f661
GW
979
980 ASSERT(MUTEX_HELD(&msp->ms_lock));
981 if (msp->ms_sm == NULL)
982 return;
983
984 mutex_enter(&mg->mg_lock);
1c27024e 985 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
f3a7f661
GW
986 ASSERT3U(mg->mg_histogram[i + ashift], >=,
987 msp->ms_sm->sm_phys->smp_histogram[i]);
988 ASSERT3U(mc->mc_histogram[i + ashift], >=,
989 msp->ms_sm->sm_phys->smp_histogram[i]);
990
991 mg->mg_histogram[i + ashift] -=
992 msp->ms_sm->sm_phys->smp_histogram[i];
993 mc->mc_histogram[i + ashift] -=
994 msp->ms_sm->sm_phys->smp_histogram[i];
995 }
996 mutex_exit(&mg->mg_lock);
997}
998
999static void
1000metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
1001{
34dc7c2f 1002 ASSERT(msp->ms_group == NULL);
f3a7f661 1003 mutex_enter(&mg->mg_lock);
34dc7c2f
BB
1004 msp->ms_group = mg;
1005 msp->ms_weight = 0;
1006 avl_add(&mg->mg_metaslab_tree, msp);
1007 mutex_exit(&mg->mg_lock);
f3a7f661
GW
1008
1009 mutex_enter(&msp->ms_lock);
1010 metaslab_group_histogram_add(mg, msp);
1011 mutex_exit(&msp->ms_lock);
34dc7c2f
BB
1012}
1013
1014static void
1015metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1016{
f3a7f661
GW
1017 mutex_enter(&msp->ms_lock);
1018 metaslab_group_histogram_remove(mg, msp);
1019 mutex_exit(&msp->ms_lock);
1020
34dc7c2f
BB
1021 mutex_enter(&mg->mg_lock);
1022 ASSERT(msp->ms_group == mg);
1023 avl_remove(&mg->mg_metaslab_tree, msp);
f09fda50
PD
1024
1025 metaslab_class_t *mc = msp->ms_group->mg_class;
1026 multilist_sublist_t *mls =
1027 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
1028 if (multilist_link_active(&msp->ms_class_txg_node))
1029 multilist_sublist_remove(mls, msp);
1030 multilist_sublist_unlock(mls);
1031
34dc7c2f
BB
1032 msp->ms_group = NULL;
1033 mutex_exit(&mg->mg_lock);
1034}
1035
492f64e9
PD
1036static void
1037metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1038{
679b0f2a 1039 ASSERT(MUTEX_HELD(&msp->ms_lock));
492f64e9
PD
1040 ASSERT(MUTEX_HELD(&mg->mg_lock));
1041 ASSERT(msp->ms_group == mg);
679b0f2a 1042
492f64e9
PD
1043 avl_remove(&mg->mg_metaslab_tree, msp);
1044 msp->ms_weight = weight;
1045 avl_add(&mg->mg_metaslab_tree, msp);
1046
1047}
1048
34dc7c2f
BB
1049static void
1050metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1051{
1052 /*
1053 * Although in principle the weight can be any value, in
f3a7f661 1054 * practice we do not use values in the range [1, 511].
34dc7c2f 1055 */
f3a7f661 1056 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
34dc7c2f
BB
1057 ASSERT(MUTEX_HELD(&msp->ms_lock));
1058
1059 mutex_enter(&mg->mg_lock);
492f64e9 1060 metaslab_group_sort_impl(mg, msp, weight);
34dc7c2f
BB
1061 mutex_exit(&mg->mg_lock);
1062}
1063
f3a7f661
GW
1064/*
1065 * Calculate the fragmentation for a given metaslab group. We can use
1066 * a simple average here since all metaslabs within the group must have
1067 * the same size. The return value will be a value between 0 and 100
1068 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
1069 * group have a fragmentation metric.
1070 */
1071uint64_t
1072metaslab_group_fragmentation(metaslab_group_t *mg)
1073{
1074 vdev_t *vd = mg->mg_vd;
1075 uint64_t fragmentation = 0;
1076 uint64_t valid_ms = 0;
f3a7f661 1077
1c27024e 1078 for (int m = 0; m < vd->vdev_ms_count; m++) {
f3a7f661
GW
1079 metaslab_t *msp = vd->vdev_ms[m];
1080
1081 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1082 continue;
cc99f275
DB
1083 if (msp->ms_group != mg)
1084 continue;
f3a7f661
GW
1085
1086 valid_ms++;
1087 fragmentation += msp->ms_fragmentation;
1088 }
1089
cc99f275 1090 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
f3a7f661
GW
1091 return (ZFS_FRAG_INVALID);
1092
1093 fragmentation /= valid_ms;
1094 ASSERT3U(fragmentation, <=, 100);
1095 return (fragmentation);
1096}
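/*
 * Editor's worked example: a group of 10 metaslabs where 6 have valid
 * fragmentation metrics of 10, 20, 30, 40, 50 and 60 reports
 * (10+20+30+40+50+60) / 6 = 35. If only 5 of the 10 had valid metrics,
 * valid_ms would not exceed half the metaslab count and the function would
 * return ZFS_FRAG_INVALID instead.
 */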
1097
ac72fac3
GW
1098/*
1099 * Determine if a given metaslab group should skip allocations. A metaslab
f3a7f661
GW
1100 * group should avoid allocations if its free capacity is less than the
1101 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
1102 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
3dfb57a3
DB
1103 * that can still handle allocations. If the allocation throttle is enabled
1104 * then we skip allocations to devices that have reached their maximum
1105 * allocation queue depth unless the selected metaslab group is the only
1106 * eligible group remaining.
ac72fac3
GW
1107 */
1108static boolean_t
3dfb57a3 1109metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
c197a77c 1110 uint64_t psize, int allocator, int d)
ac72fac3 1111{
3dfb57a3 1112 spa_t *spa = mg->mg_vd->vdev_spa;
ac72fac3
GW
1113 metaslab_class_t *mc = mg->mg_class;
1114
1115 /*
3dfb57a3
DB
1116 * We can only consider skipping this metaslab group if it's
1117 * in the normal metaslab class and there are other metaslab
1118 * groups to select from. Otherwise, we always consider it eligible
f3a7f661 1119 * for allocations.
ac72fac3 1120 */
cc99f275
DB
1121 if ((mc != spa_normal_class(spa) &&
1122 mc != spa_special_class(spa) &&
1123 mc != spa_dedup_class(spa)) ||
1124 mc->mc_groups <= 1)
3dfb57a3
DB
1125 return (B_TRUE);
1126
1127 /*
1128 * If the metaslab group's mg_allocatable flag is set (see comments
1129 * in metaslab_group_alloc_update() for more information) and
1130 * the allocation throttle is disabled then allow allocations to this
1131 * device. However, if the allocation throttle is enabled then
1132 * check if we have reached our allocation limit (mg_alloc_queue_depth)
1133 * to determine if we should allow allocations to this metaslab group.
1134 * If all metaslab groups are no longer considered allocatable
1135 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1136 * gang block size then we allow allocations on this metaslab group
1137 * regardless of the mg_allocatable or throttle settings.
1138 */
1139 if (mg->mg_allocatable) {
1140 metaslab_group_t *mgp;
1141 int64_t qdepth;
492f64e9 1142 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
3dfb57a3
DB
1143
1144 if (!mc->mc_alloc_throttle_enabled)
1145 return (B_TRUE);
1146
1147 /*
1148 * If this metaslab group does not have any free space, then
1149 * there is no point in looking further.
1150 */
1151 if (mg->mg_no_free_space)
1152 return (B_FALSE);
1153
c197a77c 1154 /*
1155 * Relax allocation throttling for ditto blocks. Due to
1156 * random imbalances in allocation it tends to push copies
1157 * to the one vdev that looks a bit better at the moment.
1158 */
1159 qmax = qmax * (4 + d) / 4;
1160
424fd7c3
TS
1161 qdepth = zfs_refcount_count(
1162 &mg->mg_alloc_queue_depth[allocator]);
3dfb57a3
DB
1163
1164 /*
1165 * If this metaslab group is below its qmax or it's
1166 * the only allocatable metaslab group, then attempt
1167 * to allocate from it.
1168 */
1169 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1170 return (B_TRUE);
1171 ASSERT3U(mc->mc_alloc_groups, >, 1);
1172
1173 /*
1174 * Since this metaslab group is at or over its qmax, we
1175 * need to determine if there are metaslab groups after this
1176 * one that might be able to handle this allocation. This is
1177 * racy since we can't hold the locks for all metaslab
1178 * groups at the same time when we make this check.
1179 */
1180 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
492f64e9 1181 qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
c197a77c 1182 qmax = qmax * (4 + d) / 4;
424fd7c3 1183 qdepth = zfs_refcount_count(
492f64e9 1184 &mgp->mg_alloc_queue_depth[allocator]);
3dfb57a3
DB
1185
1186 /*
1187 * If there is another metaslab group that
1188 * might be able to handle the allocation, then
1189 * we return false so that we skip this group.
1190 */
1191 if (qdepth < qmax && !mgp->mg_no_free_space)
1192 return (B_FALSE);
1193 }
1194
1195 /*
1196 * We didn't find another group to handle the allocation
1197 * so we can't skip this metaslab group even though
1198 * we are at or over our qmax.
1199 */
1200 return (B_TRUE);
1201
1202 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1203 return (B_TRUE);
1204 }
1205 return (B_FALSE);
ac72fac3
GW
1206}
1207
428870ff
BB
1208/*
1209 * ==========================================================================
93cf2076 1210 * Range tree callbacks
428870ff
BB
1211 * ==========================================================================
1212 */
93cf2076
GW
1213
1214/*
1215 * Comparison function for the private size-ordered tree. Tree is sorted
1216 * by size, larger sizes at the end of the tree.
1217 */
428870ff 1218static int
93cf2076 1219metaslab_rangesize_compare(const void *x1, const void *x2)
428870ff 1220{
93cf2076
GW
1221 const range_seg_t *r1 = x1;
1222 const range_seg_t *r2 = x2;
1223 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1224 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
428870ff 1225
ee36c709
GN
1226 int cmp = AVL_CMP(rs_size1, rs_size2);
1227 if (likely(cmp))
1228 return (cmp);
428870ff 1229
ee36c709 1230 return (AVL_CMP(r1->rs_start, r2->rs_start));
428870ff
BB
1231}
1232
93cf2076
GW
1233/*
1234 * ==========================================================================
4e21fd06 1235 * Common allocator routines
93cf2076
GW
1236 * ==========================================================================
1237 */
1238
9babb374 1239/*
428870ff 1240 * Return the maximum contiguous segment within the metaslab.
9babb374 1241 */
9babb374 1242uint64_t
c81f1790 1243metaslab_largest_allocatable(metaslab_t *msp)
9babb374 1244{
d2734cce 1245 avl_tree_t *t = &msp->ms_allocatable_by_size;
93cf2076 1246 range_seg_t *rs;
9babb374 1247
c81f1790
PD
1248 if (t == NULL)
1249 return (0);
1250 rs = avl_last(t);
1251 if (rs == NULL)
1252 return (0);
9babb374 1253
93cf2076
GW
1254 return (rs->rs_end - rs->rs_start);
1255}
1256
c81f1790
PD
1257/*
1258 * Return the maximum contiguous segment within the unflushed frees of this
1259 * metaslab.
1260 */
1261uint64_t
1262metaslab_largest_unflushed_free(metaslab_t *msp)
1263{
1264 ASSERT(MUTEX_HELD(&msp->ms_lock));
1265
1266 if (msp->ms_unflushed_frees == NULL)
1267 return (0);
1268
1269 range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
1270 if (rs == NULL)
1271 return (0);
1272
1273 /*
1274 * When a range is freed from the metaslab, that range is added to
1275 * both the unflushed frees and the deferred frees. While the block
1276 * will eventually be usable, if the metaslab were loaded the range
1277 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
1278 * txgs had passed. As a result, when attempting to estimate an upper
1279 * bound for the largest currently-usable free segment in the
1280 * metaslab, we need to not consider any ranges currently in the defer
1281 * trees. This algorithm approximates the largest available chunk in
1282 * the largest range in the unflushed_frees tree by taking the first
1283 * chunk. While this may be a poor estimate, it should only remain so
1284 * briefly and should eventually self-correct as frees are no longer
1285 * deferred. Similar logic applies to the ms_freed tree. See
1286 * metaslab_load() for more details.
1287 *
e1cfd73f 1288 * There are two primary sources of inaccuracy in this estimate. Both
c81f1790
PD
1289 * are tolerated for performance reasons. The first source is that we
1290 * only check the largest segment for overlaps. Smaller segments may
1291 * have more favorable overlaps with the other trees, resulting in
1292 * larger usable chunks. Second, we only look at the first chunk in
1293 * the largest segment; there may be other usable chunks in the
1294 * largest segment, but we ignore them.
1295 */
1296 uint64_t rstart = rs->rs_start;
1297 uint64_t rsize = rs->rs_end - rstart;
1298 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1299 uint64_t start = 0;
1300 uint64_t size = 0;
1301 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1302 rsize, &start, &size);
1303 if (found) {
1304 if (rstart == start)
1305 return (0);
1306 rsize = start - rstart;
1307 }
1308 }
1309
1310 uint64_t start = 0;
1311 uint64_t size = 0;
1312 boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1313 rsize, &start, &size);
1314 if (found)
1315 rsize = start - rstart;
1316
1317 return (rsize);
1318}
1319
4e21fd06
DB
1320static range_seg_t *
1321metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
93cf2076 1322{
4e21fd06
DB
1323 range_seg_t *rs, rsearch;
1324 avl_index_t where;
93cf2076 1325
4e21fd06
DB
1326 rsearch.rs_start = start;
1327 rsearch.rs_end = start + size;
93cf2076 1328
4e21fd06
DB
1329 rs = avl_find(t, &rsearch, &where);
1330 if (rs == NULL) {
1331 rs = avl_nearest(t, where, AVL_AFTER);
93cf2076 1332 }
93cf2076 1333
4e21fd06
DB
1334 return (rs);
1335}
93cf2076 1336
d3230d76 1337#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
93cf2076
GW
1338 defined(WITH_CF_BLOCK_ALLOCATOR)
1339/*
1340 * This is a helper function that can be used by the allocator to find
1341 * a suitable block to allocate. This will search the specified AVL
1342 * tree looking for a block that matches the specified criteria.
1343 */
1344static uint64_t
1345metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
d3230d76 1346 uint64_t max_search)
93cf2076 1347{
4e21fd06 1348 range_seg_t *rs = metaslab_block_find(t, *cursor, size);
d3230d76 1349 uint64_t first_found;
93cf2076 1350
d3230d76
MA
1351 if (rs != NULL)
1352 first_found = rs->rs_start;
93cf2076 1353
d3230d76
MA
1354 while (rs != NULL && rs->rs_start - first_found <= max_search) {
1355 uint64_t offset = rs->rs_start;
93cf2076
GW
1356 if (offset + size <= rs->rs_end) {
1357 *cursor = offset + size;
1358 return (offset);
1359 }
1360 rs = AVL_NEXT(t, rs);
1361 }
1362
93cf2076 1363 *cursor = 0;
d3230d76 1364 return (-1ULL);
9babb374 1365}
d3230d76 1366#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
22c81dd8
BB
1367
1368#if defined(WITH_DF_BLOCK_ALLOCATOR)
428870ff
BB
1369/*
1370 * ==========================================================================
d3230d76
MA
1371 * Dynamic Fit (df) block allocator
1372 *
1373 * Search for a free chunk of at least this size, starting from the last
1374 * offset (for this alignment of block) looking for up to
1375 * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not
1376 * found within 16MB, then return a free chunk of exactly the requested size (or
1377 * larger).
1378 *
1379 * If it seems like searching from the last offset will be unproductive, skip
1380 * that and just return a free chunk of exactly the requested size (or larger).
1381 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This
1382 * mechanism is probably not very useful and may be removed in the future.
1383 *
1384 * The behavior when not searching can be changed to return the largest free
1385 * chunk, instead of a free chunk of exactly the requested size, by setting
1386 * metaslab_df_use_largest_segment.
428870ff
BB
1387 * ==========================================================================
1388 */
9babb374 1389static uint64_t
93cf2076 1390metaslab_df_alloc(metaslab_t *msp, uint64_t size)
9babb374 1391{
93cf2076
GW
1392 /*
1393 * Find the largest power of 2 block size that evenly divides the
1394 * requested size. This is used to try to allocate blocks with similar
1395 * alignment from the same area of the metaslab (i.e. same cursor
1396 * bucket), but it does not guarantee that allocations of other sizes
1397 * will not exist in the same region.
1398 */
9babb374 1399 uint64_t align = size & -size;
9bd274dd 1400 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
d2734cce 1401 range_tree_t *rt = msp->ms_allocatable;
93cf2076 1402 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
d3230d76 1403 uint64_t offset;
9babb374 1404
93cf2076 1405 ASSERT(MUTEX_HELD(&msp->ms_lock));
d3230d76 1406 ASSERT3U(avl_numnodes(&rt->rt_root), ==,
d2734cce 1407 avl_numnodes(&msp->ms_allocatable_by_size));
9babb374 1408
9babb374 1409 /*
d3230d76
MA
1410 * If we're running low on space, find a segment based on size,
1411 * rather than iterating based on offset.
9babb374 1412 */
c81f1790 1413 if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
9babb374 1414 free_pct < metaslab_df_free_pct) {
d3230d76
MA
1415 offset = -1;
1416 } else {
1417 offset = metaslab_block_picker(&rt->rt_root,
1418 cursor, size, metaslab_df_max_search);
9babb374
BB
1419 }
1420
d3230d76
MA
1421 if (offset == -1) {
1422 range_seg_t *rs;
1423 if (metaslab_df_use_largest_segment) {
1424 /* use largest free segment */
1425 rs = avl_last(&msp->ms_allocatable_by_size);
1426 } else {
1427 /* use segment of this size, or next largest */
1428 rs = metaslab_block_find(&msp->ms_allocatable_by_size,
1429 0, size);
1430 }
1431 if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
1432 offset = rs->rs_start;
1433 *cursor = offset + size;
1434 }
1435 }
1436
1437 return (offset);
9babb374
BB
1438}
1439
93cf2076 1440static metaslab_ops_t metaslab_df_ops = {
f3a7f661 1441 metaslab_df_alloc
34dc7c2f
BB
1442};
1443
93cf2076 1444metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
22c81dd8
BB
1445#endif /* WITH_DF_BLOCK_ALLOCATOR */
1446
93cf2076 1447#if defined(WITH_CF_BLOCK_ALLOCATOR)
428870ff
BB
1448/*
1449 * ==========================================================================
93cf2076
GW
1450 * Cursor fit block allocator -
1451 * Select the largest region in the metaslab, set the cursor to the beginning
1452 * of the range and the cursor_end to the end of the range. As allocations
1453 * are made advance the cursor. Continue allocating from the cursor until
1454 * the range is exhausted and then find a new range.
428870ff
BB
1455 * ==========================================================================
1456 */
1457static uint64_t
93cf2076 1458metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
428870ff 1459{
d2734cce
SD
1460 range_tree_t *rt = msp->ms_allocatable;
1461 avl_tree_t *t = &msp->ms_allocatable_by_size;
93cf2076
GW
1462 uint64_t *cursor = &msp->ms_lbas[0];
1463 uint64_t *cursor_end = &msp->ms_lbas[1];
428870ff
BB
1464 uint64_t offset = 0;
1465
93cf2076
GW
1466 ASSERT(MUTEX_HELD(&msp->ms_lock));
1467 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
428870ff 1468
93cf2076 1469 ASSERT3U(*cursor_end, >=, *cursor);
428870ff 1470
93cf2076
GW
1471 if ((*cursor + size) > *cursor_end) {
1472 range_seg_t *rs;
428870ff 1473
d2734cce 1474 rs = avl_last(&msp->ms_allocatable_by_size);
93cf2076
GW
1475 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1476 return (-1ULL);
428870ff 1477
93cf2076
GW
1478 *cursor = rs->rs_start;
1479 *cursor_end = rs->rs_end;
428870ff 1480 }
93cf2076
GW
1481
1482 offset = *cursor;
1483 *cursor += size;
1484
428870ff
BB
1485 return (offset);
1486}
1487
93cf2076 1488static metaslab_ops_t metaslab_cf_ops = {
f3a7f661 1489 metaslab_cf_alloc
428870ff
BB
1490};
1491
93cf2076
GW
1492metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
1493#endif /* WITH_CF_BLOCK_ALLOCATOR */
22c81dd8
BB
1494
1495#if defined(WITH_NDF_BLOCK_ALLOCATOR)
93cf2076
GW
1496/*
1497 * ==========================================================================
1498 * New dynamic fit allocator -
1499 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1500 * contiguous blocks. If no region is found then just use the largest segment
1501 * that remains.
1502 * ==========================================================================
1503 */
1504
1505/*
1506 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1507 * to request from the allocator.
1508 */
428870ff
BB
1509uint64_t metaslab_ndf_clump_shift = 4;
1510
1511static uint64_t
93cf2076 1512metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
428870ff 1513{
d2734cce 1514 avl_tree_t *t = &msp->ms_allocatable->rt_root;
428870ff 1515 avl_index_t where;
93cf2076 1516 range_seg_t *rs, rsearch;
9bd274dd 1517 uint64_t hbit = highbit64(size);
93cf2076 1518 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
c81f1790 1519 uint64_t max_size = metaslab_largest_allocatable(msp);
428870ff 1520
93cf2076 1521 ASSERT(MUTEX_HELD(&msp->ms_lock));
d2734cce
SD
1522 ASSERT3U(avl_numnodes(t), ==,
1523 avl_numnodes(&msp->ms_allocatable_by_size));
428870ff
BB
1524
1525 if (max_size < size)
1526 return (-1ULL);
1527
93cf2076
GW
1528 rsearch.rs_start = *cursor;
1529 rsearch.rs_end = *cursor + size;
428870ff 1530
93cf2076
GW
1531 rs = avl_find(t, &rsearch, &where);
1532 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
d2734cce 1533 t = &msp->ms_allocatable_by_size;
428870ff 1534
93cf2076
GW
1535 rsearch.rs_start = 0;
1536 rsearch.rs_end = MIN(max_size,
428870ff 1537 1ULL << (hbit + metaslab_ndf_clump_shift));
93cf2076
GW
1538 rs = avl_find(t, &rsearch, &where);
1539 if (rs == NULL)
1540 rs = avl_nearest(t, where, AVL_AFTER);
1541 ASSERT(rs != NULL);
428870ff
BB
1542 }
1543
93cf2076
GW
1544 if ((rs->rs_end - rs->rs_start) >= size) {
1545 *cursor = rs->rs_start + size;
1546 return (rs->rs_start);
428870ff
BB
1547 }
1548 return (-1ULL);
1549}
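/*
 * Illustrative sketch, not part of this file: the size of the region the
 * ndf allocator asks for in its fallback search above, assuming the default
 * metaslab_ndf_clump_shift of 4.  For a 128K request, highbit64() yields 18,
 * so the fallback looks for a region of up to 1ULL << 22 (4M) before
 * settling for the next segment in the by-size tree.
 */
#include <stdint.h>

static uint64_t
ndf_fallback_clump_bytes(uint64_t size, uint64_t clump_shift)
{
	/* highbit64(size): index of the highest set bit, counting from 1 */
	uint64_t hbit = 64 - __builtin_clzll(size | 1);
	return (1ULL << (hbit + clump_shift));
}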
1550
93cf2076 1551static metaslab_ops_t metaslab_ndf_ops = {
f3a7f661 1552 metaslab_ndf_alloc
428870ff
BB
1553};
1554
93cf2076 1555metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
22c81dd8 1556#endif /* WITH_NDF_BLOCK_ALLOCATOR */
9babb374 1557
93cf2076 1558
34dc7c2f
BB
1559/*
1560 * ==========================================================================
1561 * Metaslabs
1562 * ==========================================================================
1563 */
93cf2076 1564
93e28d66
SD
1565/*
1566 * Wait for any in-progress metaslab loads to complete.
1567 */
1568void
1569metaslab_load_wait(metaslab_t *msp)
1570{
1571 ASSERT(MUTEX_HELD(&msp->ms_lock));
1572
1573 while (msp->ms_loading) {
1574 ASSERT(!msp->ms_loaded);
1575 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1576 }
1577}
1578
1579/*
1580 * Wait for any in-progress flushing to complete.
1581 */
1582void
1583metaslab_flush_wait(metaslab_t *msp)
1584{
1585 ASSERT(MUTEX_HELD(&msp->ms_lock));
1586
1587 while (msp->ms_flushing)
1588 cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1589}
1590
f09fda50
PD
1591static unsigned int
1592metaslab_idx_func(multilist_t *ml, void *arg)
1593{
1594 metaslab_t *msp = arg;
1595 return (msp->ms_id % multilist_get_num_sublists(ml));
1596}
1597
93e28d66
SD
1598uint64_t
1599metaslab_allocated_space(metaslab_t *msp)
1600{
1601 return (msp->ms_allocated_space);
1602}
1603
1604/*
1605 * Verify that the space accounting on disk matches the in-core range_trees.
1606 */
1607static void
1608metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1609{
1610 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1611 uint64_t allocating = 0;
1612 uint64_t sm_free_space, msp_free_space;
1613
1614 ASSERT(MUTEX_HELD(&msp->ms_lock));
1615 ASSERT(!msp->ms_condensing);
1616
1617 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1618 return;
1619
1620 /*
1621 * We can only verify the metaslab space when we're called
1622 * from syncing context with a loaded metaslab that has an
1623 * allocated space map. Calling this in non-syncing context
1624 * does not provide a consistent view of the metaslab since
1625 * we're performing allocations in the future.
1626 */
1627 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1628 !msp->ms_loaded)
1629 return;
1630
1631 /*
1632 * Even though the smp_alloc field can get negative,
1633 * when it comes to a metaslab's space map, that should
1634 * never be the case.
1635 */
1636 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1637
1638 ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1639 range_tree_space(msp->ms_unflushed_frees));
1640
1641 ASSERT3U(metaslab_allocated_space(msp), ==,
1642 space_map_allocated(msp->ms_sm) +
1643 range_tree_space(msp->ms_unflushed_allocs) -
1644 range_tree_space(msp->ms_unflushed_frees));
1645
1646 sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1647
1648 /*
1649 * Account for future allocations since we would have
1650 * already deducted that space from the ms_allocatable.
1651 */
1652 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1653 allocating +=
1654 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1655 }
f09fda50
PD
1656 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1657 msp->ms_allocating_total);
93e28d66
SD
1658
1659 ASSERT3U(msp->ms_deferspace, ==,
1660 range_tree_space(msp->ms_defer[0]) +
1661 range_tree_space(msp->ms_defer[1]));
1662
1663 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
1664 msp->ms_deferspace + range_tree_space(msp->ms_freed);
1665
1666 VERIFY3U(sm_free_space, ==, msp_free_space);
1667}
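/*
 * Illustrative sketch, not part of this file: the two ways of counting free
 * space that metaslab_verify_space() compares, written out with hypothetical
 * numbers (all values in bytes).
 */
#include <assert.h>
#include <stdint.h>

static void
verify_space_example(void)
{
	uint64_t ms_size = 1024ULL << 20;	/* 1G metaslab */
	uint64_t sm_alloc = 300ULL << 20;	/* per the space map */
	uint64_t unflushed_allocs = 20ULL << 20;
	uint64_t unflushed_frees = 10ULL << 20;

	/* ms_allocated_space mirrors sm_alloc plus the unflushed deltas. */
	uint64_t allocated = sm_alloc + unflushed_allocs - unflushed_frees;
	uint64_t sm_free_space = ms_size - allocated;	/* 714M */

	/* The same free space, counted from the in-core range trees. */
	uint64_t allocatable = 600ULL << 20;
	uint64_t allocating = 50ULL << 20;	/* future-TXG allocations */
	uint64_t deferspace = 40ULL << 20;
	uint64_t freed = 24ULL << 20;
	uint64_t msp_free_space = allocatable + allocating +
	    deferspace + freed;

	assert(sm_free_space == msp_free_space);	/* 714M == 714M */
}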
1668
928e8ad4
SD
1669static void
1670metaslab_aux_histograms_clear(metaslab_t *msp)
1671{
1672 /*
1673 * Auxiliary histograms are only cleared when resetting them,
1674 * which can only happen while the metaslab is loaded.
1675 */
1676 ASSERT(msp->ms_loaded);
1677
1678 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1679 for (int t = 0; t < TXG_DEFER_SIZE; t++)
1680 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1681}
1682
1683static void
1684metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1685 range_tree_t *rt)
1686{
1687 /*
1688 * This is modeled after space_map_histogram_add(), so refer to that
1689 * function for implementation details. We want this to work like
1690 * the space map histogram, and not the range tree histogram, as we
1691 * are essentially constructing a delta that will be later subtracted
1692 * from the space map histogram.
1693 */
1694 int idx = 0;
1695 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1696 ASSERT3U(i, >=, idx + shift);
1697 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1698
1699 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1700 ASSERT3U(idx + shift, ==, i);
1701 idx++;
1702 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1703 }
1704 }
1705}
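/*
 * Illustrative sketch, not part of this file: the bucket-folding rule used
 * by metaslab_aux_histogram_add() above, with hypothetical (much smaller)
 * histogram sizes so the collapse into the last space-map bucket is easy to
 * follow.  Buckets that fit the space-map histogram map one-to-one; larger
 * range-tree buckets are folded into the last bucket, scaled by
 * 2^(i - idx - shift) so the amount of space they represent is preserved.
 */
#include <stdint.h>

#define	EX_RT_HISTOGRAM_SIZE	8	/* stand-in for RANGE_TREE_HISTOGRAM_SIZE */
#define	EX_SM_HISTOGRAM_SIZE	4	/* stand-in for SPACE_MAP_HISTOGRAM_SIZE */

static void
ex_aux_histogram_add(uint64_t *histogram, int shift,
    const uint64_t *rt_histogram)
{
	int idx = 0;

	for (int i = shift; i < EX_RT_HISTOGRAM_SIZE; i++) {
		histogram[idx] += rt_histogram[i] << (i - idx - shift);
		if (idx < EX_SM_HISTOGRAM_SIZE - 1)
			idx++;
	}
}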
1706
1707/*
1708 * Called at every sync pass that the metaslab gets synced.
1709 *
1710 * The reason is that we want our auxiliary histograms to be updated
1711 * wherever the metaslab's space map histogram is updated. This way
1712 * we stay consistent on which parts of the metaslab space map's
1713 * histogram are currently not available for allocations (e.g. because
1714 * they are in the defer, freed, and freeing trees).
1715 */
1716static void
1717metaslab_aux_histograms_update(metaslab_t *msp)
1718{
1719 space_map_t *sm = msp->ms_sm;
1720 ASSERT(sm != NULL);
1721
1722 /*
1723 * This is similar to the metaslab's space map histogram updates
1724 * that take place in metaslab_sync(). The only difference is that
1725 * we only care about segments that haven't made it into the
1726 * ms_allocatable tree yet.
1727 */
1728 if (msp->ms_loaded) {
1729 metaslab_aux_histograms_clear(msp);
1730
1731 metaslab_aux_histogram_add(msp->ms_synchist,
1732 sm->sm_shift, msp->ms_freed);
1733
1734 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1735 metaslab_aux_histogram_add(msp->ms_deferhist[t],
1736 sm->sm_shift, msp->ms_defer[t]);
1737 }
1738 }
1739
1740 metaslab_aux_histogram_add(msp->ms_synchist,
1741 sm->sm_shift, msp->ms_freeing);
1742}
1743
1744/*
1745 * Called every time we are done syncing (writing to) the metaslab,
1746 * i.e. at the end of each sync pass.
1747 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1748 */
1749static void
1750metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1751{
1752 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1753 space_map_t *sm = msp->ms_sm;
1754
1755 if (sm == NULL) {
1756 /*
1757 * We came here from metaslab_init() when creating/opening a
1758 * pool, looking at a metaslab that hasn't had any allocations
1759 * yet.
1760 */
1761 return;
1762 }
1763
1764 /*
1765 * This is similar to the actions that we take for the ms_freed
1766 * and ms_defer trees in metaslab_sync_done().
1767 */
1768 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1769 if (defer_allowed) {
1770 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1771 sizeof (msp->ms_synchist));
1772 } else {
1773 bzero(msp->ms_deferhist[hist_index],
1774 sizeof (msp->ms_deferhist[hist_index]));
1775 }
1776 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1777}
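/*
 * Illustrative sketch, not part of this file: the rotation performed by
 * metaslab_aux_histograms_update_done() above, assuming a TXG_DEFER_SIZE of
 * 2 and a hypothetical 8-bucket histogram.  TXG N reuses the slot that TXG
 * N-2 used, so a sync histogram stays in the defer array for exactly as
 * long as the frees it describes remain deferred.
 */
#include <stdint.h>
#include <string.h>

#define	EX_DEFER_SIZE	2
#define	EX_HIST_SIZE	8

static void
ex_histograms_update_done(uint64_t synchist[EX_HIST_SIZE],
    uint64_t deferhist[EX_DEFER_SIZE][EX_HIST_SIZE],
    uint64_t syncing_txg, int defer_allowed)
{
	uint64_t idx = syncing_txg % EX_DEFER_SIZE;

	if (defer_allowed)
		memcpy(deferhist[idx], synchist, sizeof (deferhist[idx]));
	else
		memset(deferhist[idx], 0, sizeof (deferhist[idx]));
	memset(synchist, 0, EX_HIST_SIZE * sizeof (uint64_t));
}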
1778
1779/*
1780 * Ensure that the metaslab's weight and fragmentation are consistent
1781 * with the contents of the histogram (either the range tree's histogram
1782 * or the space map's depending whether the metaslab is loaded).
1783 */
1784static void
1785metaslab_verify_weight_and_frag(metaslab_t *msp)
1786{
1787 ASSERT(MUTEX_HELD(&msp->ms_lock));
1788
1789 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1790 return;
1791
2fcf4481
SD
1792 /*
1793 * We can end up here from vdev_remove_complete(), in which case we
1794 * cannot do these assertions because we hold spa config locks and
1795 * thus we are not allowed to read from the DMU.
1796 *
1797 * We check if the metaslab group has been removed and if that's
1798 * the case we return immediately as that would mean that we are
1799 * here from the aforementioned code path.
1800 */
928e8ad4
SD
1801 if (msp->ms_group == NULL)
1802 return;
1803
1804 /*
1805 * Devices being removed always return a weight of 0 and leave
1806 * fragmentation and ms_max_size as is - there is nothing for
1807 * us to verify here.
1808 */
1809 vdev_t *vd = msp->ms_group->mg_vd;
1810 if (vd->vdev_removing)
1811 return;
1812
1813 /*
1814 * If the metaslab is dirty it probably means that we've done
1815 * some allocations or frees that have changed our histograms
1816 * and thus the weight.
1817 */
1818 for (int t = 0; t < TXG_SIZE; t++) {
1819 if (txg_list_member(&vd->vdev_ms_list, msp, t))
1820 return;
1821 }
1822
1823 /*
1824 * This verification checks that our in-memory state is consistent
1825 * with what's on disk. If the pool is read-only then there aren't
1826 * any changes and we just have the initially-loaded state.
1827 */
1828 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
1829 return;
1830
1831 /* some extra verification for the in-core tree when it is loaded */
1832 if (msp->ms_loaded) {
1833 range_tree_stat_verify(msp->ms_allocatable);
1834 VERIFY(space_map_histogram_verify(msp->ms_sm,
1835 msp->ms_allocatable));
1836 }
1837
1838 uint64_t weight = msp->ms_weight;
1839 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1840 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
1841 uint64_t frag = msp->ms_fragmentation;
1842 uint64_t max_segsize = msp->ms_max_size;
1843
1844 msp->ms_weight = 0;
1845 msp->ms_fragmentation = 0;
928e8ad4
SD
1846
1847 /*
65a91b16
SD
1848 * This function is used for verification purposes and thus should
1849 * not introduce any side-effects/mutations on the system's state.
1850 *
1851 * Regardless of whether metaslab_weight() thinks this metaslab
1852 * should be active or not, we want to ensure that the actual weight
1853 * (and therefore the value of ms_weight) would be the same if it
1854 * was to be recalculated at this point.
1855 *
1856 * In addition we set the nodirty flag so metaslab_weight() does
1857 * not dirty the metaslab for future TXGs (e.g. when trying to
1858 * force condensing to upgrade the metaslab spacemaps).
928e8ad4 1859 */
65a91b16 1860 msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
928e8ad4
SD
1861
1862 VERIFY3U(max_segsize, ==, msp->ms_max_size);
1863
1864 /*
1865 * If the weight type changed then there is no point in doing
1866 * verification. Revert fields to their original values.
1867 */
1868 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
1869 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
1870 msp->ms_fragmentation = frag;
1871 msp->ms_weight = weight;
1872 return;
1873 }
1874
1875 VERIFY3U(msp->ms_fragmentation, ==, frag);
1876 VERIFY3U(msp->ms_weight, ==, weight);
1877}
1878
f09fda50
PD
1879/*
1880 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
1881 * this class that was used longest ago, and attempt to unload it. We don't
1882 * want to spend too much time in this loop to prevent performance
e1cfd73f 1883 * degradation, and we expect that most of the time this operation will
f09fda50
PD
1884 * succeed. Between that and the normal unloading processing during txg sync,
1885 * we expect this to keep the metaslab memory usage under control.
1886 */
1887static void
1888metaslab_potentially_evict(metaslab_class_t *mc)
1889{
1890#ifdef _KERNEL
1891 uint64_t allmem = arc_all_memory();
1892 extern kmem_cache_t *range_seg_cache;
1893 uint64_t inuse = range_seg_cache->skc_obj_total;
1894 uint64_t size = range_seg_cache->skc_obj_size;
1895 int tries = 0;
1896 for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
1897 tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
1898 tries++) {
1899 unsigned int idx = multilist_get_random_index(
1900 mc->mc_metaslab_txg_list);
1901 multilist_sublist_t *mls =
1902 multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
1903 metaslab_t *msp = multilist_sublist_head(mls);
1904 multilist_sublist_unlock(mls);
1905 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
1906 inuse * size) {
1907 VERIFY3P(mls, ==, multilist_sublist_lock(
1908 mc->mc_metaslab_txg_list, idx));
1909 ASSERT3U(idx, ==,
1910 metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
1911
1912 if (!multilist_link_active(&msp->ms_class_txg_node)) {
1913 multilist_sublist_unlock(mls);
1914 break;
1915 }
1916 metaslab_t *next_msp = multilist_sublist_next(mls, msp);
1917 multilist_sublist_unlock(mls);
1918 /*
1919 * If the metaslab is currently loading there are two
1920 * cases. If it's the metaslab we're evicting, we
1921 * can't continue on or we'll panic when we attempt to
1922 * recursively lock the mutex. If it's another
1923 * metaslab that's loading, it can be safely skipped,
1924 * since we know it's very new and therefore not a
1925 * good eviction candidate. We check later once the
1926 * lock is held that the metaslab is fully loaded
1927 * before actually unloading it.
1928 */
1929 if (msp->ms_loading) {
1930 msp = next_msp;
1931 inuse = range_seg_cache->skc_obj_total;
1932 continue;
1933 }
1934 /*
1935 * We can't unload metaslabs with no spacemap because
1936 * they're not ready to be unloaded yet. We can't
1937 * unload metaslabs with outstanding allocations
1938 * because doing so could cause the metaslab's weight
1939 * to decrease while it's unloaded, which violates an
1940 * invariant that we use to prevent unnecessary
1941 * loading. We also don't unload metaslabs that are
1942 * currently active because they are high-weight
1943 * metaslabs that are likely to be used in the near
1944 * future.
1945 */
1946 mutex_enter(&msp->ms_lock);
1947 if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
1948 msp->ms_allocating_total == 0) {
1949 metaslab_unload(msp);
1950 }
1951 mutex_exit(&msp->ms_lock);
1952 msp = next_msp;
1953 inuse = range_seg_cache->skc_obj_total;
1954 }
1955 }
1956#endif
1957}
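/*
 * Illustrative sketch, not part of this file: the memory-pressure predicate
 * that drives the eviction loop above, lifted into a standalone helper.  The
 * limit is a percentage (zfs_metaslab_mem_limit) of all memory; the usage is
 * the range_seg_cache object count times the per-object size.  For example,
 * with 16G of RAM and a 25% limit, eviction starts once loaded range segs
 * exceed 4G.
 */
#include <stdint.h>
#include <stdbool.h>

static bool
ex_over_metaslab_mem_limit(uint64_t allmem, uint64_t mem_limit_pct,
    uint64_t rangeseg_objs, uint64_t rangeseg_objsize)
{
	return (allmem * mem_limit_pct / 100 <
	    rangeseg_objs * rangeseg_objsize);
}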
1958
b194fab0
SD
1959static int
1960metaslab_load_impl(metaslab_t *msp)
93cf2076
GW
1961{
1962 int error = 0;
93cf2076
GW
1963
1964 ASSERT(MUTEX_HELD(&msp->ms_lock));
b194fab0 1965 ASSERT(msp->ms_loading);
425d3237 1966 ASSERT(!msp->ms_condensing);
93cf2076 1967
a1d477c2 1968 /*
425d3237
SD
1969 * We temporarily drop the lock to unblock other operations while we
1970 * are reading the space map. Therefore, metaslab_sync() and
1971 * metaslab_sync_done() can run at the same time as we do.
1972 *
93e28d66
SD
1973 * If we are using the log space maps, metaslab_sync() can't write to
1974 * the metaslab's space map while we are loading as we only write to
1975 * it when we are flushing the metaslab, and that can't happen while
1976 * we are loading it.
1977 *
1978 * If we are not using log space maps though, metaslab_sync() can
1979 * append to the space map while we are loading. Therefore we load
1980 * only entries that existed when we started the load. Additionally,
1981 * metaslab_sync_done() has to wait for the load to complete because
1982 * there are potential races like metaslab_load() loading parts of the
1983 * space map that are currently being appended by metaslab_sync(). If
1984 * we didn't, the ms_allocatable would have entries that
1985 * metaslab_sync_done() would try to re-add later.
425d3237
SD
1986 *
1987 * That's why before dropping the lock we remember the synced length
1988 * of the metaslab and read up to that point of the space map,
1989 * ignoring entries appended by metaslab_sync() that happen after we
1990 * drop the lock.
a1d477c2 1991 */
425d3237 1992 uint64_t length = msp->ms_synced_length;
a1d477c2 1993 mutex_exit(&msp->ms_lock);
93cf2076 1994
93e28d66 1995 hrtime_t load_start = gethrtime();
d2734cce 1996 if (msp->ms_sm != NULL) {
425d3237
SD
1997 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
1998 SM_FREE, length);
d2734cce 1999 } else {
425d3237
SD
2000 /*
2001 * The space map has not been allocated yet, so treat
2002 * all the space in the metaslab as free and add it to the
2003 * ms_allocatable tree.
2004 */
d2734cce
SD
2005 range_tree_add(msp->ms_allocatable,
2006 msp->ms_start, msp->ms_size);
93e28d66
SD
2007
2008 if (msp->ms_freed != NULL) {
2009 /*
2010 * If the ms_sm doesn't exist, this means that this
2011 * metaslab hasn't gone through metaslab_sync() and
2012 * thus has never been dirtied. So we shouldn't
2013 * expect any unflushed allocs or frees from previous
2014 * TXGs.
2015 *
2016 * Note: ms_freed and all the other trees except for
2017 * the ms_allocatable, can be NULL at this point only
2018 * if this is a new metaslab of a vdev that just got
2019 * expanded.
2020 */
2021 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2022 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2023 }
d2734cce 2024 }
93cf2076 2025
425d3237
SD
2026 /*
2027 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
93e28d66
SD
2028 * changing the ms_sm (or log_sm) and the metaslab's range trees
2029 * while we are about to use them and populate the ms_allocatable.
2030 * The ms_lock is insufficient for this because metaslab_sync() doesn't
2031 * hold the ms_lock while writing the ms_checkpointing tree to disk.
425d3237
SD
2032 */
2033 mutex_enter(&msp->ms_sync_lock);
a1d477c2 2034 mutex_enter(&msp->ms_lock);
93e28d66 2035
425d3237 2036 ASSERT(!msp->ms_condensing);
93e28d66 2037 ASSERT(!msp->ms_flushing);
93cf2076 2038
8eef9976
SD
2039 if (error != 0) {
2040 mutex_exit(&msp->ms_sync_lock);
b194fab0 2041 return (error);
8eef9976 2042 }
4e21fd06 2043
b194fab0
SD
2044 ASSERT3P(msp->ms_group, !=, NULL);
2045 msp->ms_loaded = B_TRUE;
2046
2047 /*
93e28d66
SD
2048 * Apply all the unflushed changes to ms_allocatable right
2049 * away so any manipulations we do below have a clear view
2050 * of what is allocated and what is free.
2051 */
2052 range_tree_walk(msp->ms_unflushed_allocs,
2053 range_tree_remove, msp->ms_allocatable);
2054 range_tree_walk(msp->ms_unflushed_frees,
2055 range_tree_add, msp->ms_allocatable);
2056
2057 msp->ms_loaded = B_TRUE;
2058
2059 ASSERT3P(msp->ms_group, !=, NULL);
2060 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2061 if (spa_syncing_log_sm(spa) != NULL) {
2062 ASSERT(spa_feature_is_enabled(spa,
2063 SPA_FEATURE_LOG_SPACEMAP));
2064
2065 /*
2066 * If we use a log space map we add all the segments
2067 * that are in ms_unflushed_frees so they are available
2068 * for allocation.
2069 *
2070 * ms_allocatable needs to contain all free segments
2071 * that are ready for allocations (thus not segments
2072 * from ms_freeing, ms_freed, and the ms_defer trees).
2073 * But if we grab the lock in this code path at a sync
2074 * pass later than 1, then it also contains the
2075 * segments of ms_freed (they were added to it earlier
2076 * in this path through ms_unflushed_frees). So we
2077 * need to remove all the segments that exist in
2078 * ms_freed from ms_allocatable as they will be added
2079 * later in metaslab_sync_done().
2080 *
2081 * When there's no log space map, the ms_allocatable
2082 * correctly doesn't contain any segments that exist
2083 * in ms_freed [see ms_synced_length].
2084 */
2085 range_tree_walk(msp->ms_freed,
2086 range_tree_remove, msp->ms_allocatable);
2087 }
2088
2089 /*
2090 * If we are not using the log space map, ms_allocatable
2091 * contains the segments that exist in the ms_defer trees
2092 * [see ms_synced_length]. Thus we need to remove them
2093 * from ms_allocatable as they will be added again in
425d3237 2094 * metaslab_sync_done().
93e28d66
SD
2095 *
2096 * If we are using the log space map, ms_allocatable still
2097 * contains the segments that exist in the ms_defer trees.
2098 * Not because it read them through the ms_sm though. But
2099 * because these segments are part of ms_unflushed_frees
2100 * whose segments we add to ms_allocatable earlier in this
2101 * code path.
b194fab0 2102 */
425d3237
SD
2103 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2104 range_tree_walk(msp->ms_defer[t],
2105 range_tree_remove, msp->ms_allocatable);
93cf2076 2106 }
425d3237 2107
928e8ad4
SD
2108 /*
2109 * Call metaslab_recalculate_weight_and_sort() now that the
2110 * metaslab is loaded so we get the metaslab's real weight.
2111 *
2112 * Unless this metaslab was created with older software and
2113 * has not yet been converted to use segment-based weight, we
2114 * expect the new weight to be better or equal to the weight
2115 * that the metaslab had while it was not loaded. This is
2116 * because the old weight does not take into account the
2117 * consolidation of adjacent segments between TXGs. [see
2118 * comment for ms_synchist and ms_deferhist[] for more info]
2119 */
2120 uint64_t weight = msp->ms_weight;
c81f1790 2121 uint64_t max_size = msp->ms_max_size;
928e8ad4
SD
2122 metaslab_recalculate_weight_and_sort(msp);
2123 if (!WEIGHT_IS_SPACEBASED(weight))
2124 ASSERT3U(weight, <=, msp->ms_weight);
c81f1790
PD
2125 msp->ms_max_size = metaslab_largest_allocatable(msp);
2126 ASSERT3U(max_size, <=, msp->ms_max_size);
93e28d66 2127 hrtime_t load_end = gethrtime();
c81f1790 2128 msp->ms_load_time = load_end;
93e28d66
SD
2129 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
2130 zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
2131 "ms_id %llu, smp_length %llu, "
2132 "unflushed_allocs %llu, unflushed_frees %llu, "
2133 "freed %llu, defer %llu + %llu, "
c81f1790
PD
2134 "loading_time %lld ms, ms_max_size %llu, "
2135 "max size error %llu",
93e28d66
SD
2136 spa_syncing_txg(spa), spa_name(spa),
2137 msp->ms_group->mg_vd->vdev_id, msp->ms_id,
2138 space_map_length(msp->ms_sm),
2139 range_tree_space(msp->ms_unflushed_allocs),
2140 range_tree_space(msp->ms_unflushed_frees),
2141 range_tree_space(msp->ms_freed),
2142 range_tree_space(msp->ms_defer[0]),
2143 range_tree_space(msp->ms_defer[1]),
c81f1790
PD
2144 (longlong_t)((load_end - load_start) / 1000000),
2145 msp->ms_max_size, msp->ms_max_size - max_size);
93e28d66
SD
2146 }
2147
425d3237
SD
2148 metaslab_verify_space(msp, spa_syncing_txg(spa));
2149 mutex_exit(&msp->ms_sync_lock);
b194fab0
SD
2150 return (0);
2151}
2152
2153int
2154metaslab_load(metaslab_t *msp)
2155{
2156 ASSERT(MUTEX_HELD(&msp->ms_lock));
2157
2158 /*
2159 * There may be another thread loading the same metaslab, if that's
2160 * the case just wait until the other thread is done and return.
2161 */
2162 metaslab_load_wait(msp);
2163 if (msp->ms_loaded)
2164 return (0);
2165 VERIFY(!msp->ms_loading);
425d3237 2166 ASSERT(!msp->ms_condensing);
b194fab0 2167
93e28d66
SD
2168 /*
2169 * We set the loading flag BEFORE potentially dropping the lock to
2170 * wait for an ongoing flush (see ms_flushing below). This way other
2171 * threads know that there is already a thread that is loading this
2172 * metaslab.
2173 */
b194fab0 2174 msp->ms_loading = B_TRUE;
93e28d66
SD
2175
2176 /*
2177 * Wait for any in-progress flushing to finish as we drop the ms_lock
2178 * both here (during space_map_load()) and in metaslab_flush() (when
2179 * we flush our changes to the ms_sm).
2180 */
2181 if (msp->ms_flushing)
2182 metaslab_flush_wait(msp);
2183
2184 /*
2185 * In case we were waiting for the metaslab to be
2186 * flushed (where we temporarily dropped the ms_lock), ensure that
2187 * no one else loaded the metaslab somehow.
2188 */
2189 ASSERT(!msp->ms_loaded);
2190
f09fda50
PD
2191 /*
2192 * If we're loading a metaslab in the normal class, consider evicting
2193 * another one to keep our memory usage under the limit defined by the
2194 * zfs_metaslab_mem_limit tunable.
2195 */
2196 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2197 msp->ms_group->mg_class) {
2198 metaslab_potentially_evict(msp->ms_group->mg_class);
2199 }
2200
b194fab0 2201 int error = metaslab_load_impl(msp);
93e28d66
SD
2202
2203 ASSERT(MUTEX_HELD(&msp->ms_lock));
b194fab0 2204 msp->ms_loading = B_FALSE;
93cf2076 2205 cv_broadcast(&msp->ms_load_cv);
b194fab0 2206
93cf2076
GW
2207 return (error);
2208}
2209
2210void
2211metaslab_unload(metaslab_t *msp)
2212{
2213 ASSERT(MUTEX_HELD(&msp->ms_lock));
928e8ad4 2214
f09fda50
PD
2215 /*
2216 * This can happen if a metaslab is selected for eviction (in
2217 * metaslab_potentially_evict) and then unloaded during spa_sync (via
2218 * metaslab_class_evict_old).
2219 */
2220 if (!msp->ms_loaded)
2221 return;
928e8ad4 2222
d2734cce 2223 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
93cf2076 2224 msp->ms_loaded = B_FALSE;
c81f1790 2225 msp->ms_unload_time = gethrtime();
928e8ad4 2226
679b0f2a 2227 msp->ms_activation_weight = 0;
93cf2076 2228 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
928e8ad4 2229
f09fda50
PD
2230 if (msp->ms_group != NULL) {
2231 metaslab_class_t *mc = msp->ms_group->mg_class;
2232 multilist_sublist_t *mls =
2233 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2234 if (multilist_link_active(&msp->ms_class_txg_node))
2235 multilist_sublist_remove(mls, msp);
2236 multilist_sublist_unlock(mls);
2237 }
2238
928e8ad4
SD
2239 /*
2240 * We explicitly recalculate the metaslab's weight based on its space
2241 * map (as it is now not loaded). We want unloaded metaslabs to always
2242 * have their weights calculated from the space map histograms, while
2243 * loaded ones have it calculated from their in-core range tree
2244 * [see metaslab_load()]. This way, the weight reflects the information
93e28d66 2245 * available in-core, whether it is loaded or not.
928e8ad4
SD
2246 *
2247 * If ms_group == NULL, it means that we came here from metaslab_fini(),
2248 * at which point it doesn't make sense for us to do the recalculation
2249 * and the sorting.
2250 */
2251 if (msp->ms_group != NULL)
2252 metaslab_recalculate_weight_and_sort(msp);
93cf2076
GW
2253}
2254
f09fda50
PD
2255void
2256metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2257{
2258 ASSERT(MUTEX_HELD(&msp->ms_lock));
2259 metaslab_class_t *mc = msp->ms_group->mg_class;
2260 multilist_sublist_t *mls =
2261 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2262 if (multilist_link_active(&msp->ms_class_txg_node))
2263 multilist_sublist_remove(mls, msp);
2264 msp->ms_selected_txg = txg;
eef0f4d8 2265 msp->ms_selected_time = gethrtime();
f09fda50
PD
2266 multilist_sublist_insert_tail(mls, msp);
2267 multilist_sublist_unlock(mls);
2268}
2269
93e28d66 2270void
cc99f275
DB
2271metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2272 int64_t defer_delta, int64_t space_delta)
2273{
2274 vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2275
2276 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2277 ASSERT(vd->vdev_ms_count != 0);
2278
2279 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2280 vdev_deflated_space(vd, space_delta));
2281}
2282
fb42a493 2283int
93e28d66
SD
2284metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2285 uint64_t txg, metaslab_t **msp)
34dc7c2f
BB
2286{
2287 vdev_t *vd = mg->mg_vd;
cc99f275
DB
2288 spa_t *spa = vd->vdev_spa;
2289 objset_t *mos = spa->spa_meta_objset;
fb42a493
PS
2290 metaslab_t *ms;
2291 int error;
34dc7c2f 2292
79c76d5b 2293 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
fb42a493 2294 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
a1d477c2 2295 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
fb42a493 2296 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
93e28d66 2297 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
f09fda50 2298 multilist_link_init(&ms->ms_class_txg_node);
619f0976 2299
fb42a493
PS
2300 ms->ms_id = id;
2301 ms->ms_start = id << vd->vdev_ms_shift;
2302 ms->ms_size = 1ULL << vd->vdev_ms_shift;
492f64e9
PD
2303 ms->ms_allocator = -1;
2304 ms->ms_new = B_TRUE;
34dc7c2f 2305
93cf2076
GW
2306 /*
2307 * We only open space map objects that already exist. All others
afe37326 2308 * will be opened when we finally allocate an object for it.
425d3237
SD
2309 *
2310 * Note:
2311 * When called from vdev_expand(), we can't call into the DMU as
2312 * we are holding the spa_config_lock as a writer and we would
2313 * deadlock [see relevant comment in vdev_metaslab_init()]. In
2314 * that case, the object parameter is zero though, so we won't
2315 * call into the DMU.
93cf2076 2316 */
afe37326 2317 if (object != 0) {
fb42a493 2318 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
a1d477c2 2319 ms->ms_size, vd->vdev_ashift);
fb42a493
PS
2320
2321 if (error != 0) {
2322 kmem_free(ms, sizeof (metaslab_t));
2323 return (error);
2324 }
2325
2326 ASSERT(ms->ms_sm != NULL);
425d3237 2327 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
93cf2076 2328 }
34dc7c2f
BB
2329
2330 /*
425d3237 2331 * We create the ms_allocatable here, but we don't create the
258553d3 2332 * other range trees until metaslab_sync_done(). This serves
34dc7c2f 2333 * two purposes: it allows metaslab_sync_done() to detect the
425d3237
SD
2334 * addition of new space; and for debugging, it ensures that
2335 * we'd data fault on any attempt to use this metaslab before
2336 * it's ready.
34dc7c2f 2337 */
d2734cce
SD
2338 ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
2339 &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
34dc7c2f 2340
1b939560
BB
2341 ms->ms_trim = range_tree_create(NULL, NULL);
2342
2343 metaslab_group_add(mg, ms);
65a91b16 2344 metaslab_set_fragmentation(ms, B_FALSE);
428870ff 2345
34dc7c2f
BB
2346 /*
2347 * If we're opening an existing pool (txg == 0) or creating
2348 * a new one (txg == TXG_INITIAL), all space is available now.
2349 * If we're adding space to an existing pool, the new space
2350 * does not become available until after this txg has synced.
4e21fd06
DB
2351 * The metaslab's weight will also be initialized when we sync
2352 * out this txg. This ensures that we don't attempt to allocate
2353 * from it before we have initialized it completely.
34dc7c2f 2354 */
425d3237 2355 if (txg <= TXG_INITIAL) {
fb42a493 2356 metaslab_sync_done(ms, 0);
425d3237
SD
2357 metaslab_space_update(vd, mg->mg_class,
2358 metaslab_allocated_space(ms), 0, 0);
2359 }
34dc7c2f
BB
2360
2361 if (txg != 0) {
34dc7c2f 2362 vdev_dirty(vd, 0, NULL, txg);
fb42a493 2363 vdev_dirty(vd, VDD_METASLAB, ms, txg);
34dc7c2f
BB
2364 }
2365
fb42a493
PS
2366 *msp = ms;
2367
2368 return (0);
34dc7c2f
BB
2369}
2370
93e28d66
SD
2371static void
2372metaslab_fini_flush_data(metaslab_t *msp)
2373{
2374 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2375
2376 if (metaslab_unflushed_txg(msp) == 0) {
2377 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2378 ==, NULL);
2379 return;
2380 }
2381 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2382
2383 mutex_enter(&spa->spa_flushed_ms_lock);
2384 avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2385 mutex_exit(&spa->spa_flushed_ms_lock);
2386
2387 spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2388 spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2389}
2390
2391uint64_t
2392metaslab_unflushed_changes_memused(metaslab_t *ms)
2393{
2394 return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2395 range_tree_numsegs(ms->ms_unflushed_frees)) *
2396 sizeof (range_seg_t));
2397}
2398
34dc7c2f
BB
2399void
2400metaslab_fini(metaslab_t *msp)
2401{
93cf2076 2402 metaslab_group_t *mg = msp->ms_group;
cc99f275 2403 vdev_t *vd = mg->mg_vd;
93e28d66
SD
2404 spa_t *spa = vd->vdev_spa;
2405
2406 metaslab_fini_flush_data(msp);
34dc7c2f
BB
2407
2408 metaslab_group_remove(mg, msp);
2409
2410 mutex_enter(&msp->ms_lock);
93cf2076 2411 VERIFY(msp->ms_group == NULL);
cc99f275 2412 metaslab_space_update(vd, mg->mg_class,
425d3237 2413 -metaslab_allocated_space(msp), 0, -msp->ms_size);
cc99f275 2414
93cf2076 2415 space_map_close(msp->ms_sm);
93e28d66 2416 msp->ms_sm = NULL;
93cf2076
GW
2417
2418 metaslab_unload(msp);
d2734cce
SD
2419 range_tree_destroy(msp->ms_allocatable);
2420 range_tree_destroy(msp->ms_freeing);
2421 range_tree_destroy(msp->ms_freed);
34dc7c2f 2422
93e28d66
SD
2423 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2424 metaslab_unflushed_changes_memused(msp));
2425 spa->spa_unflushed_stats.sus_memused -=
2426 metaslab_unflushed_changes_memused(msp);
2427 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2428 range_tree_destroy(msp->ms_unflushed_allocs);
2429 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2430 range_tree_destroy(msp->ms_unflushed_frees);
2431
1c27024e 2432 for (int t = 0; t < TXG_SIZE; t++) {
d2734cce 2433 range_tree_destroy(msp->ms_allocating[t]);
34dc7c2f
BB
2434 }
2435
1c27024e 2436 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
d2734cce 2437 range_tree_destroy(msp->ms_defer[t]);
e51be066 2438 }
c99c9001 2439 ASSERT0(msp->ms_deferspace);
428870ff 2440
d2734cce
SD
2441 range_tree_destroy(msp->ms_checkpointing);
2442
928e8ad4
SD
2443 for (int t = 0; t < TXG_SIZE; t++)
2444 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2445
1b939560
BB
2446 range_tree_vacate(msp->ms_trim, NULL, NULL);
2447 range_tree_destroy(msp->ms_trim);
2448
34dc7c2f 2449 mutex_exit(&msp->ms_lock);
93cf2076 2450 cv_destroy(&msp->ms_load_cv);
93e28d66 2451 cv_destroy(&msp->ms_flush_cv);
34dc7c2f 2452 mutex_destroy(&msp->ms_lock);
a1d477c2 2453 mutex_destroy(&msp->ms_sync_lock);
492f64e9 2454 ASSERT3U(msp->ms_allocator, ==, -1);
34dc7c2f
BB
2455
2456 kmem_free(msp, sizeof (metaslab_t));
2457}
2458
f3a7f661
GW
2459#define FRAGMENTATION_TABLE_SIZE 17
2460
93cf2076 2461/*
f3a7f661
GW
2462 * This table defines a segment size based fragmentation metric that will
2463 * allow each metaslab to derive its own fragmentation value. This is done
2464 * by calculating the space in each bucket of the spacemap histogram and
928e8ad4 2465 * multiplying that by the fragmentation metric in this table. Doing
f3a7f661
GW
2466 * this for all buckets and dividing it by the total amount of free
2467 * space in this metaslab (i.e. the total free space in all buckets) gives
2468 * us the fragmentation metric. This means that a high fragmentation metric
2469 * equates to most of the free space being comprised of small segments.
2470 * Conversely, if the metric is low, then most of the free space is in
2471 * large segments. A 10% change in fragmentation equates to approximately
2472 * double the number of segments.
93cf2076 2473 *
f3a7f661
GW
2474 * This table defines 0% fragmented space using 16MB segments. Testing has
2475 * shown that segments that are greater than or equal to 16MB do not suffer
2476 * from drastic performance problems. Using this value, we derive the rest
2477 * of the table. Since the fragmentation value is never stored on disk, it
2478 * is possible to change these calculations in the future.
2479 */
2480int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
2481 100, /* 512B */
2482 100, /* 1K */
2483 98, /* 2K */
2484 95, /* 4K */
2485 90, /* 8K */
2486 80, /* 16K */
2487 70, /* 32K */
2488 60, /* 64K */
2489 50, /* 128K */
2490 40, /* 256K */
2491 30, /* 512K */
2492 20, /* 1M */
2493 15, /* 2M */
2494 10, /* 4M */
2495 5, /* 8M */
2496 0 /* 16M */
2497};
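/*
 * Illustrative sketch, not part of this file: the weighted average described
 * above, assuming the per-bucket free space has already been mapped onto the
 * table entries (the idx = MIN(shift - SPA_MINBLOCKSHIFT + i, ...) step in
 * metaslab_set_fragmentation()).  For a hypothetical metaslab whose free
 * space is half 8K segments (table value 90) and half 1M segments (table
 * value 20), the resulting fragmentation is (90 + 20) / 2 = 55.
 */
#include <stdint.h>

static uint64_t
ex_fragmentation(const uint64_t *space_per_entry, const int *frag_table,
    int nentries)
{
	uint64_t fragmentation = 0, total = 0;

	for (int i = 0; i < nentries; i++) {
		total += space_per_entry[i];
		fragmentation += space_per_entry[i] * frag_table[i];
	}
	return (total > 0 ? fragmentation / total : 0);
}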
2498
2499/*
425d3237
SD
2500 * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2501 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2502 * been upgraded and does not support this metric. Otherwise, the return
2503 * value should be in the range [0, 100].
93cf2076 2504 */
4e21fd06 2505static void
65a91b16 2506metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
93cf2076 2507{
f3a7f661
GW
2508 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2509 uint64_t fragmentation = 0;
2510 uint64_t total = 0;
2511 boolean_t feature_enabled = spa_feature_is_enabled(spa,
2512 SPA_FEATURE_SPACEMAP_HISTOGRAM);
93cf2076 2513
4e21fd06
DB
2514 if (!feature_enabled) {
2515 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2516 return;
2517 }
f3a7f661 2518
93cf2076 2519 /*
f3a7f661
GW
2520 * A null space map means that the entire metaslab is free
2521 * and thus is not fragmented.
93cf2076 2522 */
4e21fd06
DB
2523 if (msp->ms_sm == NULL) {
2524 msp->ms_fragmentation = 0;
2525 return;
2526 }
f3a7f661
GW
2527
2528 /*
4e21fd06 2529 * If this metaslab's space map has not been upgraded, flag it
f3a7f661
GW
2530 * so that we upgrade next time we encounter it.
2531 */
2532 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
3b7f360c 2533 uint64_t txg = spa_syncing_txg(spa);
93cf2076
GW
2534 vdev_t *vd = msp->ms_group->mg_vd;
2535
3b7f360c
GW
2536 /*
2537 * If we've reached the final dirty txg, then we must
2538 * be shutting down the pool. We don't want to dirty
2539 * any data past this point so skip setting the condense
2540 * flag. We can retry this action the next time the pool
65a91b16
SD
2541 * is imported. We also skip marking this metaslab for
2542 * condensing if the caller has explicitly set nodirty.
3b7f360c 2543 */
65a91b16
SD
2544 if (!nodirty &&
2545 spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
8b0a0840
TC
2546 msp->ms_condense_wanted = B_TRUE;
2547 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
964c2d69 2548 zfs_dbgmsg("txg %llu, requesting force condense: "
3b7f360c
GW
2549 "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
2550 vd->vdev_id);
8b0a0840 2551 }
4e21fd06
DB
2552 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2553 return;
93cf2076
GW
2554 }
2555
1c27024e 2556 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
f3a7f661
GW
2557 uint64_t space = 0;
2558 uint8_t shift = msp->ms_sm->sm_shift;
4e21fd06 2559
f3a7f661
GW
2560 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2561 FRAGMENTATION_TABLE_SIZE - 1);
93cf2076 2562
93cf2076
GW
2563 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2564 continue;
2565
f3a7f661
GW
2566 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
2567 total += space;
2568
2569 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
2570 fragmentation += space * zfs_frag_table[idx];
93cf2076 2571 }
f3a7f661
GW
2572
2573 if (total > 0)
2574 fragmentation /= total;
2575 ASSERT3U(fragmentation, <=, 100);
4e21fd06
DB
2576
2577 msp->ms_fragmentation = fragmentation;
93cf2076 2578}
34dc7c2f 2579
f3a7f661
GW
2580/*
2581 * Compute a weight -- a selection preference value -- for the given metaslab.
2582 * This is based on the amount of free space, the level of fragmentation,
2583 * the LBA range, and whether the metaslab is loaded.
2584 */
34dc7c2f 2585static uint64_t
4e21fd06 2586metaslab_space_weight(metaslab_t *msp)
34dc7c2f
BB
2587{
2588 metaslab_group_t *mg = msp->ms_group;
34dc7c2f
BB
2589 vdev_t *vd = mg->mg_vd;
2590 uint64_t weight, space;
2591
2592 ASSERT(MUTEX_HELD(&msp->ms_lock));
c2e42f9d 2593
34dc7c2f
BB
2594 /*
2595 * The baseline weight is the metaslab's free space.
2596 */
425d3237 2597 space = msp->ms_size - metaslab_allocated_space(msp);
f3a7f661 2598
f3a7f661
GW
2599 if (metaslab_fragmentation_factor_enabled &&
2600 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2601 /*
2602 * Use the fragmentation information to inversely scale
2603 * down the baseline weight. We need to ensure that we
2604 * don't exclude this metaslab completely when it's 100%
2605 * fragmented. To avoid this we reduce the fragmented value
2606 * by 1.
2607 */
2608 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
2609
2610 /*
2611 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2612 * this metaslab again. The fragmentation metric may have
2613 * decreased the space to something smaller than
2614 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2615 * so that we can consume any remaining space.
2616 */
2617 if (space > 0 && space < SPA_MINBLOCKSIZE)
2618 space = SPA_MINBLOCKSIZE;
2619 }
34dc7c2f
BB
2620 weight = space;
2621
2622 /*
2623 * Modern disks have uniform bit density and constant angular velocity.
2624 * Therefore, the outer recording zones are faster (higher bandwidth)
2625 * than the inner zones by the ratio of outer to inner track diameter,
2626 * which is typically around 2:1. We account for this by assigning
2627 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
2628 * In effect, this means that we'll select the metaslab with the most
2629 * free bandwidth rather than simply the one with the most free space.
2630 */
fb40095f 2631 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
f3a7f661
GW
2632 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
2633 ASSERT(weight >= space && weight <= 2 * space);
2634 }
428870ff 2635
f3a7f661
GW
2636 /*
2637 * If this metaslab is one we're actively using, adjust its
2638 * weight to make it preferable to any inactive metaslab so
2639 * we'll polish it off. If the fragmentation on this metaslab
2640 * has exceeded our threshold, then don't mark it active.
2641 */
2642 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
2643 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
428870ff
BB
2644 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
2645 }
34dc7c2f 2646
4e21fd06
DB
2647 WEIGHT_SET_SPACEBASED(weight);
2648 return (weight);
2649}
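/*
 * Illustrative sketch, not part of this file: the LBA scaling applied above
 * for rotational vdevs.  The outermost metaslab (ms_id 0) gets twice its
 * free-space weight and the innermost roughly 1x, so with space = 1G the
 * first metaslab weighs 2G and one halfway in weighs about 1.5G.
 */
#include <stdint.h>

static uint64_t
ex_lba_weight(uint64_t space, uint64_t ms_id, uint64_t ms_count)
{
	return (2 * space - (ms_id * space) / ms_count);
}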
2650
2651/*
2652 * Return the weight of the specified metaslab, according to the segment-based
2653 * weighting algorithm. The metaslab must be loaded. This function can
2654 * be called within a sync pass since it relies only on the metaslab's
2655 * range tree which is always accurate when the metaslab is loaded.
2656 */
2657static uint64_t
2658metaslab_weight_from_range_tree(metaslab_t *msp)
2659{
2660 uint64_t weight = 0;
2661 uint32_t segments = 0;
4e21fd06
DB
2662
2663 ASSERT(msp->ms_loaded);
2664
1c27024e
DB
2665 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
2666 i--) {
4e21fd06
DB
2667 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
2668 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2669
2670 segments <<= 1;
d2734cce 2671 segments += msp->ms_allocatable->rt_histogram[i];
4e21fd06
DB
2672
2673 /*
2674 * The range tree provides more precision than the space map
2675 * and must be downgraded so that all values fit within the
2676 * space map's histogram. This allows us to compare loaded
2677 * vs. unloaded metaslabs to determine which metaslab is
2678 * considered "best".
2679 */
2680 if (i > max_idx)
2681 continue;
2682
2683 if (segments != 0) {
2684 WEIGHT_SET_COUNT(weight, segments);
2685 WEIGHT_SET_INDEX(weight, i);
2686 WEIGHT_SET_ACTIVE(weight, 0);
2687 break;
2688 }
2689 }
2690 return (weight);
2691}
2692
2693/*
93e28d66
SD
2694 * Calculate the weight based on the on-disk histogram. Should be applied
2695 * only to unloaded metaslabs (i.e. no incoming allocations) in order to
2696 * give results consistent with the on-disk state.
4e21fd06
DB
2697 */
2698static uint64_t
2699metaslab_weight_from_spacemap(metaslab_t *msp)
2700{
928e8ad4
SD
2701 space_map_t *sm = msp->ms_sm;
2702 ASSERT(!msp->ms_loaded);
2703 ASSERT(sm != NULL);
2704 ASSERT3U(space_map_object(sm), !=, 0);
2705 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
4e21fd06 2706
928e8ad4
SD
2707 /*
2708 * Create a joint histogram from all the segments that have made
2709 * it to the metaslab's space map histogram, that are not yet
2710 * available for allocation because they are still in the freeing
2711 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2712 * these segments from the space map's histogram to get a more
2713 * accurate weight.
2714 */
2715 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2716 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2717 deferspace_histogram[i] += msp->ms_synchist[i];
2718 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2719 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2720 deferspace_histogram[i] += msp->ms_deferhist[t][i];
2721 }
2722 }
2723
2724 uint64_t weight = 0;
1c27024e 2725 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
928e8ad4
SD
2726 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2727 deferspace_histogram[i]);
2728 uint64_t count =
2729 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
2730 if (count != 0) {
2731 WEIGHT_SET_COUNT(weight, count);
2732 WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
4e21fd06
DB
2733 WEIGHT_SET_ACTIVE(weight, 0);
2734 break;
2735 }
2736 }
2737 return (weight);
2738}
2739
2740/*
2741 * Compute a segment-based weight for the specified metaslab. The weight
2742 * is determined by highest bucket in the histogram. The information
2743 * for the highest bucket is encoded into the weight value.
2744 */
2745static uint64_t
2746metaslab_segment_weight(metaslab_t *msp)
2747{
2748 metaslab_group_t *mg = msp->ms_group;
2749 uint64_t weight = 0;
2750 uint8_t shift = mg->mg_vd->vdev_ashift;
2751
2752 ASSERT(MUTEX_HELD(&msp->ms_lock));
2753
2754 /*
2755 * The metaslab is completely free.
2756 */
425d3237 2757 if (metaslab_allocated_space(msp) == 0) {
4e21fd06
DB
2758 int idx = highbit64(msp->ms_size) - 1;
2759 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2760
2761 if (idx < max_idx) {
2762 WEIGHT_SET_COUNT(weight, 1ULL);
2763 WEIGHT_SET_INDEX(weight, idx);
2764 } else {
2765 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
2766 WEIGHT_SET_INDEX(weight, max_idx);
2767 }
2768 WEIGHT_SET_ACTIVE(weight, 0);
2769 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
4e21fd06
DB
2770 return (weight);
2771 }
2772
2773 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2774
2775 /*
2776 * If the metaslab is fully allocated then just make the weight 0.
2777 */
425d3237 2778 if (metaslab_allocated_space(msp) == msp->ms_size)
4e21fd06
DB
2779 return (0);
2780 /*
2781 * If the metaslab is already loaded, then use the range tree to
2782 * determine the weight. Otherwise, we rely on the space map information
2783 * to generate the weight.
2784 */
2785 if (msp->ms_loaded) {
2786 weight = metaslab_weight_from_range_tree(msp);
2787 } else {
2788 weight = metaslab_weight_from_spacemap(msp);
2789 }
2790
2791 /*
2792 * If the metaslab was active the last time we calculated its weight
2793 * then keep it active. We want to consume the entire region that
2794 * is associated with this weight.
2795 */
2796 if (msp->ms_activation_weight != 0 && weight != 0)
2797 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2798 return (weight);
2799}
2800
2801/*
2802 * Determine if we should attempt to allocate from this metaslab. If the
7f319089
SD
2803 * metaslab is loaded, then we can determine if the desired allocation
2804 * can be satisfied by looking at the size of the maximum free segment
2805 * on that metaslab. Otherwise, we make our decision based on the metaslab's
2806 * weight. For segment-based weighting we can determine the maximum
2807 * allocation based on the index encoded in its value. For space-based
2808 * weights we rely on the entire weight (excluding the weight-type bit).
4e21fd06
DB
2809 */
2810boolean_t
c81f1790 2811metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
4e21fd06 2812{
c81f1790
PD
2813 /*
2814 * If the metaslab is loaded, ms_max_size is definitive and we can use
2815 * the fast check. If it's not, the ms_max_size is a lower bound (once
2816 * set), and we should use the fast check as long as we're not in
2817 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
2818 * seconds since the metaslab was unloaded.
2819 */
2820 if (msp->ms_loaded ||
2821 (msp->ms_max_size != 0 && !try_hard && gethrtime() <
2822 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
4e21fd06
DB
2823 return (msp->ms_max_size >= asize);
2824
679b0f2a 2825 boolean_t should_allocate;
4e21fd06
DB
2826 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2827 /*
2828 * The metaslab segment weight indicates segments in the
2829 * range [2^i, 2^(i+1)), where i is the index in the weight.
2830 * Since the asize might be in the middle of the range, we
2831 * should attempt the allocation if asize < 2^(i+1).
2832 */
2833 should_allocate = (asize <
2834 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
2835 } else {
2836 should_allocate = (asize <=
2837 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
2838 }
679b0f2a 2839
4e21fd06
DB
2840 return (should_allocate);
2841}
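/*
 * Illustrative sketch, not part of this file: the segment-weight half of the
 * check above.  A weight whose index is 20 advertises free segments somewhere
 * in [1M, 2M), so any request strictly smaller than 2M is worth attempting.
 */
#include <stdint.h>
#include <stdbool.h>

static bool
ex_segment_weight_allows(uint64_t asize, int weight_index)
{
	return (asize < (1ULL << (weight_index + 1)));
}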
65a91b16 2842
4e21fd06 2843static uint64_t
65a91b16 2844metaslab_weight(metaslab_t *msp, boolean_t nodirty)
4e21fd06
DB
2845{
2846 vdev_t *vd = msp->ms_group->mg_vd;
2847 spa_t *spa = vd->vdev_spa;
2848 uint64_t weight;
2849
2850 ASSERT(MUTEX_HELD(&msp->ms_lock));
2851
65a91b16 2852 metaslab_set_fragmentation(msp, nodirty);
4e21fd06
DB
2853
2854 /*
c81f1790 2855 * Update the maximum size. If the metaslab is loaded, this will
4e21fd06 2856 * ensure that we get an accurate maximum size if newly freed space
c81f1790
PD
2857 * has been added back into the free tree. If the metaslab is
2858 * unloaded, we check if there's a larger free segment in the
2859 * unflushed frees. This is a lower bound on the largest allocatable
2860 * segment size. Coalescing of adjacent entries may reveal larger
2861 * allocatable segments, but we aren't aware of those until loading
2862 * the space map into a range tree.
4e21fd06 2863 */
c81f1790
PD
2864 if (msp->ms_loaded) {
2865 msp->ms_max_size = metaslab_largest_allocatable(msp);
2866 } else {
2867 msp->ms_max_size = MAX(msp->ms_max_size,
2868 metaslab_largest_unflushed_free(msp));
2869 }
4e21fd06
DB
2870
2871 /*
2872 * Segment-based weighting requires space map histogram support.
2873 */
2874 if (zfs_metaslab_segment_weight_enabled &&
2875 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2876 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2877 sizeof (space_map_phys_t))) {
2878 weight = metaslab_segment_weight(msp);
2879 } else {
2880 weight = metaslab_space_weight(msp);
2881 }
93cf2076 2882 return (weight);
34dc7c2f
BB
2883}
2884
928e8ad4
SD
2885void
2886metaslab_recalculate_weight_and_sort(metaslab_t *msp)
2887{
679b0f2a
PD
2888 ASSERT(MUTEX_HELD(&msp->ms_lock));
2889
928e8ad4
SD
2890 /* note: we preserve the mask (e.g. indication of primary, etc..) */
2891 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2892 metaslab_group_sort(msp->ms_group, msp,
65a91b16 2893 metaslab_weight(msp, B_FALSE) | was_active);
928e8ad4
SD
2894}
2895
34dc7c2f 2896static int
492f64e9
PD
2897metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2898 int allocator, uint64_t activation_weight)
2899{
679b0f2a
PD
2900 ASSERT(MUTEX_HELD(&msp->ms_lock));
2901
492f64e9
PD
2902 /*
2903 * If we're activating for the claim code, we don't want to actually
2904 * set the metaslab up for a specific allocator.
2905 */
f09fda50
PD
2906 if (activation_weight == METASLAB_WEIGHT_CLAIM) {
2907 ASSERT0(msp->ms_activation_weight);
2908 msp->ms_activation_weight = msp->ms_weight;
2909 metaslab_group_sort(mg, msp, msp->ms_weight |
2910 activation_weight);
492f64e9 2911 return (0);
f09fda50 2912 }
679b0f2a 2913
492f64e9
PD
2914 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2915 mg->mg_primaries : mg->mg_secondaries);
2916
492f64e9
PD
2917 mutex_enter(&mg->mg_lock);
2918 if (arr[allocator] != NULL) {
2919 mutex_exit(&mg->mg_lock);
2920 return (EEXIST);
2921 }
2922
2923 arr[allocator] = msp;
2924 ASSERT3S(msp->ms_allocator, ==, -1);
2925 msp->ms_allocator = allocator;
2926 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
f09fda50
PD
2927
2928 ASSERT0(msp->ms_activation_weight);
2929 msp->ms_activation_weight = msp->ms_weight;
2930 metaslab_group_sort_impl(mg, msp,
2931 msp->ms_weight | activation_weight);
2932
492f64e9
PD
2933 mutex_exit(&mg->mg_lock);
2934
2935 return (0);
2936}
2937
2938static int
2939metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
34dc7c2f 2940{
34dc7c2f
BB
2941 ASSERT(MUTEX_HELD(&msp->ms_lock));
2942
679b0f2a
PD
2943 /*
2944 * The current metaslab is already activated for us so there
2945 * is nothing to do. Being already activated, though, doesn't mean
2946 * that this metaslab is activated for our allocator or for our
2947 * requested activation weight. The metaslab could have started
2948 * as an active one for our allocator but changed allocators
2949 * while we were waiting to grab its ms_lock or we stole it
2950 * [see find_valid_metaslab()]. This means that there is a
2951 * possibility of passivating a metaslab of another allocator
2952 * or from a different activation mask, from this thread.
2953 */
2954 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2955 ASSERT(msp->ms_loaded);
2956 return (0);
2957 }
2958
2959 int error = metaslab_load(msp);
2960 if (error != 0) {
2961 metaslab_group_sort(msp->ms_group, msp, 0);
2962 return (error);
2963 }
2964
2965 /*
2966 * When entering metaslab_load() we may have dropped the
2967 * ms_lock because we were loading this metaslab, or we
2968 * were waiting for another thread to load it for us. In
2969 * that scenario, we recheck the weight of the metaslab
2970 * to see if it was activated by another thread.
2971 *
2972 * If the metaslab was activated for another allocator or
2973 * it was activated with a different activation weight (e.g.
2974 * we wanted to make it a primary but it was activated as
2975 * secondary) we return error (EBUSY).
2976 *
2977 * If the metaslab was activated for the same allocator
2978 * and requested activation mask, skip activating it.
2979 */
2980 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2981 if (msp->ms_allocator != allocator)
2982 return (EBUSY);
2983
2984 if ((msp->ms_weight & activation_weight) == 0)
7ab96299 2985 return (SET_ERROR(EBUSY));
9babb374 2986
679b0f2a
PD
2987 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
2988 msp->ms_primary);
2989 return (0);
34dc7c2f 2990 }
679b0f2a 2991
fe0ea848
PD
2992 /*
2993 * If the metaslab has literally 0 space, it will have weight 0. In
2994 * that case, don't bother activating it. This can happen if the
2995 * metaslab had space during find_valid_metaslab, but another thread
2996 * loaded it and used all that space while we were waiting to grab the
2997 * lock.
2998 */
2999 if (msp->ms_weight == 0) {
3000 ASSERT0(range_tree_space(msp->ms_allocatable));
3001 return (SET_ERROR(ENOSPC));
3002 }
3003
679b0f2a
PD
3004 if ((error = metaslab_activate_allocator(msp->ms_group, msp,
3005 allocator, activation_weight)) != 0) {
3006 return (error);
3007 }
3008
93cf2076 3009 ASSERT(msp->ms_loaded);
34dc7c2f
BB
3010 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3011
3012 return (0);
3013}
3014
492f64e9
PD
3015static void
3016metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3017 uint64_t weight)
3018{
3019 ASSERT(MUTEX_HELD(&msp->ms_lock));
679b0f2a
PD
3020 ASSERT(msp->ms_loaded);
3021
492f64e9
PD
3022 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3023 metaslab_group_sort(mg, msp, weight);
3024 return;
3025 }
3026
3027 mutex_enter(&mg->mg_lock);
3028 ASSERT3P(msp->ms_group, ==, mg);
679b0f2a
PD
3029 ASSERT3S(0, <=, msp->ms_allocator);
3030 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3031
492f64e9 3032 if (msp->ms_primary) {
492f64e9
PD
3033 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
3034 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3035 mg->mg_primaries[msp->ms_allocator] = NULL;
3036 } else {
492f64e9 3037 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
679b0f2a 3038 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
492f64e9
PD
3039 mg->mg_secondaries[msp->ms_allocator] = NULL;
3040 }
3041 msp->ms_allocator = -1;
3042 metaslab_group_sort_impl(mg, msp, weight);
3043 mutex_exit(&mg->mg_lock);
3044}
3045
34dc7c2f 3046static void
4e21fd06 3047metaslab_passivate(metaslab_t *msp, uint64_t weight)
34dc7c2f 3048{
4e21fd06
DB
3049 ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
3050
34dc7c2f
BB
3051 /*
3052 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
3053 * this metaslab again. In that case, it had better be empty,
3054 * or we would be leaving space on the table.
3055 */
94d49e8f
TC
3056 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
3057 size >= SPA_MINBLOCKSIZE ||
d2734cce 3058 range_tree_space(msp->ms_allocatable) == 0);
4e21fd06
DB
3059 ASSERT0(weight & METASLAB_ACTIVE_MASK);
3060
679b0f2a 3061 ASSERT(msp->ms_activation_weight != 0);
4e21fd06 3062 msp->ms_activation_weight = 0;
492f64e9 3063 metaslab_passivate_allocator(msp->ms_group, msp, weight);
679b0f2a 3064 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
34dc7c2f
BB
3065}
3066
4e21fd06
DB
3067/*
3068 * Segment-based metaslabs are activated once and remain active until
3069 * we either fail an allocation attempt (similar to space-based metaslabs)
3070 * or have exhausted the free space in zfs_metaslab_switch_threshold
3071 * buckets since the metaslab was activated. This function checks to see
e1cfd73f 3072 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
4e21fd06
DB
3073 * metaslab and passivates it proactively. This will allow us to select a
3074 * metaslab with a larger contiguous region, if any, remaining within this
3075 * metaslab group. If we're in sync pass > 1, then we continue using this
3076 * metaslab so that we don't dirty more blocks and cause more sync passes.
3077 */
3078void
3079metaslab_segment_may_passivate(metaslab_t *msp)
3080{
3081 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
4e21fd06
DB
3082
3083 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3084 return;
3085
3086 /*
3087 * Since we are in the middle of a sync pass, the most accurate
3088 * information that is accessible to us is the in-core range tree
3089 * histogram; calculate the new weight based on that information.
3090 */
1c27024e
DB
3091 uint64_t weight = metaslab_weight_from_range_tree(msp);
3092 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3093 int current_idx = WEIGHT_GET_INDEX(weight);
4e21fd06
DB
3094
3095 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3096 metaslab_passivate(msp, weight);
3097}
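
/*
 * Illustrative sketch (not part of the original source): a worked example
 * of the switch-threshold check performed by metaslab_segment_may_passivate()
 * above.  The bucket indices and threshold below are hypothetical values
 * chosen only to demonstrate the arithmetic.
 */
#if 0
static boolean_t
example_segment_passivation_check(void)
{
	/*
	 * Say the metaslab was activated when its weight came from
	 * histogram bucket 16 (free segments on the order of 2^16 bytes)
	 * and zfs_metaslab_switch_threshold is 2.
	 */
	int activation_idx = 16;
	int switch_threshold = 2;

	/*
	 * After servicing allocations, the in-core range tree histogram
	 * says the largest populated bucket is now 13.
	 */
	int current_idx = 13;

	/*
	 * 13 <= 16 - 2, so the metaslab would be proactively passivated,
	 * letting the group select one with a larger contiguous region.
	 */
	return (current_idx <= activation_idx - switch_threshold);
}
#endif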
3098
93cf2076
GW
3099static void
3100metaslab_preload(void *arg)
3101{
3102 metaslab_t *msp = arg;
f09fda50
PD
3103 metaslab_class_t *mc = msp->ms_group->mg_class;
3104 spa_t *spa = mc->mc_spa;
1cd77734 3105 fstrans_cookie_t cookie = spl_fstrans_mark();
93cf2076 3106
080b3100
GW
3107 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3108
93cf2076 3109 mutex_enter(&msp->ms_lock);
b194fab0 3110 (void) metaslab_load(msp);
f09fda50 3111 metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
93cf2076 3112 mutex_exit(&msp->ms_lock);
1cd77734 3113 spl_fstrans_unmark(cookie);
93cf2076
GW
3114}
3115
3116static void
3117metaslab_group_preload(metaslab_group_t *mg)
3118{
3119 spa_t *spa = mg->mg_vd->vdev_spa;
3120 metaslab_t *msp;
3121 avl_tree_t *t = &mg->mg_metaslab_tree;
3122 int m = 0;
3123
3124 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
c5528b9b 3125 taskq_wait_outstanding(mg->mg_taskq, 0);
93cf2076
GW
3126 return;
3127 }
93cf2076 3128
080b3100 3129 mutex_enter(&mg->mg_lock);
a1d477c2 3130
93cf2076 3131 /*
080b3100 3132 * Load the next potential metaslabs
93cf2076 3133 */
4e21fd06 3134 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
a1d477c2
MA
3135 ASSERT3P(msp->ms_group, ==, mg);
3136
f3a7f661
GW
3137 /*
3138 * We preload only the maximum number of metaslabs specified
3139 * by metaslab_preload_limit. If a metaslab is being forced
3140 * to condense then we preload it too. This will ensure
3141 * that force condensing happens in the next txg.
3142 */
3143 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
f3a7f661
GW
3144 continue;
3145 }
93cf2076
GW
3146
3147 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
48d3eb40 3148 msp, TQ_SLEEP) != TASKQID_INVALID);
93cf2076
GW
3149 }
3150 mutex_exit(&mg->mg_lock);
3151}
3152
e51be066 3153/*
93e28d66
SD
3154 * Determine if the space map's on-disk footprint is past our tolerance for
3155 * inefficiency. We would like to use the following criteria to make our
3156 * decision:
e51be066 3157 *
93e28d66
SD
3158 * 1. Do not condense if the size of the space map object would dramatically
3159 * increase as a result of writing out the free space range tree.
e51be066 3160 *
93e28d66
SD
3161 * 2. Condense if the on-disk space map representation is at least
3162 * zfs_condense_pct/100 times the size of the optimal representation
3163 * (e.g. zfs_condense_pct = 110 and optimal = 1MB: condense at 1.1MB on disk).
e51be066 3164 *
93e28d66
SD
3165 * 3. Do not condense if the on-disk size of the space map does not actually
3166 * decrease.
b02fe35d 3167 *
b02fe35d
AR
3168 * Unfortunately, we cannot compute the on-disk size of the space map in this
3169 * context because we cannot accurately compute the effects of compression, etc.
3170 * Instead, we apply the heuristic described in the block comment for
3171 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3172 * is greater than a threshold number of blocks.
e51be066
GW
3173 */
3174static boolean_t
3175metaslab_should_condense(metaslab_t *msp)
3176{
93cf2076 3177 space_map_t *sm = msp->ms_sm;
d2734cce
SD
3178 vdev_t *vd = msp->ms_group->mg_vd;
3179 uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
e51be066
GW
3180
3181 ASSERT(MUTEX_HELD(&msp->ms_lock));
93cf2076 3182 ASSERT(msp->ms_loaded);
93e28d66
SD
3183 ASSERT(sm != NULL);
3184 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
d2734cce
SD
3185
3186 /*
4d044c4c
SD
3187 * We always condense metaslabs that are empty and metaslabs for
3188 * which a condense request has been made.
e51be066 3189 */
4d044c4c
SD
3190 if (avl_is_empty(&msp->ms_allocatable_by_size) ||
3191 msp->ms_condense_wanted)
e51be066
GW
3192 return (B_TRUE);
3193
93e28d66
SD
3194 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3195 uint64_t object_size = space_map_length(sm);
4d044c4c
SD
3196 uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3197 msp->ms_allocatable, SM_NO_VDEVID);
b02fe35d 3198
4d044c4c 3199 return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
b02fe35d 3200 object_size > zfs_metaslab_condense_block_threshold * record_size);
e51be066
GW
3201}
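
/*
 * Illustrative sketch (not part of the original source): the condensing
 * heuristic above with example numbers plugged in.  The sizes and the
 * block-threshold value are hypothetical and only demonstrate the
 * arithmetic; they are not the module's defaults.
 */
#if 0
static boolean_t
example_should_condense_arithmetic(void)
{
	uint64_t condense_pct = 200;			/* zfs_condense_pct */
	uint64_t block_threshold = 4;			/* hypothetical */
	uint64_t record_size = 1ULL << 17;		/* 128K space map blocks */

	uint64_t object_size = 5ULL << 20;		/* 5M currently on disk */
	uint64_t optimal_size = 2ULL << 20;		/* 2M if rewritten */

	/*
	 * 5M >= 2M * 200 / 100 (= 4M), and 5M > 4 * 128K (= 512K),
	 * so a space map with these numbers would be condensed.
	 */
	return (object_size >= (optimal_size * condense_pct / 100) &&
	    object_size > block_threshold * record_size);
}
#endif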
3202
3203/*
3204 * Condense the on-disk space map representation to its minimized form.
93e28d66
SD
3205 * The minimized form consists of a small number of allocations followed
3206 * by the entries of the free range tree (ms_allocatable). The condensed
3207 * spacemap contains all the entries of previous TXGs (including those in
3208 * the pool-wide log spacemaps; thus this is effectively a superset of
3209 * metaslab_flush()), but this TXG's entries still need to be written.
e51be066
GW
3210 */
3211static void
93e28d66 3212metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
e51be066 3213{
93cf2076
GW
3214 range_tree_t *condense_tree;
3215 space_map_t *sm = msp->ms_sm;
93e28d66
SD
3216 uint64_t txg = dmu_tx_get_txg(tx);
3217 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
e51be066
GW
3218
3219 ASSERT(MUTEX_HELD(&msp->ms_lock));
93cf2076 3220 ASSERT(msp->ms_loaded);
93e28d66 3221 ASSERT(msp->ms_sm != NULL);
e51be066 3222
93e28d66
SD
3223 /*
3224 * In order to condense the space map, we need to change it so it
3225 * only describes which segments are currently allocated and free.
3226 *
3227 * All the current free space resides in the ms_allocatable, all
3228 * the ms_defer trees, and all the ms_allocating trees. We ignore
3229 * ms_freed because it is empty, since we're in sync pass 1. We
3230 * ignore ms_freeing because these changes are not yet reflected
3231 * in the spacemap (they will be written later this txg).
3232 *
3233 * So to truncate the space map to represent all the entries of
3234 * previous TXGs we do the following:
3235 *
3236 * 1] We create a range tree (condense tree) that is 100% allocated.
3237 * 2] We remove from it all segments found in the ms_defer trees
3238 * as those segments are marked as free in the original space
3239 * map. We do the same with the ms_allocating trees for the same
3240 * reason. Removing these segments should be a relatively
3241 * inexpensive operation since we expect these trees to have a
3242 * small number of nodes.
3243 * 3] We vacate any unflushed allocs as they should already exist
3244 * in the condense tree. Then we vacate any unflushed frees as
3245 * they should already be part of ms_allocatable.
3246 * 4] At this point, we would ideally like to remove all segments
3247 * in the ms_allocatable tree from the condense tree. This way
3248 * we would write all the entries of the condense tree as the
3249 * condensed space map, which would only contain allocated
3250 * segments with everything else assumed to be freed.
3251 *
3252 * Doing so can be prohibitively expensive as ms_allocatable can
3253 * be large, and therefore computationally expensive to subtract
3254 * from the condense_tree. Instead we first sync out the
3255 * condense_tree and then the ms_allocatable, in the condensed
3256 * space map. While this is not optimal, it is typically close to
3257 * optimal and more importantly much cheaper to compute.
3258 *
3259 * 5] Finally, as both of the unflushed trees were written to our
3260 * new and condensed metaslab space map, we basically flushed
3261 * all the unflushed changes to disk, thus we call
3262 * metaslab_flush_update().
3263 */
3264 ASSERT3U(spa_sync_pass(spa), ==, 1);
3265 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
f3a7f661 3266
a887d653 3267 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
5f3d9c69
JS
3268 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
3269 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
93e28d66 3270 spa->spa_name, space_map_length(msp->ms_sm),
d2734cce 3271 avl_numnodes(&msp->ms_allocatable->rt_root),
f3a7f661
GW
3272 msp->ms_condense_wanted ? "TRUE" : "FALSE");
3273
3274 msp->ms_condense_wanted = B_FALSE;
e51be066 3275
a1d477c2 3276 condense_tree = range_tree_create(NULL, NULL);
93cf2076 3277 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
e51be066 3278
1c27024e 3279 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
d2734cce 3280 range_tree_walk(msp->ms_defer[t],
93cf2076
GW
3281 range_tree_remove, condense_tree);
3282 }
e51be066 3283
93e28d66 3284 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
d2734cce 3285 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
93cf2076
GW
3286 range_tree_remove, condense_tree);
3287 }
e51be066 3288
93e28d66
SD
3289 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3290 metaslab_unflushed_changes_memused(msp));
3291 spa->spa_unflushed_stats.sus_memused -=
3292 metaslab_unflushed_changes_memused(msp);
3293 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3294 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3295
e51be066 3296 /*
93e28d66
SD
3297 * We're about to drop the metaslab's lock, thus allowing other
3298 * consumers to change its content. Set the metaslab's ms_condensing
3299 * flag to ensure that allocations on this metaslab do not occur
3300 * while we're in the middle of committing it to disk. This is only
3301 * critical for ms_allocatable as all other range trees use per TXG
e51be066
GW
3302 * views of their content.
3303 */
93cf2076 3304 msp->ms_condensing = B_TRUE;
e51be066
GW
3305
3306 mutex_exit(&msp->ms_lock);
93e28d66
SD
3307 uint64_t object = space_map_object(msp->ms_sm);
3308 space_map_truncate(sm,
3309 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3310 zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
3311
3312 /*
3313 * space_map_truncate() may have reallocated the spacemap object.
3314 * If so, update the vdev_ms_array.
3315 */
3316 if (space_map_object(msp->ms_sm) != object) {
3317 object = space_map_object(msp->ms_sm);
3318 dmu_write(spa->spa_meta_objset,
3319 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3320 msp->ms_id, sizeof (uint64_t), &object, tx);
3321 }
e51be066
GW
3322
3323 /*
93e28d66
SD
3324 * Note:
3325 * When the log space map feature is enabled, each space map will
3326 * always have ALLOCS followed by FREES for each sync pass. This is
3327 * typically true even when the log space map feature is disabled,
3328 * except for the case where a metaslab goes through metaslab_sync()
3329 * and gets condensed. In that case the metaslab's space map will have
3330 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
3331 * followed by FREES (due to space_map_write() in metaslab_sync()) for
3332 * sync pass 1.
e51be066 3333 */
4d044c4c 3334 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
93e28d66
SD
3335 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
3336
93cf2076
GW
3337 range_tree_vacate(condense_tree, NULL, NULL);
3338 range_tree_destroy(condense_tree);
a1d477c2 3339 mutex_enter(&msp->ms_lock);
93e28d66 3340
93cf2076 3341 msp->ms_condensing = B_FALSE;
93e28d66
SD
3342 metaslab_flush_update(msp, tx);
3343}
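
/*
 * Illustrative sketch (not part of the original source): steps 1]-4] of
 * the comment in metaslab_condense() reduced to their core set arithmetic.
 * Space map truncation, unflushed-tree accounting, locking, and the final
 * metaslab_flush_update() call are omitted here.
 */
#if 0
static void
example_condensed_contents(metaslab_t *msp, space_map_t *sm, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);

	/* 1] Start from a tree that claims the whole metaslab is allocated. */
	range_tree_t *condense_tree = range_tree_create(NULL, NULL);
	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);

	/* 2] Carve out the deferred and in-flight (allocating) segments. */
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defer[t],
		    range_tree_remove, condense_tree);
	}
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
		    range_tree_remove, condense_tree);
	}

	/*
	 * 4] The condensed space map is then simply ALLOC(condense_tree)
	 * followed by FREE(ms_allocatable) - far smaller than replaying
	 * every historical entry.
	 */
	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);

	range_tree_vacate(condense_tree, NULL, NULL);
	range_tree_destroy(condense_tree);
}
#endif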
3344
3345/*
3346 * Called when the metaslab has been flushed (its own spacemap now reflects
3347 * all the contents of the pool-wide spacemap log). Updates the metaslab's
3348 * metadata and any pool-wide related log space map data (e.g. summary,
3349 * obsolete logs, etc..) to reflect that.
3350 */
3351static void
3352metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3353{
3354 metaslab_group_t *mg = msp->ms_group;
3355 spa_t *spa = mg->mg_vd->vdev_spa;
3356
3357 ASSERT(MUTEX_HELD(&msp->ms_lock));
3358
3359 ASSERT3U(spa_sync_pass(spa), ==, 1);
3360 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3361 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3362
3363 /*
3364 * Just because a metaslab got flushed, that doesn't mean that
3365 * it will pass through metaslab_sync_done(). Thus, make sure to
3366 * update ms_synced_length here in case it doesn't.
3367 */
3368 msp->ms_synced_length = space_map_length(msp->ms_sm);
3369
3370 /*
3371 * We may end up here from metaslab_condense() without the
3372 * feature being active. In that case this is a no-op.
3373 */
3374 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
3375 return;
3376
3377 ASSERT(spa_syncing_log_sm(spa) != NULL);
3378 ASSERT(msp->ms_sm != NULL);
3379 ASSERT(metaslab_unflushed_txg(msp) != 0);
3380 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3381
3382 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3383
3384 /* update metaslab's position in our flushing tree */
3385 uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3386 mutex_enter(&spa->spa_flushed_ms_lock);
3387 avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3388 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3389 avl_add(&spa->spa_metaslabs_by_flushed, msp);
3390 mutex_exit(&spa->spa_flushed_ms_lock);
3391
3392 /* update metaslab counts of spa_log_sm_t nodes */
3393 spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3394 spa_log_sm_increment_current_mscount(spa);
3395
3396 /* cleanup obsolete logs if any */
3397 uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
3398 spa_cleanup_old_sm_logs(spa, tx);
3399 uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
3400 VERIFY3U(log_blocks_after, <=, log_blocks_before);
3401
3402 /* update log space map summary */
3403 uint64_t blocks_gone = log_blocks_before - log_blocks_after;
3404 spa_log_summary_add_flushed_metaslab(spa);
3405 spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
3406 spa_log_summary_decrement_blkcount(spa, blocks_gone);
3407}
3408
3409boolean_t
3410metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
3411{
3412 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3413
3414 ASSERT(MUTEX_HELD(&msp->ms_lock));
3415 ASSERT3U(spa_sync_pass(spa), ==, 1);
3416 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3417
3418 ASSERT(msp->ms_sm != NULL);
3419 ASSERT(metaslab_unflushed_txg(msp) != 0);
3420 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3421
3422 /*
3423 * There is nothing wrong with flushing the same metaslab twice, as
3424 * this codepath should handle that case. However, the current
3425 * flushing scheme makes sure to avoid this situation as we would be
3426 * making all these calls without having anything meaningful to write
3427 * to disk. We assert this behavior here.
3428 */
3429 ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
3430
3431 /*
3432 * We can not flush while loading, because then we would
3433 * not load the ms_unflushed_{allocs,frees}.
3434 */
3435 if (msp->ms_loading)
3436 return (B_FALSE);
3437
3438 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3439 metaslab_verify_weight_and_frag(msp);
3440
3441 /*
3442 * Metaslab condensing is effectively flushing. Therefore if the
3443 * metaslab can be condensed we can just condense it instead of
3444 * flushing it.
3445 *
3446 * Note that metaslab_condense() does call metaslab_flush_update()
3447 * so we can just return immediately after condensing. We also
3448 * don't need to care about setting ms_flushing or broadcasting
3449 * ms_flush_cv, even if we temporarily drop the ms_lock in
3450 * metaslab_condense(), as the metaslab is already loaded.
3451 */
3452 if (msp->ms_loaded && metaslab_should_condense(msp)) {
3453 metaslab_group_t *mg = msp->ms_group;
3454
3455 /*
3456 * For all histogram operations below refer to the
3457 * comments of metaslab_sync() where we follow a
3458 * similar procedure.
3459 */
3460 metaslab_group_histogram_verify(mg);
3461 metaslab_class_histogram_verify(mg->mg_class);
3462 metaslab_group_histogram_remove(mg, msp);
3463
3464 metaslab_condense(msp, tx);
3465
3466 space_map_histogram_clear(msp->ms_sm);
3467 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3468 ASSERT(range_tree_is_empty(msp->ms_freed));
3469 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3470 space_map_histogram_add(msp->ms_sm,
3471 msp->ms_defer[t], tx);
3472 }
3473 metaslab_aux_histograms_update(msp);
3474
3475 metaslab_group_histogram_add(mg, msp);
3476 metaslab_group_histogram_verify(mg);
3477 metaslab_class_histogram_verify(mg->mg_class);
3478
3479 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3480
3481 /*
3482 * Since we recreated the histogram (and potentially
3483 * the ms_sm too while condensing) ensure that the
3484 * weight is updated too because we are not guaranteed
3485 * that this metaslab is dirty and will go through
3486 * metaslab_sync_done().
3487 */
3488 metaslab_recalculate_weight_and_sort(msp);
3489 return (B_TRUE);
3490 }
3491
3492 msp->ms_flushing = B_TRUE;
3493 uint64_t sm_len_before = space_map_length(msp->ms_sm);
3494
3495 mutex_exit(&msp->ms_lock);
3496 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3497 SM_NO_VDEVID, tx);
3498 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3499 SM_NO_VDEVID, tx);
3500 mutex_enter(&msp->ms_lock);
3501
3502 uint64_t sm_len_after = space_map_length(msp->ms_sm);
3503 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
3504 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
3505 "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
3506 "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
3507 msp->ms_group->mg_vd->vdev_id, msp->ms_id,
3508 range_tree_space(msp->ms_unflushed_allocs),
3509 range_tree_space(msp->ms_unflushed_frees),
3510 (sm_len_after - sm_len_before));
3511 }
3512
3513 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3514 metaslab_unflushed_changes_memused(msp));
3515 spa->spa_unflushed_stats.sus_memused -=
3516 metaslab_unflushed_changes_memused(msp);
3517 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3518 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3519
3520 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3521 metaslab_verify_weight_and_frag(msp);
3522
3523 metaslab_flush_update(msp, tx);
3524
3525 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3526 metaslab_verify_weight_and_frag(msp);
3527
3528 msp->ms_flushing = B_FALSE;
3529 cv_broadcast(&msp->ms_flush_cv);
3530 return (B_TRUE);
e51be066
GW
3531}
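
/*
 * Illustrative sketch (not part of the original source): the data movement
 * performed by metaslab_flush() above when it does not take the condense
 * path.  Locking, debug logging, memory accounting, and the verification
 * calls are omitted.
 */
#if 0
static void
example_flush_data_movement(metaslab_t *msp, dmu_tx_t *tx)
{
	/*
	 * Everything the pool-wide log still remembers for this metaslab
	 * is appended to the metaslab's own space map ...
	 */
	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
	    SM_NO_VDEVID, tx);
	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
	    SM_NO_VDEVID, tx);

	/* ... after which the unflushed trees can be emptied ... */
	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);

	/*
	 * ... and the metaslab is re-sorted in the spa's by-flushed-txg
	 * tree and the log space map summary is updated.
	 */
	metaslab_flush_update(msp, tx);
}
#endif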
3532
34dc7c2f
BB
3533/*
3534 * Write a metaslab to disk in the context of the specified transaction group.
3535 */
3536void
3537metaslab_sync(metaslab_t *msp, uint64_t txg)
3538{
93cf2076
GW
3539 metaslab_group_t *mg = msp->ms_group;
3540 vdev_t *vd = mg->mg_vd;
34dc7c2f 3541 spa_t *spa = vd->vdev_spa;
428870ff 3542 objset_t *mos = spa_meta_objset(spa);
d2734cce 3543 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
34dc7c2f 3544 dmu_tx_t *tx;
34dc7c2f 3545
428870ff
BB
3546 ASSERT(!vd->vdev_ishole);
3547
e51be066
GW
3548 /*
3549 * This metaslab has just been added so there's no work to do now.
3550 */
d2734cce 3551 if (msp->ms_freeing == NULL) {
93cf2076 3552 ASSERT3P(alloctree, ==, NULL);
e51be066
GW
3553 return;
3554 }
3555
93cf2076 3556 ASSERT3P(alloctree, !=, NULL);
d2734cce
SD
3557 ASSERT3P(msp->ms_freeing, !=, NULL);
3558 ASSERT3P(msp->ms_freed, !=, NULL);
3559 ASSERT3P(msp->ms_checkpointing, !=, NULL);
1b939560 3560 ASSERT3P(msp->ms_trim, !=, NULL);
e51be066 3561
f3a7f661 3562 /*
d2734cce
SD
3563 * Normally, we don't want to process a metaslab if there are no
3564 * allocations or frees to perform. However, if the metaslab is being
475aa97c
PD
3565 * forced to condense while it is loaded, and we're not beyond the final
3566 * dirty txg, we need to let it through. Not condensing beyond the
3567 * final dirty txg prevents an issue where metaslabs that need to be
3568 * condensed but were loaded for other reasons could cause a panic
3569 * here. By only checking the txg in that branch of the conditional,
3570 * we preserve the utility of the VERIFY statements in all other
3571 * cases.
f3a7f661 3572 */
d2734cce
SD
3573 if (range_tree_is_empty(alloctree) &&
3574 range_tree_is_empty(msp->ms_freeing) &&
3575 range_tree_is_empty(msp->ms_checkpointing) &&
475aa97c
PD
3576 !(msp->ms_loaded && msp->ms_condense_wanted &&
3577 txg <= spa_final_dirty_txg(spa)))
428870ff 3578 return;
34dc7c2f 3579
3b7f360c
GW
3580
3581 VERIFY(txg <= spa_final_dirty_txg(spa));
3582
34dc7c2f 3583 /*
425d3237
SD
3584 * The only state that can actually be changing concurrently
3585 * with metaslab_sync() is the metaslab's ms_allocatable. No
3586 * other thread can be modifying this txg's alloc, freeing,
d2734cce 3587 * freed, or space_map_phys_t. We drop ms_lock whenever we
425d3237
SD
3588 * could call into the DMU, because the DMU can call down to
3589 * us (e.g. via zio_free()) at any time.
a1d477c2
MA
3590 *
3591 * The spa_vdev_remove_thread() can be reading metaslab state
425d3237
SD
3592 * concurrently, and it is locked out by the ms_sync_lock.
3593 * Note that the ms_lock is insufficient for this, because it
3594 * is dropped by space_map_write().
34dc7c2f 3595 */
428870ff 3596 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
34dc7c2f 3597
93e28d66
SD
3598 /*
3599 * Generate a log space map if one doesn't exist already.
3600 */
3601 spa_generate_syncing_log_sm(spa, tx);
93cf2076 3602
93e28d66
SD
3603 if (msp->ms_sm == NULL) {
3604 uint64_t new_object = space_map_alloc(mos,
3605 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3606 zfs_metaslab_sm_blksz_with_log :
3607 zfs_metaslab_sm_blksz_no_log, tx);
93cf2076
GW
3608 VERIFY3U(new_object, !=, 0);
3609
93e28d66
SD
3610 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
3611 msp->ms_id, sizeof (uint64_t), &new_object, tx);
3612
93cf2076 3613 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
a1d477c2 3614 msp->ms_start, msp->ms_size, vd->vdev_ashift));
93cf2076 3615 ASSERT(msp->ms_sm != NULL);
93e28d66
SD
3616
3617 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3618 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
425d3237 3619 ASSERT0(metaslab_allocated_space(msp));
34dc7c2f
BB
3620 }
3621
93e28d66
SD
3622 if (metaslab_unflushed_txg(msp) == 0 &&
3623 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
3624 ASSERT(spa_syncing_log_sm(spa) != NULL);
3625
3626 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3627 spa_log_sm_increment_current_mscount(spa);
3628 spa_log_summary_add_flushed_metaslab(spa);
3629
3630 ASSERT(msp->ms_sm != NULL);
3631 mutex_enter(&spa->spa_flushed_ms_lock);
3632 avl_add(&spa->spa_metaslabs_by_flushed, msp);
3633 mutex_exit(&spa->spa_flushed_ms_lock);
3634
3635 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3636 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3637 }
3638
d2734cce
SD
3639 if (!range_tree_is_empty(msp->ms_checkpointing) &&
3640 vd->vdev_checkpoint_sm == NULL) {
3641 ASSERT(spa_has_checkpoint(spa));
3642
3643 uint64_t new_object = space_map_alloc(mos,
93e28d66 3644 zfs_vdev_standard_sm_blksz, tx);
d2734cce
SD
3645 VERIFY3U(new_object, !=, 0);
3646
3647 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
3648 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
3649 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3650
3651 /*
3652 * We save the space map object as an entry in vdev_top_zap
3653 * so it can be retrieved when the pool is reopened after an
3654 * export or through zdb.
3655 */
3656 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
3657 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
3658 sizeof (new_object), 1, &new_object, tx));
3659 }
3660
a1d477c2 3661 mutex_enter(&msp->ms_sync_lock);
428870ff
BB
3662 mutex_enter(&msp->ms_lock);
3663
96358617 3664 /*
4e21fd06
DB
3665 * Note: metaslab_condense() clears the space map's histogram.
3666 * Therefore we must verify and remove this histogram before
96358617
MA
3667 * condensing.
3668 */
3669 metaslab_group_histogram_verify(mg);
3670 metaslab_class_histogram_verify(mg->mg_class);
3671 metaslab_group_histogram_remove(mg, msp);
3672
93e28d66
SD
3673 if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
3674 metaslab_should_condense(msp))
3675 metaslab_condense(msp, tx);
3676
3677 /*
3678 * We'll be going to disk to sync our space accounting, thus we
3679 * drop the ms_lock during that time so allocations coming from
3680 * open-context (ZIL) for future TXGs do not block.
3681 */
3682 mutex_exit(&msp->ms_lock);
3683 space_map_t *log_sm = spa_syncing_log_sm(spa);
3684 if (log_sm != NULL) {
3685 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3686
3687 space_map_write(log_sm, alloctree, SM_ALLOC,
3688 vd->vdev_id, tx);
3689 space_map_write(log_sm, msp->ms_freeing, SM_FREE,
3690 vd->vdev_id, tx);
3691 mutex_enter(&msp->ms_lock);
3692
3693 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3694 metaslab_unflushed_changes_memused(msp));
3695 spa->spa_unflushed_stats.sus_memused -=
3696 metaslab_unflushed_changes_memused(msp);
3697 range_tree_remove_xor_add(alloctree,
3698 msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
3699 range_tree_remove_xor_add(msp->ms_freeing,
3700 msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
3701 spa->spa_unflushed_stats.sus_memused +=
3702 metaslab_unflushed_changes_memused(msp);
e51be066 3703 } else {
93e28d66
SD
3704 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3705
4d044c4c
SD
3706 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
3707 SM_NO_VDEVID, tx);
3708 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
3709 SM_NO_VDEVID, tx);
a1d477c2 3710 mutex_enter(&msp->ms_lock);
e51be066 3711 }
428870ff 3712
425d3237
SD
3713 msp->ms_allocated_space += range_tree_space(alloctree);
3714 ASSERT3U(msp->ms_allocated_space, >=,
3715 range_tree_space(msp->ms_freeing));
3716 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
3717
d2734cce
SD
3718 if (!range_tree_is_empty(msp->ms_checkpointing)) {
3719 ASSERT(spa_has_checkpoint(spa));
3720 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3721
3722 /*
3723 * Since we are doing writes to disk and the ms_checkpointing
3724 * tree won't be changing during that time, we drop the
93e28d66
SD
3725 * ms_lock while writing to the checkpoint space map, for the
3726 * same reason mentioned above.
d2734cce
SD
3727 */
3728 mutex_exit(&msp->ms_lock);
3729 space_map_write(vd->vdev_checkpoint_sm,
4d044c4c 3730 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
d2734cce 3731 mutex_enter(&msp->ms_lock);
d2734cce
SD
3732
3733 spa->spa_checkpoint_info.sci_dspace +=
3734 range_tree_space(msp->ms_checkpointing);
3735 vd->vdev_stat.vs_checkpoint_space +=
3736 range_tree_space(msp->ms_checkpointing);
3737 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
425d3237 3738 -space_map_allocated(vd->vdev_checkpoint_sm));
d2734cce
SD
3739
3740 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
3741 }
3742
93cf2076
GW
3743 if (msp->ms_loaded) {
3744 /*
a1d477c2 3745 * When the space map is loaded, we have an accurate
93cf2076
GW
3746 * histogram in the range tree. This gives us an opportunity
3747 * to bring the space map's histogram up-to-date so we clear
3748 * it first before updating it.
3749 */
3750 space_map_histogram_clear(msp->ms_sm);
d2734cce 3751 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
4e21fd06
DB
3752
3753 /*
3754 * Since we've cleared the histogram we need to add back
3755 * any free space that has already been processed, plus
3756 * any deferred space. This allows the on-disk histogram
3757 * to accurately reflect all free space even if some space
3758 * is not yet available for allocation (i.e. deferred).
3759 */
d2734cce 3760 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
4e21fd06 3761
93cf2076 3762 /*
4e21fd06
DB
3763 * Add back any deferred free space that has not been
3764 * added back into the in-core free tree yet. This will
3765 * ensure that we don't end up with a space map histogram
3766 * that is completely empty unless the metaslab is fully
3767 * allocated.
93cf2076 3768 */
1c27024e 3769 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
4e21fd06 3770 space_map_histogram_add(msp->ms_sm,
d2734cce 3771 msp->ms_defer[t], tx);
4e21fd06 3772 }
93cf2076 3773 }
4e21fd06
DB
3774
3775 /*
3776 * Always add the free space from this sync pass to the space
3777 * map histogram. We want to make sure that the on-disk histogram
3778 * accounts for all free space. If the space map is not loaded,
3779 * then we will lose some accuracy but will correct it the next
3780 * time we load the space map.
3781 */
d2734cce 3782 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
928e8ad4 3783 metaslab_aux_histograms_update(msp);
4e21fd06 3784
f3a7f661
GW
3785 metaslab_group_histogram_add(mg, msp);
3786 metaslab_group_histogram_verify(mg);
3787 metaslab_class_histogram_verify(mg->mg_class);
34dc7c2f 3788
e51be066 3789 /*
93cf2076 3790 * For sync pass 1, we avoid traversing this txg's free range tree
425d3237
SD
3791 * and instead will just swap the pointers for freeing and freed.
3792 * We can safely do this since the freed_tree is guaranteed to be
3793 * empty on the initial pass.
93e28d66
SD
3794 *
3795 * Keep in mind that even if we are currently using a log spacemap
3796 * we want current frees to end up in the ms_allocatable (but not
3797 * get appended to the ms_sm) so their ranges can be reused as usual.
e51be066
GW
3798 */
3799 if (spa_sync_pass(spa) == 1) {
d2734cce 3800 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
425d3237 3801 ASSERT0(msp->ms_allocated_this_txg);
e51be066 3802 } else {
d2734cce
SD
3803 range_tree_vacate(msp->ms_freeing,
3804 range_tree_add, msp->ms_freed);
34dc7c2f 3805 }
425d3237 3806 msp->ms_allocated_this_txg += range_tree_space(alloctree);
f3a7f661 3807 range_tree_vacate(alloctree, NULL, NULL);
34dc7c2f 3808
d2734cce
SD
3809 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3810 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
3811 & TXG_MASK]));
3812 ASSERT0(range_tree_space(msp->ms_freeing));
3813 ASSERT0(range_tree_space(msp->ms_checkpointing));
34dc7c2f
BB
3814
3815 mutex_exit(&msp->ms_lock);
3816
93e28d66
SD
3817 /*
3818 * Verify that the space map object ID has been recorded in the
3819 * vdev_ms_array.
3820 */
3821 uint64_t object;
3822 VERIFY0(dmu_read(mos, vd->vdev_ms_array,
3823 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
3824 VERIFY3U(object, ==, space_map_object(msp->ms_sm));
3825
a1d477c2 3826 mutex_exit(&msp->ms_sync_lock);
34dc7c2f
BB
3827 dmu_tx_commit(tx);
3828}
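
/*
 * Illustrative sketch (not part of the original source): the two write
 * paths taken by metaslab_sync() above, reduced to the calls that already
 * appear in the function.  The helper name and its argument list are
 * hypothetical; error handling and the unflushed-tree bookkeeping are
 * omitted.
 */
#if 0
static void
example_sync_write_paths(spa_t *spa, metaslab_t *msp, vdev_t *vd,
    range_tree_t *alloctree, dmu_tx_t *tx)
{
	space_map_t *log_sm = spa_syncing_log_sm(spa);

	if (log_sm != NULL) {
		/*
		 * Log space map enabled: this txg's allocs and frees go to
		 * the pool-wide log, tagged with the vdev id, and are later
		 * replayed into the metaslab's own space map by
		 * metaslab_flush().
		 */
		space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx);
		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
		    vd->vdev_id, tx);
	} else {
		/*
		 * No log space map: the changes are appended directly to
		 * the metaslab's space map.
		 */
		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
		    SM_NO_VDEVID, tx);
		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
		    SM_NO_VDEVID, tx);
	}
}
#endif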
3829
f09fda50
PD
3830static void
3831metaslab_evict(metaslab_t *msp, uint64_t txg)
893a6d62 3832{
f09fda50
PD
3833 if (!msp->ms_loaded || msp->ms_disabled != 0)
3834 return;
893a6d62 3835
f09fda50
PD
3836 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
3837 VERIFY0(range_tree_space(
3838 msp->ms_allocating[(txg + t) & TXG_MASK]));
893a6d62 3839 }
f09fda50
PD
3840 if (msp->ms_allocator != -1)
3841 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
3842
3843 if (!metaslab_debug_unload)
3844 metaslab_unload(msp);
893a6d62
PD
3845}
3846
34dc7c2f
BB
3847/*
3848 * Called after a transaction group has completely synced to mark
3849 * all of the metaslab's free space as usable.
3850 */
3851void
3852metaslab_sync_done(metaslab_t *msp, uint64_t txg)
3853{
34dc7c2f
BB
3854 metaslab_group_t *mg = msp->ms_group;
3855 vdev_t *vd = mg->mg_vd;
4e21fd06 3856 spa_t *spa = vd->vdev_spa;
93cf2076 3857 range_tree_t **defer_tree;
428870ff 3858 int64_t alloc_delta, defer_delta;
4e21fd06 3859 boolean_t defer_allowed = B_TRUE;
428870ff
BB
3860
3861 ASSERT(!vd->vdev_ishole);
34dc7c2f
BB
3862
3863 mutex_enter(&msp->ms_lock);
3864
3865 /*
3866 * If this metaslab is just becoming available, initialize its
258553d3 3867 * range trees and add its capacity to the vdev.
34dc7c2f 3868 */
d2734cce 3869 if (msp->ms_freed == NULL) {
1c27024e 3870 for (int t = 0; t < TXG_SIZE; t++) {
d2734cce 3871 ASSERT(msp->ms_allocating[t] == NULL);
93cf2076 3872
d2734cce 3873 msp->ms_allocating[t] = range_tree_create(NULL, NULL);
34dc7c2f 3874 }
428870ff 3875
d2734cce
SD
3876 ASSERT3P(msp->ms_freeing, ==, NULL);
3877 msp->ms_freeing = range_tree_create(NULL, NULL);
258553d3 3878
d2734cce
SD
3879 ASSERT3P(msp->ms_freed, ==, NULL);
3880 msp->ms_freed = range_tree_create(NULL, NULL);
258553d3 3881
1c27024e 3882 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
93e28d66 3883 ASSERT3P(msp->ms_defer[t], ==, NULL);
d2734cce 3884 msp->ms_defer[t] = range_tree_create(NULL, NULL);
93cf2076 3885 }
428870ff 3886
d2734cce
SD
3887 ASSERT3P(msp->ms_checkpointing, ==, NULL);
3888 msp->ms_checkpointing = range_tree_create(NULL, NULL);
3889
93e28d66
SD
3890 ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
3891 msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
3892 ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
c81f1790
PD
3893 msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
3894 &msp->ms_unflushed_frees_by_size,
3895 metaslab_rangesize_compare, 0);
93e28d66 3896
cc99f275 3897 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
34dc7c2f 3898 }
d2734cce
SD
3899 ASSERT0(range_tree_space(msp->ms_freeing));
3900 ASSERT0(range_tree_space(msp->ms_checkpointing));
34dc7c2f 3901
d2734cce 3902 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
93cf2076 3903
1c27024e 3904 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
4e21fd06 3905 metaslab_class_get_alloc(spa_normal_class(spa));
a1d477c2 3906 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
4e21fd06
DB
3907 defer_allowed = B_FALSE;
3908 }
3909
3910 defer_delta = 0;
425d3237
SD
3911 alloc_delta = msp->ms_allocated_this_txg -
3912 range_tree_space(msp->ms_freed);
93e28d66 3913
4e21fd06 3914 if (defer_allowed) {
d2734cce 3915 defer_delta = range_tree_space(msp->ms_freed) -
4e21fd06
DB
3916 range_tree_space(*defer_tree);
3917 } else {
3918 defer_delta -= range_tree_space(*defer_tree);
3919 }
cc99f275
DB
3920 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
3921 defer_delta, 0);
34dc7c2f 3922
93e28d66
SD
3923 if (spa_syncing_log_sm(spa) == NULL) {
3924 /*
3925 * If there's a metaslab_load() in progress and we don't have
3926 * a log space map, it means that we probably wrote to the
3927 * metaslab's space map. If this is the case, we need to
3928 * make sure that we wait for the load to complete so that we
3929 * have a consistent view at the in-core side of the metaslab.
3930 */
3931 metaslab_load_wait(msp);
3932 } else {
3933 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3934 }
c2e42f9d 3935
1b939560
BB
3936 /*
3937 * When auto-trimming is enabled, free ranges which are added to
3938 * ms_allocatable are also added to ms_trim. The ms_trim tree is
3939 * periodically consumed by the vdev_autotrim_thread() which issues
3940 * trims for all ranges and then vacates the tree. The ms_trim tree
3941 * can be discarded at any time with the sole consequence of recent
3942 * frees not being trimmed.
3943 */
3944 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
3945 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
3946 if (!defer_allowed) {
3947 range_tree_walk(msp->ms_freed, range_tree_add,
3948 msp->ms_trim);
3949 }
3950 } else {
3951 range_tree_vacate(msp->ms_trim, NULL, NULL);
3952 }
3953
c2e42f9d 3954 /*
93cf2076 3955 * Move the frees from the defer_tree back to the free
d2734cce
SD
3956 * range tree (if it's loaded). Swap the freed_tree and
3957 * the defer_tree -- this is safe to do because we've
3958 * just emptied out the defer_tree.
c2e42f9d 3959 */
93cf2076 3960 range_tree_vacate(*defer_tree,
d2734cce 3961 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
4e21fd06 3962 if (defer_allowed) {
d2734cce 3963 range_tree_swap(&msp->ms_freed, defer_tree);
4e21fd06 3964 } else {
d2734cce
SD
3965 range_tree_vacate(msp->ms_freed,
3966 msp->ms_loaded ? range_tree_add : NULL,
3967 msp->ms_allocatable);
4e21fd06 3968 }
425d3237
SD
3969
3970 msp->ms_synced_length = space_map_length(msp->ms_sm);
34dc7c2f 3971
428870ff
BB
3972 msp->ms_deferspace += defer_delta;
3973 ASSERT3S(msp->ms_deferspace, >=, 0);
93cf2076 3974 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
428870ff
BB
3975 if (msp->ms_deferspace != 0) {
3976 /*
3977 * Keep syncing this metaslab until all deferred frees
3978 * are back in circulation.
3979 */
3980 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
3981 }
928e8ad4 3982 metaslab_aux_histograms_update_done(msp, defer_allowed);
428870ff 3983
492f64e9
PD
3984 if (msp->ms_new) {
3985 msp->ms_new = B_FALSE;
3986 mutex_enter(&mg->mg_lock);
3987 mg->mg_ms_ready++;
3988 mutex_exit(&mg->mg_lock);
3989 }
928e8ad4 3990
4e21fd06 3991 /*
928e8ad4
SD
3992 * Re-sort metaslab within its group now that we've adjusted
3993 * its allocatable space.
4e21fd06 3994 */
928e8ad4 3995 metaslab_recalculate_weight_and_sort(msp);
4e21fd06 3996
d2734cce
SD
3997 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3998 ASSERT0(range_tree_space(msp->ms_freeing));
3999 ASSERT0(range_tree_space(msp->ms_freed));
4000 ASSERT0(range_tree_space(msp->ms_checkpointing));
f09fda50 4001 msp->ms_allocating_total -= msp->ms_allocated_this_txg;
425d3237 4002 msp->ms_allocated_this_txg = 0;
34dc7c2f
BB
4003 mutex_exit(&msp->ms_lock);
4004}
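
/*
 * Illustrative sketch (not part of the original source): how the deferred
 * free ring that metaslab_sync_done() above rotates actually delays reuse.
 * The txg numbers are made up; the arithmetic assumes TXG_DEFER_SIZE == 2,
 * its current value.
 */
#if 0
static void
example_defer_ring(void)
{
	/*
	 * Frees from txg 100 land in ms_freed and are swapped into
	 * ms_defer[100 % 2] == ms_defer[0].  When txg 102 completes,
	 * metaslab_sync_done() vacates ms_defer[102 % 2] == ms_defer[0]
	 * back into ms_allocatable, so the ranges only become allocatable
	 * again TXG_DEFER_SIZE txgs after they were freed.
	 */
	uint64_t freed_txg = 100;
	uint64_t reuse_txg = freed_txg + TXG_DEFER_SIZE;

	ASSERT3U(freed_txg % TXG_DEFER_SIZE, ==, reuse_txg % TXG_DEFER_SIZE);
}
#endif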
4005
428870ff
BB
4006void
4007metaslab_sync_reassess(metaslab_group_t *mg)
4008{
a1d477c2
MA
4009 spa_t *spa = mg->mg_class->mc_spa;
4010
4011 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1be627f5 4012 metaslab_group_alloc_update(mg);
f3a7f661 4013 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
6d974228 4014
428870ff 4015 /*
a1d477c2
MA
4016 * Preload the next potential metaslabs but only on active
4017 * metaslab groups. We can get into a state where the metaslab
4018 * is no longer active since we dirty metaslabs as we remove a
4019 * device, thus potentially making the metaslab group eligible
4020 * for preloading.
428870ff 4021 */
a1d477c2
MA
4022 if (mg->mg_activation_count > 0) {
4023 metaslab_group_preload(mg);
4024 }
4025 spa_config_exit(spa, SCL_ALLOC, FTAG);
428870ff
BB
4026}
4027
cc99f275
DB
4028/*
4029 * When writing a ditto block (i.e. more than one DVA for a given BP) on
4030 * the same vdev as an existing DVA of this BP, then try to allocate it
4031 * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4032 */
4033static boolean_t
4034metaslab_is_unique(metaslab_t *msp, dva_t *dva)
34dc7c2f 4035{
cc99f275
DB
4036 uint64_t dva_ms_id;
4037
4038 if (DVA_GET_ASIZE(dva) == 0)
4039 return (B_TRUE);
34dc7c2f
BB
4040
4041 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
cc99f275 4042 return (B_TRUE);
34dc7c2f 4043
cc99f275
DB
4044 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4045
4046 return (msp->ms_id != dva_ms_id);
34dc7c2f
BB
4047}
4048
4e21fd06
DB
4049/*
4050 * ==========================================================================
4051 * Metaslab allocation tracing facility
4052 * ==========================================================================
4053 */
4054#ifdef _METASLAB_TRACING
4055kstat_t *metaslab_trace_ksp;
4056kstat_named_t metaslab_trace_over_limit;
4057
4058void
4059metaslab_alloc_trace_init(void)
4060{
4061 ASSERT(metaslab_alloc_trace_cache == NULL);
4062 metaslab_alloc_trace_cache = kmem_cache_create(
4063 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
4064 0, NULL, NULL, NULL, NULL, NULL, 0);
4065 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
4066 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
4067 if (metaslab_trace_ksp != NULL) {
4068 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
4069 kstat_named_init(&metaslab_trace_over_limit,
4070 "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
4071 kstat_install(metaslab_trace_ksp);
4072 }
4073}
4074
4075void
4076metaslab_alloc_trace_fini(void)
4077{
4078 if (metaslab_trace_ksp != NULL) {
4079 kstat_delete(metaslab_trace_ksp);
4080 metaslab_trace_ksp = NULL;
4081 }
4082 kmem_cache_destroy(metaslab_alloc_trace_cache);
4083 metaslab_alloc_trace_cache = NULL;
4084}
4085
4086/*
4087 * Add an allocation trace element to the allocation tracing list.
4088 */
4089static void
4090metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
492f64e9
PD
4091 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4092 int allocator)
4e21fd06
DB
4093{
4094 metaslab_alloc_trace_t *mat;
4095
4096 if (!metaslab_trace_enabled)
4097 return;
4098
4099 /*
4100 * When the tracing list reaches its maximum we remove
4101 * the second element in the list before adding a new one.
4102 * By removing the second element we preserve the original
4103 * entry as a clue to what allocation steps have already been
4104 * performed.
4105 */
4106 if (zal->zal_size == metaslab_trace_max_entries) {
4107 metaslab_alloc_trace_t *mat_next;
4108#ifdef DEBUG
4109 panic("too many entries in allocation list");
4110#endif
4111 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
4112 zal->zal_size--;
4113 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4114 list_remove(&zal->zal_list, mat_next);
4115 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
4116 }
4117
4118 mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
4119 list_link_init(&mat->mat_list_node);
4120 mat->mat_mg = mg;
4121 mat->mat_msp = msp;
4122 mat->mat_size = psize;
4123 mat->mat_dva_id = dva_id;
4124 mat->mat_offset = offset;
4125 mat->mat_weight = 0;
492f64e9 4126 mat->mat_allocator = allocator;
4e21fd06
DB
4127
4128 if (msp != NULL)
4129 mat->mat_weight = msp->ms_weight;
4130
4131 /*
4132 * The list is part of the zio so locking is not required. Only
4133 * a single thread will perform allocations for a given zio.
4134 */
4135 list_insert_tail(&zal->zal_list, mat);
4136 zal->zal_size++;
4137
4138 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4139}
4140
4141void
4142metaslab_trace_init(zio_alloc_list_t *zal)
4143{
4144 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4145 offsetof(metaslab_alloc_trace_t, mat_list_node));
4146 zal->zal_size = 0;
4147}
4148
4149void
4150metaslab_trace_fini(zio_alloc_list_t *zal)
4151{
4152 metaslab_alloc_trace_t *mat;
4153
4154 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4155 kmem_cache_free(metaslab_alloc_trace_cache, mat);
4156 list_destroy(&zal->zal_list);
4157 zal->zal_size = 0;
4158}
4159#else
4160
492f64e9 4161#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
4e21fd06
DB
4162
4163void
4164metaslab_alloc_trace_init(void)
4165{
4166}
4167
4168void
4169metaslab_alloc_trace_fini(void)
4170{
4171}
4172
4173void
4174metaslab_trace_init(zio_alloc_list_t *zal)
4175{
4176}
4177
4178void
4179metaslab_trace_fini(zio_alloc_list_t *zal)
4180{
4181}
4182
4183#endif /* _METASLAB_TRACING */
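
/*
 * Illustrative sketch (not part of the original source): the intended life
 * cycle of a zio_alloc_list_t with the tracing interfaces defined above.
 * In the real code the list is embedded in the zio and metaslab_trace_add()
 * is called from the allocation paths below; this stand-alone shape, and
 * the example size and ids passed in, are only for illustration.
 */
#if 0
static void
example_trace_lifecycle(metaslab_group_t *mg, metaslab_t *msp)
{
	zio_alloc_list_t zal;

	metaslab_trace_init(&zal);

	/*
	 * Each allocation step appends one entry recording either the
	 * offset that was handed out or a TRACE_* failure code.
	 */
	metaslab_trace_add(&zal, mg, msp, 8192, 0, TRACE_TOO_SMALL, 0);

	/* When the zio is done, the entries go back to the kmem cache. */
	metaslab_trace_fini(&zal);
}
#endif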
4184
3dfb57a3
DB
4185/*
4186 * ==========================================================================
4187 * Metaslab block operations
4188 * ==========================================================================
4189 */
4190
4191static void
492f64e9
PD
4192metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
4193 int allocator)
3dfb57a3 4194{
3dfb57a3 4195 if (!(flags & METASLAB_ASYNC_ALLOC) ||
492f64e9 4196 (flags & METASLAB_DONT_THROTTLE))
3dfb57a3
DB
4197 return;
4198
1c27024e 4199 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3dfb57a3
DB
4200 if (!mg->mg_class->mc_alloc_throttle_enabled)
4201 return;
4202
c13060e4 4203 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
492f64e9
PD
4204}
4205
4206static void
4207metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4208{
4209 uint64_t max = mg->mg_max_alloc_queue_depth;
4210 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4211 while (cur < max) {
4212 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
4213 cur, cur + 1) == cur) {
4214 atomic_inc_64(
4215 &mg->mg_class->mc_alloc_max_slots[allocator]);
4216 return;
4217 }
4218 cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4219 }
3dfb57a3
DB
4220}
4221
4222void
492f64e9
PD
4223metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
4224 int allocator, boolean_t io_complete)
3dfb57a3 4225{
3dfb57a3 4226 if (!(flags & METASLAB_ASYNC_ALLOC) ||
492f64e9 4227 (flags & METASLAB_DONT_THROTTLE))
3dfb57a3
DB
4228 return;
4229
1c27024e 4230 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
3dfb57a3
DB
4231 if (!mg->mg_class->mc_alloc_throttle_enabled)
4232 return;
4233
424fd7c3 4234 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
492f64e9
PD
4235 if (io_complete)
4236 metaslab_group_increment_qdepth(mg, allocator);
3dfb57a3
DB
4237}
4238
4239void
492f64e9
PD
4240metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
4241 int allocator)
3dfb57a3
DB
4242{
4243#ifdef ZFS_DEBUG
4244 const dva_t *dva = bp->blk_dva;
4245 int ndvas = BP_GET_NDVAS(bp);
3dfb57a3 4246
1c27024e 4247 for (int d = 0; d < ndvas; d++) {
3dfb57a3
DB
4248 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
4249 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
424fd7c3
TS
4250 VERIFY(zfs_refcount_not_held(
4251 &mg->mg_alloc_queue_depth[allocator], tag));
3dfb57a3
DB
4252 }
4253#endif
4254}
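
/*
 * Illustrative sketch (not part of the original source): how the throttling
 * helpers above are meant to pair up over the life of an asynchronous
 * allocation.  The tag is normally the zio pointer; the vdev, flags and
 * allocator values here are placeholders.
 */
#if 0
static void
example_alloc_throttle_lifecycle(spa_t *spa, uint64_t vdev, void *zio_tag)
{
	int flags = METASLAB_ASYNC_ALLOC;
	int allocator = 0;

	/* Charge the group's allocation queue when the DVA is handed out. */
	metaslab_group_alloc_increment(spa, vdev, zio_tag, flags, allocator);

	/*
	 * ... the write is issued ...
	 *
	 * When the I/O completes the charge is dropped and, through
	 * metaslab_group_increment_qdepth(), the group may be allowed a
	 * deeper queue next time.
	 */
	metaslab_group_alloc_decrement(spa, vdev, zio_tag, flags, allocator,
	    B_TRUE);
}
#endif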
4255
34dc7c2f 4256static uint64_t
4e21fd06
DB
4257metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
4258{
4259 uint64_t start;
d2734cce 4260 range_tree_t *rt = msp->ms_allocatable;
4e21fd06
DB
4261 metaslab_class_t *mc = msp->ms_group->mg_class;
4262
93e28d66 4263 ASSERT(MUTEX_HELD(&msp->ms_lock));
4e21fd06 4264 VERIFY(!msp->ms_condensing);
1b939560 4265 VERIFY0(msp->ms_disabled);
4e21fd06
DB
4266
4267 start = mc->mc_ops->msop_alloc(msp, size);
4268 if (start != -1ULL) {
4269 metaslab_group_t *mg = msp->ms_group;
4270 vdev_t *vd = mg->mg_vd;
4271
4272 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4273 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4274 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4275 range_tree_remove(rt, start, size);
1b939560 4276 range_tree_clear(msp->ms_trim, start, size);
4e21fd06 4277
d2734cce 4278 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4e21fd06
DB
4279 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4280
d2734cce 4281 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
f09fda50 4282 msp->ms_allocating_total += size;
4e21fd06
DB
4283
4284 /* Track the last successful allocation */
4285 msp->ms_alloc_txg = txg;
4286 metaslab_verify_space(msp, txg);
4287 }
4288
4289 /*
4290 * Now that we've attempted the allocation we need to update the
4291 * metaslab's maximum block size since it may have changed.
4292 */
c81f1790 4293 msp->ms_max_size = metaslab_largest_allocatable(msp);
4e21fd06
DB
4294 return (start);
4295}
4296
492f64e9
PD
4297/*
4298 * Find the metaslab with the highest weight that is less than what we've
4299 * already tried. In the common case, this means that we will examine each
4300 * metaslab at most once. Note that concurrent callers could reorder metaslabs
4301 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4302 * activated by another thread, and we fail to allocate from the metaslab we
4303 * have selected, we may not try the newly-activated metaslab, and instead
4304 * activate another metaslab. This is not optimal, but generally does not cause
4305 * any problems (a possible exception being if every metaslab is completely full
e1cfd73f 4306 * except for the newly-activated metaslab which we fail to examine).
492f64e9
PD
4307 */
4308static metaslab_t *
4309find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
cc99f275 4310 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
c81f1790
PD
4311 boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4312 boolean_t *was_active)
492f64e9
PD
4313{
4314 avl_index_t idx;
4315 avl_tree_t *t = &mg->mg_metaslab_tree;
4316 metaslab_t *msp = avl_find(t, search, &idx);
4317 if (msp == NULL)
4318 msp = avl_nearest(t, idx, AVL_AFTER);
4319
4320 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4321 int i;
c81f1790 4322 if (!metaslab_should_allocate(msp, asize, try_hard)) {
492f64e9
PD
4323 metaslab_trace_add(zal, mg, msp, asize, d,
4324 TRACE_TOO_SMALL, allocator);
4325 continue;
4326 }
4327
4328 /*
1b939560
BB
4329 * If the selected metaslab is condensing or disabled,
4330 * skip it.
492f64e9 4331 */
1b939560 4332 if (msp->ms_condensing || msp->ms_disabled > 0)
492f64e9
PD
4333 continue;
4334
4335 *was_active = msp->ms_allocator != -1;
4336 /*
4337 * If we're activating as primary, this is our first allocation
4338 * from this disk, so we don't need to check how close we are.
4339 * If the metaslab under consideration was already active,
4340 * we're getting desperate enough to steal another allocator's
4341 * metaslab, so we still don't care about distances.
4342 */
4343 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4344 break;
4345
492f64e9 4346 for (i = 0; i < d; i++) {
cc99f275
DB
4347 if (want_unique &&
4348 !metaslab_is_unique(msp, &dva[i]))
4349 break; /* try another metaslab */
492f64e9
PD
4350 }
4351 if (i == d)
4352 break;
4353 }
4354
4355 if (msp != NULL) {
4356 search->ms_weight = msp->ms_weight;
4357 search->ms_start = msp->ms_start + 1;
4358 search->ms_allocator = msp->ms_allocator;
4359 search->ms_primary = msp->ms_primary;
4360 }
4361 return (msp);
4362}
4363
679b0f2a
PD
4364void
4365metaslab_active_mask_verify(metaslab_t *msp)
4366{
4367 ASSERT(MUTEX_HELD(&msp->ms_lock));
4368
4369 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4370 return;
4371
4372 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4373 return;
4374
4375 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4376 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4377 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4378 VERIFY3S(msp->ms_allocator, !=, -1);
4379 VERIFY(msp->ms_primary);
4380 return;
4381 }
4382
4383 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4384 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4385 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4386 VERIFY3S(msp->ms_allocator, !=, -1);
4387 VERIFY(!msp->ms_primary);
4388 return;
4389 }
4390
4391 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4392 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4393 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4394 VERIFY3S(msp->ms_allocator, ==, -1);
4395 return;
4396 }
4397}
4398
492f64e9 4399/* ARGSUSED */
4e21fd06
DB
4400static uint64_t
4401metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
c81f1790
PD
4402 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4403 int allocator, boolean_t try_hard)
34dc7c2f
BB
4404{
4405 metaslab_t *msp = NULL;
4406 uint64_t offset = -1ULL;
34dc7c2f 4407
679b0f2a 4408 uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
492f64e9
PD
4409 for (int i = 0; i < d; i++) {
4410 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4411 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
34dc7c2f 4412 activation_weight = METASLAB_WEIGHT_SECONDARY;
492f64e9
PD
4413 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4414 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
e38afd34 4415 activation_weight = METASLAB_WEIGHT_CLAIM;
9babb374
BB
4416 break;
4417 }
4418 }
34dc7c2f 4419
492f64e9
PD
4420 /*
4421 * If we don't have enough metaslabs active to fill the entire array, we
4422 * just use the 0th slot.
4423 */
e38afd34 4424 if (mg->mg_ms_ready < mg->mg_allocators * 3)
492f64e9 4425 allocator = 0;
492f64e9
PD
4426
4427 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
4428
1c27024e 4429 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
4e21fd06
DB
4430 search->ms_weight = UINT64_MAX;
4431 search->ms_start = 0;
492f64e9
PD
4432 /*
4433 * At the end of the metaslab tree are the already-active metaslabs,
4434 * first the primaries, then the secondaries. When we resume searching
4435 * through the tree, we need to consider ms_allocator and ms_primary so
4436 * we start in the location right after where we left off, and don't
4437 * accidentally loop forever considering the same metaslabs.
4438 */
4439 search->ms_allocator = -1;
4440 search->ms_primary = B_TRUE;
34dc7c2f 4441 for (;;) {
492f64e9 4442 boolean_t was_active = B_FALSE;
9babb374 4443
34dc7c2f 4444 mutex_enter(&mg->mg_lock);
4e21fd06 4445
492f64e9
PD
4446 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4447 mg->mg_primaries[allocator] != NULL) {
4448 msp = mg->mg_primaries[allocator];
679b0f2a
PD
4449
4450 /*
4451 * Even though we don't hold the ms_lock for the
4452 * primary metaslab, those fields should not
e1cfd73f 4453 * change while we hold the mg_lock. Thus it is
679b0f2a
PD
4454 * safe to make assertions on them.
4455 */
4456 ASSERT(msp->ms_primary);
4457 ASSERT3S(msp->ms_allocator, ==, allocator);
4458 ASSERT(msp->ms_loaded);
4459
492f64e9 4460 was_active = B_TRUE;
f09fda50 4461 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
492f64e9 4462 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
e38afd34 4463 mg->mg_secondaries[allocator] != NULL) {
492f64e9 4464 msp = mg->mg_secondaries[allocator];
679b0f2a
PD
4465
4466 /*
4467 * See comment above about the similar assertions
4468 * for the primary metaslab.
4469 */
4470 ASSERT(!msp->ms_primary);
4471 ASSERT3S(msp->ms_allocator, ==, allocator);
4472 ASSERT(msp->ms_loaded);
4473
492f64e9 4474 was_active = B_TRUE;
f09fda50 4475 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
492f64e9
PD
4476 } else {
4477 msp = find_valid_metaslab(mg, activation_weight, dva, d,
c81f1790
PD
4478 want_unique, asize, allocator, try_hard, zal,
4479 search, &was_active);
34dc7c2f 4480 }
492f64e9 4481
34dc7c2f 4482 mutex_exit(&mg->mg_lock);
4e21fd06
DB
4483 if (msp == NULL) {
4484 kmem_free(search, sizeof (*search));
34dc7c2f 4485 return (-1ULL);
4e21fd06 4486 }
ac72fac3 4487 mutex_enter(&msp->ms_lock);
679b0f2a
PD
4488
4489 metaslab_active_mask_verify(msp);
4490
4491 /*
4492 * This code is disabled because of issues with
4493 * tracepoints in non-gpl kernel modules.
4494 */
4495#if 0
4496 DTRACE_PROBE3(ms__activation__attempt,
4497 metaslab_t *, msp, uint64_t, activation_weight,
4498 boolean_t, was_active);
4499#endif
4500
34dc7c2f
BB
4501 /*
4502 * Ensure that the metaslab we have selected is still
4503 * capable of handling our request. It's possible that
4504 * another thread may have changed the weight while we
4e21fd06 4505 * were blocked on the metaslab lock. We check the
f09fda50 4506 * active status first to see if we need to select
4e21fd06 4507 * a new metaslab.
34dc7c2f 4508 */
4e21fd06 4509 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
679b0f2a 4510 ASSERT3S(msp->ms_allocator, ==, -1);
34dc7c2f
BB
4511 mutex_exit(&msp->ms_lock);
4512 continue;
4513 }
4514
492f64e9 4515 /*
679b0f2a
PD
4516 * If the metaslab was activated for another allocator
4517 * while we were waiting in the ms_lock above, or it's
4518 * a primary and we're seeking a secondary (or vice versa),
4519 * we go back and select a new metaslab.
492f64e9
PD
4520 */
4521 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4522 (msp->ms_allocator != -1) &&
4523 (msp->ms_allocator != allocator || ((activation_weight ==
4524 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
679b0f2a
PD
4525 ASSERT(msp->ms_loaded);
4526 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4527 msp->ms_allocator != -1);
492f64e9
PD
4528 mutex_exit(&msp->ms_lock);
4529 continue;
4530 }
4531
679b0f2a
PD
4532 /*
4533 * This metaslab was used for claiming regions allocated
4534 * by the ZIL during pool import. Once these regions are
4535 * claimed we don't need to keep the CLAIM bit set
4536 * anymore. Passivate this metaslab to zero its activation
4537 * mask.
4538 */
e38afd34 4539 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
4540 activation_weight != METASLAB_WEIGHT_CLAIM) {
679b0f2a
PD
4541 ASSERT(msp->ms_loaded);
4542 ASSERT3S(msp->ms_allocator, ==, -1);
492f64e9
PD
4543 metaslab_passivate(msp, msp->ms_weight &
4544 ~METASLAB_WEIGHT_CLAIM);
34dc7c2f
BB
4545 mutex_exit(&msp->ms_lock);
4546 continue;
4547 }
4548
f09fda50 4549 metaslab_set_selected_txg(msp, txg);
679b0f2a
PD
4550
4551 int activation_error =
4552 metaslab_activate(msp, allocator, activation_weight);
4553 metaslab_active_mask_verify(msp);
4554
4555 /*
4556 * If the metaslab was activated by another thread for
4557 * another allocator or activation_weight (EBUSY), or it
4558 * failed because another metaslab was assigned as primary
4559 * for this allocator (EEXIST) we continue using this
4560 * metaslab for our allocation, rather than going on to a
4561 * worse metaslab (we waited for that metaslab to be loaded
4562 * after all).
4563 *
fe0ea848
PD
4564 * If the activation failed due to an I/O error or ENOSPC we
4565 * skip to the next metaslab.
679b0f2a
PD
4566 */
4567 boolean_t activated;
4568 if (activation_error == 0) {
4569 activated = B_TRUE;
4570 } else if (activation_error == EBUSY ||
4571 activation_error == EEXIST) {
4572 activated = B_FALSE;
4573 } else {
34dc7c2f
BB
4574 mutex_exit(&msp->ms_lock);
4575 continue;
4576 }
679b0f2a 4577 ASSERT(msp->ms_loaded);
4e21fd06
DB
4578
4579 /*
4580 * Now that we have the lock, recheck to see if we should
4581 * continue to use this metaslab for this allocation. The
679b0f2a
PD
4582 * metaslab is now loaded so metaslab_should_allocate()
4583 * can accurately determine if the allocation attempt should
4e21fd06
DB
4584 * proceed.
4585 */
c81f1790 4586 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4e21fd06
DB
4587 /* Passivate this metaslab and select a new one. */
4588 metaslab_trace_add(zal, mg, msp, asize, d,
492f64e9 4589 TRACE_TOO_SMALL, allocator);
4e21fd06
DB
4590 goto next;
4591 }
4592
7a614407 4593 /*
679b0f2a
PD
4594 * If this metaslab is currently condensing then pick again
4595 * as we can't manipulate this metaslab until it's committed
619f0976
GW
4596 * to disk. If this metaslab is being initialized, we shouldn't
4597 * allocate from it since the allocated region might be
4598 * overwritten after allocation.
7a614407 4599 */
93cf2076 4600 if (msp->ms_condensing) {
4e21fd06 4601 metaslab_trace_add(zal, mg, msp, asize, d,
492f64e9 4602 TRACE_CONDENSING, allocator);
679b0f2a
PD
4603 if (activated) {
4604 metaslab_passivate(msp, msp->ms_weight &
4605 ~METASLAB_ACTIVE_MASK);
4606 }
7a614407
GW
4607 mutex_exit(&msp->ms_lock);
4608 continue;
1b939560 4609 } else if (msp->ms_disabled > 0) {
619f0976 4610 metaslab_trace_add(zal, mg, msp, asize, d,
1b939560 4611 TRACE_DISABLED, allocator);
679b0f2a
PD
4612 if (activated) {
4613 metaslab_passivate(msp, msp->ms_weight &
4614 ~METASLAB_ACTIVE_MASK);
4615 }
619f0976
GW
4616 mutex_exit(&msp->ms_lock);
4617 continue;
7a614407
GW
4618 }
4619
4e21fd06 4620 offset = metaslab_block_alloc(msp, asize, txg);
492f64e9 4621 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
4e21fd06
DB
4622
4623 if (offset != -1ULL) {
4624 /* Proactively passivate the metaslab, if needed */
679b0f2a
PD
4625 if (activated)
4626 metaslab_segment_may_passivate(msp);
34dc7c2f 4627 break;
4e21fd06
DB
4628 }
4629next:
4630 ASSERT(msp->ms_loaded);
4631
679b0f2a
PD
4632 /*
4633 * This code is disabled because of issues with
4634 * tracepoints in non-GPL kernel modules.
4635 */
4636#if 0
4637 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
4638 uint64_t, asize);
4639#endif
4640
4e21fd06
DB
4641 /*
4642 * We were unable to allocate from this metaslab so determine
4643 * a new weight for this metaslab. Now that we have loaded
4644 * the metaslab we can provide a better hint to the metaslab
4645 * selector.
4646 *
4647 * For space-based metaslabs, we use the maximum block size.
4648 * This information is only available when the metaslab
4649 * is loaded and is more accurate than the generic free
4650 * space weight that was calculated by metaslab_weight().
4651 * This information allows us to quickly compare the maximum
4652 * available allocation in the metaslab to the allocation
4653 * size being requested.
4654 *
4655 * For segment-based metaslabs, determine the new weight
4656 * based on the highest bucket in the range tree. We
4657 * explicitly use the loaded segment weight (i.e. the range
4658 * tree histogram) since it contains the space that is
4659 * currently available for allocation and is accurate
4660 * even within a sync pass.
4661 */
679b0f2a 4662 uint64_t weight;
4e21fd06 4663 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
c81f1790 4664 weight = metaslab_largest_allocatable(msp);
4e21fd06 4665 WEIGHT_SET_SPACEBASED(weight);
679b0f2a
PD
4666 } else {
4667 weight = metaslab_weight_from_range_tree(msp);
4668 }
4669
4670 if (activated) {
4e21fd06
DB
4671 metaslab_passivate(msp, weight);
4672 } else {
679b0f2a
PD
4673 /*
4674 * For the case where we use the metaslab that is
4675 * active for another allocator we want to make
4676 * sure that we retain the activation mask.
4677 *
4678 * Note that we could attempt to use something like
4679 * metaslab_recalculate_weight_and_sort() that
4680 * retains the activation mask here. That function
4681 * uses metaslab_weight() to set the weight though
4682 * which is not as accurate as the calculations
4683 * above.
4684 */
4685 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
4686 metaslab_group_sort(mg, msp, weight);
4e21fd06 4687 }
679b0f2a 4688 metaslab_active_mask_verify(msp);
34dc7c2f 4689
4e21fd06
DB
4690 /*
4691 * We have just failed an allocation attempt, check
4692 * that metaslab_should_allocate() agrees. Otherwise,
4693 * we may end up in an infinite loop retrying the same
4694 * metaslab.
4695 */
c81f1790 4696 ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
cc99f275 4697
34dc7c2f
BB
4698 mutex_exit(&msp->ms_lock);
4699 }
4e21fd06
DB
4700 mutex_exit(&msp->ms_lock);
4701 kmem_free(search, sizeof (*search));
4702 return (offset);
4703}
34dc7c2f 4704
4e21fd06
DB
4705static uint64_t
4706metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
c81f1790
PD
4707 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4708 int allocator, boolean_t try_hard)
4e21fd06
DB
4709{
4710 uint64_t offset;
4711 ASSERT(mg->mg_initialized);
34dc7c2f 4712
cc99f275 4713 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
c81f1790 4714 dva, d, allocator, try_hard);
34dc7c2f 4715
4e21fd06
DB
4716 mutex_enter(&mg->mg_lock);
4717 if (offset == -1ULL) {
4718 mg->mg_failed_allocations++;
4719 metaslab_trace_add(zal, mg, NULL, asize, d,
492f64e9 4720 TRACE_GROUP_FAILURE, allocator);
4e21fd06
DB
4721 if (asize == SPA_GANGBLOCKSIZE) {
4722 /*
4723 * This metaslab group was unable to allocate
4724 * the minimum gang block size so it must be out of
4725 * space. We must notify the allocation throttle
4726 * to start skipping allocation attempts to this
4727 * metaslab group until more space becomes available.
4728 * Note: this failure cannot be caused by the
4729 * allocation throttle since the allocation throttle
4730 * is only responsible for skipping devices and
4731 * not failing block allocations.
4732 */
4733 mg->mg_no_free_space = B_TRUE;
4734 }
4735 }
4736 mg->mg_allocations++;
4737 mutex_exit(&mg->mg_lock);
34dc7c2f
BB
4738 return (offset);
4739}
4740
4741/*
4742 * Allocate a block for the specified i/o.
4743 */
a1d477c2 4744int
34dc7c2f 4745metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
4e21fd06 4746 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
492f64e9 4747 zio_alloc_list_t *zal, int allocator)
34dc7c2f 4748{
920dd524 4749 metaslab_group_t *mg, *fast_mg, *rotor;
34dc7c2f 4750 vdev_t *vd;
4e21fd06 4751 boolean_t try_hard = B_FALSE;
34dc7c2f
BB
4752
4753 ASSERT(!DVA_IS_VALID(&dva[d]));
4754
4755 /*
4756 * For testing, make some blocks above a certain size be gang blocks.
09b85f2d
BB
4757 * This will result in more split blocks when using device removal,
4758 * and a large number of split blocks coupled with ztest-induced
4759 * damage can result in extremely long reconstruction times. This
4760 * will also test spilling from special to normal.
34dc7c2f 4761 */
09b85f2d 4762 if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) {
492f64e9
PD
4763 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
4764 allocator);
2e528b49 4765 return (SET_ERROR(ENOSPC));
4e21fd06 4766 }
34dc7c2f
BB
4767
4768 /*
4769 * Start at the rotor and loop through all mgs until we find something.
428870ff 4770 * Note that there's no locking on mc_rotor or mc_aliquot because
34dc7c2f
BB
4771 * nothing actually breaks if we miss a few updates -- we just won't
4772 * allocate quite as evenly. It all balances out over time.
4773 *
4774 * If we are doing ditto or log blocks, try to spread them across
4775 * consecutive vdevs. If we're forced to reuse a vdev before we've
4776 * allocated all of our ditto blocks, then try and spread them out on
4777 * that vdev as much as possible. If it turns out to not be possible,
4778 * gradually lower our standards until anything becomes acceptable.
4779 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
4780 * gives us hope of containing our fault domains to something we're
4781 * able to reason about. Otherwise, any two top-level vdev failures
4782 * will guarantee the loss of data. With consecutive allocation,
4783 * only two adjacent top-level vdev failures will result in data loss.
4784 *
4785 * If we are doing gang blocks (hintdva is non-NULL), try to keep
4786 * ourselves on the same vdev as our gang block header. That
4787 * way, we can hope for locality in vdev_cache, plus it makes our
4788 * fault domains something tractable.
4789 */
4790 if (hintdva) {
4791 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
428870ff
BB
4792
4793 /*
4794 * It's possible the vdev we're using as the hint no
a1d477c2
MA
4795 * longer exists or its mg has been closed (e.g. by
4796 * device removal). Consult the rotor when
428870ff
BB
4797 * all else fails.
4798 */
a1d477c2 4799 if (vd != NULL && vd->vdev_mg != NULL) {
34dc7c2f 4800 mg = vd->vdev_mg;
428870ff
BB
4801
4802 if (flags & METASLAB_HINTBP_AVOID &&
4803 mg->mg_next != NULL)
4804 mg = mg->mg_next;
4805 } else {
4806 mg = mc->mc_rotor;
4807 }
34dc7c2f
BB
4808 } else if (d != 0) {
4809 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
4810 mg = vd->vdev_mg->mg_next;
920dd524
ED
4811 } else if (flags & METASLAB_FASTWRITE) {
4812 mg = fast_mg = mc->mc_rotor;
4813
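/*
 * Walk every metaslab group on the rotor and pick the one whose vdev
 * has the fewest pending fastwrite bytes, so fastwrite (ZIL) blocks
 * are steered toward the vdev with the least outstanding fastwrite
 * data.
 */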
4814 do {
4815 if (fast_mg->mg_vd->vdev_pending_fastwrite <
4816 mg->mg_vd->vdev_pending_fastwrite)
4817 mg = fast_mg;
4818 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
4819
34dc7c2f 4820 } else {
cc99f275 4821 ASSERT(mc->mc_rotor != NULL);
34dc7c2f
BB
4822 mg = mc->mc_rotor;
4823 }
4824
4825 /*
428870ff
BB
4826 * If the hint put us into the wrong metaslab class, or into a
4827 * metaslab group that has been passivated, just follow the rotor.
34dc7c2f 4828 */
428870ff 4829 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
34dc7c2f
BB
4830 mg = mc->mc_rotor;
4831
4832 rotor = mg;
4833top:
34dc7c2f 4834 do {
4e21fd06 4835 boolean_t allocatable;
428870ff 4836
3dfb57a3 4837 ASSERT(mg->mg_activation_count == 1);
34dc7c2f 4838 vd = mg->mg_vd;
fb5f0bc8 4839
34dc7c2f 4840 /*
b128c09f 4841 * Don't allocate from faulted devices.
34dc7c2f 4842 */
4e21fd06 4843 if (try_hard) {
fb5f0bc8
BB
4844 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
4845 allocatable = vdev_allocatable(vd);
4846 spa_config_exit(spa, SCL_ZIO, FTAG);
4847 } else {
4848 allocatable = vdev_allocatable(vd);
4849 }
ac72fac3
GW
4850
4851 /*
4852 * Determine if the selected metaslab group is eligible
3dfb57a3
DB
4853 * for allocations. If we're ganging then don't allow
4854 * this metaslab group to skip allocations since that would
4855 * inadvertently return ENOSPC and suspend the pool
ac72fac3
GW
4856 * even though space is still available.
4857 */
4e21fd06 4858 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3dfb57a3 4859 allocatable = metaslab_group_allocatable(mg, rotor,
c197a77c 4860 psize, allocator, d);
3dfb57a3 4861 }
ac72fac3 4862
4e21fd06
DB
4863 if (!allocatable) {
4864 metaslab_trace_add(zal, mg, NULL, psize, d,
492f64e9 4865 TRACE_NOT_ALLOCATABLE, allocator);
34dc7c2f 4866 goto next;
4e21fd06 4867 }
fb5f0bc8 4868
3dfb57a3
DB
4869 ASSERT(mg->mg_initialized);
4870
34dc7c2f 4871 /*
4e21fd06
DB
4872 * Avoid writing single-copy data to a failing,
4873 * non-redundant vdev, unless we've already tried all
4874 * other vdevs.
34dc7c2f
BB
4875 */
4876 if ((vd->vdev_stat.vs_write_errors > 0 ||
4877 vd->vdev_state < VDEV_STATE_HEALTHY) &&
4e21fd06
DB
4878 d == 0 && !try_hard && vd->vdev_children == 0) {
4879 metaslab_trace_add(zal, mg, NULL, psize, d,
492f64e9 4880 TRACE_VDEV_ERROR, allocator);
34dc7c2f
BB
4881 goto next;
4882 }
4883
4884 ASSERT(mg->mg_class == mc);
4885
1c27024e 4886 uint64_t asize = vdev_psize_to_asize(vd, psize);
34dc7c2f
BB
4887 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
4888
cc99f275
DB
4889 /*
4890 * If we don't need to try hard, then require that the
e1cfd73f 4891 * block be on a different metaslab from any other DVAs
cc99f275
DB
4892 * in this BP (unique=true). If we are trying hard, then
4893 * allow any metaslab to be used (unique=false).
4894 */
1c27024e 4895 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
c81f1790 4896 !try_hard, dva, d, allocator, try_hard);
3dfb57a3 4897
34dc7c2f
BB
4898 if (offset != -1ULL) {
4899 /*
4900 * If we've just selected this metaslab group,
4901 * figure out whether the corresponding vdev is
4902 * over- or under-used relative to the pool,
4903 * and set an allocation bias to even it out.
bb3250d0
ED
4904 *
4905 * Bias is also used to compensate for unequally
4906 * sized vdevs so that space is allocated fairly.
34dc7c2f 4907 */
f3a7f661 4908 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
34dc7c2f 4909 vdev_stat_t *vs = &vd->vdev_stat;
bb3250d0
ED
4910 int64_t vs_free = vs->vs_space - vs->vs_alloc;
4911 int64_t mc_free = mc->mc_space - mc->mc_alloc;
4912 int64_t ratio;
34dc7c2f
BB
4913
4914 /*
6d974228
GW
4915 * Calculate how much more or less we should
4916 * try to allocate from this device during
4917 * this iteration around the rotor.
6d974228 4918 *
bb3250d0
ED
4919 * This basically introduces a zero-centered
4920 * bias towards the devices with the most
4921 * free space, while compensating for vdev
4922 * size differences.
4923 *
4924 * Examples:
4925 * vdev V1 = 16M/128M
4926 * vdev V2 = 16M/128M
4927 * ratio(V1) = 100% ratio(V2) = 100%
4928 *
4929 * vdev V1 = 16M/128M
4930 * vdev V2 = 64M/128M
4931 * ratio(V1) = 127% ratio(V2) = 72%
6d974228 4932 *
bb3250d0
ED
4933 * vdev V1 = 16M/128M
4934 * vdev V2 = 64M/512M
4935 * ratio(V1) = 40% ratio(V2) = 160%
34dc7c2f 4936 */
bb3250d0
ED
4937 ratio = (vs_free * mc->mc_alloc_groups * 100) /
4938 (mc_free + 1);
4939 mg->mg_bias = ((ratio - 100) *
6d974228 4940 (int64_t)mg->mg_aliquot) / 100;
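/*
 * Continuing the example above: with ratio(V1) = 127% and
 * ratio(V2) = 72%, V1 ends up with a bias of roughly +27% of its
 * aliquot per pass around the rotor and V2 with roughly -28%,
 * steering new writes toward the vdev with proportionally more
 * free space.
 */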
f3a7f661
GW
4941 } else if (!metaslab_bias_enabled) {
4942 mg->mg_bias = 0;
34dc7c2f
BB
4943 }
4944
920dd524
ED
4945 if ((flags & METASLAB_FASTWRITE) ||
4946 atomic_add_64_nv(&mc->mc_aliquot, asize) >=
34dc7c2f
BB
4947 mg->mg_aliquot + mg->mg_bias) {
4948 mc->mc_rotor = mg->mg_next;
428870ff 4949 mc->mc_aliquot = 0;
34dc7c2f
BB
4950 }
4951
4952 DVA_SET_VDEV(&dva[d], vd->vdev_id);
4953 DVA_SET_OFFSET(&dva[d], offset);
e3e7cf60
D
4954 DVA_SET_GANG(&dva[d],
4955 ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
34dc7c2f
BB
4956 DVA_SET_ASIZE(&dva[d], asize);
4957
920dd524
ED
4958 if (flags & METASLAB_FASTWRITE) {
4959 atomic_add_64(&vd->vdev_pending_fastwrite,
4960 psize);
920dd524
ED
4961 }
4962
34dc7c2f
BB
4963 return (0);
4964 }
4965next:
4966 mc->mc_rotor = mg->mg_next;
428870ff 4967 mc->mc_aliquot = 0;
34dc7c2f
BB
4968 } while ((mg = mg->mg_next) != rotor);
4969
4e21fd06
DB
4970 /*
4971 * If we haven't tried hard, do so now.
4972 */
4973 if (!try_hard) {
4974 try_hard = B_TRUE;
fb5f0bc8
BB
4975 goto top;
4976 }
4977
34dc7c2f
BB
4978 bzero(&dva[d], sizeof (dva_t));
4979
492f64e9 4980 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
2e528b49 4981 return (SET_ERROR(ENOSPC));
34dc7c2f
BB
4982}
4983
a1d477c2
MA
4984void
4985metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
d2734cce 4986 boolean_t checkpoint)
a1d477c2
MA
4987{
4988 metaslab_t *msp;
d2734cce 4989 spa_t *spa = vd->vdev_spa;
a1d477c2 4990
a1d477c2
MA
4991 ASSERT(vdev_is_concrete(vd));
4992 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4993 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4994
4995 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4996
4997 VERIFY(!msp->ms_condensing);
4998 VERIFY3U(offset, >=, msp->ms_start);
4999 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
5000 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5001 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
5002
5003 metaslab_check_free_impl(vd, offset, asize);
d2734cce 5004
a1d477c2 5005 mutex_enter(&msp->ms_lock);
d2734cce
SD
5006 if (range_tree_is_empty(msp->ms_freeing) &&
5007 range_tree_is_empty(msp->ms_checkpointing)) {
5008 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
5009 }
5010
5011 if (checkpoint) {
5012 ASSERT(spa_has_checkpoint(spa));
5013 range_tree_add(msp->ms_checkpointing, offset, asize);
5014 } else {
5015 range_tree_add(msp->ms_freeing, offset, asize);
a1d477c2 5016 }
a1d477c2
MA
5017 mutex_exit(&msp->ms_lock);
5018}
5019
5020/* ARGSUSED */
5021void
5022metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5023 uint64_t size, void *arg)
5024{
d2734cce
SD
5025 boolean_t *checkpoint = arg;
5026
5027 ASSERT3P(checkpoint, !=, NULL);
a1d477c2
MA
5028
5029 if (vd->vdev_ops->vdev_op_remap != NULL)
d2734cce 5030 vdev_indirect_mark_obsolete(vd, offset, size);
a1d477c2 5031 else
d2734cce 5032 metaslab_free_impl(vd, offset, size, *checkpoint);
a1d477c2
MA
5033}
5034
5035static void
5036metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
d2734cce 5037 boolean_t checkpoint)
a1d477c2
MA
5038{
5039 spa_t *spa = vd->vdev_spa;
5040
5041 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5042
d2734cce 5043 if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
a1d477c2
MA
5044 return;
5045
5046 if (spa->spa_vdev_removal != NULL &&
9e052db4 5047 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
a1d477c2
MA
5048 vdev_is_concrete(vd)) {
5049 /*
5050 * Note: we check if the vdev is concrete because when
5051 * we complete the removal, we first change the vdev to be
5052 * an indirect vdev (in open context), and then (in syncing
5053 * context) clear spa_vdev_removal.
5054 */
d2734cce 5055 free_from_removing_vdev(vd, offset, size);
a1d477c2 5056 } else if (vd->vdev_ops->vdev_op_remap != NULL) {
d2734cce 5057 vdev_indirect_mark_obsolete(vd, offset, size);
a1d477c2 5058 vd->vdev_ops->vdev_op_remap(vd, offset, size,
d2734cce 5059 metaslab_free_impl_cb, &checkpoint);
a1d477c2 5060 } else {
d2734cce 5061 metaslab_free_concrete(vd, offset, size, checkpoint);
a1d477c2
MA
5062 }
5063}
5064
5065typedef struct remap_blkptr_cb_arg {
5066 blkptr_t *rbca_bp;
5067 spa_remap_cb_t rbca_cb;
5068 vdev_t *rbca_remap_vd;
5069 uint64_t rbca_remap_offset;
5070 void *rbca_cb_arg;
5071} remap_blkptr_cb_arg_t;
5072
5073void
5074remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5075 uint64_t size, void *arg)
5076{
5077 remap_blkptr_cb_arg_t *rbca = arg;
5078 blkptr_t *bp = rbca->rbca_bp;
5079
5080 /* We cannot remap split blocks. */
5081 if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5082 return;
5083 ASSERT0(inner_offset);
5084
5085 if (rbca->rbca_cb != NULL) {
5086 /*
5087 * At this point we know that we are not handling split
5088 * blocks and we invoke the callback on the previous
5089 * vdev which must be indirect.
5090 */
5091 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5092
5093 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5094 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5095
5096 /* set up remap_blkptr_cb_arg for the next call */
5097 rbca->rbca_remap_vd = vd;
5098 rbca->rbca_remap_offset = offset;
5099 }
5100
5101 /*
5102 * The phys birth time is that of dva[0]. This ensures that we know
5103 * when each dva was written, so that resilver can determine which
5104 * blocks need to be scrubbed (i.e. those written during the time
5105 * the vdev was offline). It also ensures that the key used in
5106 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
5107 * we didn't change the phys_birth, a lookup in the ARC for a
5108 * remapped BP could find the data that was previously stored at
5109 * this vdev + offset.
5110 */
5111 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5112 DVA_GET_VDEV(&bp->blk_dva[0]));
5113 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5114 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
5115 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5116
5117 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5118 DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5119}
5120
34dc7c2f 5121/*
a1d477c2
MA
5122 * If the block pointer contains any indirect DVAs, modify them to refer to
5123 * concrete DVAs. Note that this will sometimes not be possible, leaving
5124 * the indirect DVA in place. This happens if the indirect DVA spans multiple
5125 * segments in the mapping (i.e. it is a "split block").
5126 *
5127 * If the BP was remapped, calls the callback on the original dva (note the
5128 * callback can be called multiple times if the original indirect DVA refers
5129 * to another indirect DVA, etc).
5130 *
5131 * Returns TRUE if the BP was remapped.
34dc7c2f 5132 */
a1d477c2
MA
5133boolean_t
5134spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
34dc7c2f 5135{
a1d477c2
MA
5136 remap_blkptr_cb_arg_t rbca;
5137
5138 if (!zfs_remap_blkptr_enable)
5139 return (B_FALSE);
5140
5141 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
5142 return (B_FALSE);
5143
5144 /*
5145 * Dedup BPs cannot be remapped, because ddt_phys_select() depends
5146 * on DVA[0] being the same in the BP as in the DDT (dedup table).
5147 */
5148 if (BP_GET_DEDUP(bp))
5149 return (B_FALSE);
5150
5151 /*
5152 * Gang blocks cannot be remapped, because
5153 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
5154 * the BP used to read the gang block header (GBH) being the same
5155 * as the DVA[0] that we allocated for the GBH.
5156 */
5157 if (BP_IS_GANG(bp))
5158 return (B_FALSE);
5159
5160 /*
5162 * Embedded BPs have no DVA to remap.
5162 */
5163 if (BP_GET_NDVAS(bp) < 1)
5164 return (B_FALSE);
5165
5166 /*
5167 * Note: we only remap dva[0]. If we remapped other dvas, we
5168 * would no longer know what their phys birth txg is.
5169 */
5170 dva_t *dva = &bp->blk_dva[0];
5171
34dc7c2f
BB
5172 uint64_t offset = DVA_GET_OFFSET(dva);
5173 uint64_t size = DVA_GET_ASIZE(dva);
a1d477c2
MA
5174 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
5175
5176 if (vd->vdev_ops->vdev_op_remap == NULL)
5177 return (B_FALSE);
5178
5179 rbca.rbca_bp = bp;
5180 rbca.rbca_cb = callback;
5181 rbca.rbca_remap_vd = vd;
5182 rbca.rbca_remap_offset = offset;
5183 rbca.rbca_cb_arg = arg;
5184
5185 /*
5186 * remap_blkptr_cb() will be called in order for each level of
5187 * indirection, until a concrete vdev is reached or a split block is
5188 * encountered. rbca_remap_vd and rbca_remap_offset are updated within
5189 * the callback as we go from one indirect vdev to the next (either concrete
5190 * or indirect again) in that order.
5191 */
5192 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5193
5194 /* Check if the DVA wasn't remapped because it is a split block */
5195 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5196 return (B_FALSE);
5197
5198 return (B_TRUE);
5199}
5200
5201/*
5202 * Undo the allocation of a DVA which happened in the given transaction group.
5203 */
5204void
5205metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5206{
34dc7c2f 5207 metaslab_t *msp;
a1d477c2
MA
5208 vdev_t *vd;
5209 uint64_t vdev = DVA_GET_VDEV(dva);
5210 uint64_t offset = DVA_GET_OFFSET(dva);
5211 uint64_t size = DVA_GET_ASIZE(dva);
5212
5213 ASSERT(DVA_IS_VALID(dva));
5214 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
34dc7c2f 5215
34dc7c2f
BB
5216 if (txg > spa_freeze_txg(spa))
5217 return;
5218
7d2868d5 5219 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
34dc7c2f 5220 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
7d2868d5
BB
5221 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
5222 (u_longlong_t)vdev, (u_longlong_t)offset,
5223 (u_longlong_t)size);
34dc7c2f
BB
5224 return;
5225 }
5226
a1d477c2
MA
5227 ASSERT(!vd->vdev_removing);
5228 ASSERT(vdev_is_concrete(vd));
5229 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5230 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
34dc7c2f
BB
5231
5232 if (DVA_GET_GANG(dva))
5233 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5234
a1d477c2 5235 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
93cf2076 5236
a1d477c2 5237 mutex_enter(&msp->ms_lock);
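/*
 * Move the extent from this txg's in-flight allocating tree back
 * into ms_allocatable, and adjust ms_allocating_total so the
 * in-flight byte count stays accurate.
 */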
d2734cce 5238 range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
a1d477c2 5239 offset, size);
f09fda50 5240 msp->ms_allocating_total -= size;
34dc7c2f 5241
a1d477c2
MA
5242 VERIFY(!msp->ms_condensing);
5243 VERIFY3U(offset, >=, msp->ms_start);
5244 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
d2734cce 5245 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
a1d477c2
MA
5246 msp->ms_size);
5247 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5248 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
d2734cce 5249 range_tree_add(msp->ms_allocatable, offset, size);
34dc7c2f
BB
5250 mutex_exit(&msp->ms_lock);
5251}
5252
5253/*
d2734cce 5254 * Free the block represented by the given DVA.
34dc7c2f 5255 */
a1d477c2 5256void
d2734cce 5257metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
34dc7c2f
BB
5258{
5259 uint64_t vdev = DVA_GET_VDEV(dva);
5260 uint64_t offset = DVA_GET_OFFSET(dva);
5261 uint64_t size = DVA_GET_ASIZE(dva);
a1d477c2 5262 vdev_t *vd = vdev_lookup_top(spa, vdev);
34dc7c2f
BB
5263
5264 ASSERT(DVA_IS_VALID(dva));
a1d477c2 5265 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
34dc7c2f 5266
a1d477c2 5267 if (DVA_GET_GANG(dva)) {
34dc7c2f 5268 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
34dc7c2f
BB
5269 }
5270
d2734cce 5271 metaslab_free_impl(vd, offset, size, checkpoint);
34dc7c2f
BB
5272}
5273
3dfb57a3
DB
5274/*
5275 * Reserve some allocation slots. The reservation system must be called
5276 * before we call into the allocator. If there aren't any available slots
5277 * then the I/O will be throttled until an I/O completes and its slots are
5278 * freed up. The function returns true if it was successful in placing
5279 * the reservation.
5280 */
5281boolean_t
492f64e9
PD
5282metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5283 zio_t *zio, int flags)
3dfb57a3
DB
5284{
5285 uint64_t available_slots = 0;
3dfb57a3 5286 boolean_t slot_reserved = B_FALSE;
492f64e9 5287 uint64_t max = mc->mc_alloc_max_slots[allocator];
3dfb57a3
DB
5288
5289 ASSERT(mc->mc_alloc_throttle_enabled);
5290 mutex_enter(&mc->mc_lock);
5291
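/*
 * Reservations are tracked per allocator in a refcount that uses the
 * zio as the holder tag, which is what lets
 * metaslab_class_throttle_unreserve() release them one slot at a time.
 */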
492f64e9 5292 uint64_t reserved_slots =
424fd7c3 5293 zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
492f64e9
PD
5294 if (reserved_slots < max)
5295 available_slots = max - reserved_slots;
3dfb57a3 5296
cc99f275
DB
5297 if (slots <= available_slots || GANG_ALLOCATION(flags) ||
5298 flags & METASLAB_MUST_RESERVE) {
3dfb57a3
DB
5299 /*
5300 * We reserve the slots individually so that we can unreserve
5301 * them individually when an I/O completes.
5302 */
1c27024e 5303 for (int d = 0; d < slots; d++) {
492f64e9 5304 reserved_slots =
c13060e4 5305 zfs_refcount_add(&mc->mc_alloc_slots[allocator],
492f64e9 5306 zio);
3dfb57a3
DB
5307 }
5308 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5309 slot_reserved = B_TRUE;
5310 }
5311
5312 mutex_exit(&mc->mc_lock);
5313 return (slot_reserved);
5314}
5315
5316void
492f64e9
PD
5317metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5318 int allocator, zio_t *zio)
3dfb57a3 5319{
3dfb57a3
DB
5320 ASSERT(mc->mc_alloc_throttle_enabled);
5321 mutex_enter(&mc->mc_lock);
1c27024e 5322 for (int d = 0; d < slots; d++) {
424fd7c3 5323 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
492f64e9 5324 zio);
3dfb57a3
DB
5325 }
5326 mutex_exit(&mc->mc_lock);
5327}
5328
a1d477c2
MA
5329static int
5330metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
5331 uint64_t txg)
5332{
5333 metaslab_t *msp;
5334 spa_t *spa = vd->vdev_spa;
5335 int error = 0;
5336
5337 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
7ab96299 5338 return (SET_ERROR(ENXIO));
a1d477c2
MA
5339
5340 ASSERT3P(vd->vdev_ms, !=, NULL);
5341 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5342
5343 mutex_enter(&msp->ms_lock);
5344
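/*
 * Activate the metaslab (loading it if necessary) with the CLAIM
 * weight so the claimed range can be removed from ms_allocatable
 * below. EBUSY only means the metaslab is already loaded and
 * active, so it is not treated as a failure.
 */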
7ab96299 5345 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
492f64e9 5346 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
7ab96299
TC
5347 if (error == EBUSY) {
5348 ASSERT(msp->ms_loaded);
5349 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
5350 error = 0;
5351 }
5352 }
a1d477c2 5353
d2734cce
SD
5354 if (error == 0 &&
5355 !range_tree_contains(msp->ms_allocatable, offset, size))
a1d477c2
MA
5356 error = SET_ERROR(ENOENT);
5357
5358 if (error || txg == 0) { /* txg == 0 indicates dry run */
5359 mutex_exit(&msp->ms_lock);
5360 return (error);
5361 }
5362
5363 VERIFY(!msp->ms_condensing);
5364 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5365 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
d2734cce
SD
5366 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5367 msp->ms_size);
5368 range_tree_remove(msp->ms_allocatable, offset, size);
1b939560 5369 range_tree_clear(msp->ms_trim, offset, size);
a1d477c2
MA
5370
5371 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
f09fda50
PD
5372 metaslab_class_t *mc = msp->ms_group->mg_class;
5373 multilist_sublist_t *mls =
5374 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
5375 if (!multilist_link_active(&msp->ms_class_txg_node)) {
5376 msp->ms_selected_txg = txg;
5377 multilist_sublist_insert_head(mls, msp);
5378 }
5379 multilist_sublist_unlock(mls);
5380
d2734cce 5381 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
a1d477c2 5382 vdev_dirty(vd, VDD_METASLAB, msp, txg);
d2734cce
SD
5383 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5384 offset, size);
f09fda50 5385 msp->ms_allocating_total += size;
a1d477c2
MA
5386 }
5387
5388 mutex_exit(&msp->ms_lock);
5389
5390 return (0);
5391}
5392
5393typedef struct metaslab_claim_cb_arg_t {
5394 uint64_t mcca_txg;
5395 int mcca_error;
5396} metaslab_claim_cb_arg_t;
5397
5398/* ARGSUSED */
5399static void
5400metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5401 uint64_t size, void *arg)
5402{
5403 metaslab_claim_cb_arg_t *mcca_arg = arg;
5404
5405 if (mcca_arg->mcca_error == 0) {
5406 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5407 size, mcca_arg->mcca_txg);
5408 }
5409}
5410
5411int
5412metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
5413{
5414 if (vd->vdev_ops->vdev_op_remap != NULL) {
5415 metaslab_claim_cb_arg_t arg;
5416
5417 /*
5418 * Only zdb(1M) can claim on indirect vdevs. This is used
5419 * to detect leaks of mapped space (that are not accounted
5420 * for in the obsolete counts, spacemap, or bpobj).
5421 */
5422 ASSERT(!spa_writeable(vd->vdev_spa));
5423 arg.mcca_error = 0;
5424 arg.mcca_txg = txg;
5425
5426 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5427 metaslab_claim_impl_cb, &arg);
5428
5429 if (arg.mcca_error == 0) {
5430 arg.mcca_error = metaslab_claim_concrete(vd,
5431 offset, size, txg);
5432 }
5433 return (arg.mcca_error);
5434 } else {
5435 return (metaslab_claim_concrete(vd, offset, size, txg));
5436 }
5437}
5438
5439/*
5440 * Intent log support: upon opening the pool after a crash, notify the SPA
5441 * of blocks that the intent log has allocated for immediate write, but
5442 * which are still considered free by the SPA because the last transaction
5443 * group didn't commit yet.
5444 */
5445static int
5446metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5447{
5448 uint64_t vdev = DVA_GET_VDEV(dva);
5449 uint64_t offset = DVA_GET_OFFSET(dva);
5450 uint64_t size = DVA_GET_ASIZE(dva);
5451 vdev_t *vd;
5452
5453 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
5454 return (SET_ERROR(ENXIO));
5455 }
5456
5457 ASSERT(DVA_IS_VALID(dva));
5458
5459 if (DVA_GET_GANG(dva))
5460 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5461
5462 return (metaslab_claim_impl(vd, offset, size, txg));
5463}
5464
34dc7c2f
BB
5465int
5466metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4e21fd06 5467 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
492f64e9 5468 zio_alloc_list_t *zal, zio_t *zio, int allocator)
34dc7c2f
BB
5469{
5470 dva_t *dva = bp->blk_dva;
928e8ad4 5471 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
1c27024e 5472 int error = 0;
34dc7c2f 5473
b128c09f 5474 ASSERT(bp->blk_birth == 0);
428870ff 5475 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
b128c09f
BB
5476
5477 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5478
5479 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
5480 spa_config_exit(spa, SCL_ALLOC, FTAG);
2e528b49 5481 return (SET_ERROR(ENOSPC));
b128c09f 5482 }
34dc7c2f
BB
5483
5484 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5485 ASSERT(BP_GET_NDVAS(bp) == 0);
5486 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4e21fd06 5487 ASSERT3P(zal, !=, NULL);
34dc7c2f 5488
1c27024e 5489 for (int d = 0; d < ndvas; d++) {
34dc7c2f 5490 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
492f64e9 5491 txg, flags, zal, allocator);
93cf2076 5492 if (error != 0) {
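/*
 * Unwind any DVAs already allocated for this BP so the caller
 * never sees a partially filled block pointer.
 */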
34dc7c2f 5493 for (d--; d >= 0; d--) {
a1d477c2 5494 metaslab_unalloc_dva(spa, &dva[d], txg);
3dfb57a3 5495 metaslab_group_alloc_decrement(spa,
492f64e9
PD
5496 DVA_GET_VDEV(&dva[d]), zio, flags,
5497 allocator, B_FALSE);
34dc7c2f
BB
5498 bzero(&dva[d], sizeof (dva_t));
5499 }
b128c09f 5500 spa_config_exit(spa, SCL_ALLOC, FTAG);
34dc7c2f 5501 return (error);
3dfb57a3
DB
5502 } else {
5503 /*
5504 * Update the metaslab group's queue depth
5505 * based on the newly allocated dva.
5506 */
5507 metaslab_group_alloc_increment(spa,
492f64e9 5508 DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
34dc7c2f 5509 }
3dfb57a3 5510
34dc7c2f
BB
5511 }
5512 ASSERT(error == 0);
5513 ASSERT(BP_GET_NDVAS(bp) == ndvas);
5514
b128c09f
BB
5515 spa_config_exit(spa, SCL_ALLOC, FTAG);
5516
efe7978d 5517 BP_SET_BIRTH(bp, txg, 0);
b128c09f 5518
34dc7c2f
BB
5519 return (0);
5520}
5521
5522void
5523metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5524{
5525 const dva_t *dva = bp->blk_dva;
1c27024e 5526 int ndvas = BP_GET_NDVAS(bp);
34dc7c2f
BB
5527
5528 ASSERT(!BP_IS_HOLE(bp));
428870ff 5529 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
b128c09f 5530
d2734cce
SD
5531 /*
5532 * If we have a checkpoint for the pool we need to make sure that
5533 * the blocks that we free that are part of the checkpoint won't be
5534 * reused until the checkpoint is discarded or we revert to it.
5535 *
5536 * The checkpoint flag is passed down the metaslab_free code path
5537 * and is set whenever we want to add a block to the checkpoint's
5538 * accounting. That is, we "checkpoint" blocks that existed at the
5539 * time the checkpoint was created and are therefore referenced by
5540 * the checkpointed uberblock.
5541 *
5542 * Note that we don't checkpoint any blocks if the current
5543 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
5544 * normally as they will be referenced by the checkpointed uberblock.
5545 */
5546 boolean_t checkpoint = B_FALSE;
5547 if (bp->blk_birth <= spa->spa_checkpoint_txg &&
5548 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5549 /*
5550 * At this point, if the block is part of the checkpoint
5551 * there is no way it was created in the current txg.
5552 */
5553 ASSERT(!now);
5554 ASSERT3U(spa_syncing_txg(spa), ==, txg);
5555 checkpoint = B_TRUE;
5556 }
5557
b128c09f 5558 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
34dc7c2f 5559
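/*
 * "now" means the block was born in a txg that has not finished
 * syncing (see the ASSERT above), so the allocation can simply be
 * undone; otherwise the free goes through the normal freeing
 * pipeline and is tagged for the checkpoint when required.
 */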
a1d477c2
MA
5560 for (int d = 0; d < ndvas; d++) {
5561 if (now) {
5562 metaslab_unalloc_dva(spa, &dva[d], txg);
5563 } else {
d2734cce
SD
5564 ASSERT3U(txg, ==, spa_syncing_txg(spa));
5565 metaslab_free_dva(spa, &dva[d], checkpoint);
a1d477c2
MA
5566 }
5567 }
b128c09f
BB
5568
5569 spa_config_exit(spa, SCL_FREE, FTAG);
34dc7c2f
BB
5570}
5571
5572int
5573metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5574{
5575 const dva_t *dva = bp->blk_dva;
5576 int ndvas = BP_GET_NDVAS(bp);
1c27024e 5577 int error = 0;
34dc7c2f
BB
5578
5579 ASSERT(!BP_IS_HOLE(bp));
5580
b128c09f
BB
5581 if (txg != 0) {
5582 /*
5583 * First do a dry run to make sure all DVAs are claimable,
5584 * so we don't have to unwind from partial failures below.
5585 */
5586 if ((error = metaslab_claim(spa, bp, 0)) != 0)
5587 return (error);
5588 }
5589
5590 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5591
cc99f275
DB
5592 for (int d = 0; d < ndvas; d++) {
5593 error = metaslab_claim_dva(spa, &dva[d], txg);
5594 if (error != 0)
b128c09f 5595 break;
cc99f275 5596 }
b128c09f
BB
5597
5598 spa_config_exit(spa, SCL_ALLOC, FTAG);
5599
5600 ASSERT(error == 0 || txg == 0);
34dc7c2f 5601
b128c09f 5602 return (error);
34dc7c2f 5603}
920dd524 5604
d1d7e268
MK
5605void
5606metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
920dd524
ED
5607{
5608 const dva_t *dva = bp->blk_dva;
5609 int ndvas = BP_GET_NDVAS(bp);
5610 uint64_t psize = BP_GET_PSIZE(bp);
5611 int d;
5612 vdev_t *vd;
5613
5614 ASSERT(!BP_IS_HOLE(bp));
9b67f605 5615 ASSERT(!BP_IS_EMBEDDED(bp));
920dd524
ED
5616 ASSERT(psize > 0);
5617
5618 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5619
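/*
 * Charge the block's psize against every vdev holding one of its
 * DVAs; metaslab_alloc_dva() uses these counters to steer fastwrite
 * allocations toward the least-loaded vdev.
 */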
5620 for (d = 0; d < ndvas; d++) {
5621 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
5622 continue;
5623 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
5624 }
5625
5626 spa_config_exit(spa, SCL_VDEV, FTAG);
5627}
5628
d1d7e268
MK
5629void
5630metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
920dd524
ED
5631{
5632 const dva_t *dva = bp->blk_dva;
5633 int ndvas = BP_GET_NDVAS(bp);
5634 uint64_t psize = BP_GET_PSIZE(bp);
5635 int d;
5636 vdev_t *vd;
5637
5638 ASSERT(!BP_IS_HOLE(bp));
9b67f605 5639 ASSERT(!BP_IS_EMBEDDED(bp));
920dd524
ED
5640 ASSERT(psize > 0);
5641
5642 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5643
5644 for (d = 0; d < ndvas; d++) {
5645 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
5646 continue;
5647 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
5648 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
5649 }
5650
5651 spa_config_exit(spa, SCL_VDEV, FTAG);
5652}
30b92c1d 5653
a1d477c2
MA
5654/* ARGSUSED */
5655static void
5656metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
5657 uint64_t size, void *arg)
5658{
5659 if (vd->vdev_ops == &vdev_indirect_ops)
5660 return;
5661
5662 metaslab_check_free_impl(vd, offset, size);
5663}
5664
5665static void
5666metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
5667{
5668 metaslab_t *msp;
5669 ASSERTV(spa_t *spa = vd->vdev_spa);
5670
5671 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5672 return;
5673
5674 if (vd->vdev_ops->vdev_op_remap != NULL) {
5675 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5676 metaslab_check_free_impl_cb, NULL);
5677 return;
5678 }
5679
5680 ASSERT(vdev_is_concrete(vd));
5681 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5682 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5683
5684 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5685
5686 mutex_enter(&msp->ms_lock);
df72b8be
SD
5687 if (msp->ms_loaded) {
5688 range_tree_verify_not_present(msp->ms_allocatable,
5689 offset, size);
5690 }
a1d477c2 5691
93e28d66
SD
5692 /*
5693 * Check all segments that currently exist in the freeing pipeline.
5694 *
5695 * It would intuitively make sense to also check the current allocating
5696 * tree since metaslab_unalloc_dva() exists for extents that are
e1cfd73f 5697 * allocated and freed in the same sync pass within the same txg.
93e28d66
SD
5698 * Unfortunately there are places (e.g. the ZIL) where we allocate a
5699 * segment but then we free part of it within the same txg
5700 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
5701 * current allocating tree.
5702 */
df72b8be
SD
5703 range_tree_verify_not_present(msp->ms_freeing, offset, size);
5704 range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
5705 range_tree_verify_not_present(msp->ms_freed, offset, size);
a1d477c2 5706 for (int j = 0; j < TXG_DEFER_SIZE; j++)
df72b8be 5707 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
93e28d66 5708 range_tree_verify_not_present(msp->ms_trim, offset, size);
a1d477c2
MA
5709 mutex_exit(&msp->ms_lock);
5710}
5711
13fe0198
MA
5712void
5713metaslab_check_free(spa_t *spa, const blkptr_t *bp)
5714{
13fe0198
MA
5715 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5716 return;
5717
5718 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1c27024e 5719 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
93cf2076
GW
5720 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
5721 vdev_t *vd = vdev_lookup_top(spa, vdev);
5722 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
13fe0198 5723 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
13fe0198 5724
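/*
 * As in metaslab_free_dva(), a gang block header occupies
 * SPA_GANGBLOCKSIZE on disk, so verify that allocation rather than
 * the asize recorded in the gang DVA.
 */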
a1d477c2
MA
5725 if (DVA_GET_GANG(&bp->blk_dva[i]))
5726 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5727
5728 ASSERT3P(vd, !=, NULL);
13fe0198 5729
a1d477c2 5730 metaslab_check_free_impl(vd, offset, size);
13fe0198
MA
5731 }
5732 spa_config_exit(spa, SCL_VDEV, FTAG);
5733}
5734
1b939560
BB
5735static void
5736metaslab_group_disable_wait(metaslab_group_t *mg)
5737{
5738 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5739 while (mg->mg_disabled_updating) {
5740 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5741 }
5742}
5743
5744static void
5745metaslab_group_disabled_increment(metaslab_group_t *mg)
5746{
5747 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5748 ASSERT(mg->mg_disabled_updating);
5749
5750 while (mg->mg_ms_disabled >= max_disabled_ms) {
5751 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5752 }
5753 mg->mg_ms_disabled++;
5754 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
5755}
5756
5757/*
5758 * Mark the metaslab as disabled to prevent any allocations on this metaslab.
5759 * We must also track how many metaslabs are currently disabled within a
5760 * metaslab group and limit them to prevent allocation failures from
5761 * occurring because all metaslabs are disabled.
5762 */
5763void
5764metaslab_disable(metaslab_t *msp)
5765{
5766 ASSERT(!MUTEX_HELD(&msp->ms_lock));
5767 metaslab_group_t *mg = msp->ms_group;
5768
5769 mutex_enter(&mg->mg_ms_disabled_lock);
5770
5771 /*
5772 * To keep an accurate count of how many threads have disabled
5773 * a specific metaslab group, we only allow one thread to mark
5774 * the metaslab group at a time. This ensures that the value of
5775 * ms_disabled will be accurate when we decide to mark a metaslab
5776 * group as disabled. To do this we force all other threads
5777 * to wait until the metaslab group's mg_disabled_updating flag is no
5778 * longer set.
5779 */
5780 metaslab_group_disable_wait(mg);
5781 mg->mg_disabled_updating = B_TRUE;
5782 if (msp->ms_disabled == 0) {
5783 metaslab_group_disabled_increment(mg);
5784 }
5785 mutex_enter(&msp->ms_lock);
5786 msp->ms_disabled++;
5787 mutex_exit(&msp->ms_lock);
5788
5789 mg->mg_disabled_updating = B_FALSE;
5790 cv_broadcast(&mg->mg_ms_disabled_cv);
5791 mutex_exit(&mg->mg_ms_disabled_lock);
5792}
5793
5794void
f09fda50 5795metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
1b939560
BB
5796{
5797 metaslab_group_t *mg = msp->ms_group;
5798 spa_t *spa = mg->mg_vd->vdev_spa;
5799
5800 /*
5801 * Wait for the outstanding IO to be synced to prevent newly
5802 * allocated blocks from being overwritten. This is used by
5803 * initialize and TRIM which are modifying unallocated space.
5804 */
5805 if (sync)
5806 txg_wait_synced(spa_get_dsl(spa), 0);
5807
5808 mutex_enter(&mg->mg_ms_disabled_lock);
5809 mutex_enter(&msp->ms_lock);
5810 if (--msp->ms_disabled == 0) {
5811 mg->mg_ms_disabled--;
5812 cv_broadcast(&mg->mg_ms_disabled_cv);
f09fda50
PD
5813 if (unload)
5814 metaslab_unload(msp);
1b939560
BB
5815 }
5816 mutex_exit(&msp->ms_lock);
5817 mutex_exit(&mg->mg_ms_disabled_lock);
5818}
5819
93e28d66
SD
5820static void
5821metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
5822{
5823 vdev_t *vd = ms->ms_group->mg_vd;
5824 spa_t *spa = vd->vdev_spa;
5825 objset_t *mos = spa_meta_objset(spa);
5826
5827 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
5828
5829 metaslab_unflushed_phys_t entry = {
5830 .msp_unflushed_txg = metaslab_unflushed_txg(ms),
5831 };
5832 uint64_t entry_size = sizeof (entry);
5833 uint64_t entry_offset = ms->ms_id * entry_size;
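/*
 * Entries are fixed-size records laid out by metaslab id, so
 * updating a metaslab's unflushed txg is an in-place overwrite of
 * its slot in the object.
 */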
5834
5835 uint64_t object = 0;
5836 int err = zap_lookup(mos, vd->vdev_top_zap,
5837 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
5838 &object);
5839 if (err == ENOENT) {
5840 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
5841 SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
5842 VERIFY0(zap_add(mos, vd->vdev_top_zap,
5843 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
5844 &object, tx));
5845 } else {
5846 VERIFY0(err);
5847 }
5848
5849 dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
5850 &entry, tx);
5851}
5852
5853void
5854metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
5855{
5856 spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
5857
5858 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
5859 return;
5860
5861 ms->ms_unflushed_txg = txg;
5862 metaslab_update_ondisk_flush_data(ms, tx);
5863}
5864
5865uint64_t
5866metaslab_unflushed_txg(metaslab_t *ms)
5867{
5868 return (ms->ms_unflushed_txg);
5869}
5870
03fdcb9a
MM
5871ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW,
5872 "Allocation granularity (a.k.a. stripe size)");
02730c33 5873
03fdcb9a
MM
5874ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
5875 "Load all metaslabs when pool is first opened");
02730c33 5876
03fdcb9a
MM
5877ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
5878 "Prevent metaslabs from being unloaded");
f4a4046b 5879
03fdcb9a
MM
5880ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
5881 "Preload potential metaslabs during reassessment");
eef0f4d8 5882
03fdcb9a
MM
5883ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW,
5884 "Delay in txgs after metaslab was last used before unloading");
eef0f4d8 5885
03fdcb9a
MM
5886ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW,
5887 "Delay in milliseconds after metaslab was last used before unloading");
02730c33 5888
03fdcb9a
MM
5889/* BEGIN CSTYLED */
5890ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW,
5891 "Percentage of metaslab group size that should be free to make it "
5892 "eligible for allocation");
f3a7f661 5893
03fdcb9a
MM
5894ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW,
5895 "Percentage of metaslab group size that should be considered eligible "
5896 "for allocations unless all metaslab groups within the metaslab class "
5897 "have also crossed this threshold");
02730c33 5898
03fdcb9a
MM
5899ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT,
5900 ZMOD_RW, "Fragmentation for metaslab to allow allocation");
02730c33 5901
03fdcb9a
MM
5902ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW,
5903 "Use the fragmentation metric to prefer less fragmented metaslabs");
5904/* END CSTYLED */
02730c33 5905
03fdcb9a
MM
5906ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
5907 "Prefer metaslabs with lower LBAs");
4e21fd06 5908
03fdcb9a
MM
5909ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
5910 "Enable metaslab group biasing");
4e21fd06 5911
03fdcb9a
MM
5912ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
5913 ZMOD_RW, "Enable segment-based metaslab selection");
a1d477c2 5914
03fdcb9a
MM
5915ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
5916 "Segment-based metaslab selection maximum buckets before switching");
d3230d76 5917
03fdcb9a
MM
5918ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW,
5919 "Blocks larger than this size are forced to be gang blocks");
d3230d76 5920
03fdcb9a
MM
5921ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW,
5922 "Max distance (bytes) to search forward before using size tree");
c81f1790 5923
03fdcb9a
MM
5924ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
5925 "When looking in size tree, use largest segment instead of exact fit");
f09fda50 5926
03fdcb9a
MM
5927ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG,
5928 ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
cc99f275 5929
03fdcb9a
MM
5930ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW,
5931 "Percentage of memory that can be used to store metaslab range trees");