]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/metaslab.c
OpenZFS 7614, 9064 - zfs device evacuation/removal
[mirror_zfs.git] / module / zfs / metaslab.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35 #include <sys/zfeature.h>
36 #include <sys/vdev_indirect_mapping.h>
37
38 #define WITH_DF_BLOCK_ALLOCATOR
39
40 #define GANG_ALLOCATION(flags) \
41 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
42
43 /*
44 * Metaslab granularity, in bytes. This is roughly similar to what would be
45 * referred to as the "stripe size" in traditional RAID arrays. In normal
46 * operation, we will try to write this amount of data to a top-level vdev
47 * before moving on to the next one.
48 */
49 unsigned long metaslab_aliquot = 512 << 10;
50
51 /* force gang blocks */
52 unsigned long metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;
53
54 /*
55 * The in-core space map representation is more compact than its on-disk form.
56 * The zfs_condense_pct determines how much more compact the in-core
57 * space map representation must be before we compact it on-disk.
58 * Values should be greater than or equal to 100.
59 */
60 int zfs_condense_pct = 200;
61
62 /*
63 * Condensing a metaslab is not guaranteed to actually reduce the amount of
64 * space used on disk. In particular, a space map uses data in increments of
65 * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
66 * same number of blocks after condensing. Since the goal of condensing is to
67 * reduce the number of IOPs required to read the space map, we only want to
68 * condense when we can be sure we will reduce the number of blocks used by the
69 * space map. Unfortunately, we cannot precisely compute whether or not this is
70 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
71 * we apply the following heuristic: do not condense a spacemap unless the
72 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
73 * blocks.
74 */
75 int zfs_metaslab_condense_block_threshold = 4;
76
77 /*
78 * The zfs_mg_noalloc_threshold defines which metaslab groups should
79 * be eligible for allocation. The value is defined as a percentage of
80 * free space. Metaslab groups that have more free space than
81 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
82 * a metaslab group's free space is less than or equal to the
83 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
84 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
85 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
86 * groups are allowed to accept allocations. Gang blocks are always
87 * eligible to allocate on any metaslab group. The default value of 0 means
88 * no metaslab group will be excluded based on this criterion.
89 */
90 int zfs_mg_noalloc_threshold = 0;
91
92 /*
93 * Metaslab groups are considered eligible for allocations if their
94 * fragmenation metric (measured as a percentage) is less than or equal to
95 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
96 * then it will be skipped unless all metaslab groups within the metaslab
97 * class have also crossed this threshold.
98 */
99 int zfs_mg_fragmentation_threshold = 85;
100
101 /*
102 * Allow metaslabs to keep their active state as long as their fragmentation
103 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
104 * active metaslab that exceeds this threshold will no longer keep its active
105 * status allowing better metaslabs to be selected.
106 */
107 int zfs_metaslab_fragmentation_threshold = 70;
108
109 /*
110 * When set will load all metaslabs when pool is first opened.
111 */
112 int metaslab_debug_load = 0;
113
114 /*
115 * When set will prevent metaslabs from being unloaded.
116 */
117 int metaslab_debug_unload = 0;
118
119 /*
120 * Minimum size which forces the dynamic allocator to change
121 * it's allocation strategy. Once the space map cannot satisfy
122 * an allocation of this size then it switches to using more
123 * aggressive strategy (i.e search by size rather than offset).
124 */
125 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
126
127 /*
128 * The minimum free space, in percent, which must be available
129 * in a space map to continue allocations in a first-fit fashion.
130 * Once the space map's free space drops below this level we dynamically
131 * switch to using best-fit allocations.
132 */
133 int metaslab_df_free_pct = 4;
134
135 /*
136 * Percentage of all cpus that can be used by the metaslab taskq.
137 */
138 int metaslab_load_pct = 50;
139
140 /*
141 * Determines how many txgs a metaslab may remain loaded without having any
142 * allocations from it. As long as a metaslab continues to be used we will
143 * keep it loaded.
144 */
145 int metaslab_unload_delay = TXG_SIZE * 2;
146
147 /*
148 * Max number of metaslabs per group to preload.
149 */
150 int metaslab_preload_limit = SPA_DVAS_PER_BP;
151
152 /*
153 * Enable/disable preloading of metaslab.
154 */
155 int metaslab_preload_enabled = B_TRUE;
156
157 /*
158 * Enable/disable fragmentation weighting on metaslabs.
159 */
160 int metaslab_fragmentation_factor_enabled = B_TRUE;
161
162 /*
163 * Enable/disable lba weighting (i.e. outer tracks are given preference).
164 */
165 int metaslab_lba_weighting_enabled = B_TRUE;
166
167 /*
168 * Enable/disable metaslab group biasing.
169 */
170 int metaslab_bias_enabled = B_TRUE;
171
172
173 /*
174 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
175 */
176 boolean_t zfs_remap_blkptr_enable = B_TRUE;
177
178 /*
179 * Enable/disable segment-based metaslab selection.
180 */
181 int zfs_metaslab_segment_weight_enabled = B_TRUE;
182
183 /*
184 * When using segment-based metaslab selection, we will continue
185 * allocating from the active metaslab until we have exhausted
186 * zfs_metaslab_switch_threshold of its buckets.
187 */
188 int zfs_metaslab_switch_threshold = 2;
189
190 /*
191 * Internal switch to enable/disable the metaslab allocation tracing
192 * facility.
193 */
194 #ifdef _METASLAB_TRACING
195 boolean_t metaslab_trace_enabled = B_TRUE;
196 #endif
197
198 /*
199 * Maximum entries that the metaslab allocation tracing facility will keep
200 * in a given list when running in non-debug mode. We limit the number
201 * of entries in non-debug mode to prevent us from using up too much memory.
202 * The limit should be sufficiently large that we don't expect any allocation
203 * to every exceed this value. In debug mode, the system will panic if this
204 * limit is ever reached allowing for further investigation.
205 */
206 #ifdef _METASLAB_TRACING
207 uint64_t metaslab_trace_max_entries = 5000;
208 #endif
209
210 static uint64_t metaslab_weight(metaslab_t *);
211 static void metaslab_set_fragmentation(metaslab_t *);
212 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
213 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
214
215 #ifdef _METASLAB_TRACING
216 kmem_cache_t *metaslab_alloc_trace_cache;
217 #endif
218
219 /*
220 * ==========================================================================
221 * Metaslab classes
222 * ==========================================================================
223 */
224 metaslab_class_t *
225 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
226 {
227 metaslab_class_t *mc;
228
229 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
230
231 mc->mc_spa = spa;
232 mc->mc_rotor = NULL;
233 mc->mc_ops = ops;
234 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
235 refcount_create_tracked(&mc->mc_alloc_slots);
236
237 return (mc);
238 }
239
240 void
241 metaslab_class_destroy(metaslab_class_t *mc)
242 {
243 ASSERT(mc->mc_rotor == NULL);
244 ASSERT(mc->mc_alloc == 0);
245 ASSERT(mc->mc_deferred == 0);
246 ASSERT(mc->mc_space == 0);
247 ASSERT(mc->mc_dspace == 0);
248
249 refcount_destroy(&mc->mc_alloc_slots);
250 mutex_destroy(&mc->mc_lock);
251 kmem_free(mc, sizeof (metaslab_class_t));
252 }
253
254 int
255 metaslab_class_validate(metaslab_class_t *mc)
256 {
257 metaslab_group_t *mg;
258 vdev_t *vd;
259
260 /*
261 * Must hold one of the spa_config locks.
262 */
263 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
264 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
265
266 if ((mg = mc->mc_rotor) == NULL)
267 return (0);
268
269 do {
270 vd = mg->mg_vd;
271 ASSERT(vd->vdev_mg != NULL);
272 ASSERT3P(vd->vdev_top, ==, vd);
273 ASSERT3P(mg->mg_class, ==, mc);
274 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
275 } while ((mg = mg->mg_next) != mc->mc_rotor);
276
277 return (0);
278 }
279
280 void
281 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
282 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
283 {
284 atomic_add_64(&mc->mc_alloc, alloc_delta);
285 atomic_add_64(&mc->mc_deferred, defer_delta);
286 atomic_add_64(&mc->mc_space, space_delta);
287 atomic_add_64(&mc->mc_dspace, dspace_delta);
288 }
289
290 uint64_t
291 metaslab_class_get_alloc(metaslab_class_t *mc)
292 {
293 return (mc->mc_alloc);
294 }
295
296 uint64_t
297 metaslab_class_get_deferred(metaslab_class_t *mc)
298 {
299 return (mc->mc_deferred);
300 }
301
302 uint64_t
303 metaslab_class_get_space(metaslab_class_t *mc)
304 {
305 return (mc->mc_space);
306 }
307
308 uint64_t
309 metaslab_class_get_dspace(metaslab_class_t *mc)
310 {
311 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
312 }
313
314 void
315 metaslab_class_histogram_verify(metaslab_class_t *mc)
316 {
317 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
318 uint64_t *mc_hist;
319 int i;
320
321 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
322 return;
323
324 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
325 KM_SLEEP);
326
327 for (int c = 0; c < rvd->vdev_children; c++) {
328 vdev_t *tvd = rvd->vdev_child[c];
329 metaslab_group_t *mg = tvd->vdev_mg;
330
331 /*
332 * Skip any holes, uninitialized top-levels, or
333 * vdevs that are not in this metalab class.
334 */
335 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
336 mg->mg_class != mc) {
337 continue;
338 }
339
340 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
341 mc_hist[i] += mg->mg_histogram[i];
342 }
343
344 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
345 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
346
347 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
348 }
349
350 /*
351 * Calculate the metaslab class's fragmentation metric. The metric
352 * is weighted based on the space contribution of each metaslab group.
353 * The return value will be a number between 0 and 100 (inclusive), or
354 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
355 * zfs_frag_table for more information about the metric.
356 */
357 uint64_t
358 metaslab_class_fragmentation(metaslab_class_t *mc)
359 {
360 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
361 uint64_t fragmentation = 0;
362
363 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
364
365 for (int c = 0; c < rvd->vdev_children; c++) {
366 vdev_t *tvd = rvd->vdev_child[c];
367 metaslab_group_t *mg = tvd->vdev_mg;
368
369 /*
370 * Skip any holes, uninitialized top-levels,
371 * or vdevs that are not in this metalab class.
372 */
373 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
374 mg->mg_class != mc) {
375 continue;
376 }
377
378 /*
379 * If a metaslab group does not contain a fragmentation
380 * metric then just bail out.
381 */
382 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
383 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
384 return (ZFS_FRAG_INVALID);
385 }
386
387 /*
388 * Determine how much this metaslab_group is contributing
389 * to the overall pool fragmentation metric.
390 */
391 fragmentation += mg->mg_fragmentation *
392 metaslab_group_get_space(mg);
393 }
394 fragmentation /= metaslab_class_get_space(mc);
395
396 ASSERT3U(fragmentation, <=, 100);
397 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
398 return (fragmentation);
399 }
400
401 /*
402 * Calculate the amount of expandable space that is available in
403 * this metaslab class. If a device is expanded then its expandable
404 * space will be the amount of allocatable space that is currently not
405 * part of this metaslab class.
406 */
407 uint64_t
408 metaslab_class_expandable_space(metaslab_class_t *mc)
409 {
410 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
411 uint64_t space = 0;
412
413 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
414 for (int c = 0; c < rvd->vdev_children; c++) {
415 vdev_t *tvd = rvd->vdev_child[c];
416 metaslab_group_t *mg = tvd->vdev_mg;
417
418 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
419 mg->mg_class != mc) {
420 continue;
421 }
422
423 /*
424 * Calculate if we have enough space to add additional
425 * metaslabs. We report the expandable space in terms
426 * of the metaslab size since that's the unit of expansion.
427 */
428 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
429 1ULL << tvd->vdev_ms_shift);
430 }
431 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
432 return (space);
433 }
434
435 static int
436 metaslab_compare(const void *x1, const void *x2)
437 {
438 const metaslab_t *m1 = (const metaslab_t *)x1;
439 const metaslab_t *m2 = (const metaslab_t *)x2;
440
441 int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
442 if (likely(cmp))
443 return (cmp);
444
445 IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
446
447 return (AVL_CMP(m1->ms_start, m2->ms_start));
448 }
449
450 /*
451 * Verify that the space accounting on disk matches the in-core range_trees.
452 */
453 void
454 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
455 {
456 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
457 uint64_t allocated = 0;
458 uint64_t sm_free_space, msp_free_space;
459
460 ASSERT(MUTEX_HELD(&msp->ms_lock));
461
462 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
463 return;
464
465 /*
466 * We can only verify the metaslab space when we're called
467 * from syncing context with a loaded metaslab that has an allocated
468 * space map. Calling this in non-syncing context does not
469 * provide a consistent view of the metaslab since we're performing
470 * allocations in the future.
471 */
472 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
473 !msp->ms_loaded)
474 return;
475
476 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
477 space_map_alloc_delta(msp->ms_sm);
478
479 /*
480 * Account for future allocations since we would have already
481 * deducted that space from the ms_freetree.
482 */
483 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
484 allocated +=
485 range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
486 }
487
488 msp_free_space = range_tree_space(msp->ms_tree) + allocated +
489 msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
490
491 VERIFY3U(sm_free_space, ==, msp_free_space);
492 }
493
494 /*
495 * ==========================================================================
496 * Metaslab groups
497 * ==========================================================================
498 */
499 /*
500 * Update the allocatable flag and the metaslab group's capacity.
501 * The allocatable flag is set to true if the capacity is below
502 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
503 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
504 * transitions from allocatable to non-allocatable or vice versa then the
505 * metaslab group's class is updated to reflect the transition.
506 */
507 static void
508 metaslab_group_alloc_update(metaslab_group_t *mg)
509 {
510 vdev_t *vd = mg->mg_vd;
511 metaslab_class_t *mc = mg->mg_class;
512 vdev_stat_t *vs = &vd->vdev_stat;
513 boolean_t was_allocatable;
514 boolean_t was_initialized;
515
516 ASSERT(vd == vd->vdev_top);
517 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
518 SCL_ALLOC);
519
520 mutex_enter(&mg->mg_lock);
521 was_allocatable = mg->mg_allocatable;
522 was_initialized = mg->mg_initialized;
523
524 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
525 (vs->vs_space + 1);
526
527 mutex_enter(&mc->mc_lock);
528
529 /*
530 * If the metaslab group was just added then it won't
531 * have any space until we finish syncing out this txg.
532 * At that point we will consider it initialized and available
533 * for allocations. We also don't consider non-activated
534 * metaslab groups (e.g. vdevs that are in the middle of being removed)
535 * to be initialized, because they can't be used for allocation.
536 */
537 mg->mg_initialized = metaslab_group_initialized(mg);
538 if (!was_initialized && mg->mg_initialized) {
539 mc->mc_groups++;
540 } else if (was_initialized && !mg->mg_initialized) {
541 ASSERT3U(mc->mc_groups, >, 0);
542 mc->mc_groups--;
543 }
544 if (mg->mg_initialized)
545 mg->mg_no_free_space = B_FALSE;
546
547 /*
548 * A metaslab group is considered allocatable if it has plenty
549 * of free space or is not heavily fragmented. We only take
550 * fragmentation into account if the metaslab group has a valid
551 * fragmentation metric (i.e. a value between 0 and 100).
552 */
553 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
554 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
555 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
556 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
557
558 /*
559 * The mc_alloc_groups maintains a count of the number of
560 * groups in this metaslab class that are still above the
561 * zfs_mg_noalloc_threshold. This is used by the allocating
562 * threads to determine if they should avoid allocations to
563 * a given group. The allocator will avoid allocations to a group
564 * if that group has reached or is below the zfs_mg_noalloc_threshold
565 * and there are still other groups that are above the threshold.
566 * When a group transitions from allocatable to non-allocatable or
567 * vice versa we update the metaslab class to reflect that change.
568 * When the mc_alloc_groups value drops to 0 that means that all
569 * groups have reached the zfs_mg_noalloc_threshold making all groups
570 * eligible for allocations. This effectively means that all devices
571 * are balanced again.
572 */
573 if (was_allocatable && !mg->mg_allocatable)
574 mc->mc_alloc_groups--;
575 else if (!was_allocatable && mg->mg_allocatable)
576 mc->mc_alloc_groups++;
577 mutex_exit(&mc->mc_lock);
578
579 mutex_exit(&mg->mg_lock);
580 }
581
582 metaslab_group_t *
583 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
584 {
585 metaslab_group_t *mg;
586
587 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
588 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
589 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
590 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
591 mg->mg_vd = vd;
592 mg->mg_class = mc;
593 mg->mg_activation_count = 0;
594 mg->mg_initialized = B_FALSE;
595 mg->mg_no_free_space = B_TRUE;
596 refcount_create_tracked(&mg->mg_alloc_queue_depth);
597
598 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
599 maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
600
601 return (mg);
602 }
603
604 void
605 metaslab_group_destroy(metaslab_group_t *mg)
606 {
607 ASSERT(mg->mg_prev == NULL);
608 ASSERT(mg->mg_next == NULL);
609 /*
610 * We may have gone below zero with the activation count
611 * either because we never activated in the first place or
612 * because we're done, and possibly removing the vdev.
613 */
614 ASSERT(mg->mg_activation_count <= 0);
615
616 taskq_destroy(mg->mg_taskq);
617 avl_destroy(&mg->mg_metaslab_tree);
618 mutex_destroy(&mg->mg_lock);
619 refcount_destroy(&mg->mg_alloc_queue_depth);
620 kmem_free(mg, sizeof (metaslab_group_t));
621 }
622
623 void
624 metaslab_group_activate(metaslab_group_t *mg)
625 {
626 metaslab_class_t *mc = mg->mg_class;
627 metaslab_group_t *mgprev, *mgnext;
628
629 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
630
631 ASSERT(mc->mc_rotor != mg);
632 ASSERT(mg->mg_prev == NULL);
633 ASSERT(mg->mg_next == NULL);
634 ASSERT(mg->mg_activation_count <= 0);
635
636 if (++mg->mg_activation_count <= 0)
637 return;
638
639 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
640 metaslab_group_alloc_update(mg);
641
642 if ((mgprev = mc->mc_rotor) == NULL) {
643 mg->mg_prev = mg;
644 mg->mg_next = mg;
645 } else {
646 mgnext = mgprev->mg_next;
647 mg->mg_prev = mgprev;
648 mg->mg_next = mgnext;
649 mgprev->mg_next = mg;
650 mgnext->mg_prev = mg;
651 }
652 mc->mc_rotor = mg;
653 }
654
655 /*
656 * Passivate a metaslab group and remove it from the allocation rotor.
657 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
658 * a metaslab group. This function will momentarily drop spa_config_locks
659 * that are lower than the SCL_ALLOC lock (see comment below).
660 */
661 void
662 metaslab_group_passivate(metaslab_group_t *mg)
663 {
664 metaslab_class_t *mc = mg->mg_class;
665 spa_t *spa = mc->mc_spa;
666 metaslab_group_t *mgprev, *mgnext;
667 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
668
669 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
670 (SCL_ALLOC | SCL_ZIO));
671
672 if (--mg->mg_activation_count != 0) {
673 ASSERT(mc->mc_rotor != mg);
674 ASSERT(mg->mg_prev == NULL);
675 ASSERT(mg->mg_next == NULL);
676 ASSERT(mg->mg_activation_count < 0);
677 return;
678 }
679
680 /*
681 * The spa_config_lock is an array of rwlocks, ordered as
682 * follows (from highest to lowest):
683 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
684 * SCL_ZIO > SCL_FREE > SCL_VDEV
685 * (For more information about the spa_config_lock see spa_misc.c)
686 * The higher the lock, the broader its coverage. When we passivate
687 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
688 * config locks. However, the metaslab group's taskq might be trying
689 * to preload metaslabs so we must drop the SCL_ZIO lock and any
690 * lower locks to allow the I/O to complete. At a minimum,
691 * we continue to hold the SCL_ALLOC lock, which prevents any future
692 * allocations from taking place and any changes to the vdev tree.
693 */
694 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
695 taskq_wait_outstanding(mg->mg_taskq, 0);
696 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
697 metaslab_group_alloc_update(mg);
698
699 mgprev = mg->mg_prev;
700 mgnext = mg->mg_next;
701
702 if (mg == mgnext) {
703 mc->mc_rotor = NULL;
704 } else {
705 mc->mc_rotor = mgnext;
706 mgprev->mg_next = mgnext;
707 mgnext->mg_prev = mgprev;
708 }
709
710 mg->mg_prev = NULL;
711 mg->mg_next = NULL;
712 }
713
714 boolean_t
715 metaslab_group_initialized(metaslab_group_t *mg)
716 {
717 vdev_t *vd = mg->mg_vd;
718 vdev_stat_t *vs = &vd->vdev_stat;
719
720 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
721 }
722
723 uint64_t
724 metaslab_group_get_space(metaslab_group_t *mg)
725 {
726 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
727 }
728
729 void
730 metaslab_group_histogram_verify(metaslab_group_t *mg)
731 {
732 uint64_t *mg_hist;
733 vdev_t *vd = mg->mg_vd;
734 uint64_t ashift = vd->vdev_ashift;
735 int i;
736
737 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
738 return;
739
740 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
741 KM_SLEEP);
742
743 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
744 SPACE_MAP_HISTOGRAM_SIZE + ashift);
745
746 for (int m = 0; m < vd->vdev_ms_count; m++) {
747 metaslab_t *msp = vd->vdev_ms[m];
748
749 if (msp->ms_sm == NULL)
750 continue;
751
752 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
753 mg_hist[i + ashift] +=
754 msp->ms_sm->sm_phys->smp_histogram[i];
755 }
756
757 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
758 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
759
760 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
761 }
762
763 static void
764 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
765 {
766 metaslab_class_t *mc = mg->mg_class;
767 uint64_t ashift = mg->mg_vd->vdev_ashift;
768
769 ASSERT(MUTEX_HELD(&msp->ms_lock));
770 if (msp->ms_sm == NULL)
771 return;
772
773 mutex_enter(&mg->mg_lock);
774 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
775 mg->mg_histogram[i + ashift] +=
776 msp->ms_sm->sm_phys->smp_histogram[i];
777 mc->mc_histogram[i + ashift] +=
778 msp->ms_sm->sm_phys->smp_histogram[i];
779 }
780 mutex_exit(&mg->mg_lock);
781 }
782
783 void
784 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
785 {
786 metaslab_class_t *mc = mg->mg_class;
787 uint64_t ashift = mg->mg_vd->vdev_ashift;
788
789 ASSERT(MUTEX_HELD(&msp->ms_lock));
790 if (msp->ms_sm == NULL)
791 return;
792
793 mutex_enter(&mg->mg_lock);
794 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
795 ASSERT3U(mg->mg_histogram[i + ashift], >=,
796 msp->ms_sm->sm_phys->smp_histogram[i]);
797 ASSERT3U(mc->mc_histogram[i + ashift], >=,
798 msp->ms_sm->sm_phys->smp_histogram[i]);
799
800 mg->mg_histogram[i + ashift] -=
801 msp->ms_sm->sm_phys->smp_histogram[i];
802 mc->mc_histogram[i + ashift] -=
803 msp->ms_sm->sm_phys->smp_histogram[i];
804 }
805 mutex_exit(&mg->mg_lock);
806 }
807
808 static void
809 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
810 {
811 ASSERT(msp->ms_group == NULL);
812 mutex_enter(&mg->mg_lock);
813 msp->ms_group = mg;
814 msp->ms_weight = 0;
815 avl_add(&mg->mg_metaslab_tree, msp);
816 mutex_exit(&mg->mg_lock);
817
818 mutex_enter(&msp->ms_lock);
819 metaslab_group_histogram_add(mg, msp);
820 mutex_exit(&msp->ms_lock);
821 }
822
823 static void
824 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
825 {
826 mutex_enter(&msp->ms_lock);
827 metaslab_group_histogram_remove(mg, msp);
828 mutex_exit(&msp->ms_lock);
829
830 mutex_enter(&mg->mg_lock);
831 ASSERT(msp->ms_group == mg);
832 avl_remove(&mg->mg_metaslab_tree, msp);
833 msp->ms_group = NULL;
834 mutex_exit(&mg->mg_lock);
835 }
836
837 static void
838 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
839 {
840 /*
841 * Although in principle the weight can be any value, in
842 * practice we do not use values in the range [1, 511].
843 */
844 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
845 ASSERT(MUTEX_HELD(&msp->ms_lock));
846
847 mutex_enter(&mg->mg_lock);
848 ASSERT(msp->ms_group == mg);
849 avl_remove(&mg->mg_metaslab_tree, msp);
850 msp->ms_weight = weight;
851 avl_add(&mg->mg_metaslab_tree, msp);
852 mutex_exit(&mg->mg_lock);
853 }
854
855 /*
856 * Calculate the fragmentation for a given metaslab group. We can use
857 * a simple average here since all metaslabs within the group must have
858 * the same size. The return value will be a value between 0 and 100
859 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
860 * group have a fragmentation metric.
861 */
862 uint64_t
863 metaslab_group_fragmentation(metaslab_group_t *mg)
864 {
865 vdev_t *vd = mg->mg_vd;
866 uint64_t fragmentation = 0;
867 uint64_t valid_ms = 0;
868
869 for (int m = 0; m < vd->vdev_ms_count; m++) {
870 metaslab_t *msp = vd->vdev_ms[m];
871
872 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
873 continue;
874
875 valid_ms++;
876 fragmentation += msp->ms_fragmentation;
877 }
878
879 if (valid_ms <= vd->vdev_ms_count / 2)
880 return (ZFS_FRAG_INVALID);
881
882 fragmentation /= valid_ms;
883 ASSERT3U(fragmentation, <=, 100);
884 return (fragmentation);
885 }
886
887 /*
888 * Determine if a given metaslab group should skip allocations. A metaslab
889 * group should avoid allocations if its free capacity is less than the
890 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
891 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
892 * that can still handle allocations. If the allocation throttle is enabled
893 * then we skip allocations to devices that have reached their maximum
894 * allocation queue depth unless the selected metaslab group is the only
895 * eligible group remaining.
896 */
897 static boolean_t
898 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
899 uint64_t psize)
900 {
901 spa_t *spa = mg->mg_vd->vdev_spa;
902 metaslab_class_t *mc = mg->mg_class;
903
904 /*
905 * We can only consider skipping this metaslab group if it's
906 * in the normal metaslab class and there are other metaslab
907 * groups to select from. Otherwise, we always consider it eligible
908 * for allocations.
909 */
910 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
911 return (B_TRUE);
912
913 /*
914 * If the metaslab group's mg_allocatable flag is set (see comments
915 * in metaslab_group_alloc_update() for more information) and
916 * the allocation throttle is disabled then allow allocations to this
917 * device. However, if the allocation throttle is enabled then
918 * check if we have reached our allocation limit (mg_alloc_queue_depth)
919 * to determine if we should allow allocations to this metaslab group.
920 * If all metaslab groups are no longer considered allocatable
921 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
922 * gang block size then we allow allocations on this metaslab group
923 * regardless of the mg_allocatable or throttle settings.
924 */
925 if (mg->mg_allocatable) {
926 metaslab_group_t *mgp;
927 int64_t qdepth;
928 uint64_t qmax = mg->mg_max_alloc_queue_depth;
929
930 if (!mc->mc_alloc_throttle_enabled)
931 return (B_TRUE);
932
933 /*
934 * If this metaslab group does not have any free space, then
935 * there is no point in looking further.
936 */
937 if (mg->mg_no_free_space)
938 return (B_FALSE);
939
940 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
941
942 /*
943 * If this metaslab group is below its qmax or it's
944 * the only allocatable metasable group, then attempt
945 * to allocate from it.
946 */
947 if (qdepth < qmax || mc->mc_alloc_groups == 1)
948 return (B_TRUE);
949 ASSERT3U(mc->mc_alloc_groups, >, 1);
950
951 /*
952 * Since this metaslab group is at or over its qmax, we
953 * need to determine if there are metaslab groups after this
954 * one that might be able to handle this allocation. This is
955 * racy since we can't hold the locks for all metaslab
956 * groups at the same time when we make this check.
957 */
958 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
959 qmax = mgp->mg_max_alloc_queue_depth;
960
961 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
962
963 /*
964 * If there is another metaslab group that
965 * might be able to handle the allocation, then
966 * we return false so that we skip this group.
967 */
968 if (qdepth < qmax && !mgp->mg_no_free_space)
969 return (B_FALSE);
970 }
971
972 /*
973 * We didn't find another group to handle the allocation
974 * so we can't skip this metaslab group even though
975 * we are at or over our qmax.
976 */
977 return (B_TRUE);
978
979 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
980 return (B_TRUE);
981 }
982 return (B_FALSE);
983 }
984
985 /*
986 * ==========================================================================
987 * Range tree callbacks
988 * ==========================================================================
989 */
990
991 /*
992 * Comparison function for the private size-ordered tree. Tree is sorted
993 * by size, larger sizes at the end of the tree.
994 */
995 static int
996 metaslab_rangesize_compare(const void *x1, const void *x2)
997 {
998 const range_seg_t *r1 = x1;
999 const range_seg_t *r2 = x2;
1000 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1001 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1002
1003 int cmp = AVL_CMP(rs_size1, rs_size2);
1004 if (likely(cmp))
1005 return (cmp);
1006
1007 return (AVL_CMP(r1->rs_start, r2->rs_start));
1008 }
1009
1010 /*
1011 * ==========================================================================
1012 * Common allocator routines
1013 * ==========================================================================
1014 */
1015
1016 /*
1017 * Return the maximum contiguous segment within the metaslab.
1018 */
1019 uint64_t
1020 metaslab_block_maxsize(metaslab_t *msp)
1021 {
1022 avl_tree_t *t = &msp->ms_size_tree;
1023 range_seg_t *rs;
1024
1025 if (t == NULL || (rs = avl_last(t)) == NULL)
1026 return (0ULL);
1027
1028 return (rs->rs_end - rs->rs_start);
1029 }
1030
1031 static range_seg_t *
1032 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1033 {
1034 range_seg_t *rs, rsearch;
1035 avl_index_t where;
1036
1037 rsearch.rs_start = start;
1038 rsearch.rs_end = start + size;
1039
1040 rs = avl_find(t, &rsearch, &where);
1041 if (rs == NULL) {
1042 rs = avl_nearest(t, where, AVL_AFTER);
1043 }
1044
1045 return (rs);
1046 }
1047
1048 #if defined(WITH_FF_BLOCK_ALLOCATOR) || \
1049 defined(WITH_DF_BLOCK_ALLOCATOR) || \
1050 defined(WITH_CF_BLOCK_ALLOCATOR)
1051 /*
1052 * This is a helper function that can be used by the allocator to find
1053 * a suitable block to allocate. This will search the specified AVL
1054 * tree looking for a block that matches the specified criteria.
1055 */
1056 static uint64_t
1057 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1058 uint64_t align)
1059 {
1060 range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1061
1062 while (rs != NULL) {
1063 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1064
1065 if (offset + size <= rs->rs_end) {
1066 *cursor = offset + size;
1067 return (offset);
1068 }
1069 rs = AVL_NEXT(t, rs);
1070 }
1071
1072 /*
1073 * If we know we've searched the whole map (*cursor == 0), give up.
1074 * Otherwise, reset the cursor to the beginning and try again.
1075 */
1076 if (*cursor == 0)
1077 return (-1ULL);
1078
1079 *cursor = 0;
1080 return (metaslab_block_picker(t, cursor, size, align));
1081 }
1082 #endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */
1083
1084 #if defined(WITH_FF_BLOCK_ALLOCATOR)
1085 /*
1086 * ==========================================================================
1087 * The first-fit block allocator
1088 * ==========================================================================
1089 */
1090 static uint64_t
1091 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1092 {
1093 /*
1094 * Find the largest power of 2 block size that evenly divides the
1095 * requested size. This is used to try to allocate blocks with similar
1096 * alignment from the same area of the metaslab (i.e. same cursor
1097 * bucket) but it does not guarantee that other allocations sizes
1098 * may exist in the same region.
1099 */
1100 uint64_t align = size & -size;
1101 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1102 avl_tree_t *t = &msp->ms_tree->rt_root;
1103
1104 return (metaslab_block_picker(t, cursor, size, align));
1105 }
1106
1107 static metaslab_ops_t metaslab_ff_ops = {
1108 metaslab_ff_alloc
1109 };
1110
1111 metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
1112 #endif /* WITH_FF_BLOCK_ALLOCATOR */
1113
1114 #if defined(WITH_DF_BLOCK_ALLOCATOR)
1115 /*
1116 * ==========================================================================
1117 * Dynamic block allocator -
1118 * Uses the first fit allocation scheme until space get low and then
1119 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1120 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1121 * ==========================================================================
1122 */
1123 static uint64_t
1124 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1125 {
1126 /*
1127 * Find the largest power of 2 block size that evenly divides the
1128 * requested size. This is used to try to allocate blocks with similar
1129 * alignment from the same area of the metaslab (i.e. same cursor
1130 * bucket) but it does not guarantee that other allocations sizes
1131 * may exist in the same region.
1132 */
1133 uint64_t align = size & -size;
1134 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1135 range_tree_t *rt = msp->ms_tree;
1136 avl_tree_t *t = &rt->rt_root;
1137 uint64_t max_size = metaslab_block_maxsize(msp);
1138 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1139
1140 ASSERT(MUTEX_HELD(&msp->ms_lock));
1141 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1142
1143 if (max_size < size)
1144 return (-1ULL);
1145
1146 /*
1147 * If we're running low on space switch to using the size
1148 * sorted AVL tree (best-fit).
1149 */
1150 if (max_size < metaslab_df_alloc_threshold ||
1151 free_pct < metaslab_df_free_pct) {
1152 t = &msp->ms_size_tree;
1153 *cursor = 0;
1154 }
1155
1156 return (metaslab_block_picker(t, cursor, size, 1ULL));
1157 }
1158
1159 static metaslab_ops_t metaslab_df_ops = {
1160 metaslab_df_alloc
1161 };
1162
1163 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1164 #endif /* WITH_DF_BLOCK_ALLOCATOR */
1165
1166 #if defined(WITH_CF_BLOCK_ALLOCATOR)
1167 /*
1168 * ==========================================================================
1169 * Cursor fit block allocator -
1170 * Select the largest region in the metaslab, set the cursor to the beginning
1171 * of the range and the cursor_end to the end of the range. As allocations
1172 * are made advance the cursor. Continue allocating from the cursor until
1173 * the range is exhausted and then find a new range.
1174 * ==========================================================================
1175 */
1176 static uint64_t
1177 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1178 {
1179 range_tree_t *rt = msp->ms_tree;
1180 avl_tree_t *t = &msp->ms_size_tree;
1181 uint64_t *cursor = &msp->ms_lbas[0];
1182 uint64_t *cursor_end = &msp->ms_lbas[1];
1183 uint64_t offset = 0;
1184
1185 ASSERT(MUTEX_HELD(&msp->ms_lock));
1186 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1187
1188 ASSERT3U(*cursor_end, >=, *cursor);
1189
1190 if ((*cursor + size) > *cursor_end) {
1191 range_seg_t *rs;
1192
1193 rs = avl_last(&msp->ms_size_tree);
1194 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1195 return (-1ULL);
1196
1197 *cursor = rs->rs_start;
1198 *cursor_end = rs->rs_end;
1199 }
1200
1201 offset = *cursor;
1202 *cursor += size;
1203
1204 return (offset);
1205 }
1206
1207 static metaslab_ops_t metaslab_cf_ops = {
1208 metaslab_cf_alloc
1209 };
1210
1211 metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
1212 #endif /* WITH_CF_BLOCK_ALLOCATOR */
1213
1214 #if defined(WITH_NDF_BLOCK_ALLOCATOR)
1215 /*
1216 * ==========================================================================
1217 * New dynamic fit allocator -
1218 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1219 * contiguous blocks. If no region is found then just use the largest segment
1220 * that remains.
1221 * ==========================================================================
1222 */
1223
1224 /*
1225 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1226 * to request from the allocator.
1227 */
1228 uint64_t metaslab_ndf_clump_shift = 4;
1229
1230 static uint64_t
1231 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1232 {
1233 avl_tree_t *t = &msp->ms_tree->rt_root;
1234 avl_index_t where;
1235 range_seg_t *rs, rsearch;
1236 uint64_t hbit = highbit64(size);
1237 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1238 uint64_t max_size = metaslab_block_maxsize(msp);
1239
1240 ASSERT(MUTEX_HELD(&msp->ms_lock));
1241 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1242
1243 if (max_size < size)
1244 return (-1ULL);
1245
1246 rsearch.rs_start = *cursor;
1247 rsearch.rs_end = *cursor + size;
1248
1249 rs = avl_find(t, &rsearch, &where);
1250 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1251 t = &msp->ms_size_tree;
1252
1253 rsearch.rs_start = 0;
1254 rsearch.rs_end = MIN(max_size,
1255 1ULL << (hbit + metaslab_ndf_clump_shift));
1256 rs = avl_find(t, &rsearch, &where);
1257 if (rs == NULL)
1258 rs = avl_nearest(t, where, AVL_AFTER);
1259 ASSERT(rs != NULL);
1260 }
1261
1262 if ((rs->rs_end - rs->rs_start) >= size) {
1263 *cursor = rs->rs_start + size;
1264 return (rs->rs_start);
1265 }
1266 return (-1ULL);
1267 }
1268
1269 static metaslab_ops_t metaslab_ndf_ops = {
1270 metaslab_ndf_alloc
1271 };
1272
1273 metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
1274 #endif /* WITH_NDF_BLOCK_ALLOCATOR */
1275
1276
1277 /*
1278 * ==========================================================================
1279 * Metaslabs
1280 * ==========================================================================
1281 */
1282
1283 /*
1284 * Wait for any in-progress metaslab loads to complete.
1285 */
1286 void
1287 metaslab_load_wait(metaslab_t *msp)
1288 {
1289 ASSERT(MUTEX_HELD(&msp->ms_lock));
1290
1291 while (msp->ms_loading) {
1292 ASSERT(!msp->ms_loaded);
1293 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1294 }
1295 }
1296
1297 int
1298 metaslab_load(metaslab_t *msp)
1299 {
1300 int error = 0;
1301 boolean_t success = B_FALSE;
1302
1303 ASSERT(MUTEX_HELD(&msp->ms_lock));
1304 ASSERT(!msp->ms_loaded);
1305 ASSERT(!msp->ms_loading);
1306
1307 msp->ms_loading = B_TRUE;
1308 /*
1309 * Nobody else can manipulate a loading metaslab, so it's now safe
1310 * to drop the lock. This way we don't have to hold the lock while
1311 * reading the spacemap from disk.
1312 */
1313 mutex_exit(&msp->ms_lock);
1314
1315 /*
1316 * If the space map has not been allocated yet, then treat
1317 * all the space in the metaslab as free and add it to the
1318 * ms_tree.
1319 */
1320 if (msp->ms_sm != NULL)
1321 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1322 else
1323 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1324
1325 success = (error == 0);
1326
1327 mutex_enter(&msp->ms_lock);
1328 msp->ms_loading = B_FALSE;
1329
1330 if (success) {
1331 ASSERT3P(msp->ms_group, !=, NULL);
1332 msp->ms_loaded = B_TRUE;
1333
1334 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1335 range_tree_walk(msp->ms_defertree[t],
1336 range_tree_remove, msp->ms_tree);
1337 }
1338 msp->ms_max_size = metaslab_block_maxsize(msp);
1339 }
1340 cv_broadcast(&msp->ms_load_cv);
1341 return (error);
1342 }
1343
1344 void
1345 metaslab_unload(metaslab_t *msp)
1346 {
1347 ASSERT(MUTEX_HELD(&msp->ms_lock));
1348 range_tree_vacate(msp->ms_tree, NULL, NULL);
1349 msp->ms_loaded = B_FALSE;
1350 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1351 msp->ms_max_size = 0;
1352 }
1353
1354 int
1355 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1356 metaslab_t **msp)
1357 {
1358 vdev_t *vd = mg->mg_vd;
1359 objset_t *mos = vd->vdev_spa->spa_meta_objset;
1360 metaslab_t *ms;
1361 int error;
1362
1363 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1364 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1365 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1366 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1367 ms->ms_id = id;
1368 ms->ms_start = id << vd->vdev_ms_shift;
1369 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1370
1371 /*
1372 * We only open space map objects that already exist. All others
1373 * will be opened when we finally allocate an object for it.
1374 */
1375 if (object != 0) {
1376 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1377 ms->ms_size, vd->vdev_ashift);
1378
1379 if (error != 0) {
1380 kmem_free(ms, sizeof (metaslab_t));
1381 return (error);
1382 }
1383
1384 ASSERT(ms->ms_sm != NULL);
1385 }
1386
1387 /*
1388 * We create the main range tree here, but we don't create the
1389 * other range trees until metaslab_sync_done(). This serves
1390 * two purposes: it allows metaslab_sync_done() to detect the
1391 * addition of new space; and for debugging, it ensures that we'd
1392 * data fault on any attempt to use this metaslab before it's ready.
1393 */
1394 ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
1395 metaslab_rangesize_compare, 0);
1396 metaslab_group_add(mg, ms);
1397
1398 metaslab_set_fragmentation(ms);
1399
1400 /*
1401 * If we're opening an existing pool (txg == 0) or creating
1402 * a new one (txg == TXG_INITIAL), all space is available now.
1403 * If we're adding space to an existing pool, the new space
1404 * does not become available until after this txg has synced.
1405 * The metaslab's weight will also be initialized when we sync
1406 * out this txg. This ensures that we don't attempt to allocate
1407 * from it before we have initialized it completely.
1408 */
1409 if (txg <= TXG_INITIAL)
1410 metaslab_sync_done(ms, 0);
1411
1412 /*
1413 * If metaslab_debug_load is set and we're initializing a metaslab
1414 * that has an allocated space map object then load the its space
1415 * map so that can verify frees.
1416 */
1417 if (metaslab_debug_load && ms->ms_sm != NULL) {
1418 mutex_enter(&ms->ms_lock);
1419 VERIFY0(metaslab_load(ms));
1420 mutex_exit(&ms->ms_lock);
1421 }
1422
1423 if (txg != 0) {
1424 vdev_dirty(vd, 0, NULL, txg);
1425 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1426 }
1427
1428 *msp = ms;
1429
1430 return (0);
1431 }
1432
1433 void
1434 metaslab_fini(metaslab_t *msp)
1435 {
1436 metaslab_group_t *mg = msp->ms_group;
1437
1438 metaslab_group_remove(mg, msp);
1439
1440 mutex_enter(&msp->ms_lock);
1441 VERIFY(msp->ms_group == NULL);
1442 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1443 0, -msp->ms_size);
1444 space_map_close(msp->ms_sm);
1445
1446 metaslab_unload(msp);
1447 range_tree_destroy(msp->ms_tree);
1448 range_tree_destroy(msp->ms_freeingtree);
1449 range_tree_destroy(msp->ms_freedtree);
1450
1451 for (int t = 0; t < TXG_SIZE; t++) {
1452 range_tree_destroy(msp->ms_alloctree[t]);
1453 }
1454
1455 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1456 range_tree_destroy(msp->ms_defertree[t]);
1457 }
1458
1459 ASSERT0(msp->ms_deferspace);
1460
1461 mutex_exit(&msp->ms_lock);
1462 cv_destroy(&msp->ms_load_cv);
1463 mutex_destroy(&msp->ms_lock);
1464 mutex_destroy(&msp->ms_sync_lock);
1465
1466 kmem_free(msp, sizeof (metaslab_t));
1467 }
1468
1469 #define FRAGMENTATION_TABLE_SIZE 17
1470
1471 /*
1472 * This table defines a segment size based fragmentation metric that will
1473 * allow each metaslab to derive its own fragmentation value. This is done
1474 * by calculating the space in each bucket of the spacemap histogram and
1475 * multiplying that by the fragmetation metric in this table. Doing
1476 * this for all buckets and dividing it by the total amount of free
1477 * space in this metaslab (i.e. the total free space in all buckets) gives
1478 * us the fragmentation metric. This means that a high fragmentation metric
1479 * equates to most of the free space being comprised of small segments.
1480 * Conversely, if the metric is low, then most of the free space is in
1481 * large segments. A 10% change in fragmentation equates to approximately
1482 * double the number of segments.
1483 *
1484 * This table defines 0% fragmented space using 16MB segments. Testing has
1485 * shown that segments that are greater than or equal to 16MB do not suffer
1486 * from drastic performance problems. Using this value, we derive the rest
1487 * of the table. Since the fragmentation value is never stored on disk, it
1488 * is possible to change these calculations in the future.
1489 */
1490 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1491 100, /* 512B */
1492 100, /* 1K */
1493 98, /* 2K */
1494 95, /* 4K */
1495 90, /* 8K */
1496 80, /* 16K */
1497 70, /* 32K */
1498 60, /* 64K */
1499 50, /* 128K */
1500 40, /* 256K */
1501 30, /* 512K */
1502 20, /* 1M */
1503 15, /* 2M */
1504 10, /* 4M */
1505 5, /* 8M */
1506 0 /* 16M */
1507 };
1508
1509 /*
1510 * Calclate the metaslab's fragmentation metric. A return value
1511 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1512 * not support this metric. Otherwise, the return value should be in the
1513 * range [0, 100].
1514 */
1515 static void
1516 metaslab_set_fragmentation(metaslab_t *msp)
1517 {
1518 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1519 uint64_t fragmentation = 0;
1520 uint64_t total = 0;
1521 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1522 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1523
1524 if (!feature_enabled) {
1525 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1526 return;
1527 }
1528
1529 /*
1530 * A null space map means that the entire metaslab is free
1531 * and thus is not fragmented.
1532 */
1533 if (msp->ms_sm == NULL) {
1534 msp->ms_fragmentation = 0;
1535 return;
1536 }
1537
1538 /*
1539 * If this metaslab's space map has not been upgraded, flag it
1540 * so that we upgrade next time we encounter it.
1541 */
1542 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1543 uint64_t txg = spa_syncing_txg(spa);
1544 vdev_t *vd = msp->ms_group->mg_vd;
1545
1546 /*
1547 * If we've reached the final dirty txg, then we must
1548 * be shutting down the pool. We don't want to dirty
1549 * any data past this point so skip setting the condense
1550 * flag. We can retry this action the next time the pool
1551 * is imported.
1552 */
1553 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1554 msp->ms_condense_wanted = B_TRUE;
1555 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1556 spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1557 "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1558 vd->vdev_id);
1559 }
1560 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1561 return;
1562 }
1563
1564 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1565 uint64_t space = 0;
1566 uint8_t shift = msp->ms_sm->sm_shift;
1567
1568 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1569 FRAGMENTATION_TABLE_SIZE - 1);
1570
1571 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1572 continue;
1573
1574 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1575 total += space;
1576
1577 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1578 fragmentation += space * zfs_frag_table[idx];
1579 }
1580
1581 if (total > 0)
1582 fragmentation /= total;
1583 ASSERT3U(fragmentation, <=, 100);
1584
1585 msp->ms_fragmentation = fragmentation;
1586 }
1587
1588 /*
1589 * Compute a weight -- a selection preference value -- for the given metaslab.
1590 * This is based on the amount of free space, the level of fragmentation,
1591 * the LBA range, and whether the metaslab is loaded.
1592 */
1593 static uint64_t
1594 metaslab_space_weight(metaslab_t *msp)
1595 {
1596 metaslab_group_t *mg = msp->ms_group;
1597 vdev_t *vd = mg->mg_vd;
1598 uint64_t weight, space;
1599
1600 ASSERT(MUTEX_HELD(&msp->ms_lock));
1601 ASSERT(!vd->vdev_removing);
1602
1603 /*
1604 * The baseline weight is the metaslab's free space.
1605 */
1606 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1607
1608 if (metaslab_fragmentation_factor_enabled &&
1609 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1610 /*
1611 * Use the fragmentation information to inversely scale
1612 * down the baseline weight. We need to ensure that we
1613 * don't exclude this metaslab completely when it's 100%
1614 * fragmented. To avoid this we reduce the fragmented value
1615 * by 1.
1616 */
1617 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1618
1619 /*
1620 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1621 * this metaslab again. The fragmentation metric may have
1622 * decreased the space to something smaller than
1623 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1624 * so that we can consume any remaining space.
1625 */
1626 if (space > 0 && space < SPA_MINBLOCKSIZE)
1627 space = SPA_MINBLOCKSIZE;
1628 }
1629 weight = space;
1630
1631 /*
1632 * Modern disks have uniform bit density and constant angular velocity.
1633 * Therefore, the outer recording zones are faster (higher bandwidth)
1634 * than the inner zones by the ratio of outer to inner track diameter,
1635 * which is typically around 2:1. We account for this by assigning
1636 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1637 * In effect, this means that we'll select the metaslab with the most
1638 * free bandwidth rather than simply the one with the most free space.
1639 */
1640 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
1641 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1642 ASSERT(weight >= space && weight <= 2 * space);
1643 }
1644
1645 /*
1646 * If this metaslab is one we're actively using, adjust its
1647 * weight to make it preferable to any inactive metaslab so
1648 * we'll polish it off. If the fragmentation on this metaslab
1649 * has exceed our threshold, then don't mark it active.
1650 */
1651 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1652 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1653 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1654 }
1655
1656 WEIGHT_SET_SPACEBASED(weight);
1657 return (weight);
1658 }
1659
1660 /*
1661 * Return the weight of the specified metaslab, according to the segment-based
1662 * weighting algorithm. The metaslab must be loaded. This function can
1663 * be called within a sync pass since it relies only on the metaslab's
1664 * range tree which is always accurate when the metaslab is loaded.
1665 */
1666 static uint64_t
1667 metaslab_weight_from_range_tree(metaslab_t *msp)
1668 {
1669 uint64_t weight = 0;
1670 uint32_t segments = 0;
1671
1672 ASSERT(msp->ms_loaded);
1673
1674 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1675 i--) {
1676 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1677 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1678
1679 segments <<= 1;
1680 segments += msp->ms_tree->rt_histogram[i];
1681
1682 /*
1683 * The range tree provides more precision than the space map
1684 * and must be downgraded so that all values fit within the
1685 * space map's histogram. This allows us to compare loaded
1686 * vs. unloaded metaslabs to determine which metaslab is
1687 * considered "best".
1688 */
1689 if (i > max_idx)
1690 continue;
1691
1692 if (segments != 0) {
1693 WEIGHT_SET_COUNT(weight, segments);
1694 WEIGHT_SET_INDEX(weight, i);
1695 WEIGHT_SET_ACTIVE(weight, 0);
1696 break;
1697 }
1698 }
1699 return (weight);
1700 }
1701
1702 /*
1703 * Calculate the weight based on the on-disk histogram. This should only
1704 * be called after a sync pass has completely finished since the on-disk
1705 * information is updated in metaslab_sync().
1706 */
1707 static uint64_t
1708 metaslab_weight_from_spacemap(metaslab_t *msp)
1709 {
1710 uint64_t weight = 0;
1711
1712 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1713 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1714 WEIGHT_SET_COUNT(weight,
1715 msp->ms_sm->sm_phys->smp_histogram[i]);
1716 WEIGHT_SET_INDEX(weight, i +
1717 msp->ms_sm->sm_shift);
1718 WEIGHT_SET_ACTIVE(weight, 0);
1719 break;
1720 }
1721 }
1722 return (weight);
1723 }
1724
1725 /*
1726 * Compute a segment-based weight for the specified metaslab. The weight
1727 * is determined by highest bucket in the histogram. The information
1728 * for the highest bucket is encoded into the weight value.
1729 */
1730 static uint64_t
1731 metaslab_segment_weight(metaslab_t *msp)
1732 {
1733 metaslab_group_t *mg = msp->ms_group;
1734 uint64_t weight = 0;
1735 uint8_t shift = mg->mg_vd->vdev_ashift;
1736
1737 ASSERT(MUTEX_HELD(&msp->ms_lock));
1738
1739 /*
1740 * The metaslab is completely free.
1741 */
1742 if (space_map_allocated(msp->ms_sm) == 0) {
1743 int idx = highbit64(msp->ms_size) - 1;
1744 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1745
1746 if (idx < max_idx) {
1747 WEIGHT_SET_COUNT(weight, 1ULL);
1748 WEIGHT_SET_INDEX(weight, idx);
1749 } else {
1750 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1751 WEIGHT_SET_INDEX(weight, max_idx);
1752 }
1753 WEIGHT_SET_ACTIVE(weight, 0);
1754 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1755
1756 return (weight);
1757 }
1758
1759 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1760
1761 /*
1762 * If the metaslab is fully allocated then just make the weight 0.
1763 */
1764 if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1765 return (0);
1766 /*
1767 * If the metaslab is already loaded, then use the range tree to
1768 * determine the weight. Otherwise, we rely on the space map information
1769 * to generate the weight.
1770 */
1771 if (msp->ms_loaded) {
1772 weight = metaslab_weight_from_range_tree(msp);
1773 } else {
1774 weight = metaslab_weight_from_spacemap(msp);
1775 }
1776
1777 /*
1778 * If the metaslab was active the last time we calculated its weight
1779 * then keep it active. We want to consume the entire region that
1780 * is associated with this weight.
1781 */
1782 if (msp->ms_activation_weight != 0 && weight != 0)
1783 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1784 return (weight);
1785 }
1786
1787 /*
1788 * Determine if we should attempt to allocate from this metaslab. If the
1789 * metaslab has a maximum size then we can quickly determine if the desired
1790 * allocation size can be satisfied. Otherwise, if we're using segment-based
1791 * weighting then we can determine the maximum allocation that this metaslab
1792 * can accommodate based on the index encoded in the weight. If we're using
1793 * space-based weights then rely on the entire weight (excluding the weight
1794 * type bit).
1795 */
1796 boolean_t
1797 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1798 {
1799 boolean_t should_allocate;
1800
1801 if (msp->ms_max_size != 0)
1802 return (msp->ms_max_size >= asize);
1803
1804 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1805 /*
1806 * The metaslab segment weight indicates segments in the
1807 * range [2^i, 2^(i+1)), where i is the index in the weight.
1808 * Since the asize might be in the middle of the range, we
1809 * should attempt the allocation if asize < 2^(i+1).
1810 */
1811 should_allocate = (asize <
1812 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1813 } else {
1814 should_allocate = (asize <=
1815 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1816 }
1817 return (should_allocate);
1818 }
1819 static uint64_t
1820 metaslab_weight(metaslab_t *msp)
1821 {
1822 vdev_t *vd = msp->ms_group->mg_vd;
1823 spa_t *spa = vd->vdev_spa;
1824 uint64_t weight;
1825
1826 ASSERT(MUTEX_HELD(&msp->ms_lock));
1827
1828 /*
1829 * If this vdev is in the process of being removed, there is nothing
1830 * for us to do here.
1831 */
1832 if (vd->vdev_removing)
1833 return (0);
1834
1835 metaslab_set_fragmentation(msp);
1836
1837 /*
1838 * Update the maximum size if the metaslab is loaded. This will
1839 * ensure that we get an accurate maximum size if newly freed space
1840 * has been added back into the free tree.
1841 */
1842 if (msp->ms_loaded)
1843 msp->ms_max_size = metaslab_block_maxsize(msp);
1844
1845 /*
1846 * Segment-based weighting requires space map histogram support.
1847 */
1848 if (zfs_metaslab_segment_weight_enabled &&
1849 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1850 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1851 sizeof (space_map_phys_t))) {
1852 weight = metaslab_segment_weight(msp);
1853 } else {
1854 weight = metaslab_space_weight(msp);
1855 }
1856 return (weight);
1857 }
1858
1859 static int
1860 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1861 {
1862 ASSERT(MUTEX_HELD(&msp->ms_lock));
1863
1864 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1865 metaslab_load_wait(msp);
1866 if (!msp->ms_loaded) {
1867 int error = metaslab_load(msp);
1868 if (error) {
1869 metaslab_group_sort(msp->ms_group, msp, 0);
1870 return (error);
1871 }
1872 }
1873
1874 msp->ms_activation_weight = msp->ms_weight;
1875 metaslab_group_sort(msp->ms_group, msp,
1876 msp->ms_weight | activation_weight);
1877 }
1878 ASSERT(msp->ms_loaded);
1879 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1880
1881 return (0);
1882 }
1883
1884 static void
1885 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1886 {
1887 ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
1888
1889 /*
1890 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1891 * this metaslab again. In that case, it had better be empty,
1892 * or we would be leaving space on the table.
1893 */
1894 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
1895 size >= SPA_MINBLOCKSIZE ||
1896 range_tree_space(msp->ms_tree) == 0);
1897 ASSERT0(weight & METASLAB_ACTIVE_MASK);
1898
1899 msp->ms_activation_weight = 0;
1900 metaslab_group_sort(msp->ms_group, msp, weight);
1901 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1902 }
1903
1904 /*
1905 * Segment-based metaslabs are activated once and remain active until
1906 * we either fail an allocation attempt (similar to space-based metaslabs)
1907 * or have exhausted the free space in zfs_metaslab_switch_threshold
1908 * buckets since the metaslab was activated. This function checks to see
1909 * if we've exhaused the zfs_metaslab_switch_threshold buckets in the
1910 * metaslab and passivates it proactively. This will allow us to select a
1911 * metaslab with a larger contiguous region, if any, remaining within this
1912 * metaslab group. If we're in sync pass > 1, then we continue using this
1913 * metaslab so that we don't dirty more block and cause more sync passes.
1914 */
1915 void
1916 metaslab_segment_may_passivate(metaslab_t *msp)
1917 {
1918 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1919
1920 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1921 return;
1922
1923 /*
1924 * Since we are in the middle of a sync pass, the most accurate
1925 * information that is accessible to us is the in-core range tree
1926 * histogram; calculate the new weight based on that information.
1927 */
1928 uint64_t weight = metaslab_weight_from_range_tree(msp);
1929 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
1930 int current_idx = WEIGHT_GET_INDEX(weight);
1931
1932 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
1933 metaslab_passivate(msp, weight);
1934 }
1935
1936 static void
1937 metaslab_preload(void *arg)
1938 {
1939 metaslab_t *msp = arg;
1940 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1941 fstrans_cookie_t cookie = spl_fstrans_mark();
1942
1943 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1944
1945 mutex_enter(&msp->ms_lock);
1946 metaslab_load_wait(msp);
1947 if (!msp->ms_loaded)
1948 (void) metaslab_load(msp);
1949 msp->ms_selected_txg = spa_syncing_txg(spa);
1950 mutex_exit(&msp->ms_lock);
1951 spl_fstrans_unmark(cookie);
1952 }
1953
1954 static void
1955 metaslab_group_preload(metaslab_group_t *mg)
1956 {
1957 spa_t *spa = mg->mg_vd->vdev_spa;
1958 metaslab_t *msp;
1959 avl_tree_t *t = &mg->mg_metaslab_tree;
1960 int m = 0;
1961
1962 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1963 taskq_wait_outstanding(mg->mg_taskq, 0);
1964 return;
1965 }
1966
1967 mutex_enter(&mg->mg_lock);
1968
1969 /*
1970 * Load the next potential metaslabs
1971 */
1972 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1973 ASSERT3P(msp->ms_group, ==, mg);
1974
1975 /*
1976 * We preload only the maximum number of metaslabs specified
1977 * by metaslab_preload_limit. If a metaslab is being forced
1978 * to condense then we preload it too. This will ensure
1979 * that force condensing happens in the next txg.
1980 */
1981 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1982 continue;
1983 }
1984
1985 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1986 msp, TQ_SLEEP) != TASKQID_INVALID);
1987 }
1988 mutex_exit(&mg->mg_lock);
1989 }
1990
1991 /*
1992 * Determine if the space map's on-disk footprint is past our tolerance
1993 * for inefficiency. We would like to use the following criteria to make
1994 * our decision:
1995 *
1996 * 1. The size of the space map object should not dramatically increase as a
1997 * result of writing out the free space range tree.
1998 *
1999 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2000 * times the size than the free space range tree representation
2001 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2002 *
2003 * 3. The on-disk size of the space map should actually decrease.
2004 *
2005 * Checking the first condition is tricky since we don't want to walk
2006 * the entire AVL tree calculating the estimated on-disk size. Instead we
2007 * use the size-ordered range tree in the metaslab and calculate the
2008 * size required to write out the largest segment in our free tree. If the
2009 * size required to represent that segment on disk is larger than the space
2010 * map object then we avoid condensing this map.
2011 *
2012 * To determine the second criterion we use a best-case estimate and assume
2013 * each segment can be represented on-disk as a single 64-bit entry. We refer
2014 * to this best-case estimate as the space map's minimal form.
2015 *
2016 * Unfortunately, we cannot compute the on-disk size of the space map in this
2017 * context because we cannot accurately compute the effects of compression, etc.
2018 * Instead, we apply the heuristic described in the block comment for
2019 * zfs_metaslab_condense_block_threshold - we only condense if the space used
2020 * is greater than a threshold number of blocks.
2021 */
2022 static boolean_t
2023 metaslab_should_condense(metaslab_t *msp)
2024 {
2025 space_map_t *sm = msp->ms_sm;
2026 range_seg_t *rs;
2027 uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2028 dmu_object_info_t doi;
2029 uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
2030
2031 ASSERT(MUTEX_HELD(&msp->ms_lock));
2032 ASSERT(msp->ms_loaded);
2033
2034 /*
2035 * Use the ms_size_tree range tree, which is ordered by size, to
2036 * obtain the largest segment in the free tree. We always condense
2037 * metaslabs that are empty and metaslabs for which a condense
2038 * request has been made.
2039 */
2040 rs = avl_last(&msp->ms_size_tree);
2041 if (rs == NULL || msp->ms_condense_wanted)
2042 return (B_TRUE);
2043
2044 /*
2045 * Calculate the number of 64-bit entries this segment would
2046 * require when written to disk. If this single segment would be
2047 * larger on-disk than the entire current on-disk structure, then
2048 * clearly condensing will increase the on-disk structure size.
2049 */
2050 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2051 entries = size / (MIN(size, SM_RUN_MAX));
2052 segsz = entries * sizeof (uint64_t);
2053
2054 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2055 object_size = space_map_length(msp->ms_sm);
2056
2057 dmu_object_info_from_db(sm->sm_dbuf, &doi);
2058 record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2059
2060 return (segsz <= object_size &&
2061 object_size >= (optimal_size * zfs_condense_pct / 100) &&
2062 object_size > zfs_metaslab_condense_block_threshold * record_size);
2063 }
2064
2065 /*
2066 * Condense the on-disk space map representation to its minimized form.
2067 * The minimized form consists of a small number of allocations followed by
2068 * the entries of the free range tree.
2069 */
2070 static void
2071 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2072 {
2073 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2074 range_tree_t *condense_tree;
2075 space_map_t *sm = msp->ms_sm;
2076
2077 ASSERT(MUTEX_HELD(&msp->ms_lock));
2078 ASSERT3U(spa_sync_pass(spa), ==, 1);
2079 ASSERT(msp->ms_loaded);
2080
2081
2082 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2083 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2084 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2085 msp->ms_group->mg_vd->vdev_spa->spa_name,
2086 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2087 msp->ms_condense_wanted ? "TRUE" : "FALSE");
2088
2089 msp->ms_condense_wanted = B_FALSE;
2090
2091 /*
2092 * Create an range tree that is 100% allocated. We remove segments
2093 * that have been freed in this txg, any deferred frees that exist,
2094 * and any allocation in the future. Removing segments should be
2095 * a relatively inexpensive operation since we expect these trees to
2096 * have a small number of nodes.
2097 */
2098 condense_tree = range_tree_create(NULL, NULL);
2099 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2100
2101 /*
2102 * Remove what's been freed in this txg from the condense_tree.
2103 * Since we're in sync_pass 1, we know that all the frees from
2104 * this txg are in the freeingtree.
2105 */
2106 range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2107
2108 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2109 range_tree_walk(msp->ms_defertree[t],
2110 range_tree_remove, condense_tree);
2111 }
2112
2113 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2114 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2115 range_tree_remove, condense_tree);
2116 }
2117
2118 /*
2119 * We're about to drop the metaslab's lock thus allowing
2120 * other consumers to change it's content. Set the
2121 * metaslab's ms_condensing flag to ensure that
2122 * allocations on this metaslab do not occur while we're
2123 * in the middle of committing it to disk. This is only critical
2124 * for the ms_tree as all other range trees use per txg
2125 * views of their content.
2126 */
2127 msp->ms_condensing = B_TRUE;
2128
2129 mutex_exit(&msp->ms_lock);
2130 space_map_truncate(sm, tx);
2131
2132 /*
2133 * While we would ideally like to create a space map representation
2134 * that consists only of allocation records, doing so can be
2135 * prohibitively expensive because the in-core free tree can be
2136 * large, and therefore computationally expensive to subtract
2137 * from the condense_tree. Instead we sync out two trees, a cheap
2138 * allocation only tree followed by the in-core free tree. While not
2139 * optimal, this is typically close to optimal, and much cheaper to
2140 * compute.
2141 */
2142 space_map_write(sm, condense_tree, SM_ALLOC, tx);
2143 range_tree_vacate(condense_tree, NULL, NULL);
2144 range_tree_destroy(condense_tree);
2145
2146 space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2147 mutex_enter(&msp->ms_lock);
2148 msp->ms_condensing = B_FALSE;
2149 }
2150
2151 /*
2152 * Write a metaslab to disk in the context of the specified transaction group.
2153 */
2154 void
2155 metaslab_sync(metaslab_t *msp, uint64_t txg)
2156 {
2157 metaslab_group_t *mg = msp->ms_group;
2158 vdev_t *vd = mg->mg_vd;
2159 spa_t *spa = vd->vdev_spa;
2160 objset_t *mos = spa_meta_objset(spa);
2161 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2162 dmu_tx_t *tx;
2163 uint64_t object = space_map_object(msp->ms_sm);
2164
2165 ASSERT(!vd->vdev_ishole);
2166
2167 /*
2168 * This metaslab has just been added so there's no work to do now.
2169 */
2170 if (msp->ms_freeingtree == NULL) {
2171 ASSERT3P(alloctree, ==, NULL);
2172 return;
2173 }
2174
2175 ASSERT3P(alloctree, !=, NULL);
2176 ASSERT3P(msp->ms_freeingtree, !=, NULL);
2177 ASSERT3P(msp->ms_freedtree, !=, NULL);
2178
2179 /*
2180 * Normally, we don't want to process a metaslab if there
2181 * are no allocations or frees to perform. However, if the metaslab
2182 * is being forced to condense and it's loaded, we need to let it
2183 * through.
2184 */
2185 if (range_tree_space(alloctree) == 0 &&
2186 range_tree_space(msp->ms_freeingtree) == 0 &&
2187 !(msp->ms_loaded && msp->ms_condense_wanted))
2188 return;
2189
2190
2191 VERIFY(txg <= spa_final_dirty_txg(spa));
2192
2193 /*
2194 * The only state that can actually be changing concurrently with
2195 * metaslab_sync() is the metaslab's ms_tree. No other thread can
2196 * be modifying this txg's alloctree, freeingtree, freedtree, or
2197 * space_map_phys_t. We drop ms_lock whenever we could call
2198 * into the DMU, because the DMU can call down to us
2199 * (e.g. via zio_free()) at any time.
2200 *
2201 * The spa_vdev_remove_thread() can be reading metaslab state
2202 * concurrently, and it is locked out by the ms_sync_lock. Note
2203 * that the ms_lock is insufficient for this, because it is dropped
2204 * by space_map_write().
2205 */
2206
2207 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2208
2209 if (msp->ms_sm == NULL) {
2210 uint64_t new_object;
2211
2212 new_object = space_map_alloc(mos, tx);
2213 VERIFY3U(new_object, !=, 0);
2214
2215 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2216 msp->ms_start, msp->ms_size, vd->vdev_ashift));
2217 ASSERT(msp->ms_sm != NULL);
2218 }
2219
2220 mutex_enter(&msp->ms_sync_lock);
2221 mutex_enter(&msp->ms_lock);
2222
2223 /*
2224 * Note: metaslab_condense() clears the space map's histogram.
2225 * Therefore we must verify and remove this histogram before
2226 * condensing.
2227 */
2228 metaslab_group_histogram_verify(mg);
2229 metaslab_class_histogram_verify(mg->mg_class);
2230 metaslab_group_histogram_remove(mg, msp);
2231
2232 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2233 metaslab_should_condense(msp)) {
2234 metaslab_condense(msp, txg, tx);
2235 } else {
2236 mutex_exit(&msp->ms_lock);
2237 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2238 space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2239 mutex_enter(&msp->ms_lock);
2240 }
2241
2242 if (msp->ms_loaded) {
2243 /*
2244 * When the space map is loaded, we have an accurate
2245 * histogram in the range tree. This gives us an opportunity
2246 * to bring the space map's histogram up-to-date so we clear
2247 * it first before updating it.
2248 */
2249 space_map_histogram_clear(msp->ms_sm);
2250 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2251
2252 /*
2253 * Since we've cleared the histogram we need to add back
2254 * any free space that has already been processed, plus
2255 * any deferred space. This allows the on-disk histogram
2256 * to accurately reflect all free space even if some space
2257 * is not yet available for allocation (i.e. deferred).
2258 */
2259 space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2260
2261 /*
2262 * Add back any deferred free space that has not been
2263 * added back into the in-core free tree yet. This will
2264 * ensure that we don't end up with a space map histogram
2265 * that is completely empty unless the metaslab is fully
2266 * allocated.
2267 */
2268 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2269 space_map_histogram_add(msp->ms_sm,
2270 msp->ms_defertree[t], tx);
2271 }
2272 }
2273
2274 /*
2275 * Always add the free space from this sync pass to the space
2276 * map histogram. We want to make sure that the on-disk histogram
2277 * accounts for all free space. If the space map is not loaded,
2278 * then we will lose some accuracy but will correct it the next
2279 * time we load the space map.
2280 */
2281 space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2282
2283 metaslab_group_histogram_add(mg, msp);
2284 metaslab_group_histogram_verify(mg);
2285 metaslab_class_histogram_verify(mg->mg_class);
2286
2287 /*
2288 * For sync pass 1, we avoid traversing this txg's free range tree
2289 * and instead will just swap the pointers for freeingtree and
2290 * freedtree. We can safely do this since the freed_tree is
2291 * guaranteed to be empty on the initial pass.
2292 */
2293 if (spa_sync_pass(spa) == 1) {
2294 range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2295 } else {
2296 range_tree_vacate(msp->ms_freeingtree,
2297 range_tree_add, msp->ms_freedtree);
2298 }
2299 range_tree_vacate(alloctree, NULL, NULL);
2300
2301 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2302 ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2303 ASSERT0(range_tree_space(msp->ms_freeingtree));
2304
2305 mutex_exit(&msp->ms_lock);
2306
2307 if (object != space_map_object(msp->ms_sm)) {
2308 object = space_map_object(msp->ms_sm);
2309 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2310 msp->ms_id, sizeof (uint64_t), &object, tx);
2311 }
2312 mutex_exit(&msp->ms_sync_lock);
2313 dmu_tx_commit(tx);
2314 }
2315
2316 /*
2317 * Called after a transaction group has completely synced to mark
2318 * all of the metaslab's free space as usable.
2319 */
2320 void
2321 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2322 {
2323 metaslab_group_t *mg = msp->ms_group;
2324 vdev_t *vd = mg->mg_vd;
2325 spa_t *spa = vd->vdev_spa;
2326 range_tree_t **defer_tree;
2327 int64_t alloc_delta, defer_delta;
2328 boolean_t defer_allowed = B_TRUE;
2329
2330 ASSERT(!vd->vdev_ishole);
2331
2332 mutex_enter(&msp->ms_lock);
2333
2334 /*
2335 * If this metaslab is just becoming available, initialize its
2336 * range trees and add its capacity to the vdev.
2337 */
2338 if (msp->ms_freedtree == NULL) {
2339 for (int t = 0; t < TXG_SIZE; t++) {
2340 ASSERT(msp->ms_alloctree[t] == NULL);
2341
2342 msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
2343 }
2344
2345 ASSERT3P(msp->ms_freeingtree, ==, NULL);
2346 msp->ms_freeingtree = range_tree_create(NULL, NULL);
2347
2348 ASSERT3P(msp->ms_freedtree, ==, NULL);
2349 msp->ms_freedtree = range_tree_create(NULL, NULL);
2350
2351 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2352 ASSERT(msp->ms_defertree[t] == NULL);
2353
2354 msp->ms_defertree[t] = range_tree_create(NULL, NULL);
2355 }
2356
2357 vdev_space_update(vd, 0, 0, msp->ms_size);
2358 }
2359
2360 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2361
2362 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2363 metaslab_class_get_alloc(spa_normal_class(spa));
2364 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2365 defer_allowed = B_FALSE;
2366 }
2367
2368 defer_delta = 0;
2369 alloc_delta = space_map_alloc_delta(msp->ms_sm);
2370 if (defer_allowed) {
2371 defer_delta = range_tree_space(msp->ms_freedtree) -
2372 range_tree_space(*defer_tree);
2373 } else {
2374 defer_delta -= range_tree_space(*defer_tree);
2375 }
2376
2377 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2378
2379 /*
2380 * If there's a metaslab_load() in progress, wait for it to complete
2381 * so that we have a consistent view of the in-core space map.
2382 */
2383 metaslab_load_wait(msp);
2384
2385 /*
2386 * Move the frees from the defer_tree back to the free
2387 * range tree (if it's loaded). Swap the freed_tree and the
2388 * defer_tree -- this is safe to do because we've just emptied out
2389 * the defer_tree.
2390 */
2391 range_tree_vacate(*defer_tree,
2392 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2393 if (defer_allowed) {
2394 range_tree_swap(&msp->ms_freedtree, defer_tree);
2395 } else {
2396 range_tree_vacate(msp->ms_freedtree,
2397 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2398 }
2399
2400 space_map_update(msp->ms_sm);
2401
2402 msp->ms_deferspace += defer_delta;
2403 ASSERT3S(msp->ms_deferspace, >=, 0);
2404 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2405 if (msp->ms_deferspace != 0) {
2406 /*
2407 * Keep syncing this metaslab until all deferred frees
2408 * are back in circulation.
2409 */
2410 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2411 }
2412
2413 /*
2414 * Calculate the new weights before unloading any metaslabs.
2415 * This will give us the most accurate weighting.
2416 */
2417 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2418
2419 /*
2420 * If the metaslab is loaded and we've not tried to load or allocate
2421 * from it in 'metaslab_unload_delay' txgs, then unload it.
2422 */
2423 if (msp->ms_loaded &&
2424 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2425
2426 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2427 VERIFY0(range_tree_space(
2428 msp->ms_alloctree[(txg + t) & TXG_MASK]));
2429 }
2430
2431 if (!metaslab_debug_unload)
2432 metaslab_unload(msp);
2433 }
2434
2435 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2436 ASSERT0(range_tree_space(msp->ms_freeingtree));
2437 ASSERT0(range_tree_space(msp->ms_freedtree));
2438
2439 mutex_exit(&msp->ms_lock);
2440 }
2441
2442 void
2443 metaslab_sync_reassess(metaslab_group_t *mg)
2444 {
2445 spa_t *spa = mg->mg_class->mc_spa;
2446
2447 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2448 metaslab_group_alloc_update(mg);
2449 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2450
2451 /*
2452 * Preload the next potential metaslabs but only on active
2453 * metaslab groups. We can get into a state where the metaslab
2454 * is no longer active since we dirty metaslabs as we remove a
2455 * a device, thus potentially making the metaslab group eligible
2456 * for preloading.
2457 */
2458 if (mg->mg_activation_count > 0) {
2459 metaslab_group_preload(mg);
2460 }
2461 spa_config_exit(spa, SCL_ALLOC, FTAG);
2462 }
2463
2464 static uint64_t
2465 metaslab_distance(metaslab_t *msp, dva_t *dva)
2466 {
2467 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2468 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2469 uint64_t start = msp->ms_id;
2470
2471 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2472 return (1ULL << 63);
2473
2474 if (offset < start)
2475 return ((start - offset) << ms_shift);
2476 if (offset > start)
2477 return ((offset - start) << ms_shift);
2478 return (0);
2479 }
2480
2481 /*
2482 * ==========================================================================
2483 * Metaslab allocation tracing facility
2484 * ==========================================================================
2485 */
2486 #ifdef _METASLAB_TRACING
2487 kstat_t *metaslab_trace_ksp;
2488 kstat_named_t metaslab_trace_over_limit;
2489
2490 void
2491 metaslab_alloc_trace_init(void)
2492 {
2493 ASSERT(metaslab_alloc_trace_cache == NULL);
2494 metaslab_alloc_trace_cache = kmem_cache_create(
2495 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2496 0, NULL, NULL, NULL, NULL, NULL, 0);
2497 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2498 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2499 if (metaslab_trace_ksp != NULL) {
2500 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2501 kstat_named_init(&metaslab_trace_over_limit,
2502 "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2503 kstat_install(metaslab_trace_ksp);
2504 }
2505 }
2506
2507 void
2508 metaslab_alloc_trace_fini(void)
2509 {
2510 if (metaslab_trace_ksp != NULL) {
2511 kstat_delete(metaslab_trace_ksp);
2512 metaslab_trace_ksp = NULL;
2513 }
2514 kmem_cache_destroy(metaslab_alloc_trace_cache);
2515 metaslab_alloc_trace_cache = NULL;
2516 }
2517
2518 /*
2519 * Add an allocation trace element to the allocation tracing list.
2520 */
2521 static void
2522 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2523 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2524 {
2525 metaslab_alloc_trace_t *mat;
2526
2527 if (!metaslab_trace_enabled)
2528 return;
2529
2530 /*
2531 * When the tracing list reaches its maximum we remove
2532 * the second element in the list before adding a new one.
2533 * By removing the second element we preserve the original
2534 * entry as a clue to what allocations steps have already been
2535 * performed.
2536 */
2537 if (zal->zal_size == metaslab_trace_max_entries) {
2538 metaslab_alloc_trace_t *mat_next;
2539 #ifdef DEBUG
2540 panic("too many entries in allocation list");
2541 #endif
2542 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2543 zal->zal_size--;
2544 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2545 list_remove(&zal->zal_list, mat_next);
2546 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2547 }
2548
2549 mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2550 list_link_init(&mat->mat_list_node);
2551 mat->mat_mg = mg;
2552 mat->mat_msp = msp;
2553 mat->mat_size = psize;
2554 mat->mat_dva_id = dva_id;
2555 mat->mat_offset = offset;
2556 mat->mat_weight = 0;
2557
2558 if (msp != NULL)
2559 mat->mat_weight = msp->ms_weight;
2560
2561 /*
2562 * The list is part of the zio so locking is not required. Only
2563 * a single thread will perform allocations for a given zio.
2564 */
2565 list_insert_tail(&zal->zal_list, mat);
2566 zal->zal_size++;
2567
2568 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2569 }
2570
2571 void
2572 metaslab_trace_init(zio_alloc_list_t *zal)
2573 {
2574 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2575 offsetof(metaslab_alloc_trace_t, mat_list_node));
2576 zal->zal_size = 0;
2577 }
2578
2579 void
2580 metaslab_trace_fini(zio_alloc_list_t *zal)
2581 {
2582 metaslab_alloc_trace_t *mat;
2583
2584 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2585 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2586 list_destroy(&zal->zal_list);
2587 zal->zal_size = 0;
2588 }
2589 #else
2590
2591 #define metaslab_trace_add(zal, mg, msp, psize, id, off)
2592
2593 void
2594 metaslab_alloc_trace_init(void)
2595 {
2596 }
2597
2598 void
2599 metaslab_alloc_trace_fini(void)
2600 {
2601 }
2602
2603 void
2604 metaslab_trace_init(zio_alloc_list_t *zal)
2605 {
2606 }
2607
2608 void
2609 metaslab_trace_fini(zio_alloc_list_t *zal)
2610 {
2611 }
2612
2613 #endif /* _METASLAB_TRACING */
2614
2615 /*
2616 * ==========================================================================
2617 * Metaslab block operations
2618 * ==========================================================================
2619 */
2620
2621 static void
2622 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2623 {
2624 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2625 flags & METASLAB_DONT_THROTTLE)
2626 return;
2627
2628 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2629 if (!mg->mg_class->mc_alloc_throttle_enabled)
2630 return;
2631
2632 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2633 }
2634
2635 void
2636 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2637 {
2638 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2639 flags & METASLAB_DONT_THROTTLE)
2640 return;
2641
2642 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2643 if (!mg->mg_class->mc_alloc_throttle_enabled)
2644 return;
2645
2646 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2647 }
2648
2649 void
2650 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2651 {
2652 #ifdef ZFS_DEBUG
2653 const dva_t *dva = bp->blk_dva;
2654 int ndvas = BP_GET_NDVAS(bp);
2655
2656 for (int d = 0; d < ndvas; d++) {
2657 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2658 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2659 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2660 }
2661 #endif
2662 }
2663
2664 static uint64_t
2665 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2666 {
2667 uint64_t start;
2668 range_tree_t *rt = msp->ms_tree;
2669 metaslab_class_t *mc = msp->ms_group->mg_class;
2670
2671 VERIFY(!msp->ms_condensing);
2672
2673 start = mc->mc_ops->msop_alloc(msp, size);
2674 if (start != -1ULL) {
2675 metaslab_group_t *mg = msp->ms_group;
2676 vdev_t *vd = mg->mg_vd;
2677
2678 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2679 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2680 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2681 range_tree_remove(rt, start, size);
2682
2683 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2684 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2685
2686 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2687
2688 /* Track the last successful allocation */
2689 msp->ms_alloc_txg = txg;
2690 metaslab_verify_space(msp, txg);
2691 }
2692
2693 /*
2694 * Now that we've attempted the allocation we need to update the
2695 * metaslab's maximum block size since it may have changed.
2696 */
2697 msp->ms_max_size = metaslab_block_maxsize(msp);
2698 return (start);
2699 }
2700
2701 static uint64_t
2702 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2703 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2704 {
2705 metaslab_t *msp = NULL;
2706 uint64_t offset = -1ULL;
2707 uint64_t activation_weight;
2708 uint64_t target_distance;
2709 int i;
2710
2711 activation_weight = METASLAB_WEIGHT_PRIMARY;
2712 for (i = 0; i < d; i++) {
2713 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2714 activation_weight = METASLAB_WEIGHT_SECONDARY;
2715 break;
2716 }
2717 }
2718
2719 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2720 search->ms_weight = UINT64_MAX;
2721 search->ms_start = 0;
2722 for (;;) {
2723 boolean_t was_active;
2724 avl_tree_t *t = &mg->mg_metaslab_tree;
2725 avl_index_t idx;
2726
2727 mutex_enter(&mg->mg_lock);
2728
2729 /*
2730 * Find the metaslab with the highest weight that is less
2731 * than what we've already tried. In the common case, this
2732 * means that we will examine each metaslab at most once.
2733 * Note that concurrent callers could reorder metaslabs
2734 * by activation/passivation once we have dropped the mg_lock.
2735 * If a metaslab is activated by another thread, and we fail
2736 * to allocate from the metaslab we have selected, we may
2737 * not try the newly-activated metaslab, and instead activate
2738 * another metaslab. This is not optimal, but generally
2739 * does not cause any problems (a possible exception being
2740 * if every metaslab is completely full except for the
2741 * the newly-activated metaslab which we fail to examine).
2742 */
2743 msp = avl_find(t, search, &idx);
2744 if (msp == NULL)
2745 msp = avl_nearest(t, idx, AVL_AFTER);
2746 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2747
2748 if (!metaslab_should_allocate(msp, asize)) {
2749 metaslab_trace_add(zal, mg, msp, asize, d,
2750 TRACE_TOO_SMALL);
2751 continue;
2752 }
2753
2754 /*
2755 * If the selected metaslab is condensing, skip it.
2756 */
2757 if (msp->ms_condensing)
2758 continue;
2759
2760 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2761 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2762 break;
2763
2764 target_distance = min_distance +
2765 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2766 min_distance >> 1);
2767
2768 for (i = 0; i < d; i++) {
2769 if (metaslab_distance(msp, &dva[i]) <
2770 target_distance)
2771 break;
2772 }
2773 if (i == d)
2774 break;
2775 }
2776 mutex_exit(&mg->mg_lock);
2777 if (msp == NULL) {
2778 kmem_free(search, sizeof (*search));
2779 return (-1ULL);
2780 }
2781 search->ms_weight = msp->ms_weight;
2782 search->ms_start = msp->ms_start + 1;
2783
2784 mutex_enter(&msp->ms_lock);
2785
2786 /*
2787 * Ensure that the metaslab we have selected is still
2788 * capable of handling our request. It's possible that
2789 * another thread may have changed the weight while we
2790 * were blocked on the metaslab lock. We check the
2791 * active status first to see if we need to reselect
2792 * a new metaslab.
2793 */
2794 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2795 mutex_exit(&msp->ms_lock);
2796 continue;
2797 }
2798
2799 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2800 activation_weight == METASLAB_WEIGHT_PRIMARY) {
2801 metaslab_passivate(msp,
2802 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2803 mutex_exit(&msp->ms_lock);
2804 continue;
2805 }
2806
2807 if (metaslab_activate(msp, activation_weight) != 0) {
2808 mutex_exit(&msp->ms_lock);
2809 continue;
2810 }
2811 msp->ms_selected_txg = txg;
2812
2813 /*
2814 * Now that we have the lock, recheck to see if we should
2815 * continue to use this metaslab for this allocation. The
2816 * the metaslab is now loaded so metaslab_should_allocate() can
2817 * accurately determine if the allocation attempt should
2818 * proceed.
2819 */
2820 if (!metaslab_should_allocate(msp, asize)) {
2821 /* Passivate this metaslab and select a new one. */
2822 metaslab_trace_add(zal, mg, msp, asize, d,
2823 TRACE_TOO_SMALL);
2824 goto next;
2825 }
2826
2827
2828 /*
2829 * If this metaslab is currently condensing then pick again as
2830 * we can't manipulate this metaslab until it's committed
2831 * to disk.
2832 */
2833 if (msp->ms_condensing) {
2834 metaslab_trace_add(zal, mg, msp, asize, d,
2835 TRACE_CONDENSING);
2836 mutex_exit(&msp->ms_lock);
2837 continue;
2838 }
2839
2840 offset = metaslab_block_alloc(msp, asize, txg);
2841 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2842
2843 if (offset != -1ULL) {
2844 /* Proactively passivate the metaslab, if needed */
2845 metaslab_segment_may_passivate(msp);
2846 break;
2847 }
2848 next:
2849 ASSERT(msp->ms_loaded);
2850
2851 /*
2852 * We were unable to allocate from this metaslab so determine
2853 * a new weight for this metaslab. Now that we have loaded
2854 * the metaslab we can provide a better hint to the metaslab
2855 * selector.
2856 *
2857 * For space-based metaslabs, we use the maximum block size.
2858 * This information is only available when the metaslab
2859 * is loaded and is more accurate than the generic free
2860 * space weight that was calculated by metaslab_weight().
2861 * This information allows us to quickly compare the maximum
2862 * available allocation in the metaslab to the allocation
2863 * size being requested.
2864 *
2865 * For segment-based metaslabs, determine the new weight
2866 * based on the highest bucket in the range tree. We
2867 * explicitly use the loaded segment weight (i.e. the range
2868 * tree histogram) since it contains the space that is
2869 * currently available for allocation and is accurate
2870 * even within a sync pass.
2871 */
2872 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2873 uint64_t weight = metaslab_block_maxsize(msp);
2874 WEIGHT_SET_SPACEBASED(weight);
2875 metaslab_passivate(msp, weight);
2876 } else {
2877 metaslab_passivate(msp,
2878 metaslab_weight_from_range_tree(msp));
2879 }
2880
2881 /*
2882 * We have just failed an allocation attempt, check
2883 * that metaslab_should_allocate() agrees. Otherwise,
2884 * we may end up in an infinite loop retrying the same
2885 * metaslab.
2886 */
2887 ASSERT(!metaslab_should_allocate(msp, asize));
2888 mutex_exit(&msp->ms_lock);
2889 }
2890 mutex_exit(&msp->ms_lock);
2891 kmem_free(search, sizeof (*search));
2892 return (offset);
2893 }
2894
2895 static uint64_t
2896 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2897 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2898 {
2899 uint64_t offset;
2900 ASSERT(mg->mg_initialized);
2901
2902 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2903 min_distance, dva, d);
2904
2905 mutex_enter(&mg->mg_lock);
2906 if (offset == -1ULL) {
2907 mg->mg_failed_allocations++;
2908 metaslab_trace_add(zal, mg, NULL, asize, d,
2909 TRACE_GROUP_FAILURE);
2910 if (asize == SPA_GANGBLOCKSIZE) {
2911 /*
2912 * This metaslab group was unable to allocate
2913 * the minimum gang block size so it must be out of
2914 * space. We must notify the allocation throttle
2915 * to start skipping allocation attempts to this
2916 * metaslab group until more space becomes available.
2917 * Note: this failure cannot be caused by the
2918 * allocation throttle since the allocation throttle
2919 * is only responsible for skipping devices and
2920 * not failing block allocations.
2921 */
2922 mg->mg_no_free_space = B_TRUE;
2923 }
2924 }
2925 mg->mg_allocations++;
2926 mutex_exit(&mg->mg_lock);
2927 return (offset);
2928 }
2929
2930 /*
2931 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2932 * on the same vdev as an existing DVA of this BP, then try to allocate it
2933 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
2934 * existing DVAs.
2935 */
2936 int ditto_same_vdev_distance_shift = 3;
2937
2938 /*
2939 * Allocate a block for the specified i/o.
2940 */
2941 int
2942 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2943 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2944 zio_alloc_list_t *zal)
2945 {
2946 metaslab_group_t *mg, *fast_mg, *rotor;
2947 vdev_t *vd;
2948 boolean_t try_hard = B_FALSE;
2949
2950 ASSERT(!DVA_IS_VALID(&dva[d]));
2951
2952 /*
2953 * For testing, make some blocks above a certain size be gang blocks.
2954 */
2955 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
2956 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2957 return (SET_ERROR(ENOSPC));
2958 }
2959
2960 /*
2961 * Start at the rotor and loop through all mgs until we find something.
2962 * Note that there's no locking on mc_rotor or mc_aliquot because
2963 * nothing actually breaks if we miss a few updates -- we just won't
2964 * allocate quite as evenly. It all balances out over time.
2965 *
2966 * If we are doing ditto or log blocks, try to spread them across
2967 * consecutive vdevs. If we're forced to reuse a vdev before we've
2968 * allocated all of our ditto blocks, then try and spread them out on
2969 * that vdev as much as possible. If it turns out to not be possible,
2970 * gradually lower our standards until anything becomes acceptable.
2971 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2972 * gives us hope of containing our fault domains to something we're
2973 * able to reason about. Otherwise, any two top-level vdev failures
2974 * will guarantee the loss of data. With consecutive allocation,
2975 * only two adjacent top-level vdev failures will result in data loss.
2976 *
2977 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2978 * ourselves on the same vdev as our gang block header. That
2979 * way, we can hope for locality in vdev_cache, plus it makes our
2980 * fault domains something tractable.
2981 */
2982 if (hintdva) {
2983 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2984
2985 /*
2986 * It's possible the vdev we're using as the hint no
2987 * longer exists or its mg has been closed (e.g. by
2988 * device removal). Consult the rotor when
2989 * all else fails.
2990 */
2991 if (vd != NULL && vd->vdev_mg != NULL) {
2992 mg = vd->vdev_mg;
2993
2994 if (flags & METASLAB_HINTBP_AVOID &&
2995 mg->mg_next != NULL)
2996 mg = mg->mg_next;
2997 } else {
2998 mg = mc->mc_rotor;
2999 }
3000 } else if (d != 0) {
3001 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3002 mg = vd->vdev_mg->mg_next;
3003 } else if (flags & METASLAB_FASTWRITE) {
3004 mg = fast_mg = mc->mc_rotor;
3005
3006 do {
3007 if (fast_mg->mg_vd->vdev_pending_fastwrite <
3008 mg->mg_vd->vdev_pending_fastwrite)
3009 mg = fast_mg;
3010 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
3011
3012 } else {
3013 mg = mc->mc_rotor;
3014 }
3015
3016 /*
3017 * If the hint put us into the wrong metaslab class, or into a
3018 * metaslab group that has been passivated, just follow the rotor.
3019 */
3020 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3021 mg = mc->mc_rotor;
3022
3023 rotor = mg;
3024 top:
3025 do {
3026 boolean_t allocatable;
3027
3028 ASSERT(mg->mg_activation_count == 1);
3029 vd = mg->mg_vd;
3030
3031 /*
3032 * Don't allocate from faulted devices.
3033 */
3034 if (try_hard) {
3035 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3036 allocatable = vdev_allocatable(vd);
3037 spa_config_exit(spa, SCL_ZIO, FTAG);
3038 } else {
3039 allocatable = vdev_allocatable(vd);
3040 }
3041
3042 /*
3043 * Determine if the selected metaslab group is eligible
3044 * for allocations. If we're ganging then don't allow
3045 * this metaslab group to skip allocations since that would
3046 * inadvertently return ENOSPC and suspend the pool
3047 * even though space is still available.
3048 */
3049 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3050 allocatable = metaslab_group_allocatable(mg, rotor,
3051 psize);
3052 }
3053
3054 if (!allocatable) {
3055 metaslab_trace_add(zal, mg, NULL, psize, d,
3056 TRACE_NOT_ALLOCATABLE);
3057 goto next;
3058 }
3059
3060 ASSERT(mg->mg_initialized);
3061
3062 /*
3063 * Avoid writing single-copy data to a failing,
3064 * non-redundant vdev, unless we've already tried all
3065 * other vdevs.
3066 */
3067 if ((vd->vdev_stat.vs_write_errors > 0 ||
3068 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3069 d == 0 && !try_hard && vd->vdev_children == 0) {
3070 metaslab_trace_add(zal, mg, NULL, psize, d,
3071 TRACE_VDEV_ERROR);
3072 goto next;
3073 }
3074
3075 ASSERT(mg->mg_class == mc);
3076
3077 /*
3078 * If we don't need to try hard, then require that the
3079 * block be 1/8th of the device away from any other DVAs
3080 * in this BP. If we are trying hard, allow any offset
3081 * to be used (distance=0).
3082 */
3083 uint64_t distance = 0;
3084 if (!try_hard) {
3085 distance = vd->vdev_asize >>
3086 ditto_same_vdev_distance_shift;
3087 if (distance <= (1ULL << vd->vdev_ms_shift))
3088 distance = 0;
3089 }
3090
3091 uint64_t asize = vdev_psize_to_asize(vd, psize);
3092 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3093
3094 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3095 distance, dva, d);
3096
3097 if (offset != -1ULL) {
3098 /*
3099 * If we've just selected this metaslab group,
3100 * figure out whether the corresponding vdev is
3101 * over- or under-used relative to the pool,
3102 * and set an allocation bias to even it out.
3103 *
3104 * Bias is also used to compensate for unequally
3105 * sized vdevs so that space is allocated fairly.
3106 */
3107 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3108 vdev_stat_t *vs = &vd->vdev_stat;
3109 int64_t vs_free = vs->vs_space - vs->vs_alloc;
3110 int64_t mc_free = mc->mc_space - mc->mc_alloc;
3111 int64_t ratio;
3112
3113 /*
3114 * Calculate how much more or less we should
3115 * try to allocate from this device during
3116 * this iteration around the rotor.
3117 *
3118 * This basically introduces a zero-centered
3119 * bias towards the devices with the most
3120 * free space, while compensating for vdev
3121 * size differences.
3122 *
3123 * Examples:
3124 * vdev V1 = 16M/128M
3125 * vdev V2 = 16M/128M
3126 * ratio(V1) = 100% ratio(V2) = 100%
3127 *
3128 * vdev V1 = 16M/128M
3129 * vdev V2 = 64M/128M
3130 * ratio(V1) = 127% ratio(V2) = 72%
3131 *
3132 * vdev V1 = 16M/128M
3133 * vdev V2 = 64M/512M
3134 * ratio(V1) = 40% ratio(V2) = 160%
3135 */
3136 ratio = (vs_free * mc->mc_alloc_groups * 100) /
3137 (mc_free + 1);
3138 mg->mg_bias = ((ratio - 100) *
3139 (int64_t)mg->mg_aliquot) / 100;
3140 } else if (!metaslab_bias_enabled) {
3141 mg->mg_bias = 0;
3142 }
3143
3144 if ((flags & METASLAB_FASTWRITE) ||
3145 atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3146 mg->mg_aliquot + mg->mg_bias) {
3147 mc->mc_rotor = mg->mg_next;
3148 mc->mc_aliquot = 0;
3149 }
3150
3151 DVA_SET_VDEV(&dva[d], vd->vdev_id);
3152 DVA_SET_OFFSET(&dva[d], offset);
3153 DVA_SET_GANG(&dva[d],
3154 ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
3155 DVA_SET_ASIZE(&dva[d], asize);
3156
3157 if (flags & METASLAB_FASTWRITE) {
3158 atomic_add_64(&vd->vdev_pending_fastwrite,
3159 psize);
3160 }
3161
3162 return (0);
3163 }
3164 next:
3165 mc->mc_rotor = mg->mg_next;
3166 mc->mc_aliquot = 0;
3167 } while ((mg = mg->mg_next) != rotor);
3168
3169 /*
3170 * If we haven't tried hard, do so now.
3171 */
3172 if (!try_hard) {
3173 try_hard = B_TRUE;
3174 goto top;
3175 }
3176
3177 bzero(&dva[d], sizeof (dva_t));
3178
3179 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3180 return (SET_ERROR(ENOSPC));
3181 }
3182
3183 void
3184 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3185 uint64_t txg)
3186 {
3187 metaslab_t *msp;
3188 ASSERTV(spa_t *spa = vd->vdev_spa);
3189
3190 ASSERT3U(txg, ==, spa->spa_syncing_txg);
3191 ASSERT(vdev_is_concrete(vd));
3192 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3193 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3194
3195 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3196
3197 VERIFY(!msp->ms_condensing);
3198 VERIFY3U(offset, >=, msp->ms_start);
3199 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
3200 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3201 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
3202
3203 metaslab_check_free_impl(vd, offset, asize);
3204 mutex_enter(&msp->ms_lock);
3205 if (range_tree_space(msp->ms_freeingtree) == 0) {
3206 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3207 }
3208 range_tree_add(msp->ms_freeingtree, offset, asize);
3209 mutex_exit(&msp->ms_lock);
3210 }
3211
3212 /* ARGSUSED */
3213 void
3214 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3215 uint64_t size, void *arg)
3216 {
3217 uint64_t *txgp = arg;
3218
3219 if (vd->vdev_ops->vdev_op_remap != NULL)
3220 vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
3221 else
3222 metaslab_free_impl(vd, offset, size, *txgp);
3223 }
3224
3225 static void
3226 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
3227 uint64_t txg)
3228 {
3229 spa_t *spa = vd->vdev_spa;
3230
3231 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3232
3233 if (txg > spa_freeze_txg(spa))
3234 return;
3235
3236 if (spa->spa_vdev_removal != NULL &&
3237 spa->spa_vdev_removal->svr_vdev == vd &&
3238 vdev_is_concrete(vd)) {
3239 /*
3240 * Note: we check if the vdev is concrete because when
3241 * we complete the removal, we first change the vdev to be
3242 * an indirect vdev (in open context), and then (in syncing
3243 * context) clear spa_vdev_removal.
3244 */
3245 free_from_removing_vdev(vd, offset, size, txg);
3246 } else if (vd->vdev_ops->vdev_op_remap != NULL) {
3247 vdev_indirect_mark_obsolete(vd, offset, size, txg);
3248 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3249 metaslab_free_impl_cb, &txg);
3250 } else {
3251 metaslab_free_concrete(vd, offset, size, txg);
3252 }
3253 }
3254
3255 typedef struct remap_blkptr_cb_arg {
3256 blkptr_t *rbca_bp;
3257 spa_remap_cb_t rbca_cb;
3258 vdev_t *rbca_remap_vd;
3259 uint64_t rbca_remap_offset;
3260 void *rbca_cb_arg;
3261 } remap_blkptr_cb_arg_t;
3262
3263 void
3264 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3265 uint64_t size, void *arg)
3266 {
3267 remap_blkptr_cb_arg_t *rbca = arg;
3268 blkptr_t *bp = rbca->rbca_bp;
3269
3270 /* We can not remap split blocks. */
3271 if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
3272 return;
3273 ASSERT0(inner_offset);
3274
3275 if (rbca->rbca_cb != NULL) {
3276 /*
3277 * At this point we know that we are not handling split
3278 * blocks and we invoke the callback on the previous
3279 * vdev which must be indirect.
3280 */
3281 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
3282
3283 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
3284 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
3285
3286 /* set up remap_blkptr_cb_arg for the next call */
3287 rbca->rbca_remap_vd = vd;
3288 rbca->rbca_remap_offset = offset;
3289 }
3290
3291 /*
3292 * The phys birth time is that of dva[0]. This ensures that we know
3293 * when each dva was written, so that resilver can determine which
3294 * blocks need to be scrubbed (i.e. those written during the time
3295 * the vdev was offline). It also ensures that the key used in
3296 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
3297 * we didn't change the phys_birth, a lookup in the ARC for a
3298 * remapped BP could find the data that was previously stored at
3299 * this vdev + offset.
3300 */
3301 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
3302 DVA_GET_VDEV(&bp->blk_dva[0]));
3303 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
3304 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
3305 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
3306
3307 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
3308 DVA_SET_OFFSET(&bp->blk_dva[0], offset);
3309 }
3310
3311 /*
3312 * If the block pointer contains any indirect DVAs, modify them to refer to
3313 * concrete DVAs. Note that this will sometimes not be possible, leaving
3314 * the indirect DVA in place. This happens if the indirect DVA spans multiple
3315 * segments in the mapping (i.e. it is a "split block").
3316 *
3317 * If the BP was remapped, calls the callback on the original dva (note the
3318 * callback can be called multiple times if the original indirect DVA refers
3319 * to another indirect DVA, etc).
3320 *
3321 * Returns TRUE if the BP was remapped.
3322 */
3323 boolean_t
3324 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
3325 {
3326 remap_blkptr_cb_arg_t rbca;
3327
3328 if (!zfs_remap_blkptr_enable)
3329 return (B_FALSE);
3330
3331 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
3332 return (B_FALSE);
3333
3334 /*
3335 * Dedup BP's can not be remapped, because ddt_phys_select() depends
3336 * on DVA[0] being the same in the BP as in the DDT (dedup table).
3337 */
3338 if (BP_GET_DEDUP(bp))
3339 return (B_FALSE);
3340
3341 /*
3342 * Gang blocks can not be remapped, because
3343 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
3344 * the BP used to read the gang block header (GBH) being the same
3345 * as the DVA[0] that we allocated for the GBH.
3346 */
3347 if (BP_IS_GANG(bp))
3348 return (B_FALSE);
3349
3350 /*
3351 * Embedded BP's have no DVA to remap.
3352 */
3353 if (BP_GET_NDVAS(bp) < 1)
3354 return (B_FALSE);
3355
3356 /*
3357 * Note: we only remap dva[0]. If we remapped other dvas, we
3358 * would no longer know what their phys birth txg is.
3359 */
3360 dva_t *dva = &bp->blk_dva[0];
3361
3362 uint64_t offset = DVA_GET_OFFSET(dva);
3363 uint64_t size = DVA_GET_ASIZE(dva);
3364 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
3365
3366 if (vd->vdev_ops->vdev_op_remap == NULL)
3367 return (B_FALSE);
3368
3369 rbca.rbca_bp = bp;
3370 rbca.rbca_cb = callback;
3371 rbca.rbca_remap_vd = vd;
3372 rbca.rbca_remap_offset = offset;
3373 rbca.rbca_cb_arg = arg;
3374
3375 /*
3376 * remap_blkptr_cb() will be called in order for each level of
3377 * indirection, until a concrete vdev is reached or a split block is
3378 * encountered. old_vd and old_offset are updated within the callback
3379 * as we go from the one indirect vdev to the next one (either concrete
3380 * or indirect again) in that order.
3381 */
3382 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
3383
3384 /* Check if the DVA wasn't remapped because it is a split block */
3385 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
3386 return (B_FALSE);
3387
3388 return (B_TRUE);
3389 }
3390
3391 /*
3392 * Undo the allocation of a DVA which happened in the given transaction group.
3393 */
3394 void
3395 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3396 {
3397 metaslab_t *msp;
3398 vdev_t *vd;
3399 uint64_t vdev = DVA_GET_VDEV(dva);
3400 uint64_t offset = DVA_GET_OFFSET(dva);
3401 uint64_t size = DVA_GET_ASIZE(dva);
3402
3403 ASSERT(DVA_IS_VALID(dva));
3404 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3405
3406 if (txg > spa_freeze_txg(spa))
3407 return;
3408
3409 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
3410 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3411 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
3412 (u_longlong_t)vdev, (u_longlong_t)offset,
3413 (u_longlong_t)size);
3414 return;
3415 }
3416
3417 ASSERT(!vd->vdev_removing);
3418 ASSERT(vdev_is_concrete(vd));
3419 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3420 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
3421
3422 if (DVA_GET_GANG(dva))
3423 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3424
3425 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3426
3427 mutex_enter(&msp->ms_lock);
3428 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3429 offset, size);
3430
3431 VERIFY(!msp->ms_condensing);
3432 VERIFY3U(offset, >=, msp->ms_start);
3433 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3434 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3435 msp->ms_size);
3436 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3437 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3438 range_tree_add(msp->ms_tree, offset, size);
3439 mutex_exit(&msp->ms_lock);
3440 }
3441
3442 /*
3443 * Free the block represented by DVA in the context of the specified
3444 * transaction group.
3445 */
3446 void
3447 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3448 {
3449 uint64_t vdev = DVA_GET_VDEV(dva);
3450 uint64_t offset = DVA_GET_OFFSET(dva);
3451 uint64_t size = DVA_GET_ASIZE(dva);
3452 vdev_t *vd = vdev_lookup_top(spa, vdev);
3453
3454 ASSERT(DVA_IS_VALID(dva));
3455 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3456
3457 if (DVA_GET_GANG(dva)) {
3458 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3459 }
3460
3461 metaslab_free_impl(vd, offset, size, txg);
3462 }
3463
3464 /*
3465 * Reserve some allocation slots. The reservation system must be called
3466 * before we call into the allocator. If there aren't any available slots
3467 * then the I/O will be throttled until an I/O completes and its slots are
3468 * freed up. The function returns true if it was successful in placing
3469 * the reservation.
3470 */
3471 boolean_t
3472 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3473 int flags)
3474 {
3475 uint64_t available_slots = 0;
3476 boolean_t slot_reserved = B_FALSE;
3477
3478 ASSERT(mc->mc_alloc_throttle_enabled);
3479 mutex_enter(&mc->mc_lock);
3480
3481 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3482 if (reserved_slots < mc->mc_alloc_max_slots)
3483 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3484
3485 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3486 /*
3487 * We reserve the slots individually so that we can unreserve
3488 * them individually when an I/O completes.
3489 */
3490 for (int d = 0; d < slots; d++) {
3491 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3492 }
3493 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3494 slot_reserved = B_TRUE;
3495 }
3496
3497 mutex_exit(&mc->mc_lock);
3498 return (slot_reserved);
3499 }
3500
3501 void
3502 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3503 {
3504 ASSERT(mc->mc_alloc_throttle_enabled);
3505 mutex_enter(&mc->mc_lock);
3506 for (int d = 0; d < slots; d++) {
3507 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3508 }
3509 mutex_exit(&mc->mc_lock);
3510 }
3511
3512 static int
3513 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3514 uint64_t txg)
3515 {
3516 metaslab_t *msp;
3517 spa_t *spa = vd->vdev_spa;
3518 int error = 0;
3519
3520 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
3521 return (ENXIO);
3522
3523 ASSERT3P(vd->vdev_ms, !=, NULL);
3524 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3525
3526 mutex_enter(&msp->ms_lock);
3527
3528 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3529 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3530
3531 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3532 error = SET_ERROR(ENOENT);
3533
3534 if (error || txg == 0) { /* txg == 0 indicates dry run */
3535 mutex_exit(&msp->ms_lock);
3536 return (error);
3537 }
3538
3539 VERIFY(!msp->ms_condensing);
3540 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3541 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3542 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3543 range_tree_remove(msp->ms_tree, offset, size);
3544
3545 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
3546 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3547 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3548 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3549 }
3550
3551 mutex_exit(&msp->ms_lock);
3552
3553 return (0);
3554 }
3555
3556 typedef struct metaslab_claim_cb_arg_t {
3557 uint64_t mcca_txg;
3558 int mcca_error;
3559 } metaslab_claim_cb_arg_t;
3560
3561 /* ARGSUSED */
3562 static void
3563 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3564 uint64_t size, void *arg)
3565 {
3566 metaslab_claim_cb_arg_t *mcca_arg = arg;
3567
3568 if (mcca_arg->mcca_error == 0) {
3569 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
3570 size, mcca_arg->mcca_txg);
3571 }
3572 }
3573
3574 int
3575 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
3576 {
3577 if (vd->vdev_ops->vdev_op_remap != NULL) {
3578 metaslab_claim_cb_arg_t arg;
3579
3580 /*
3581 * Only zdb(1M) can claim on indirect vdevs. This is used
3582 * to detect leaks of mapped space (that are not accounted
3583 * for in the obsolete counts, spacemap, or bpobj).
3584 */
3585 ASSERT(!spa_writeable(vd->vdev_spa));
3586 arg.mcca_error = 0;
3587 arg.mcca_txg = txg;
3588
3589 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3590 metaslab_claim_impl_cb, &arg);
3591
3592 if (arg.mcca_error == 0) {
3593 arg.mcca_error = metaslab_claim_concrete(vd,
3594 offset, size, txg);
3595 }
3596 return (arg.mcca_error);
3597 } else {
3598 return (metaslab_claim_concrete(vd, offset, size, txg));
3599 }
3600 }
3601
3602 /*
3603 * Intent log support: upon opening the pool after a crash, notify the SPA
3604 * of blocks that the intent log has allocated for immediate write, but
3605 * which are still considered free by the SPA because the last transaction
3606 * group didn't commit yet.
3607 */
3608 static int
3609 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3610 {
3611 uint64_t vdev = DVA_GET_VDEV(dva);
3612 uint64_t offset = DVA_GET_OFFSET(dva);
3613 uint64_t size = DVA_GET_ASIZE(dva);
3614 vdev_t *vd;
3615
3616 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
3617 return (SET_ERROR(ENXIO));
3618 }
3619
3620 ASSERT(DVA_IS_VALID(dva));
3621
3622 if (DVA_GET_GANG(dva))
3623 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3624
3625 return (metaslab_claim_impl(vd, offset, size, txg));
3626 }
3627
3628 int
3629 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3630 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3631 zio_alloc_list_t *zal, zio_t *zio)
3632 {
3633 dva_t *dva = bp->blk_dva;
3634 dva_t *hintdva = hintbp->blk_dva;
3635 int error = 0;
3636
3637 ASSERT(bp->blk_birth == 0);
3638 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3639
3640 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3641
3642 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
3643 spa_config_exit(spa, SCL_ALLOC, FTAG);
3644 return (SET_ERROR(ENOSPC));
3645 }
3646
3647 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3648 ASSERT(BP_GET_NDVAS(bp) == 0);
3649 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3650 ASSERT3P(zal, !=, NULL);
3651
3652 for (int d = 0; d < ndvas; d++) {
3653 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3654 txg, flags, zal);
3655 if (error != 0) {
3656 for (d--; d >= 0; d--) {
3657 metaslab_unalloc_dva(spa, &dva[d], txg);
3658 metaslab_group_alloc_decrement(spa,
3659 DVA_GET_VDEV(&dva[d]), zio, flags);
3660 bzero(&dva[d], sizeof (dva_t));
3661 }
3662 spa_config_exit(spa, SCL_ALLOC, FTAG);
3663 return (error);
3664 } else {
3665 /*
3666 * Update the metaslab group's queue depth
3667 * based on the newly allocated dva.
3668 */
3669 metaslab_group_alloc_increment(spa,
3670 DVA_GET_VDEV(&dva[d]), zio, flags);
3671 }
3672
3673 }
3674 ASSERT(error == 0);
3675 ASSERT(BP_GET_NDVAS(bp) == ndvas);
3676
3677 spa_config_exit(spa, SCL_ALLOC, FTAG);
3678
3679 BP_SET_BIRTH(bp, txg, 0);
3680
3681 return (0);
3682 }
3683
3684 void
3685 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3686 {
3687 const dva_t *dva = bp->blk_dva;
3688 int ndvas = BP_GET_NDVAS(bp);
3689
3690 ASSERT(!BP_IS_HOLE(bp));
3691 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3692
3693 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3694
3695 for (int d = 0; d < ndvas; d++) {
3696 if (now) {
3697 metaslab_unalloc_dva(spa, &dva[d], txg);
3698 } else {
3699 metaslab_free_dva(spa, &dva[d], txg);
3700 }
3701 }
3702
3703 spa_config_exit(spa, SCL_FREE, FTAG);
3704 }
3705
3706 int
3707 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3708 {
3709 const dva_t *dva = bp->blk_dva;
3710 int ndvas = BP_GET_NDVAS(bp);
3711 int error = 0;
3712
3713 ASSERT(!BP_IS_HOLE(bp));
3714
3715 if (txg != 0) {
3716 /*
3717 * First do a dry run to make sure all DVAs are claimable,
3718 * so we don't have to unwind from partial failures below.
3719 */
3720 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3721 return (error);
3722 }
3723
3724 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3725
3726 for (int d = 0; d < ndvas; d++)
3727 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3728 break;
3729
3730 spa_config_exit(spa, SCL_ALLOC, FTAG);
3731
3732 ASSERT(error == 0 || txg == 0);
3733
3734 return (error);
3735 }
3736
3737 void
3738 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
3739 {
3740 const dva_t *dva = bp->blk_dva;
3741 int ndvas = BP_GET_NDVAS(bp);
3742 uint64_t psize = BP_GET_PSIZE(bp);
3743 int d;
3744 vdev_t *vd;
3745
3746 ASSERT(!BP_IS_HOLE(bp));
3747 ASSERT(!BP_IS_EMBEDDED(bp));
3748 ASSERT(psize > 0);
3749
3750 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3751
3752 for (d = 0; d < ndvas; d++) {
3753 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3754 continue;
3755 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
3756 }
3757
3758 spa_config_exit(spa, SCL_VDEV, FTAG);
3759 }
3760
3761 void
3762 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
3763 {
3764 const dva_t *dva = bp->blk_dva;
3765 int ndvas = BP_GET_NDVAS(bp);
3766 uint64_t psize = BP_GET_PSIZE(bp);
3767 int d;
3768 vdev_t *vd;
3769
3770 ASSERT(!BP_IS_HOLE(bp));
3771 ASSERT(!BP_IS_EMBEDDED(bp));
3772 ASSERT(psize > 0);
3773
3774 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3775
3776 for (d = 0; d < ndvas; d++) {
3777 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3778 continue;
3779 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
3780 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
3781 }
3782
3783 spa_config_exit(spa, SCL_VDEV, FTAG);
3784 }
3785
3786 /* ARGSUSED */
3787 static void
3788 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
3789 uint64_t size, void *arg)
3790 {
3791 if (vd->vdev_ops == &vdev_indirect_ops)
3792 return;
3793
3794 metaslab_check_free_impl(vd, offset, size);
3795 }
3796
3797 static void
3798 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
3799 {
3800 metaslab_t *msp;
3801 ASSERTV(spa_t *spa = vd->vdev_spa);
3802
3803 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3804 return;
3805
3806 if (vd->vdev_ops->vdev_op_remap != NULL) {
3807 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3808 metaslab_check_free_impl_cb, NULL);
3809 return;
3810 }
3811
3812 ASSERT(vdev_is_concrete(vd));
3813 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3814 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3815
3816 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3817
3818 mutex_enter(&msp->ms_lock);
3819 if (msp->ms_loaded)
3820 range_tree_verify(msp->ms_tree, offset, size);
3821
3822 range_tree_verify(msp->ms_freeingtree, offset, size);
3823 range_tree_verify(msp->ms_freedtree, offset, size);
3824 for (int j = 0; j < TXG_DEFER_SIZE; j++)
3825 range_tree_verify(msp->ms_defertree[j], offset, size);
3826 mutex_exit(&msp->ms_lock);
3827 }
3828
3829 void
3830 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3831 {
3832 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3833 return;
3834
3835 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3836 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3837 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3838 vdev_t *vd = vdev_lookup_top(spa, vdev);
3839 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3840 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3841
3842 if (DVA_GET_GANG(&bp->blk_dva[i]))
3843 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3844
3845 ASSERT3P(vd, !=, NULL);
3846
3847 metaslab_check_free_impl(vd, offset, size);
3848 }
3849 spa_config_exit(spa, SCL_VDEV, FTAG);
3850 }
3851
3852 #if defined(_KERNEL) && defined(HAVE_SPL)
3853 /* CSTYLED */
3854 module_param(metaslab_aliquot, ulong, 0644);
3855 MODULE_PARM_DESC(metaslab_aliquot,
3856 "allocation granularity (a.k.a. stripe size)");
3857
3858 module_param(metaslab_debug_load, int, 0644);
3859 MODULE_PARM_DESC(metaslab_debug_load,
3860 "load all metaslabs when pool is first opened");
3861
3862 module_param(metaslab_debug_unload, int, 0644);
3863 MODULE_PARM_DESC(metaslab_debug_unload,
3864 "prevent metaslabs from being unloaded");
3865
3866 module_param(metaslab_preload_enabled, int, 0644);
3867 MODULE_PARM_DESC(metaslab_preload_enabled,
3868 "preload potential metaslabs during reassessment");
3869
3870 module_param(zfs_mg_noalloc_threshold, int, 0644);
3871 MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
3872 "percentage of free space for metaslab group to allow allocation");
3873
3874 module_param(zfs_mg_fragmentation_threshold, int, 0644);
3875 MODULE_PARM_DESC(zfs_mg_fragmentation_threshold,
3876 "fragmentation for metaslab group to allow allocation");
3877
3878 module_param(zfs_metaslab_fragmentation_threshold, int, 0644);
3879 MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold,
3880 "fragmentation for metaslab to allow allocation");
3881
3882 module_param(metaslab_fragmentation_factor_enabled, int, 0644);
3883 MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled,
3884 "use the fragmentation metric to prefer less fragmented metaslabs");
3885
3886 module_param(metaslab_lba_weighting_enabled, int, 0644);
3887 MODULE_PARM_DESC(metaslab_lba_weighting_enabled,
3888 "prefer metaslabs with lower LBAs");
3889
3890 module_param(metaslab_bias_enabled, int, 0644);
3891 MODULE_PARM_DESC(metaslab_bias_enabled,
3892 "enable metaslab group biasing");
3893
3894 module_param(zfs_metaslab_segment_weight_enabled, int, 0644);
3895 MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled,
3896 "enable segment-based metaslab selection");
3897
3898 module_param(zfs_metaslab_switch_threshold, int, 0644);
3899 MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
3900 "segment-based metaslab selection maximum buckets before switching");
3901
3902 /* CSTYLED */
3903 module_param(metaslab_gang_bang, ulong, 0644);
3904 MODULE_PARM_DESC(metaslab_gang_bang,
3905 "blocks larger than this size are forced to be gang blocks");
3906 #endif /* _KERNEL && HAVE_SPL */