mirror_zfs.git: module/zfs/metaslab.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34
35 #define WITH_DF_BLOCK_ALLOCATOR
36
37 /*
38 * Allow allocations to switch to gang blocks quickly. We do this to
39 * avoid having to load lots of space_maps in a given txg. There are,
40 * however, some cases where we want to avoid "fast" ganging and instead
41 * we want to do an exhaustive search of all metaslabs on this device.
42 * Currently we don't allow any gang, zil, or dump device related allocations
43 * to "fast" gang.
44 */
45 #define CAN_FASTGANG(flags) \
46 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
47 METASLAB_GANG_AVOID)))
48
49 uint64_t metaslab_aliquot = 512ULL << 10;
50 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
51
52 /*
53 * The in-core space map representation is more compact than its on-disk form.
54 * The zfs_condense_pct determines how much more compact the in-core
55 * space_map representation must be before we compact it on-disk.
56 * Values should be greater than or equal to 100.
57 */
58 int zfs_condense_pct = 200;
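/*
 * For example (hypothetical numbers): with zfs_condense_pct = 200 and an
 * in-core free map holding 1,000 segments, the minimal on-disk form is
 * 1,000 * 8 = 8,000 bytes, so the existing space map object must have grown
 * to at least 16,000 bytes before metaslab_should_condense() agrees to
 * rewrite it.
 */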
59
60 /*
61 * This value defines the number of allowed allocation failures per vdev.
62 * If a device reaches this threshold in a given txg then we consider skipping
63 * allocations on that device. The value of zfs_mg_alloc_failures is computed
64 * in zio_init() unless it has been overridden in /etc/system.
65 */
66 int zfs_mg_alloc_failures = 0;
67
68 /*
69 * The zfs_mg_noalloc_threshold defines which metaslab groups should
70 * be eligible for allocation. The value is defined as a percentage of
71 * free space. Metaslab groups that have more free space than
72 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
73 * a metaslab group's free space is less than or equal to the
74 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
75 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
76 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
77 * groups are allowed to accept allocations. Gang blocks are always
78 * eligible to allocate on any metaslab group. The default value of 0 means
79 * no metaslab group will be excluded based on this criterion.
80 */
81 int zfs_mg_noalloc_threshold = 0;
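/*
 * For example (hypothetical tuning): with zfs_mg_noalloc_threshold = 5, a
 * top-level vdev that drops to 5% free or less stops receiving new
 * (non-gang) allocations while any other vdev in the pool is still above 5%
 * free; once every vdev is at or below 5%, all of them accept allocations
 * again.
 */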
82
83 /*
84 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
85 */
86 int metaslab_debug = 0;
87
88 /*
89 * Minimum size which forces the dynamic allocator to change
90 * its allocation strategy. Once the space map cannot satisfy
91 * an allocation of this size then it switches to using a more
92 * aggressive strategy (i.e. search by size rather than offset).
93 */
94 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
95
96 /*
97 * The minimum free space, in percent, which must be available
98 * in a space map to continue allocations in a first-fit fashion.
99 * Once the space_map's free space drops below this level we dynamically
100 * switch to using best-fit allocations.
101 */
102 int metaslab_df_free_pct = 4;
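/*
 * For example (hypothetical metaslab): with metaslab_df_free_pct = 4, a 16GB
 * metaslab keeps allocating first-fit until less than roughly 655MB (4%)
 * remains free, or until no free segment is at least
 * metaslab_df_alloc_threshold bytes, at which point it switches to best-fit.
 */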
103
104 /*
105 * A metaslab is considered "free" if it contains a contiguous
106 * segment which is greater than metaslab_min_alloc_size.
107 */
108 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
109
110 /*
111 * Max number of space_maps to prefetch.
112 */
113 int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
114
115 /*
116 * Percentage bonus multiplier for metaslabs that are in the bonus area.
117 */
118 int metaslab_smo_bonus_pct = 150;
119
120 /*
121 * Should we be willing to write data to degraded vdevs?
122 */
123 boolean_t zfs_write_to_degraded = B_FALSE;
124
125 /*
126 * ==========================================================================
127 * Metaslab classes
128 * ==========================================================================
129 */
130 metaslab_class_t *
131 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
132 {
133 metaslab_class_t *mc;
134
135 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE);
136
137 mc->mc_spa = spa;
138 mc->mc_rotor = NULL;
139 mc->mc_ops = ops;
140 mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL);
141
142 return (mc);
143 }
144
145 void
146 metaslab_class_destroy(metaslab_class_t *mc)
147 {
148 ASSERT(mc->mc_rotor == NULL);
149 ASSERT(mc->mc_alloc == 0);
150 ASSERT(mc->mc_deferred == 0);
151 ASSERT(mc->mc_space == 0);
152 ASSERT(mc->mc_dspace == 0);
153
154 mutex_destroy(&mc->mc_fastwrite_lock);
155 kmem_free(mc, sizeof (metaslab_class_t));
156 }
157
158 int
159 metaslab_class_validate(metaslab_class_t *mc)
160 {
161 metaslab_group_t *mg;
162 vdev_t *vd;
163
164 /*
165 * Must hold one of the spa_config locks.
166 */
167 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
168 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
169
170 if ((mg = mc->mc_rotor) == NULL)
171 return (0);
172
173 do {
174 vd = mg->mg_vd;
175 ASSERT(vd->vdev_mg != NULL);
176 ASSERT3P(vd->vdev_top, ==, vd);
177 ASSERT3P(mg->mg_class, ==, mc);
178 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
179 } while ((mg = mg->mg_next) != mc->mc_rotor);
180
181 return (0);
182 }
183
184 void
185 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
186 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
187 {
188 atomic_add_64(&mc->mc_alloc, alloc_delta);
189 atomic_add_64(&mc->mc_deferred, defer_delta);
190 atomic_add_64(&mc->mc_space, space_delta);
191 atomic_add_64(&mc->mc_dspace, dspace_delta);
192 }
193
194 uint64_t
195 metaslab_class_get_alloc(metaslab_class_t *mc)
196 {
197 return (mc->mc_alloc);
198 }
199
200 uint64_t
201 metaslab_class_get_deferred(metaslab_class_t *mc)
202 {
203 return (mc->mc_deferred);
204 }
205
206 uint64_t
207 metaslab_class_get_space(metaslab_class_t *mc)
208 {
209 return (mc->mc_space);
210 }
211
212 uint64_t
213 metaslab_class_get_dspace(metaslab_class_t *mc)
214 {
215 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
216 }
217
218 /*
219 * ==========================================================================
220 * Metaslab groups
221 * ==========================================================================
222 */
223 static int
224 metaslab_compare(const void *x1, const void *x2)
225 {
226 const metaslab_t *m1 = x1;
227 const metaslab_t *m2 = x2;
228
229 if (m1->ms_weight < m2->ms_weight)
230 return (1);
231 if (m1->ms_weight > m2->ms_weight)
232 return (-1);
233
234 /*
235 * If the weights are identical, use the offset to force uniqueness.
236 */
237 if (m1->ms_map->sm_start < m2->ms_map->sm_start)
238 return (-1);
239 if (m1->ms_map->sm_start > m2->ms_map->sm_start)
240 return (1);
241
242 ASSERT3P(m1, ==, m2);
243
244 return (0);
245 }
246
247 /*
248 * Update the allocatable flag and the metaslab group's capacity.
249 * The allocatable flag is set to true if the group's free capacity is
250 * above the zfs_mg_noalloc_threshold. If a metaslab group transitions
251 * from allocatable to non-allocatable or vice versa then the metaslab
252 * group's class is updated to reflect the transition.
253 */
254 static void
255 metaslab_group_alloc_update(metaslab_group_t *mg)
256 {
257 vdev_t *vd = mg->mg_vd;
258 metaslab_class_t *mc = mg->mg_class;
259 vdev_stat_t *vs = &vd->vdev_stat;
260 boolean_t was_allocatable;
261
262 ASSERT(vd == vd->vdev_top);
263
264 mutex_enter(&mg->mg_lock);
265 was_allocatable = mg->mg_allocatable;
266
267 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
268 (vs->vs_space + 1);
269
270 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
271
272 /*
273 * The mc_alloc_groups maintains a count of the number of
274 * groups in this metaslab class that are still above the
275 * zfs_mg_noalloc_threshold. This is used by the allocating
276 * threads to determine if they should avoid allocations to
277 * a given group. The allocator will avoid allocations to a group
278 * if that group has reached or is below the zfs_mg_noalloc_threshold
279 * and there are still other groups that are above the threshold.
280 * When a group transitions from allocatable to non-allocatable or
281 * vice versa we update the metaslab class to reflect that change.
282 * When the mc_alloc_groups value drops to 0 that means that all
283 * groups have reached the zfs_mg_noalloc_threshold making all groups
284 * eligible for allocations. This effectively means that all devices
285 * are balanced again.
286 */
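/*
 * Worked example (hypothetical stats): a 10TB top-level vdev with 4TB
 * allocated yields a mg_free_capacity of roughly 60. With
 * zfs_mg_noalloc_threshold = 30 the group stays allocatable; if allocations
 * later push free capacity to 30 or below, mg_allocatable flips to false
 * and mc_alloc_groups is decremented below.
 */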
287 if (was_allocatable && !mg->mg_allocatable)
288 mc->mc_alloc_groups--;
289 else if (!was_allocatable && mg->mg_allocatable)
290 mc->mc_alloc_groups++;
291 mutex_exit(&mg->mg_lock);
292 }
293
294 metaslab_group_t *
295 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
296 {
297 metaslab_group_t *mg;
298
299 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE);
300 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
301 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
302 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
303 mg->mg_vd = vd;
304 mg->mg_class = mc;
305 mg->mg_activation_count = 0;
306
307 return (mg);
308 }
309
310 void
311 metaslab_group_destroy(metaslab_group_t *mg)
312 {
313 ASSERT(mg->mg_prev == NULL);
314 ASSERT(mg->mg_next == NULL);
315 /*
316 * We may have gone below zero with the activation count
317 * either because we never activated in the first place or
318 * because we're done, and possibly removing the vdev.
319 */
320 ASSERT(mg->mg_activation_count <= 0);
321
322 avl_destroy(&mg->mg_metaslab_tree);
323 mutex_destroy(&mg->mg_lock);
324 kmem_free(mg, sizeof (metaslab_group_t));
325 }
326
327 void
328 metaslab_group_activate(metaslab_group_t *mg)
329 {
330 metaslab_class_t *mc = mg->mg_class;
331 metaslab_group_t *mgprev, *mgnext;
332
333 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
334
335 ASSERT(mc->mc_rotor != mg);
336 ASSERT(mg->mg_prev == NULL);
337 ASSERT(mg->mg_next == NULL);
338 ASSERT(mg->mg_activation_count <= 0);
339
340 if (++mg->mg_activation_count <= 0)
341 return;
342
343 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
344 metaslab_group_alloc_update(mg);
345
346 if ((mgprev = mc->mc_rotor) == NULL) {
347 mg->mg_prev = mg;
348 mg->mg_next = mg;
349 } else {
350 mgnext = mgprev->mg_next;
351 mg->mg_prev = mgprev;
352 mg->mg_next = mgnext;
353 mgprev->mg_next = mg;
354 mgnext->mg_prev = mg;
355 }
356 mc->mc_rotor = mg;
357 }
358
359 void
360 metaslab_group_passivate(metaslab_group_t *mg)
361 {
362 metaslab_class_t *mc = mg->mg_class;
363 metaslab_group_t *mgprev, *mgnext;
364
365 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
366
367 if (--mg->mg_activation_count != 0) {
368 ASSERT(mc->mc_rotor != mg);
369 ASSERT(mg->mg_prev == NULL);
370 ASSERT(mg->mg_next == NULL);
371 ASSERT(mg->mg_activation_count < 0);
372 return;
373 }
374
375 mgprev = mg->mg_prev;
376 mgnext = mg->mg_next;
377
378 if (mg == mgnext) {
379 mc->mc_rotor = NULL;
380 } else {
381 mc->mc_rotor = mgnext;
382 mgprev->mg_next = mgnext;
383 mgnext->mg_prev = mgprev;
384 }
385
386 mg->mg_prev = NULL;
387 mg->mg_next = NULL;
388 }
389
390 static void
391 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
392 {
393 mutex_enter(&mg->mg_lock);
394 ASSERT(msp->ms_group == NULL);
395 msp->ms_group = mg;
396 msp->ms_weight = 0;
397 avl_add(&mg->mg_metaslab_tree, msp);
398 mutex_exit(&mg->mg_lock);
399 }
400
401 static void
402 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
403 {
404 mutex_enter(&mg->mg_lock);
405 ASSERT(msp->ms_group == mg);
406 avl_remove(&mg->mg_metaslab_tree, msp);
407 msp->ms_group = NULL;
408 mutex_exit(&mg->mg_lock);
409 }
410
411 static void
412 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
413 {
414 /*
415 * Although in principle the weight can be any value, in
416 * practice we do not use values in the range [1, 510].
417 */
418 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
419 ASSERT(MUTEX_HELD(&msp->ms_lock));
420
421 mutex_enter(&mg->mg_lock);
422 ASSERT(msp->ms_group == mg);
423 avl_remove(&mg->mg_metaslab_tree, msp);
424 msp->ms_weight = weight;
425 avl_add(&mg->mg_metaslab_tree, msp);
426 mutex_exit(&mg->mg_lock);
427 }
428
429 /*
430 * Determine if a given metaslab group should skip allocations. A metaslab
431 * group should avoid allocations if its free capacity has dropped to or below the
432 * zfs_mg_noalloc_threshold and there is at least one metaslab group
433 * that can still handle allocations.
434 */
435 static boolean_t
436 metaslab_group_allocatable(metaslab_group_t *mg)
437 {
438 vdev_t *vd = mg->mg_vd;
439 spa_t *spa = vd->vdev_spa;
440 metaslab_class_t *mc = mg->mg_class;
441
442 /*
443 * A metaslab group is considered allocatable if its free capacity
444 * is greater than the set value of zfs_mg_noalloc_threshold, it's
445 * associated with a slog, or there are no other metaslab groups
446 * with free capacity greater than zfs_mg_noalloc_threshold.
447 */
448 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
449 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
450 }
451
452 /*
453 * ==========================================================================
454 * Common allocator routines
455 * ==========================================================================
456 */
457 static int
458 metaslab_segsize_compare(const void *x1, const void *x2)
459 {
460 const space_seg_t *s1 = x1;
461 const space_seg_t *s2 = x2;
462 uint64_t ss_size1 = s1->ss_end - s1->ss_start;
463 uint64_t ss_size2 = s2->ss_end - s2->ss_start;
464
465 if (ss_size1 < ss_size2)
466 return (-1);
467 if (ss_size1 > ss_size2)
468 return (1);
469
470 if (s1->ss_start < s2->ss_start)
471 return (-1);
472 if (s1->ss_start > s2->ss_start)
473 return (1);
474
475 return (0);
476 }
477
478 #if defined(WITH_FF_BLOCK_ALLOCATOR) || \
479 defined(WITH_DF_BLOCK_ALLOCATOR) || \
480 defined(WITH_CDF_BLOCK_ALLOCATOR)
481 /*
482 * This is a helper function that can be used by the allocator to find
483 * a suitable block to allocate. This will search the specified AVL
484 * tree looking for a block that matches the specified criteria.
485 */
486 static uint64_t
487 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
488 uint64_t align)
489 {
490 space_seg_t *ss, ssearch;
491 avl_index_t where;
492
493 ssearch.ss_start = *cursor;
494 ssearch.ss_end = *cursor + size;
495
496 ss = avl_find(t, &ssearch, &where);
497 if (ss == NULL)
498 ss = avl_nearest(t, where, AVL_AFTER);
499
500 while (ss != NULL) {
501 uint64_t offset = P2ROUNDUP(ss->ss_start, align);
502
503 if (offset + size <= ss->ss_end) {
504 *cursor = offset + size;
505 return (offset);
506 }
507 ss = AVL_NEXT(t, ss);
508 }
509
510 /*
511 * If we know we've searched the whole map (*cursor == 0), give up.
512 * Otherwise, reset the cursor to the beginning and try again.
513 */
514 if (*cursor == 0)
515 return (-1ULL);
516
517 *cursor = 0;
518 return (metaslab_block_picker(t, cursor, size, align));
519 }
520 #endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */
521
522 static void
523 metaslab_pp_load(space_map_t *sm)
524 {
525 space_seg_t *ss;
526
527 ASSERT(sm->sm_ppd == NULL);
528 sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE);
529
530 sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE);
531 avl_create(sm->sm_pp_root, metaslab_segsize_compare,
532 sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
533
534 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
535 avl_add(sm->sm_pp_root, ss);
536 }
537
538 static void
539 metaslab_pp_unload(space_map_t *sm)
540 {
541 void *cookie = NULL;
542
543 kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
544 sm->sm_ppd = NULL;
545
546 while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
547 /* tear down the tree */
548 }
549
550 avl_destroy(sm->sm_pp_root);
551 kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
552 sm->sm_pp_root = NULL;
553 }
554
555 /* ARGSUSED */
556 static void
557 metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
558 {
559 /* No need to update cursor */
560 }
561
562 /* ARGSUSED */
563 static void
564 metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
565 {
566 /* No need to update cursor */
567 }
568
569 /*
570 * Return the maximum contiguous segment within the metaslab.
571 */
572 uint64_t
573 metaslab_pp_maxsize(space_map_t *sm)
574 {
575 avl_tree_t *t = sm->sm_pp_root;
576 space_seg_t *ss;
577
578 if (t == NULL || (ss = avl_last(t)) == NULL)
579 return (0ULL);
580
581 return (ss->ss_end - ss->ss_start);
582 }
583
584 #if defined(WITH_FF_BLOCK_ALLOCATOR)
585 /*
586 * ==========================================================================
587 * The first-fit block allocator
588 * ==========================================================================
589 */
590 static uint64_t
591 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
592 {
593 avl_tree_t *t = &sm->sm_root;
594 uint64_t align = size & -size;
595 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
596
597 return (metaslab_block_picker(t, cursor, size, align));
598 }
599
600 /* ARGSUSED */
601 boolean_t
602 metaslab_ff_fragmented(space_map_t *sm)
603 {
604 return (B_TRUE);
605 }
606
607 static space_map_ops_t metaslab_ff_ops = {
608 metaslab_pp_load,
609 metaslab_pp_unload,
610 metaslab_ff_alloc,
611 metaslab_pp_claim,
612 metaslab_pp_free,
613 metaslab_pp_maxsize,
614 metaslab_ff_fragmented
615 };
616
617 space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
618 #endif /* WITH_FF_BLOCK_ALLOCATOR */
619
620 #if defined(WITH_DF_BLOCK_ALLOCATOR)
621 /*
622 * ==========================================================================
623 * Dynamic block allocator -
624 * Uses the first fit allocation scheme until space gets low and then
625 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
626 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
627 * ==========================================================================
628 */
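/*
 * Illustration (hypothetical request): a 12K (0x3000) allocation computes
 * align = 0x1000, the lowest set bit of the size, so the first-fit search
 * resumes from the 4K-alignment cursor stored at sm_ppd[highbit(0x1000) - 1].
 */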
629 static uint64_t
630 metaslab_df_alloc(space_map_t *sm, uint64_t size)
631 {
632 avl_tree_t *t = &sm->sm_root;
633 uint64_t align = size & -size;
634 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
635 uint64_t max_size = metaslab_pp_maxsize(sm);
636 int free_pct = sm->sm_space * 100 / sm->sm_size;
637
638 ASSERT(MUTEX_HELD(sm->sm_lock));
639 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
640
641 if (max_size < size)
642 return (-1ULL);
643
644 /*
645 * If we're running low on space switch to using the size
646 * sorted AVL tree (best-fit).
647 */
648 if (max_size < metaslab_df_alloc_threshold ||
649 free_pct < metaslab_df_free_pct) {
650 t = sm->sm_pp_root;
651 *cursor = 0;
652 }
653
654 return (metaslab_block_picker(t, cursor, size, 1ULL));
655 }
656
657 static boolean_t
658 metaslab_df_fragmented(space_map_t *sm)
659 {
660 uint64_t max_size = metaslab_pp_maxsize(sm);
661 int free_pct = sm->sm_space * 100 / sm->sm_size;
662
663 if (max_size >= metaslab_df_alloc_threshold &&
664 free_pct >= metaslab_df_free_pct)
665 return (B_FALSE);
666
667 return (B_TRUE);
668 }
669
670 static space_map_ops_t metaslab_df_ops = {
671 metaslab_pp_load,
672 metaslab_pp_unload,
673 metaslab_df_alloc,
674 metaslab_pp_claim,
675 metaslab_pp_free,
676 metaslab_pp_maxsize,
677 metaslab_df_fragmented
678 };
679
680 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
681 #endif /* WITH_DF_BLOCK_ALLOCATOR */
682
683 /*
684 * ==========================================================================
685 * Other experimental allocators
686 * ==========================================================================
687 */
688 #if defined(WITH_CDF_BLOCK_ALLOCATOR)
689 static uint64_t
690 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
691 {
692 avl_tree_t *t = &sm->sm_root;
693 uint64_t *cursor = (uint64_t *)sm->sm_ppd;
694 uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
695 uint64_t max_size = metaslab_pp_maxsize(sm);
696 uint64_t rsize = size;
697 uint64_t offset = 0;
698
699 ASSERT(MUTEX_HELD(sm->sm_lock));
700 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
701
702 if (max_size < size)
703 return (-1ULL);
704
705 ASSERT3U(*extent_end, >=, *cursor);
706
707 /*
708 * If we're running low on space switch to using the size
709 * sorted AVL tree (best-fit).
710 */
711 if ((*cursor + size) > *extent_end) {
712
713 t = sm->sm_pp_root;
714 *cursor = *extent_end = 0;
715
716 if (max_size > 2 * SPA_MAXBLOCKSIZE)
717 rsize = MIN(metaslab_min_alloc_size, max_size);
718 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
719 if (offset != -1)
720 *cursor = offset + size;
721 } else {
722 offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
723 }
724 ASSERT3U(*cursor, <=, *extent_end);
725 return (offset);
726 }
727
728 static boolean_t
729 metaslab_cdf_fragmented(space_map_t *sm)
730 {
731 uint64_t max_size = metaslab_pp_maxsize(sm);
732
733 if (max_size > (metaslab_min_alloc_size * 10))
734 return (B_FALSE);
735 return (B_TRUE);
736 }
737
738 static space_map_ops_t metaslab_cdf_ops = {
739 metaslab_pp_load,
740 metaslab_pp_unload,
741 metaslab_cdf_alloc,
742 metaslab_pp_claim,
743 metaslab_pp_free,
744 metaslab_pp_maxsize,
745 metaslab_cdf_fragmented
746 };
747
748 space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops;
749 #endif /* WITH_CDF_BLOCK_ALLOCATOR */
750
751 #if defined(WITH_NDF_BLOCK_ALLOCATOR)
752 uint64_t metaslab_ndf_clump_shift = 4;
753
754 static uint64_t
755 metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
756 {
757 avl_tree_t *t = &sm->sm_root;
758 avl_index_t where;
759 space_seg_t *ss, ssearch;
760 uint64_t hbit = highbit(size);
761 uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
762 uint64_t max_size = metaslab_pp_maxsize(sm);
763
764 ASSERT(MUTEX_HELD(sm->sm_lock));
765 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
766
767 if (max_size < size)
768 return (-1ULL);
769
770 ssearch.ss_start = *cursor;
771 ssearch.ss_end = *cursor + size;
772
773 ss = avl_find(t, &ssearch, &where);
774 if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
775 t = sm->sm_pp_root;
776
777 ssearch.ss_start = 0;
778 ssearch.ss_end = MIN(max_size,
779 1ULL << (hbit + metaslab_ndf_clump_shift));
780 ss = avl_find(t, &ssearch, &where);
781 if (ss == NULL)
782 ss = avl_nearest(t, where, AVL_AFTER);
783 ASSERT(ss != NULL);
784 }
785
786 if (ss != NULL) {
787 if (ss->ss_start + size <= ss->ss_end) {
788 *cursor = ss->ss_start + size;
789 return (ss->ss_start);
790 }
791 }
792 return (-1ULL);
793 }
794
795 static boolean_t
796 metaslab_ndf_fragmented(space_map_t *sm)
797 {
798 uint64_t max_size = metaslab_pp_maxsize(sm);
799
800 if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
801 return (B_FALSE);
802 return (B_TRUE);
803 }
804
805
806 static space_map_ops_t metaslab_ndf_ops = {
807 metaslab_pp_load,
808 metaslab_pp_unload,
809 metaslab_ndf_alloc,
810 metaslab_pp_claim,
811 metaslab_pp_free,
812 metaslab_pp_maxsize,
813 metaslab_ndf_fragmented
814 };
815
816 space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
817 #endif /* WITH_NDF_BLOCK_ALLOCATOR */
818
819 /*
820 * ==========================================================================
821 * Metaslabs
822 * ==========================================================================
823 */
824 metaslab_t *
825 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
826 uint64_t start, uint64_t size, uint64_t txg)
827 {
828 vdev_t *vd = mg->mg_vd;
829 metaslab_t *msp;
830
831 msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE);
832 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
833
834 msp->ms_smo_syncing = *smo;
835
836 /*
837 * We create the main space map here, but we don't create the
838 * allocmaps and freemaps until metaslab_sync_done(). This serves
839 * two purposes: it allows metaslab_sync_done() to detect the
840 * addition of new space; and for debugging, it ensures that we'd get a
841 * data fault on any attempt to use this metaslab before it's ready.
842 */
843 msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_PUSHPAGE);
844 space_map_create(msp->ms_map, start, size,
845 vd->vdev_ashift, &msp->ms_lock);
846
847 metaslab_group_add(mg, msp);
848
849 if (metaslab_debug && smo->smo_object != 0) {
850 mutex_enter(&msp->ms_lock);
851 VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
852 SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
853 mutex_exit(&msp->ms_lock);
854 }
855
856 /*
857 * If we're opening an existing pool (txg == 0) or creating
858 * a new one (txg == TXG_INITIAL), all space is available now.
859 * If we're adding space to an existing pool, the new space
860 * does not become available until after this txg has synced.
861 */
862 if (txg <= TXG_INITIAL)
863 metaslab_sync_done(msp, 0);
864
865 if (txg != 0) {
866 vdev_dirty(vd, 0, NULL, txg);
867 vdev_dirty(vd, VDD_METASLAB, msp, txg);
868 }
869
870 return (msp);
871 }
872
873 void
874 metaslab_fini(metaslab_t *msp)
875 {
876 metaslab_group_t *mg = msp->ms_group;
877 int t;
878
879 vdev_space_update(mg->mg_vd,
880 -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
881
882 metaslab_group_remove(mg, msp);
883
884 mutex_enter(&msp->ms_lock);
885
886 space_map_unload(msp->ms_map);
887 space_map_destroy(msp->ms_map);
888 kmem_free(msp->ms_map, sizeof (*msp->ms_map));
889
890 for (t = 0; t < TXG_SIZE; t++) {
891 space_map_destroy(msp->ms_allocmap[t]);
892 space_map_destroy(msp->ms_freemap[t]);
893 kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
894 kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
895 }
896
897 for (t = 0; t < TXG_DEFER_SIZE; t++) {
898 space_map_destroy(msp->ms_defermap[t]);
899 kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
900 }
901
902 ASSERT0(msp->ms_deferspace);
903
904 mutex_exit(&msp->ms_lock);
905 mutex_destroy(&msp->ms_lock);
906
907 kmem_free(msp, sizeof (metaslab_t));
908 }
909
910 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
911 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
912 #define METASLAB_ACTIVE_MASK \
913 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
914
915 static uint64_t
916 metaslab_weight(metaslab_t *msp)
917 {
918 metaslab_group_t *mg = msp->ms_group;
919 space_map_t *sm = msp->ms_map;
920 space_map_obj_t *smo = &msp->ms_smo;
921 vdev_t *vd = mg->mg_vd;
922 uint64_t weight, space;
923
924 ASSERT(MUTEX_HELD(&msp->ms_lock));
925
926 /*
927 * This vdev is in the process of being removed so there is nothing
928 * for us to do here.
929 */
930 if (vd->vdev_removing) {
931 ASSERT0(smo->smo_alloc);
932 ASSERT0(vd->vdev_ms_shift);
933 return (0);
934 }
935
936 /*
937 * The baseline weight is the metaslab's free space.
938 */
939 space = sm->sm_size - smo->smo_alloc;
940 weight = space;
941
942 /*
943 * Modern disks have uniform bit density and constant angular velocity.
944 * Therefore, the outer recording zones are faster (higher bandwidth)
945 * than the inner zones by the ratio of outer to inner track diameter,
946 * which is typically around 2:1. We account for this by assigning
947 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
948 * In effect, this means that we'll select the metaslab with the most
949 * free bandwidth rather than simply the one with the most free space.
950 */
951 weight = 2 * weight -
952 ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
953 ASSERT(weight >= space && weight <= 2 * space);
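/*
 * Worked example (hypothetical layout): on a vdev with vdev_ms_count = 100,
 * the metaslab at index 0 keeps weight = 2 * space, the one at index 50
 * weighs about 1.5 * space, and the one at index 99 weighs just over
 * 1 * space, so lower-offset (outer-track) metaslabs win ties on free space.
 */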
954
955 /*
956 * For locality, assign higher weight to metaslabs which have
957 * a lower offset than what we've already activated.
958 */
959 if (sm->sm_start <= mg->mg_bonus_area)
960 weight *= (metaslab_smo_bonus_pct / 100);
961 ASSERT(weight >= space &&
962 weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
963
964 if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
965 /*
966 * If this metaslab is one we're actively using, adjust its
967 * weight to make it preferable to any inactive metaslab so
968 * we'll polish it off.
969 */
970 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
971 }
972 return (weight);
973 }
974
975 static void
976 metaslab_prefetch(metaslab_group_t *mg)
977 {
978 spa_t *spa = mg->mg_vd->vdev_spa;
979 metaslab_t *msp;
980 avl_tree_t *t = &mg->mg_metaslab_tree;
981 int m;
982
983 mutex_enter(&mg->mg_lock);
984
985 /*
986 * Prefetch the next potential metaslabs
987 */
988 for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
989 space_map_t *sm = msp->ms_map;
990 space_map_obj_t *smo = &msp->ms_smo;
991
992 /* If we have reached our prefetch limit then we're done */
993 if (m >= metaslab_prefetch_limit)
994 break;
995
996 if (!sm->sm_loaded && smo->smo_object != 0) {
997 mutex_exit(&mg->mg_lock);
998 dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
999 0ULL, smo->smo_objsize);
1000 mutex_enter(&mg->mg_lock);
1001 }
1002 }
1003 mutex_exit(&mg->mg_lock);
1004 }
1005
1006 static int
1007 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1008 {
1009 metaslab_group_t *mg = msp->ms_group;
1010 space_map_t *sm = msp->ms_map;
1011 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
1012 int t;
1013
1014 ASSERT(MUTEX_HELD(&msp->ms_lock));
1015
1016 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1017 space_map_load_wait(sm);
1018 if (!sm->sm_loaded) {
1019 space_map_obj_t *smo = &msp->ms_smo;
1020
1021 int error = space_map_load(sm, sm_ops, SM_FREE, smo,
1022 spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
1023 if (error) {
1024 metaslab_group_sort(msp->ms_group, msp, 0);
1025 return (error);
1026 }
1027 for (t = 0; t < TXG_DEFER_SIZE; t++)
1028 space_map_walk(msp->ms_defermap[t],
1029 space_map_claim, sm);
1030
1031 }
1032
1033 /*
1034 * Track the bonus area as we activate new metaslabs.
1035 */
1036 if (sm->sm_start > mg->mg_bonus_area) {
1037 mutex_enter(&mg->mg_lock);
1038 mg->mg_bonus_area = sm->sm_start;
1039 mutex_exit(&mg->mg_lock);
1040 }
1041
1042 metaslab_group_sort(msp->ms_group, msp,
1043 msp->ms_weight | activation_weight);
1044 }
1045 ASSERT(sm->sm_loaded);
1046 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1047
1048 return (0);
1049 }
1050
1051 static void
1052 metaslab_passivate(metaslab_t *msp, uint64_t size)
1053 {
1054 /*
1055 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1056 * this metaslab again. In that case, it had better be empty,
1057 * or we would be leaving space on the table.
1058 */
1059 ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
1060 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1061 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1062 }
1063
1064 /*
1065 * Determine if the in-core space map representation can be condensed on-disk.
1066 * We would like to use the following criteria to make our decision:
1067 *
1068 * 1. The size of the space map object should not dramatically increase as a
1069 * result of writing out our in-core free map.
1070 *
1071 * 2. The on-disk representation should be at least zfs_condense_pct/100 times
1072 * the size of the minimal in-core form (i.e. zfs_condense_pct = 110 and a
1073 * minimal form of 1MB mean the on-disk map must be at least 1.1MB).
1074 *
1075 * Checking the first condition is tricky since we don't want to walk
1076 * the entire AVL tree calculating the estimated on-disk size. Instead we
1077 * use the size-ordered AVL tree in the space map and calculate the
1078 * size required for the largest segment in our in-core free map. If the
1079 * size required to represent that segment on disk is larger than the space
1080 * map object then we avoid condensing this map.
1081 *
1082 * To determine the second criterion we use a best-case estimate and assume
1083 * each segment can be represented on-disk as a single 64-bit entry. We refer
1084 * to this best-case estimate as the space map's minimal form.
1085 */
1086 static boolean_t
1087 metaslab_should_condense(metaslab_t *msp)
1088 {
1089 space_map_t *sm = msp->ms_map;
1090 space_map_obj_t *smo = &msp->ms_smo_syncing;
1091 space_seg_t *ss;
1092 uint64_t size, entries, segsz;
1093
1094 ASSERT(MUTEX_HELD(&msp->ms_lock));
1095 ASSERT(sm->sm_loaded);
1096
1097 /*
1098 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
1099 * the largest segment in the in-core free map. If the tree is
1100 * empty then we should condense the map.
1101 */
1102 ss = avl_last(sm->sm_pp_root);
1103 if (ss == NULL)
1104 return (B_TRUE);
1105
1106 /*
1107 * Calculate the number of 64-bit entries this segment would
1108 * require when written to disk. If this single segment would be
1109 * larger on-disk than the entire current on-disk structure, then
1110 * clearly condensing will increase the on-disk structure size.
1111 */
1112 size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
1113 entries = size / (MIN(size, SM_RUN_MAX));
1114 segsz = entries * sizeof (uint64_t);
1115
1116 return (segsz <= smo->smo_objsize &&
1117 smo->smo_objsize >= (zfs_condense_pct *
1118 sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
1119 }
1120
1121 /*
1122 * Condense the on-disk space map representation to its minimized form.
1123 * The minimized form consists of a small number of allocations followed by
1124 * the in-core free map.
1125 */
1126 static void
1127 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1128 {
1129 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1130 space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
1131 space_map_t condense_map;
1132 space_map_t *sm = msp->ms_map;
1133 objset_t *mos = spa_meta_objset(spa);
1134 space_map_obj_t *smo = &msp->ms_smo_syncing;
1135 int t;
1136
1137 ASSERT(MUTEX_HELD(&msp->ms_lock));
1138 ASSERT3U(spa_sync_pass(spa), ==, 1);
1139 ASSERT(sm->sm_loaded);
1140
1141 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1142 "smo size %llu, segments %lu", txg,
1143 (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1144 smo->smo_objsize, avl_numnodes(&sm->sm_root));
1145
1146 /*
1147 * Create a map that is a 100% allocated map. We remove segments
1148 * that have been freed in this txg, any deferred frees that exist,
1149 * and any allocations in the future. Removing segments should be
1150 * a relatively inexpensive operation since we expect these maps to
1151 * contain a small number of nodes.
1152 */
1153 space_map_create(&condense_map, sm->sm_start, sm->sm_size,
1154 sm->sm_shift, sm->sm_lock);
1155 space_map_add(&condense_map, condense_map.sm_start,
1156 condense_map.sm_size);
1157
1158 /*
1159 * Remove what's been freed in this txg from the condense_map.
1160 * Since we're in sync_pass 1, we know that all the frees from
1161 * this txg are in the freemap.
1162 */
1163 space_map_walk(freemap, space_map_remove, &condense_map);
1164
1165 for (t = 0; t < TXG_DEFER_SIZE; t++)
1166 space_map_walk(msp->ms_defermap[t],
1167 space_map_remove, &condense_map);
1168
1169 for (t = 1; t < TXG_CONCURRENT_STATES; t++)
1170 space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
1171 space_map_remove, &condense_map);
1172
1173 /*
1174 * We're about to drop the metaslab's lock thus allowing
1175 * other consumers to change its content. Set the
1176 * space_map's sm_condensing flag to ensure that
1177 * allocations on this metaslab do not occur while we're
1178 * in the middle of committing it to disk. This is only critical
1179 * for the ms_map as all other space_maps use per txg
1180 * views of their content.
1181 */
1182 sm->sm_condensing = B_TRUE;
1183
1184 mutex_exit(&msp->ms_lock);
1185 space_map_truncate(smo, mos, tx);
1186 mutex_enter(&msp->ms_lock);
1187
1188 /*
1189 * While we would ideally like to create a space_map representation
1190 * that consists only of allocation records, doing so can be
1191 * prohibitively expensive because the in-core free map can be
1192 * large, and therefore computationally expensive to subtract
1193 * from the condense_map. Instead we sync out two maps, a cheap
1194 * allocation only map followed by the in-core free map. While not
1195 * optimal, this is typically close to optimal, and much cheaper to
1196 * compute.
1197 */
1198 space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
1199 space_map_vacate(&condense_map, NULL, NULL);
1200 space_map_destroy(&condense_map);
1201
1202 space_map_sync(sm, SM_FREE, smo, mos, tx);
1203 sm->sm_condensing = B_FALSE;
1204
1205 spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
1206 "smo size %llu", txg,
1207 (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1208 smo->smo_objsize);
1209 }
1210
1211 /*
1212 * Write a metaslab to disk in the context of the specified transaction group.
1213 */
1214 void
1215 metaslab_sync(metaslab_t *msp, uint64_t txg)
1216 {
1217 vdev_t *vd = msp->ms_group->mg_vd;
1218 spa_t *spa = vd->vdev_spa;
1219 objset_t *mos = spa_meta_objset(spa);
1220 space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
1221 space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
1222 space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1223 space_map_t *sm = msp->ms_map;
1224 space_map_obj_t *smo = &msp->ms_smo_syncing;
1225 dmu_buf_t *db;
1226 dmu_tx_t *tx;
1227
1228 ASSERT(!vd->vdev_ishole);
1229
1230 /*
1231 * This metaslab has just been added so there's no work to do now.
1232 */
1233 if (*freemap == NULL) {
1234 ASSERT3P(allocmap, ==, NULL);
1235 return;
1236 }
1237
1238 ASSERT3P(allocmap, !=, NULL);
1239 ASSERT3P(*freemap, !=, NULL);
1240 ASSERT3P(*freed_map, !=, NULL);
1241
1242 if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
1243 return;
1244
1245 /*
1246 * The only state that can actually be changing concurrently with
1247 * metaslab_sync() is the metaslab's ms_map. No other thread can
1248 * be modifying this txg's allocmap, freemap, freed_map, or smo.
1249 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
1250 * We drop it whenever we call into the DMU, because the DMU
1251 * can call down to us (e.g. via zio_free()) at any time.
1252 */
1253
1254 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1255
1256 if (smo->smo_object == 0) {
1257 ASSERT(smo->smo_objsize == 0);
1258 ASSERT(smo->smo_alloc == 0);
1259 smo->smo_object = dmu_object_alloc(mos,
1260 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1261 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1262 ASSERT(smo->smo_object != 0);
1263 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1264 (sm->sm_start >> vd->vdev_ms_shift),
1265 sizeof (uint64_t), &smo->smo_object, tx);
1266 }
1267
1268 mutex_enter(&msp->ms_lock);
1269
1270 if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
1271 metaslab_should_condense(msp)) {
1272 metaslab_condense(msp, txg, tx);
1273 } else {
1274 space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1275 space_map_sync(*freemap, SM_FREE, smo, mos, tx);
1276 }
1277
1278 space_map_vacate(allocmap, NULL, NULL);
1279
1280 /*
1281 * For sync pass 1, we avoid walking the entire space map and
1282 * instead will just swap the pointers for freemap and
1283 * freed_map. We can safely do this since the freed_map is
1284 * guaranteed to be empty on the initial pass.
1285 */
1286 if (spa_sync_pass(spa) == 1) {
1287 ASSERT0((*freed_map)->sm_space);
1288 ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
1289 space_map_swap(freemap, freed_map);
1290 } else {
1291 space_map_vacate(*freemap, space_map_add, *freed_map);
1292 }
1293
1294 ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
1295 ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
1296
1297 mutex_exit(&msp->ms_lock);
1298
1299 VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1300 dmu_buf_will_dirty(db, tx);
1301 ASSERT3U(db->db_size, >=, sizeof (*smo));
1302 bcopy(smo, db->db_data, sizeof (*smo));
1303 dmu_buf_rele(db, FTAG);
1304
1305 dmu_tx_commit(tx);
1306 }
1307
1308 /*
1309 * Called after a transaction group has completely synced to mark
1310 * all of the metaslab's free space as usable.
1311 */
1312 void
1313 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1314 {
1315 space_map_obj_t *smo = &msp->ms_smo;
1316 space_map_obj_t *smosync = &msp->ms_smo_syncing;
1317 space_map_t *sm = msp->ms_map;
1318 space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1319 space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1320 metaslab_group_t *mg = msp->ms_group;
1321 vdev_t *vd = mg->mg_vd;
1322 int64_t alloc_delta, defer_delta;
1323 int t;
1324
1325 ASSERT(!vd->vdev_ishole);
1326
1327 mutex_enter(&msp->ms_lock);
1328
1329 /*
1330 * If this metaslab is just becoming available, initialize its
1331 * allocmaps, freemaps, and defermap and add its capacity to the vdev.
1332 */
1333 if (*freed_map == NULL) {
1334 ASSERT(*defer_map == NULL);
1335 for (t = 0; t < TXG_SIZE; t++) {
1336 msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
1337 KM_PUSHPAGE);
1338 space_map_create(msp->ms_allocmap[t], sm->sm_start,
1339 sm->sm_size, sm->sm_shift, sm->sm_lock);
1340 msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
1341 KM_PUSHPAGE);
1342 space_map_create(msp->ms_freemap[t], sm->sm_start,
1343 sm->sm_size, sm->sm_shift, sm->sm_lock);
1344 }
1345
1346 for (t = 0; t < TXG_DEFER_SIZE; t++) {
1347 msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
1348 KM_PUSHPAGE);
1349 space_map_create(msp->ms_defermap[t], sm->sm_start,
1350 sm->sm_size, sm->sm_shift, sm->sm_lock);
1351 }
1352
1353 freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1354 defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1355
1356 vdev_space_update(vd, 0, 0, sm->sm_size);
1357 }
1358
1359 alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1360 defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
1361
1362 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1363
1364 ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
1365 ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
1366
1367 /*
1368 * If there's a space_map_load() in progress, wait for it to complete
1369 * so that we have a consistent view of the in-core space map.
1370 */
1371 space_map_load_wait(sm);
1372
1373 /*
1374 * Move the frees from the defer_map to this map (if it's loaded).
1375 * Swap the freed_map and the defer_map -- this is safe to do
1376 * because we've just emptied out the defer_map.
1377 */
1378 space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1379 ASSERT0((*defer_map)->sm_space);
1380 ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
1381 space_map_swap(freed_map, defer_map);
1382
1383 *smo = *smosync;
1384
1385 msp->ms_deferspace += defer_delta;
1386 ASSERT3S(msp->ms_deferspace, >=, 0);
1387 ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1388 if (msp->ms_deferspace != 0) {
1389 /*
1390 * Keep syncing this metaslab until all deferred frees
1391 * are back in circulation.
1392 */
1393 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1394 }
1395
1396 metaslab_group_alloc_update(mg);
1397
1398 /*
1399 * If the map is loaded but no longer active, evict it as soon as all
1400 * future allocations have synced. (If we unloaded it now and then
1401 * loaded a moment later, the map wouldn't reflect those allocations.)
1402 */
1403 if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1404 int evictable = 1;
1405
1406 for (t = 1; t < TXG_CONCURRENT_STATES; t++)
1407 if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1408 evictable = 0;
1409
1410 if (evictable && !metaslab_debug)
1411 space_map_unload(sm);
1412 }
1413
1414 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1415
1416 mutex_exit(&msp->ms_lock);
1417 }
1418
1419 void
1420 metaslab_sync_reassess(metaslab_group_t *mg)
1421 {
1422 vdev_t *vd = mg->mg_vd;
1423 int64_t failures = mg->mg_alloc_failures;
1424 int m;
1425
1426 /*
1427 * Re-evaluate all metaslabs which have lower offsets than the
1428 * bonus area.
1429 */
1430 for (m = 0; m < vd->vdev_ms_count; m++) {
1431 metaslab_t *msp = vd->vdev_ms[m];
1432
1433 if (msp->ms_map->sm_start > mg->mg_bonus_area)
1434 break;
1435
1436 mutex_enter(&msp->ms_lock);
1437 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1438 mutex_exit(&msp->ms_lock);
1439 }
1440
1441 atomic_add_64(&mg->mg_alloc_failures, -failures);
1442
1443 /*
1444 * Prefetch the next potential metaslabs
1445 */
1446 metaslab_prefetch(mg);
1447 }
1448
1449 static uint64_t
1450 metaslab_distance(metaslab_t *msp, dva_t *dva)
1451 {
1452 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1453 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1454 uint64_t start = msp->ms_map->sm_start >> ms_shift;
1455
1456 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1457 return (1ULL << 63);
1458
1459 if (offset < start)
1460 return ((start - offset) << ms_shift);
1461 if (offset > start)
1462 return ((offset - start) << ms_shift);
1463 return (0);
1464 }
1465
1466 static uint64_t
1467 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1468 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1469 {
1470 spa_t *spa = mg->mg_vd->vdev_spa;
1471 metaslab_t *msp = NULL;
1472 uint64_t offset = -1ULL;
1473 avl_tree_t *t = &mg->mg_metaslab_tree;
1474 uint64_t activation_weight;
1475 uint64_t target_distance;
1476 int i;
1477
1478 activation_weight = METASLAB_WEIGHT_PRIMARY;
1479 for (i = 0; i < d; i++) {
1480 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1481 activation_weight = METASLAB_WEIGHT_SECONDARY;
1482 break;
1483 }
1484 }
1485
1486 for (;;) {
1487 boolean_t was_active;
1488
1489 mutex_enter(&mg->mg_lock);
1490 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1491 if (msp->ms_weight < asize) {
1492 spa_dbgmsg(spa, "%s: failed to meet weight "
1493 "requirement: vdev %llu, txg %llu, mg %p, "
1494 "msp %p, psize %llu, asize %llu, "
1495 "failures %llu, weight %llu",
1496 spa_name(spa), mg->mg_vd->vdev_id, txg,
1497 mg, msp, psize, asize,
1498 mg->mg_alloc_failures, msp->ms_weight);
1499 mutex_exit(&mg->mg_lock);
1500 return (-1ULL);
1501 }
1502
1503 /*
1504 * If the selected metaslab is condensing, skip it.
1505 */
1506 if (msp->ms_map->sm_condensing)
1507 continue;
1508
1509 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1510 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1511 break;
1512
1513 target_distance = min_distance +
1514 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1515
1516 for (i = 0; i < d; i++)
1517 if (metaslab_distance(msp, &dva[i]) <
1518 target_distance)
1519 break;
1520 if (i == d)
1521 break;
1522 }
1523 mutex_exit(&mg->mg_lock);
1524 if (msp == NULL)
1525 return (-1ULL);
1526
1527 mutex_enter(&msp->ms_lock);
1528
1529 /*
1530 * If we've already reached the allowable number of failed
1531 * allocation attempts on this metaslab group then we
1532 * consider skipping it. We skip it only if we're allowed
1533 * to "fast" gang, the physical size is larger than
1534 * a gang block, and we're attempting to allocate from
1535 * the primary metaslab.
1536 */
1537 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1538 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1539 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1540 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1541 "vdev %llu, txg %llu, mg %p, psize %llu, "
1542 "asize %llu, failures %llu", spa_name(spa),
1543 mg->mg_vd->vdev_id, txg, mg, psize, asize,
1544 mg->mg_alloc_failures);
1545 mutex_exit(&msp->ms_lock);
1546 return (-1ULL);
1547 }
1548
1549 /*
1550 * Ensure that the metaslab we have selected is still
1551 * capable of handling our request. It's possible that
1552 * another thread may have changed the weight while we
1553 * were blocked on the metaslab lock.
1554 */
1555 if (msp->ms_weight < asize || (was_active &&
1556 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1557 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1558 mutex_exit(&msp->ms_lock);
1559 continue;
1560 }
1561
1562 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1563 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1564 metaslab_passivate(msp,
1565 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1566 mutex_exit(&msp->ms_lock);
1567 continue;
1568 }
1569
1570 if (metaslab_activate(msp, activation_weight) != 0) {
1571 mutex_exit(&msp->ms_lock);
1572 continue;
1573 }
1574
1575 /*
1576 * If this metaslab is currently condensing then pick again as
1577 * we can't manipulate this metaslab until it's committed
1578 * to disk.
1579 */
1580 if (msp->ms_map->sm_condensing) {
1581 mutex_exit(&msp->ms_lock);
1582 continue;
1583 }
1584
1585 if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
1586 break;
1587
1588 atomic_inc_64(&mg->mg_alloc_failures);
1589
1590 metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
1591
1592 mutex_exit(&msp->ms_lock);
1593 }
1594
1595 if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
1596 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1597
1598 space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1599
1600 mutex_exit(&msp->ms_lock);
1601
1602 return (offset);
1603 }
1604
1605 /*
1606 * Allocate a block for the specified i/o.
1607 */
1608 static int
1609 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1610 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1611 {
1612 metaslab_group_t *mg, *fast_mg, *rotor;
1613 vdev_t *vd;
1614 int dshift = 3;
1615 int all_zero;
1616 int zio_lock = B_FALSE;
1617 boolean_t allocatable;
1618 uint64_t offset = -1ULL;
1619 uint64_t asize;
1620 uint64_t distance;
1621
1622 ASSERT(!DVA_IS_VALID(&dva[d]));
1623
1624 /*
1625 * For testing, make some blocks above a certain size be gang blocks.
1626 */
1627 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1628 return (SET_ERROR(ENOSPC));
1629
1630 if (flags & METASLAB_FASTWRITE)
1631 mutex_enter(&mc->mc_fastwrite_lock);
1632
1633 /*
1634 * Start at the rotor and loop through all mgs until we find something.
1635 * Note that there's no locking on mc_rotor or mc_aliquot because
1636 * nothing actually breaks if we miss a few updates -- we just won't
1637 * allocate quite as evenly. It all balances out over time.
1638 *
1639 * If we are doing ditto or log blocks, try to spread them across
1640 * consecutive vdevs. If we're forced to reuse a vdev before we've
1641 * allocated all of our ditto blocks, then try and spread them out on
1642 * that vdev as much as possible. If it turns out to not be possible,
1643 * gradually lower our standards until anything becomes acceptable.
1644 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1645 * gives us hope of containing our fault domains to something we're
1646 * able to reason about. Otherwise, any two top-level vdev failures
1647 * will guarantee the loss of data. With consecutive allocation,
1648 * only two adjacent top-level vdev failures will result in data loss.
1649 *
1650 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1651 * ourselves on the same vdev as our gang block header. That
1652 * way, we can hope for locality in vdev_cache, plus it makes our
1653 * fault domains something tractable.
1654 */
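/*
 * For instance (hypothetical block): writing a block with copies=3 and no
 * hint places dva[0] starting at the rotor, then for d == 1 and d == 2
 * starts the search at the metaslab group following the vdev used for the
 * previous copy, which is what the "d != 0" branch below implements.
 */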
1655 if (hintdva) {
1656 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1657
1658 /*
1659 * It's possible the vdev we're using as the hint no
1660 * longer exists (i.e. removed). Consult the rotor when
1661 * all else fails.
1662 */
1663 if (vd != NULL) {
1664 mg = vd->vdev_mg;
1665
1666 if (flags & METASLAB_HINTBP_AVOID &&
1667 mg->mg_next != NULL)
1668 mg = mg->mg_next;
1669 } else {
1670 mg = mc->mc_rotor;
1671 }
1672 } else if (d != 0) {
1673 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1674 mg = vd->vdev_mg->mg_next;
1675 } else if (flags & METASLAB_FASTWRITE) {
1676 mg = fast_mg = mc->mc_rotor;
1677
1678 do {
1679 if (fast_mg->mg_vd->vdev_pending_fastwrite <
1680 mg->mg_vd->vdev_pending_fastwrite)
1681 mg = fast_mg;
1682 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
1683
1684 } else {
1685 mg = mc->mc_rotor;
1686 }
1687
1688 /*
1689 * If the hint put us into the wrong metaslab class, or into a
1690 * metaslab group that has been passivated, just follow the rotor.
1691 */
1692 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1693 mg = mc->mc_rotor;
1694
1695 rotor = mg;
1696 top:
1697 all_zero = B_TRUE;
1698 do {
1699 ASSERT(mg->mg_activation_count == 1);
1700
1701 vd = mg->mg_vd;
1702
1703 /*
1704 * Don't allocate from faulted devices.
1705 */
1706 if (zio_lock) {
1707 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1708 allocatable = vdev_allocatable(vd);
1709 spa_config_exit(spa, SCL_ZIO, FTAG);
1710 } else {
1711 allocatable = vdev_allocatable(vd);
1712 }
1713
1714 /*
1715 * Determine if the selected metaslab group is eligible
1716 * for allocations. If we're ganging or have requested
1717 * an allocation for the smallest gang block size
1718 * then we don't want to avoid allocating to this
1719 * metaslab group. If we're in this condition we should
1720 * try to allocate from any device possible so that we
1721 * don't inadvertently return ENOSPC and suspend the pool
1722 * even though space is still available.
1723 */
1724 if (allocatable && CAN_FASTGANG(flags) &&
1725 psize > SPA_GANGBLOCKSIZE)
1726 allocatable = metaslab_group_allocatable(mg);
1727
1728 if (!allocatable)
1729 goto next;
1730
1731 /*
1732 * Avoid writing single-copy data to a failing vdev
1733 * unless the user instructs us that it is okay.
1734 */
1735 if ((vd->vdev_stat.vs_write_errors > 0 ||
1736 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1737 d == 0 && dshift == 3 &&
1738 !(zfs_write_to_degraded && vd->vdev_state ==
1739 VDEV_STATE_DEGRADED)) {
1740 all_zero = B_FALSE;
1741 goto next;
1742 }
1743
1744 ASSERT(mg->mg_class == mc);
1745
1746 distance = vd->vdev_asize >> dshift;
1747 if (distance <= (1ULL << vd->vdev_ms_shift))
1748 distance = 0;
1749 else
1750 all_zero = B_FALSE;
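/*
 * E.g. with the initial dshift of 3 the requested spread between copies
 * starts at one eighth of the vdev's size and is halved on each retry pass
 * through the rotor (dshift++ below) as the allocator relaxes its placement
 * standards.
 */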
1751
1752 asize = vdev_psize_to_asize(vd, psize);
1753 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1754
1755 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1756 dva, d, flags);
1757 if (offset != -1ULL) {
1758 /*
1759 * If we've just selected this metaslab group,
1760 * figure out whether the corresponding vdev is
1761 * over- or under-used relative to the pool,
1762 * and set an allocation bias to even it out.
1763 */
1764 if (mc->mc_aliquot == 0) {
1765 vdev_stat_t *vs = &vd->vdev_stat;
1766 int64_t vu, cu;
1767
1768 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1769 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1770
1771 /*
1772 * Calculate how much more or less we should
1773 * try to allocate from this device during
1774 * this iteration around the rotor.
1775 * For example, if a device is 80% full
1776 * and the pool is 20% full then we should
1777 * reduce allocations by 60% on this device.
1778 *
1779 * mg_bias = (20 - 80) * 512K / 100 = -307K
1780 *
1781 * This reduces allocations by 307K for this
1782 * iteration.
1783 */
1784 mg->mg_bias = ((cu - vu) *
1785 (int64_t)mg->mg_aliquot) / 100;
1786 }
1787
1788 if ((flags & METASLAB_FASTWRITE) ||
1789 atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1790 mg->mg_aliquot + mg->mg_bias) {
1791 mc->mc_rotor = mg->mg_next;
1792 mc->mc_aliquot = 0;
1793 }
1794
1795 DVA_SET_VDEV(&dva[d], vd->vdev_id);
1796 DVA_SET_OFFSET(&dva[d], offset);
1797 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1798 DVA_SET_ASIZE(&dva[d], asize);
1799
1800 if (flags & METASLAB_FASTWRITE) {
1801 atomic_add_64(&vd->vdev_pending_fastwrite,
1802 psize);
1803 mutex_exit(&mc->mc_fastwrite_lock);
1804 }
1805
1806 return (0);
1807 }
1808 next:
1809 mc->mc_rotor = mg->mg_next;
1810 mc->mc_aliquot = 0;
1811 } while ((mg = mg->mg_next) != rotor);
1812
1813 if (!all_zero) {
1814 dshift++;
1815 ASSERT(dshift < 64);
1816 goto top;
1817 }
1818
1819 if (!allocatable && !zio_lock) {
1820 dshift = 3;
1821 zio_lock = B_TRUE;
1822 goto top;
1823 }
1824
1825 bzero(&dva[d], sizeof (dva_t));
1826
1827 if (flags & METASLAB_FASTWRITE)
1828 mutex_exit(&mc->mc_fastwrite_lock);
1829
1830 return (SET_ERROR(ENOSPC));
1831 }
1832
1833 /*
1834 * Free the block represented by the given DVA in the context of the
1835 * specified transaction group.
1836 */
1837 static void
1838 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1839 {
1840 uint64_t vdev = DVA_GET_VDEV(dva);
1841 uint64_t offset = DVA_GET_OFFSET(dva);
1842 uint64_t size = DVA_GET_ASIZE(dva);
1843 vdev_t *vd;
1844 metaslab_t *msp;
1845
1846 ASSERT(DVA_IS_VALID(dva));
1847
1848 if (txg > spa_freeze_txg(spa))
1849 return;
1850
1851 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1852 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1853 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1854 (u_longlong_t)vdev, (u_longlong_t)offset);
1855 ASSERT(0);
1856 return;
1857 }
1858
1859 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1860
1861 if (DVA_GET_GANG(dva))
1862 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1863
1864 mutex_enter(&msp->ms_lock);
1865
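/*
 * If 'now' is set, undo an allocation from the currently open txg:
 * remove the range from this txg's allocmap and return it directly
 * to the in-core space map.  Otherwise record the range in this
 * txg's freemap so it is freed when the txg syncs, dirtying the
 * vdev if this is the first free for the txg.
 */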
1866 if (now) {
1867 space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
1868 offset, size);
1869 space_map_free(msp->ms_map, offset, size);
1870 } else {
1871 if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
1872 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1873 space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
1874 }
1875
1876 mutex_exit(&msp->ms_lock);
1877 }
1878
1879 /*
1880 * Intent log support: upon opening the pool after a crash, notify the SPA
1881 * of blocks that the intent log has allocated for immediate write, but
1882 * which are still considered free by the SPA because the last transaction
1883 * group has not yet committed.
1884 */
1885 static int
1886 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1887 {
1888 uint64_t vdev = DVA_GET_VDEV(dva);
1889 uint64_t offset = DVA_GET_OFFSET(dva);
1890 uint64_t size = DVA_GET_ASIZE(dva);
1891 vdev_t *vd;
1892 metaslab_t *msp;
1893 int error = 0;
1894
1895 ASSERT(DVA_IS_VALID(dva));
1896
1897 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1898 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1899 return (SET_ERROR(ENXIO));
1900
1901 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1902
1903 if (DVA_GET_GANG(dva))
1904 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1905
1906 mutex_enter(&msp->ms_lock);
1907
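/*
 * Activate the metaslab for a real claim on a writeable pool, or
 * whenever its space map is not yet loaded; the space_map_contains()
 * check below requires a loaded map.
 */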
1908 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
1909 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1910
1911 if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
1912 error = SET_ERROR(ENOENT);
1913
1914 if (error || txg == 0) { /* txg == 0 indicates dry run */
1915 mutex_exit(&msp->ms_lock);
1916 return (error);
1917 }
1918
1919 space_map_claim(msp->ms_map, offset, size);
1920
1921 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
1922 if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
1923 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1924 space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
1925 }
1926
1927 mutex_exit(&msp->ms_lock);
1928
1929 return (0);
1930 }
1931
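/*
 * Allocate 'ndvas' DVAs for a 'psize'-byte block and record them in 'bp'.
 * On failure, any DVAs already allocated for the block are freed again so
 * the operation is all-or-nothing.
 */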
1932 int
1933 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1934 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1935 {
1936 dva_t *dva = bp->blk_dva;
1937 dva_t *hintdva = hintbp->blk_dva;
1938 int d, error = 0;
1939
1940 ASSERT(bp->blk_birth == 0);
1941 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1942
1943 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1944
1945 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
1946 spa_config_exit(spa, SCL_ALLOC, FTAG);
1947 return (SET_ERROR(ENOSPC));
1948 }
1949
1950 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1951 ASSERT(BP_GET_NDVAS(bp) == 0);
1952 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1953
1954 for (d = 0; d < ndvas; d++) {
1955 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1956 txg, flags);
1957 if (error) {
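/*
 * Back out the DVAs we already allocated so that the
 * block pointer is left untouched on failure.
 */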
1958 for (d--; d >= 0; d--) {
1959 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1960 bzero(&dva[d], sizeof (dva_t));
1961 }
1962 spa_config_exit(spa, SCL_ALLOC, FTAG);
1963 return (error);
1964 }
1965 }
1966 ASSERT(error == 0);
1967 ASSERT(BP_GET_NDVAS(bp) == ndvas);
1968
1969 spa_config_exit(spa, SCL_ALLOC, FTAG);
1970
1971 BP_SET_BIRTH(bp, txg, txg);
1972
1973 return (0);
1974 }
1975
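/*
 * Free every DVA in the given block pointer in the context of 'txg'.
 */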
1976 void
1977 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1978 {
1979 const dva_t *dva = bp->blk_dva;
1980 int d, ndvas = BP_GET_NDVAS(bp);
1981
1982 ASSERT(!BP_IS_HOLE(bp));
1983 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1984
1985 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1986
1987 for (d = 0; d < ndvas; d++)
1988 metaslab_free_dva(spa, &dva[d], txg, now);
1989
1990 spa_config_exit(spa, SCL_FREE, FTAG);
1991 }
1992
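/*
 * Claim every DVA in the given block pointer, performing a dry run first
 * (txg == 0) so that a partial failure never needs to be unwound.
 */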
1993 int
1994 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1995 {
1996 const dva_t *dva = bp->blk_dva;
1997 int ndvas = BP_GET_NDVAS(bp);
1998 int d, error = 0;
1999
2000 ASSERT(!BP_IS_HOLE(bp));
2001
2002 if (txg != 0) {
2003 /*
2004 * First do a dry run to make sure all DVAs are claimable,
2005 * so we don't have to unwind from partial failures below.
2006 */
2007 if ((error = metaslab_claim(spa, bp, 0)) != 0)
2008 return (error);
2009 }
2010
2011 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2012
2013 for (d = 0; d < ndvas; d++)
2014 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2015 break;
2016
2017 spa_config_exit(spa, SCL_ALLOC, FTAG);
2018
2019 ASSERT(error == 0 || txg == 0);
2020
2021 return (error);
2022 }
2023
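/*
 * Account an in-flight fastwrite (used for intent log blocks) against each
 * top-level vdev referenced by the block pointer.
 */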
2024 void
2025 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
2026 {
2027 const dva_t *dva = bp->blk_dva;
2028 int ndvas = BP_GET_NDVAS(bp);
2029 uint64_t psize = BP_GET_PSIZE(bp);
2030 int d;
2031 vdev_t *vd;
2032
2033 ASSERT(!BP_IS_HOLE(bp));
2034 ASSERT(psize > 0);
2035
2036 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2037
2038 for (d = 0; d < ndvas; d++) {
2039 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
2040 continue;
2041 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
2042 }
2043
2044 spa_config_exit(spa, SCL_VDEV, FTAG);
2045 }
2046
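/*
 * Undo metaslab_fastwrite_mark() once the corresponding write has
 * completed (or will never be issued).
 */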
2047 void
2048 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
2049 {
2050 const dva_t *dva = bp->blk_dva;
2051 int ndvas = BP_GET_NDVAS(bp);
2052 uint64_t psize = BP_GET_PSIZE(bp);
2053 int d;
2054 vdev_t *vd;
2055
2056 ASSERT(!BP_IS_HOLE(bp));
2057 ASSERT(psize > 0);
2058
2059 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2060
2061 for (d = 0; d < ndvas; d++) {
2062 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
2063 continue;
2064 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
2065 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
2066 }
2067
2068 spa_config_exit(spa, SCL_VDEV, FTAG);
2069 }
2070
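/*
 * Panic if any portion of the given range is already present in the
 * space map, which would indicate the block being freed is already
 * free or already pending free.
 */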
2071 static void
2072 checkmap(space_map_t *sm, uint64_t off, uint64_t size)
2073 {
2074 space_seg_t *ss;
2075 avl_index_t where;
2076
2077 mutex_enter(sm->sm_lock);
2078 ss = space_map_find(sm, off, size, &where);
2079 if (ss != NULL)
2080 panic("freeing free block; ss=%p", (void *)ss);
2081 mutex_exit(sm->sm_lock);
2082 }
2083
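/*
 * Debug check (enabled via ZFS_DEBUG_ZIO_FREE): verify that none of the
 * DVAs being freed already appear free or pending free in their metaslabs.
 */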
2084 void
2085 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2086 {
2087 int i, j;
2088
2089 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2090 return;
2091
2092 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2093 for (i = 0; i < BP_GET_NDVAS(bp); i++) {
2094 uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
2095 vdev_t *vd = vdev_lookup_top(spa, vdid);
2096 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
2097 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2098 metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
2099
2100 if (ms->ms_map->sm_loaded)
2101 checkmap(ms->ms_map, off, size);
2102
2103 for (j = 0; j < TXG_SIZE; j++)
2104 checkmap(ms->ms_freemap[j], off, size);
2105 for (j = 0; j < TXG_DEFER_SIZE; j++)
2106 checkmap(ms->ms_defermap[j], off, size);
2107 }
2108 spa_config_exit(spa, SCL_VDEV, FTAG);
2109 }
2110
2111 #if defined(_KERNEL) && defined(HAVE_SPL)
2112 module_param(metaslab_debug, int, 0644);
2113 MODULE_PARM_DESC(metaslab_debug, "keep space maps in core to verify frees");
2114 #endif /* _KERNEL && HAVE_SPL */