/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e., search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 30;
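
/*
 * Illustrative note (added commentary, not from the original authors):
 * with the defaults above, a metaslab switches from first-fit to best-fit
 * when its largest free segment drops below SPA_MAXBLOCKSIZE (128K in this
 * era of the code) or when its free space drops below 30%.  For example,
 * a 1GB metaslab with 200MB free is ~19% free, so it would allocate
 * best-fit even if it still contained large free segments.
 */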

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate.  This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}
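
/*
 * Example (for illustration, added): the per-alignment cursor makes this a
 * rotating first-fit search.  On success the cursor is advanced past the
 * returned block, so the next allocation with the same alignment resumes
 * where the previous one stopped; if nothing past the cursor fits, the
 * search wraps once to offset 0 before giving up with -1ULL.
 */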

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static void
metaslab_ff_load(space_map_t *sm)
{
	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
	sm->sm_pp_root = NULL;
}

static void
metaslab_ff_unload(space_map_t *sm)
{
	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;
}

static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}
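
/*
 * Note (for illustration, added): "size & -size" isolates the lowest set
 * bit of size, i.e. its largest power-of-two factor, which serves as the
 * alignment; e.g. size = 0x3000 (12K) yields align = 0x1000 (4K).  sm_ppd
 * holds 64 cursors, one per alignment bit, indexed by highbit(align) - 1,
 * so allocations of different alignments track independent positions.
 */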

/* ARGSUSED */
static void
metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_ff_load,
	metaslab_ff_unload,
	metaslab_ff_alloc,
	metaslab_ff_claim,
	metaslab_ff_free,
	NULL	/* maxsize */
};
/*
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method.  Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 */

uint64_t
metaslab_df_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}

static int
metaslab_df_seg_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

static void
metaslab_df_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_df_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}

static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	uint64_t max_size = metaslab_df_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

/* ARGSUSED */
static void
metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_df_load,
	metaslab_df_unload,
	metaslab_df_alloc,
	metaslab_df_claim,
	metaslab_df_free,
	metaslab_df_maxsize
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
	uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	space_map_create(&msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		/*
		 * The vdev is dirty, but the metaslab isn't -- it just needs
		 * to have metaslab_sync_done() invoked from vdev_sync_done().
		 * [We could just dirty the metaslab, but that would cause us
		 * to allocate a space map object for it, which is wasteful
		 * and would mess up the locality logic in metaslab_weight().]
		 */
		ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
	}

	return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	int t;

	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo.smo_alloc, B_TRUE);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(&msp->ms_map);
	space_map_destroy(&msp->ms_map);

	for (t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(&msp->ms_allocmap[t]);
		space_map_destroy(&msp->ms_freemap[t]);
	}

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
#define	METASLAB_SMO_BONUS_MULTIPLIER	2
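
/*
 * Note (for illustration, added): the two high-order bits of ms_weight are
 * activation flags, not magnitude.  Since metaslab_compare() sorts by
 * descending weight, an active metaslab (e.g. METASLAB_WEIGHT_PRIMARY
 * OR'd with its free-space weight) sorts ahead of every inactive one,
 * which is how metaslab_group_alloc() keeps returning to the metaslab it
 * has already activated until that metaslab is passivated.
 */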
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1.  We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
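	/*
	 * Worked example (for illustration, added): for the first metaslab
	 * on a vdev (index 0) the expression below reduces to 2 * weight;
	 * for the last (index ms_count - 1) it is just over 1 * weight, so
	 * the multiplier decays linearly from 2x to 1x across the vdev.
	 */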
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);

	/*
	 * For locality, assign higher weight to metaslabs we've used before.
	 */
	if (smo->smo_object != 0)
		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
	ASSERT(weight >= space &&
	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);

	/*
	 * If this metaslab is one we're actively using, adjust its weight to
	 * make it preferable to any inactive metaslab so we'll polish it off.
	 */
	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (weight);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
	space_map_t *sm = &msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
		if (error) {
			metaslab_group_sort(msp->ms_group, msp, 0);
			return (error);
		}

		/*
		 * If we were able to load the map then make sure
		 * that this map is still able to satisfy our request.
		 */
		if (msp->ms_weight < size)
			return (ENOSPC);

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again.  In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
618 | ||
619 | /* | |
620 | * Write a metaslab to disk in the context of the specified transaction group. | |
621 | */ | |
622 | void | |
623 | metaslab_sync(metaslab_t *msp, uint64_t txg) | |
624 | { | |
625 | vdev_t *vd = msp->ms_group->mg_vd; | |
626 | spa_t *spa = vd->vdev_spa; | |
627 | objset_t *mos = spa->spa_meta_objset; | |
628 | space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; | |
629 | space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; | |
630 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
631 | space_map_t *sm = &msp->ms_map; | |
632 | space_map_obj_t *smo = &msp->ms_smo_syncing; | |
633 | dmu_buf_t *db; | |
634 | dmu_tx_t *tx; | |
635 | int t; | |
636 | ||
637 | tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); | |
638 | ||
639 | /* | |
640 | * The only state that can actually be changing concurrently with | |
641 | * metaslab_sync() is the metaslab's ms_map. No other thread can | |
642 | * be modifying this txg's allocmap, freemap, freed_map, or smo. | |
643 | * Therefore, we only hold ms_lock to satify space_map ASSERTs. | |
644 | * We drop it whenever we call into the DMU, because the DMU | |
645 | * can call down to us (e.g. via zio_free()) at any time. | |
646 | */ | |
647 | mutex_enter(&msp->ms_lock); | |
648 | ||
649 | if (smo->smo_object == 0) { | |
650 | ASSERT(smo->smo_objsize == 0); | |
651 | ASSERT(smo->smo_alloc == 0); | |
652 | mutex_exit(&msp->ms_lock); | |
653 | smo->smo_object = dmu_object_alloc(mos, | |
654 | DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, | |
655 | DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); | |
656 | ASSERT(smo->smo_object != 0); | |
657 | dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * | |
658 | (sm->sm_start >> vd->vdev_ms_shift), | |
659 | sizeof (uint64_t), &smo->smo_object, tx); | |
660 | mutex_enter(&msp->ms_lock); | |
661 | } | |
662 | ||
663 | space_map_walk(freemap, space_map_add, freed_map); | |
664 | ||
665 | if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= | |
666 | 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { | |
667 | /* | |
668 | * The in-core space map representation is twice as compact | |
669 | * as the on-disk one, so it's time to condense the latter | |
670 | * by generating a pure allocmap from first principles. | |
671 | * | |
672 | * This metaslab is 100% allocated, | |
673 | * minus the content of the in-core map (sm), | |
674 | * minus what's been freed this txg (freed_map), | |
675 | * minus allocations from txgs in the future | |
676 | * (because they haven't been committed yet). | |
677 | */ | |
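		/*
		 * Added note: each on-disk space map entry is a uint64_t,
		 * so a freshly condensed map would take roughly 8 bytes
		 * per in-core segment; the test above fires once
		 * append-only churn has made the object at least twice
		 * that size, making a from-scratch rewrite a net win.
		 */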
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int t;

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}
		vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
	}

	vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add everything we freed in this txg to the map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);

	*smo = *smosync;

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		if (evictable)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map.sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}
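
/*
 * Note (for illustration, added): the shift/unshift above rounds the
 * distance to whole metaslabs, and a DVA on another vdev is treated as
 * maximally distant (1ULL << 63), so it trivially satisfies any
 * minimum-distance target a caller can request.
 */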

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
    uint64_t min_distance, dva_t *dva, int d)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < size) {
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}

			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request.  It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < size || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight, size) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
			break;

		metaslab_passivate(msp, size - 1);

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);

	return (offset);
}

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_allocated because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
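	/*
	 * Added note: the "gradually lower our standards" above is dshift.
	 * The required spread between copies starts at 1/8 of the vdev
	 * (vdev_asize >> 3); each unsuccessful pass over the rotor
	 * increments dshift, halving the requirement until it rounds down
	 * to a single metaslab and is dropped entirely.
	 */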
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
		if (flags & METASLAB_HINTBP_AVOID)
			mg = vd->vdev_mg->mg_next;
		else
			mg = vd->vdev_mg;
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong class, just follow the rotor.
	 */
	if (mg->mg_class != mc)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_allocated == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				uint64_t alloc, space;
				int64_t vu, su;

				alloc = spa_get_alloc(spa);
				space = spa_get_space(spa);

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				su = (alloc << 10) / (space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
				 */
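				/*
				 * Worked example (for illustration,
				 * added): if the pool is 50% full
				 * (su = 512) but this vdev is only 25%
				 * full (vu = 256), the bias below is
				 * 256 * aliquot / 4096 = aliquot / 16,
				 * so this vdev holds the rotor about
				 * 6% longer than its fair share.
				 */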
				mg->mg_bias = ((su - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);
			}

			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_allocated = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_allocated = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(&msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	bp->blk_birth = txg;

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}