]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
34dc7c2f BB |
23 | */ |
24 | ||
34dc7c2f | 25 | #include <sys/zfs_context.h> |
34dc7c2f BB |
26 | #include <sys/dmu.h> |
27 | #include <sys/dmu_tx.h> | |
28 | #include <sys/space_map.h> | |
29 | #include <sys/metaslab_impl.h> | |
30 | #include <sys/vdev_impl.h> | |
31 | #include <sys/zio.h> | |
32 | ||
33 | uint64_t metaslab_aliquot = 512ULL << 10; | |
34 | uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ | |
35 | ||
428870ff BB |
36 | /* |
37 | * Metaslab debugging: when set, keeps all space maps in core to verify frees. | |
38 | */ | |
39 | static int metaslab_debug = 0; | |
40 | ||
9babb374 BB |
41 | /* |
42 | * Minimum size which forces the dynamic allocator to change | |
428870ff | 43 | * its allocation strategy. Once the space map cannot satisfy |
9babb374 BB |
44 | * an allocation of this size then it switches to using more |
45 | * aggressive strategy (i.e. search by size rather than offset). | |
46 | */ | |
47 | uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; | |
48 | ||
49 | /* | |
50 | * The minimum free space, in percent, which must be available | |
51 | * in a space map to continue allocations in a first-fit fashion. | |
52 | * Once the space_map's free space drops below this level we dynamically | |
53 | * switch to using best-fit allocations. | |
54 | */ | |
428870ff BB |
55 | int metaslab_df_free_pct = 4; |
56 | ||
57 | /* | |
58 | * A metaslab is considered "free" if it contains a contiguous | |
59 | * segment which is greater than metaslab_min_alloc_size. | |
60 | */ | |
61 | uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; | |
62 | ||
63 | /* | |
64 | * Max number of space_maps to prefetch. | |
65 | */ | |
66 | int metaslab_prefetch_limit = SPA_DVAS_PER_BP; | |
67 | ||
68 | /* | |
69 | * Percentage bonus multiplier for metaslabs that are in the bonus area. | |
70 | */ | |
71 | int metaslab_smo_bonus_pct = 150; | |
9babb374 | 72 | |
34dc7c2f BB |
73 | /* |
74 | * ========================================================================== | |
75 | * Metaslab classes | |
76 | * ========================================================================== | |
77 | */ | |
78 | metaslab_class_t * | |
428870ff | 79 | metaslab_class_create(spa_t *spa, space_map_ops_t *ops) |
34dc7c2f BB |
80 | { |
81 | metaslab_class_t *mc; | |
82 | ||
83 | mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); | |
84 | ||
428870ff | 85 | mc->mc_spa = spa; |
34dc7c2f | 86 | mc->mc_rotor = NULL; |
9babb374 | 87 | mc->mc_ops = ops; |
34dc7c2f BB |
88 | |
89 | return (mc); | |
90 | } | |
91 | ||
92 | void | |
93 | metaslab_class_destroy(metaslab_class_t *mc) | |
94 | { | |
428870ff BB |
95 | ASSERT(mc->mc_rotor == NULL); |
96 | ASSERT(mc->mc_alloc == 0); | |
97 | ASSERT(mc->mc_deferred == 0); | |
98 | ASSERT(mc->mc_space == 0); | |
99 | ASSERT(mc->mc_dspace == 0); | |
34dc7c2f BB |
100 | |
101 | kmem_free(mc, sizeof (metaslab_class_t)); | |
102 | } | |
103 | ||
428870ff BB |
104 | int |
105 | metaslab_class_validate(metaslab_class_t *mc) | |
34dc7c2f | 106 | { |
428870ff BB |
107 | metaslab_group_t *mg; |
108 | vdev_t *vd; | |
34dc7c2f | 109 | |
428870ff BB |
110 | /* |
111 | * Must hold one of the spa_config locks. | |
112 | */ | |
113 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || | |
114 | spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); | |
34dc7c2f | 115 | |
428870ff BB |
116 | if ((mg = mc->mc_rotor) == NULL) |
117 | return (0); | |
118 | ||
119 | do { | |
120 | vd = mg->mg_vd; | |
121 | ASSERT(vd->vdev_mg != NULL); | |
122 | ASSERT3P(vd->vdev_top, ==, vd); | |
123 | ASSERT3P(mg->mg_class, ==, mc); | |
124 | ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); | |
125 | } while ((mg = mg->mg_next) != mc->mc_rotor); | |
126 | ||
127 | return (0); | |
34dc7c2f BB |
128 | } |
129 | ||
130 | void | |
428870ff BB |
131 | metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, |
132 | int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) | |
34dc7c2f | 133 | { |
428870ff BB |
134 | atomic_add_64(&mc->mc_alloc, alloc_delta); |
135 | atomic_add_64(&mc->mc_deferred, defer_delta); | |
136 | atomic_add_64(&mc->mc_space, space_delta); | |
137 | atomic_add_64(&mc->mc_dspace, dspace_delta); | |
138 | } | |
34dc7c2f | 139 | |
428870ff BB |
140 | uint64_t |
141 | metaslab_class_get_alloc(metaslab_class_t *mc) | |
142 | { | |
143 | return (mc->mc_alloc); | |
144 | } | |
34dc7c2f | 145 | |
428870ff BB |
146 | uint64_t |
147 | metaslab_class_get_deferred(metaslab_class_t *mc) | |
148 | { | |
149 | return (mc->mc_deferred); | |
150 | } | |
34dc7c2f | 151 | |
428870ff BB |
152 | uint64_t |
153 | metaslab_class_get_space(metaslab_class_t *mc) | |
154 | { | |
155 | return (mc->mc_space); | |
156 | } | |
34dc7c2f | 157 | |
428870ff BB |
158 | uint64_t |
159 | metaslab_class_get_dspace(metaslab_class_t *mc) | |
160 | { | |
161 | return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); | |
34dc7c2f BB |
162 | } |
163 | ||
164 | /* | |
165 | * ========================================================================== | |
166 | * Metaslab groups | |
167 | * ========================================================================== | |
168 | */ | |
169 | static int | |
170 | metaslab_compare(const void *x1, const void *x2) | |
171 | { | |
172 | const metaslab_t *m1 = x1; | |
173 | const metaslab_t *m2 = x2; | |
174 | ||
175 | if (m1->ms_weight < m2->ms_weight) | |
176 | return (1); | |
177 | if (m1->ms_weight > m2->ms_weight) | |
178 | return (-1); | |
179 | ||
180 | /* | |
181 | * If the weights are identical, use the offset to force uniqueness. | |
182 | */ | |
183 | if (m1->ms_map.sm_start < m2->ms_map.sm_start) | |
184 | return (-1); | |
185 | if (m1->ms_map.sm_start > m2->ms_map.sm_start) | |
186 | return (1); | |
187 | ||
188 | ASSERT3P(m1, ==, m2); | |
189 | ||
190 | return (0); | |
191 | } | |
192 | ||
193 | metaslab_group_t * | |
194 | metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) | |
195 | { | |
196 | metaslab_group_t *mg; | |
197 | ||
198 | mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); | |
199 | mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); | |
200 | avl_create(&mg->mg_metaslab_tree, metaslab_compare, | |
201 | sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); | |
34dc7c2f | 202 | mg->mg_vd = vd; |
428870ff BB |
203 | mg->mg_class = mc; |
204 | mg->mg_activation_count = 0; | |
34dc7c2f BB |
205 | |
206 | return (mg); | |
207 | } | |
208 | ||
209 | void | |
210 | metaslab_group_destroy(metaslab_group_t *mg) | |
211 | { | |
428870ff BB |
212 | ASSERT(mg->mg_prev == NULL); |
213 | ASSERT(mg->mg_next == NULL); | |
214 | /* | |
215 | * We may have gone below zero with the activation count | |
216 | * either because we never activated in the first place or | |
217 | * because we're done, and possibly removing the vdev. | |
218 | */ | |
219 | ASSERT(mg->mg_activation_count <= 0); | |
220 | ||
34dc7c2f BB |
221 | avl_destroy(&mg->mg_metaslab_tree); |
222 | mutex_destroy(&mg->mg_lock); | |
223 | kmem_free(mg, sizeof (metaslab_group_t)); | |
224 | } | |
225 | ||
428870ff BB |
226 | void |
227 | metaslab_group_activate(metaslab_group_t *mg) | |
228 | { | |
229 | metaslab_class_t *mc = mg->mg_class; | |
230 | metaslab_group_t *mgprev, *mgnext; | |
231 | ||
232 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
233 | ||
234 | ASSERT(mc->mc_rotor != mg); | |
235 | ASSERT(mg->mg_prev == NULL); | |
236 | ASSERT(mg->mg_next == NULL); | |
237 | ASSERT(mg->mg_activation_count <= 0); | |
238 | ||
239 | if (++mg->mg_activation_count <= 0) | |
240 | return; | |
241 | ||
242 | mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); | |
243 | ||
244 | if ((mgprev = mc->mc_rotor) == NULL) { | |
245 | mg->mg_prev = mg; | |
246 | mg->mg_next = mg; | |
247 | } else { | |
248 | mgnext = mgprev->mg_next; | |
249 | mg->mg_prev = mgprev; | |
250 | mg->mg_next = mgnext; | |
251 | mgprev->mg_next = mg; | |
252 | mgnext->mg_prev = mg; | |
253 | } | |
254 | mc->mc_rotor = mg; | |
255 | } | |
256 | ||
257 | void | |
258 | metaslab_group_passivate(metaslab_group_t *mg) | |
259 | { | |
260 | metaslab_class_t *mc = mg->mg_class; | |
261 | metaslab_group_t *mgprev, *mgnext; | |
262 | ||
263 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
264 | ||
265 | if (--mg->mg_activation_count != 0) { | |
266 | ASSERT(mc->mc_rotor != mg); | |
267 | ASSERT(mg->mg_prev == NULL); | |
268 | ASSERT(mg->mg_next == NULL); | |
269 | ASSERT(mg->mg_activation_count < 0); | |
270 | return; | |
271 | } | |
272 | ||
273 | mgprev = mg->mg_prev; | |
274 | mgnext = mg->mg_next; | |
275 | ||
276 | if (mg == mgnext) { | |
277 | mc->mc_rotor = NULL; | |
278 | } else { | |
279 | mc->mc_rotor = mgnext; | |
280 | mgprev->mg_next = mgnext; | |
281 | mgnext->mg_prev = mgprev; | |
282 | } | |
283 | ||
284 | mg->mg_prev = NULL; | |
285 | mg->mg_next = NULL; | |
286 | } | |
287 | ||
34dc7c2f BB |
288 | static void |
289 | metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) | |
290 | { | |
291 | mutex_enter(&mg->mg_lock); | |
292 | ASSERT(msp->ms_group == NULL); | |
293 | msp->ms_group = mg; | |
294 | msp->ms_weight = 0; | |
295 | avl_add(&mg->mg_metaslab_tree, msp); | |
296 | mutex_exit(&mg->mg_lock); | |
297 | } | |
298 | ||
299 | static void | |
300 | metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) | |
301 | { | |
302 | mutex_enter(&mg->mg_lock); | |
303 | ASSERT(msp->ms_group == mg); | |
304 | avl_remove(&mg->mg_metaslab_tree, msp); | |
305 | msp->ms_group = NULL; | |
306 | mutex_exit(&mg->mg_lock); | |
307 | } | |
308 | ||
309 | static void | |
310 | metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) | |
311 | { | |
312 | /* | |
313 | * Although in principle the weight can be any value, in | |
314 | * practice we do not use values in the range [1, 510]. | |
315 | */ | |
316 | ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); | |
317 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
318 | ||
319 | mutex_enter(&mg->mg_lock); | |
320 | ASSERT(msp->ms_group == mg); | |
321 | avl_remove(&mg->mg_metaslab_tree, msp); | |
322 | msp->ms_weight = weight; | |
323 | avl_add(&mg->mg_metaslab_tree, msp); | |
324 | mutex_exit(&mg->mg_lock); | |
325 | } | |
326 | ||
428870ff BB |
327 | /* |
328 | * ========================================================================== | |
329 | * Common allocator routines | |
330 | * ========================================================================== | |
331 | */ | |
332 | static int | |
333 | metaslab_segsize_compare(const void *x1, const void *x2) | |
334 | { | |
335 | const space_seg_t *s1 = x1; | |
336 | const space_seg_t *s2 = x2; | |
337 | uint64_t ss_size1 = s1->ss_end - s1->ss_start; | |
338 | uint64_t ss_size2 = s2->ss_end - s2->ss_start; | |
339 | ||
340 | if (ss_size1 < ss_size2) | |
341 | return (-1); | |
342 | if (ss_size1 > ss_size2) | |
343 | return (1); | |
344 | ||
345 | if (s1->ss_start < s2->ss_start) | |
346 | return (-1); | |
347 | if (s1->ss_start > s2->ss_start) | |
348 | return (1); | |
349 | ||
350 | return (0); | |
351 | } | |
352 | ||
34dc7c2f | 353 | /* |
9babb374 BB |
354 | * This is a helper function that can be used by the allocator to find |
355 | * a suitable block to allocate. This will search the specified AVL | |
356 | * tree looking for a block that matches the specified criteria. | |
34dc7c2f | 357 | */ |
34dc7c2f | 358 | static uint64_t |
9babb374 BB |
359 | metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, |
360 | uint64_t align) | |
34dc7c2f | 361 | { |
34dc7c2f BB |
362 | space_seg_t *ss, ssearch; |
363 | avl_index_t where; | |
364 | ||
365 | ssearch.ss_start = *cursor; | |
366 | ssearch.ss_end = *cursor + size; | |
367 | ||
368 | ss = avl_find(t, &ssearch, &where); | |
369 | if (ss == NULL) | |
370 | ss = avl_nearest(t, where, AVL_AFTER); | |
371 | ||
372 | while (ss != NULL) { | |
373 | uint64_t offset = P2ROUNDUP(ss->ss_start, align); | |
374 | ||
375 | if (offset + size <= ss->ss_end) { | |
376 | *cursor = offset + size; | |
377 | return (offset); | |
378 | } | |
379 | ss = AVL_NEXT(t, ss); | |
380 | } | |
381 | ||
382 | /* | |
383 | * If we know we've searched the whole map (*cursor == 0), give up. | |
384 | * Otherwise, reset the cursor to the beginning and try again. | |
385 | */ | |
386 | if (*cursor == 0) | |
387 | return (-1ULL); | |
388 | ||
389 | *cursor = 0; | |
9babb374 BB |
390 | return (metaslab_block_picker(t, cursor, size, align)); |
391 | } | |
392 | ||
9babb374 | 393 | static void |
428870ff | 394 | metaslab_pp_load(space_map_t *sm) |
9babb374 | 395 | { |
428870ff BB |
396 | space_seg_t *ss; |
397 | ||
9babb374 BB |
398 | ASSERT(sm->sm_ppd == NULL); |
399 | sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); | |
428870ff BB |
400 | |
401 | sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); | |
402 | avl_create(sm->sm_pp_root, metaslab_segsize_compare, | |
403 | sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); | |
404 | ||
405 | for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) | |
406 | avl_add(sm->sm_pp_root, ss); | |
9babb374 BB |
407 | } |
408 | ||
409 | static void | |
428870ff | 410 | metaslab_pp_unload(space_map_t *sm) |
9babb374 | 411 | { |
428870ff BB |
412 | void *cookie = NULL; |
413 | ||
9babb374 BB |
414 | kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); |
415 | sm->sm_ppd = NULL; | |
9babb374 | 416 | |
428870ff BB |
417 | while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { |
418 | /* tear down the tree */ | |
419 | } | |
9babb374 | 420 | |
428870ff BB |
421 | avl_destroy(sm->sm_pp_root); |
422 | kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); | |
423 | sm->sm_pp_root = NULL; | |
34dc7c2f BB |
424 | } |
425 | ||
426 | /* ARGSUSED */ | |
427 | static void | |
428870ff | 428 | metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
429 | { |
430 | /* No need to update cursor */ | |
431 | } | |
432 | ||
433 | /* ARGSUSED */ | |
434 | static void | |
428870ff | 435 | metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
436 | { |
437 | /* No need to update cursor */ | |
438 | } | |
439 | ||
9babb374 | 440 | /* |
428870ff | 441 | * Return the maximum contiguous segment within the metaslab. |
9babb374 | 442 | */ |
9babb374 | 443 | uint64_t |
428870ff | 444 | metaslab_pp_maxsize(space_map_t *sm) |
9babb374 BB |
445 | { |
446 | avl_tree_t *t = sm->sm_pp_root; | |
447 | space_seg_t *ss; | |
448 | ||
449 | if (t == NULL || (ss = avl_last(t)) == NULL) | |
450 | return (0ULL); | |
451 | ||
452 | return (ss->ss_end - ss->ss_start); | |
453 | } | |
454 | ||
428870ff BB |
455 | /* |
456 | * ========================================================================== | |
457 | * The first-fit block allocator | |
458 | * ========================================================================== | |
459 | */ | |
460 | static uint64_t | |
461 | metaslab_ff_alloc(space_map_t *sm, uint64_t size) | |
9babb374 | 462 | { |
428870ff BB |
463 | avl_tree_t *t = &sm->sm_root; |
464 | uint64_t align = size & -size; | |
465 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
9babb374 | 466 | |
428870ff | 467 | return (metaslab_block_picker(t, cursor, size, align)); |
9babb374 BB |
468 | } |
469 | ||
428870ff BB |
470 | /* ARGSUSED */ |
471 | boolean_t | |
472 | metaslab_ff_fragmented(space_map_t *sm) | |
9babb374 | 473 | { |
428870ff | 474 | return (B_TRUE); |
9babb374 BB |
475 | } |
476 | ||
428870ff BB |
477 | static space_map_ops_t metaslab_ff_ops = { |
478 | metaslab_pp_load, | |
479 | metaslab_pp_unload, | |
480 | metaslab_ff_alloc, | |
481 | metaslab_pp_claim, | |
482 | metaslab_pp_free, | |
483 | metaslab_pp_maxsize, | |
484 | metaslab_ff_fragmented | |
485 | }; | |
9babb374 | 486 | |
428870ff BB |
487 | /* |
488 | * ========================================================================== | |
489 | * Dynamic block allocator - | |
490 | * Uses the first fit allocation scheme until space gets low and then | |
491 | * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold | |
492 | * and metaslab_df_free_pct to determine when to switch the allocation scheme. | |
493 | * ========================================================================== | |
494 | */ | |
9babb374 BB |
495 | static uint64_t |
496 | metaslab_df_alloc(space_map_t *sm, uint64_t size) | |
497 | { | |
498 | avl_tree_t *t = &sm->sm_root; | |
499 | uint64_t align = size & -size; | |
500 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
428870ff | 501 | uint64_t max_size = metaslab_pp_maxsize(sm); |
9babb374 BB |
502 | int free_pct = sm->sm_space * 100 / sm->sm_size; |
503 | ||
504 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
505 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
506 | ||
507 | if (max_size < size) | |
508 | return (-1ULL); | |
509 | ||
510 | /* | |
511 | * If we're running low on space switch to using the size | |
512 | * sorted AVL tree (best-fit). | |
513 | */ | |
514 | if (max_size < metaslab_df_alloc_threshold || | |
515 | free_pct < metaslab_df_free_pct) { | |
516 | t = sm->sm_pp_root; | |
517 | *cursor = 0; | |
518 | } | |
519 | ||
520 | return (metaslab_block_picker(t, cursor, size, 1ULL)); | |
521 | } | |
522 | ||
428870ff BB |
523 | static boolean_t |
524 | metaslab_df_fragmented(space_map_t *sm) | |
9babb374 | 525 | { |
428870ff BB |
526 | uint64_t max_size = metaslab_pp_maxsize(sm); |
527 | int free_pct = sm->sm_space * 100 / sm->sm_size; | |
9babb374 | 528 | |
428870ff BB |
529 | if (max_size >= metaslab_df_alloc_threshold && |
530 | free_pct >= metaslab_df_free_pct) | |
531 | return (B_FALSE); | |
532 | ||
533 | return (B_TRUE); | |
9babb374 BB |
534 | } |
535 | ||
536 | static space_map_ops_t metaslab_df_ops = { | |
428870ff BB |
537 | metaslab_pp_load, |
538 | metaslab_pp_unload, | |
9babb374 | 539 | metaslab_df_alloc, |
428870ff BB |
540 | metaslab_pp_claim, |
541 | metaslab_pp_free, | |
542 | metaslab_pp_maxsize, | |
543 | metaslab_df_fragmented | |
34dc7c2f BB |
544 | }; |
545 | ||
428870ff BB |
546 | /* |
547 | * ========================================================================== | |
548 | * Other experimental allocators | |
549 | * ========================================================================== | |
550 | */ | |
551 | static uint64_t | |
552 | metaslab_cdf_alloc(space_map_t *sm, uint64_t size) | |
553 | { | |
554 | avl_tree_t *t = &sm->sm_root; | |
555 | uint64_t *cursor = (uint64_t *)sm->sm_ppd; | |
556 | uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; | |
557 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
558 | uint64_t rsize = size; | |
559 | uint64_t offset = 0; | |
560 | ||
561 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
562 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
563 | ||
564 | if (max_size < size) | |
565 | return (-1ULL); | |
566 | ||
567 | ASSERT3U(*extent_end, >=, *cursor); | |
568 | ||
569 | /* | |
570 | * If we're running low on space switch to using the size | |
571 | * sorted AVL tree (best-fit). | |
572 | */ | |
573 | if ((*cursor + size) > *extent_end) { | |
574 | ||
575 | t = sm->sm_pp_root; | |
576 | *cursor = *extent_end = 0; | |
577 | ||
578 | if (max_size > 2 * SPA_MAXBLOCKSIZE) | |
579 | rsize = MIN(metaslab_min_alloc_size, max_size); | |
580 | offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); | |
581 | if (offset != -1) | |
582 | *cursor = offset + size; | |
583 | } else { | |
584 | offset = metaslab_block_picker(t, cursor, rsize, 1ULL); | |
585 | } | |
586 | ASSERT3U(*cursor, <=, *extent_end); | |
587 | return (offset); | |
588 | } | |
589 | ||
590 | static boolean_t | |
591 | metaslab_cdf_fragmented(space_map_t *sm) | |
592 | { | |
593 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
594 | ||
595 | if (max_size > (metaslab_min_alloc_size * 10)) | |
596 | return (B_FALSE); | |
597 | return (B_TRUE); | |
598 | } | |
599 | ||
600 | static space_map_ops_t metaslab_cdf_ops = { | |
601 | metaslab_pp_load, | |
602 | metaslab_pp_unload, | |
603 | metaslab_cdf_alloc, | |
604 | metaslab_pp_claim, | |
605 | metaslab_pp_free, | |
606 | metaslab_pp_maxsize, | |
607 | metaslab_cdf_fragmented | |
608 | }; | |
609 | ||
610 | uint64_t metaslab_ndf_clump_shift = 4; | |
611 | ||
612 | static uint64_t | |
613 | metaslab_ndf_alloc(space_map_t *sm, uint64_t size) | |
614 | { | |
615 | avl_tree_t *t = &sm->sm_root; | |
616 | avl_index_t where; | |
617 | space_seg_t *ss, ssearch; | |
618 | uint64_t hbit = highbit(size); | |
619 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; | |
620 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
621 | ||
622 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
623 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
624 | ||
625 | if (max_size < size) | |
626 | return (-1ULL); | |
627 | ||
628 | ssearch.ss_start = *cursor; | |
629 | ssearch.ss_end = *cursor + size; | |
630 | ||
631 | ss = avl_find(t, &ssearch, &where); | |
632 | if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { | |
633 | t = sm->sm_pp_root; | |
634 | ||
635 | ssearch.ss_start = 0; | |
636 | ssearch.ss_end = MIN(max_size, | |
637 | 1ULL << (hbit + metaslab_ndf_clump_shift)); | |
638 | ss = avl_find(t, &ssearch, &where); | |
639 | if (ss == NULL) | |
640 | ss = avl_nearest(t, where, AVL_AFTER); | |
641 | ASSERT(ss != NULL); | |
642 | } | |
643 | ||
644 | if (ss != NULL) { | |
645 | if (ss->ss_start + size <= ss->ss_end) { | |
646 | *cursor = ss->ss_start + size; | |
647 | return (ss->ss_start); | |
648 | } | |
649 | } | |
650 | return (-1ULL); | |
651 | } | |
652 | ||
653 | static boolean_t | |
654 | metaslab_ndf_fragmented(space_map_t *sm) | |
655 | { | |
656 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
657 | ||
658 | if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) | |
659 | return (B_FALSE); | |
660 | return (B_TRUE); | |
661 | } | |
662 | ||
663 | ||
664 | static space_map_ops_t metaslab_ndf_ops = { | |
665 | metaslab_pp_load, | |
666 | metaslab_pp_unload, | |
667 | metaslab_ndf_alloc, | |
668 | metaslab_pp_claim, | |
669 | metaslab_pp_free, | |
670 | metaslab_pp_maxsize, | |
671 | metaslab_ndf_fragmented | |
672 | }; | |
673 | ||
674 | space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; | |
9babb374 | 675 | |
34dc7c2f BB |
676 | /* |
677 | * ========================================================================== | |
678 | * Metaslabs | |
679 | * ========================================================================== | |
680 | */ | |
681 | metaslab_t * | |
682 | metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, | |
683 | uint64_t start, uint64_t size, uint64_t txg) | |
684 | { | |
685 | vdev_t *vd = mg->mg_vd; | |
686 | metaslab_t *msp; | |
687 | ||
688 | msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); | |
689 | mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); | |
690 | ||
691 | msp->ms_smo_syncing = *smo; | |
692 | ||
693 | /* | |
694 | * We create the main space map here, but we don't create the | |
695 | * allocmaps and freemaps until metaslab_sync_done(). This serves | |
696 | * two purposes: it allows metaslab_sync_done() to detect the | |
697 | * addition of new space; and for debugging, it ensures that we'd | |
698 | * data fault on any attempt to use this metaslab before it's ready. | |
699 | */ | |
700 | space_map_create(&msp->ms_map, start, size, | |
701 | vd->vdev_ashift, &msp->ms_lock); | |
702 | ||
703 | metaslab_group_add(mg, msp); | |
704 | ||
428870ff BB |
705 | if (metaslab_debug && smo->smo_object != 0) { |
706 | mutex_enter(&msp->ms_lock); | |
707 | VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, | |
708 | SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); | |
709 | mutex_exit(&msp->ms_lock); | |
710 | } | |
711 | ||
34dc7c2f BB |
712 | /* |
713 | * If we're opening an existing pool (txg == 0) or creating | |
714 | * a new one (txg == TXG_INITIAL), all space is available now. | |
715 | * If we're adding space to an existing pool, the new space | |
716 | * does not become available until after this txg has synced. | |
717 | */ | |
718 | if (txg <= TXG_INITIAL) | |
719 | metaslab_sync_done(msp, 0); | |
720 | ||
721 | if (txg != 0) { | |
34dc7c2f | 722 | vdev_dirty(vd, 0, NULL, txg); |
428870ff | 723 | vdev_dirty(vd, VDD_METASLAB, msp, txg); |
34dc7c2f BB |
724 | } |
725 | ||
726 | return (msp); | |
727 | } | |
728 | ||
729 | void | |
730 | metaslab_fini(metaslab_t *msp) | |
731 | { | |
732 | metaslab_group_t *mg = msp->ms_group; | |
d6320ddb | 733 | int t; |
34dc7c2f | 734 | |
428870ff BB |
735 | vdev_space_update(mg->mg_vd, |
736 | -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); | |
34dc7c2f BB |
737 | |
738 | metaslab_group_remove(mg, msp); | |
739 | ||
740 | mutex_enter(&msp->ms_lock); | |
741 | ||
742 | space_map_unload(&msp->ms_map); | |
743 | space_map_destroy(&msp->ms_map); | |
744 | ||
d6320ddb | 745 | for (t = 0; t < TXG_SIZE; t++) { |
34dc7c2f BB |
746 | space_map_destroy(&msp->ms_allocmap[t]); |
747 | space_map_destroy(&msp->ms_freemap[t]); | |
748 | } | |
749 | ||
d6320ddb | 750 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
751 | space_map_destroy(&msp->ms_defermap[t]); |
752 | ||
753 | ASSERT3S(msp->ms_deferspace, ==, 0); | |
754 | ||
34dc7c2f BB |
755 | mutex_exit(&msp->ms_lock); |
756 | mutex_destroy(&msp->ms_lock); | |
757 | ||
758 | kmem_free(msp, sizeof (metaslab_t)); | |
759 | } | |
760 | ||
761 | #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) | |
762 | #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) | |
763 | #define METASLAB_ACTIVE_MASK \ | |
764 | (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) | |
34dc7c2f BB |
765 | |
766 | static uint64_t | |
767 | metaslab_weight(metaslab_t *msp) | |
768 | { | |
769 | metaslab_group_t *mg = msp->ms_group; | |
770 | space_map_t *sm = &msp->ms_map; | |
771 | space_map_obj_t *smo = &msp->ms_smo; | |
772 | vdev_t *vd = mg->mg_vd; | |
773 | uint64_t weight, space; | |
774 | ||
775 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
776 | ||
777 | /* | |
778 | * The baseline weight is the metaslab's free space. | |
779 | */ | |
780 | space = sm->sm_size - smo->smo_alloc; | |
781 | weight = space; | |
782 | ||
783 | /* | |
784 | * Modern disks have uniform bit density and constant angular velocity. | |
785 | * Therefore, the outer recording zones are faster (higher bandwidth) | |
786 | * than the inner zones by the ratio of outer to inner track diameter, | |
787 | * which is typically around 2:1. We account for this by assigning | |
788 | * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). | |
789 | * In effect, this means that we'll select the metaslab with the most | |
790 | * free bandwidth rather than simply the one with the most free space. | |
791 | */ | |
792 | weight = 2 * weight - | |
793 | ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; | |
794 | ASSERT(weight >= space && weight <= 2 * space); | |
795 | ||
796 | /* | |
428870ff BB |
797 | * For locality, assign higher weight to metaslabs which have |
798 | * a lower offset than what we've already activated. | |
34dc7c2f | 799 | */ |
428870ff BB |
800 | if (sm->sm_start <= mg->mg_bonus_area) |
801 | weight *= (metaslab_smo_bonus_pct / 100); | |
34dc7c2f | 802 | ASSERT(weight >= space && |
428870ff BB |
803 | weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); |
804 | ||
805 | if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { | |
806 | /* | |
807 | * If this metaslab is one we're actively using, adjust its | |
808 | * weight to make it preferable to any inactive metaslab so | |
809 | * we'll polish it off. | |
810 | */ | |
811 | weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); | |
812 | } | |
813 | return (weight); | |
814 | } | |
815 | ||
816 | static void | |
817 | metaslab_prefetch(metaslab_group_t *mg) | |
818 | { | |
819 | spa_t *spa = mg->mg_vd->vdev_spa; | |
820 | metaslab_t *msp; | |
821 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
822 | int m; | |
823 | ||
824 | mutex_enter(&mg->mg_lock); | |
34dc7c2f BB |
825 | |
826 | /* | |
428870ff | 827 | * Prefetch the next potential metaslabs |
34dc7c2f | 828 | */ |
428870ff BB |
829 | for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { |
830 | space_map_t *sm = &msp->ms_map; | |
831 | space_map_obj_t *smo = &msp->ms_smo; | |
34dc7c2f | 832 | |
428870ff BB |
833 | /* If we have reached our prefetch limit then we're done */ |
834 | if (m >= metaslab_prefetch_limit) | |
835 | break; | |
836 | ||
837 | if (!sm->sm_loaded && smo->smo_object != 0) { | |
838 | mutex_exit(&mg->mg_lock); | |
839 | dmu_prefetch(spa_meta_objset(spa), smo->smo_object, | |
840 | 0ULL, smo->smo_objsize); | |
841 | mutex_enter(&mg->mg_lock); | |
842 | } | |
843 | } | |
844 | mutex_exit(&mg->mg_lock); | |
34dc7c2f BB |
845 | } |
846 | ||
847 | static int | |
9babb374 | 848 | metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) |
34dc7c2f | 849 | { |
428870ff | 850 | metaslab_group_t *mg = msp->ms_group; |
34dc7c2f | 851 | space_map_t *sm = &msp->ms_map; |
9babb374 | 852 | space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; |
d6320ddb | 853 | int t; |
34dc7c2f BB |
854 | |
855 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
856 | ||
857 | if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
428870ff BB |
858 | space_map_load_wait(sm); |
859 | if (!sm->sm_loaded) { | |
860 | int error = space_map_load(sm, sm_ops, SM_FREE, | |
861 | &msp->ms_smo, | |
862 | spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); | |
863 | if (error) { | |
864 | metaslab_group_sort(msp->ms_group, msp, 0); | |
865 | return (error); | |
866 | } | |
d6320ddb | 867 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
868 | space_map_walk(&msp->ms_defermap[t], |
869 | space_map_claim, sm); | |
870 | ||
871 | } | |
872 | ||
873 | /* | |
874 | * Track the bonus area as we activate new metaslabs. | |
875 | */ | |
876 | if (sm->sm_start > mg->mg_bonus_area) { | |
877 | mutex_enter(&mg->mg_lock); | |
878 | mg->mg_bonus_area = sm->sm_start; | |
879 | mutex_exit(&mg->mg_lock); | |
34dc7c2f | 880 | } |
9babb374 BB |
881 | |
882 | /* | |
883 | * If we were able to load the map then make sure | |
884 | * that this map is still able to satisfy our request. | |
885 | */ | |
886 | if (msp->ms_weight < size) | |
887 | return (ENOSPC); | |
888 | ||
34dc7c2f BB |
889 | metaslab_group_sort(msp->ms_group, msp, |
890 | msp->ms_weight | activation_weight); | |
891 | } | |
892 | ASSERT(sm->sm_loaded); | |
893 | ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); | |
894 | ||
895 | return (0); | |
896 | } | |
897 | ||
898 | static void | |
899 | metaslab_passivate(metaslab_t *msp, uint64_t size) | |
900 | { | |
901 | /* | |
902 | * If size < SPA_MINBLOCKSIZE, then we will not allocate from | |
903 | * this metaslab again. In that case, it had better be empty, | |
904 | * or we would be leaving space on the table. | |
905 | */ | |
906 | ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); | |
907 | metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); | |
908 | ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); | |
909 | } | |
910 | ||
911 | /* | |
912 | * Write a metaslab to disk in the context of the specified transaction group. | |
913 | */ | |
914 | void | |
915 | metaslab_sync(metaslab_t *msp, uint64_t txg) | |
916 | { | |
917 | vdev_t *vd = msp->ms_group->mg_vd; | |
918 | spa_t *spa = vd->vdev_spa; | |
428870ff | 919 | objset_t *mos = spa_meta_objset(spa); |
34dc7c2f BB |
920 | space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; |
921 | space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; | |
922 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
923 | space_map_t *sm = &msp->ms_map; | |
924 | space_map_obj_t *smo = &msp->ms_smo_syncing; | |
925 | dmu_buf_t *db; | |
926 | dmu_tx_t *tx; | |
d6320ddb | 927 | int t; |
34dc7c2f | 928 | |
428870ff BB |
929 | ASSERT(!vd->vdev_ishole); |
930 | ||
931 | if (allocmap->sm_space == 0 && freemap->sm_space == 0) | |
932 | return; | |
34dc7c2f BB |
933 | |
934 | /* | |
935 | * The only state that can actually be changing concurrently with | |
936 | * metaslab_sync() is the metaslab's ms_map. No other thread can | |
937 | * be modifying this txg's allocmap, freemap, freed_map, or smo. | |
938 | * Therefore, we only hold ms_lock to satify space_map ASSERTs. | |
939 | * We drop it whenever we call into the DMU, because the DMU | |
940 | * can call down to us (e.g. via zio_free()) at any time. | |
941 | */ | |
428870ff BB |
942 | |
943 | tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); | |
34dc7c2f BB |
944 | |
945 | if (smo->smo_object == 0) { | |
946 | ASSERT(smo->smo_objsize == 0); | |
947 | ASSERT(smo->smo_alloc == 0); | |
34dc7c2f BB |
948 | smo->smo_object = dmu_object_alloc(mos, |
949 | DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, | |
950 | DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); | |
951 | ASSERT(smo->smo_object != 0); | |
952 | dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * | |
953 | (sm->sm_start >> vd->vdev_ms_shift), | |
954 | sizeof (uint64_t), &smo->smo_object, tx); | |
34dc7c2f BB |
955 | } |
956 | ||
428870ff BB |
957 | mutex_enter(&msp->ms_lock); |
958 | ||
34dc7c2f BB |
959 | space_map_walk(freemap, space_map_add, freed_map); |
960 | ||
961 | if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= | |
962 | 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { | |
963 | /* | |
964 | * The in-core space map representation is twice as compact | |
965 | * as the on-disk one, so it's time to condense the latter | |
966 | * by generating a pure allocmap from first principles. | |
967 | * | |
968 | * This metaslab is 100% allocated, | |
969 | * minus the content of the in-core map (sm), | |
970 | * minus what's been freed this txg (freed_map), | |
428870ff | 971 | * minus deferred frees (ms_defermap[]), |
34dc7c2f BB |
972 | * minus allocations from txgs in the future |
973 | * (because they haven't been committed yet). | |
974 | */ | |
975 | space_map_vacate(allocmap, NULL, NULL); | |
976 | space_map_vacate(freemap, NULL, NULL); | |
977 | ||
978 | space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); | |
979 | ||
980 | space_map_walk(sm, space_map_remove, allocmap); | |
981 | space_map_walk(freed_map, space_map_remove, allocmap); | |
982 | ||
d6320ddb | 983 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
984 | space_map_walk(&msp->ms_defermap[t], |
985 | space_map_remove, allocmap); | |
986 | ||
d6320ddb | 987 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) |
34dc7c2f BB |
988 | space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], |
989 | space_map_remove, allocmap); | |
990 | ||
991 | mutex_exit(&msp->ms_lock); | |
992 | space_map_truncate(smo, mos, tx); | |
993 | mutex_enter(&msp->ms_lock); | |
994 | } | |
995 | ||
996 | space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); | |
997 | space_map_sync(freemap, SM_FREE, smo, mos, tx); | |
998 | ||
999 | mutex_exit(&msp->ms_lock); | |
1000 | ||
1001 | VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); | |
1002 | dmu_buf_will_dirty(db, tx); | |
1003 | ASSERT3U(db->db_size, >=, sizeof (*smo)); | |
1004 | bcopy(smo, db->db_data, sizeof (*smo)); | |
1005 | dmu_buf_rele(db, FTAG); | |
1006 | ||
1007 | dmu_tx_commit(tx); | |
1008 | } | |
1009 | ||
1010 | /* | |
1011 | * Called after a transaction group has completely synced to mark | |
1012 | * all of the metaslab's free space as usable. | |
1013 | */ | |
1014 | void | |
1015 | metaslab_sync_done(metaslab_t *msp, uint64_t txg) | |
1016 | { | |
1017 | space_map_obj_t *smo = &msp->ms_smo; | |
1018 | space_map_obj_t *smosync = &msp->ms_smo_syncing; | |
1019 | space_map_t *sm = &msp->ms_map; | |
1020 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
428870ff | 1021 | space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; |
34dc7c2f BB |
1022 | metaslab_group_t *mg = msp->ms_group; |
1023 | vdev_t *vd = mg->mg_vd; | |
428870ff | 1024 | int64_t alloc_delta, defer_delta; |
d6320ddb | 1025 | int t; |
428870ff BB |
1026 | |
1027 | ASSERT(!vd->vdev_ishole); | |
34dc7c2f BB |
1028 | |
1029 | mutex_enter(&msp->ms_lock); | |
1030 | ||
1031 | /* | |
1032 | * If this metaslab is just becoming available, initialize its | |
1033 | * allocmaps and freemaps and add its capacity to the vdev. | |
1034 | */ | |
1035 | if (freed_map->sm_size == 0) { | |
d6320ddb | 1036 | for (t = 0; t < TXG_SIZE; t++) { |
34dc7c2f BB |
1037 | space_map_create(&msp->ms_allocmap[t], sm->sm_start, |
1038 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1039 | space_map_create(&msp->ms_freemap[t], sm->sm_start, | |
1040 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1041 | } | |
428870ff | 1042 | |
d6320ddb | 1043 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
1044 | space_map_create(&msp->ms_defermap[t], sm->sm_start, |
1045 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1046 | ||
1047 | vdev_space_update(vd, 0, 0, sm->sm_size); | |
34dc7c2f BB |
1048 | } |
1049 | ||
428870ff BB |
1050 | alloc_delta = smosync->smo_alloc - smo->smo_alloc; |
1051 | defer_delta = freed_map->sm_space - defer_map->sm_space; | |
1052 | ||
1053 | vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); | |
34dc7c2f BB |
1054 | |
1055 | ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); | |
1056 | ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); | |
1057 | ||
1058 | /* | |
1059 | * If there's a space_map_load() in progress, wait for it to complete | |
1060 | * so that we have a consistent view of the in-core space map. | |
428870ff BB |
1061 | * Then, add defer_map (oldest deferred frees) to this map and |
1062 | * transfer freed_map (this txg's frees) to defer_map. | |
34dc7c2f BB |
1063 | */ |
1064 | space_map_load_wait(sm); | |
428870ff BB |
1065 | space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); |
1066 | space_map_vacate(freed_map, space_map_add, defer_map); | |
34dc7c2f BB |
1067 | |
1068 | *smo = *smosync; | |
1069 | ||
428870ff BB |
1070 | msp->ms_deferspace += defer_delta; |
1071 | ASSERT3S(msp->ms_deferspace, >=, 0); | |
1072 | ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); | |
1073 | if (msp->ms_deferspace != 0) { | |
1074 | /* | |
1075 | * Keep syncing this metaslab until all deferred frees | |
1076 | * are back in circulation. | |
1077 | */ | |
1078 | vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); | |
1079 | } | |
1080 | ||
34dc7c2f BB |
1081 | /* |
1082 | * If the map is loaded but no longer active, evict it as soon as all | |
1083 | * future allocations have synced. (If we unloaded it now and then | |
1084 | * loaded a moment later, the map wouldn't reflect those allocations.) | |
1085 | */ | |
1086 | if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
1087 | int evictable = 1; | |
1088 | ||
d6320ddb | 1089 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) |
34dc7c2f BB |
1090 | if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) |
1091 | evictable = 0; | |
1092 | ||
428870ff | 1093 | if (evictable && !metaslab_debug) |
34dc7c2f BB |
1094 | space_map_unload(sm); |
1095 | } | |
1096 | ||
1097 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
1098 | ||
1099 | mutex_exit(&msp->ms_lock); | |
1100 | } | |
1101 | ||
428870ff BB |
1102 | void |
1103 | metaslab_sync_reassess(metaslab_group_t *mg) | |
1104 | { | |
1105 | vdev_t *vd = mg->mg_vd; | |
d6320ddb | 1106 | int m; |
428870ff BB |
1107 | |
1108 | /* | |
1109 | * Re-evaluate all metaslabs which have lower offsets than the | |
1110 | * bonus area. | |
1111 | */ | |
d6320ddb | 1112 | for (m = 0; m < vd->vdev_ms_count; m++) { |
428870ff BB |
1113 | metaslab_t *msp = vd->vdev_ms[m]; |
1114 | ||
1115 | if (msp->ms_map.sm_start > mg->mg_bonus_area) | |
1116 | break; | |
1117 | ||
1118 | mutex_enter(&msp->ms_lock); | |
1119 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
1120 | mutex_exit(&msp->ms_lock); | |
1121 | } | |
1122 | ||
1123 | /* | |
1124 | * Prefetch the next potential metaslabs | |
1125 | */ | |
1126 | metaslab_prefetch(mg); | |
1127 | } | |
1128 | ||
34dc7c2f BB |
1129 | static uint64_t |
1130 | metaslab_distance(metaslab_t *msp, dva_t *dva) | |
1131 | { | |
1132 | uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; | |
1133 | uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; | |
1134 | uint64_t start = msp->ms_map.sm_start >> ms_shift; | |
1135 | ||
1136 | if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) | |
1137 | return (1ULL << 63); | |
1138 | ||
1139 | if (offset < start) | |
1140 | return ((start - offset) << ms_shift); | |
1141 | if (offset > start) | |
1142 | return ((offset - start) << ms_shift); | |
1143 | return (0); | |
1144 | } | |
1145 | ||
1146 | static uint64_t | |
1147 | metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, | |
1148 | uint64_t min_distance, dva_t *dva, int d) | |
1149 | { | |
1150 | metaslab_t *msp = NULL; | |
1151 | uint64_t offset = -1ULL; | |
1152 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
1153 | uint64_t activation_weight; | |
1154 | uint64_t target_distance; | |
1155 | int i; | |
1156 | ||
1157 | activation_weight = METASLAB_WEIGHT_PRIMARY; | |
9babb374 BB |
1158 | for (i = 0; i < d; i++) { |
1159 | if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { | |
34dc7c2f | 1160 | activation_weight = METASLAB_WEIGHT_SECONDARY; |
9babb374 BB |
1161 | break; |
1162 | } | |
1163 | } | |
34dc7c2f BB |
1164 | |
1165 | for (;;) { | |
9babb374 BB |
1166 | boolean_t was_active; |
1167 | ||
34dc7c2f BB |
1168 | mutex_enter(&mg->mg_lock); |
1169 | for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { | |
1170 | if (msp->ms_weight < size) { | |
1171 | mutex_exit(&mg->mg_lock); | |
1172 | return (-1ULL); | |
1173 | } | |
1174 | ||
9babb374 | 1175 | was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; |
34dc7c2f BB |
1176 | if (activation_weight == METASLAB_WEIGHT_PRIMARY) |
1177 | break; | |
1178 | ||
1179 | target_distance = min_distance + | |
1180 | (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); | |
1181 | ||
1182 | for (i = 0; i < d; i++) | |
1183 | if (metaslab_distance(msp, &dva[i]) < | |
1184 | target_distance) | |
1185 | break; | |
1186 | if (i == d) | |
1187 | break; | |
1188 | } | |
1189 | mutex_exit(&mg->mg_lock); | |
1190 | if (msp == NULL) | |
1191 | return (-1ULL); | |
1192 | ||
1193 | mutex_enter(&msp->ms_lock); | |
1194 | ||
1195 | /* | |
1196 | * Ensure that the metaslab we have selected is still | |
1197 | * capable of handling our request. It's possible that | |
1198 | * another thread may have changed the weight while we | |
1199 | * were blocked on the metaslab lock. | |
1200 | */ | |
9babb374 BB |
1201 | if (msp->ms_weight < size || (was_active && |
1202 | !(msp->ms_weight & METASLAB_ACTIVE_MASK) && | |
1203 | activation_weight == METASLAB_WEIGHT_PRIMARY)) { | |
34dc7c2f BB |
1204 | mutex_exit(&msp->ms_lock); |
1205 | continue; | |
1206 | } | |
1207 | ||
1208 | if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && | |
1209 | activation_weight == METASLAB_WEIGHT_PRIMARY) { | |
1210 | metaslab_passivate(msp, | |
1211 | msp->ms_weight & ~METASLAB_ACTIVE_MASK); | |
1212 | mutex_exit(&msp->ms_lock); | |
1213 | continue; | |
1214 | } | |
1215 | ||
9babb374 | 1216 | if (metaslab_activate(msp, activation_weight, size) != 0) { |
34dc7c2f BB |
1217 | mutex_exit(&msp->ms_lock); |
1218 | continue; | |
1219 | } | |
1220 | ||
1221 | if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) | |
1222 | break; | |
1223 | ||
428870ff | 1224 | metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); |
34dc7c2f BB |
1225 | |
1226 | mutex_exit(&msp->ms_lock); | |
1227 | } | |
1228 | ||
1229 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) | |
1230 | vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); | |
1231 | ||
1232 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | |
1233 | ||
1234 | mutex_exit(&msp->ms_lock); | |
1235 | ||
1236 | return (offset); | |
1237 | } | |
1238 | ||
1239 | /* | |
1240 | * Allocate a block for the specified i/o. | |
1241 | */ | |
1242 | static int | |
1243 | metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, | |
b128c09f | 1244 | dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) |
34dc7c2f BB |
1245 | { |
1246 | metaslab_group_t *mg, *rotor; | |
1247 | vdev_t *vd; | |
1248 | int dshift = 3; | |
1249 | int all_zero; | |
fb5f0bc8 BB |
1250 | int zio_lock = B_FALSE; |
1251 | boolean_t allocatable; | |
34dc7c2f BB |
1252 | uint64_t offset = -1ULL; |
1253 | uint64_t asize; | |
1254 | uint64_t distance; | |
1255 | ||
1256 | ASSERT(!DVA_IS_VALID(&dva[d])); | |
1257 | ||
1258 | /* | |
1259 | * For testing, make some blocks above a certain size be gang blocks. | |
1260 | */ | |
428870ff | 1261 | if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) |
34dc7c2f BB |
1262 | return (ENOSPC); |
1263 | ||
1264 | /* | |
1265 | * Start at the rotor and loop through all mgs until we find something. | |
428870ff | 1266 | * Note that there's no locking on mc_rotor or mc_aliquot because |
34dc7c2f BB |
1267 | * nothing actually breaks if we miss a few updates -- we just won't |
1268 | * allocate quite as evenly. It all balances out over time. | |
1269 | * | |
1270 | * If we are doing ditto or log blocks, try to spread them across | |
1271 | * consecutive vdevs. If we're forced to reuse a vdev before we've | |
1272 | * allocated all of our ditto blocks, then try and spread them out on | |
1273 | * that vdev as much as possible. If it turns out to not be possible, | |
1274 | * gradually lower our standards until anything becomes acceptable. | |
1275 | * Also, allocating on consecutive vdevs (as opposed to random vdevs) | |
1276 | * gives us hope of containing our fault domains to something we're | |
1277 | * able to reason about. Otherwise, any two top-level vdev failures | |
1278 | * will guarantee the loss of data. With consecutive allocation, | |
1279 | * only two adjacent top-level vdev failures will result in data loss. | |
1280 | * | |
1281 | * If we are doing gang blocks (hintdva is non-NULL), try to keep | |
1282 | * ourselves on the same vdev as our gang block header. That | |
1283 | * way, we can hope for locality in vdev_cache, plus it makes our | |
1284 | * fault domains something tractable. | |
1285 | */ | |
1286 | if (hintdva) { | |
1287 | vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); | |
428870ff BB |
1288 | |
1289 | /* | |
1290 | * It's possible the vdev we're using as the hint no | |
1291 | * longer exists (i.e. removed). Consult the rotor when | |
1292 | * all else fails. | |
1293 | */ | |
1294 | if (vd != NULL) { | |
34dc7c2f | 1295 | mg = vd->vdev_mg; |
428870ff BB |
1296 | |
1297 | if (flags & METASLAB_HINTBP_AVOID && | |
1298 | mg->mg_next != NULL) | |
1299 | mg = mg->mg_next; | |
1300 | } else { | |
1301 | mg = mc->mc_rotor; | |
1302 | } | |
34dc7c2f BB |
1303 | } else if (d != 0) { |
1304 | vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); | |
1305 | mg = vd->vdev_mg->mg_next; | |
1306 | } else { | |
1307 | mg = mc->mc_rotor; | |
1308 | } | |
1309 | ||
1310 | /* | |
428870ff BB |
1311 | * If the hint put us into the wrong metaslab class, or into a |
1312 | * metaslab group that has been passivated, just follow the rotor. | |
34dc7c2f | 1313 | */ |
428870ff | 1314 | if (mg->mg_class != mc || mg->mg_activation_count <= 0) |
34dc7c2f BB |
1315 | mg = mc->mc_rotor; |
1316 | ||
1317 | rotor = mg; | |
1318 | top: | |
1319 | all_zero = B_TRUE; | |
1320 | do { | |
428870ff BB |
1321 | ASSERT(mg->mg_activation_count == 1); |
1322 | ||
34dc7c2f | 1323 | vd = mg->mg_vd; |
fb5f0bc8 | 1324 | |
34dc7c2f | 1325 | /* |
b128c09f | 1326 | * Don't allocate from faulted devices. |
34dc7c2f | 1327 | */ |
fb5f0bc8 BB |
1328 | if (zio_lock) { |
1329 | spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); | |
1330 | allocatable = vdev_allocatable(vd); | |
1331 | spa_config_exit(spa, SCL_ZIO, FTAG); | |
1332 | } else { | |
1333 | allocatable = vdev_allocatable(vd); | |
1334 | } | |
1335 | if (!allocatable) | |
34dc7c2f | 1336 | goto next; |
fb5f0bc8 | 1337 | |
34dc7c2f BB |
1338 | /* |
1339 | * Avoid writing single-copy data to a failing vdev | |
1340 | */ | |
1341 | if ((vd->vdev_stat.vs_write_errors > 0 || | |
1342 | vd->vdev_state < VDEV_STATE_HEALTHY) && | |
1343 | d == 0 && dshift == 3) { | |
1344 | all_zero = B_FALSE; | |
1345 | goto next; | |
1346 | } | |
1347 | ||
1348 | ASSERT(mg->mg_class == mc); | |
1349 | ||
1350 | distance = vd->vdev_asize >> dshift; | |
1351 | if (distance <= (1ULL << vd->vdev_ms_shift)) | |
1352 | distance = 0; | |
1353 | else | |
1354 | all_zero = B_FALSE; | |
1355 | ||
1356 | asize = vdev_psize_to_asize(vd, psize); | |
1357 | ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); | |
1358 | ||
1359 | offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); | |
1360 | if (offset != -1ULL) { | |
1361 | /* | |
1362 | * If we've just selected this metaslab group, | |
1363 | * figure out whether the corresponding vdev is | |
1364 | * over- or under-used relative to the pool, | |
1365 | * and set an allocation bias to even it out. | |
1366 | */ | |
428870ff | 1367 | if (mc->mc_aliquot == 0) { |
34dc7c2f | 1368 | vdev_stat_t *vs = &vd->vdev_stat; |
428870ff | 1369 | int64_t vu, cu; |
34dc7c2f BB |
1370 | |
1371 | /* | |
1372 | * Determine percent used in units of 0..1024. | |
1373 | * (This is just to avoid floating point.) | |
1374 | */ | |
1375 | vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); | |
428870ff | 1376 | cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); |
34dc7c2f BB |
1377 | |
1378 | /* | |
1379 | * Bias by at most +/- 25% of the aliquot. | |
1380 | */ | |
428870ff | 1381 | mg->mg_bias = ((cu - vu) * |
34dc7c2f BB |
1382 | (int64_t)mg->mg_aliquot) / (1024 * 4); |
1383 | } | |
1384 | ||
428870ff | 1385 | if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= |
34dc7c2f BB |
1386 | mg->mg_aliquot + mg->mg_bias) { |
1387 | mc->mc_rotor = mg->mg_next; | |
428870ff | 1388 | mc->mc_aliquot = 0; |
34dc7c2f BB |
1389 | } |
1390 | ||
1391 | DVA_SET_VDEV(&dva[d], vd->vdev_id); | |
1392 | DVA_SET_OFFSET(&dva[d], offset); | |
b128c09f | 1393 | DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); |
34dc7c2f BB |
1394 | DVA_SET_ASIZE(&dva[d], asize); |
1395 | ||
1396 | return (0); | |
1397 | } | |
1398 | next: | |
1399 | mc->mc_rotor = mg->mg_next; | |
428870ff | 1400 | mc->mc_aliquot = 0; |
34dc7c2f BB |
1401 | } while ((mg = mg->mg_next) != rotor); |
1402 | ||
1403 | if (!all_zero) { | |
1404 | dshift++; | |
1405 | ASSERT(dshift < 64); | |
1406 | goto top; | |
1407 | } | |
1408 | ||
9babb374 | 1409 | if (!allocatable && !zio_lock) { |
fb5f0bc8 BB |
1410 | dshift = 3; |
1411 | zio_lock = B_TRUE; | |
1412 | goto top; | |
1413 | } | |
1414 | ||
34dc7c2f BB |
1415 | bzero(&dva[d], sizeof (dva_t)); |
1416 | ||
1417 | return (ENOSPC); | |
1418 | } | |
1419 | ||
1420 | /* | |
1421 | * Free the block represented by DVA in the context of the specified | |
1422 | * transaction group. | |
1423 | */ | |
1424 | static void | |
1425 | metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) | |
1426 | { | |
1427 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1428 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1429 | uint64_t size = DVA_GET_ASIZE(dva); | |
1430 | vdev_t *vd; | |
1431 | metaslab_t *msp; | |
1432 | ||
1433 | ASSERT(DVA_IS_VALID(dva)); | |
1434 | ||
1435 | if (txg > spa_freeze_txg(spa)) | |
1436 | return; | |
1437 | ||
1438 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1439 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { | |
1440 | cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", | |
1441 | (u_longlong_t)vdev, (u_longlong_t)offset); | |
1442 | ASSERT(0); | |
1443 | return; | |
1444 | } | |
1445 | ||
1446 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1447 | ||
1448 | if (DVA_GET_GANG(dva)) | |
1449 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1450 | ||
1451 | mutex_enter(&msp->ms_lock); | |
1452 | ||
1453 | if (now) { | |
1454 | space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], | |
1455 | offset, size); | |
1456 | space_map_free(&msp->ms_map, offset, size); | |
1457 | } else { | |
1458 | if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) | |
1459 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1460 | space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); | |
34dc7c2f BB |
1461 | } |
1462 | ||
1463 | mutex_exit(&msp->ms_lock); | |
1464 | } | |
1465 | ||
1466 | /* | |
1467 | * Intent log support: upon opening the pool after a crash, notify the SPA | |
1468 | * of blocks that the intent log has allocated for immediate write, but | |
1469 | * which are still considered free by the SPA because the last transaction | |
1470 | * group didn't commit yet. | |
1471 | */ | |
1472 | static int | |
1473 | metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) | |
1474 | { | |
1475 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1476 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1477 | uint64_t size = DVA_GET_ASIZE(dva); | |
1478 | vdev_t *vd; | |
1479 | metaslab_t *msp; | |
428870ff | 1480 | int error = 0; |
34dc7c2f BB |
1481 | |
1482 | ASSERT(DVA_IS_VALID(dva)); | |
1483 | ||
1484 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1485 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) | |
1486 | return (ENXIO); | |
1487 | ||
1488 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1489 | ||
1490 | if (DVA_GET_GANG(dva)) | |
1491 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1492 | ||
1493 | mutex_enter(&msp->ms_lock); | |
1494 | ||
428870ff BB |
1495 | if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) |
1496 | error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); | |
1497 | ||
1498 | if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) | |
1499 | error = ENOENT; | |
1500 | ||
b128c09f | 1501 | if (error || txg == 0) { /* txg == 0 indicates dry run */ |
34dc7c2f BB |
1502 | mutex_exit(&msp->ms_lock); |
1503 | return (error); | |
1504 | } | |
1505 | ||
34dc7c2f | 1506 | space_map_claim(&msp->ms_map, offset, size); |
b128c09f | 1507 | |
fb5f0bc8 | 1508 | if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ |
b128c09f BB |
1509 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) |
1510 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1511 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | |
1512 | } | |
34dc7c2f BB |
1513 | |
1514 | mutex_exit(&msp->ms_lock); | |
1515 | ||
1516 | return (0); | |
1517 | } | |
1518 | ||
1519 | int | |
1520 | metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, | |
b128c09f | 1521 | int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) |
34dc7c2f BB |
1522 | { |
1523 | dva_t *dva = bp->blk_dva; | |
1524 | dva_t *hintdva = hintbp->blk_dva; | |
d6320ddb | 1525 | int d, error = 0; |
34dc7c2f | 1526 | |
b128c09f | 1527 | ASSERT(bp->blk_birth == 0); |
428870ff | 1528 | ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); |
b128c09f BB |
1529 | |
1530 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1531 | ||
1532 | if (mc->mc_rotor == NULL) { /* no vdevs in this class */ | |
1533 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
34dc7c2f | 1534 | return (ENOSPC); |
b128c09f | 1535 | } |
34dc7c2f BB |
1536 | |
1537 | ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); | |
1538 | ASSERT(BP_GET_NDVAS(bp) == 0); | |
1539 | ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); | |
1540 | ||
d6320ddb | 1541 | for (d = 0; d < ndvas; d++) { |
34dc7c2f | 1542 | error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, |
b128c09f | 1543 | txg, flags); |
34dc7c2f BB |
1544 | if (error) { |
1545 | for (d--; d >= 0; d--) { | |
1546 | metaslab_free_dva(spa, &dva[d], txg, B_TRUE); | |
1547 | bzero(&dva[d], sizeof (dva_t)); | |
1548 | } | |
b128c09f | 1549 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
34dc7c2f BB |
1550 | return (error); |
1551 | } | |
1552 | } | |
1553 | ASSERT(error == 0); | |
1554 | ASSERT(BP_GET_NDVAS(bp) == ndvas); | |
1555 | ||
b128c09f BB |
1556 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
1557 | ||
428870ff | 1558 | BP_SET_BIRTH(bp, txg, txg); |
b128c09f | 1559 | |
34dc7c2f BB |
1560 | return (0); |
1561 | } | |
1562 | ||
1563 | void | |
1564 | metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) | |
1565 | { | |
1566 | const dva_t *dva = bp->blk_dva; | |
d6320ddb | 1567 | int d, ndvas = BP_GET_NDVAS(bp); |
34dc7c2f BB |
1568 | |
1569 | ASSERT(!BP_IS_HOLE(bp)); | |
428870ff | 1570 | ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); |
b128c09f BB |
1571 | |
1572 | spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); | |
34dc7c2f | 1573 | |
d6320ddb | 1574 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1575 | metaslab_free_dva(spa, &dva[d], txg, now); |
b128c09f BB |
1576 | |
1577 | spa_config_exit(spa, SCL_FREE, FTAG); | |
34dc7c2f BB |
1578 | } |
1579 | ||
1580 | int | |
1581 | metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) | |
1582 | { | |
1583 | const dva_t *dva = bp->blk_dva; | |
1584 | int ndvas = BP_GET_NDVAS(bp); | |
d6320ddb | 1585 | int d, error = 0; |
34dc7c2f BB |
1586 | |
1587 | ASSERT(!BP_IS_HOLE(bp)); | |
1588 | ||
b128c09f BB |
1589 | if (txg != 0) { |
1590 | /* | |
1591 | * First do a dry run to make sure all DVAs are claimable, | |
1592 | * so we don't have to unwind from partial failures below. | |
1593 | */ | |
1594 | if ((error = metaslab_claim(spa, bp, 0)) != 0) | |
1595 | return (error); | |
1596 | } | |
1597 | ||
1598 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1599 | ||
d6320ddb | 1600 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1601 | if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) |
b128c09f BB |
1602 | break; |
1603 | ||
1604 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
1605 | ||
1606 | ASSERT(error == 0 || txg == 0); | |
34dc7c2f | 1607 | |
b128c09f | 1608 | return (error); |
34dc7c2f | 1609 | } |