]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
ebf8e3a2 | 23 | * Copyright (c) 2012 by Delphix. All rights reserved. |
34dc7c2f BB |
24 | */ |
25 | ||
34dc7c2f | 26 | #include <sys/zfs_context.h> |
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_tx.h> | |
29 | #include <sys/space_map.h> | |
30 | #include <sys/metaslab_impl.h> | |
31 | #include <sys/vdev_impl.h> | |
32 | #include <sys/zio.h> | |
33 | ||
6d974228 GW |
34 | #define WITH_DF_BLOCK_ALLOCATOR |
35 | ||
36 | /* | |
37 | * Allow allocations to switch to gang blocks quickly. We do this to | |
38 | * avoid having to load lots of space_maps in a given txg. There are, | |
39 | * however, some cases where we want to avoid "fast" ganging and instead | |
40 | * we want to do an exhaustive search of all metaslabs on this device. | |
ebf8e3a2 | 41 | * Currently we don't allow any gang, zil, or dump device related allocations |
6d974228 GW |
42 | * to "fast" gang. |
43 | */ | |
44 | #define CAN_FASTGANG(flags) \ | |
45 | (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ | |
46 | METASLAB_GANG_AVOID))) | |
22c81dd8 | 47 | |
34dc7c2f BB |
48 | uint64_t metaslab_aliquot = 512ULL << 10; |
49 | uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ | |
50 | ||
6d974228 GW |
51 | /* |
52 | * This value defines the number of allowed allocation failures per vdev. | |
53 | * If a device reaches this threshold in a given txg then we consider skipping | |
54 | * allocations on that device. | |
55 | */ | |
56 | int zfs_mg_alloc_failures; | |
57 | ||
428870ff BB |
58 | /* |
59 | * Metaslab debugging: when set, keeps all space maps in core to verify frees. | |
60 | */ | |
61 | static int metaslab_debug = 0; | |
62 | ||
9babb374 BB |
63 | /* |
64 | * Minimum size which forces the dynamic allocator to change | |
428870ff | 65 | * its allocation strategy. Once the space map cannot satisfy |
9babb374 BB |
66 | * an allocation of this size then it switches to using a more |
67 | * aggressive strategy (i.e., search by size rather than offset). | |
68 | */ | |
69 | uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; | |
70 | ||
71 | /* | |
72 | * The minimum free space, in percent, which must be available | |
73 | * in a space map to continue allocations in a first-fit fashion. | |
74 | * Once the space_map's free space drops below this level we dynamically | |
75 | * switch to using best-fit allocations. | |
76 | */ | |
428870ff BB |
77 | int metaslab_df_free_pct = 4; |
78 | ||
79 | /* | |
80 | * A metaslab is considered "free" if it contains a contiguous | |
81 | * segment which is greater than metaslab_min_alloc_size. | |
82 | */ | |
83 | uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; | |
84 | ||
85 | /* | |
86 | * Max number of space_maps to prefetch. | |
87 | */ | |
88 | int metaslab_prefetch_limit = SPA_DVAS_PER_BP; | |
89 | ||
90 | /* | |
91 | * Percentage bonus multiplier for metaslabs that are in the bonus area. | |
92 | */ | |
93 | int metaslab_smo_bonus_pct = 150; | |
9babb374 | 94 | |
34dc7c2f BB |
95 | /* |
96 | * ========================================================================== | |
97 | * Metaslab classes | |
98 | * ========================================================================== | |
99 | */ | |
100 | metaslab_class_t * | |
428870ff | 101 | metaslab_class_create(spa_t *spa, space_map_ops_t *ops) |
34dc7c2f BB |
102 | { |
103 | metaslab_class_t *mc; | |
104 | ||
b8d06fca | 105 | mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE); |
34dc7c2f | 106 | |
428870ff | 107 | mc->mc_spa = spa; |
34dc7c2f | 108 | mc->mc_rotor = NULL; |
9babb374 | 109 | mc->mc_ops = ops; |
34dc7c2f BB |
110 | |
111 | return (mc); | |
112 | } | |
113 | ||
114 | void | |
115 | metaslab_class_destroy(metaslab_class_t *mc) | |
116 | { | |
428870ff BB |
117 | ASSERT(mc->mc_rotor == NULL); |
118 | ASSERT(mc->mc_alloc == 0); | |
119 | ASSERT(mc->mc_deferred == 0); | |
120 | ASSERT(mc->mc_space == 0); | |
121 | ASSERT(mc->mc_dspace == 0); | |
34dc7c2f BB |
122 | |
123 | kmem_free(mc, sizeof (metaslab_class_t)); | |
124 | } | |
125 | ||
428870ff BB |
126 | int |
127 | metaslab_class_validate(metaslab_class_t *mc) | |
34dc7c2f | 128 | { |
428870ff BB |
129 | metaslab_group_t *mg; |
130 | vdev_t *vd; | |
34dc7c2f | 131 | |
428870ff BB |
132 | /* |
133 | * Must hold one of the spa_config locks. | |
134 | */ | |
135 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || | |
136 | spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); | |
34dc7c2f | 137 | |
428870ff BB |
138 | if ((mg = mc->mc_rotor) == NULL) |
139 | return (0); | |
140 | ||
141 | do { | |
142 | vd = mg->mg_vd; | |
143 | ASSERT(vd->vdev_mg != NULL); | |
144 | ASSERT3P(vd->vdev_top, ==, vd); | |
145 | ASSERT3P(mg->mg_class, ==, mc); | |
146 | ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); | |
147 | } while ((mg = mg->mg_next) != mc->mc_rotor); | |
148 | ||
149 | return (0); | |
34dc7c2f BB |
150 | } |
151 | ||
152 | void | |
428870ff BB |
153 | metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, |
154 | int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) | |
34dc7c2f | 155 | { |
428870ff BB |
156 | atomic_add_64(&mc->mc_alloc, alloc_delta); |
157 | atomic_add_64(&mc->mc_deferred, defer_delta); | |
158 | atomic_add_64(&mc->mc_space, space_delta); | |
159 | atomic_add_64(&mc->mc_dspace, dspace_delta); | |
160 | } | |
34dc7c2f | 161 | |
428870ff BB |
162 | uint64_t |
163 | metaslab_class_get_alloc(metaslab_class_t *mc) | |
164 | { | |
165 | return (mc->mc_alloc); | |
166 | } | |
34dc7c2f | 167 | |
428870ff BB |
168 | uint64_t |
169 | metaslab_class_get_deferred(metaslab_class_t *mc) | |
170 | { | |
171 | return (mc->mc_deferred); | |
172 | } | |
34dc7c2f | 173 | |
428870ff BB |
174 | uint64_t |
175 | metaslab_class_get_space(metaslab_class_t *mc) | |
176 | { | |
177 | return (mc->mc_space); | |
178 | } | |
34dc7c2f | 179 | |
428870ff BB |
180 | uint64_t |
181 | metaslab_class_get_dspace(metaslab_class_t *mc) | |
182 | { | |
183 | return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); | |
34dc7c2f BB |
184 | } |
185 | ||
186 | /* | |
187 | * ========================================================================== | |
188 | * Metaslab groups | |
189 | * ========================================================================== | |
190 | */ | |
191 | static int | |
192 | metaslab_compare(const void *x1, const void *x2) | |
193 | { | |
194 | const metaslab_t *m1 = x1; | |
195 | const metaslab_t *m2 = x2; | |
196 | ||
197 | if (m1->ms_weight < m2->ms_weight) | |
198 | return (1); | |
199 | if (m1->ms_weight > m2->ms_weight) | |
200 | return (-1); | |
201 | ||
202 | /* | |
203 | * If the weights are identical, use the offset to force uniqueness. | |
204 | */ | |
205 | if (m1->ms_map.sm_start < m2->ms_map.sm_start) | |
206 | return (-1); | |
207 | if (m1->ms_map.sm_start > m2->ms_map.sm_start) | |
208 | return (1); | |
209 | ||
210 | ASSERT3P(m1, ==, m2); | |
211 | ||
212 | return (0); | |
213 | } | |
214 | ||
215 | metaslab_group_t * | |
216 | metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) | |
217 | { | |
218 | metaslab_group_t *mg; | |
219 | ||
b8d06fca | 220 | mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE); |
34dc7c2f BB |
221 | mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); |
222 | avl_create(&mg->mg_metaslab_tree, metaslab_compare, | |
223 | sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); | |
34dc7c2f | 224 | mg->mg_vd = vd; |
428870ff BB |
225 | mg->mg_class = mc; |
226 | mg->mg_activation_count = 0; | |
34dc7c2f BB |
227 | |
228 | return (mg); | |
229 | } | |
230 | ||
231 | void | |
232 | metaslab_group_destroy(metaslab_group_t *mg) | |
233 | { | |
428870ff BB |
234 | ASSERT(mg->mg_prev == NULL); |
235 | ASSERT(mg->mg_next == NULL); | |
236 | /* | |
237 | * We may have gone below zero with the activation count | |
238 | * either because we never activated in the first place or | |
239 | * because we're done, and possibly removing the vdev. | |
240 | */ | |
241 | ASSERT(mg->mg_activation_count <= 0); | |
242 | ||
34dc7c2f BB |
243 | avl_destroy(&mg->mg_metaslab_tree); |
244 | mutex_destroy(&mg->mg_lock); | |
245 | kmem_free(mg, sizeof (metaslab_group_t)); | |
246 | } | |
247 | ||
428870ff BB |
248 | void |
249 | metaslab_group_activate(metaslab_group_t *mg) | |
250 | { | |
251 | metaslab_class_t *mc = mg->mg_class; | |
252 | metaslab_group_t *mgprev, *mgnext; | |
253 | ||
254 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
255 | ||
256 | ASSERT(mc->mc_rotor != mg); | |
257 | ASSERT(mg->mg_prev == NULL); | |
258 | ASSERT(mg->mg_next == NULL); | |
259 | ASSERT(mg->mg_activation_count <= 0); | |
260 | ||
261 | if (++mg->mg_activation_count <= 0) | |
262 | return; | |
263 | ||
264 | mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); | |
265 | ||
266 | if ((mgprev = mc->mc_rotor) == NULL) { | |
267 | mg->mg_prev = mg; | |
268 | mg->mg_next = mg; | |
269 | } else { | |
270 | mgnext = mgprev->mg_next; | |
271 | mg->mg_prev = mgprev; | |
272 | mg->mg_next = mgnext; | |
273 | mgprev->mg_next = mg; | |
274 | mgnext->mg_prev = mg; | |
275 | } | |
276 | mc->mc_rotor = mg; | |
277 | } | |
278 | ||
279 | void | |
280 | metaslab_group_passivate(metaslab_group_t *mg) | |
281 | { | |
282 | metaslab_class_t *mc = mg->mg_class; | |
283 | metaslab_group_t *mgprev, *mgnext; | |
284 | ||
285 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
286 | ||
287 | if (--mg->mg_activation_count != 0) { | |
288 | ASSERT(mc->mc_rotor != mg); | |
289 | ASSERT(mg->mg_prev == NULL); | |
290 | ASSERT(mg->mg_next == NULL); | |
291 | ASSERT(mg->mg_activation_count < 0); | |
292 | return; | |
293 | } | |
294 | ||
295 | mgprev = mg->mg_prev; | |
296 | mgnext = mg->mg_next; | |
297 | ||
298 | if (mg == mgnext) { | |
299 | mc->mc_rotor = NULL; | |
300 | } else { | |
301 | mc->mc_rotor = mgnext; | |
302 | mgprev->mg_next = mgnext; | |
303 | mgnext->mg_prev = mgprev; | |
304 | } | |
305 | ||
306 | mg->mg_prev = NULL; | |
307 | mg->mg_next = NULL; | |
308 | } | |
309 | ||
34dc7c2f BB |
310 | static void |
311 | metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) | |
312 | { | |
313 | mutex_enter(&mg->mg_lock); | |
314 | ASSERT(msp->ms_group == NULL); | |
315 | msp->ms_group = mg; | |
316 | msp->ms_weight = 0; | |
317 | avl_add(&mg->mg_metaslab_tree, msp); | |
318 | mutex_exit(&mg->mg_lock); | |
319 | } | |
320 | ||
321 | static void | |
322 | metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) | |
323 | { | |
324 | mutex_enter(&mg->mg_lock); | |
325 | ASSERT(msp->ms_group == mg); | |
326 | avl_remove(&mg->mg_metaslab_tree, msp); | |
327 | msp->ms_group = NULL; | |
328 | mutex_exit(&mg->mg_lock); | |
329 | } | |
330 | ||
331 | static void | |
332 | metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) | |
333 | { | |
334 | /* | |
335 | * Although in principle the weight can be any value, in | |
336 | * practice we do not use values in the range [1, 510]. | |
337 | */ | |
338 | ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); | |
339 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
340 | ||
341 | mutex_enter(&mg->mg_lock); | |
342 | ASSERT(msp->ms_group == mg); | |
343 | avl_remove(&mg->mg_metaslab_tree, msp); | |
344 | msp->ms_weight = weight; | |
345 | avl_add(&mg->mg_metaslab_tree, msp); | |
346 | mutex_exit(&mg->mg_lock); | |
347 | } | |
348 | ||
428870ff BB |
349 | /* |
350 | * ========================================================================== | |
351 | * Common allocator routines | |
352 | * ========================================================================== | |
353 | */ | |
354 | static int | |
355 | metaslab_segsize_compare(const void *x1, const void *x2) | |
356 | { | |
357 | const space_seg_t *s1 = x1; | |
358 | const space_seg_t *s2 = x2; | |
359 | uint64_t ss_size1 = s1->ss_end - s1->ss_start; | |
360 | uint64_t ss_size2 = s2->ss_end - s2->ss_start; | |
361 | ||
362 | if (ss_size1 < ss_size2) | |
363 | return (-1); | |
364 | if (ss_size1 > ss_size2) | |
365 | return (1); | |
366 | ||
367 | if (s1->ss_start < s2->ss_start) | |
368 | return (-1); | |
369 | if (s1->ss_start > s2->ss_start) | |
370 | return (1); | |
371 | ||
372 | return (0); | |
373 | } | |
374 | ||
22c81dd8 BB |
375 | #if defined(WITH_FF_BLOCK_ALLOCATOR) || \ |
376 | defined(WITH_DF_BLOCK_ALLOCATOR) || \ | |
377 | defined(WITH_CDF_BLOCK_ALLOCATOR) | |
34dc7c2f | 378 | /* |
9babb374 BB |
379 | * This is a helper function that can be used by the allocator to find |
380 | * a suitable block to allocate. This will search the specified AVL | |
381 | * tree looking for a block that matches the specified criteria. | |
34dc7c2f | 382 | */ |
34dc7c2f | 383 | static uint64_t |
9babb374 BB |
384 | metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, |
385 | uint64_t align) | |
34dc7c2f | 386 | { |
34dc7c2f BB |
387 | space_seg_t *ss, ssearch; |
388 | avl_index_t where; | |
389 | ||
390 | ssearch.ss_start = *cursor; | |
391 | ssearch.ss_end = *cursor + size; | |
392 | ||
393 | ss = avl_find(t, &ssearch, &where); | |
394 | if (ss == NULL) | |
395 | ss = avl_nearest(t, where, AVL_AFTER); | |
396 | ||
397 | while (ss != NULL) { | |
398 | uint64_t offset = P2ROUNDUP(ss->ss_start, align); | |
399 | ||
400 | if (offset + size <= ss->ss_end) { | |
401 | *cursor = offset + size; | |
402 | return (offset); | |
403 | } | |
404 | ss = AVL_NEXT(t, ss); | |
405 | } | |
406 | ||
407 | /* | |
408 | * If we know we've searched the whole map (*cursor == 0), give up. | |
409 | * Otherwise, reset the cursor to the beginning and try again. | |
410 | */ | |
411 | if (*cursor == 0) | |
412 | return (-1ULL); | |
413 | ||
414 | *cursor = 0; | |
9babb374 BB |
415 | return (metaslab_block_picker(t, cursor, size, align)); |
416 | } | |
22c81dd8 | 417 | #endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */ |
9babb374 | 418 | |
9babb374 | 419 | static void |
428870ff | 420 | metaslab_pp_load(space_map_t *sm) |
9babb374 | 421 | { |
428870ff BB |
422 | space_seg_t *ss; |
423 | ||
9babb374 | 424 | ASSERT(sm->sm_ppd == NULL); |
b8d06fca | 425 | sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE); |
428870ff | 426 | |
b8d06fca | 427 | sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE); |
428870ff BB |
428 | avl_create(sm->sm_pp_root, metaslab_segsize_compare, |
429 | sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); | |
430 | ||
431 | for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) | |
432 | avl_add(sm->sm_pp_root, ss); | |
9babb374 BB |
433 | } |
434 | ||
435 | static void | |
428870ff | 436 | metaslab_pp_unload(space_map_t *sm) |
9babb374 | 437 | { |
428870ff BB |
438 | void *cookie = NULL; |
439 | ||
9babb374 BB |
440 | kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); |
441 | sm->sm_ppd = NULL; | |
9babb374 | 442 | |
428870ff BB |
443 | while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { |
444 | /* tear down the tree */ | |
445 | } | |
9babb374 | 446 | |
428870ff BB |
447 | avl_destroy(sm->sm_pp_root); |
448 | kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); | |
449 | sm->sm_pp_root = NULL; | |
34dc7c2f BB |
450 | } |
451 | ||
452 | /* ARGSUSED */ | |
453 | static void | |
428870ff | 454 | metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
455 | { |
456 | /* No need to update cursor */ | |
457 | } | |
458 | ||
459 | /* ARGSUSED */ | |
460 | static void | |
428870ff | 461 | metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
462 | { |
463 | /* No need to update cursor */ | |
464 | } | |
465 | ||
9babb374 | 466 | /* |
428870ff | 467 | * Return the maximum contiguous segment within the metaslab. |
9babb374 | 468 | */ |
9babb374 | 469 | uint64_t |
428870ff | 470 | metaslab_pp_maxsize(space_map_t *sm) |
9babb374 BB |
471 | { |
472 | avl_tree_t *t = sm->sm_pp_root; | |
473 | space_seg_t *ss; | |
474 | ||
475 | if (t == NULL || (ss = avl_last(t)) == NULL) | |
476 | return (0ULL); | |
477 | ||
478 | return (ss->ss_end - ss->ss_start); | |
479 | } | |
480 | ||
22c81dd8 | 481 | #if defined(WITH_FF_BLOCK_ALLOCATOR) |
428870ff BB |
482 | /* |
483 | * ========================================================================== | |
484 | * The first-fit block allocator | |
485 | * ========================================================================== | |
486 | */ | |
487 | static uint64_t | |
488 | metaslab_ff_alloc(space_map_t *sm, uint64_t size) | |
9babb374 | 489 | { |
428870ff BB |
490 | avl_tree_t *t = &sm->sm_root; |
491 | uint64_t align = size & -size; | |
492 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
9babb374 | 493 | |
428870ff | 494 | return (metaslab_block_picker(t, cursor, size, align)); |
9babb374 BB |
495 | } |
496 | ||
428870ff BB |
497 | /* ARGSUSED */ |
498 | boolean_t | |
499 | metaslab_ff_fragmented(space_map_t *sm) | |
9babb374 | 500 | { |
428870ff | 501 | return (B_TRUE); |
9babb374 BB |
502 | } |
503 | ||
428870ff BB |
504 | static space_map_ops_t metaslab_ff_ops = { |
505 | metaslab_pp_load, | |
506 | metaslab_pp_unload, | |
507 | metaslab_ff_alloc, | |
508 | metaslab_pp_claim, | |
509 | metaslab_pp_free, | |
510 | metaslab_pp_maxsize, | |
511 | metaslab_ff_fragmented | |
512 | }; | |
9babb374 | 513 | |
22c81dd8 BB |
514 | space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; |
515 | #endif /* WITH_FF_BLOCK_ALLOCATOR */ | |
516 | ||
517 | #if defined(WITH_DF_BLOCK_ALLOCATOR) | |
428870ff BB |
518 | /* |
519 | * ========================================================================== | |
520 | * Dynamic block allocator - | |
521 | * Uses the first fit allocation scheme until space gets low and then | |
522 | * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold | |
523 | * and metaslab_df_free_pct to determine when to switch the allocation scheme. | |
524 | * ========================================================================== | |
525 | */ | |
9babb374 BB |
526 | static uint64_t |
527 | metaslab_df_alloc(space_map_t *sm, uint64_t size) | |
528 | { | |
529 | avl_tree_t *t = &sm->sm_root; | |
530 | uint64_t align = size & -size; | |
531 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
428870ff | 532 | uint64_t max_size = metaslab_pp_maxsize(sm); |
9babb374 BB |
533 | int free_pct = sm->sm_space * 100 / sm->sm_size; |
534 | ||
535 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
536 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
537 | ||
538 | if (max_size < size) | |
539 | return (-1ULL); | |
540 | ||
541 | /* | |
542 | * If we're running low on space switch to using the size | |
543 | * sorted AVL tree (best-fit). | |
544 | */ | |
545 | if (max_size < metaslab_df_alloc_threshold || | |
546 | free_pct < metaslab_df_free_pct) { | |
547 | t = sm->sm_pp_root; | |
548 | *cursor = 0; | |
549 | } | |
550 | ||
551 | return (metaslab_block_picker(t, cursor, size, 1ULL)); | |
552 | } | |
553 | ||
428870ff BB |
554 | static boolean_t |
555 | metaslab_df_fragmented(space_map_t *sm) | |
9babb374 | 556 | { |
428870ff BB |
557 | uint64_t max_size = metaslab_pp_maxsize(sm); |
558 | int free_pct = sm->sm_space * 100 / sm->sm_size; | |
9babb374 | 559 | |
428870ff BB |
560 | if (max_size >= metaslab_df_alloc_threshold && |
561 | free_pct >= metaslab_df_free_pct) | |
562 | return (B_FALSE); | |
563 | ||
564 | return (B_TRUE); | |
9babb374 BB |
565 | } |
566 | ||
567 | static space_map_ops_t metaslab_df_ops = { | |
428870ff BB |
568 | metaslab_pp_load, |
569 | metaslab_pp_unload, | |
9babb374 | 570 | metaslab_df_alloc, |
428870ff BB |
571 | metaslab_pp_claim, |
572 | metaslab_pp_free, | |
573 | metaslab_pp_maxsize, | |
574 | metaslab_df_fragmented | |
34dc7c2f BB |
575 | }; |
576 | ||
22c81dd8 BB |
577 | space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; |
578 | #endif /* WITH_DF_BLOCK_ALLOCATOR */ | |
579 | ||
428870ff BB |
580 | /* |
581 | * ========================================================================== | |
582 | * Other experimental allocators | |
583 | * ========================================================================== | |
584 | */ | |
22c81dd8 | 585 | #if defined(WITH_CDF_BLOCK_ALLOCATOR) |
428870ff BB |
586 | static uint64_t |
587 | metaslab_cdf_alloc(space_map_t *sm, uint64_t size) | |
588 | { | |
589 | avl_tree_t *t = &sm->sm_root; | |
590 | uint64_t *cursor = (uint64_t *)sm->sm_ppd; | |
591 | uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; | |
592 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
593 | uint64_t rsize = size; | |
594 | uint64_t offset = 0; | |
595 | ||
596 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
597 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
598 | ||
599 | if (max_size < size) | |
600 | return (-1ULL); | |
601 | ||
602 | ASSERT3U(*extent_end, >=, *cursor); | |
603 | ||
604 | /* | |
605 | * If we're running low on space switch to using the size | |
606 | * sorted AVL tree (best-fit). | |
607 | */ | |
608 | if ((*cursor + size) > *extent_end) { | |
609 | ||
610 | t = sm->sm_pp_root; | |
611 | *cursor = *extent_end = 0; | |
612 | ||
613 | if (max_size > 2 * SPA_MAXBLOCKSIZE) | |
614 | rsize = MIN(metaslab_min_alloc_size, max_size); | |
615 | offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); | |
616 | if (offset != -1) | |
617 | *cursor = offset + size; | |
618 | } else { | |
619 | offset = metaslab_block_picker(t, cursor, rsize, 1ULL); | |
620 | } | |
621 | ASSERT3U(*cursor, <=, *extent_end); | |
622 | return (offset); | |
623 | } | |
624 | ||
625 | static boolean_t | |
626 | metaslab_cdf_fragmented(space_map_t *sm) | |
627 | { | |
628 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
629 | ||
630 | if (max_size > (metaslab_min_alloc_size * 10)) | |
631 | return (B_FALSE); | |
632 | return (B_TRUE); | |
633 | } | |
634 | ||
635 | static space_map_ops_t metaslab_cdf_ops = { | |
636 | metaslab_pp_load, | |
637 | metaslab_pp_unload, | |
638 | metaslab_cdf_alloc, | |
639 | metaslab_pp_claim, | |
640 | metaslab_pp_free, | |
641 | metaslab_pp_maxsize, | |
642 | metaslab_cdf_fragmented | |
643 | }; | |
644 | ||
22c81dd8 BB |
645 | space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops; |
646 | #endif /* WITH_CDF_BLOCK_ALLOCATOR */ | |
647 | ||
648 | #if defined(WITH_NDF_BLOCK_ALLOCATOR) | |
428870ff BB |
649 | uint64_t metaslab_ndf_clump_shift = 4; |
650 | ||
651 | static uint64_t | |
652 | metaslab_ndf_alloc(space_map_t *sm, uint64_t size) | |
653 | { | |
654 | avl_tree_t *t = &sm->sm_root; | |
655 | avl_index_t where; | |
656 | space_seg_t *ss, ssearch; | |
657 | uint64_t hbit = highbit(size); | |
658 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; | |
659 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
660 | ||
661 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
662 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
663 | ||
664 | if (max_size < size) | |
665 | return (-1ULL); | |
666 | ||
667 | ssearch.ss_start = *cursor; | |
668 | ssearch.ss_end = *cursor + size; | |
669 | ||
670 | ss = avl_find(t, &ssearch, &where); | |
671 | if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { | |
672 | t = sm->sm_pp_root; | |
673 | ||
674 | ssearch.ss_start = 0; | |
675 | ssearch.ss_end = MIN(max_size, | |
676 | 1ULL << (hbit + metaslab_ndf_clump_shift)); | |
677 | ss = avl_find(t, &ssearch, &where); | |
678 | if (ss == NULL) | |
679 | ss = avl_nearest(t, where, AVL_AFTER); | |
680 | ASSERT(ss != NULL); | |
681 | } | |
682 | ||
683 | if (ss != NULL) { | |
684 | if (ss->ss_start + size <= ss->ss_end) { | |
685 | *cursor = ss->ss_start + size; | |
686 | return (ss->ss_start); | |
687 | } | |
688 | } | |
689 | return (-1ULL); | |
690 | } | |
691 | ||
692 | static boolean_t | |
693 | metaslab_ndf_fragmented(space_map_t *sm) | |
694 | { | |
695 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
696 | ||
697 | if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) | |
698 | return (B_FALSE); | |
699 | return (B_TRUE); | |
700 | } | |
701 | ||
702 | ||
703 | static space_map_ops_t metaslab_ndf_ops = { | |
704 | metaslab_pp_load, | |
705 | metaslab_pp_unload, | |
706 | metaslab_ndf_alloc, | |
707 | metaslab_pp_claim, | |
708 | metaslab_pp_free, | |
709 | metaslab_pp_maxsize, | |
710 | metaslab_ndf_fragmented | |
711 | }; | |
712 | ||
713 | space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; | |
22c81dd8 | 714 | #endif /* WITH_NDF_BLOCK_ALLOCATOR */ |
9babb374 | 715 | |
34dc7c2f BB |
716 | /* |
717 | * ========================================================================== | |
718 | * Metaslabs | |
719 | * ========================================================================== | |
720 | */ | |
721 | metaslab_t * | |
722 | metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, | |
723 | uint64_t start, uint64_t size, uint64_t txg) | |
724 | { | |
725 | vdev_t *vd = mg->mg_vd; | |
726 | metaslab_t *msp; | |
727 | ||
b8d06fca | 728 | msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE); |
34dc7c2f BB |
729 | mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); |
730 | ||
731 | msp->ms_smo_syncing = *smo; | |
732 | ||
733 | /* | |
734 | * We create the main space map here, but we don't create the | |
735 | * allocmaps and freemaps until metaslab_sync_done(). This serves | |
736 | * two purposes: it allows metaslab_sync_done() to detect the | |
737 | * addition of new space; and for debugging, it ensures that we'd | |
738 | * data fault on any attempt to use this metaslab before it's ready. | |
739 | */ | |
740 | space_map_create(&msp->ms_map, start, size, | |
741 | vd->vdev_ashift, &msp->ms_lock); | |
742 | ||
743 | metaslab_group_add(mg, msp); | |
744 | ||
428870ff BB |
745 | if (metaslab_debug && smo->smo_object != 0) { |
746 | mutex_enter(&msp->ms_lock); | |
747 | VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, | |
748 | SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); | |
749 | mutex_exit(&msp->ms_lock); | |
750 | } | |
751 | ||
34dc7c2f BB |
752 | /* |
753 | * If we're opening an existing pool (txg == 0) or creating | |
754 | * a new one (txg == TXG_INITIAL), all space is available now. | |
755 | * If we're adding space to an existing pool, the new space | |
756 | * does not become available until after this txg has synced. | |
757 | */ | |
758 | if (txg <= TXG_INITIAL) | |
759 | metaslab_sync_done(msp, 0); | |
760 | ||
761 | if (txg != 0) { | |
34dc7c2f | 762 | vdev_dirty(vd, 0, NULL, txg); |
428870ff | 763 | vdev_dirty(vd, VDD_METASLAB, msp, txg); |
34dc7c2f BB |
764 | } |
765 | ||
766 | return (msp); | |
767 | } | |
768 | ||
769 | void | |
770 | metaslab_fini(metaslab_t *msp) | |
771 | { | |
772 | metaslab_group_t *mg = msp->ms_group; | |
d6320ddb | 773 | int t; |
34dc7c2f | 774 | |
428870ff BB |
775 | vdev_space_update(mg->mg_vd, |
776 | -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); | |
34dc7c2f BB |
777 | |
778 | metaslab_group_remove(mg, msp); | |
779 | ||
780 | mutex_enter(&msp->ms_lock); | |
781 | ||
782 | space_map_unload(&msp->ms_map); | |
783 | space_map_destroy(&msp->ms_map); | |
784 | ||
d6320ddb | 785 | for (t = 0; t < TXG_SIZE; t++) { |
34dc7c2f BB |
786 | space_map_destroy(&msp->ms_allocmap[t]); |
787 | space_map_destroy(&msp->ms_freemap[t]); | |
788 | } | |
789 | ||
d6320ddb | 790 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
791 | space_map_destroy(&msp->ms_defermap[t]); |
792 | ||
793 | ASSERT3S(msp->ms_deferspace, ==, 0); | |
794 | ||
34dc7c2f BB |
795 | mutex_exit(&msp->ms_lock); |
796 | mutex_destroy(&msp->ms_lock); | |
797 | ||
798 | kmem_free(msp, sizeof (metaslab_t)); | |
799 | } | |
800 | ||
801 | #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) | |
802 | #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) | |
803 | #define METASLAB_ACTIVE_MASK \ | |
804 | (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) | |
34dc7c2f BB |
805 | |
806 | static uint64_t | |
807 | metaslab_weight(metaslab_t *msp) | |
808 | { | |
809 | metaslab_group_t *mg = msp->ms_group; | |
810 | space_map_t *sm = &msp->ms_map; | |
811 | space_map_obj_t *smo = &msp->ms_smo; | |
812 | vdev_t *vd = mg->mg_vd; | |
813 | uint64_t weight, space; | |
814 | ||
815 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
816 | ||
817 | /* | |
818 | * The baseline weight is the metaslab's free space. | |
819 | */ | |
820 | space = sm->sm_size - smo->smo_alloc; | |
821 | weight = space; | |
822 | ||
823 | /* | |
824 | * Modern disks have uniform bit density and constant angular velocity. | |
825 | * Therefore, the outer recording zones are faster (higher bandwidth) | |
826 | * than the inner zones by the ratio of outer to inner track diameter, | |
827 | * which is typically around 2:1. We account for this by assigning | |
828 | * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). | |
829 | * In effect, this means that we'll select the metaslab with the most | |
830 | * free bandwidth rather than simply the one with the most free space. | |
831 | */ | |
832 | weight = 2 * weight - | |
833 | ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; | |
834 | ASSERT(weight >= space && weight <= 2 * space); | |
835 | ||
836 | /* | |
428870ff BB |
837 | * For locality, assign higher weight to metaslabs which have |
838 | * a lower offset than what we've already activated. | |
34dc7c2f | 839 | */ |
428870ff BB |
840 | if (sm->sm_start <= mg->mg_bonus_area) |
841 | weight *= (metaslab_smo_bonus_pct / 100); | |
34dc7c2f | 842 | ASSERT(weight >= space && |
428870ff BB |
843 | weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); |
844 | ||
845 | if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { | |
846 | /* | |
847 | * If this metaslab is one we're actively using, adjust its | |
848 | * weight to make it preferable to any inactive metaslab so | |
849 | * we'll polish it off. | |
850 | */ | |
851 | weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); | |
852 | } | |
853 | return (weight); | |
854 | } | |
855 | ||
856 | static void | |
857 | metaslab_prefetch(metaslab_group_t *mg) | |
858 | { | |
859 | spa_t *spa = mg->mg_vd->vdev_spa; | |
860 | metaslab_t *msp; | |
861 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
862 | int m; | |
863 | ||
864 | mutex_enter(&mg->mg_lock); | |
34dc7c2f BB |
865 | |
866 | /* | |
428870ff | 867 | * Prefetch the next potential metaslabs |
34dc7c2f | 868 | */ |
428870ff BB |
869 | for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { |
870 | space_map_t *sm = &msp->ms_map; | |
871 | space_map_obj_t *smo = &msp->ms_smo; | |
34dc7c2f | 872 | |
428870ff BB |
873 | /* If we have reached our prefetch limit then we're done */ |
874 | if (m >= metaslab_prefetch_limit) | |
875 | break; | |
876 | ||
877 | if (!sm->sm_loaded && smo->smo_object != 0) { | |
878 | mutex_exit(&mg->mg_lock); | |
879 | dmu_prefetch(spa_meta_objset(spa), smo->smo_object, | |
880 | 0ULL, smo->smo_objsize); | |
881 | mutex_enter(&mg->mg_lock); | |
882 | } | |
883 | } | |
884 | mutex_exit(&mg->mg_lock); | |
34dc7c2f BB |
885 | } |
886 | ||
887 | static int | |
6d974228 | 888 | metaslab_activate(metaslab_t *msp, uint64_t activation_weight) |
34dc7c2f | 889 | { |
428870ff | 890 | metaslab_group_t *mg = msp->ms_group; |
34dc7c2f | 891 | space_map_t *sm = &msp->ms_map; |
9babb374 | 892 | space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; |
d6320ddb | 893 | int t; |
34dc7c2f BB |
894 | |
895 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
896 | ||
897 | if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
428870ff BB |
898 | space_map_load_wait(sm); |
899 | if (!sm->sm_loaded) { | |
900 | int error = space_map_load(sm, sm_ops, SM_FREE, | |
901 | &msp->ms_smo, | |
902 | spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); | |
903 | if (error) { | |
904 | metaslab_group_sort(msp->ms_group, msp, 0); | |
905 | return (error); | |
906 | } | |
d6320ddb | 907 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
908 | space_map_walk(&msp->ms_defermap[t], |
909 | space_map_claim, sm); | |
910 | ||
911 | } | |
912 | ||
913 | /* | |
914 | * Track the bonus area as we activate new metaslabs. | |
915 | */ | |
916 | if (sm->sm_start > mg->mg_bonus_area) { | |
917 | mutex_enter(&mg->mg_lock); | |
918 | mg->mg_bonus_area = sm->sm_start; | |
919 | mutex_exit(&mg->mg_lock); | |
34dc7c2f | 920 | } |
9babb374 | 921 | |
34dc7c2f BB |
922 | metaslab_group_sort(msp->ms_group, msp, |
923 | msp->ms_weight | activation_weight); | |
924 | } | |
925 | ASSERT(sm->sm_loaded); | |
926 | ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); | |
927 | ||
928 | return (0); | |
929 | } | |
930 | ||
931 | static void | |
932 | metaslab_passivate(metaslab_t *msp, uint64_t size) | |
933 | { | |
934 | /* | |
935 | * If size < SPA_MINBLOCKSIZE, then we will not allocate from | |
936 | * this metaslab again. In that case, it had better be empty, | |
937 | * or we would be leaving space on the table. | |
938 | */ | |
939 | ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); | |
940 | metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); | |
941 | ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); | |
942 | } | |
943 | ||
944 | /* | |
945 | * Write a metaslab to disk in the context of the specified transaction group. | |
946 | */ | |
947 | void | |
948 | metaslab_sync(metaslab_t *msp, uint64_t txg) | |
949 | { | |
950 | vdev_t *vd = msp->ms_group->mg_vd; | |
951 | spa_t *spa = vd->vdev_spa; | |
428870ff | 952 | objset_t *mos = spa_meta_objset(spa); |
34dc7c2f BB |
953 | space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; |
954 | space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; | |
955 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
956 | space_map_t *sm = &msp->ms_map; | |
957 | space_map_obj_t *smo = &msp->ms_smo_syncing; | |
958 | dmu_buf_t *db; | |
959 | dmu_tx_t *tx; | |
d6320ddb | 960 | int t; |
34dc7c2f | 961 | |
428870ff BB |
962 | ASSERT(!vd->vdev_ishole); |
963 | ||
964 | if (allocmap->sm_space == 0 && freemap->sm_space == 0) | |
965 | return; | |
34dc7c2f BB |
966 | |
967 | /* | |
968 | * The only state that can actually be changing concurrently with | |
969 | * metaslab_sync() is the metaslab's ms_map. No other thread can | |
970 | * be modifying this txg's allocmap, freemap, freed_map, or smo. | |
971 | * Therefore, we only hold ms_lock to satify space_map ASSERTs. | |
972 | * We drop it whenever we call into the DMU, because the DMU | |
973 | * can call down to us (e.g. via zio_free()) at any time. | |
974 | */ | |
428870ff BB |
975 | |
976 | tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); | |
34dc7c2f BB |
977 | |
978 | if (smo->smo_object == 0) { | |
979 | ASSERT(smo->smo_objsize == 0); | |
980 | ASSERT(smo->smo_alloc == 0); | |
34dc7c2f BB |
981 | smo->smo_object = dmu_object_alloc(mos, |
982 | DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, | |
983 | DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); | |
984 | ASSERT(smo->smo_object != 0); | |
985 | dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * | |
986 | (sm->sm_start >> vd->vdev_ms_shift), | |
987 | sizeof (uint64_t), &smo->smo_object, tx); | |
34dc7c2f BB |
988 | } |
989 | ||
428870ff BB |
990 | mutex_enter(&msp->ms_lock); |
991 | ||
34dc7c2f BB |
992 | space_map_walk(freemap, space_map_add, freed_map); |
993 | ||
994 | if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= | |
995 | 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { | |
996 | /* | |
997 | * The in-core space map representation is twice as compact | |
998 | * as the on-disk one, so it's time to condense the latter | |
999 | * by generating a pure allocmap from first principles. | |
1000 | * | |
1001 | * This metaslab is 100% allocated, | |
1002 | * minus the content of the in-core map (sm), | |
1003 | * minus what's been freed this txg (freed_map), | |
428870ff | 1004 | * minus deferred frees (ms_defermap[]), |
34dc7c2f BB |
1005 | * minus allocations from txgs in the future |
1006 | * (because they haven't been committed yet). | |
1007 | */ | |
1008 | space_map_vacate(allocmap, NULL, NULL); | |
1009 | space_map_vacate(freemap, NULL, NULL); | |
1010 | ||
1011 | space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); | |
1012 | ||
1013 | space_map_walk(sm, space_map_remove, allocmap); | |
1014 | space_map_walk(freed_map, space_map_remove, allocmap); | |
1015 | ||
d6320ddb | 1016 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
1017 | space_map_walk(&msp->ms_defermap[t], |
1018 | space_map_remove, allocmap); | |
1019 | ||
d6320ddb | 1020 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) |
34dc7c2f BB |
1021 | space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], |
1022 | space_map_remove, allocmap); | |
1023 | ||
1024 | mutex_exit(&msp->ms_lock); | |
1025 | space_map_truncate(smo, mos, tx); | |
1026 | mutex_enter(&msp->ms_lock); | |
1027 | } | |
1028 | ||
1029 | space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); | |
1030 | space_map_sync(freemap, SM_FREE, smo, mos, tx); | |
1031 | ||
1032 | mutex_exit(&msp->ms_lock); | |
1033 | ||
1034 | VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); | |
1035 | dmu_buf_will_dirty(db, tx); | |
1036 | ASSERT3U(db->db_size, >=, sizeof (*smo)); | |
1037 | bcopy(smo, db->db_data, sizeof (*smo)); | |
1038 | dmu_buf_rele(db, FTAG); | |
1039 | ||
1040 | dmu_tx_commit(tx); | |
1041 | } | |
1042 | ||
1043 | /* | |
1044 | * Called after a transaction group has completely synced to mark | |
1045 | * all of the metaslab's free space as usable. | |
1046 | */ | |
1047 | void | |
1048 | metaslab_sync_done(metaslab_t *msp, uint64_t txg) | |
1049 | { | |
1050 | space_map_obj_t *smo = &msp->ms_smo; | |
1051 | space_map_obj_t *smosync = &msp->ms_smo_syncing; | |
1052 | space_map_t *sm = &msp->ms_map; | |
1053 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
428870ff | 1054 | space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; |
34dc7c2f BB |
1055 | metaslab_group_t *mg = msp->ms_group; |
1056 | vdev_t *vd = mg->mg_vd; | |
428870ff | 1057 | int64_t alloc_delta, defer_delta; |
d6320ddb | 1058 | int t; |
428870ff BB |
1059 | |
1060 | ASSERT(!vd->vdev_ishole); | |
34dc7c2f BB |
1061 | |
1062 | mutex_enter(&msp->ms_lock); | |
1063 | ||
1064 | /* | |
1065 | * If this metaslab is just becoming available, initialize its | |
1066 | * allocmaps and freemaps and add its capacity to the vdev. | |
1067 | */ | |
1068 | if (freed_map->sm_size == 0) { | |
d6320ddb | 1069 | for (t = 0; t < TXG_SIZE; t++) { |
34dc7c2f BB |
1070 | space_map_create(&msp->ms_allocmap[t], sm->sm_start, |
1071 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1072 | space_map_create(&msp->ms_freemap[t], sm->sm_start, | |
1073 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1074 | } | |
428870ff | 1075 | |
d6320ddb | 1076 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
1077 | space_map_create(&msp->ms_defermap[t], sm->sm_start, |
1078 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1079 | ||
1080 | vdev_space_update(vd, 0, 0, sm->sm_size); | |
34dc7c2f BB |
1081 | } |
1082 | ||
428870ff BB |
1083 | alloc_delta = smosync->smo_alloc - smo->smo_alloc; |
1084 | defer_delta = freed_map->sm_space - defer_map->sm_space; | |
1085 | ||
1086 | vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); | |
34dc7c2f BB |
1087 | |
1088 | ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); | |
1089 | ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); | |
1090 | ||
1091 | /* | |
1092 | * If there's a space_map_load() in progress, wait for it to complete | |
1093 | * so that we have a consistent view of the in-core space map. | |
428870ff BB |
1094 | * Then, add defer_map (oldest deferred frees) to this map and |
1095 | * transfer freed_map (this txg's frees) to defer_map. | |
34dc7c2f BB |
1096 | */ |
1097 | space_map_load_wait(sm); | |
428870ff BB |
1098 | space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); |
1099 | space_map_vacate(freed_map, space_map_add, defer_map); | |
34dc7c2f BB |
1100 | |
1101 | *smo = *smosync; | |
1102 | ||
428870ff BB |
1103 | msp->ms_deferspace += defer_delta; |
1104 | ASSERT3S(msp->ms_deferspace, >=, 0); | |
1105 | ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); | |
1106 | if (msp->ms_deferspace != 0) { | |
1107 | /* | |
1108 | * Keep syncing this metaslab until all deferred frees | |
1109 | * are back in circulation. | |
1110 | */ | |
1111 | vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); | |
1112 | } | |
1113 | ||
34dc7c2f BB |
1114 | /* |
1115 | * If the map is loaded but no longer active, evict it as soon as all | |
1116 | * future allocations have synced. (If we unloaded it now and then | |
1117 | * loaded a moment later, the map wouldn't reflect those allocations.) | |
1118 | */ | |
1119 | if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
1120 | int evictable = 1; | |
1121 | ||
d6320ddb | 1122 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) |
34dc7c2f BB |
1123 | if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) |
1124 | evictable = 0; | |
1125 | ||
428870ff | 1126 | if (evictable && !metaslab_debug) |
34dc7c2f BB |
1127 | space_map_unload(sm); |
1128 | } | |
1129 | ||
1130 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
1131 | ||
1132 | mutex_exit(&msp->ms_lock); | |
1133 | } | |
1134 | ||
428870ff BB |
1135 | void |
1136 | metaslab_sync_reassess(metaslab_group_t *mg) | |
1137 | { | |
1138 | vdev_t *vd = mg->mg_vd; | |
6d974228 | 1139 | int64_t failures = mg->mg_alloc_failures; |
d6320ddb | 1140 | int m; |
428870ff BB |
1141 | |
1142 | /* | |
1143 | * Re-evaluate all metaslabs which have lower offsets than the | |
1144 | * bonus area. | |
1145 | */ | |
d6320ddb | 1146 | for (m = 0; m < vd->vdev_ms_count; m++) { |
428870ff BB |
1147 | metaslab_t *msp = vd->vdev_ms[m]; |
1148 | ||
1149 | if (msp->ms_map.sm_start > mg->mg_bonus_area) | |
1150 | break; | |
1151 | ||
1152 | mutex_enter(&msp->ms_lock); | |
1153 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
1154 | mutex_exit(&msp->ms_lock); | |
1155 | } | |
1156 | ||
6d974228 GW |
1157 | atomic_add_64(&mg->mg_alloc_failures, -failures); |
1158 | ||
428870ff BB |
1159 | /* |
1160 | * Prefetch the next potential metaslabs | |
1161 | */ | |
1162 | metaslab_prefetch(mg); | |
1163 | } | |
1164 | ||
34dc7c2f BB |
1165 | static uint64_t |
1166 | metaslab_distance(metaslab_t *msp, dva_t *dva) | |
1167 | { | |
1168 | uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; | |
1169 | uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; | |
1170 | uint64_t start = msp->ms_map.sm_start >> ms_shift; | |
1171 | ||
1172 | if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) | |
1173 | return (1ULL << 63); | |
1174 | ||
1175 | if (offset < start) | |
1176 | return ((start - offset) << ms_shift); | |
1177 | if (offset > start) | |
1178 | return ((offset - start) << ms_shift); | |
1179 | return (0); | |
1180 | } | |
1181 | ||
1182 | static uint64_t | |
6d974228 GW |
1183 | metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, |
1184 | uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) | |
34dc7c2f | 1185 | { |
6d974228 | 1186 | spa_t *spa = mg->mg_vd->vdev_spa; |
34dc7c2f BB |
1187 | metaslab_t *msp = NULL; |
1188 | uint64_t offset = -1ULL; | |
1189 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
1190 | uint64_t activation_weight; | |
1191 | uint64_t target_distance; | |
1192 | int i; | |
1193 | ||
1194 | activation_weight = METASLAB_WEIGHT_PRIMARY; | |
9babb374 BB |
1195 | for (i = 0; i < d; i++) { |
1196 | if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { | |
34dc7c2f | 1197 | activation_weight = METASLAB_WEIGHT_SECONDARY; |
9babb374 BB |
1198 | break; |
1199 | } | |
1200 | } | |
34dc7c2f BB |
1201 | |
1202 | for (;;) { | |
9babb374 BB |
1203 | boolean_t was_active; |
1204 | ||
34dc7c2f BB |
1205 | mutex_enter(&mg->mg_lock); |
1206 | for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { | |
6d974228 GW |
1207 | if (msp->ms_weight < asize) { |
1208 | spa_dbgmsg(spa, "%s: failed to meet weight " | |
1209 | "requirement: vdev %llu, txg %llu, mg %p, " | |
1210 | "msp %p, psize %llu, asize %llu, " | |
1211 | "failures %llu, weight %llu", | |
1212 | spa_name(spa), mg->mg_vd->vdev_id, txg, | |
1213 | mg, msp, psize, asize, | |
1214 | mg->mg_alloc_failures, msp->ms_weight); | |
34dc7c2f BB |
1215 | mutex_exit(&mg->mg_lock); |
1216 | return (-1ULL); | |
1217 | } | |
9babb374 | 1218 | was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; |
34dc7c2f BB |
1219 | if (activation_weight == METASLAB_WEIGHT_PRIMARY) |
1220 | break; | |
1221 | ||
1222 | target_distance = min_distance + | |
1223 | (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); | |
1224 | ||
1225 | for (i = 0; i < d; i++) | |
1226 | if (metaslab_distance(msp, &dva[i]) < | |
1227 | target_distance) | |
1228 | break; | |
1229 | if (i == d) | |
1230 | break; | |
1231 | } | |
1232 | mutex_exit(&mg->mg_lock); | |
1233 | if (msp == NULL) | |
1234 | return (-1ULL); | |
1235 | ||
6d974228 GW |
1236 | /* |
1237 | * If we've already reached the allowable number of failed | |
1238 | * allocation attempts on this metaslab group then we | |
1239 | * consider skipping it. We skip it only if we're allowed | |
1240 | * to "fast" gang, the physical size is larger than | |
1241 | * a gang block, and we're attempting to allocate from | |
1242 | * the primary metaslab. | |
1243 | */ | |
1244 | if (mg->mg_alloc_failures > zfs_mg_alloc_failures && | |
1245 | CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && | |
1246 | activation_weight == METASLAB_WEIGHT_PRIMARY) { | |
1247 | spa_dbgmsg(spa, "%s: skipping metaslab group: " | |
1248 | "vdev %llu, txg %llu, mg %p, psize %llu, " | |
1249 | "asize %llu, failures %llu", spa_name(spa), | |
1250 | mg->mg_vd->vdev_id, txg, mg, psize, asize, | |
1251 | mg->mg_alloc_failures); | |
1252 | return (-1ULL); | |
1253 | } | |
1254 | ||
34dc7c2f BB |
1255 | mutex_enter(&msp->ms_lock); |
1256 | ||
1257 | /* | |
1258 | * Ensure that the metaslab we have selected is still | |
1259 | * capable of handling our request. It's possible that | |
1260 | * another thread may have changed the weight while we | |
1261 | * were blocked on the metaslab lock. | |
1262 | */ | |
6d974228 | 1263 | if (msp->ms_weight < asize || (was_active && |
9babb374 BB |
1264 | !(msp->ms_weight & METASLAB_ACTIVE_MASK) && |
1265 | activation_weight == METASLAB_WEIGHT_PRIMARY)) { | |
34dc7c2f BB |
1266 | mutex_exit(&msp->ms_lock); |
1267 | continue; | |
1268 | } | |
1269 | ||
1270 | if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && | |
1271 | activation_weight == METASLAB_WEIGHT_PRIMARY) { | |
1272 | metaslab_passivate(msp, | |
1273 | msp->ms_weight & ~METASLAB_ACTIVE_MASK); | |
1274 | mutex_exit(&msp->ms_lock); | |
1275 | continue; | |
1276 | } | |
1277 | ||
6d974228 | 1278 | if (metaslab_activate(msp, activation_weight) != 0) { |
34dc7c2f BB |
1279 | mutex_exit(&msp->ms_lock); |
1280 | continue; | |
1281 | } | |
1282 | ||
6d974228 | 1283 | if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) |
34dc7c2f BB |
1284 | break; |
1285 | ||
6d974228 GW |
1286 | atomic_inc_64(&mg->mg_alloc_failures); |
1287 | ||
428870ff | 1288 | metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); |
34dc7c2f BB |
1289 | |
1290 | mutex_exit(&msp->ms_lock); | |
1291 | } | |
1292 | ||
1293 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) | |
1294 | vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); | |
1295 | ||
6d974228 | 1296 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); |
34dc7c2f BB |
1297 | |
1298 | mutex_exit(&msp->ms_lock); | |
1299 | ||
1300 | return (offset); | |
1301 | } | |
1302 | ||
/*
 * Allocate a block for the specified i/o.
 *
 * Fills in dva[d] with a newly allocated location for psize bytes in
 * metaslab class mc.  dva[0 .. d-1] are the copies already allocated
 * for the same block; hintdva, if non-NULL, supplies a preferred vdev.
 *
 * Returns 0 on success.  On failure dva[d] is zeroed and ENOSPC is
 * returned.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed).  Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		/* Continue on the group after the previous copy's vdev. */
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	/*
	 * One full trip around the ring of groups, starting and ending
	 * at 'rotor'.  all_zero stays true only if no group visited
	 * could have behaved differently with a smaller distance
	 * requirement (i.e. a larger dshift).
	 */
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		/*
		 * Required separation from earlier copies: a fraction
		 * (1 / 2^dshift) of the vdev, or none at all once that
		 * fraction is no larger than a single metaslab.
		 */
		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				/* Percent used; +1 avoids dividing by 0. */
				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			/* Group has had its (biased) share: advance. */
			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/* Relax the distance requirement and go around again. */
	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	/*
	 * 'allocatable' here reflects the last group visited.  If it was
	 * not allocatable, retry once from scratch, this time evaluating
	 * vdev_allocatable() under SCL_ZIO.
	 */
	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}
1490 | ||
1491 | /* | |
1492 | * Free the block represented by DVA in the context of the specified | |
1493 | * transaction group. | |
1494 | */ | |
1495 | static void | |
1496 | metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) | |
1497 | { | |
1498 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1499 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1500 | uint64_t size = DVA_GET_ASIZE(dva); | |
1501 | vdev_t *vd; | |
1502 | metaslab_t *msp; | |
1503 | ||
1504 | ASSERT(DVA_IS_VALID(dva)); | |
1505 | ||
1506 | if (txg > spa_freeze_txg(spa)) | |
1507 | return; | |
1508 | ||
1509 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1510 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { | |
1511 | cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", | |
1512 | (u_longlong_t)vdev, (u_longlong_t)offset); | |
1513 | ASSERT(0); | |
1514 | return; | |
1515 | } | |
1516 | ||
1517 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1518 | ||
1519 | if (DVA_GET_GANG(dva)) | |
1520 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1521 | ||
1522 | mutex_enter(&msp->ms_lock); | |
1523 | ||
1524 | if (now) { | |
1525 | space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], | |
1526 | offset, size); | |
1527 | space_map_free(&msp->ms_map, offset, size); | |
1528 | } else { | |
1529 | if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) | |
1530 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1531 | space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); | |
34dc7c2f BB |
1532 | } |
1533 | ||
1534 | mutex_exit(&msp->ms_lock); | |
1535 | } | |
1536 | ||
1537 | /* | |
1538 | * Intent log support: upon opening the pool after a crash, notify the SPA | |
1539 | * of blocks that the intent log has allocated for immediate write, but | |
1540 | * which are still considered free by the SPA because the last transaction | |
1541 | * group didn't commit yet. | |
1542 | */ | |
1543 | static int | |
1544 | metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) | |
1545 | { | |
1546 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1547 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1548 | uint64_t size = DVA_GET_ASIZE(dva); | |
1549 | vdev_t *vd; | |
1550 | metaslab_t *msp; | |
428870ff | 1551 | int error = 0; |
34dc7c2f BB |
1552 | |
1553 | ASSERT(DVA_IS_VALID(dva)); | |
1554 | ||
1555 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1556 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) | |
1557 | return (ENXIO); | |
1558 | ||
1559 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1560 | ||
1561 | if (DVA_GET_GANG(dva)) | |
1562 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1563 | ||
1564 | mutex_enter(&msp->ms_lock); | |
1565 | ||
428870ff | 1566 | if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) |
6d974228 | 1567 | error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); |
428870ff BB |
1568 | |
1569 | if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) | |
1570 | error = ENOENT; | |
1571 | ||
b128c09f | 1572 | if (error || txg == 0) { /* txg == 0 indicates dry run */ |
34dc7c2f BB |
1573 | mutex_exit(&msp->ms_lock); |
1574 | return (error); | |
1575 | } | |
1576 | ||
34dc7c2f | 1577 | space_map_claim(&msp->ms_map, offset, size); |
b128c09f | 1578 | |
fb5f0bc8 | 1579 | if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ |
b128c09f BB |
1580 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) |
1581 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1582 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | |
1583 | } | |
34dc7c2f BB |
1584 | |
1585 | mutex_exit(&msp->ms_lock); | |
1586 | ||
1587 | return (0); | |
1588 | } | |
1589 | ||
1590 | int | |
1591 | metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, | |
b128c09f | 1592 | int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) |
34dc7c2f BB |
1593 | { |
1594 | dva_t *dva = bp->blk_dva; | |
1595 | dva_t *hintdva = hintbp->blk_dva; | |
d6320ddb | 1596 | int d, error = 0; |
34dc7c2f | 1597 | |
b128c09f | 1598 | ASSERT(bp->blk_birth == 0); |
428870ff | 1599 | ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); |
b128c09f BB |
1600 | |
1601 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1602 | ||
1603 | if (mc->mc_rotor == NULL) { /* no vdevs in this class */ | |
1604 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
34dc7c2f | 1605 | return (ENOSPC); |
b128c09f | 1606 | } |
34dc7c2f BB |
1607 | |
1608 | ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); | |
1609 | ASSERT(BP_GET_NDVAS(bp) == 0); | |
1610 | ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); | |
1611 | ||
d6320ddb | 1612 | for (d = 0; d < ndvas; d++) { |
34dc7c2f | 1613 | error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, |
b128c09f | 1614 | txg, flags); |
34dc7c2f BB |
1615 | if (error) { |
1616 | for (d--; d >= 0; d--) { | |
1617 | metaslab_free_dva(spa, &dva[d], txg, B_TRUE); | |
1618 | bzero(&dva[d], sizeof (dva_t)); | |
1619 | } | |
b128c09f | 1620 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
34dc7c2f BB |
1621 | return (error); |
1622 | } | |
1623 | } | |
1624 | ASSERT(error == 0); | |
1625 | ASSERT(BP_GET_NDVAS(bp) == ndvas); | |
1626 | ||
b128c09f BB |
1627 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
1628 | ||
428870ff | 1629 | BP_SET_BIRTH(bp, txg, txg); |
b128c09f | 1630 | |
34dc7c2f BB |
1631 | return (0); |
1632 | } | |
1633 | ||
1634 | void | |
1635 | metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) | |
1636 | { | |
1637 | const dva_t *dva = bp->blk_dva; | |
d6320ddb | 1638 | int d, ndvas = BP_GET_NDVAS(bp); |
34dc7c2f BB |
1639 | |
1640 | ASSERT(!BP_IS_HOLE(bp)); | |
428870ff | 1641 | ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); |
b128c09f BB |
1642 | |
1643 | spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); | |
34dc7c2f | 1644 | |
d6320ddb | 1645 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1646 | metaslab_free_dva(spa, &dva[d], txg, now); |
b128c09f BB |
1647 | |
1648 | spa_config_exit(spa, SCL_FREE, FTAG); | |
34dc7c2f BB |
1649 | } |
1650 | ||
1651 | int | |
1652 | metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) | |
1653 | { | |
1654 | const dva_t *dva = bp->blk_dva; | |
1655 | int ndvas = BP_GET_NDVAS(bp); | |
d6320ddb | 1656 | int d, error = 0; |
34dc7c2f BB |
1657 | |
1658 | ASSERT(!BP_IS_HOLE(bp)); | |
1659 | ||
b128c09f BB |
1660 | if (txg != 0) { |
1661 | /* | |
1662 | * First do a dry run to make sure all DVAs are claimable, | |
1663 | * so we don't have to unwind from partial failures below. | |
1664 | */ | |
1665 | if ((error = metaslab_claim(spa, bp, 0)) != 0) | |
1666 | return (error); | |
1667 | } | |
1668 | ||
1669 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1670 | ||
d6320ddb | 1671 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1672 | if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) |
b128c09f BB |
1673 | break; |
1674 | ||
1675 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
1676 | ||
1677 | ASSERT(error == 0 || txg == 0); | |
34dc7c2f | 1678 | |
b128c09f | 1679 | return (error); |
34dc7c2f | 1680 | } |