]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
ebf8e3a2 | 23 | * Copyright (c) 2012 by Delphix. All rights reserved. |
34dc7c2f BB |
24 | */ |
25 | ||
34dc7c2f | 26 | #include <sys/zfs_context.h> |
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_tx.h> | |
29 | #include <sys/space_map.h> | |
30 | #include <sys/metaslab_impl.h> | |
31 | #include <sys/vdev_impl.h> | |
32 | #include <sys/zio.h> | |
33 | ||
6d974228 GW |
34 | #define WITH_DF_BLOCK_ALLOCATOR |
35 | ||
36 | /* | |
37 | * Allow allocations to switch to gang blocks quickly. We do this to | |
38 | * avoid having to load lots of space_maps in a given txg. There are, | |
39 | * however, some cases where we want to avoid "fast" ganging and instead | |
40 | * we want to do an exhaustive search of all metaslabs on this device. | |
ebf8e3a2 | 41 | * Currently we don't allow any gang, zil, or dump device related allocations |
6d974228 GW |
42 | * to "fast" gang. |
43 | */ | |
44 | #define CAN_FASTGANG(flags) \ | |
45 | (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ | |
46 | METASLAB_GANG_AVOID))) | |
22c81dd8 | 47 | |
34dc7c2f BB |
48 | uint64_t metaslab_aliquot = 512ULL << 10; |
49 | uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ | |
50 | ||
6d974228 GW |
51 | /* |
52 | * This value defines the number of allowed allocation failures per vdev. | |
53 | * If a device reaches this threshold in a given txg then we consider skipping | |
54 | * allocations on that device. | |
55 | */ | |
56 | int zfs_mg_alloc_failures; | |
57 | ||
428870ff BB |
58 | /* |
59 | * Metaslab debugging: when set, keeps all space maps in core to verify frees. | |
60 | */ | |
30b92c1d | 61 | int metaslab_debug = 0; |
428870ff | 62 | |
9babb374 BB |
63 | /* |
64 | * Minimum size which forces the dynamic allocator to change | |
428870ff | 65 | * its allocation strategy. Once the space map cannot satisfy |
9babb374 BB |
66 | * an allocation of this size then it switches to using a more |
67 | * aggressive strategy (i.e. search by size rather than offset). | |
68 | */ | |
69 | uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; | |
70 | ||
71 | /* | |
72 | * The minimum free space, in percent, which must be available | |
73 | * in a space map to continue allocations in a first-fit fashion. | |
74 | * Once the space_map's free space drops below this level we dynamically | |
75 | * switch to using best-fit allocations. | |
76 | */ | |
428870ff BB |
77 | int metaslab_df_free_pct = 4; |
78 | ||
79 | /* | |
80 | * A metaslab is considered "free" if it contains a contiguous | |
81 | * segment which is greater than metaslab_min_alloc_size. | |
82 | */ | |
83 | uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; | |
84 | ||
85 | /* | |
86 | * Max number of space_maps to prefetch. | |
87 | */ | |
88 | int metaslab_prefetch_limit = SPA_DVAS_PER_BP; | |
89 | ||
90 | /* | |
91 | * Percentage bonus multiplier for metaslabs that are in the bonus area. | |
92 | */ | |
93 | int metaslab_smo_bonus_pct = 150; | |
9babb374 | 94 | |
34dc7c2f BB |
95 | /* |
96 | * ========================================================================== | |
97 | * Metaslab classes | |
98 | * ========================================================================== | |
99 | */ | |
100 | metaslab_class_t * | |
428870ff | 101 | metaslab_class_create(spa_t *spa, space_map_ops_t *ops) |
34dc7c2f BB |
102 | { |
103 | metaslab_class_t *mc; | |
104 | ||
b8d06fca | 105 | mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE); |
34dc7c2f | 106 | |
428870ff | 107 | mc->mc_spa = spa; |
34dc7c2f | 108 | mc->mc_rotor = NULL; |
9babb374 | 109 | mc->mc_ops = ops; |
920dd524 | 110 | mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); |
34dc7c2f BB |
111 | |
112 | return (mc); | |
113 | } | |
114 | ||
115 | void | |
116 | metaslab_class_destroy(metaslab_class_t *mc) | |
117 | { | |
428870ff BB |
118 | ASSERT(mc->mc_rotor == NULL); |
119 | ASSERT(mc->mc_alloc == 0); | |
120 | ASSERT(mc->mc_deferred == 0); | |
121 | ASSERT(mc->mc_space == 0); | |
122 | ASSERT(mc->mc_dspace == 0); | |
34dc7c2f | 123 | |
920dd524 | 124 | mutex_destroy(&mc->mc_fastwrite_lock); |
34dc7c2f BB |
125 | kmem_free(mc, sizeof (metaslab_class_t)); |
126 | } | |
127 | ||
428870ff BB |
128 | int |
129 | metaslab_class_validate(metaslab_class_t *mc) | |
34dc7c2f | 130 | { |
428870ff BB |
131 | metaslab_group_t *mg; |
132 | vdev_t *vd; | |
34dc7c2f | 133 | |
428870ff BB |
134 | /* |
135 | * Must hold one of the spa_config locks. | |
136 | */ | |
137 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || | |
138 | spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); | |
34dc7c2f | 139 | |
428870ff BB |
140 | if ((mg = mc->mc_rotor) == NULL) |
141 | return (0); | |
142 | ||
143 | do { | |
144 | vd = mg->mg_vd; | |
145 | ASSERT(vd->vdev_mg != NULL); | |
146 | ASSERT3P(vd->vdev_top, ==, vd); | |
147 | ASSERT3P(mg->mg_class, ==, mc); | |
148 | ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); | |
149 | } while ((mg = mg->mg_next) != mc->mc_rotor); | |
150 | ||
151 | return (0); | |
34dc7c2f BB |
152 | } |
153 | ||
154 | void | |
428870ff BB |
155 | metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, |
156 | int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) | |
34dc7c2f | 157 | { |
428870ff BB |
158 | atomic_add_64(&mc->mc_alloc, alloc_delta); |
159 | atomic_add_64(&mc->mc_deferred, defer_delta); | |
160 | atomic_add_64(&mc->mc_space, space_delta); | |
161 | atomic_add_64(&mc->mc_dspace, dspace_delta); | |
162 | } | |
34dc7c2f | 163 | |
428870ff BB |
164 | uint64_t |
165 | metaslab_class_get_alloc(metaslab_class_t *mc) | |
166 | { | |
167 | return (mc->mc_alloc); | |
168 | } | |
34dc7c2f | 169 | |
428870ff BB |
170 | uint64_t |
171 | metaslab_class_get_deferred(metaslab_class_t *mc) | |
172 | { | |
173 | return (mc->mc_deferred); | |
174 | } | |
34dc7c2f | 175 | |
428870ff BB |
176 | uint64_t |
177 | metaslab_class_get_space(metaslab_class_t *mc) | |
178 | { | |
179 | return (mc->mc_space); | |
180 | } | |
34dc7c2f | 181 | |
428870ff BB |
182 | uint64_t |
183 | metaslab_class_get_dspace(metaslab_class_t *mc) | |
184 | { | |
185 | return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); | |
34dc7c2f BB |
186 | } |
187 | ||
188 | /* | |
189 | * ========================================================================== | |
190 | * Metaslab groups | |
191 | * ========================================================================== | |
192 | */ | |
193 | static int | |
194 | metaslab_compare(const void *x1, const void *x2) | |
195 | { | |
196 | const metaslab_t *m1 = x1; | |
197 | const metaslab_t *m2 = x2; | |
198 | ||
199 | if (m1->ms_weight < m2->ms_weight) | |
200 | return (1); | |
201 | if (m1->ms_weight > m2->ms_weight) | |
202 | return (-1); | |
203 | ||
204 | /* | |
205 | * If the weights are identical, use the offset to force uniqueness. | |
206 | */ | |
207 | if (m1->ms_map.sm_start < m2->ms_map.sm_start) | |
208 | return (-1); | |
209 | if (m1->ms_map.sm_start > m2->ms_map.sm_start) | |
210 | return (1); | |
211 | ||
212 | ASSERT3P(m1, ==, m2); | |
213 | ||
214 | return (0); | |
215 | } | |
216 | ||
217 | metaslab_group_t * | |
218 | metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) | |
219 | { | |
220 | metaslab_group_t *mg; | |
221 | ||
b8d06fca | 222 | mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE); |
34dc7c2f BB |
223 | mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); |
224 | avl_create(&mg->mg_metaslab_tree, metaslab_compare, | |
225 | sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); | |
34dc7c2f | 226 | mg->mg_vd = vd; |
428870ff BB |
227 | mg->mg_class = mc; |
228 | mg->mg_activation_count = 0; | |
34dc7c2f BB |
229 | |
230 | return (mg); | |
231 | } | |
232 | ||
233 | void | |
234 | metaslab_group_destroy(metaslab_group_t *mg) | |
235 | { | |
428870ff BB |
236 | ASSERT(mg->mg_prev == NULL); |
237 | ASSERT(mg->mg_next == NULL); | |
238 | /* | |
239 | * We may have gone below zero with the activation count | |
240 | * either because we never activated in the first place or | |
241 | * because we're done, and possibly removing the vdev. | |
242 | */ | |
243 | ASSERT(mg->mg_activation_count <= 0); | |
244 | ||
34dc7c2f BB |
245 | avl_destroy(&mg->mg_metaslab_tree); |
246 | mutex_destroy(&mg->mg_lock); | |
247 | kmem_free(mg, sizeof (metaslab_group_t)); | |
248 | } | |
249 | ||
428870ff BB |
250 | void |
251 | metaslab_group_activate(metaslab_group_t *mg) | |
252 | { | |
253 | metaslab_class_t *mc = mg->mg_class; | |
254 | metaslab_group_t *mgprev, *mgnext; | |
255 | ||
256 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
257 | ||
258 | ASSERT(mc->mc_rotor != mg); | |
259 | ASSERT(mg->mg_prev == NULL); | |
260 | ASSERT(mg->mg_next == NULL); | |
261 | ASSERT(mg->mg_activation_count <= 0); | |
262 | ||
263 | if (++mg->mg_activation_count <= 0) | |
264 | return; | |
265 | ||
266 | mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); | |
267 | ||
268 | if ((mgprev = mc->mc_rotor) == NULL) { | |
269 | mg->mg_prev = mg; | |
270 | mg->mg_next = mg; | |
271 | } else { | |
272 | mgnext = mgprev->mg_next; | |
273 | mg->mg_prev = mgprev; | |
274 | mg->mg_next = mgnext; | |
275 | mgprev->mg_next = mg; | |
276 | mgnext->mg_prev = mg; | |
277 | } | |
278 | mc->mc_rotor = mg; | |
279 | } | |
280 | ||
281 | void | |
282 | metaslab_group_passivate(metaslab_group_t *mg) | |
283 | { | |
284 | metaslab_class_t *mc = mg->mg_class; | |
285 | metaslab_group_t *mgprev, *mgnext; | |
286 | ||
287 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
288 | ||
289 | if (--mg->mg_activation_count != 0) { | |
290 | ASSERT(mc->mc_rotor != mg); | |
291 | ASSERT(mg->mg_prev == NULL); | |
292 | ASSERT(mg->mg_next == NULL); | |
293 | ASSERT(mg->mg_activation_count < 0); | |
294 | return; | |
295 | } | |
296 | ||
297 | mgprev = mg->mg_prev; | |
298 | mgnext = mg->mg_next; | |
299 | ||
300 | if (mg == mgnext) { | |
301 | mc->mc_rotor = NULL; | |
302 | } else { | |
303 | mc->mc_rotor = mgnext; | |
304 | mgprev->mg_next = mgnext; | |
305 | mgnext->mg_prev = mgprev; | |
306 | } | |
307 | ||
308 | mg->mg_prev = NULL; | |
309 | mg->mg_next = NULL; | |
310 | } | |
311 | ||
34dc7c2f BB |
312 | static void |
313 | metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) | |
314 | { | |
315 | mutex_enter(&mg->mg_lock); | |
316 | ASSERT(msp->ms_group == NULL); | |
317 | msp->ms_group = mg; | |
318 | msp->ms_weight = 0; | |
319 | avl_add(&mg->mg_metaslab_tree, msp); | |
320 | mutex_exit(&mg->mg_lock); | |
321 | } | |
322 | ||
323 | static void | |
324 | metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) | |
325 | { | |
326 | mutex_enter(&mg->mg_lock); | |
327 | ASSERT(msp->ms_group == mg); | |
328 | avl_remove(&mg->mg_metaslab_tree, msp); | |
329 | msp->ms_group = NULL; | |
330 | mutex_exit(&mg->mg_lock); | |
331 | } | |
332 | ||
333 | static void | |
334 | metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) | |
335 | { | |
336 | /* | |
337 | * Although in principle the weight can be any value, in | |
338 | * practice we do not use values in the range [1, 510]. | |
339 | */ | |
340 | ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); | |
341 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
342 | ||
343 | mutex_enter(&mg->mg_lock); | |
344 | ASSERT(msp->ms_group == mg); | |
345 | avl_remove(&mg->mg_metaslab_tree, msp); | |
346 | msp->ms_weight = weight; | |
347 | avl_add(&mg->mg_metaslab_tree, msp); | |
348 | mutex_exit(&mg->mg_lock); | |
349 | } | |
350 | ||
428870ff BB |
351 | /* |
352 | * ========================================================================== | |
353 | * Common allocator routines | |
354 | * ========================================================================== | |
355 | */ | |
356 | static int | |
357 | metaslab_segsize_compare(const void *x1, const void *x2) | |
358 | { | |
359 | const space_seg_t *s1 = x1; | |
360 | const space_seg_t *s2 = x2; | |
361 | uint64_t ss_size1 = s1->ss_end - s1->ss_start; | |
362 | uint64_t ss_size2 = s2->ss_end - s2->ss_start; | |
363 | ||
364 | if (ss_size1 < ss_size2) | |
365 | return (-1); | |
366 | if (ss_size1 > ss_size2) | |
367 | return (1); | |
368 | ||
369 | if (s1->ss_start < s2->ss_start) | |
370 | return (-1); | |
371 | if (s1->ss_start > s2->ss_start) | |
372 | return (1); | |
373 | ||
374 | return (0); | |
375 | } | |
376 | ||
22c81dd8 BB |
377 | #if defined(WITH_FF_BLOCK_ALLOCATOR) || \ |
378 | defined(WITH_DF_BLOCK_ALLOCATOR) || \ | |
379 | defined(WITH_CDF_BLOCK_ALLOCATOR) | |
34dc7c2f | 380 | /* |
9babb374 BB |
381 | * This is a helper function that can be used by the allocator to find |
382 | * a suitable block to allocate. This will search the specified AVL | |
383 | * tree looking for a block that matches the specified criteria. | |
34dc7c2f | 384 | */ |
34dc7c2f | 385 | static uint64_t |
9babb374 BB |
386 | metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, |
387 | uint64_t align) | |
34dc7c2f | 388 | { |
34dc7c2f BB |
389 | space_seg_t *ss, ssearch; |
390 | avl_index_t where; | |
391 | ||
392 | ssearch.ss_start = *cursor; | |
393 | ssearch.ss_end = *cursor + size; | |
394 | ||
395 | ss = avl_find(t, &ssearch, &where); | |
396 | if (ss == NULL) | |
397 | ss = avl_nearest(t, where, AVL_AFTER); | |
398 | ||
399 | while (ss != NULL) { | |
400 | uint64_t offset = P2ROUNDUP(ss->ss_start, align); | |
401 | ||
402 | if (offset + size <= ss->ss_end) { | |
403 | *cursor = offset + size; | |
404 | return (offset); | |
405 | } | |
406 | ss = AVL_NEXT(t, ss); | |
407 | } | |
408 | ||
409 | /* | |
410 | * If we know we've searched the whole map (*cursor == 0), give up. | |
411 | * Otherwise, reset the cursor to the beginning and try again. | |
412 | */ | |
413 | if (*cursor == 0) | |
414 | return (-1ULL); | |
415 | ||
416 | *cursor = 0; | |
9babb374 BB |
417 | return (metaslab_block_picker(t, cursor, size, align)); |
418 | } | |
22c81dd8 | 419 | #endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */ |
9babb374 | 420 | |
9babb374 | 421 | static void |
428870ff | 422 | metaslab_pp_load(space_map_t *sm) |
9babb374 | 423 | { |
428870ff BB |
424 | space_seg_t *ss; |
425 | ||
9babb374 | 426 | ASSERT(sm->sm_ppd == NULL); |
b8d06fca | 427 | sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE); |
428870ff | 428 | |
b8d06fca | 429 | sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE); |
428870ff BB |
430 | avl_create(sm->sm_pp_root, metaslab_segsize_compare, |
431 | sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); | |
432 | ||
433 | for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) | |
434 | avl_add(sm->sm_pp_root, ss); | |
9babb374 BB |
435 | } |
436 | ||
437 | static void | |
428870ff | 438 | metaslab_pp_unload(space_map_t *sm) |
9babb374 | 439 | { |
428870ff BB |
440 | void *cookie = NULL; |
441 | ||
9babb374 BB |
442 | kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); |
443 | sm->sm_ppd = NULL; | |
9babb374 | 444 | |
428870ff BB |
445 | while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { |
446 | /* tear down the tree */ | |
447 | } | |
9babb374 | 448 | |
428870ff BB |
449 | avl_destroy(sm->sm_pp_root); |
450 | kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); | |
451 | sm->sm_pp_root = NULL; | |
34dc7c2f BB |
452 | } |
453 | ||
454 | /* ARGSUSED */ | |
455 | static void | |
428870ff | 456 | metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
457 | { |
458 | /* No need to update cursor */ | |
459 | } | |
460 | ||
461 | /* ARGSUSED */ | |
462 | static void | |
428870ff | 463 | metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
464 | { |
465 | /* No need to update cursor */ | |
466 | } | |
467 | ||
9babb374 | 468 | /* |
428870ff | 469 | * Return the maximum contiguous segment within the metaslab. |
9babb374 | 470 | */ |
9babb374 | 471 | uint64_t |
428870ff | 472 | metaslab_pp_maxsize(space_map_t *sm) |
9babb374 BB |
473 | { |
474 | avl_tree_t *t = sm->sm_pp_root; | |
475 | space_seg_t *ss; | |
476 | ||
477 | if (t == NULL || (ss = avl_last(t)) == NULL) | |
478 | return (0ULL); | |
479 | ||
480 | return (ss->ss_end - ss->ss_start); | |
481 | } | |
482 | ||
22c81dd8 | 483 | #if defined(WITH_FF_BLOCK_ALLOCATOR) |
428870ff BB |
484 | /* |
485 | * ========================================================================== | |
486 | * The first-fit block allocator | |
487 | * ========================================================================== | |
488 | */ | |
489 | static uint64_t | |
490 | metaslab_ff_alloc(space_map_t *sm, uint64_t size) | |
9babb374 | 491 | { |
428870ff BB |
492 | avl_tree_t *t = &sm->sm_root; |
493 | uint64_t align = size & -size; | |
494 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
9babb374 | 495 | |
428870ff | 496 | return (metaslab_block_picker(t, cursor, size, align)); |
9babb374 BB |
497 | } |
498 | ||
428870ff BB |
499 | /* ARGSUSED */ |
500 | boolean_t | |
501 | metaslab_ff_fragmented(space_map_t *sm) | |
9babb374 | 502 | { |
428870ff | 503 | return (B_TRUE); |
9babb374 BB |
504 | } |
505 | ||
428870ff BB |
506 | static space_map_ops_t metaslab_ff_ops = { |
507 | metaslab_pp_load, | |
508 | metaslab_pp_unload, | |
509 | metaslab_ff_alloc, | |
510 | metaslab_pp_claim, | |
511 | metaslab_pp_free, | |
512 | metaslab_pp_maxsize, | |
513 | metaslab_ff_fragmented | |
514 | }; | |
9babb374 | 515 | |
22c81dd8 BB |
516 | space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; |
517 | #endif /* WITH_FF_BLOCK_ALLOCATOR */ | |
518 | ||
519 | #if defined(WITH_DF_BLOCK_ALLOCATOR) | |
428870ff BB |
520 | /* |
521 | * ========================================================================== | |
522 | * Dynamic block allocator - | |
523 | * Uses the first fit allocation scheme until space gets low and then | |
524 | * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold | |
525 | * and metaslab_df_free_pct to determine when to switch the allocation scheme. | |
526 | * ========================================================================== | |
527 | */ | |
9babb374 BB |
528 | static uint64_t |
529 | metaslab_df_alloc(space_map_t *sm, uint64_t size) | |
530 | { | |
531 | avl_tree_t *t = &sm->sm_root; | |
532 | uint64_t align = size & -size; | |
533 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
428870ff | 534 | uint64_t max_size = metaslab_pp_maxsize(sm); |
9babb374 BB |
535 | int free_pct = sm->sm_space * 100 / sm->sm_size; |
536 | ||
537 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
538 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
539 | ||
540 | if (max_size < size) | |
541 | return (-1ULL); | |
542 | ||
543 | /* | |
544 | * If we're running low on space switch to using the size | |
545 | * sorted AVL tree (best-fit). | |
546 | */ | |
547 | if (max_size < metaslab_df_alloc_threshold || | |
548 | free_pct < metaslab_df_free_pct) { | |
549 | t = sm->sm_pp_root; | |
550 | *cursor = 0; | |
551 | } | |
552 | ||
553 | return (metaslab_block_picker(t, cursor, size, 1ULL)); | |
554 | } | |
555 | ||
428870ff BB |
556 | static boolean_t |
557 | metaslab_df_fragmented(space_map_t *sm) | |
9babb374 | 558 | { |
428870ff BB |
559 | uint64_t max_size = metaslab_pp_maxsize(sm); |
560 | int free_pct = sm->sm_space * 100 / sm->sm_size; | |
9babb374 | 561 | |
428870ff BB |
562 | if (max_size >= metaslab_df_alloc_threshold && |
563 | free_pct >= metaslab_df_free_pct) | |
564 | return (B_FALSE); | |
565 | ||
566 | return (B_TRUE); | |
9babb374 BB |
567 | } |
568 | ||
569 | static space_map_ops_t metaslab_df_ops = { | |
428870ff BB |
570 | metaslab_pp_load, |
571 | metaslab_pp_unload, | |
9babb374 | 572 | metaslab_df_alloc, |
428870ff BB |
573 | metaslab_pp_claim, |
574 | metaslab_pp_free, | |
575 | metaslab_pp_maxsize, | |
576 | metaslab_df_fragmented | |
34dc7c2f BB |
577 | }; |
578 | ||
22c81dd8 BB |
579 | space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; |
580 | #endif /* WITH_DF_BLOCK_ALLOCATOR */ | |
581 | ||
428870ff BB |
582 | /* |
583 | * ========================================================================== | |
584 | * Other experimental allocators | |
585 | * ========================================================================== | |
586 | */ | |
22c81dd8 | 587 | #if defined(WITH_CDF_BLOCK_ALLOCATOR) |
428870ff BB |
588 | static uint64_t |
589 | metaslab_cdf_alloc(space_map_t *sm, uint64_t size) | |
590 | { | |
591 | avl_tree_t *t = &sm->sm_root; | |
592 | uint64_t *cursor = (uint64_t *)sm->sm_ppd; | |
593 | uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; | |
594 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
595 | uint64_t rsize = size; | |
596 | uint64_t offset = 0; | |
597 | ||
598 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
599 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
600 | ||
601 | if (max_size < size) | |
602 | return (-1ULL); | |
603 | ||
604 | ASSERT3U(*extent_end, >=, *cursor); | |
605 | ||
606 | /* | |
607 | * If we're running low on space switch to using the size | |
608 | * sorted AVL tree (best-fit). | |
609 | */ | |
610 | if ((*cursor + size) > *extent_end) { | |
611 | ||
612 | t = sm->sm_pp_root; | |
613 | *cursor = *extent_end = 0; | |
614 | ||
615 | if (max_size > 2 * SPA_MAXBLOCKSIZE) | |
616 | rsize = MIN(metaslab_min_alloc_size, max_size); | |
617 | offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); | |
618 | if (offset != -1) | |
619 | *cursor = offset + size; | |
620 | } else { | |
621 | offset = metaslab_block_picker(t, cursor, rsize, 1ULL); | |
622 | } | |
623 | ASSERT3U(*cursor, <=, *extent_end); | |
624 | return (offset); | |
625 | } | |
626 | ||
627 | static boolean_t | |
628 | metaslab_cdf_fragmented(space_map_t *sm) | |
629 | { | |
630 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
631 | ||
632 | if (max_size > (metaslab_min_alloc_size * 10)) | |
633 | return (B_FALSE); | |
634 | return (B_TRUE); | |
635 | } | |
636 | ||
637 | static space_map_ops_t metaslab_cdf_ops = { | |
638 | metaslab_pp_load, | |
639 | metaslab_pp_unload, | |
640 | metaslab_cdf_alloc, | |
641 | metaslab_pp_claim, | |
642 | metaslab_pp_free, | |
643 | metaslab_pp_maxsize, | |
644 | metaslab_cdf_fragmented | |
645 | }; | |
646 | ||
22c81dd8 BB |
647 | space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops; |
648 | #endif /* WITH_CDF_BLOCK_ALLOCATOR */ | |
649 | ||
650 | #if defined(WITH_NDF_BLOCK_ALLOCATOR) | |
428870ff BB |
651 | uint64_t metaslab_ndf_clump_shift = 4; |
652 | ||
653 | static uint64_t | |
654 | metaslab_ndf_alloc(space_map_t *sm, uint64_t size) | |
655 | { | |
656 | avl_tree_t *t = &sm->sm_root; | |
657 | avl_index_t where; | |
658 | space_seg_t *ss, ssearch; | |
659 | uint64_t hbit = highbit(size); | |
660 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; | |
661 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
662 | ||
663 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
664 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
665 | ||
666 | if (max_size < size) | |
667 | return (-1ULL); | |
668 | ||
669 | ssearch.ss_start = *cursor; | |
670 | ssearch.ss_end = *cursor + size; | |
671 | ||
672 | ss = avl_find(t, &ssearch, &where); | |
673 | if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { | |
674 | t = sm->sm_pp_root; | |
675 | ||
676 | ssearch.ss_start = 0; | |
677 | ssearch.ss_end = MIN(max_size, | |
678 | 1ULL << (hbit + metaslab_ndf_clump_shift)); | |
679 | ss = avl_find(t, &ssearch, &where); | |
680 | if (ss == NULL) | |
681 | ss = avl_nearest(t, where, AVL_AFTER); | |
682 | ASSERT(ss != NULL); | |
683 | } | |
684 | ||
685 | if (ss != NULL) { | |
686 | if (ss->ss_start + size <= ss->ss_end) { | |
687 | *cursor = ss->ss_start + size; | |
688 | return (ss->ss_start); | |
689 | } | |
690 | } | |
691 | return (-1ULL); | |
692 | } | |
693 | ||
694 | static boolean_t | |
695 | metaslab_ndf_fragmented(space_map_t *sm) | |
696 | { | |
697 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
698 | ||
699 | if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) | |
700 | return (B_FALSE); | |
701 | return (B_TRUE); | |
702 | } | |
703 | ||
704 | ||
705 | static space_map_ops_t metaslab_ndf_ops = { | |
706 | metaslab_pp_load, | |
707 | metaslab_pp_unload, | |
708 | metaslab_ndf_alloc, | |
709 | metaslab_pp_claim, | |
710 | metaslab_pp_free, | |
711 | metaslab_pp_maxsize, | |
712 | metaslab_ndf_fragmented | |
713 | }; | |
714 | ||
715 | space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; | |
22c81dd8 | 716 | #endif /* WITH_NDF_BLOCK_ALLOCATOR */ |
9babb374 | 717 | |
34dc7c2f BB |
718 | /* |
719 | * ========================================================================== | |
720 | * Metaslabs | |
721 | * ========================================================================== | |
722 | */ | |
723 | metaslab_t * | |
724 | metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, | |
725 | uint64_t start, uint64_t size, uint64_t txg) | |
726 | { | |
727 | vdev_t *vd = mg->mg_vd; | |
728 | metaslab_t *msp; | |
729 | ||
b8d06fca | 730 | msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE); |
34dc7c2f BB |
731 | mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); |
732 | ||
733 | msp->ms_smo_syncing = *smo; | |
734 | ||
735 | /* | |
736 | * We create the main space map here, but we don't create the | |
737 | * allocmaps and freemaps until metaslab_sync_done(). This serves | |
738 | * two purposes: it allows metaslab_sync_done() to detect the | |
739 | * addition of new space; and for debugging, it ensures that we'd | |
740 | * data fault on any attempt to use this metaslab before it's ready. | |
741 | */ | |
742 | space_map_create(&msp->ms_map, start, size, | |
743 | vd->vdev_ashift, &msp->ms_lock); | |
744 | ||
745 | metaslab_group_add(mg, msp); | |
746 | ||
428870ff BB |
747 | if (metaslab_debug && smo->smo_object != 0) { |
748 | mutex_enter(&msp->ms_lock); | |
749 | VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, | |
750 | SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); | |
751 | mutex_exit(&msp->ms_lock); | |
752 | } | |
753 | ||
34dc7c2f BB |
754 | /* |
755 | * If we're opening an existing pool (txg == 0) or creating | |
756 | * a new one (txg == TXG_INITIAL), all space is available now. | |
757 | * If we're adding space to an existing pool, the new space | |
758 | * does not become available until after this txg has synced. | |
759 | */ | |
760 | if (txg <= TXG_INITIAL) | |
761 | metaslab_sync_done(msp, 0); | |
762 | ||
763 | if (txg != 0) { | |
34dc7c2f | 764 | vdev_dirty(vd, 0, NULL, txg); |
428870ff | 765 | vdev_dirty(vd, VDD_METASLAB, msp, txg); |
34dc7c2f BB |
766 | } |
767 | ||
768 | return (msp); | |
769 | } | |
770 | ||
771 | void | |
772 | metaslab_fini(metaslab_t *msp) | |
773 | { | |
774 | metaslab_group_t *mg = msp->ms_group; | |
d6320ddb | 775 | int t; |
34dc7c2f | 776 | |
428870ff BB |
777 | vdev_space_update(mg->mg_vd, |
778 | -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); | |
34dc7c2f BB |
779 | |
780 | metaslab_group_remove(mg, msp); | |
781 | ||
782 | mutex_enter(&msp->ms_lock); | |
783 | ||
784 | space_map_unload(&msp->ms_map); | |
785 | space_map_destroy(&msp->ms_map); | |
786 | ||
d6320ddb | 787 | for (t = 0; t < TXG_SIZE; t++) { |
34dc7c2f BB |
788 | space_map_destroy(&msp->ms_allocmap[t]); |
789 | space_map_destroy(&msp->ms_freemap[t]); | |
790 | } | |
791 | ||
d6320ddb | 792 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
793 | space_map_destroy(&msp->ms_defermap[t]); |
794 | ||
795 | ASSERT3S(msp->ms_deferspace, ==, 0); | |
796 | ||
34dc7c2f BB |
797 | mutex_exit(&msp->ms_lock); |
798 | mutex_destroy(&msp->ms_lock); | |
799 | ||
800 | kmem_free(msp, sizeof (metaslab_t)); | |
801 | } | |
802 | ||
803 | #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) | |
804 | #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) | |
805 | #define METASLAB_ACTIVE_MASK \ | |
806 | (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) | |
34dc7c2f BB |
807 | |
808 | static uint64_t | |
809 | metaslab_weight(metaslab_t *msp) | |
810 | { | |
811 | metaslab_group_t *mg = msp->ms_group; | |
812 | space_map_t *sm = &msp->ms_map; | |
813 | space_map_obj_t *smo = &msp->ms_smo; | |
814 | vdev_t *vd = mg->mg_vd; | |
815 | uint64_t weight, space; | |
816 | ||
817 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
818 | ||
819 | /* | |
820 | * The baseline weight is the metaslab's free space. | |
821 | */ | |
822 | space = sm->sm_size - smo->smo_alloc; | |
823 | weight = space; | |
824 | ||
825 | /* | |
826 | * Modern disks have uniform bit density and constant angular velocity. | |
827 | * Therefore, the outer recording zones are faster (higher bandwidth) | |
828 | * than the inner zones by the ratio of outer to inner track diameter, | |
829 | * which is typically around 2:1. We account for this by assigning | |
830 | * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). | |
831 | * In effect, this means that we'll select the metaslab with the most | |
832 | * free bandwidth rather than simply the one with the most free space. | |
833 | */ | |
834 | weight = 2 * weight - | |
835 | ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; | |
836 | ASSERT(weight >= space && weight <= 2 * space); | |
837 | ||
838 | /* | |
428870ff BB |
839 | * For locality, assign higher weight to metaslabs which have |
840 | * a lower offset than what we've already activated. | |
34dc7c2f | 841 | */ |
428870ff BB |
842 | if (sm->sm_start <= mg->mg_bonus_area) |
843 | weight *= (metaslab_smo_bonus_pct / 100); | |
34dc7c2f | 844 | ASSERT(weight >= space && |
428870ff BB |
845 | weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); |
846 | ||
847 | if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { | |
848 | /* | |
849 | * If this metaslab is one we're actively using, adjust its | |
850 | * weight to make it preferable to any inactive metaslab so | |
851 | * we'll polish it off. | |
852 | */ | |
853 | weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); | |
854 | } | |
855 | return (weight); | |
856 | } | |
857 | ||
858 | static void | |
859 | metaslab_prefetch(metaslab_group_t *mg) | |
860 | { | |
861 | spa_t *spa = mg->mg_vd->vdev_spa; | |
862 | metaslab_t *msp; | |
863 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
864 | int m; | |
865 | ||
866 | mutex_enter(&mg->mg_lock); | |
34dc7c2f BB |
867 | |
868 | /* | |
428870ff | 869 | * Prefetch the next potential metaslabs |
34dc7c2f | 870 | */ |
428870ff BB |
871 | for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { |
872 | space_map_t *sm = &msp->ms_map; | |
873 | space_map_obj_t *smo = &msp->ms_smo; | |
34dc7c2f | 874 | |
428870ff BB |
875 | /* If we have reached our prefetch limit then we're done */ |
876 | if (m >= metaslab_prefetch_limit) | |
877 | break; | |
878 | ||
879 | if (!sm->sm_loaded && smo->smo_object != 0) { | |
880 | mutex_exit(&mg->mg_lock); | |
881 | dmu_prefetch(spa_meta_objset(spa), smo->smo_object, | |
882 | 0ULL, smo->smo_objsize); | |
883 | mutex_enter(&mg->mg_lock); | |
884 | } | |
885 | } | |
886 | mutex_exit(&mg->mg_lock); | |
34dc7c2f BB |
887 | } |
888 | ||
889 | static int | |
6d974228 | 890 | metaslab_activate(metaslab_t *msp, uint64_t activation_weight) |
34dc7c2f | 891 | { |
428870ff | 892 | metaslab_group_t *mg = msp->ms_group; |
34dc7c2f | 893 | space_map_t *sm = &msp->ms_map; |
9babb374 | 894 | space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; |
d6320ddb | 895 | int t; |
34dc7c2f BB |
896 | |
897 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
898 | ||
899 | if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
428870ff BB |
900 | space_map_load_wait(sm); |
901 | if (!sm->sm_loaded) { | |
55d85d5a GW |
902 | space_map_obj_t *smo = &msp->ms_smo; |
903 | ||
904 | int error = space_map_load(sm, sm_ops, SM_FREE, smo, | |
428870ff BB |
905 | spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); |
906 | if (error) { | |
907 | metaslab_group_sort(msp->ms_group, msp, 0); | |
908 | return (error); | |
909 | } | |
d6320ddb | 910 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
911 | space_map_walk(&msp->ms_defermap[t], |
912 | space_map_claim, sm); | |
913 | ||
914 | } | |
915 | ||
916 | /* | |
917 | * Track the bonus area as we activate new metaslabs. | |
918 | */ | |
919 | if (sm->sm_start > mg->mg_bonus_area) { | |
920 | mutex_enter(&mg->mg_lock); | |
921 | mg->mg_bonus_area = sm->sm_start; | |
922 | mutex_exit(&mg->mg_lock); | |
34dc7c2f | 923 | } |
9babb374 | 924 | |
34dc7c2f BB |
925 | metaslab_group_sort(msp->ms_group, msp, |
926 | msp->ms_weight | activation_weight); | |
927 | } | |
928 | ASSERT(sm->sm_loaded); | |
929 | ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); | |
930 | ||
931 | return (0); | |
932 | } | |
933 | ||
934 | static void | |
935 | metaslab_passivate(metaslab_t *msp, uint64_t size) | |
936 | { | |
937 | /* | |
938 | * If size < SPA_MINBLOCKSIZE, then we will not allocate from | |
939 | * this metaslab again. In that case, it had better be empty, | |
940 | * or we would be leaving space on the table. | |
941 | */ | |
942 | ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); | |
943 | metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); | |
944 | ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); | |
945 | } | |
946 | ||
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 *
 * msp - metaslab whose allocations and frees for this txg are written out
 * txg - transaction group being synced
 *
 * Returns immediately when this txg has neither allocations nor frees for
 * the metaslab.  Otherwise the space map object is created on first use,
 * this txg's frees are accumulated into freed_map, the on-disk map is
 * optionally condensed, the alloc and free maps are synced, and the
 * syncing copy of the space map header is copied into the object's bonus
 * buffer.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	int t;

	ASSERT(!vd->vdev_ishole);

	/* Nothing was allocated or freed in this txg. */
	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/*
	 * First sync of this metaslab: allocate its space map object and
	 * record the object number in the vdev's metaslab array, at the
	 * slot given by this metaslab's index.
	 */
	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	/* Accumulate this txg's frees into freed_map. */
	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus deferred frees (ms_defermap[]),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_walk(&msp->ms_defermap[t],
			    space_map_remove, allocmap);

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		/* ms_lock is dropped across the truncate call. */
		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	/* Copy the syncing space map header into the object's bonus buffer. */
	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}
1045 | ||
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 *
 * msp - metaslab to update
 * txg - transaction group that has just finished syncing
 *
 * The whole body runs under msp->ms_lock.  It updates the vdev's space
 * accounting, rotates the freed and deferred maps, publishes the synced
 * space map header (ms_smo_syncing) as ms_smo, possibly unloads an
 * inactive space map, and finally re-sorts the metaslab using a freshly
 * computed weight.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;
	int t;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_create(&msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	/*
	 * alloc_delta: change in allocated space between the previously
	 * published header and the one just synced.
	 * defer_delta: space entering the deferred state (this txg's frees)
	 * minus space leaving it (the defer map about to be released).
	 */
	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = freed_map->sm_space - defer_map->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add defer_map (oldest deferred frees) to this map and
	 * transfer freed_map (this txg's frees) to defer_map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);

	/* Publish the header that was just written to disk. */
	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		/* A non-zero metaslab_debug keeps the map loaded. */
		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}
1137 | ||
428870ff BB |
1138 | void |
1139 | metaslab_sync_reassess(metaslab_group_t *mg) | |
1140 | { | |
1141 | vdev_t *vd = mg->mg_vd; | |
6d974228 | 1142 | int64_t failures = mg->mg_alloc_failures; |
d6320ddb | 1143 | int m; |
428870ff BB |
1144 | |
1145 | /* | |
1146 | * Re-evaluate all metaslabs which have lower offsets than the | |
1147 | * bonus area. | |
1148 | */ | |
d6320ddb | 1149 | for (m = 0; m < vd->vdev_ms_count; m++) { |
428870ff BB |
1150 | metaslab_t *msp = vd->vdev_ms[m]; |
1151 | ||
1152 | if (msp->ms_map.sm_start > mg->mg_bonus_area) | |
1153 | break; | |
1154 | ||
1155 | mutex_enter(&msp->ms_lock); | |
1156 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
1157 | mutex_exit(&msp->ms_lock); | |
1158 | } | |
1159 | ||
6d974228 GW |
1160 | atomic_add_64(&mg->mg_alloc_failures, -failures); |
1161 | ||
428870ff BB |
1162 | /* |
1163 | * Prefetch the next potential metaslabs | |
1164 | */ | |
1165 | metaslab_prefetch(mg); | |
1166 | } | |
1167 | ||
34dc7c2f BB |
1168 | static uint64_t |
1169 | metaslab_distance(metaslab_t *msp, dva_t *dva) | |
1170 | { | |
1171 | uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; | |
1172 | uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; | |
1173 | uint64_t start = msp->ms_map.sm_start >> ms_shift; | |
1174 | ||
1175 | if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) | |
1176 | return (1ULL << 63); | |
1177 | ||
1178 | if (offset < start) | |
1179 | return ((start - offset) << ms_shift); | |
1180 | if (offset > start) | |
1181 | return ((offset - start) << ms_shift); | |
1182 | return (0); | |
1183 | } | |
1184 | ||
/*
 * Allocate asize bytes from some metaslab of the given group.
 *
 * mg           - metaslab group to allocate from
 * psize        - physical size of the request (used for the fast-gang
 *                check and debug messages)
 * asize        - allocated size actually requested from the space map
 * txg          - transaction group the allocation is recorded in
 * min_distance - minimum distance to keep from the DVAs already chosen
 *                when they live on this group's vdev
 * dva, d       - DVA array and the number of entries already filled in
 * flags        - allocation flags, consulted via CAN_FASTGANG()
 *
 * Returns the allocated offset, or -1ULL if no metaslab in the group could
 * satisfy the request.  On success the range is added to the metaslab's
 * allocmap for txg, and the metaslab is dirtied if that allocmap was
 * previously empty.
 */
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	/*
	 * Activate as secondary when an earlier DVA of this block already
	 * sits on this vdev; otherwise activate as primary.
	 */
	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		/* Pick a candidate metaslab while holding the group lock. */
		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "failures %llu, weight %llu",
				    spa_name(spa), mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize,
				    mg->mg_alloc_failures, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}
			/*
			 * NOTE(review): was_active is a boolean_t; if
			 * METASLAB_ACTIVE_MASK lies in bits wider than
			 * boolean_t this assignment may truncate to zero --
			 * confirm against the mask's definition.
			 */
			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			/*
			 * Secondary allocation: metaslabs with nothing
			 * allocated (smo_alloc == 0) must be half again as
			 * far away as min_distance.
			 */
			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			/* Accept only if far enough from every earlier DVA. */
			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		/*
		 * If we've already reached the allowable number of failed
		 * allocation attempts on this metaslab group then we
		 * consider skipping it. We skip it only if we're allowed
		 * to "fast" gang, the physical size is larger than
		 * a gang block, and we're attempting to allocate from
		 * the primary metaslab.
		 */
		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			spa_dbgmsg(spa, "%s: skipping metaslab group: "
			    "vdev %llu, txg %llu, mg %p, psize %llu, "
			    "asize %llu, failures %llu", spa_name(spa),
			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
			    mg->mg_alloc_failures);
			return (-1ULL);
		}

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * A primary allocation landed on a metaslab currently
		 * active as secondary: passivate it and pick again.
		 */
		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/* Success leaves the loop with ms_lock still held. */
		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
			break;

		atomic_inc_64(&mg->mg_alloc_failures);

		/* Passivate with a weight capped at space_map_maxsize(). */
		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));

		mutex_exit(&msp->ms_lock);
	}

	/* First allocation from this metaslab in this txg: mark it dirty. */
	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);

	mutex_exit(&msp->ms_lock);

	return (offset);
}
1305 | ||
/*
 * Allocate a block for the specified i/o.
 *
 * spa     - pool being allocated from
 * mc      - metaslab class to allocate from
 * psize   - physical size of the block
 * dva, d  - DVA array of the block pointer and the index to fill in;
 *           dva[d] must not be valid on entry
 * hintdva - optional DVA array used to choose the starting vdev (may be
 *           NULL)
 * txg     - transaction group of the allocation
 * flags   - METASLAB_* allocation flags
 *
 * Returns 0 with dva[d] filled in (vdev, offset, gang bit, asize), or
 * ENOSPC with dva[d] zeroed when no metaslab group could satisfy the
 * request.  ENOSPC is also returned up front, without touching dva[d], by
 * the metaslab_gang_bang test hook.
 *
 * With METASLAB_FASTWRITE set, mc->mc_fastwrite_lock is held from just
 * after the test hook until return, and on success psize is added to the
 * chosen vdev's vdev_pending_fastwrite.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *fast_mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (ENOSPC);

	if (flags & METASLAB_FASTWRITE)
		mutex_enter(&mc->mc_fastwrite_lock);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		/* Start on the group after the one used for dva[d - 1]. */
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else if (flags & METASLAB_FASTWRITE) {
		/*
		 * Walk the ring of groups once and start at the one whose
		 * vdev has the smallest vdev_pending_fastwrite.
		 */
		mg = fast_mg = mc->mc_rotor;

		do {
			if (fast_mg->mg_vd->vdev_pending_fastwrite <
			    mg->mg_vd->vdev_pending_fastwrite)
				mg = fast_mg;
		} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);

	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		/*
		 * The required distance shrinks as dshift grows on each
		 * retry; once it is no larger than one metaslab it is
		 * treated as zero.
		 */
		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			/*
			 * Advance the rotor on every fastwrite allocation,
			 * or once this group's aliquot (plus bias) has been
			 * consumed.
			 */
			if ((flags & METASLAB_FASTWRITE) ||
			    atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			if (flags & METASLAB_FASTWRITE) {
				atomic_add_64(&vd->vdev_pending_fastwrite,
				    psize);
				mutex_exit(&mc->mc_fastwrite_lock);
			}

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/* Some group still had a non-zero distance: relax it and retry. */
	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	/*
	 * The last group examined was not allocatable: retry the whole
	 * ring once more, this time checking allocatability under SCL_ZIO.
	 */
	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	if (flags & METASLAB_FASTWRITE)
		mutex_exit(&mc->mc_fastwrite_lock);
	return (ENOSPC);
}
1514 | ||
1515 | /* | |
1516 | * Free the block represented by DVA in the context of the specified | |
1517 | * transaction group. | |
1518 | */ | |
1519 | static void | |
1520 | metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) | |
1521 | { | |
1522 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1523 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1524 | uint64_t size = DVA_GET_ASIZE(dva); | |
1525 | vdev_t *vd; | |
1526 | metaslab_t *msp; | |
1527 | ||
1528 | ASSERT(DVA_IS_VALID(dva)); | |
1529 | ||
1530 | if (txg > spa_freeze_txg(spa)) | |
1531 | return; | |
1532 | ||
1533 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1534 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { | |
1535 | cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", | |
1536 | (u_longlong_t)vdev, (u_longlong_t)offset); | |
1537 | ASSERT(0); | |
1538 | return; | |
1539 | } | |
1540 | ||
1541 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1542 | ||
1543 | if (DVA_GET_GANG(dva)) | |
1544 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1545 | ||
1546 | mutex_enter(&msp->ms_lock); | |
1547 | ||
1548 | if (now) { | |
1549 | space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], | |
1550 | offset, size); | |
1551 | space_map_free(&msp->ms_map, offset, size); | |
1552 | } else { | |
1553 | if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) | |
1554 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1555 | space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); | |
34dc7c2f BB |
1556 | } |
1557 | ||
1558 | mutex_exit(&msp->ms_lock); | |
1559 | } | |
1560 | ||
1561 | /* | |
1562 | * Intent log support: upon opening the pool after a crash, notify the SPA | |
1563 | * of blocks that the intent log has allocated for immediate write, but | |
1564 | * which are still considered free by the SPA because the last transaction | |
1565 | * group didn't commit yet. | |
1566 | */ | |
1567 | static int | |
1568 | metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) | |
1569 | { | |
1570 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1571 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1572 | uint64_t size = DVA_GET_ASIZE(dva); | |
1573 | vdev_t *vd; | |
1574 | metaslab_t *msp; | |
428870ff | 1575 | int error = 0; |
34dc7c2f BB |
1576 | |
1577 | ASSERT(DVA_IS_VALID(dva)); | |
1578 | ||
1579 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1580 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) | |
1581 | return (ENXIO); | |
1582 | ||
1583 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1584 | ||
1585 | if (DVA_GET_GANG(dva)) | |
1586 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1587 | ||
1588 | mutex_enter(&msp->ms_lock); | |
1589 | ||
428870ff | 1590 | if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) |
6d974228 | 1591 | error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); |
428870ff BB |
1592 | |
1593 | if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) | |
1594 | error = ENOENT; | |
1595 | ||
b128c09f | 1596 | if (error || txg == 0) { /* txg == 0 indicates dry run */ |
34dc7c2f BB |
1597 | mutex_exit(&msp->ms_lock); |
1598 | return (error); | |
1599 | } | |
1600 | ||
34dc7c2f | 1601 | space_map_claim(&msp->ms_map, offset, size); |
b128c09f | 1602 | |
fb5f0bc8 | 1603 | if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ |
b128c09f BB |
1604 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) |
1605 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1606 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | |
1607 | } | |
34dc7c2f BB |
1608 | |
1609 | mutex_exit(&msp->ms_lock); | |
1610 | ||
1611 | return (0); | |
1612 | } | |
1613 | ||
1614 | int | |
1615 | metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, | |
b128c09f | 1616 | int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) |
34dc7c2f BB |
1617 | { |
1618 | dva_t *dva = bp->blk_dva; | |
1619 | dva_t *hintdva = hintbp->blk_dva; | |
d6320ddb | 1620 | int d, error = 0; |
34dc7c2f | 1621 | |
b128c09f | 1622 | ASSERT(bp->blk_birth == 0); |
428870ff | 1623 | ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); |
b128c09f BB |
1624 | |
1625 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1626 | ||
1627 | if (mc->mc_rotor == NULL) { /* no vdevs in this class */ | |
1628 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
34dc7c2f | 1629 | return (ENOSPC); |
b128c09f | 1630 | } |
34dc7c2f BB |
1631 | |
1632 | ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); | |
1633 | ASSERT(BP_GET_NDVAS(bp) == 0); | |
1634 | ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); | |
1635 | ||
d6320ddb | 1636 | for (d = 0; d < ndvas; d++) { |
34dc7c2f | 1637 | error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, |
b128c09f | 1638 | txg, flags); |
34dc7c2f BB |
1639 | if (error) { |
1640 | for (d--; d >= 0; d--) { | |
1641 | metaslab_free_dva(spa, &dva[d], txg, B_TRUE); | |
1642 | bzero(&dva[d], sizeof (dva_t)); | |
1643 | } | |
b128c09f | 1644 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
34dc7c2f BB |
1645 | return (error); |
1646 | } | |
1647 | } | |
1648 | ASSERT(error == 0); | |
1649 | ASSERT(BP_GET_NDVAS(bp) == ndvas); | |
1650 | ||
b128c09f BB |
1651 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
1652 | ||
428870ff | 1653 | BP_SET_BIRTH(bp, txg, txg); |
b128c09f | 1654 | |
34dc7c2f BB |
1655 | return (0); |
1656 | } | |
1657 | ||
1658 | void | |
1659 | metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) | |
1660 | { | |
1661 | const dva_t *dva = bp->blk_dva; | |
d6320ddb | 1662 | int d, ndvas = BP_GET_NDVAS(bp); |
34dc7c2f BB |
1663 | |
1664 | ASSERT(!BP_IS_HOLE(bp)); | |
428870ff | 1665 | ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); |
b128c09f BB |
1666 | |
1667 | spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); | |
34dc7c2f | 1668 | |
d6320ddb | 1669 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1670 | metaslab_free_dva(spa, &dva[d], txg, now); |
b128c09f BB |
1671 | |
1672 | spa_config_exit(spa, SCL_FREE, FTAG); | |
34dc7c2f BB |
1673 | } |
1674 | ||
1675 | int | |
1676 | metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) | |
1677 | { | |
1678 | const dva_t *dva = bp->blk_dva; | |
1679 | int ndvas = BP_GET_NDVAS(bp); | |
d6320ddb | 1680 | int d, error = 0; |
34dc7c2f BB |
1681 | |
1682 | ASSERT(!BP_IS_HOLE(bp)); | |
1683 | ||
b128c09f BB |
1684 | if (txg != 0) { |
1685 | /* | |
1686 | * First do a dry run to make sure all DVAs are claimable, | |
1687 | * so we don't have to unwind from partial failures below. | |
1688 | */ | |
1689 | if ((error = metaslab_claim(spa, bp, 0)) != 0) | |
1690 | return (error); | |
1691 | } | |
1692 | ||
1693 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1694 | ||
d6320ddb | 1695 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1696 | if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) |
b128c09f BB |
1697 | break; |
1698 | ||
1699 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
1700 | ||
1701 | ASSERT(error == 0 || txg == 0); | |
34dc7c2f | 1702 | |
b128c09f | 1703 | return (error); |
34dc7c2f | 1704 | } |
920dd524 ED |
1705 | |
1706 | void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) | |
1707 | { | |
1708 | const dva_t *dva = bp->blk_dva; | |
1709 | int ndvas = BP_GET_NDVAS(bp); | |
1710 | uint64_t psize = BP_GET_PSIZE(bp); | |
1711 | int d; | |
1712 | vdev_t *vd; | |
1713 | ||
1714 | ASSERT(!BP_IS_HOLE(bp)); | |
1715 | ASSERT(psize > 0); | |
1716 | ||
1717 | spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); | |
1718 | ||
1719 | for (d = 0; d < ndvas; d++) { | |
1720 | if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) | |
1721 | continue; | |
1722 | atomic_add_64(&vd->vdev_pending_fastwrite, psize); | |
1723 | } | |
1724 | ||
1725 | spa_config_exit(spa, SCL_VDEV, FTAG); | |
1726 | } | |
1727 | ||
1728 | void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) | |
1729 | { | |
1730 | const dva_t *dva = bp->blk_dva; | |
1731 | int ndvas = BP_GET_NDVAS(bp); | |
1732 | uint64_t psize = BP_GET_PSIZE(bp); | |
1733 | int d; | |
1734 | vdev_t *vd; | |
1735 | ||
1736 | ASSERT(!BP_IS_HOLE(bp)); | |
1737 | ASSERT(psize > 0); | |
1738 | ||
1739 | spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); | |
1740 | ||
1741 | for (d = 0; d < ndvas; d++) { | |
1742 | if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) | |
1743 | continue; | |
1744 | ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); | |
1745 | atomic_sub_64(&vd->vdev_pending_fastwrite, psize); | |
1746 | } | |
1747 | ||
1748 | spa_config_exit(spa, SCL_VDEV, FTAG); | |
1749 | } | |
30b92c1d BB |
1750 | |
/*
 * When built for the kernel with HAVE_SPL defined, register metaslab_debug
 * as an int module parameter with mode 0644.  A non-zero value stops
 * metaslab_sync_done() from unloading inactive space maps.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(metaslab_debug, int, 0644);
MODULE_PARM_DESC(metaslab_debug, "keep space maps in core to verify frees");
#endif /* _KERNEL && HAVE_SPL */