]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
ebf8e3a2 | 23 | * Copyright (c) 2012 by Delphix. All rights reserved. |
34dc7c2f BB |
24 | */ |
25 | ||
34dc7c2f | 26 | #include <sys/zfs_context.h> |
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_tx.h> | |
29 | #include <sys/space_map.h> | |
30 | #include <sys/metaslab_impl.h> | |
31 | #include <sys/vdev_impl.h> | |
32 | #include <sys/zio.h> | |
33 | ||
6d974228 GW |
34 | #define WITH_DF_BLOCK_ALLOCATOR |
35 | ||
36 | /* | |
37 | * Allow allocations to switch to gang blocks quickly. We do this to | |
38 | * avoid having to load lots of space_maps in a given txg. There are, | |
39 | * however, some cases where we want to avoid "fast" ganging and instead | |
40 | * we want to do an exhaustive search of all metaslabs on this device. | |
ebf8e3a2 | 41 | * Currently we don't allow any gang, zil, or dump device related allocations |
6d974228 GW |
42 | * to "fast" gang. |
43 | */ | |
44 | #define CAN_FASTGANG(flags) \ | |
45 | (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ | |
46 | METASLAB_GANG_AVOID))) | |
22c81dd8 | 47 | |
34dc7c2f BB |
48 | uint64_t metaslab_aliquot = 512ULL << 10; |
49 | uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ | |
50 | ||
6d974228 GW |
51 | /* |
52 | * This value defines the number of allowed allocation failures per vdev. | |
53 | * If a device reaches this threshold in a given txg then we consider skipping | |
54 | * allocations on that device. | |
55 | */ | |
56 | int zfs_mg_alloc_failures; | |
57 | ||
428870ff BB |
58 | /* |
59 | * Metaslab debugging: when set, keeps all space maps in core to verify frees. | |
60 | */ | |
61 | static int metaslab_debug = 0; | |
62 | ||
9babb374 BB |
63 | /* |
64 | * Minimum size which forces the dynamic allocator to change | |
428870ff | 65 | * its allocation strategy. Once the space map cannot satisfy |
9babb374 BB |
66 | * an allocation of this size then it switches to using a more |
67 | * aggressive strategy (i.e., search by size rather than offset). | |
68 | */ | |
69 | uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; | |
70 | ||
71 | /* | |
72 | * The minimum free space, in percent, which must be available | |
73 | * in a space map to continue allocations in a first-fit fashion. | |
74 | * Once the space_map's free space drops below this level we dynamically | |
75 | * switch to using best-fit allocations. | |
76 | */ | |
428870ff BB |
77 | int metaslab_df_free_pct = 4; |
78 | ||
79 | /* | |
80 | * A metaslab is considered "free" if it contains a contiguous | |
81 | * segment which is greater than metaslab_min_alloc_size. | |
82 | */ | |
83 | uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; | |
84 | ||
85 | /* | |
86 | * Max number of space_maps to prefetch. | |
87 | */ | |
88 | int metaslab_prefetch_limit = SPA_DVAS_PER_BP; | |
89 | ||
90 | /* | |
91 | * Percentage bonus multiplier for metaslabs that are in the bonus area. | |
92 | */ | |
93 | int metaslab_smo_bonus_pct = 150; | |
9babb374 | 94 | |
34dc7c2f BB |
95 | /* |
96 | * ========================================================================== | |
97 | * Metaslab classes | |
98 | * ========================================================================== | |
99 | */ | |
100 | metaslab_class_t * | |
428870ff | 101 | metaslab_class_create(spa_t *spa, space_map_ops_t *ops) |
34dc7c2f BB |
102 | { |
103 | metaslab_class_t *mc; | |
104 | ||
b8d06fca | 105 | mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE); |
34dc7c2f | 106 | |
428870ff | 107 | mc->mc_spa = spa; |
34dc7c2f | 108 | mc->mc_rotor = NULL; |
9babb374 | 109 | mc->mc_ops = ops; |
920dd524 | 110 | mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); |
34dc7c2f BB |
111 | |
112 | return (mc); | |
113 | } | |
114 | ||
115 | void | |
116 | metaslab_class_destroy(metaslab_class_t *mc) | |
117 | { | |
428870ff BB |
118 | ASSERT(mc->mc_rotor == NULL); |
119 | ASSERT(mc->mc_alloc == 0); | |
120 | ASSERT(mc->mc_deferred == 0); | |
121 | ASSERT(mc->mc_space == 0); | |
122 | ASSERT(mc->mc_dspace == 0); | |
34dc7c2f | 123 | |
920dd524 | 124 | mutex_destroy(&mc->mc_fastwrite_lock); |
34dc7c2f BB |
125 | kmem_free(mc, sizeof (metaslab_class_t)); |
126 | } | |
127 | ||
428870ff BB |
128 | int |
129 | metaslab_class_validate(metaslab_class_t *mc) | |
34dc7c2f | 130 | { |
428870ff BB |
131 | metaslab_group_t *mg; |
132 | vdev_t *vd; | |
34dc7c2f | 133 | |
428870ff BB |
134 | /* |
135 | * Must hold one of the spa_config locks. | |
136 | */ | |
137 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || | |
138 | spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); | |
34dc7c2f | 139 | |
428870ff BB |
140 | if ((mg = mc->mc_rotor) == NULL) |
141 | return (0); | |
142 | ||
143 | do { | |
144 | vd = mg->mg_vd; | |
145 | ASSERT(vd->vdev_mg != NULL); | |
146 | ASSERT3P(vd->vdev_top, ==, vd); | |
147 | ASSERT3P(mg->mg_class, ==, mc); | |
148 | ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); | |
149 | } while ((mg = mg->mg_next) != mc->mc_rotor); | |
150 | ||
151 | return (0); | |
34dc7c2f BB |
152 | } |
153 | ||
154 | void | |
428870ff BB |
155 | metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, |
156 | int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) | |
34dc7c2f | 157 | { |
428870ff BB |
158 | atomic_add_64(&mc->mc_alloc, alloc_delta); |
159 | atomic_add_64(&mc->mc_deferred, defer_delta); | |
160 | atomic_add_64(&mc->mc_space, space_delta); | |
161 | atomic_add_64(&mc->mc_dspace, dspace_delta); | |
162 | } | |
34dc7c2f | 163 | |
428870ff BB |
164 | uint64_t |
165 | metaslab_class_get_alloc(metaslab_class_t *mc) | |
166 | { | |
167 | return (mc->mc_alloc); | |
168 | } | |
34dc7c2f | 169 | |
428870ff BB |
170 | uint64_t |
171 | metaslab_class_get_deferred(metaslab_class_t *mc) | |
172 | { | |
173 | return (mc->mc_deferred); | |
174 | } | |
34dc7c2f | 175 | |
428870ff BB |
176 | uint64_t |
177 | metaslab_class_get_space(metaslab_class_t *mc) | |
178 | { | |
179 | return (mc->mc_space); | |
180 | } | |
34dc7c2f | 181 | |
428870ff BB |
182 | uint64_t |
183 | metaslab_class_get_dspace(metaslab_class_t *mc) | |
184 | { | |
185 | return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); | |
34dc7c2f BB |
186 | } |
187 | ||
188 | /* | |
189 | * ========================================================================== | |
190 | * Metaslab groups | |
191 | * ========================================================================== | |
192 | */ | |
193 | static int | |
194 | metaslab_compare(const void *x1, const void *x2) | |
195 | { | |
196 | const metaslab_t *m1 = x1; | |
197 | const metaslab_t *m2 = x2; | |
198 | ||
199 | if (m1->ms_weight < m2->ms_weight) | |
200 | return (1); | |
201 | if (m1->ms_weight > m2->ms_weight) | |
202 | return (-1); | |
203 | ||
204 | /* | |
205 | * If the weights are identical, use the offset to force uniqueness. | |
206 | */ | |
207 | if (m1->ms_map.sm_start < m2->ms_map.sm_start) | |
208 | return (-1); | |
209 | if (m1->ms_map.sm_start > m2->ms_map.sm_start) | |
210 | return (1); | |
211 | ||
212 | ASSERT3P(m1, ==, m2); | |
213 | ||
214 | return (0); | |
215 | } | |
216 | ||
217 | metaslab_group_t * | |
218 | metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) | |
219 | { | |
220 | metaslab_group_t *mg; | |
221 | ||
b8d06fca | 222 | mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE); |
34dc7c2f BB |
223 | mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); |
224 | avl_create(&mg->mg_metaslab_tree, metaslab_compare, | |
225 | sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); | |
34dc7c2f | 226 | mg->mg_vd = vd; |
428870ff BB |
227 | mg->mg_class = mc; |
228 | mg->mg_activation_count = 0; | |
34dc7c2f BB |
229 | |
230 | return (mg); | |
231 | } | |
232 | ||
233 | void | |
234 | metaslab_group_destroy(metaslab_group_t *mg) | |
235 | { | |
428870ff BB |
236 | ASSERT(mg->mg_prev == NULL); |
237 | ASSERT(mg->mg_next == NULL); | |
238 | /* | |
239 | * We may have gone below zero with the activation count | |
240 | * either because we never activated in the first place or | |
241 | * because we're done, and possibly removing the vdev. | |
242 | */ | |
243 | ASSERT(mg->mg_activation_count <= 0); | |
244 | ||
34dc7c2f BB |
245 | avl_destroy(&mg->mg_metaslab_tree); |
246 | mutex_destroy(&mg->mg_lock); | |
247 | kmem_free(mg, sizeof (metaslab_group_t)); | |
248 | } | |
249 | ||
428870ff BB |
250 | void |
251 | metaslab_group_activate(metaslab_group_t *mg) | |
252 | { | |
253 | metaslab_class_t *mc = mg->mg_class; | |
254 | metaslab_group_t *mgprev, *mgnext; | |
255 | ||
256 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
257 | ||
258 | ASSERT(mc->mc_rotor != mg); | |
259 | ASSERT(mg->mg_prev == NULL); | |
260 | ASSERT(mg->mg_next == NULL); | |
261 | ASSERT(mg->mg_activation_count <= 0); | |
262 | ||
263 | if (++mg->mg_activation_count <= 0) | |
264 | return; | |
265 | ||
266 | mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); | |
267 | ||
268 | if ((mgprev = mc->mc_rotor) == NULL) { | |
269 | mg->mg_prev = mg; | |
270 | mg->mg_next = mg; | |
271 | } else { | |
272 | mgnext = mgprev->mg_next; | |
273 | mg->mg_prev = mgprev; | |
274 | mg->mg_next = mgnext; | |
275 | mgprev->mg_next = mg; | |
276 | mgnext->mg_prev = mg; | |
277 | } | |
278 | mc->mc_rotor = mg; | |
279 | } | |
280 | ||
281 | void | |
282 | metaslab_group_passivate(metaslab_group_t *mg) | |
283 | { | |
284 | metaslab_class_t *mc = mg->mg_class; | |
285 | metaslab_group_t *mgprev, *mgnext; | |
286 | ||
287 | ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); | |
288 | ||
289 | if (--mg->mg_activation_count != 0) { | |
290 | ASSERT(mc->mc_rotor != mg); | |
291 | ASSERT(mg->mg_prev == NULL); | |
292 | ASSERT(mg->mg_next == NULL); | |
293 | ASSERT(mg->mg_activation_count < 0); | |
294 | return; | |
295 | } | |
296 | ||
297 | mgprev = mg->mg_prev; | |
298 | mgnext = mg->mg_next; | |
299 | ||
300 | if (mg == mgnext) { | |
301 | mc->mc_rotor = NULL; | |
302 | } else { | |
303 | mc->mc_rotor = mgnext; | |
304 | mgprev->mg_next = mgnext; | |
305 | mgnext->mg_prev = mgprev; | |
306 | } | |
307 | ||
308 | mg->mg_prev = NULL; | |
309 | mg->mg_next = NULL; | |
310 | } | |
311 | ||
34dc7c2f BB |
312 | static void |
313 | metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) | |
314 | { | |
315 | mutex_enter(&mg->mg_lock); | |
316 | ASSERT(msp->ms_group == NULL); | |
317 | msp->ms_group = mg; | |
318 | msp->ms_weight = 0; | |
319 | avl_add(&mg->mg_metaslab_tree, msp); | |
320 | mutex_exit(&mg->mg_lock); | |
321 | } | |
322 | ||
323 | static void | |
324 | metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) | |
325 | { | |
326 | mutex_enter(&mg->mg_lock); | |
327 | ASSERT(msp->ms_group == mg); | |
328 | avl_remove(&mg->mg_metaslab_tree, msp); | |
329 | msp->ms_group = NULL; | |
330 | mutex_exit(&mg->mg_lock); | |
331 | } | |
332 | ||
333 | static void | |
334 | metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) | |
335 | { | |
336 | /* | |
337 | * Although in principle the weight can be any value, in | |
338 | * practice we do not use values in the range [1, 510]. | |
339 | */ | |
340 | ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); | |
341 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
342 | ||
343 | mutex_enter(&mg->mg_lock); | |
344 | ASSERT(msp->ms_group == mg); | |
345 | avl_remove(&mg->mg_metaslab_tree, msp); | |
346 | msp->ms_weight = weight; | |
347 | avl_add(&mg->mg_metaslab_tree, msp); | |
348 | mutex_exit(&mg->mg_lock); | |
349 | } | |
350 | ||
428870ff BB |
351 | /* |
352 | * ========================================================================== | |
353 | * Common allocator routines | |
354 | * ========================================================================== | |
355 | */ | |
356 | static int | |
357 | metaslab_segsize_compare(const void *x1, const void *x2) | |
358 | { | |
359 | const space_seg_t *s1 = x1; | |
360 | const space_seg_t *s2 = x2; | |
361 | uint64_t ss_size1 = s1->ss_end - s1->ss_start; | |
362 | uint64_t ss_size2 = s2->ss_end - s2->ss_start; | |
363 | ||
364 | if (ss_size1 < ss_size2) | |
365 | return (-1); | |
366 | if (ss_size1 > ss_size2) | |
367 | return (1); | |
368 | ||
369 | if (s1->ss_start < s2->ss_start) | |
370 | return (-1); | |
371 | if (s1->ss_start > s2->ss_start) | |
372 | return (1); | |
373 | ||
374 | return (0); | |
375 | } | |
376 | ||
22c81dd8 BB |
377 | #if defined(WITH_FF_BLOCK_ALLOCATOR) || \ |
378 | defined(WITH_DF_BLOCK_ALLOCATOR) || \ | |
379 | defined(WITH_CDF_BLOCK_ALLOCATOR) | |
34dc7c2f | 380 | /* |
9babb374 BB |
381 | * This is a helper function that can be used by the allocator to find |
382 | * a suitable block to allocate. This will search the specified AVL | |
383 | * tree looking for a block that matches the specified criteria. | |
34dc7c2f | 384 | */ |
34dc7c2f | 385 | static uint64_t |
9babb374 BB |
386 | metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, |
387 | uint64_t align) | |
34dc7c2f | 388 | { |
34dc7c2f BB |
389 | space_seg_t *ss, ssearch; |
390 | avl_index_t where; | |
391 | ||
392 | ssearch.ss_start = *cursor; | |
393 | ssearch.ss_end = *cursor + size; | |
394 | ||
395 | ss = avl_find(t, &ssearch, &where); | |
396 | if (ss == NULL) | |
397 | ss = avl_nearest(t, where, AVL_AFTER); | |
398 | ||
399 | while (ss != NULL) { | |
400 | uint64_t offset = P2ROUNDUP(ss->ss_start, align); | |
401 | ||
402 | if (offset + size <= ss->ss_end) { | |
403 | *cursor = offset + size; | |
404 | return (offset); | |
405 | } | |
406 | ss = AVL_NEXT(t, ss); | |
407 | } | |
408 | ||
409 | /* | |
410 | * If we know we've searched the whole map (*cursor == 0), give up. | |
411 | * Otherwise, reset the cursor to the beginning and try again. | |
412 | */ | |
413 | if (*cursor == 0) | |
414 | return (-1ULL); | |
415 | ||
416 | *cursor = 0; | |
9babb374 BB |
417 | return (metaslab_block_picker(t, cursor, size, align)); |
418 | } | |
22c81dd8 | 419 | #endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */ |
9babb374 | 420 | |
9babb374 | 421 | static void |
428870ff | 422 | metaslab_pp_load(space_map_t *sm) |
9babb374 | 423 | { |
428870ff BB |
424 | space_seg_t *ss; |
425 | ||
9babb374 | 426 | ASSERT(sm->sm_ppd == NULL); |
b8d06fca | 427 | sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE); |
428870ff | 428 | |
b8d06fca | 429 | sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE); |
428870ff BB |
430 | avl_create(sm->sm_pp_root, metaslab_segsize_compare, |
431 | sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); | |
432 | ||
433 | for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) | |
434 | avl_add(sm->sm_pp_root, ss); | |
9babb374 BB |
435 | } |
436 | ||
437 | static void | |
428870ff | 438 | metaslab_pp_unload(space_map_t *sm) |
9babb374 | 439 | { |
428870ff BB |
440 | void *cookie = NULL; |
441 | ||
9babb374 BB |
442 | kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); |
443 | sm->sm_ppd = NULL; | |
9babb374 | 444 | |
428870ff BB |
445 | while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { |
446 | /* tear down the tree */ | |
447 | } | |
9babb374 | 448 | |
428870ff BB |
449 | avl_destroy(sm->sm_pp_root); |
450 | kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); | |
451 | sm->sm_pp_root = NULL; | |
34dc7c2f BB |
452 | } |
453 | ||
454 | /* ARGSUSED */ | |
455 | static void | |
428870ff | 456 | metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
457 | { |
458 | /* No need to update cursor */ | |
459 | } | |
460 | ||
461 | /* ARGSUSED */ | |
462 | static void | |
428870ff | 463 | metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) |
34dc7c2f BB |
464 | { |
465 | /* No need to update cursor */ | |
466 | } | |
467 | ||
9babb374 | 468 | /* |
428870ff | 469 | * Return the maximum contiguous segment within the metaslab. |
9babb374 | 470 | */ |
9babb374 | 471 | uint64_t |
428870ff | 472 | metaslab_pp_maxsize(space_map_t *sm) |
9babb374 BB |
473 | { |
474 | avl_tree_t *t = sm->sm_pp_root; | |
475 | space_seg_t *ss; | |
476 | ||
477 | if (t == NULL || (ss = avl_last(t)) == NULL) | |
478 | return (0ULL); | |
479 | ||
480 | return (ss->ss_end - ss->ss_start); | |
481 | } | |
482 | ||
22c81dd8 | 483 | #if defined(WITH_FF_BLOCK_ALLOCATOR) |
428870ff BB |
484 | /* |
485 | * ========================================================================== | |
486 | * The first-fit block allocator | |
487 | * ========================================================================== | |
488 | */ | |
489 | static uint64_t | |
490 | metaslab_ff_alloc(space_map_t *sm, uint64_t size) | |
9babb374 | 491 | { |
428870ff BB |
492 | avl_tree_t *t = &sm->sm_root; |
493 | uint64_t align = size & -size; | |
494 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
9babb374 | 495 | |
428870ff | 496 | return (metaslab_block_picker(t, cursor, size, align)); |
9babb374 BB |
497 | } |
498 | ||
428870ff BB |
499 | /* ARGSUSED */ |
500 | boolean_t | |
501 | metaslab_ff_fragmented(space_map_t *sm) | |
9babb374 | 502 | { |
428870ff | 503 | return (B_TRUE); |
9babb374 BB |
504 | } |
505 | ||
428870ff BB |
/*
 * Operations vector for the first-fit allocator.  The initializer is
 * positional, so the entries must stay in the member order of
 * space_map_ops_t (declared elsewhere).
 */
static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ff_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ff_fragmented
};
9babb374 | 515 | |
22c81dd8 BB |
516 | space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; |
517 | #endif /* WITH_FF_BLOCK_ALLOCATOR */ | |
518 | ||
519 | #if defined(WITH_DF_BLOCK_ALLOCATOR) | |
428870ff BB |
520 | /* |
521 | * ========================================================================== | |
522 | * Dynamic block allocator - | |
523 | * Uses the first fit allocation scheme until space gets low and then | |
524 | * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold | |
525 | * and metaslab_df_free_pct to determine when to switch the allocation scheme. | |
526 | * ========================================================================== | |
527 | */ | |
9babb374 BB |
528 | static uint64_t |
529 | metaslab_df_alloc(space_map_t *sm, uint64_t size) | |
530 | { | |
531 | avl_tree_t *t = &sm->sm_root; | |
532 | uint64_t align = size & -size; | |
533 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
428870ff | 534 | uint64_t max_size = metaslab_pp_maxsize(sm); |
9babb374 BB |
535 | int free_pct = sm->sm_space * 100 / sm->sm_size; |
536 | ||
537 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
538 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
539 | ||
540 | if (max_size < size) | |
541 | return (-1ULL); | |
542 | ||
543 | /* | |
544 | * If we're running low on space switch to using the size | |
545 | * sorted AVL tree (best-fit). | |
546 | */ | |
547 | if (max_size < metaslab_df_alloc_threshold || | |
548 | free_pct < metaslab_df_free_pct) { | |
549 | t = sm->sm_pp_root; | |
550 | *cursor = 0; | |
551 | } | |
552 | ||
553 | return (metaslab_block_picker(t, cursor, size, 1ULL)); | |
554 | } | |
555 | ||
428870ff BB |
556 | static boolean_t |
557 | metaslab_df_fragmented(space_map_t *sm) | |
9babb374 | 558 | { |
428870ff BB |
559 | uint64_t max_size = metaslab_pp_maxsize(sm); |
560 | int free_pct = sm->sm_space * 100 / sm->sm_size; | |
9babb374 | 561 | |
428870ff BB |
562 | if (max_size >= metaslab_df_alloc_threshold && |
563 | free_pct >= metaslab_df_free_pct) | |
564 | return (B_FALSE); | |
565 | ||
566 | return (B_TRUE); | |
9babb374 BB |
567 | } |
568 | ||
/*
 * Operations vector for the dynamic-fit allocator.  The initializer is
 * positional, so the entries must stay in the member order of
 * space_map_ops_t (declared elsewhere).
 */
static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_df_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_df_fragmented
};
578 | ||
22c81dd8 BB |
579 | space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; |
580 | #endif /* WITH_DF_BLOCK_ALLOCATOR */ | |
581 | ||
428870ff BB |
582 | /* |
583 | * ========================================================================== | |
584 | * Other experimental allocators | |
585 | * ========================================================================== | |
586 | */ | |
22c81dd8 | 587 | #if defined(WITH_CDF_BLOCK_ALLOCATOR) |
428870ff BB |
588 | static uint64_t |
589 | metaslab_cdf_alloc(space_map_t *sm, uint64_t size) | |
590 | { | |
591 | avl_tree_t *t = &sm->sm_root; | |
592 | uint64_t *cursor = (uint64_t *)sm->sm_ppd; | |
593 | uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; | |
594 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
595 | uint64_t rsize = size; | |
596 | uint64_t offset = 0; | |
597 | ||
598 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
599 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
600 | ||
601 | if (max_size < size) | |
602 | return (-1ULL); | |
603 | ||
604 | ASSERT3U(*extent_end, >=, *cursor); | |
605 | ||
606 | /* | |
607 | * If we're running low on space switch to using the size | |
608 | * sorted AVL tree (best-fit). | |
609 | */ | |
610 | if ((*cursor + size) > *extent_end) { | |
611 | ||
612 | t = sm->sm_pp_root; | |
613 | *cursor = *extent_end = 0; | |
614 | ||
615 | if (max_size > 2 * SPA_MAXBLOCKSIZE) | |
616 | rsize = MIN(metaslab_min_alloc_size, max_size); | |
617 | offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); | |
618 | if (offset != -1) | |
619 | *cursor = offset + size; | |
620 | } else { | |
621 | offset = metaslab_block_picker(t, cursor, rsize, 1ULL); | |
622 | } | |
623 | ASSERT3U(*cursor, <=, *extent_end); | |
624 | return (offset); | |
625 | } | |
626 | ||
627 | static boolean_t | |
628 | metaslab_cdf_fragmented(space_map_t *sm) | |
629 | { | |
630 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
631 | ||
632 | if (max_size > (metaslab_min_alloc_size * 10)) | |
633 | return (B_FALSE); | |
634 | return (B_TRUE); | |
635 | } | |
636 | ||
/*
 * Operations vector for the experimental CDF allocator.  The initializer
 * is positional, so the entries must stay in the member order of
 * space_map_ops_t (declared elsewhere).
 */
static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_cdf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_cdf_fragmented
};
646 | ||
22c81dd8 BB |
647 | space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops; |
648 | #endif /* WITH_CDF_BLOCK_ALLOCATOR */ | |
649 | ||
650 | #if defined(WITH_NDF_BLOCK_ALLOCATOR) | |
428870ff BB |
651 | uint64_t metaslab_ndf_clump_shift = 4; |
652 | ||
653 | static uint64_t | |
654 | metaslab_ndf_alloc(space_map_t *sm, uint64_t size) | |
655 | { | |
656 | avl_tree_t *t = &sm->sm_root; | |
657 | avl_index_t where; | |
658 | space_seg_t *ss, ssearch; | |
659 | uint64_t hbit = highbit(size); | |
660 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; | |
661 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
662 | ||
663 | ASSERT(MUTEX_HELD(sm->sm_lock)); | |
664 | ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); | |
665 | ||
666 | if (max_size < size) | |
667 | return (-1ULL); | |
668 | ||
669 | ssearch.ss_start = *cursor; | |
670 | ssearch.ss_end = *cursor + size; | |
671 | ||
672 | ss = avl_find(t, &ssearch, &where); | |
673 | if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { | |
674 | t = sm->sm_pp_root; | |
675 | ||
676 | ssearch.ss_start = 0; | |
677 | ssearch.ss_end = MIN(max_size, | |
678 | 1ULL << (hbit + metaslab_ndf_clump_shift)); | |
679 | ss = avl_find(t, &ssearch, &where); | |
680 | if (ss == NULL) | |
681 | ss = avl_nearest(t, where, AVL_AFTER); | |
682 | ASSERT(ss != NULL); | |
683 | } | |
684 | ||
685 | if (ss != NULL) { | |
686 | if (ss->ss_start + size <= ss->ss_end) { | |
687 | *cursor = ss->ss_start + size; | |
688 | return (ss->ss_start); | |
689 | } | |
690 | } | |
691 | return (-1ULL); | |
692 | } | |
693 | ||
694 | static boolean_t | |
695 | metaslab_ndf_fragmented(space_map_t *sm) | |
696 | { | |
697 | uint64_t max_size = metaslab_pp_maxsize(sm); | |
698 | ||
699 | if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) | |
700 | return (B_FALSE); | |
701 | return (B_TRUE); | |
702 | } | |
703 | ||
704 | ||
/*
 * Operations vector for the experimental NDF allocator.  The initializer
 * is positional, so the entries must stay in the member order of
 * space_map_ops_t (declared elsewhere).
 */
static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};
714 | ||
715 | space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; | |
22c81dd8 | 716 | #endif /* WITH_NDF_BLOCK_ALLOCATOR */ |
9babb374 | 717 | |
34dc7c2f BB |
718 | /* |
719 | * ========================================================================== | |
720 | * Metaslabs | |
721 | * ========================================================================== | |
722 | */ | |
723 | metaslab_t * | |
724 | metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, | |
725 | uint64_t start, uint64_t size, uint64_t txg) | |
726 | { | |
727 | vdev_t *vd = mg->mg_vd; | |
728 | metaslab_t *msp; | |
729 | ||
b8d06fca | 730 | msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE); |
34dc7c2f BB |
731 | mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); |
732 | ||
733 | msp->ms_smo_syncing = *smo; | |
734 | ||
735 | /* | |
736 | * We create the main space map here, but we don't create the | |
737 | * allocmaps and freemaps until metaslab_sync_done(). This serves | |
738 | * two purposes: it allows metaslab_sync_done() to detect the | |
739 | * addition of new space; and for debugging, it ensures that we'd | |
740 | * data fault on any attempt to use this metaslab before it's ready. | |
741 | */ | |
742 | space_map_create(&msp->ms_map, start, size, | |
743 | vd->vdev_ashift, &msp->ms_lock); | |
744 | ||
745 | metaslab_group_add(mg, msp); | |
746 | ||
428870ff BB |
747 | if (metaslab_debug && smo->smo_object != 0) { |
748 | mutex_enter(&msp->ms_lock); | |
749 | VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, | |
750 | SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); | |
751 | mutex_exit(&msp->ms_lock); | |
752 | } | |
753 | ||
34dc7c2f BB |
754 | /* |
755 | * If we're opening an existing pool (txg == 0) or creating | |
756 | * a new one (txg == TXG_INITIAL), all space is available now. | |
757 | * If we're adding space to an existing pool, the new space | |
758 | * does not become available until after this txg has synced. | |
759 | */ | |
760 | if (txg <= TXG_INITIAL) | |
761 | metaslab_sync_done(msp, 0); | |
762 | ||
763 | if (txg != 0) { | |
34dc7c2f | 764 | vdev_dirty(vd, 0, NULL, txg); |
428870ff | 765 | vdev_dirty(vd, VDD_METASLAB, msp, txg); |
34dc7c2f BB |
766 | } |
767 | ||
768 | return (msp); | |
769 | } | |
770 | ||
771 | void | |
772 | metaslab_fini(metaslab_t *msp) | |
773 | { | |
774 | metaslab_group_t *mg = msp->ms_group; | |
d6320ddb | 775 | int t; |
34dc7c2f | 776 | |
428870ff BB |
777 | vdev_space_update(mg->mg_vd, |
778 | -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); | |
34dc7c2f BB |
779 | |
780 | metaslab_group_remove(mg, msp); | |
781 | ||
782 | mutex_enter(&msp->ms_lock); | |
783 | ||
784 | space_map_unload(&msp->ms_map); | |
785 | space_map_destroy(&msp->ms_map); | |
786 | ||
d6320ddb | 787 | for (t = 0; t < TXG_SIZE; t++) { |
34dc7c2f BB |
788 | space_map_destroy(&msp->ms_allocmap[t]); |
789 | space_map_destroy(&msp->ms_freemap[t]); | |
790 | } | |
791 | ||
d6320ddb | 792 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
793 | space_map_destroy(&msp->ms_defermap[t]); |
794 | ||
795 | ASSERT3S(msp->ms_deferspace, ==, 0); | |
796 | ||
34dc7c2f BB |
797 | mutex_exit(&msp->ms_lock); |
798 | mutex_destroy(&msp->ms_lock); | |
799 | ||
800 | kmem_free(msp, sizeof (metaslab_t)); | |
801 | } | |
802 | ||
803 | #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) | |
804 | #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) | |
805 | #define METASLAB_ACTIVE_MASK \ | |
806 | (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) | |
34dc7c2f BB |
807 | |
808 | static uint64_t | |
809 | metaslab_weight(metaslab_t *msp) | |
810 | { | |
811 | metaslab_group_t *mg = msp->ms_group; | |
812 | space_map_t *sm = &msp->ms_map; | |
813 | space_map_obj_t *smo = &msp->ms_smo; | |
814 | vdev_t *vd = mg->mg_vd; | |
815 | uint64_t weight, space; | |
816 | ||
817 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
818 | ||
819 | /* | |
820 | * The baseline weight is the metaslab's free space. | |
821 | */ | |
822 | space = sm->sm_size - smo->smo_alloc; | |
823 | weight = space; | |
824 | ||
825 | /* | |
826 | * Modern disks have uniform bit density and constant angular velocity. | |
827 | * Therefore, the outer recording zones are faster (higher bandwidth) | |
828 | * than the inner zones by the ratio of outer to inner track diameter, | |
829 | * which is typically around 2:1. We account for this by assigning | |
830 | * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). | |
831 | * In effect, this means that we'll select the metaslab with the most | |
832 | * free bandwidth rather than simply the one with the most free space. | |
833 | */ | |
834 | weight = 2 * weight - | |
835 | ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; | |
836 | ASSERT(weight >= space && weight <= 2 * space); | |
837 | ||
838 | /* | |
428870ff BB |
839 | * For locality, assign higher weight to metaslabs which have |
840 | * a lower offset than what we've already activated. | |
34dc7c2f | 841 | */ |
428870ff BB |
842 | if (sm->sm_start <= mg->mg_bonus_area) |
843 | weight *= (metaslab_smo_bonus_pct / 100); | |
34dc7c2f | 844 | ASSERT(weight >= space && |
428870ff BB |
845 | weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); |
846 | ||
847 | if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { | |
848 | /* | |
849 | * If this metaslab is one we're actively using, adjust its | |
850 | * weight to make it preferable to any inactive metaslab so | |
851 | * we'll polish it off. | |
852 | */ | |
853 | weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); | |
854 | } | |
855 | return (weight); | |
856 | } | |
857 | ||
858 | static void | |
859 | metaslab_prefetch(metaslab_group_t *mg) | |
860 | { | |
861 | spa_t *spa = mg->mg_vd->vdev_spa; | |
862 | metaslab_t *msp; | |
863 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
864 | int m; | |
865 | ||
866 | mutex_enter(&mg->mg_lock); | |
34dc7c2f BB |
867 | |
868 | /* | |
428870ff | 869 | * Prefetch the next potential metaslabs |
34dc7c2f | 870 | */ |
428870ff BB |
871 | for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { |
872 | space_map_t *sm = &msp->ms_map; | |
873 | space_map_obj_t *smo = &msp->ms_smo; | |
34dc7c2f | 874 | |
428870ff BB |
875 | /* If we have reached our prefetch limit then we're done */ |
876 | if (m >= metaslab_prefetch_limit) | |
877 | break; | |
878 | ||
879 | if (!sm->sm_loaded && smo->smo_object != 0) { | |
880 | mutex_exit(&mg->mg_lock); | |
881 | dmu_prefetch(spa_meta_objset(spa), smo->smo_object, | |
882 | 0ULL, smo->smo_objsize); | |
883 | mutex_enter(&mg->mg_lock); | |
884 | } | |
885 | } | |
886 | mutex_exit(&mg->mg_lock); | |
34dc7c2f BB |
887 | } |
888 | ||
889 | static int | |
6d974228 | 890 | metaslab_activate(metaslab_t *msp, uint64_t activation_weight) |
34dc7c2f | 891 | { |
428870ff | 892 | metaslab_group_t *mg = msp->ms_group; |
34dc7c2f | 893 | space_map_t *sm = &msp->ms_map; |
9babb374 | 894 | space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; |
d6320ddb | 895 | int t; |
34dc7c2f BB |
896 | |
897 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
898 | ||
899 | if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
428870ff BB |
900 | space_map_load_wait(sm); |
901 | if (!sm->sm_loaded) { | |
902 | int error = space_map_load(sm, sm_ops, SM_FREE, | |
903 | &msp->ms_smo, | |
904 | spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); | |
905 | if (error) { | |
906 | metaslab_group_sort(msp->ms_group, msp, 0); | |
907 | return (error); | |
908 | } | |
d6320ddb | 909 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
910 | space_map_walk(&msp->ms_defermap[t], |
911 | space_map_claim, sm); | |
912 | ||
913 | } | |
914 | ||
915 | /* | |
916 | * Track the bonus area as we activate new metaslabs. | |
917 | */ | |
918 | if (sm->sm_start > mg->mg_bonus_area) { | |
919 | mutex_enter(&mg->mg_lock); | |
920 | mg->mg_bonus_area = sm->sm_start; | |
921 | mutex_exit(&mg->mg_lock); | |
34dc7c2f | 922 | } |
9babb374 | 923 | |
34dc7c2f BB |
924 | metaslab_group_sort(msp->ms_group, msp, |
925 | msp->ms_weight | activation_weight); | |
926 | } | |
927 | ASSERT(sm->sm_loaded); | |
928 | ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); | |
929 | ||
930 | return (0); | |
931 | } | |
932 | ||
933 | static void | |
934 | metaslab_passivate(metaslab_t *msp, uint64_t size) | |
935 | { | |
936 | /* | |
937 | * If size < SPA_MINBLOCKSIZE, then we will not allocate from | |
938 | * this metaslab again. In that case, it had better be empty, | |
939 | * or we would be leaving space on the table. | |
940 | */ | |
941 | ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); | |
942 | metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); | |
943 | ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); | |
944 | } | |
945 | ||
946 | /* | |
947 | * Write a metaslab to disk in the context of the specified transaction group. | |
948 | */ | |
949 | void | |
950 | metaslab_sync(metaslab_t *msp, uint64_t txg) | |
951 | { | |
952 | vdev_t *vd = msp->ms_group->mg_vd; | |
953 | spa_t *spa = vd->vdev_spa; | |
428870ff | 954 | objset_t *mos = spa_meta_objset(spa); |
34dc7c2f BB |
955 | space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; |
956 | space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; | |
957 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
958 | space_map_t *sm = &msp->ms_map; | |
959 | space_map_obj_t *smo = &msp->ms_smo_syncing; | |
960 | dmu_buf_t *db; | |
961 | dmu_tx_t *tx; | |
d6320ddb | 962 | int t; |
34dc7c2f | 963 | |
428870ff BB |
964 | ASSERT(!vd->vdev_ishole); |
965 | ||
966 | if (allocmap->sm_space == 0 && freemap->sm_space == 0) | |
967 | return; | |
34dc7c2f BB |
968 | |
969 | /* | |
970 | * The only state that can actually be changing concurrently with | |
971 | * metaslab_sync() is the metaslab's ms_map. No other thread can | |
972 | * be modifying this txg's allocmap, freemap, freed_map, or smo. | |
973 | * Therefore, we only hold ms_lock to satify space_map ASSERTs. | |
974 | * We drop it whenever we call into the DMU, because the DMU | |
975 | * can call down to us (e.g. via zio_free()) at any time. | |
976 | */ | |
428870ff BB |
977 | |
978 | tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); | |
34dc7c2f BB |
979 | |
980 | if (smo->smo_object == 0) { | |
981 | ASSERT(smo->smo_objsize == 0); | |
982 | ASSERT(smo->smo_alloc == 0); | |
34dc7c2f BB |
983 | smo->smo_object = dmu_object_alloc(mos, |
984 | DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, | |
985 | DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); | |
986 | ASSERT(smo->smo_object != 0); | |
987 | dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * | |
988 | (sm->sm_start >> vd->vdev_ms_shift), | |
989 | sizeof (uint64_t), &smo->smo_object, tx); | |
34dc7c2f BB |
990 | } |
991 | ||
428870ff BB |
992 | mutex_enter(&msp->ms_lock); |
993 | ||
34dc7c2f BB |
994 | space_map_walk(freemap, space_map_add, freed_map); |
995 | ||
996 | if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= | |
997 | 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { | |
998 | /* | |
999 | * The in-core space map representation is twice as compact | |
1000 | * as the on-disk one, so it's time to condense the latter | |
1001 | * by generating a pure allocmap from first principles. | |
1002 | * | |
1003 | * This metaslab is 100% allocated, | |
1004 | * minus the content of the in-core map (sm), | |
1005 | * minus what's been freed this txg (freed_map), | |
428870ff | 1006 | * minus deferred frees (ms_defermap[]), |
34dc7c2f BB |
1007 | * minus allocations from txgs in the future |
1008 | * (because they haven't been committed yet). | |
1009 | */ | |
1010 | space_map_vacate(allocmap, NULL, NULL); | |
1011 | space_map_vacate(freemap, NULL, NULL); | |
1012 | ||
1013 | space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); | |
1014 | ||
1015 | space_map_walk(sm, space_map_remove, allocmap); | |
1016 | space_map_walk(freed_map, space_map_remove, allocmap); | |
1017 | ||
d6320ddb | 1018 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
1019 | space_map_walk(&msp->ms_defermap[t], |
1020 | space_map_remove, allocmap); | |
1021 | ||
d6320ddb | 1022 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) |
34dc7c2f BB |
1023 | space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], |
1024 | space_map_remove, allocmap); | |
1025 | ||
1026 | mutex_exit(&msp->ms_lock); | |
1027 | space_map_truncate(smo, mos, tx); | |
1028 | mutex_enter(&msp->ms_lock); | |
1029 | } | |
1030 | ||
1031 | space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); | |
1032 | space_map_sync(freemap, SM_FREE, smo, mos, tx); | |
1033 | ||
1034 | mutex_exit(&msp->ms_lock); | |
1035 | ||
1036 | VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); | |
1037 | dmu_buf_will_dirty(db, tx); | |
1038 | ASSERT3U(db->db_size, >=, sizeof (*smo)); | |
1039 | bcopy(smo, db->db_data, sizeof (*smo)); | |
1040 | dmu_buf_rele(db, FTAG); | |
1041 | ||
1042 | dmu_tx_commit(tx); | |
1043 | } | |
1044 | ||
1045 | /* | |
1046 | * Called after a transaction group has completely synced to mark | |
1047 | * all of the metaslab's free space as usable. | |
1048 | */ | |
1049 | void | |
1050 | metaslab_sync_done(metaslab_t *msp, uint64_t txg) | |
1051 | { | |
1052 | space_map_obj_t *smo = &msp->ms_smo; | |
1053 | space_map_obj_t *smosync = &msp->ms_smo_syncing; | |
1054 | space_map_t *sm = &msp->ms_map; | |
1055 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
428870ff | 1056 | space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; |
34dc7c2f BB |
1057 | metaslab_group_t *mg = msp->ms_group; |
1058 | vdev_t *vd = mg->mg_vd; | |
428870ff | 1059 | int64_t alloc_delta, defer_delta; |
d6320ddb | 1060 | int t; |
428870ff BB |
1061 | |
1062 | ASSERT(!vd->vdev_ishole); | |
34dc7c2f BB |
1063 | |
1064 | mutex_enter(&msp->ms_lock); | |
1065 | ||
1066 | /* | |
1067 | * If this metaslab is just becoming available, initialize its | |
1068 | * allocmaps and freemaps and add its capacity to the vdev. | |
1069 | */ | |
1070 | if (freed_map->sm_size == 0) { | |
d6320ddb | 1071 | for (t = 0; t < TXG_SIZE; t++) { |
34dc7c2f BB |
1072 | space_map_create(&msp->ms_allocmap[t], sm->sm_start, |
1073 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1074 | space_map_create(&msp->ms_freemap[t], sm->sm_start, | |
1075 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1076 | } | |
428870ff | 1077 | |
d6320ddb | 1078 | for (t = 0; t < TXG_DEFER_SIZE; t++) |
428870ff BB |
1079 | space_map_create(&msp->ms_defermap[t], sm->sm_start, |
1080 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
1081 | ||
1082 | vdev_space_update(vd, 0, 0, sm->sm_size); | |
34dc7c2f BB |
1083 | } |
1084 | ||
428870ff BB |
1085 | alloc_delta = smosync->smo_alloc - smo->smo_alloc; |
1086 | defer_delta = freed_map->sm_space - defer_map->sm_space; | |
1087 | ||
1088 | vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); | |
34dc7c2f BB |
1089 | |
1090 | ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); | |
1091 | ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); | |
1092 | ||
1093 | /* | |
1094 | * If there's a space_map_load() in progress, wait for it to complete | |
1095 | * so that we have a consistent view of the in-core space map. | |
428870ff BB |
1096 | * Then, add defer_map (oldest deferred frees) to this map and |
1097 | * transfer freed_map (this txg's frees) to defer_map. | |
34dc7c2f BB |
1098 | */ |
1099 | space_map_load_wait(sm); | |
428870ff BB |
1100 | space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); |
1101 | space_map_vacate(freed_map, space_map_add, defer_map); | |
34dc7c2f BB |
1102 | |
1103 | *smo = *smosync; | |
1104 | ||
428870ff BB |
1105 | msp->ms_deferspace += defer_delta; |
1106 | ASSERT3S(msp->ms_deferspace, >=, 0); | |
1107 | ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); | |
1108 | if (msp->ms_deferspace != 0) { | |
1109 | /* | |
1110 | * Keep syncing this metaslab until all deferred frees | |
1111 | * are back in circulation. | |
1112 | */ | |
1113 | vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); | |
1114 | } | |
1115 | ||
34dc7c2f BB |
1116 | /* |
1117 | * If the map is loaded but no longer active, evict it as soon as all | |
1118 | * future allocations have synced. (If we unloaded it now and then | |
1119 | * loaded a moment later, the map wouldn't reflect those allocations.) | |
1120 | */ | |
1121 | if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
1122 | int evictable = 1; | |
1123 | ||
d6320ddb | 1124 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) |
34dc7c2f BB |
1125 | if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) |
1126 | evictable = 0; | |
1127 | ||
428870ff | 1128 | if (evictable && !metaslab_debug) |
34dc7c2f BB |
1129 | space_map_unload(sm); |
1130 | } | |
1131 | ||
1132 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
1133 | ||
1134 | mutex_exit(&msp->ms_lock); | |
1135 | } | |
1136 | ||
428870ff BB |
1137 | void |
1138 | metaslab_sync_reassess(metaslab_group_t *mg) | |
1139 | { | |
1140 | vdev_t *vd = mg->mg_vd; | |
6d974228 | 1141 | int64_t failures = mg->mg_alloc_failures; |
d6320ddb | 1142 | int m; |
428870ff BB |
1143 | |
1144 | /* | |
1145 | * Re-evaluate all metaslabs which have lower offsets than the | |
1146 | * bonus area. | |
1147 | */ | |
d6320ddb | 1148 | for (m = 0; m < vd->vdev_ms_count; m++) { |
428870ff BB |
1149 | metaslab_t *msp = vd->vdev_ms[m]; |
1150 | ||
1151 | if (msp->ms_map.sm_start > mg->mg_bonus_area) | |
1152 | break; | |
1153 | ||
1154 | mutex_enter(&msp->ms_lock); | |
1155 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
1156 | mutex_exit(&msp->ms_lock); | |
1157 | } | |
1158 | ||
6d974228 GW |
1159 | atomic_add_64(&mg->mg_alloc_failures, -failures); |
1160 | ||
428870ff BB |
1161 | /* |
1162 | * Prefetch the next potential metaslabs | |
1163 | */ | |
1164 | metaslab_prefetch(mg); | |
1165 | } | |
1166 | ||
34dc7c2f BB |
1167 | static uint64_t |
1168 | metaslab_distance(metaslab_t *msp, dva_t *dva) | |
1169 | { | |
1170 | uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; | |
1171 | uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; | |
1172 | uint64_t start = msp->ms_map.sm_start >> ms_shift; | |
1173 | ||
1174 | if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) | |
1175 | return (1ULL << 63); | |
1176 | ||
1177 | if (offset < start) | |
1178 | return ((start - offset) << ms_shift); | |
1179 | if (offset > start) | |
1180 | return ((offset - start) << ms_shift); | |
1181 | return (0); | |
1182 | } | |
1183 | ||
1184 | static uint64_t | |
6d974228 GW |
1185 | metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, |
1186 | uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) | |
34dc7c2f | 1187 | { |
6d974228 | 1188 | spa_t *spa = mg->mg_vd->vdev_spa; |
34dc7c2f BB |
1189 | metaslab_t *msp = NULL; |
1190 | uint64_t offset = -1ULL; | |
1191 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
1192 | uint64_t activation_weight; | |
1193 | uint64_t target_distance; | |
1194 | int i; | |
1195 | ||
1196 | activation_weight = METASLAB_WEIGHT_PRIMARY; | |
9babb374 BB |
1197 | for (i = 0; i < d; i++) { |
1198 | if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { | |
34dc7c2f | 1199 | activation_weight = METASLAB_WEIGHT_SECONDARY; |
9babb374 BB |
1200 | break; |
1201 | } | |
1202 | } | |
34dc7c2f BB |
1203 | |
1204 | for (;;) { | |
9babb374 BB |
1205 | boolean_t was_active; |
1206 | ||
34dc7c2f BB |
1207 | mutex_enter(&mg->mg_lock); |
1208 | for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { | |
6d974228 GW |
1209 | if (msp->ms_weight < asize) { |
1210 | spa_dbgmsg(spa, "%s: failed to meet weight " | |
1211 | "requirement: vdev %llu, txg %llu, mg %p, " | |
1212 | "msp %p, psize %llu, asize %llu, " | |
1213 | "failures %llu, weight %llu", | |
1214 | spa_name(spa), mg->mg_vd->vdev_id, txg, | |
1215 | mg, msp, psize, asize, | |
1216 | mg->mg_alloc_failures, msp->ms_weight); | |
34dc7c2f BB |
1217 | mutex_exit(&mg->mg_lock); |
1218 | return (-1ULL); | |
1219 | } | |
9babb374 | 1220 | was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; |
34dc7c2f BB |
1221 | if (activation_weight == METASLAB_WEIGHT_PRIMARY) |
1222 | break; | |
1223 | ||
1224 | target_distance = min_distance + | |
1225 | (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); | |
1226 | ||
1227 | for (i = 0; i < d; i++) | |
1228 | if (metaslab_distance(msp, &dva[i]) < | |
1229 | target_distance) | |
1230 | break; | |
1231 | if (i == d) | |
1232 | break; | |
1233 | } | |
1234 | mutex_exit(&mg->mg_lock); | |
1235 | if (msp == NULL) | |
1236 | return (-1ULL); | |
1237 | ||
6d974228 GW |
1238 | /* |
1239 | * If we've already reached the allowable number of failed | |
1240 | * allocation attempts on this metaslab group then we | |
1241 | * consider skipping it. We skip it only if we're allowed | |
1242 | * to "fast" gang, the physical size is larger than | |
1243 | * a gang block, and we're attempting to allocate from | |
1244 | * the primary metaslab. | |
1245 | */ | |
1246 | if (mg->mg_alloc_failures > zfs_mg_alloc_failures && | |
1247 | CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && | |
1248 | activation_weight == METASLAB_WEIGHT_PRIMARY) { | |
1249 | spa_dbgmsg(spa, "%s: skipping metaslab group: " | |
1250 | "vdev %llu, txg %llu, mg %p, psize %llu, " | |
1251 | "asize %llu, failures %llu", spa_name(spa), | |
1252 | mg->mg_vd->vdev_id, txg, mg, psize, asize, | |
1253 | mg->mg_alloc_failures); | |
1254 | return (-1ULL); | |
1255 | } | |
1256 | ||
34dc7c2f BB |
1257 | mutex_enter(&msp->ms_lock); |
1258 | ||
1259 | /* | |
1260 | * Ensure that the metaslab we have selected is still | |
1261 | * capable of handling our request. It's possible that | |
1262 | * another thread may have changed the weight while we | |
1263 | * were blocked on the metaslab lock. | |
1264 | */ | |
6d974228 | 1265 | if (msp->ms_weight < asize || (was_active && |
9babb374 BB |
1266 | !(msp->ms_weight & METASLAB_ACTIVE_MASK) && |
1267 | activation_weight == METASLAB_WEIGHT_PRIMARY)) { | |
34dc7c2f BB |
1268 | mutex_exit(&msp->ms_lock); |
1269 | continue; | |
1270 | } | |
1271 | ||
1272 | if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && | |
1273 | activation_weight == METASLAB_WEIGHT_PRIMARY) { | |
1274 | metaslab_passivate(msp, | |
1275 | msp->ms_weight & ~METASLAB_ACTIVE_MASK); | |
1276 | mutex_exit(&msp->ms_lock); | |
1277 | continue; | |
1278 | } | |
1279 | ||
6d974228 | 1280 | if (metaslab_activate(msp, activation_weight) != 0) { |
34dc7c2f BB |
1281 | mutex_exit(&msp->ms_lock); |
1282 | continue; | |
1283 | } | |
1284 | ||
6d974228 | 1285 | if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) |
34dc7c2f BB |
1286 | break; |
1287 | ||
6d974228 GW |
1288 | atomic_inc_64(&mg->mg_alloc_failures); |
1289 | ||
428870ff | 1290 | metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); |
34dc7c2f BB |
1291 | |
1292 | mutex_exit(&msp->ms_lock); | |
1293 | } | |
1294 | ||
1295 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) | |
1296 | vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); | |
1297 | ||
6d974228 | 1298 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); |
34dc7c2f BB |
1299 | |
1300 | mutex_exit(&msp->ms_lock); | |
1301 | ||
1302 | return (offset); | |
1303 | } | |
1304 | ||
1305 | /* | |
1306 | * Allocate a block for the specified i/o. | |
1307 | */ | |
1308 | static int | |
1309 | metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, | |
b128c09f | 1310 | dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) |
34dc7c2f | 1311 | { |
920dd524 | 1312 | metaslab_group_t *mg, *fast_mg, *rotor; |
34dc7c2f BB |
1313 | vdev_t *vd; |
1314 | int dshift = 3; | |
1315 | int all_zero; | |
fb5f0bc8 BB |
1316 | int zio_lock = B_FALSE; |
1317 | boolean_t allocatable; | |
34dc7c2f BB |
1318 | uint64_t offset = -1ULL; |
1319 | uint64_t asize; | |
1320 | uint64_t distance; | |
1321 | ||
1322 | ASSERT(!DVA_IS_VALID(&dva[d])); | |
1323 | ||
1324 | /* | |
1325 | * For testing, make some blocks above a certain size be gang blocks. | |
1326 | */ | |
428870ff | 1327 | if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) |
34dc7c2f BB |
1328 | return (ENOSPC); |
1329 | ||
920dd524 ED |
1330 | if (flags & METASLAB_FASTWRITE) |
1331 | mutex_enter(&mc->mc_fastwrite_lock); | |
1332 | ||
34dc7c2f BB |
1333 | /* |
1334 | * Start at the rotor and loop through all mgs until we find something. | |
428870ff | 1335 | * Note that there's no locking on mc_rotor or mc_aliquot because |
34dc7c2f BB |
1336 | * nothing actually breaks if we miss a few updates -- we just won't |
1337 | * allocate quite as evenly. It all balances out over time. | |
1338 | * | |
1339 | * If we are doing ditto or log blocks, try to spread them across | |
1340 | * consecutive vdevs. If we're forced to reuse a vdev before we've | |
1341 | * allocated all of our ditto blocks, then try and spread them out on | |
1342 | * that vdev as much as possible. If it turns out to not be possible, | |
1343 | * gradually lower our standards until anything becomes acceptable. | |
1344 | * Also, allocating on consecutive vdevs (as opposed to random vdevs) | |
1345 | * gives us hope of containing our fault domains to something we're | |
1346 | * able to reason about. Otherwise, any two top-level vdev failures | |
1347 | * will guarantee the loss of data. With consecutive allocation, | |
1348 | * only two adjacent top-level vdev failures will result in data loss. | |
1349 | * | |
1350 | * If we are doing gang blocks (hintdva is non-NULL), try to keep | |
1351 | * ourselves on the same vdev as our gang block header. That | |
1352 | * way, we can hope for locality in vdev_cache, plus it makes our | |
1353 | * fault domains something tractable. | |
1354 | */ | |
1355 | if (hintdva) { | |
1356 | vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); | |
428870ff BB |
1357 | |
1358 | /* | |
1359 | * It's possible the vdev we're using as the hint no | |
1360 | * longer exists (i.e. removed). Consult the rotor when | |
1361 | * all else fails. | |
1362 | */ | |
1363 | if (vd != NULL) { | |
34dc7c2f | 1364 | mg = vd->vdev_mg; |
428870ff BB |
1365 | |
1366 | if (flags & METASLAB_HINTBP_AVOID && | |
1367 | mg->mg_next != NULL) | |
1368 | mg = mg->mg_next; | |
1369 | } else { | |
1370 | mg = mc->mc_rotor; | |
1371 | } | |
34dc7c2f BB |
1372 | } else if (d != 0) { |
1373 | vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); | |
1374 | mg = vd->vdev_mg->mg_next; | |
920dd524 ED |
1375 | } else if (flags & METASLAB_FASTWRITE) { |
1376 | mg = fast_mg = mc->mc_rotor; | |
1377 | ||
1378 | do { | |
1379 | if (fast_mg->mg_vd->vdev_pending_fastwrite < | |
1380 | mg->mg_vd->vdev_pending_fastwrite) | |
1381 | mg = fast_mg; | |
1382 | } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); | |
1383 | ||
34dc7c2f BB |
1384 | } else { |
1385 | mg = mc->mc_rotor; | |
1386 | } | |
1387 | ||
1388 | /* | |
428870ff BB |
1389 | * If the hint put us into the wrong metaslab class, or into a |
1390 | * metaslab group that has been passivated, just follow the rotor. | |
34dc7c2f | 1391 | */ |
428870ff | 1392 | if (mg->mg_class != mc || mg->mg_activation_count <= 0) |
34dc7c2f BB |
1393 | mg = mc->mc_rotor; |
1394 | ||
1395 | rotor = mg; | |
1396 | top: | |
1397 | all_zero = B_TRUE; | |
1398 | do { | |
428870ff BB |
1399 | ASSERT(mg->mg_activation_count == 1); |
1400 | ||
34dc7c2f | 1401 | vd = mg->mg_vd; |
fb5f0bc8 | 1402 | |
34dc7c2f | 1403 | /* |
b128c09f | 1404 | * Don't allocate from faulted devices. |
34dc7c2f | 1405 | */ |
fb5f0bc8 BB |
1406 | if (zio_lock) { |
1407 | spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); | |
1408 | allocatable = vdev_allocatable(vd); | |
1409 | spa_config_exit(spa, SCL_ZIO, FTAG); | |
1410 | } else { | |
1411 | allocatable = vdev_allocatable(vd); | |
1412 | } | |
1413 | if (!allocatable) | |
34dc7c2f | 1414 | goto next; |
fb5f0bc8 | 1415 | |
34dc7c2f BB |
1416 | /* |
1417 | * Avoid writing single-copy data to a failing vdev | |
1418 | */ | |
1419 | if ((vd->vdev_stat.vs_write_errors > 0 || | |
1420 | vd->vdev_state < VDEV_STATE_HEALTHY) && | |
1421 | d == 0 && dshift == 3) { | |
1422 | all_zero = B_FALSE; | |
1423 | goto next; | |
1424 | } | |
1425 | ||
1426 | ASSERT(mg->mg_class == mc); | |
1427 | ||
1428 | distance = vd->vdev_asize >> dshift; | |
1429 | if (distance <= (1ULL << vd->vdev_ms_shift)) | |
1430 | distance = 0; | |
1431 | else | |
1432 | all_zero = B_FALSE; | |
1433 | ||
1434 | asize = vdev_psize_to_asize(vd, psize); | |
1435 | ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); | |
1436 | ||
6d974228 GW |
1437 | offset = metaslab_group_alloc(mg, psize, asize, txg, distance, |
1438 | dva, d, flags); | |
34dc7c2f BB |
1439 | if (offset != -1ULL) { |
1440 | /* | |
1441 | * If we've just selected this metaslab group, | |
1442 | * figure out whether the corresponding vdev is | |
1443 | * over- or under-used relative to the pool, | |
1444 | * and set an allocation bias to even it out. | |
1445 | */ | |
428870ff | 1446 | if (mc->mc_aliquot == 0) { |
34dc7c2f | 1447 | vdev_stat_t *vs = &vd->vdev_stat; |
428870ff | 1448 | int64_t vu, cu; |
34dc7c2f | 1449 | |
6d974228 GW |
1450 | vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); |
1451 | cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); | |
34dc7c2f BB |
1452 | |
1453 | /* | |
6d974228 GW |
1454 | * Calculate how much more or less we should |
1455 | * try to allocate from this device during | |
1456 | * this iteration around the rotor. | |
1457 | * For example, if a device is 80% full | |
1458 | * and the pool is 20% full then we should | |
1459 | * reduce allocations by 60% on this device. | |
1460 | * | |
1461 | * mg_bias = (20 - 80) * 512K / 100 = -307K | |
1462 | * | |
1463 | * This reduces allocations by 307K for this | |
1464 | * iteration. | |
34dc7c2f | 1465 | */ |
428870ff | 1466 | mg->mg_bias = ((cu - vu) * |
6d974228 | 1467 | (int64_t)mg->mg_aliquot) / 100; |
34dc7c2f BB |
1468 | } |
1469 | ||
920dd524 ED |
1470 | if ((flags & METASLAB_FASTWRITE) || |
1471 | atomic_add_64_nv(&mc->mc_aliquot, asize) >= | |
34dc7c2f BB |
1472 | mg->mg_aliquot + mg->mg_bias) { |
1473 | mc->mc_rotor = mg->mg_next; | |
428870ff | 1474 | mc->mc_aliquot = 0; |
34dc7c2f BB |
1475 | } |
1476 | ||
1477 | DVA_SET_VDEV(&dva[d], vd->vdev_id); | |
1478 | DVA_SET_OFFSET(&dva[d], offset); | |
b128c09f | 1479 | DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); |
34dc7c2f BB |
1480 | DVA_SET_ASIZE(&dva[d], asize); |
1481 | ||
920dd524 ED |
1482 | if (flags & METASLAB_FASTWRITE) { |
1483 | atomic_add_64(&vd->vdev_pending_fastwrite, | |
1484 | psize); | |
1485 | mutex_exit(&mc->mc_fastwrite_lock); | |
1486 | } | |
1487 | ||
34dc7c2f BB |
1488 | return (0); |
1489 | } | |
1490 | next: | |
1491 | mc->mc_rotor = mg->mg_next; | |
428870ff | 1492 | mc->mc_aliquot = 0; |
34dc7c2f BB |
1493 | } while ((mg = mg->mg_next) != rotor); |
1494 | ||
1495 | if (!all_zero) { | |
1496 | dshift++; | |
1497 | ASSERT(dshift < 64); | |
1498 | goto top; | |
1499 | } | |
1500 | ||
9babb374 | 1501 | if (!allocatable && !zio_lock) { |
fb5f0bc8 BB |
1502 | dshift = 3; |
1503 | zio_lock = B_TRUE; | |
1504 | goto top; | |
1505 | } | |
1506 | ||
34dc7c2f BB |
1507 | bzero(&dva[d], sizeof (dva_t)); |
1508 | ||
920dd524 ED |
1509 | if (flags & METASLAB_FASTWRITE) |
1510 | mutex_exit(&mc->mc_fastwrite_lock); | |
34dc7c2f BB |
1511 | return (ENOSPC); |
1512 | } | |
1513 | ||
1514 | /* | |
1515 | * Free the block represented by DVA in the context of the specified | |
1516 | * transaction group. | |
1517 | */ | |
1518 | static void | |
1519 | metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) | |
1520 | { | |
1521 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1522 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1523 | uint64_t size = DVA_GET_ASIZE(dva); | |
1524 | vdev_t *vd; | |
1525 | metaslab_t *msp; | |
1526 | ||
1527 | ASSERT(DVA_IS_VALID(dva)); | |
1528 | ||
1529 | if (txg > spa_freeze_txg(spa)) | |
1530 | return; | |
1531 | ||
1532 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1533 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { | |
1534 | cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", | |
1535 | (u_longlong_t)vdev, (u_longlong_t)offset); | |
1536 | ASSERT(0); | |
1537 | return; | |
1538 | } | |
1539 | ||
1540 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1541 | ||
1542 | if (DVA_GET_GANG(dva)) | |
1543 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1544 | ||
1545 | mutex_enter(&msp->ms_lock); | |
1546 | ||
1547 | if (now) { | |
1548 | space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], | |
1549 | offset, size); | |
1550 | space_map_free(&msp->ms_map, offset, size); | |
1551 | } else { | |
1552 | if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) | |
1553 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1554 | space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); | |
34dc7c2f BB |
1555 | } |
1556 | ||
1557 | mutex_exit(&msp->ms_lock); | |
1558 | } | |
1559 | ||
1560 | /* | |
1561 | * Intent log support: upon opening the pool after a crash, notify the SPA | |
1562 | * of blocks that the intent log has allocated for immediate write, but | |
1563 | * which are still considered free by the SPA because the last transaction | |
1564 | * group didn't commit yet. | |
1565 | */ | |
1566 | static int | |
1567 | metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) | |
1568 | { | |
1569 | uint64_t vdev = DVA_GET_VDEV(dva); | |
1570 | uint64_t offset = DVA_GET_OFFSET(dva); | |
1571 | uint64_t size = DVA_GET_ASIZE(dva); | |
1572 | vdev_t *vd; | |
1573 | metaslab_t *msp; | |
428870ff | 1574 | int error = 0; |
34dc7c2f BB |
1575 | |
1576 | ASSERT(DVA_IS_VALID(dva)); | |
1577 | ||
1578 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
1579 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) | |
1580 | return (ENXIO); | |
1581 | ||
1582 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1583 | ||
1584 | if (DVA_GET_GANG(dva)) | |
1585 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
1586 | ||
1587 | mutex_enter(&msp->ms_lock); | |
1588 | ||
428870ff | 1589 | if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) |
6d974228 | 1590 | error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); |
428870ff BB |
1591 | |
1592 | if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) | |
1593 | error = ENOENT; | |
1594 | ||
b128c09f | 1595 | if (error || txg == 0) { /* txg == 0 indicates dry run */ |
34dc7c2f BB |
1596 | mutex_exit(&msp->ms_lock); |
1597 | return (error); | |
1598 | } | |
1599 | ||
34dc7c2f | 1600 | space_map_claim(&msp->ms_map, offset, size); |
b128c09f | 1601 | |
fb5f0bc8 | 1602 | if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ |
b128c09f BB |
1603 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) |
1604 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
1605 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | |
1606 | } | |
34dc7c2f BB |
1607 | |
1608 | mutex_exit(&msp->ms_lock); | |
1609 | ||
1610 | return (0); | |
1611 | } | |
1612 | ||
1613 | int | |
1614 | metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, | |
b128c09f | 1615 | int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) |
34dc7c2f BB |
1616 | { |
1617 | dva_t *dva = bp->blk_dva; | |
1618 | dva_t *hintdva = hintbp->blk_dva; | |
d6320ddb | 1619 | int d, error = 0; |
34dc7c2f | 1620 | |
b128c09f | 1621 | ASSERT(bp->blk_birth == 0); |
428870ff | 1622 | ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); |
b128c09f BB |
1623 | |
1624 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1625 | ||
1626 | if (mc->mc_rotor == NULL) { /* no vdevs in this class */ | |
1627 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
34dc7c2f | 1628 | return (ENOSPC); |
b128c09f | 1629 | } |
34dc7c2f BB |
1630 | |
1631 | ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); | |
1632 | ASSERT(BP_GET_NDVAS(bp) == 0); | |
1633 | ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); | |
1634 | ||
d6320ddb | 1635 | for (d = 0; d < ndvas; d++) { |
34dc7c2f | 1636 | error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, |
b128c09f | 1637 | txg, flags); |
34dc7c2f BB |
1638 | if (error) { |
1639 | for (d--; d >= 0; d--) { | |
1640 | metaslab_free_dva(spa, &dva[d], txg, B_TRUE); | |
1641 | bzero(&dva[d], sizeof (dva_t)); | |
1642 | } | |
b128c09f | 1643 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
34dc7c2f BB |
1644 | return (error); |
1645 | } | |
1646 | } | |
1647 | ASSERT(error == 0); | |
1648 | ASSERT(BP_GET_NDVAS(bp) == ndvas); | |
1649 | ||
b128c09f BB |
1650 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
1651 | ||
428870ff | 1652 | BP_SET_BIRTH(bp, txg, txg); |
b128c09f | 1653 | |
34dc7c2f BB |
1654 | return (0); |
1655 | } | |
1656 | ||
1657 | void | |
1658 | metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) | |
1659 | { | |
1660 | const dva_t *dva = bp->blk_dva; | |
d6320ddb | 1661 | int d, ndvas = BP_GET_NDVAS(bp); |
34dc7c2f BB |
1662 | |
1663 | ASSERT(!BP_IS_HOLE(bp)); | |
428870ff | 1664 | ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); |
b128c09f BB |
1665 | |
1666 | spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); | |
34dc7c2f | 1667 | |
d6320ddb | 1668 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1669 | metaslab_free_dva(spa, &dva[d], txg, now); |
b128c09f BB |
1670 | |
1671 | spa_config_exit(spa, SCL_FREE, FTAG); | |
34dc7c2f BB |
1672 | } |
1673 | ||
1674 | int | |
1675 | metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) | |
1676 | { | |
1677 | const dva_t *dva = bp->blk_dva; | |
1678 | int ndvas = BP_GET_NDVAS(bp); | |
d6320ddb | 1679 | int d, error = 0; |
34dc7c2f BB |
1680 | |
1681 | ASSERT(!BP_IS_HOLE(bp)); | |
1682 | ||
b128c09f BB |
1683 | if (txg != 0) { |
1684 | /* | |
1685 | * First do a dry run to make sure all DVAs are claimable, | |
1686 | * so we don't have to unwind from partial failures below. | |
1687 | */ | |
1688 | if ((error = metaslab_claim(spa, bp, 0)) != 0) | |
1689 | return (error); | |
1690 | } | |
1691 | ||
1692 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1693 | ||
d6320ddb | 1694 | for (d = 0; d < ndvas; d++) |
34dc7c2f | 1695 | if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) |
b128c09f BB |
1696 | break; |
1697 | ||
1698 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
1699 | ||
1700 | ASSERT(error == 0 || txg == 0); | |
34dc7c2f | 1701 | |
b128c09f | 1702 | return (error); |
34dc7c2f | 1703 | } |
920dd524 ED |
1704 | |
1705 | void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) | |
1706 | { | |
1707 | const dva_t *dva = bp->blk_dva; | |
1708 | int ndvas = BP_GET_NDVAS(bp); | |
1709 | uint64_t psize = BP_GET_PSIZE(bp); | |
1710 | int d; | |
1711 | vdev_t *vd; | |
1712 | ||
1713 | ASSERT(!BP_IS_HOLE(bp)); | |
1714 | ASSERT(psize > 0); | |
1715 | ||
1716 | spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); | |
1717 | ||
1718 | for (d = 0; d < ndvas; d++) { | |
1719 | if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) | |
1720 | continue; | |
1721 | atomic_add_64(&vd->vdev_pending_fastwrite, psize); | |
1722 | } | |
1723 | ||
1724 | spa_config_exit(spa, SCL_VDEV, FTAG); | |
1725 | } | |
1726 | ||
1727 | void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) | |
1728 | { | |
1729 | const dva_t *dva = bp->blk_dva; | |
1730 | int ndvas = BP_GET_NDVAS(bp); | |
1731 | uint64_t psize = BP_GET_PSIZE(bp); | |
1732 | int d; | |
1733 | vdev_t *vd; | |
1734 | ||
1735 | ASSERT(!BP_IS_HOLE(bp)); | |
1736 | ASSERT(psize > 0); | |
1737 | ||
1738 | spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); | |
1739 | ||
1740 | for (d = 0; d < ndvas; d++) { | |
1741 | if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) | |
1742 | continue; | |
1743 | ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); | |
1744 | atomic_sub_64(&vd->vdev_pending_fastwrite, psize); | |
1745 | } | |
1746 | ||
1747 | spa_config_exit(spa, SCL_VDEV, FTAG); | |
1748 | } |