]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
b128c09f | 22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/zfs_context.h> |
27 | #include <sys/spa_impl.h> | |
28 | #include <sys/dmu.h> | |
29 | #include <sys/dmu_tx.h> | |
30 | #include <sys/space_map.h> | |
31 | #include <sys/metaslab_impl.h> | |
32 | #include <sys/vdev_impl.h> | |
33 | #include <sys/zio.h> | |
34 | ||
35 | uint64_t metaslab_aliquot = 512ULL << 10; | |
36 | uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ | |
37 | ||
38 | /* | |
39 | * ========================================================================== | |
40 | * Metaslab classes | |
41 | * ========================================================================== | |
42 | */ | |
43 | metaslab_class_t * | |
44 | metaslab_class_create(void) | |
45 | { | |
46 | metaslab_class_t *mc; | |
47 | ||
48 | mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); | |
49 | ||
50 | mc->mc_rotor = NULL; | |
51 | ||
52 | return (mc); | |
53 | } | |
54 | ||
55 | void | |
56 | metaslab_class_destroy(metaslab_class_t *mc) | |
57 | { | |
58 | metaslab_group_t *mg; | |
59 | ||
60 | while ((mg = mc->mc_rotor) != NULL) { | |
61 | metaslab_class_remove(mc, mg); | |
62 | metaslab_group_destroy(mg); | |
63 | } | |
64 | ||
65 | kmem_free(mc, sizeof (metaslab_class_t)); | |
66 | } | |
67 | ||
68 | void | |
69 | metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) | |
70 | { | |
71 | metaslab_group_t *mgprev, *mgnext; | |
72 | ||
73 | ASSERT(mg->mg_class == NULL); | |
74 | ||
75 | if ((mgprev = mc->mc_rotor) == NULL) { | |
76 | mg->mg_prev = mg; | |
77 | mg->mg_next = mg; | |
78 | } else { | |
79 | mgnext = mgprev->mg_next; | |
80 | mg->mg_prev = mgprev; | |
81 | mg->mg_next = mgnext; | |
82 | mgprev->mg_next = mg; | |
83 | mgnext->mg_prev = mg; | |
84 | } | |
85 | mc->mc_rotor = mg; | |
86 | mg->mg_class = mc; | |
87 | } | |
88 | ||
89 | void | |
90 | metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) | |
91 | { | |
92 | metaslab_group_t *mgprev, *mgnext; | |
93 | ||
94 | ASSERT(mg->mg_class == mc); | |
95 | ||
96 | mgprev = mg->mg_prev; | |
97 | mgnext = mg->mg_next; | |
98 | ||
99 | if (mg == mgnext) { | |
100 | mc->mc_rotor = NULL; | |
101 | } else { | |
102 | mc->mc_rotor = mgnext; | |
103 | mgprev->mg_next = mgnext; | |
104 | mgnext->mg_prev = mgprev; | |
105 | } | |
106 | ||
107 | mg->mg_prev = NULL; | |
108 | mg->mg_next = NULL; | |
109 | mg->mg_class = NULL; | |
110 | } | |
111 | ||
112 | /* | |
113 | * ========================================================================== | |
114 | * Metaslab groups | |
115 | * ========================================================================== | |
116 | */ | |
117 | static int | |
118 | metaslab_compare(const void *x1, const void *x2) | |
119 | { | |
120 | const metaslab_t *m1 = x1; | |
121 | const metaslab_t *m2 = x2; | |
122 | ||
123 | if (m1->ms_weight < m2->ms_weight) | |
124 | return (1); | |
125 | if (m1->ms_weight > m2->ms_weight) | |
126 | return (-1); | |
127 | ||
128 | /* | |
129 | * If the weights are identical, use the offset to force uniqueness. | |
130 | */ | |
131 | if (m1->ms_map.sm_start < m2->ms_map.sm_start) | |
132 | return (-1); | |
133 | if (m1->ms_map.sm_start > m2->ms_map.sm_start) | |
134 | return (1); | |
135 | ||
136 | ASSERT3P(m1, ==, m2); | |
137 | ||
138 | return (0); | |
139 | } | |
140 | ||
141 | metaslab_group_t * | |
142 | metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) | |
143 | { | |
144 | metaslab_group_t *mg; | |
145 | ||
146 | mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); | |
147 | mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); | |
148 | avl_create(&mg->mg_metaslab_tree, metaslab_compare, | |
149 | sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); | |
150 | mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); | |
151 | mg->mg_vd = vd; | |
152 | metaslab_class_add(mc, mg); | |
153 | ||
154 | return (mg); | |
155 | } | |
156 | ||
157 | void | |
158 | metaslab_group_destroy(metaslab_group_t *mg) | |
159 | { | |
160 | avl_destroy(&mg->mg_metaslab_tree); | |
161 | mutex_destroy(&mg->mg_lock); | |
162 | kmem_free(mg, sizeof (metaslab_group_t)); | |
163 | } | |
164 | ||
165 | static void | |
166 | metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) | |
167 | { | |
168 | mutex_enter(&mg->mg_lock); | |
169 | ASSERT(msp->ms_group == NULL); | |
170 | msp->ms_group = mg; | |
171 | msp->ms_weight = 0; | |
172 | avl_add(&mg->mg_metaslab_tree, msp); | |
173 | mutex_exit(&mg->mg_lock); | |
174 | } | |
175 | ||
176 | static void | |
177 | metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) | |
178 | { | |
179 | mutex_enter(&mg->mg_lock); | |
180 | ASSERT(msp->ms_group == mg); | |
181 | avl_remove(&mg->mg_metaslab_tree, msp); | |
182 | msp->ms_group = NULL; | |
183 | mutex_exit(&mg->mg_lock); | |
184 | } | |
185 | ||
186 | static void | |
187 | metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) | |
188 | { | |
189 | /* | |
190 | * Although in principle the weight can be any value, in | |
191 | * practice we do not use values in the range [1, 510]. | |
192 | */ | |
193 | ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); | |
194 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
195 | ||
196 | mutex_enter(&mg->mg_lock); | |
197 | ASSERT(msp->ms_group == mg); | |
198 | avl_remove(&mg->mg_metaslab_tree, msp); | |
199 | msp->ms_weight = weight; | |
200 | avl_add(&mg->mg_metaslab_tree, msp); | |
201 | mutex_exit(&mg->mg_lock); | |
202 | } | |
203 | ||
204 | /* | |
205 | * ========================================================================== | |
206 | * The first-fit block allocator | |
207 | * ========================================================================== | |
208 | */ | |
209 | static void | |
210 | metaslab_ff_load(space_map_t *sm) | |
211 | { | |
212 | ASSERT(sm->sm_ppd == NULL); | |
213 | sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); | |
214 | } | |
215 | ||
216 | static void | |
217 | metaslab_ff_unload(space_map_t *sm) | |
218 | { | |
219 | kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); | |
220 | sm->sm_ppd = NULL; | |
221 | } | |
222 | ||
223 | static uint64_t | |
224 | metaslab_ff_alloc(space_map_t *sm, uint64_t size) | |
225 | { | |
226 | avl_tree_t *t = &sm->sm_root; | |
227 | uint64_t align = size & -size; | |
228 | uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; | |
229 | space_seg_t *ss, ssearch; | |
230 | avl_index_t where; | |
231 | ||
232 | ssearch.ss_start = *cursor; | |
233 | ssearch.ss_end = *cursor + size; | |
234 | ||
235 | ss = avl_find(t, &ssearch, &where); | |
236 | if (ss == NULL) | |
237 | ss = avl_nearest(t, where, AVL_AFTER); | |
238 | ||
239 | while (ss != NULL) { | |
240 | uint64_t offset = P2ROUNDUP(ss->ss_start, align); | |
241 | ||
242 | if (offset + size <= ss->ss_end) { | |
243 | *cursor = offset + size; | |
244 | return (offset); | |
245 | } | |
246 | ss = AVL_NEXT(t, ss); | |
247 | } | |
248 | ||
249 | /* | |
250 | * If we know we've searched the whole map (*cursor == 0), give up. | |
251 | * Otherwise, reset the cursor to the beginning and try again. | |
252 | */ | |
253 | if (*cursor == 0) | |
254 | return (-1ULL); | |
255 | ||
256 | *cursor = 0; | |
257 | return (metaslab_ff_alloc(sm, size)); | |
258 | } | |
259 | ||
260 | /* ARGSUSED */ | |
261 | static void | |
262 | metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) | |
263 | { | |
264 | /* No need to update cursor */ | |
265 | } | |
266 | ||
267 | /* ARGSUSED */ | |
268 | static void | |
269 | metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) | |
270 | { | |
271 | /* No need to update cursor */ | |
272 | } | |
273 | ||
274 | static space_map_ops_t metaslab_ff_ops = { | |
275 | metaslab_ff_load, | |
276 | metaslab_ff_unload, | |
277 | metaslab_ff_alloc, | |
278 | metaslab_ff_claim, | |
279 | metaslab_ff_free | |
280 | }; | |
281 | ||
282 | /* | |
283 | * ========================================================================== | |
284 | * Metaslabs | |
285 | * ========================================================================== | |
286 | */ | |
287 | metaslab_t * | |
288 | metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, | |
289 | uint64_t start, uint64_t size, uint64_t txg) | |
290 | { | |
291 | vdev_t *vd = mg->mg_vd; | |
292 | metaslab_t *msp; | |
293 | ||
294 | msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); | |
295 | mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); | |
296 | ||
297 | msp->ms_smo_syncing = *smo; | |
298 | ||
299 | /* | |
300 | * We create the main space map here, but we don't create the | |
301 | * allocmaps and freemaps until metaslab_sync_done(). This serves | |
302 | * two purposes: it allows metaslab_sync_done() to detect the | |
303 | * addition of new space; and for debugging, it ensures that we'd | |
304 | * data fault on any attempt to use this metaslab before it's ready. | |
305 | */ | |
306 | space_map_create(&msp->ms_map, start, size, | |
307 | vd->vdev_ashift, &msp->ms_lock); | |
308 | ||
309 | metaslab_group_add(mg, msp); | |
310 | ||
311 | /* | |
312 | * If we're opening an existing pool (txg == 0) or creating | |
313 | * a new one (txg == TXG_INITIAL), all space is available now. | |
314 | * If we're adding space to an existing pool, the new space | |
315 | * does not become available until after this txg has synced. | |
316 | */ | |
317 | if (txg <= TXG_INITIAL) | |
318 | metaslab_sync_done(msp, 0); | |
319 | ||
320 | if (txg != 0) { | |
321 | /* | |
322 | * The vdev is dirty, but the metaslab isn't -- it just needs | |
323 | * to have metaslab_sync_done() invoked from vdev_sync_done(). | |
324 | * [We could just dirty the metaslab, but that would cause us | |
325 | * to allocate a space map object for it, which is wasteful | |
326 | * and would mess up the locality logic in metaslab_weight().] | |
327 | */ | |
328 | ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); | |
329 | vdev_dirty(vd, 0, NULL, txg); | |
330 | vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); | |
331 | } | |
332 | ||
333 | return (msp); | |
334 | } | |
335 | ||
336 | void | |
337 | metaslab_fini(metaslab_t *msp) | |
338 | { | |
339 | metaslab_group_t *mg = msp->ms_group; | |
340 | int t; | |
341 | ||
342 | vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, | |
343 | -msp->ms_smo.smo_alloc, B_TRUE); | |
344 | ||
345 | metaslab_group_remove(mg, msp); | |
346 | ||
347 | mutex_enter(&msp->ms_lock); | |
348 | ||
349 | space_map_unload(&msp->ms_map); | |
350 | space_map_destroy(&msp->ms_map); | |
351 | ||
352 | for (t = 0; t < TXG_SIZE; t++) { | |
353 | space_map_destroy(&msp->ms_allocmap[t]); | |
354 | space_map_destroy(&msp->ms_freemap[t]); | |
355 | } | |
356 | ||
357 | mutex_exit(&msp->ms_lock); | |
358 | mutex_destroy(&msp->ms_lock); | |
359 | ||
360 | kmem_free(msp, sizeof (metaslab_t)); | |
361 | } | |
362 | ||
/*
 * The top two bits of ms_weight flag a metaslab as active (primary or
 * secondary).  Because they are the most significant bits, an active
 * metaslab always outweighs an inactive one.
 */
#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

/* Weight multiplier for metaslabs that already have a space map object. */
#define	METASLAB_SMO_BONUS_MULTIPLIER	2
368 | ||
369 | static uint64_t | |
370 | metaslab_weight(metaslab_t *msp) | |
371 | { | |
372 | metaslab_group_t *mg = msp->ms_group; | |
373 | space_map_t *sm = &msp->ms_map; | |
374 | space_map_obj_t *smo = &msp->ms_smo; | |
375 | vdev_t *vd = mg->mg_vd; | |
376 | uint64_t weight, space; | |
377 | ||
378 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
379 | ||
380 | /* | |
381 | * The baseline weight is the metaslab's free space. | |
382 | */ | |
383 | space = sm->sm_size - smo->smo_alloc; | |
384 | weight = space; | |
385 | ||
386 | /* | |
387 | * Modern disks have uniform bit density and constant angular velocity. | |
388 | * Therefore, the outer recording zones are faster (higher bandwidth) | |
389 | * than the inner zones by the ratio of outer to inner track diameter, | |
390 | * which is typically around 2:1. We account for this by assigning | |
391 | * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). | |
392 | * In effect, this means that we'll select the metaslab with the most | |
393 | * free bandwidth rather than simply the one with the most free space. | |
394 | */ | |
395 | weight = 2 * weight - | |
396 | ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; | |
397 | ASSERT(weight >= space && weight <= 2 * space); | |
398 | ||
399 | /* | |
400 | * For locality, assign higher weight to metaslabs we've used before. | |
401 | */ | |
402 | if (smo->smo_object != 0) | |
403 | weight *= METASLAB_SMO_BONUS_MULTIPLIER; | |
404 | ASSERT(weight >= space && | |
405 | weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); | |
406 | ||
407 | /* | |
408 | * If this metaslab is one we're actively using, adjust its weight to | |
409 | * make it preferable to any inactive metaslab so we'll polish it off. | |
410 | */ | |
411 | weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); | |
412 | ||
413 | return (weight); | |
414 | } | |
415 | ||
416 | static int | |
417 | metaslab_activate(metaslab_t *msp, uint64_t activation_weight) | |
418 | { | |
419 | space_map_t *sm = &msp->ms_map; | |
420 | ||
421 | ASSERT(MUTEX_HELD(&msp->ms_lock)); | |
422 | ||
423 | if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
424 | int error = space_map_load(sm, &metaslab_ff_ops, | |
425 | SM_FREE, &msp->ms_smo, | |
426 | msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); | |
427 | if (error) { | |
428 | metaslab_group_sort(msp->ms_group, msp, 0); | |
429 | return (error); | |
430 | } | |
431 | metaslab_group_sort(msp->ms_group, msp, | |
432 | msp->ms_weight | activation_weight); | |
433 | } | |
434 | ASSERT(sm->sm_loaded); | |
435 | ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); | |
436 | ||
437 | return (0); | |
438 | } | |
439 | ||
440 | static void | |
441 | metaslab_passivate(metaslab_t *msp, uint64_t size) | |
442 | { | |
443 | /* | |
444 | * If size < SPA_MINBLOCKSIZE, then we will not allocate from | |
445 | * this metaslab again. In that case, it had better be empty, | |
446 | * or we would be leaving space on the table. | |
447 | */ | |
448 | ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); | |
449 | metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); | |
450 | ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); | |
451 | } | |
452 | ||
453 | /* | |
454 | * Write a metaslab to disk in the context of the specified transaction group. | |
455 | */ | |
456 | void | |
457 | metaslab_sync(metaslab_t *msp, uint64_t txg) | |
458 | { | |
459 | vdev_t *vd = msp->ms_group->mg_vd; | |
460 | spa_t *spa = vd->vdev_spa; | |
461 | objset_t *mos = spa->spa_meta_objset; | |
462 | space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; | |
463 | space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; | |
464 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
465 | space_map_t *sm = &msp->ms_map; | |
466 | space_map_obj_t *smo = &msp->ms_smo_syncing; | |
467 | dmu_buf_t *db; | |
468 | dmu_tx_t *tx; | |
469 | int t; | |
470 | ||
471 | tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); | |
472 | ||
473 | /* | |
474 | * The only state that can actually be changing concurrently with | |
475 | * metaslab_sync() is the metaslab's ms_map. No other thread can | |
476 | * be modifying this txg's allocmap, freemap, freed_map, or smo. | |
477 | * Therefore, we only hold ms_lock to satify space_map ASSERTs. | |
478 | * We drop it whenever we call into the DMU, because the DMU | |
479 | * can call down to us (e.g. via zio_free()) at any time. | |
480 | */ | |
481 | mutex_enter(&msp->ms_lock); | |
482 | ||
483 | if (smo->smo_object == 0) { | |
484 | ASSERT(smo->smo_objsize == 0); | |
485 | ASSERT(smo->smo_alloc == 0); | |
486 | mutex_exit(&msp->ms_lock); | |
487 | smo->smo_object = dmu_object_alloc(mos, | |
488 | DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, | |
489 | DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); | |
490 | ASSERT(smo->smo_object != 0); | |
491 | dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * | |
492 | (sm->sm_start >> vd->vdev_ms_shift), | |
493 | sizeof (uint64_t), &smo->smo_object, tx); | |
494 | mutex_enter(&msp->ms_lock); | |
495 | } | |
496 | ||
497 | space_map_walk(freemap, space_map_add, freed_map); | |
498 | ||
499 | if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= | |
500 | 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { | |
501 | /* | |
502 | * The in-core space map representation is twice as compact | |
503 | * as the on-disk one, so it's time to condense the latter | |
504 | * by generating a pure allocmap from first principles. | |
505 | * | |
506 | * This metaslab is 100% allocated, | |
507 | * minus the content of the in-core map (sm), | |
508 | * minus what's been freed this txg (freed_map), | |
509 | * minus allocations from txgs in the future | |
510 | * (because they haven't been committed yet). | |
511 | */ | |
512 | space_map_vacate(allocmap, NULL, NULL); | |
513 | space_map_vacate(freemap, NULL, NULL); | |
514 | ||
515 | space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); | |
516 | ||
517 | space_map_walk(sm, space_map_remove, allocmap); | |
518 | space_map_walk(freed_map, space_map_remove, allocmap); | |
519 | ||
520 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) | |
521 | space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], | |
522 | space_map_remove, allocmap); | |
523 | ||
524 | mutex_exit(&msp->ms_lock); | |
525 | space_map_truncate(smo, mos, tx); | |
526 | mutex_enter(&msp->ms_lock); | |
527 | } | |
528 | ||
529 | space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); | |
530 | space_map_sync(freemap, SM_FREE, smo, mos, tx); | |
531 | ||
532 | mutex_exit(&msp->ms_lock); | |
533 | ||
534 | VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); | |
535 | dmu_buf_will_dirty(db, tx); | |
536 | ASSERT3U(db->db_size, >=, sizeof (*smo)); | |
537 | bcopy(smo, db->db_data, sizeof (*smo)); | |
538 | dmu_buf_rele(db, FTAG); | |
539 | ||
540 | dmu_tx_commit(tx); | |
541 | } | |
542 | ||
543 | /* | |
544 | * Called after a transaction group has completely synced to mark | |
545 | * all of the metaslab's free space as usable. | |
546 | */ | |
547 | void | |
548 | metaslab_sync_done(metaslab_t *msp, uint64_t txg) | |
549 | { | |
550 | space_map_obj_t *smo = &msp->ms_smo; | |
551 | space_map_obj_t *smosync = &msp->ms_smo_syncing; | |
552 | space_map_t *sm = &msp->ms_map; | |
553 | space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; | |
554 | metaslab_group_t *mg = msp->ms_group; | |
555 | vdev_t *vd = mg->mg_vd; | |
556 | int t; | |
557 | ||
558 | mutex_enter(&msp->ms_lock); | |
559 | ||
560 | /* | |
561 | * If this metaslab is just becoming available, initialize its | |
562 | * allocmaps and freemaps and add its capacity to the vdev. | |
563 | */ | |
564 | if (freed_map->sm_size == 0) { | |
565 | for (t = 0; t < TXG_SIZE; t++) { | |
566 | space_map_create(&msp->ms_allocmap[t], sm->sm_start, | |
567 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
568 | space_map_create(&msp->ms_freemap[t], sm->sm_start, | |
569 | sm->sm_size, sm->sm_shift, sm->sm_lock); | |
570 | } | |
571 | vdev_space_update(vd, sm->sm_size, 0, B_TRUE); | |
572 | } | |
573 | ||
574 | vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); | |
575 | ||
576 | ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); | |
577 | ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); | |
578 | ||
579 | /* | |
580 | * If there's a space_map_load() in progress, wait for it to complete | |
581 | * so that we have a consistent view of the in-core space map. | |
582 | * Then, add everything we freed in this txg to the map. | |
583 | */ | |
584 | space_map_load_wait(sm); | |
585 | space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); | |
586 | ||
587 | *smo = *smosync; | |
588 | ||
589 | /* | |
590 | * If the map is loaded but no longer active, evict it as soon as all | |
591 | * future allocations have synced. (If we unloaded it now and then | |
592 | * loaded a moment later, the map wouldn't reflect those allocations.) | |
593 | */ | |
594 | if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { | |
595 | int evictable = 1; | |
596 | ||
597 | for (t = 1; t < TXG_CONCURRENT_STATES; t++) | |
598 | if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) | |
599 | evictable = 0; | |
600 | ||
601 | if (evictable) | |
602 | space_map_unload(sm); | |
603 | } | |
604 | ||
605 | metaslab_group_sort(mg, msp, metaslab_weight(msp)); | |
606 | ||
607 | mutex_exit(&msp->ms_lock); | |
608 | } | |
609 | ||
610 | static uint64_t | |
611 | metaslab_distance(metaslab_t *msp, dva_t *dva) | |
612 | { | |
613 | uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; | |
614 | uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; | |
615 | uint64_t start = msp->ms_map.sm_start >> ms_shift; | |
616 | ||
617 | if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) | |
618 | return (1ULL << 63); | |
619 | ||
620 | if (offset < start) | |
621 | return ((start - offset) << ms_shift); | |
622 | if (offset > start) | |
623 | return ((offset - start) << ms_shift); | |
624 | return (0); | |
625 | } | |
626 | ||
627 | static uint64_t | |
628 | metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, | |
629 | uint64_t min_distance, dva_t *dva, int d) | |
630 | { | |
631 | metaslab_t *msp = NULL; | |
632 | uint64_t offset = -1ULL; | |
633 | avl_tree_t *t = &mg->mg_metaslab_tree; | |
634 | uint64_t activation_weight; | |
635 | uint64_t target_distance; | |
636 | int i; | |
637 | ||
638 | activation_weight = METASLAB_WEIGHT_PRIMARY; | |
639 | for (i = 0; i < d; i++) | |
640 | if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) | |
641 | activation_weight = METASLAB_WEIGHT_SECONDARY; | |
642 | ||
643 | for (;;) { | |
644 | mutex_enter(&mg->mg_lock); | |
645 | for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { | |
646 | if (msp->ms_weight < size) { | |
647 | mutex_exit(&mg->mg_lock); | |
648 | return (-1ULL); | |
649 | } | |
650 | ||
651 | if (activation_weight == METASLAB_WEIGHT_PRIMARY) | |
652 | break; | |
653 | ||
654 | target_distance = min_distance + | |
655 | (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); | |
656 | ||
657 | for (i = 0; i < d; i++) | |
658 | if (metaslab_distance(msp, &dva[i]) < | |
659 | target_distance) | |
660 | break; | |
661 | if (i == d) | |
662 | break; | |
663 | } | |
664 | mutex_exit(&mg->mg_lock); | |
665 | if (msp == NULL) | |
666 | return (-1ULL); | |
667 | ||
668 | mutex_enter(&msp->ms_lock); | |
669 | ||
670 | /* | |
671 | * Ensure that the metaslab we have selected is still | |
672 | * capable of handling our request. It's possible that | |
673 | * another thread may have changed the weight while we | |
674 | * were blocked on the metaslab lock. | |
675 | */ | |
676 | if (msp->ms_weight < size) { | |
677 | mutex_exit(&msp->ms_lock); | |
678 | continue; | |
679 | } | |
680 | ||
681 | if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && | |
682 | activation_weight == METASLAB_WEIGHT_PRIMARY) { | |
683 | metaslab_passivate(msp, | |
684 | msp->ms_weight & ~METASLAB_ACTIVE_MASK); | |
685 | mutex_exit(&msp->ms_lock); | |
686 | continue; | |
687 | } | |
688 | ||
689 | if (metaslab_activate(msp, activation_weight) != 0) { | |
690 | mutex_exit(&msp->ms_lock); | |
691 | continue; | |
692 | } | |
693 | ||
694 | if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) | |
695 | break; | |
696 | ||
697 | metaslab_passivate(msp, size - 1); | |
698 | ||
699 | mutex_exit(&msp->ms_lock); | |
700 | } | |
701 | ||
702 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) | |
703 | vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); | |
704 | ||
705 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | |
706 | ||
707 | mutex_exit(&msp->ms_lock); | |
708 | ||
709 | return (offset); | |
710 | } | |
711 | ||
712 | /* | |
713 | * Allocate a block for the specified i/o. | |
714 | */ | |
715 | static int | |
716 | metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, | |
b128c09f | 717 | dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) |
34dc7c2f BB |
718 | { |
719 | metaslab_group_t *mg, *rotor; | |
720 | vdev_t *vd; | |
721 | int dshift = 3; | |
722 | int all_zero; | |
fb5f0bc8 BB |
723 | int zio_lock = B_FALSE; |
724 | boolean_t allocatable; | |
34dc7c2f BB |
725 | uint64_t offset = -1ULL; |
726 | uint64_t asize; | |
727 | uint64_t distance; | |
728 | ||
729 | ASSERT(!DVA_IS_VALID(&dva[d])); | |
730 | ||
731 | /* | |
732 | * For testing, make some blocks above a certain size be gang blocks. | |
733 | */ | |
734 | if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) | |
735 | return (ENOSPC); | |
736 | ||
737 | /* | |
738 | * Start at the rotor and loop through all mgs until we find something. | |
739 | * Note that there's no locking on mc_rotor or mc_allocated because | |
740 | * nothing actually breaks if we miss a few updates -- we just won't | |
741 | * allocate quite as evenly. It all balances out over time. | |
742 | * | |
743 | * If we are doing ditto or log blocks, try to spread them across | |
744 | * consecutive vdevs. If we're forced to reuse a vdev before we've | |
745 | * allocated all of our ditto blocks, then try and spread them out on | |
746 | * that vdev as much as possible. If it turns out to not be possible, | |
747 | * gradually lower our standards until anything becomes acceptable. | |
748 | * Also, allocating on consecutive vdevs (as opposed to random vdevs) | |
749 | * gives us hope of containing our fault domains to something we're | |
750 | * able to reason about. Otherwise, any two top-level vdev failures | |
751 | * will guarantee the loss of data. With consecutive allocation, | |
752 | * only two adjacent top-level vdev failures will result in data loss. | |
753 | * | |
754 | * If we are doing gang blocks (hintdva is non-NULL), try to keep | |
755 | * ourselves on the same vdev as our gang block header. That | |
756 | * way, we can hope for locality in vdev_cache, plus it makes our | |
757 | * fault domains something tractable. | |
758 | */ | |
759 | if (hintdva) { | |
760 | vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); | |
b128c09f | 761 | if (flags & METASLAB_HINTBP_AVOID) |
34dc7c2f BB |
762 | mg = vd->vdev_mg->mg_next; |
763 | else | |
764 | mg = vd->vdev_mg; | |
765 | } else if (d != 0) { | |
766 | vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); | |
767 | mg = vd->vdev_mg->mg_next; | |
768 | } else { | |
769 | mg = mc->mc_rotor; | |
770 | } | |
771 | ||
772 | /* | |
773 | * If the hint put us into the wrong class, just follow the rotor. | |
774 | */ | |
775 | if (mg->mg_class != mc) | |
776 | mg = mc->mc_rotor; | |
777 | ||
778 | rotor = mg; | |
779 | top: | |
780 | all_zero = B_TRUE; | |
781 | do { | |
782 | vd = mg->mg_vd; | |
fb5f0bc8 | 783 | |
34dc7c2f | 784 | /* |
b128c09f | 785 | * Don't allocate from faulted devices. |
34dc7c2f | 786 | */ |
fb5f0bc8 BB |
787 | if (zio_lock) { |
788 | spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); | |
789 | allocatable = vdev_allocatable(vd); | |
790 | spa_config_exit(spa, SCL_ZIO, FTAG); | |
791 | } else { | |
792 | allocatable = vdev_allocatable(vd); | |
793 | } | |
794 | if (!allocatable) | |
34dc7c2f | 795 | goto next; |
fb5f0bc8 | 796 | |
34dc7c2f BB |
797 | /* |
798 | * Avoid writing single-copy data to a failing vdev | |
799 | */ | |
800 | if ((vd->vdev_stat.vs_write_errors > 0 || | |
801 | vd->vdev_state < VDEV_STATE_HEALTHY) && | |
802 | d == 0 && dshift == 3) { | |
803 | all_zero = B_FALSE; | |
804 | goto next; | |
805 | } | |
806 | ||
807 | ASSERT(mg->mg_class == mc); | |
808 | ||
809 | distance = vd->vdev_asize >> dshift; | |
810 | if (distance <= (1ULL << vd->vdev_ms_shift)) | |
811 | distance = 0; | |
812 | else | |
813 | all_zero = B_FALSE; | |
814 | ||
815 | asize = vdev_psize_to_asize(vd, psize); | |
816 | ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); | |
817 | ||
818 | offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); | |
819 | if (offset != -1ULL) { | |
820 | /* | |
821 | * If we've just selected this metaslab group, | |
822 | * figure out whether the corresponding vdev is | |
823 | * over- or under-used relative to the pool, | |
824 | * and set an allocation bias to even it out. | |
825 | */ | |
826 | if (mc->mc_allocated == 0) { | |
827 | vdev_stat_t *vs = &vd->vdev_stat; | |
828 | uint64_t alloc, space; | |
829 | int64_t vu, su; | |
830 | ||
831 | alloc = spa_get_alloc(spa); | |
832 | space = spa_get_space(spa); | |
833 | ||
834 | /* | |
835 | * Determine percent used in units of 0..1024. | |
836 | * (This is just to avoid floating point.) | |
837 | */ | |
838 | vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); | |
839 | su = (alloc << 10) / (space + 1); | |
840 | ||
841 | /* | |
842 | * Bias by at most +/- 25% of the aliquot. | |
843 | */ | |
844 | mg->mg_bias = ((su - vu) * | |
845 | (int64_t)mg->mg_aliquot) / (1024 * 4); | |
846 | } | |
847 | ||
848 | if (atomic_add_64_nv(&mc->mc_allocated, asize) >= | |
849 | mg->mg_aliquot + mg->mg_bias) { | |
850 | mc->mc_rotor = mg->mg_next; | |
851 | mc->mc_allocated = 0; | |
852 | } | |
853 | ||
854 | DVA_SET_VDEV(&dva[d], vd->vdev_id); | |
855 | DVA_SET_OFFSET(&dva[d], offset); | |
b128c09f | 856 | DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); |
34dc7c2f BB |
857 | DVA_SET_ASIZE(&dva[d], asize); |
858 | ||
859 | return (0); | |
860 | } | |
861 | next: | |
862 | mc->mc_rotor = mg->mg_next; | |
863 | mc->mc_allocated = 0; | |
864 | } while ((mg = mg->mg_next) != rotor); | |
865 | ||
866 | if (!all_zero) { | |
867 | dshift++; | |
868 | ASSERT(dshift < 64); | |
869 | goto top; | |
870 | } | |
871 | ||
fb5f0bc8 BB |
872 | if (!zio_lock) { |
873 | dshift = 3; | |
874 | zio_lock = B_TRUE; | |
875 | goto top; | |
876 | } | |
877 | ||
34dc7c2f BB |
878 | bzero(&dva[d], sizeof (dva_t)); |
879 | ||
880 | return (ENOSPC); | |
881 | } | |
882 | ||
883 | /* | |
884 | * Free the block represented by DVA in the context of the specified | |
885 | * transaction group. | |
886 | */ | |
887 | static void | |
888 | metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) | |
889 | { | |
890 | uint64_t vdev = DVA_GET_VDEV(dva); | |
891 | uint64_t offset = DVA_GET_OFFSET(dva); | |
892 | uint64_t size = DVA_GET_ASIZE(dva); | |
893 | vdev_t *vd; | |
894 | metaslab_t *msp; | |
895 | ||
896 | ASSERT(DVA_IS_VALID(dva)); | |
897 | ||
898 | if (txg > spa_freeze_txg(spa)) | |
899 | return; | |
900 | ||
901 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
902 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { | |
903 | cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", | |
904 | (u_longlong_t)vdev, (u_longlong_t)offset); | |
905 | ASSERT(0); | |
906 | return; | |
907 | } | |
908 | ||
909 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
910 | ||
911 | if (DVA_GET_GANG(dva)) | |
912 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
913 | ||
914 | mutex_enter(&msp->ms_lock); | |
915 | ||
916 | if (now) { | |
917 | space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], | |
918 | offset, size); | |
919 | space_map_free(&msp->ms_map, offset, size); | |
920 | } else { | |
921 | if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) | |
922 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
923 | space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); | |
34dc7c2f BB |
924 | } |
925 | ||
926 | mutex_exit(&msp->ms_lock); | |
927 | } | |
928 | ||
929 | /* | |
930 | * Intent log support: upon opening the pool after a crash, notify the SPA | |
931 | * of blocks that the intent log has allocated for immediate write, but | |
932 | * which are still considered free by the SPA because the last transaction | |
933 | * group didn't commit yet. | |
934 | */ | |
935 | static int | |
936 | metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) | |
937 | { | |
938 | uint64_t vdev = DVA_GET_VDEV(dva); | |
939 | uint64_t offset = DVA_GET_OFFSET(dva); | |
940 | uint64_t size = DVA_GET_ASIZE(dva); | |
941 | vdev_t *vd; | |
942 | metaslab_t *msp; | |
943 | int error; | |
944 | ||
945 | ASSERT(DVA_IS_VALID(dva)); | |
946 | ||
947 | if ((vd = vdev_lookup_top(spa, vdev)) == NULL || | |
948 | (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) | |
949 | return (ENXIO); | |
950 | ||
951 | msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
952 | ||
953 | if (DVA_GET_GANG(dva)) | |
954 | size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); | |
955 | ||
956 | mutex_enter(&msp->ms_lock); | |
957 | ||
958 | error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); | |
b128c09f | 959 | if (error || txg == 0) { /* txg == 0 indicates dry run */ |
34dc7c2f BB |
960 | mutex_exit(&msp->ms_lock); |
961 | return (error); | |
962 | } | |
963 | ||
34dc7c2f | 964 | space_map_claim(&msp->ms_map, offset, size); |
b128c09f | 965 | |
fb5f0bc8 | 966 | if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ |
b128c09f BB |
967 | if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) |
968 | vdev_dirty(vd, VDD_METASLAB, msp, txg); | |
969 | space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); | |
970 | } | |
34dc7c2f BB |
971 | |
972 | mutex_exit(&msp->ms_lock); | |
973 | ||
974 | return (0); | |
975 | } | |
976 | ||
977 | int | |
978 | metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, | |
b128c09f | 979 | int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) |
34dc7c2f BB |
980 | { |
981 | dva_t *dva = bp->blk_dva; | |
982 | dva_t *hintdva = hintbp->blk_dva; | |
34dc7c2f BB |
983 | int error = 0; |
984 | ||
b128c09f BB |
985 | ASSERT(bp->blk_birth == 0); |
986 | ||
987 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
988 | ||
989 | if (mc->mc_rotor == NULL) { /* no vdevs in this class */ | |
990 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
34dc7c2f | 991 | return (ENOSPC); |
b128c09f | 992 | } |
34dc7c2f BB |
993 | |
994 | ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); | |
995 | ASSERT(BP_GET_NDVAS(bp) == 0); | |
996 | ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); | |
997 | ||
b128c09f | 998 | for (int d = 0; d < ndvas; d++) { |
34dc7c2f | 999 | error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, |
b128c09f | 1000 | txg, flags); |
34dc7c2f BB |
1001 | if (error) { |
1002 | for (d--; d >= 0; d--) { | |
1003 | metaslab_free_dva(spa, &dva[d], txg, B_TRUE); | |
1004 | bzero(&dva[d], sizeof (dva_t)); | |
1005 | } | |
b128c09f | 1006 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
34dc7c2f BB |
1007 | return (error); |
1008 | } | |
1009 | } | |
1010 | ASSERT(error == 0); | |
1011 | ASSERT(BP_GET_NDVAS(bp) == ndvas); | |
1012 | ||
b128c09f BB |
1013 | spa_config_exit(spa, SCL_ALLOC, FTAG); |
1014 | ||
1015 | bp->blk_birth = txg; | |
1016 | ||
34dc7c2f BB |
1017 | return (0); |
1018 | } | |
1019 | ||
1020 | void | |
1021 | metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) | |
1022 | { | |
1023 | const dva_t *dva = bp->blk_dva; | |
1024 | int ndvas = BP_GET_NDVAS(bp); | |
34dc7c2f BB |
1025 | |
1026 | ASSERT(!BP_IS_HOLE(bp)); | |
b128c09f BB |
1027 | ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); |
1028 | ||
1029 | spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); | |
34dc7c2f | 1030 | |
b128c09f | 1031 | for (int d = 0; d < ndvas; d++) |
34dc7c2f | 1032 | metaslab_free_dva(spa, &dva[d], txg, now); |
b128c09f BB |
1033 | |
1034 | spa_config_exit(spa, SCL_FREE, FTAG); | |
34dc7c2f BB |
1035 | } |
1036 | ||
1037 | int | |
1038 | metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) | |
1039 | { | |
1040 | const dva_t *dva = bp->blk_dva; | |
1041 | int ndvas = BP_GET_NDVAS(bp); | |
b128c09f | 1042 | int error = 0; |
34dc7c2f BB |
1043 | |
1044 | ASSERT(!BP_IS_HOLE(bp)); | |
1045 | ||
b128c09f BB |
1046 | if (txg != 0) { |
1047 | /* | |
1048 | * First do a dry run to make sure all DVAs are claimable, | |
1049 | * so we don't have to unwind from partial failures below. | |
1050 | */ | |
1051 | if ((error = metaslab_claim(spa, bp, 0)) != 0) | |
1052 | return (error); | |
1053 | } | |
1054 | ||
1055 | spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); | |
1056 | ||
1057 | for (int d = 0; d < ndvas; d++) | |
34dc7c2f | 1058 | if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) |
b128c09f BB |
1059 | break; |
1060 | ||
1061 | spa_config_exit(spa, SCL_ALLOC, FTAG); | |
1062 | ||
1063 | ASSERT(error == 0 || txg == 0); | |
34dc7c2f | 1064 | |
b128c09f | 1065 | return (error); |
34dc7c2f | 1066 | } |