]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
1a5b96b8 | 23 | * Copyright (c) 2013, 2017 by Delphix. All rights reserved. |
6c59307a | 24 | * Copyright 2014 HybridCluster. All rights reserved. |
34dc7c2f BB |
25 | */ |
26 | ||
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_objset.h> | |
29 | #include <sys/dmu_tx.h> | |
30 | #include <sys/dnode.h> | |
fa86b5db MA |
31 | #include <sys/zap.h> |
32 | #include <sys/zfeature.h> | |
50c957f7 | 33 | #include <sys/dsl_dataset.h> |
34dc7c2f | 34 | |
dbeb8796 MA |
/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 *
 * Exposed as a kernel module parameter (see module_param at end of file).
 */
int dmu_object_alloc_chunk_shift = 7;
43 | ||
/*
 * Allocate a new object ID in objset "os" and initialize its dnode.
 *
 * Each CPU maintains its own cursor (os_obj_next_percpu) into a "chunk"
 * of dnode slots handed out by the global allocator (os_obj_next_chunk,
 * protected by os_obj_lock), which keeps concurrent allocators from
 * contending on the same dnode block.
 *
 * ot/blocksize/indirect_blockshift/bonustype/bonuslen describe the new
 * object; dnodesize is the dnode size in bytes (0 means the minimum).
 * Returns the newly allocated object number.  This loops until an
 * allocation succeeds, so it does not return failure to the caller.
 */
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	/* Number of dnodes addressed by one L1 indirect block. */
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	/*
	 * Pick this CPU's cursor.  Preemption is only disabled while
	 * choosing it; it is fine if we migrate afterwards, since all
	 * updates to the cursor are atomic.
	 */
	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;

				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				/*
				 * First pass: look for an L1 at most 1/4
				 * full (minlvl 2).  After a restart, settle
				 * for any hole (minlvl 1, blkfill 1).
				 */
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}
202 | ||
3a549dc7 MA |
203 | uint64_t |
204 | dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, | |
205 | dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
206 | { | |
207 | return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, | |
208 | bonuslen, 0, tx); | |
209 | } | |
210 | ||
211 | uint64_t | |
212 | dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, | |
213 | int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, | |
214 | dmu_tx_t *tx) | |
215 | { | |
216 | return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, | |
217 | bonustype, bonuslen, 0, tx); | |
218 | } | |
219 | ||
220 | uint64_t | |
221 | dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, | |
222 | dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) | |
223 | { | |
224 | return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, | |
225 | bonuslen, dnodesize, tx)); | |
226 | } | |
227 | ||
34dc7c2f BB |
228 | int |
229 | dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
230 | int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
50c957f7 NB |
231 | { |
232 | return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, | |
233 | bonuslen, 0, tx)); | |
234 | } | |
235 | ||
/*
 * Allocate a specific (caller-chosen) object number in objset "os",
 * initializing its dnode with the given type/blocksize/bonus parameters
 * and dnode size (in bytes; 0 means the minimum).
 *
 * Returns 0 on success, EBADF if the caller tries to claim the meta
 * dnode object without a private tx, or an errno from dnode_hold_impl()
 * (e.g. if the object is not free).
 */
int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	/* Only private (internal) transactions may touch the meta dnode. */
	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	/* The object must currently be free and have dn_slots available. */
	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}
265 | ||
266 | int | |
267 | dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
6c59307a | 268 | int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) |
50c957f7 NB |
269 | { |
270 | return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, | |
e14a32b1 | 271 | bonuslen, DNODE_MIN_SIZE, tx)); |
50c957f7 NB |
272 | } |
273 | ||
/*
 * Reallocate an already-allocated object in place, giving it new
 * type/blocksize/bonus parameters and dnode size (in bytes; 0 means the
 * minimum).
 *
 * Returns 0 on success, EBADF for the meta dnode object, or an errno
 * from dnode_hold_impl() (e.g. if the object is not allocated).
 */
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	/* The meta dnode can never be reallocated. */
	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	/* err is 0 here; returned for symmetry with the hold above. */
	return (err);
}
299 | ||
/*
 * Free the given allocated object, releasing all of its storage in tx.
 * Returns 0 on success or an errno from dnode_hold_impl() (e.g. ENOENT
 * if the object is not allocated).
 */
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}
324 | ||
fcff0f35 PD |
/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 *
 * Returns 0 and updates *objectp on success, or an errno (e.g. ESRCH from
 * dnode_next_offset() when no further match exists).
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		/* Object 0 is reserved; start scanning at object 1. */
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				/* Free slot. */
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				/* Interior slot of a multi-slot dnode. */
				i++;
			} else if (error == 0) {
				/* Allocated dnode; skip all of its slots. */
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
c28b2279 | 388 | |
fa86b5db MA |
/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.  Idempotent: if the
 * object is already zapified this is a no-op.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		/* Already zapified; nothing to do. */
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	/* Zapification assumes the object has no data blocks yet. */
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(mos, object, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
427 | ||
428 | void | |
429 | dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) | |
430 | { | |
431 | dnode_t *dn; | |
432 | dmu_object_type_t t; | |
433 | ||
434 | ASSERT(dmu_tx_is_syncing(tx)); | |
435 | ||
436 | VERIFY0(dnode_hold(mos, object, FTAG, &dn)); | |
437 | t = dn->dn_type; | |
438 | dnode_rele(dn, FTAG); | |
439 | ||
440 | if (t == DMU_OTN_ZAP_METADATA) { | |
441 | spa_feature_decr(dmu_objset_spa(mos), | |
442 | SPA_FEATURE_EXTENSIBLE_DATASET, tx); | |
443 | } | |
444 | VERIFY0(dmu_object_free(mos, object, tx)); | |
445 | } | |
446 | ||
#if defined(_KERNEL)
/* Symbols exported to other kernel modules (e.g. zpl/zvol consumers). */
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif