]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
1a5b96b8 | 23 | * Copyright (c) 2013, 2017 by Delphix. All rights reserved. |
6c59307a | 24 | * Copyright 2014 HybridCluster. All rights reserved. |
34dc7c2f BB |
25 | */ |
26 | ||
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_objset.h> | |
29 | #include <sys/dmu_tx.h> | |
30 | #include <sys/dnode.h> | |
fa86b5db MA |
31 | #include <sys/zap.h> |
32 | #include <sys/zfeature.h> | |
50c957f7 | 33 | #include <sys/dsl_dataset.h> |
34dc7c2f | 34 | |
/*
 * Number of dnode slots (as a power of two) that each concurrent,
 * per-CPU object allocator claims from the global allocator in one
 * grab.  The default of 2^7 = 128 slots (four dnode blocks' worth)
 * was measured to be the smallest chunk size at which lock contention
 * on this path is no longer observable.
 */
int dmu_object_alloc_chunk_shift = 7;
34dc7c2f BB |
44 | uint64_t |
45 | dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, | |
46 | dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
50c957f7 NB |
47 | { |
48 | return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen, | |
49 | 0, tx); | |
50 | } | |
51 | ||
52 | uint64_t | |
53 | dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, | |
54 | dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) | |
34dc7c2f | 55 | { |
34dc7c2f | 56 | uint64_t object; |
68cbd56e | 57 | uint64_t L1_dnode_count = DNODES_PER_BLOCK << |
572e2857 | 58 | (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); |
34dc7c2f | 59 | dnode_t *dn = NULL; |
50c957f7 NB |
60 | int dn_slots = dnodesize >> DNODE_SHIFT; |
61 | boolean_t restarted = B_FALSE; | |
d9ad3fea | 62 | uint64_t *cpuobj = NULL; |
dbeb8796 | 63 | int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; |
9631681b | 64 | int error; |
50c957f7 | 65 | |
d9ad3fea MJ |
66 | kpreempt_disable(); |
67 | cpuobj = &os->os_obj_next_percpu[CPU_SEQID % | |
68 | os->os_obj_next_percpu_len]; | |
69 | kpreempt_enable(); | |
70 | ||
50c957f7 NB |
71 | if (dn_slots == 0) { |
72 | dn_slots = DNODE_MIN_SLOTS; | |
73 | } else { | |
74 | ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); | |
75 | ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); | |
76 | } | |
34dc7c2f | 77 | |
dbeb8796 MA |
78 | /* |
79 | * The "chunk" of dnodes that is assigned to a CPU-specific | |
80 | * allocator needs to be at least one block's worth, to avoid | |
81 | * lock contention on the dbuf. It can be at most one L1 block's | |
82 | * worth, so that the "rescan after polishing off a L1's worth" | |
83 | * logic below will be sure to kick in. | |
84 | */ | |
85 | if (dnodes_per_chunk < DNODES_PER_BLOCK) | |
86 | dnodes_per_chunk = DNODES_PER_BLOCK; | |
87 | if (dnodes_per_chunk > L1_dnode_count) | |
88 | dnodes_per_chunk = L1_dnode_count; | |
89 | ||
90 | object = *cpuobj; | |
34dc7c2f | 91 | for (;;) { |
34dc7c2f | 92 | /* |
dbeb8796 MA |
93 | * If we finished a chunk of dnodes, get a new one from |
94 | * the global allocator. | |
34dc7c2f | 95 | */ |
4c5b89f5 OF |
96 | if ((P2PHASE(object, dnodes_per_chunk) == 0) || |
97 | (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < | |
98 | dn_slots)) { | |
99 | DNODE_STAT_BUMP(dnode_alloc_next_chunk); | |
dbeb8796 MA |
100 | mutex_enter(&os->os_obj_lock); |
101 | ASSERT0(P2PHASE(os->os_obj_next_chunk, | |
102 | dnodes_per_chunk)); | |
103 | object = os->os_obj_next_chunk; | |
104 | ||
105 | /* | |
106 | * Each time we polish off a L1 bp worth of dnodes | |
107 | * (2^12 objects), move to another L1 bp that's | |
108 | * still reasonably sparse (at most 1/4 full). Look | |
109 | * from the beginning at most once per txg. If we | |
110 | * still can't allocate from that L1 block, search | |
111 | * for an empty L0 block, which will quickly skip | |
112 | * to the end of the metadnode if no nearby L0 | |
113 | * blocks are empty. This fallback avoids a | |
114 | * pathology where full dnode blocks containing | |
115 | * large dnodes appear sparse because they have a | |
116 | * low blk_fill, leading to many failed allocation | |
117 | * attempts. In the long term a better mechanism to | |
118 | * search for sparse metadnode regions, such as | |
119 | * spacemaps, could be implemented. | |
120 | * | |
121 | * os_scan_dnodes is set during txg sync if enough | |
122 | * objects have been freed since the previous | |
123 | * rescan to justify backfilling again. | |
124 | * | |
125 | * Note that dmu_traverse depends on the behavior | |
126 | * that we use multiple blocks of the dnode object | |
127 | * before going back to reuse objects. Any change | |
128 | * to this algorithm should preserve that property | |
129 | * or find another solution to the issues described | |
130 | * in traverse_visitbp. | |
131 | */ | |
132 | if (P2PHASE(object, L1_dnode_count) == 0) { | |
133 | uint64_t offset; | |
134 | uint64_t blkfill; | |
135 | int minlvl; | |
dbeb8796 MA |
136 | if (os->os_rescan_dnodes) { |
137 | offset = 0; | |
138 | os->os_rescan_dnodes = B_FALSE; | |
139 | } else { | |
140 | offset = object << DNODE_SHIFT; | |
141 | } | |
142 | blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; | |
143 | minlvl = restarted ? 1 : 2; | |
144 | restarted = B_TRUE; | |
145 | error = dnode_next_offset(DMU_META_DNODE(os), | |
146 | DNODE_FIND_HOLE, &offset, minlvl, | |
147 | blkfill, 0); | |
148 | if (error == 0) { | |
149 | object = offset >> DNODE_SHIFT; | |
150 | } | |
68cbd56e | 151 | } |
dbeb8796 MA |
152 | /* |
153 | * Note: if "restarted", we may find a L0 that | |
154 | * is not suitably aligned. | |
155 | */ | |
156 | os->os_obj_next_chunk = | |
157 | P2ALIGN(object, dnodes_per_chunk) + | |
158 | dnodes_per_chunk; | |
159 | (void) atomic_swap_64(cpuobj, object); | |
160 | mutex_exit(&os->os_obj_lock); | |
34dc7c2f | 161 | } |
34dc7c2f | 162 | |
4c5b89f5 OF |
163 | /* |
164 | * The value of (*cpuobj) before adding dn_slots is the object | |
165 | * ID assigned to us. The value afterwards is the object ID | |
166 | * assigned to whoever wants to do an allocation next. | |
167 | */ | |
168 | object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; | |
169 | ||
34dc7c2f BB |
170 | /* |
171 | * XXX We should check for an i/o error here and return | |
172 | * up to our caller. Actually we should pre-read it in | |
173 | * dmu_tx_assign(), but there is currently no mechanism | |
174 | * to do so. | |
175 | */ | |
9631681b | 176 | error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, |
dbeb8796 | 177 | dn_slots, FTAG, &dn); |
9631681b | 178 | if (error == 0) { |
dbeb8796 | 179 | rw_enter(&dn->dn_struct_rwlock, RW_WRITER); |
50c957f7 | 180 | /* |
dbeb8796 MA |
181 | * Another thread could have allocated it; check |
182 | * again now that we have the struct lock. | |
50c957f7 | 183 | */ |
dbeb8796 MA |
184 | if (dn->dn_type == DMU_OT_NONE) { |
185 | dnode_allocate(dn, ot, blocksize, 0, | |
186 | bonustype, bonuslen, dn_slots, tx); | |
187 | rw_exit(&dn->dn_struct_rwlock); | |
188 | dmu_tx_add_new_object(tx, dn); | |
189 | dnode_rele(dn, FTAG); | |
dbeb8796 MA |
190 | return (object); |
191 | } | |
192 | rw_exit(&dn->dn_struct_rwlock); | |
193 | dnode_rele(dn, FTAG); | |
4c5b89f5 | 194 | DNODE_STAT_BUMP(dnode_alloc_race); |
dbeb8796 | 195 | } |
0eef1bde | 196 | |
4c5b89f5 OF |
197 | /* |
198 | * Skip to next known valid starting point on error. This | |
199 | * is the start of the next block of dnodes. | |
200 | */ | |
dbeb8796 | 201 | if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { |
dbeb8796 | 202 | object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); |
4c5b89f5 | 203 | DNODE_STAT_BUMP(dnode_alloc_next_block); |
dbeb8796 MA |
204 | } |
205 | (void) atomic_swap_64(cpuobj, object); | |
206 | } | |
34dc7c2f BB |
207 | } |
208 | ||
209 | int | |
210 | dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
211 | int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
50c957f7 NB |
212 | { |
213 | return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, | |
214 | bonuslen, 0, tx)); | |
215 | } | |
216 | ||
217 | int | |
218 | dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
219 | int blocksize, dmu_object_type_t bonustype, int bonuslen, | |
220 | int dnodesize, dmu_tx_t *tx) | |
34dc7c2f BB |
221 | { |
222 | dnode_t *dn; | |
50c957f7 | 223 | int dn_slots = dnodesize >> DNODE_SHIFT; |
34dc7c2f BB |
224 | int err; |
225 | ||
50c957f7 NB |
226 | if (dn_slots == 0) |
227 | dn_slots = DNODE_MIN_SLOTS; | |
228 | ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); | |
229 | ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); | |
230 | ||
34dc7c2f | 231 | if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) |
2e528b49 | 232 | return (SET_ERROR(EBADF)); |
34dc7c2f | 233 | |
50c957f7 NB |
234 | err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, |
235 | FTAG, &dn); | |
34dc7c2f BB |
236 | if (err) |
237 | return (err); | |
50c957f7 NB |
238 | |
239 | dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); | |
66eead53 | 240 | dmu_tx_add_new_object(tx, dn); |
0eef1bde | 241 | |
34dc7c2f BB |
242 | dnode_rele(dn, FTAG); |
243 | ||
34dc7c2f BB |
244 | return (0); |
245 | } | |
246 | ||
247 | int | |
248 | dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
6c59307a | 249 | int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) |
50c957f7 NB |
250 | { |
251 | return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, | |
e14a32b1 | 252 | bonuslen, DNODE_MIN_SIZE, tx)); |
50c957f7 NB |
253 | } |
254 | ||
255 | int | |
256 | dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
257 | int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, | |
258 | dmu_tx_t *tx) | |
34dc7c2f BB |
259 | { |
260 | dnode_t *dn; | |
50c957f7 | 261 | int dn_slots = dnodesize >> DNODE_SHIFT; |
34dc7c2f BB |
262 | int err; |
263 | ||
9babb374 | 264 | if (object == DMU_META_DNODE_OBJECT) |
2e528b49 | 265 | return (SET_ERROR(EBADF)); |
34dc7c2f | 266 | |
50c957f7 | 267 | err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, |
34dc7c2f BB |
268 | FTAG, &dn); |
269 | if (err) | |
270 | return (err); | |
9babb374 | 271 | |
50c957f7 | 272 | dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); |
9babb374 | 273 | |
34dc7c2f | 274 | dnode_rele(dn, FTAG); |
9babb374 | 275 | return (err); |
34dc7c2f BB |
276 | } |
277 | ||
278 | int | |
279 | dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) | |
280 | { | |
281 | dnode_t *dn; | |
282 | int err; | |
283 | ||
284 | ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); | |
285 | ||
50c957f7 | 286 | err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, |
34dc7c2f BB |
287 | FTAG, &dn); |
288 | if (err) | |
289 | return (err); | |
290 | ||
291 | ASSERT(dn->dn_type != DMU_OT_NONE); | |
b128c09f | 292 | dnode_free_range(dn, 0, DMU_OBJECT_END, tx); |
34dc7c2f BB |
293 | dnode_free(dn, tx); |
294 | dnode_rele(dn, FTAG); | |
295 | ||
296 | return (0); | |
297 | } | |
298 | ||
fcff0f35 PD |
299 | /* |
300 | * Return (in *objectp) the next object which is allocated (or a hole) | |
301 | * after *object, taking into account only objects that may have been modified | |
302 | * after the specified txg. | |
303 | */ | |
34dc7c2f BB |
304 | int |
305 | dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) | |
306 | { | |
50c957f7 | 307 | uint64_t offset; |
08f0510d | 308 | uint64_t start_obj; |
50c957f7 | 309 | struct dsl_dataset *ds = os->os_dsl_dataset; |
34dc7c2f BB |
310 | int error; |
311 | ||
08f0510d | 312 | if (*objectp == 0) { |
313 | start_obj = 1; | |
314 | } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { | |
4c5b89f5 OF |
315 | uint64_t i = *objectp + 1; |
316 | uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); | |
317 | dmu_object_info_t doi; | |
318 | ||
08f0510d | 319 | /* |
4c5b89f5 OF |
320 | * Scan through the remaining meta dnode block. The contents |
321 | * of each slot in the block are known so it can be quickly | |
322 | * checked. If the block is exhausted without a match then | |
323 | * hand off to dnode_next_offset() for further scanning. | |
08f0510d | 324 | */ |
4c5b89f5 | 325 | while (i <= last_obj) { |
08f0510d | 326 | error = dmu_object_info(os, i, &doi); |
4c5b89f5 OF |
327 | if (error == ENOENT) { |
328 | if (hole) { | |
329 | *objectp = i; | |
330 | return (0); | |
331 | } else { | |
332 | i++; | |
333 | } | |
334 | } else if (error == EEXIST) { | |
335 | i++; | |
336 | } else if (error == 0) { | |
337 | if (hole) { | |
338 | i += doi.doi_dnodesize >> DNODE_SHIFT; | |
339 | } else { | |
340 | *objectp = i; | |
341 | return (0); | |
342 | } | |
343 | } else { | |
344 | return (error); | |
345 | } | |
08f0510d | 346 | } |
347 | ||
348 | start_obj = i; | |
50c957f7 | 349 | } else { |
08f0510d | 350 | start_obj = *objectp + 1; |
50c957f7 NB |
351 | } |
352 | ||
08f0510d | 353 | offset = start_obj << DNODE_SHIFT; |
50c957f7 | 354 | |
572e2857 | 355 | error = dnode_next_offset(DMU_META_DNODE(os), |
b128c09f | 356 | (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); |
34dc7c2f BB |
357 | |
358 | *objectp = offset >> DNODE_SHIFT; | |
359 | ||
360 | return (error); | |
361 | } | |
c28b2279 | 362 | |
fa86b5db MA |
363 | /* |
364 | * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the | |
365 | * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. | |
366 | * | |
367 | * Only for use from syncing context, on MOS objects. | |
368 | */ | |
369 | void | |
370 | dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, | |
371 | dmu_tx_t *tx) | |
372 | { | |
373 | dnode_t *dn; | |
374 | ||
375 | ASSERT(dmu_tx_is_syncing(tx)); | |
376 | ||
377 | VERIFY0(dnode_hold(mos, object, FTAG, &dn)); | |
378 | if (dn->dn_type == DMU_OTN_ZAP_METADATA) { | |
379 | dnode_rele(dn, FTAG); | |
380 | return; | |
381 | } | |
382 | ASSERT3U(dn->dn_type, ==, old_type); | |
383 | ASSERT0(dn->dn_maxblkid); | |
1a5b96b8 MA |
384 | |
385 | /* | |
386 | * We must initialize the ZAP data before changing the type, | |
387 | * so that concurrent calls to *_is_zapified() can determine if | |
388 | * the object has been completely zapified by checking the type. | |
389 | */ | |
390 | mzap_create_impl(mos, object, 0, 0, tx); | |
391 | ||
fa86b5db MA |
392 | dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = |
393 | DMU_OTN_ZAP_METADATA; | |
394 | dnode_setdirty(dn, tx); | |
395 | dnode_rele(dn, FTAG); | |
396 | ||
fa86b5db MA |
397 | |
398 | spa_feature_incr(dmu_objset_spa(mos), | |
399 | SPA_FEATURE_EXTENSIBLE_DATASET, tx); | |
400 | } | |
401 | ||
402 | void | |
403 | dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) | |
404 | { | |
405 | dnode_t *dn; | |
406 | dmu_object_type_t t; | |
407 | ||
408 | ASSERT(dmu_tx_is_syncing(tx)); | |
409 | ||
410 | VERIFY0(dnode_hold(mos, object, FTAG, &dn)); | |
411 | t = dn->dn_type; | |
412 | dnode_rele(dn, FTAG); | |
413 | ||
414 | if (t == DMU_OTN_ZAP_METADATA) { | |
415 | spa_feature_decr(dmu_objset_spa(mos), | |
416 | SPA_FEATURE_EXTENSIBLE_DATASET, tx); | |
417 | } | |
418 | VERIFY0(dmu_object_free(mos, object, tx)); | |
419 | } | |
420 | ||
#if defined(_KERNEL)
/* Kernel-module exports and tunables. */
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif