]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
4e820b5a | 23 | * Copyright (c) 2013, 2015 by Delphix. All rights reserved. |
ea04106b | 24 | * Copyright 2014 HybridCluster. All rights reserved. |
34dc7c2f BB |
25 | */ |
26 | ||
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_objset.h> | |
29 | #include <sys/dmu_tx.h> | |
30 | #include <sys/dnode.h> | |
ea04106b AX |
31 | #include <sys/zap.h> |
32 | #include <sys/zfeature.h> | |
cae5b340 AX |
33 | #include <sys/dsl_dataset.h> |
34 | ||
/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
/* Tunable at module load/runtime via module_param() at the end of this file. */
int dmu_object_alloc_chunk_shift = 7;
34dc7c2f BB |
43 | |
44 | uint64_t | |
45 | dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, | |
46 | dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
cae5b340 AX |
47 | { |
48 | return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen, | |
49 | 0, tx); | |
50 | } | |
51 | ||
/*
 * Allocate an unused object number in objset "os" and create its dnode in
 * transaction "tx" with the given block size, bonus type/length, and dnode
 * size in bytes ("dnodesize"; 0 selects the minimum).  Returns the new
 * object number.
 *
 * To avoid lock contention, each CPU keeps its own allocation cursor
 * (os->os_obj_next_percpu[]); when a CPU exhausts its chunk of dnode
 * slots it refills from the global cursor (os->os_obj_next_chunk, under
 * os->os_obj_lock).
 */
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	/* Number of dnode slots addressed by one L1 indirect block. */
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	/* Number of DNODE_MIN_SIZE slots the caller's dnode will occupy. */
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	/*
	 * Pick this CPU's cursor.  Preemption only needs to be disabled
	 * while indexing the per-CPU array; any cursor works correctly,
	 * the per-CPU split is purely a contention optimization.
	 */
	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		/* dnodesize of 0 means "use the smallest supported dnode". */
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				/*
				 * First pass: look for a <=1/4-full L1
				 * region (minlvl 2).  After a restart,
				 * settle for any hole (minlvl 1).
				 */
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize, 0,
				    bonustype, bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}
208 | ||
209 | int | |
210 | dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
211 | int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
cae5b340 AX |
212 | { |
213 | return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, | |
214 | bonuslen, 0, tx)); | |
215 | } | |
216 | ||
217 | int | |
218 | dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
219 | int blocksize, dmu_object_type_t bonustype, int bonuslen, | |
220 | int dnodesize, dmu_tx_t *tx) | |
34dc7c2f BB |
221 | { |
222 | dnode_t *dn; | |
cae5b340 | 223 | int dn_slots = dnodesize >> DNODE_SHIFT; |
34dc7c2f BB |
224 | int err; |
225 | ||
cae5b340 AX |
226 | if (dn_slots == 0) |
227 | dn_slots = DNODE_MIN_SLOTS; | |
228 | ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); | |
229 | ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); | |
230 | ||
34dc7c2f | 231 | if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) |
a08ee875 | 232 | return (SET_ERROR(EBADF)); |
34dc7c2f | 233 | |
cae5b340 AX |
234 | err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, |
235 | FTAG, &dn); | |
34dc7c2f BB |
236 | if (err) |
237 | return (err); | |
cae5b340 AX |
238 | |
239 | dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); | |
240 | dmu_tx_add_new_object(tx, dn); | |
241 | ||
34dc7c2f BB |
242 | dnode_rele(dn, FTAG); |
243 | ||
34dc7c2f BB |
244 | return (0); |
245 | } | |
246 | ||
247 | int | |
248 | dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
ea04106b | 249 | int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) |
cae5b340 AX |
250 | { |
251 | return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, | |
a07c8b41 | 252 | bonuslen, DNODE_MIN_SIZE, tx)); |
cae5b340 AX |
253 | } |
254 | ||
255 | int | |
256 | dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, | |
257 | int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, | |
258 | dmu_tx_t *tx) | |
34dc7c2f BB |
259 | { |
260 | dnode_t *dn; | |
cae5b340 | 261 | int dn_slots = dnodesize >> DNODE_SHIFT; |
34dc7c2f BB |
262 | int err; |
263 | ||
a07c8b41 MZ |
264 | if (dn_slots == 0) |
265 | dn_slots = DNODE_MIN_SLOTS; | |
266 | ||
9babb374 | 267 | if (object == DMU_META_DNODE_OBJECT) |
a08ee875 | 268 | return (SET_ERROR(EBADF)); |
34dc7c2f | 269 | |
cae5b340 | 270 | err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, |
34dc7c2f BB |
271 | FTAG, &dn); |
272 | if (err) | |
273 | return (err); | |
9babb374 | 274 | |
cae5b340 | 275 | dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); |
9babb374 | 276 | |
34dc7c2f | 277 | dnode_rele(dn, FTAG); |
9babb374 | 278 | return (err); |
34dc7c2f BB |
279 | } |
280 | ||
281 | int | |
282 | dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) | |
283 | { | |
284 | dnode_t *dn; | |
285 | int err; | |
286 | ||
287 | ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); | |
288 | ||
cae5b340 | 289 | err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, |
34dc7c2f BB |
290 | FTAG, &dn); |
291 | if (err) | |
292 | return (err); | |
293 | ||
294 | ASSERT(dn->dn_type != DMU_OT_NONE); | |
b128c09f | 295 | dnode_free_range(dn, 0, DMU_OBJECT_END, tx); |
34dc7c2f BB |
296 | dnode_free(dn, tx); |
297 | dnode_rele(dn, FTAG); | |
298 | ||
299 | return (0); | |
300 | } | |
301 | ||
cae5b340 AX |
/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		/* Object 0 is the meta dnode itself; begin scanning at 1. */
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		/* Last object slot within *objectp's meta dnode block. */
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				/* Slot i is free. */
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				/*
				 * Slot i is an interior slot of a large
				 * dnode; advance one slot at a time.
				 */
				i++;
			} else if (error == 0) {
				if (hole) {
					/*
					 * Slot i is allocated; skip all of
					 * the slots its dnode occupies.
					 */
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				/* Unexpected lookup/i/o error; propagate. */
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
c28b2279 | 365 | |
ea04106b AX |
/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		/* Already zapified; nothing to do. */
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	/* The object must not have any data blocks yet. */
	ASSERT0(dn->dn_maxblkid);
	/* Record the new type both in-core and for this txg's sync pass. */
	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	mzap_create_impl(mos, object, 0, 0, tx);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
397 | ||
398 | void | |
399 | dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) | |
400 | { | |
401 | dnode_t *dn; | |
402 | dmu_object_type_t t; | |
403 | ||
404 | ASSERT(dmu_tx_is_syncing(tx)); | |
405 | ||
406 | VERIFY0(dnode_hold(mos, object, FTAG, &dn)); | |
407 | t = dn->dn_type; | |
408 | dnode_rele(dn, FTAG); | |
409 | ||
410 | if (t == DMU_OTN_ZAP_METADATA) { | |
411 | spa_feature_decr(dmu_objset_spa(mos), | |
412 | SPA_FEATURE_EXTENSIBLE_DATASET, tx); | |
413 | } | |
414 | VERIFY0(dmu_object_free(mos, object, tx)); | |
415 | } | |
416 | ||
c28b2279 BB |
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Kernel build: export the public DMU object API to other modules. */
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
/* Expose the per-CPU allocator chunk size as a writable module parameter. */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif