/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
 * grab 128 slots, which is 4 blocks worth. This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;
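/*
 * For example (illustrative, assuming the default 512-byte dnodes and
 * 16 KiB dnode blocks, i.e. DNODES_PER_BLOCK == 32): the default shift of
 * 7 yields 2^7 = 128 slots per chunk, or 128 / 32 = 4 dnode blocks.
 */
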
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf. It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it. Since the caller is responsible
	 * for releasing the hold, they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects. Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us. The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller. Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error. This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

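/*
 * Illustrative sketch (not part of this file's logic): a typical caller
 * allocates a new object inside an assigned transaction, roughly:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		object = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS,
 *		    0, DMU_OT_NONE, 0, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 *
 * The exact hold/assign policy is up to the caller; this only shows where
 * the tx argument comes from.
 */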
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument. The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}

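/*
 * Illustrative sketch (not used in this file): a caller that intends to
 * keep operating on the new dnode can use dmu_object_alloc_hold() to avoid
 * an immediate rele/re-hold cycle, e.g. roughly:
 *
 *	dnode_t *dn;
 *	object = dmu_object_alloc_hold(os, DMU_OT_PLAIN_FILE_CONTENTS, 0, 0,
 *	    DMU_OT_NONE, 0, 0, &dn, FTAG, tx);
 *	(use dn here)
 *	dnode_rele(dn, FTAG);
 */
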
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

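/*
 * Usage note (illustrative): the claim interfaces above are for callers
 * that must create an object under a specific, caller-chosen object number
 * (for example, when recreating objects from a send stream), whereas the
 * dmu_object_alloc*() interfaces pick the next free object number.
 */
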
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block. The contents
		 * of each slot in the block are known, so they can be quickly
		 * checked. If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

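/*
 * Illustrative sketch (not part of this file): walking every allocated
 * object in an objset with dmu_object_next(). The underlying
 * dnode_next_offset() call is expected to return a nonzero error (ESRCH)
 * once the end of the meta dnode is reached, which terminates the loop:
 *
 *	uint64_t obj = 0;
 *	int err;
 *	while ((err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0) {
 *		(process allocated object "obj")
 *	}
 */
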
/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

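/*
 * Usage note (illustrative): because dmu_object_zapify() returns early when
 * the dnode is already DMU_OTN_ZAP_METADATA, syncing-context callers can
 * safely invoke it unconditionally before adding ZAP entries to a MOS
 * object that may or may not have been zapified in an earlier txg; the
 * feature refcount is only bumped on the first conversion.
 */
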
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif