/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
 * grab 128 slots, which is 4 blocks worth. This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

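/*
 * Allocate a new, unused object in the objset and return its object
 * number.  Uses the default dnode size; this is a thin wrapper around
 * dmu_object_alloc_dnsize().
 */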
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen,
	    0, tx));
}

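/*
 * Allocate a new, unused object with the requested dnode size (in bytes;
 * 0 means the minimum) and return its object number.  Object numbers are
 * handed out from a per-CPU chunk of dnode slots; when a chunk is used up,
 * the next one is taken from the global cursor under os_obj_lock.  The
 * loop below retries until an object is successfully allocated.
 */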
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf. It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects. Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us. The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller. Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize, 0,
				    bonustype, bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error. This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

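/*
 * Allocate a specific object number, which must currently be free, using
 * the default dnode size.  A thin wrapper around dmu_object_claim_dnsize().
 */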
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

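/*
 * Allocate the object number given by "object", which must not already be
 * allocated, with the requested dnode size.  Returns 0 on success or an
 * errno, e.g. EBADF when claiming the meta dnode without a private tx.
 */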
int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

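/*
 * Reallocate an existing object in place, giving it the specified type,
 * block size, and bonus buffer while keeping its object number.  Uses the
 * minimum dnode size; see dmu_object_reclaim_dnsize().
 */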
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, tx));
}

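/*
 * Same as dmu_object_reclaim(), but with an explicit dnode size.  The
 * object must already be allocated and may not be the meta dnode.
 */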
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

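/*
 * Free an allocated object: release all of its data and free its dnode so
 * the object number can be reused.  The meta dnode may only be freed from
 * a transaction marked private.
 */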
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *objectp, taking into account only objects that may have been
 * modified after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block. The contents
		 * of each slot in the block are known so it can be quickly
		 * checked. If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);
	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	mzap_create_impl(mos, object, 0, 0, tx);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

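/*
 * Free an object that may have been zapified by dmu_object_zapify(),
 * dropping the SPA_FEATURE_EXTENSIBLE_DATASET refcount if it was.
 * Only for use from syncing context, on MOS objects.
 */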
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif