/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;
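
/*
 * Worked example (illustrative note, not from the original source): with
 * the common 512-byte dnodes and 16K metadnode blocks, DNODES_PER_BLOCK is
 * 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT) = 1 << (14 - 9) = 32, so the
 * default shift of 7 hands each concurrent allocator 1 << 7 = 128 dnode
 * slots, i.e. 128 / 32 = 4 dnode blocks, per trip to the global allocator.
 */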

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, tx));
}

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, tx));
}
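
/*
 * Illustrative usage sketch (added note, not part of the original file;
 * it only uses standard DMU interfaces): a caller normally allocates a
 * new object from within an assigned transaction, e.g.:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		uint64_t obj = dmu_object_alloc(os,
 *		    DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_NONE, 0, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 *
 * Passing blocksize 0 lets the DMU choose a default block size.
 */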
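
/*
 * dmu_object_claim() and dmu_object_claim_dnsize() allocate a specific,
 * caller-chosen object number rather than picking the next free one as
 * dmu_object_alloc() does; the requested dnode slot(s) must currently be
 * free or the hold below fails.
 */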
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

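/*
 * dmu_object_reclaim() and dmu_object_reclaim_dnsize() re-allocate an
 * existing object in place: the object keeps its number but is given the
 * new type, block size and bonus buffer described by the arguments.  The
 * object must already be allocated.
 */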
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

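/*
 * Free the given object's data and its dnode, making the object number
 * available for reuse.  The free range created below releases the object's
 * blocks; the dnode itself is torn down in syncing context.
 */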
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
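
/*
 * Illustrative iteration sketch (added note, not part of the original
 * file): walking every allocated object in an objset is typically written
 * as
 *
 *	uint64_t obj;
 *	int err;
 *
 *	for (obj = 0; (err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0;
 *	    obj++) {
 *		(process object obj here)
 *	}
 *
 * where ESRCH from dnode_next_offset() means no further matching object
 * exists.
 */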

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(mos, object, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
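
/*
 * Illustrative sketch (added note, not part of the original file): zapify
 * is used from syncing context so that ZAP entries can be attached to an
 * existing MOS object, e.g.:
 *
 *	dmu_object_zapify(mos, object, DMU_OT_DSL_DATASET, tx);
 *	VERIFY0(zap_add(mos, object, "some_key", sizeof (uint64_t), 1,
 *	    &value, tx));
 *
 * "some_key" and "value" are placeholders; DMU_OT_DSL_DATASET is just one
 * example of an old_type a caller might pass.
 */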

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
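/*
 * Note (added, assuming the standard Linux module parameter mechanism):
 * with mode 0644 the tunable above can be read and adjusted at runtime,
 * e.g. via /sys/module/zfs/parameters/dmu_object_alloc_chunk_shift.
 */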
#endif