/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

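/*
 * Allocate a new, unused object number in the given objset and return it.
 * dmu_object_alloc() uses the minimum (legacy) dnode size;
 * dmu_object_alloc_dnsize() additionally lets the caller request a larger
 * dnode size, in bytes, when the large_dnode feature is enabled (a
 * dnodesize of 0 means "use the minimum size").
 */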
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen,
	    0, tx);
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

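	/*
	 * Pick this CPU's allocation cursor.  Preemption only needs to be
	 * disabled while CPU_SEQID is sampled; if the thread migrates
	 * afterwards it simply keeps using the cursor it picked, which is
	 * still correct, merely more likely to contend with allocations
	 * running on that other CPU.
	 */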
	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

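	/*
	 * Start from this CPU's allocation cursor.  The loop below keeps
	 * trying candidate object numbers until it wins the race to hold
	 * a free dnode.
	 */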
	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize, 0,
				    bonustype, bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

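/*
 * Claim a specific, caller-chosen object number rather than picking the
 * next free one.  The object must currently be free; for example, the
 * receive code uses this to recreate objects with the same IDs they had
 * on the sending side.  The _dnsize variant allows a non-default dnode
 * size to be requested.
 */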
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

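/*
 * Reallocate (reclaim) an already-allocated object in place, giving it the
 * specified type, block size, and bonus buffer while keeping the same
 * object number.  The _dnsize variant allows a non-default dnode size to
 * be requested.
 */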
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

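/*
 * Free the given object.  The object's data is freed along with the dnode
 * itself, so the caller does not need to free the data beforehand.
 */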
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(mos, object, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

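/*
 * Free a MOS object, first decrementing the SPA_FEATURE_EXTENSIBLE_DATASET
 * refcount if the object had been zapified by dmu_object_zapify().
 *
 * Only for use from syncing context.
 */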
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif