mirror_zfs.git: module/zfs/dmu_object.c @ commit e77ebeca54f20cb27fe7238e1184b4e771fe1723
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

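/*
 * Common implementation for the dmu_object_alloc*() variants.  Take the
 * next free object number from this CPU's chunk of dnode slots, refilling
 * the chunk from the global allocator (os_obj_next_chunk) as needed, then
 * hold and allocate the dnode with the requested type, block size, bonus
 * buffer, and dnode size.  If allocated_dnode is non-NULL the dnode is
 * returned held and the caller is responsible for calling dnode_rele().
 */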
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it.  Since the caller is responsible
	 * for releasing the hold, they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (L1_dnode_count objects), move to another L1 bp
			 * that's still reasonably sparse (at most 1/4 full).
			 * Look from the beginning at most once per txg.  If
			 * we still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip to
			 * the end of the metadnode if no nearby L0 blocks
			 * are empty.  This fallback avoids a pathology where
			 * full dnode blocks containing large dnodes appear
			 * sparse because they have a low blk_fill, leading
			 * to many failed allocation attempts.  In the long
			 * term a better mechanism to search for sparse
			 * metadnode regions, such as spacemaps, could be
			 * implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

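/*
 * Allocate a new object using the default dnode size and indirect block
 * shift, and return its object number.  A minimal usage sketch (assuming
 * "os" is an open objset and "tx" an assigned transaction; the object type
 * chosen here is illustrative only):
 *
 *	uint64_t obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 *	    DMU_OT_NONE, 0, tx);
 */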
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx));
}

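/*
 * As dmu_object_alloc(), but the caller also specifies the indirect block
 * shift for the new object.
 */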
uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx));
}

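/*
 * As dmu_object_alloc(), but the caller also specifies the dnode size in
 * bytes (0 selects the default dnode size).
 */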
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument.  The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}

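/*
 * Allocate a specific, caller-chosen object number using the default
 * dnode size.  See dmu_object_claim_dnsize() below.
 */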
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

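/*
 * Allocate the specific object number chosen by the caller rather than
 * picking the next free one.  The object (and enough following dnode slots
 * for the requested dnode size) must currently be free, and the meta dnode
 * object may only be claimed when dmu_tx_private_ok(tx).
 */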
int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

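/*
 * Reallocate an existing object with new parameters, keeping its object
 * number and using the minimum dnode size.
 */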
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, tx));
}

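/*
 * Reallocate an already-allocated object in place, giving it the new type,
 * block size, and bonus buffer while preserving its object number.
 */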
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

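/*
 * Free the given object: release all of its data so no indirect blocks are
 * leaked, then free the dnode itself.
 */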
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

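/*
 * Free an object that may have been zapified by dmu_object_zapify(),
 * dropping the SPA_FEATURE_EXTENSIBLE_DATASET refcount if it was.
 */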
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
    "CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif