]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/dmu_object.c
Fix send/recv lost spill block
[mirror_zfs.git] / module / zfs / dmu_object.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
1a5b96b8 23 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
6c59307a 24 * Copyright 2014 HybridCluster. All rights reserved.
34dc7c2f
BB
25 */
26
caf9dd20 27#include <sys/dbuf.h>
34dc7c2f
BB
28#include <sys/dmu.h>
29#include <sys/dmu_objset.h>
30#include <sys/dmu_tx.h>
31#include <sys/dnode.h>
fa86b5db
MA
32#include <sys/zap.h>
33#include <sys/zfeature.h>
50c957f7 34#include <sys/dsl_dataset.h>
34dc7c2f 35
dbeb8796
MA
/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;
44
3a549dc7
MA
/*
 * Allocate a run of dn_slots free dnode slots in objset "os" and return
 * the resulting object number.  Allocations are satisfied from a per-CPU
 * cursor ("chunk" of dnode numbers) to avoid contention; the global
 * allocator, protected by os_obj_lock, hands out fresh chunks.
 *
 * If allocated_dnode is non-NULL, the held dnode is returned through it
 * and the caller must supply "tag" and eventually call dnode_rele();
 * otherwise tag must be NULL and the hold is dropped internally.
 *
 * Note: this loops until an allocation succeeds; dnode_hold_impl() I/O
 * errors are not propagated to the caller (see the XXX comment below).
 */
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	/* Number of dnode slots addressed by a single L1 indirect block. */
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	/*
	 * Select this CPU's cursor.  Preemption is disabled only while
	 * reading CPU_SEQID; the cursor itself is updated with atomics,
	 * so migrating to another CPU afterwards is harmless.
	 */
	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it.  Since the caller is responsible
	 * for releasing the hold they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;

				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				/*
				 * On the second and later passes, accept any
				 * hole (blkfill = 1, minlvl = 1) instead of
				 * only sparse L1 ranges.
				 */
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			/* Lost the race with another allocator; retry. */
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}
225
3a549dc7
MA
226uint64_t
227dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
228 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
229{
230 return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
6955b401 231 bonuslen, 0, NULL, NULL, tx);
3a549dc7
MA
232}
233
234uint64_t
235dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
236 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
237 dmu_tx_t *tx)
238{
239 return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
6955b401 240 bonustype, bonuslen, 0, NULL, NULL, tx);
3a549dc7
MA
241}
242
/*
 * Allocate an object with a caller-specified dnode size (in bytes; 0 means
 * the default).  Returns the new object number.
 */
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}
250
/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument.  The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele()
 * with the same tag that was passed here.
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}
264
34dc7c2f
BB
/*
 * Allocate a specific (caller-chosen) object number with the default dnode
 * size.  Returns 0 on success or an errno value.
 */
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}
272
/*
 * Allocate a specific (caller-chosen) object number with a caller-specified
 * dnode size (0 means the default).  The object must currently be free.
 * Returns 0 on success or an errno value (EBADF for the meta-dnode object
 * outside a private tx, or the dnode_hold_impl() error).
 */
int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	/* Only privileged transactions may touch the meta-dnode object. */
	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}
302
/*
 * Reallocate an already-allocated object with new type/blocksize/bonus
 * settings, keeping the minimum dnode size and dropping any spill block.
 * Returns 0 on success or an errno value.
 */
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}
310
/*
 * Reallocate an already-allocated object with new type/blocksize/bonus
 * settings and a caller-specified dnode size (0 means the default).  When
 * keep_spill is set the existing spill block is preserved across the
 * reallocation (used by receive to avoid losing spill blocks).  Returns 0
 * on success or an errno value.
 */
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	/* The meta-dnode object can never be reclaimed. */
	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}
337
/*
 * Remove the spill block (and its dbuf) from an allocated object, if it
 * has one.  Returns 0 on success or the dnode_hold_impl() errno.
 */
int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	/* The struct lock guards dn_phys while we inspect and drop spill. */
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}
359
/*
 * Free an allocated object and all of its data.  Returns 0 on success or
 * the dnode_hold_impl() errno.
 */
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}
384
fcff0f35
PD
/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been
 * modified after the specified txg.  Returns 0 on success, or the error
 * from dmu_object_info()/dnode_next_offset() (e.g. ESRCH when nothing
 * further is found).
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 * This per-slot walk is needed because with large dnodes a
		 * block-granular scan cannot tell which slots are free.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				/* Slot is free. */
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				/* Interior slot of a large dnode; skip it. */
				i++;
			} else if (error == 0) {
				/* Allocated dnode starting at slot i. */
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
c28b2279 449
fa86b5db
MA
/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.  Idempotent: if the
 * object is already zapified this returns without side effects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		/* Already zapified; nothing to do. */
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
487
/*
 * Free a MOS object that may have been zapified by dmu_object_zapify(),
 * decrementing the SPA_FEATURE_EXTENSIBLE_DATASET refcount if it was.
 * Only for use from syncing context.
 */
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}
506
/* Kernel-only: export the public DMU object API and the allocator tunable. */
#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif