/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

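/*
 * Allocate a new object using the default dnode size; thin wrapper
 * around dmu_object_alloc_dnsize().
 */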
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen,
	    0, tx);
}

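/*
 * Allocate a new object and return its object number.  Object numbers
 * are handed out from a per-CPU chunk of dnode slots; once a chunk is
 * used up, the next chunk is claimed from the global allocator under
 * os_obj_lock.  A dnodesize of 0 selects the minimum number of dnode
 * slots (DNODE_MIN_SLOTS).
 */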
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

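	/*
	 * Pick a per-CPU starting-point bucket.  Preemption is disabled
	 * only while CPU_SEQID is sampled; the bucket is a contention-
	 * avoidance hint, so it remains usable even if this thread later
	 * migrates to another CPU.
	 */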
	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize, 0,
				    bonustype, bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

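/*
 * Claim the given object number with the default dnode size; thin
 * wrapper around dmu_object_claim_dnsize().
 */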
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

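/*
 * Allocate a specific, currently unused object number.  The hold below
 * fails unless the requested object number, and enough subsequent slots
 * for the requested dnode size, are free.
 */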
int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

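/*
 * Reallocate an existing object in place with the default dnode size;
 * thin wrapper around dmu_object_reclaim_dnsize().
 */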
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

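/*
 * Reallocate an already-allocated object, giving it the requested type,
 * block size, bonus buffer, and dnode size while keeping the same
 * object number.
 */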
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

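/*
 * Free all data for the object and then free the dnode itself.  The
 * object number becomes available for future allocations.
 */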
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);
	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	mzap_create_impl(mos, object, 0, 0, tx);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

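/*
 * Free an object that may have been zapified, dropping the
 * SPA_FEATURE_EXTENSIBLE_DATASET refcount taken by dmu_object_zapify()
 * when applicable.  Only for use from syncing context, on MOS objects.
 */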
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif