/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;
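
/*
 * Worked example (illustrative): assuming the standard 512-byte minimum
 * dnode (DNODE_SHIFT == 9) and 16K metadnode blocks, DNODES_PER_BLOCK is
 * 32, so the default shift of 7 gives 1 << 7 == 128 dnode slots per chunk,
 * i.e. 128 / 32 == 4 metadnode blocks, matching the "4 blocks worth"
 * noted above.
 */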

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen,
	    0, tx);
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize, 0,
				    bonustype, bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}
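
/*
 * Illustrative sketch of a typical dmu_object_alloc() caller, assuming the
 * standard DMU transaction API (dmu_tx_create(), dmu_tx_hold_bonus(),
 * dmu_tx_assign(), dmu_tx_commit()); the object type is only an example
 * and "os" is an objset_t held by the caller:
 *
 *	int error;
 *	uint64_t object;
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	if ((error = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	object = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS, 0,
 *	    DMU_OT_NONE, 0, tx);
 *	dmu_tx_commit(tx);
 */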
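
/*
 * Claim a specific object number: allocate the dnode for "object" only if
 * that object number is currently free.  Unlike dmu_object_alloc(), the
 * caller chooses the object number; this is used, for example, when
 * receiving a send stream so that objects keep their original numbers.
 */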
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

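/*
 * Reallocate an existing object in place: keep its object number but give
 * it a new type, block size, and bonus buffer.  Used, for example, when a
 * received object already exists but its attributes have changed.
 */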
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

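/*
 * Free the given object: release all of its storage and then free the
 * dnode itself, making the object number available for reuse.
 */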
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
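
/*
 * Illustrative sketch: walking every allocated object in an objset "os",
 * starting from object 0 and ignoring the txg filter (txg == 0):
 *
 *	uint64_t obj = 0;
 *	int err;
 *
 *	while ((err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0) {
 *		// "obj" is now an allocated object number; process it here
 *	}
 *	// the scan typically ends with ESRCH from dnode_next_offset()
 */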

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);
	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	mzap_create_impl(mos, object, 0, 0, tx);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

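/*
 * Counterpart of dmu_object_zapify(): free a MOS object and, if it had been
 * zapified, drop the SPA_FEATURE_EXTENSIBLE_DATASET refcount that
 * dmu_object_zapify() took.  Only for use from syncing context.
 */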
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
module_param(dmu_object_alloc_chunk_shift, int, 0644);
MODULE_PARM_DESC(dmu_object_alloc_chunk_shift,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
#endif