]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/dmu_objset.c
Fix infinite scan on a pool with only special allocations
[mirror_zfs.git] / module / zfs / dmu_objset.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
9b7b9cd3 21
34dc7c2f 22/*
428870ff 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
d52d80b7 24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
3a17a7a9 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
788eb90c 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
0c66c32d 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
9c43027b 28 * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
a0bd735a 29 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
9b7b9cd3 30 * Copyright 2017 Nexenta Systems, Inc.
c0daec32 31 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
d8d418ff 32 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
34dc7c2f
BB
33 */
34
428870ff
BB
35/* Portions Copyright 2010 Robert Milkowski */
36
1de321e6 37#include <sys/zfeature.h>
34dc7c2f
BB
38#include <sys/cred.h>
39#include <sys/zfs_context.h>
40#include <sys/dmu_objset.h>
41#include <sys/dsl_dir.h>
42#include <sys/dsl_dataset.h>
43#include <sys/dsl_prop.h>
44#include <sys/dsl_pool.h>
45#include <sys/dsl_synctask.h>
46#include <sys/dsl_deleg.h>
47#include <sys/dnode.h>
48#include <sys/dbuf.h>
49#include <sys/zvol.h>
50#include <sys/dmu_tx.h>
34dc7c2f
BB
51#include <sys/zap.h>
52#include <sys/zil.h>
53#include <sys/dmu_impl.h>
54#include <sys/zfs_ioctl.h>
428870ff 55#include <sys/sa.h>
572e2857 56#include <sys/zfs_onexit.h>
13fe0198 57#include <sys/dsl_destroy.h>
9c43027b 58#include <sys/vdev.h>
a1d477c2 59#include <sys/zfeature.h>
f74b821a 60#include <sys/policy.h>
1de321e6 61#include <sys/spa_impl.h>
03916905 62#include <sys/dmu_recv.h>
9c5167d1 63#include <sys/zfs_project.h>
a7ed98d8 64#include "zfs_namecheck.h"
572e2857
BB
65
/*
 * Needed to close a window in dnode_move() that allows the objset to be freed
 * before it can be safely accessed.
 */
krwlock_t os_lock;

/*
 * Tunable to overwrite the maximum number of threads for the parallelization
 * of dmu_objset_find_dp, needed to speed up the import of pools with many
 * datasets.
 * Default is 4 times the number of leaf vdevs.
 */
int dmu_find_threads = 0;

/*
 * Backfill lower metadnode objects after this many have been freed.
 * Backfilling negatively impacts object creation rates, so only do it
 * if there are enough holes to fill.
 */
int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;

/*
 * NOTE(review): hold tag, presumably used for dataset holds taken by the
 * objset upgrade machinery (dmu_objset_upgrade below) — confirm at call sites.
 */
static char *upgrade_tag = "upgrade_tag";

/* Forward declarations for static helpers defined later in this file. */
static void dmu_objset_find_dp_cb(void *arg);

static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);
static void dmu_objset_upgrade_stop(objset_t *os);
93
572e2857
BB
/* Module init: create the global os_lock used to fence dnode_move(). */
void
dmu_objset_init(void)
{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}
99
/* Module teardown: destroy the global os_lock. */
void
dmu_objset_fini(void)
{
	rw_destroy(&os_lock);
}
34dc7c2f
BB
105
/* Return the spa (pool) this objset belongs to. */
spa_t *
dmu_objset_spa(objset_t *os)
{
	return (os->os_spa);
}
111
/* Return the ZIL (intent log) attached to this objset. */
zilog_t *
dmu_objset_zil(objset_t *os)
{
	return (os->os_zil);
}
117
118dsl_pool_t *
119dmu_objset_pool(objset_t *os)
120{
121 dsl_dataset_t *ds;
122
428870ff 123 if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
34dc7c2f
BB
124 return (ds->ds_dir->dd_pool);
125 else
428870ff 126 return (spa_get_dsl(os->os_spa));
34dc7c2f
BB
127}
128
/* Return the dsl_dataset backing this objset (NULL for the MOS). */
dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
	return (os->os_dsl_dataset);
}
134
/* Return the on-disk objset type (ZFS, ZVOL, etc.) from the phys block. */
dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
	return (os->os_phys->os_type);
}
140
/*
 * Copy the objset's dataset name into buf.
 * NOTE(review): buffer size is the caller's responsibility; presumably
 * buf must be at least ZFS_MAX_DATASET_NAME_LEN — confirm against
 * dsl_dataset_name()'s contract.
 */
void
dmu_objset_name(objset_t *os, char *buf)
{
	dsl_dataset_name(os->os_dsl_dataset, buf);
}
146
147uint64_t
148dmu_objset_id(objset_t *os)
149{
428870ff 150 dsl_dataset_t *ds = os->os_dsl_dataset;
34dc7c2f
BB
151
152 return (ds ? ds->ds_object : 0);
153}
154
50c957f7
NB
/* Return the effective "dnodesize" property value cached on this objset. */
uint64_t
dmu_objset_dnodesize(objset_t *os)
{
	return (os->os_dnodesize);
}
160
/* Return the effective "sync" property value cached on this objset. */
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
	return (os->os_sync);
}
166
/* Return the effective "logbias" property value cached on this objset. */
zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
	return (os->os_logbias);
}
172
34dc7c2f
BB
/* Property callback: cache the resolved "checksum" value on the objset. */
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}
185
/* Property callback: cache the resolved "compression" value on the objset. */
static void
compression_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(os->os_spa, newval,
	    ZIO_COMPRESS_ON);
}
199
/* Property callback: cache the resolved "copies" value on the objset. */
static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}
213
214static void
215dedup_changed_cb(void *arg, uint64_t newval)
216{
217 objset_t *os = arg;
218 spa_t *spa = os->os_spa;
219 enum zio_checksum checksum;
220
221 /*
222 * Inheritance should have been done by now.
223 */
224 ASSERT(newval != ZIO_CHECKSUM_INHERIT);
225
226 checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
227
228 os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
229 os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
34dc7c2f
BB
230}
231
b128c09f
BB
/* Property callback: cache the resolved "primarycache" value (ARC policy). */
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
}
245
/* Property callback: cache the resolved "secondarycache" (L2ARC) value. */
static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
}
259
/*
 * Property callback: cache the resolved "sync" value and propagate it to
 * the ZIL (if one has been allocated yet).
 */
static void
sync_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
	    newval == ZFS_SYNC_DISABLED);

	os->os_sync = newval;
	if (os->os_zil)
		zil_set_sync(os->os_zil, newval);
}
275
faf0f58c
MA
/* Property callback: cache the resolved "redundant_metadata" value. */
static void
redundant_metadata_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
	    newval == ZFS_REDUNDANT_METADATA_MOST);

	os->os_redundant_metadata = newval;
}
289
50c957f7
NB
290static void
291dnodesize_changed_cb(void *arg, uint64_t newval)
292{
293 objset_t *os = arg;
294
295 switch (newval) {
296 case ZFS_DNSIZE_LEGACY:
297 os->os_dnodesize = DNODE_MIN_SIZE;
298 break;
299 case ZFS_DNSIZE_AUTO:
300 /*
301 * Choose a dnode size that will work well for most
302 * workloads if the user specified "auto". Future code
303 * improvements could dynamically select a dnode size
304 * based on observed workload patterns.
305 */
306 os->os_dnodesize = DNODE_MIN_SIZE * 2;
307 break;
308 case ZFS_DNSIZE_1K:
309 case ZFS_DNSIZE_2K:
310 case ZFS_DNSIZE_4K:
311 case ZFS_DNSIZE_8K:
312 case ZFS_DNSIZE_16K:
313 os->os_dnodesize = newval;
314 break;
315 }
316}
317
cc99f275
DB
/*
 * Property callback: cache the resolved "special_small_blocks" threshold
 * (max block size routed to the special allocation class).
 */
static void
smallblk_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
	ASSERT(ISP2(newval));

	os->os_zpl_special_smallblock = newval;
}
331
428870ff
BB
/*
 * Property callback: cache the resolved "logbias" value and propagate it
 * to the ZIL (if one has been allocated yet).
 */
static void
logbias_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
	    newval == ZFS_LOGBIAS_THROUGHPUT);
	os->os_logbias = newval;
	if (os->os_zil)
		zil_set_logbias(os->os_zil, newval);
}
343
f1512ee6
MA
/* Property callback: cache the resolved "recordsize" value on the objset. */
static void
recordsize_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	os->os_recordsize = newval;
}
351
34dc7c2f
BB
/*
 * Byteswap an objset_phys_t in place.  `size` selects which on-disk layout
 * is present and therefore which trailing dnodes exist:
 *   OBJSET_PHYS_SIZE_V1        - meta dnode only
 *   OBJSET_PHYS_SIZE_V2        - adds user/group accounting dnodes
 *   sizeof (objset_phys_t)     - adds the project accounting dnode
 */
void
dmu_objset_byteswap(void *buf, size_t size)
{
	objset_phys_t *osp = buf;

	ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||
	    size == sizeof (objset_phys_t));
	dnode_byteswap(&osp->os_meta_dnode);
	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
	osp->os_type = BSWAP_64(osp->os_type);
	osp->os_flags = BSWAP_64(osp->os_flags);
	/* Only swap the accounting dnodes that this layout actually has. */
	if (size >= OBJSET_PHYS_SIZE_V2) {
		dnode_byteswap(&osp->os_userused_dnode);
		dnode_byteswap(&osp->os_groupused_dnode);
		if (size >= sizeof (objset_phys_t))
			dnode_byteswap(&osp->os_projectused_dnode);
	}
}
370
64fc7762
MA
371/*
372 * The hash is a CRC-based hash of the objset_t pointer and the object number.
373 */
374static uint64_t
375dnode_hash(const objset_t *os, uint64_t obj)
376{
377 uintptr_t osv = (uintptr_t)os;
378 uint64_t crc = -1ULL;
379
380 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
381 /*
382 * The low 6 bits of the pointer don't have much entropy, because
383 * the objset_t is larger than 2^6 bytes long.
384 */
385 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
386 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
387 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
388 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
389
390 crc ^= (osv>>14) ^ (obj>>24);
391
392 return (crc);
393}
394
/*
 * Map a dnode to one of the multilist's sublists by hashing its
 * <objset, object number> pair.
 */
unsigned int
dnode_multilist_index_func(multilist_t *ml, void *obj)
{
	dnode_t *dn = obj;
	return (dnode_hash(dn->dn_objset, dn->dn_object) %
	    multilist_get_num_sublists(ml));
}
402
a1d477c2
MA
/*
 * Instantiates the objset_t in-memory structure corresponding to the
 * objset_phys_t that's pointed to by the specified blkptr_t.
 *
 * ds may be NULL (for the MOS).  On success *osp holds the new objset and
 * 0 is returned; on failure the partially built objset is freed and an
 * error is returned (checksum errors are converted to EIO).
 */
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
	ASSERT(!BP_IS_REDACTED(bp));

	/*
	 * The $ORIGIN dataset (if it exists) doesn't have an associated
	 * objset, so there's no reason to open it. The $ORIGIN dataset
	 * will not exist on pools older than SPA_VERSION_ORIGIN.
	 */
	if (ds != NULL && spa_get_dsl(spa) != NULL &&
	    spa_get_dsl(spa)->dp_origin_snap != NULL) {
		ASSERT3P(ds->ds_dir, !=,
		    spa_get_dsl(spa)->dp_origin_snap->ds_dir);
	}

	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	if (!BP_IS_HOLE(os->os_rootbp)) {
		/* Existing objset: read the phys block through the ARC. */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		zbookmark_phys_t zb;
		int size;
		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

		if (DMU_OS_IS_L2CACHEABLE(os))
			aflags |= ARC_FLAG_L2CACHE;

		/* Encrypted datasets are read raw (no decompression). */
		if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
			ASSERT(BP_IS_AUTHENTICATED(bp));
			zio_flags |= ZIO_FLAG_RAW;
		}

		dprintf_bp(os->os_rootbp, "reading %s", "");
		err = arc_read(NULL, spa, os->os_rootbp,
		    arc_getbuf_func, &os->os_phys_buf,
		    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
		if (err != 0) {
			kmem_free(os, sizeof (objset_t));
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}

		/* Pick the expected phys size for this pool's feature set. */
		if (spa_version(spa) < SPA_VERSION_USERSPACE)
			size = OBJSET_PHYS_SIZE_V1;
		else if (!spa_feature_is_enabled(spa,
		    SPA_FEATURE_PROJECT_QUOTA))
			size = OBJSET_PHYS_SIZE_V2;
		else
			size = sizeof (objset_phys_t);

		/* Increase the blocksize if we are permitted. */
		if (arc_buf_size(os->os_phys_buf) < size) {
			arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
			    ARC_BUFC_METADATA, size);
			bzero(buf->b_data, size);
			bcopy(os->os_phys_buf->b_data, buf->b_data,
			    arc_buf_size(os->os_phys_buf));
			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
			os->os_phys_buf = buf;
		}

		os->os_phys = os->os_phys_buf->b_data;
		os->os_flags = os->os_phys->os_flags;
	} else {
		/* New (hole) objset: allocate a zeroed phys buffer. */
		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
		    sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1;
		os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
		    ARC_BUFC_METADATA, size);
		os->os_phys = os->os_phys_buf->b_data;
		bzero(os->os_phys, size);
	}
	/*
	 * These properties will be filled in by the logic in zfs_get_zplprop()
	 * when they are queried for the first time.
	 */
	os->os_version = OBJSET_PROP_UNINITIALIZED;
	os->os_normalization = OBJSET_PROP_UNINITIALIZED;
	os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
	os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;

	/*
	 * Note: the changed_cb will be called once before the register
	 * func returns, thus changing the checksum/compression from the
	 * default (fletcher2/off). Snapshots don't need to know about
	 * checksum/compression/copies.
	 */
	if (ds != NULL) {
		boolean_t needlock = B_FALSE;

		os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);

		/*
		 * Note: it's valid to open the objset if the dataset is
		 * long-held, in which case the pool_config lock will not
		 * be held.
		 */
		if (!dsl_pool_config_held(dmu_objset_pool(os))) {
			needlock = B_TRUE;
			dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		}

		/*
		 * Register property callbacks; `err` accumulates the first
		 * failure and short-circuits the remaining registrations.
		 */
		err = dsl_prop_register(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os);
		if (err == 0) {
			err = dsl_prop_register(ds,
			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
			    secondary_cache_changed_cb, os);
		}
		if (!ds->ds_is_snapshot) {
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
				    checksum_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    compression_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COPIES),
				    copies_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DEDUP),
				    dedup_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
				    logbias_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_SYNC),
				    sync_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(
				    ZFS_PROP_REDUNDANT_METADATA),
				    redundant_metadata_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    recordsize_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DNODESIZE),
				    dnodesize_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(
				    ZFS_PROP_SPECIAL_SMALL_BLOCKS),
				    smallblk_changed_cb, os);
			}
		}
		if (needlock)
			dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		if (err != 0) {
			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
			kmem_free(os, sizeof (objset_t));
			return (err);
		}
	} else {
		/* It's the meta-objset. */
		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
		os->os_compress = ZIO_COMPRESS_ON;
		os->os_encrypted = B_FALSE;
		os->os_copies = spa_max_replication(spa);
		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
		os->os_dedup_verify = B_FALSE;
		os->os_logbias = ZFS_LOGBIAS_LATENCY;
		os->os_sync = ZFS_SYNC_STANDARD;
		os->os_primary_cache = ZFS_CACHE_ALL;
		os->os_secondary_cache = ZFS_CACHE_ALL;
		os->os_dnodesize = DNODE_MIN_SIZE;
	}

	if (ds == NULL || !ds->ds_is_snapshot)
		os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	for (i = 0; i < TXG_SIZE; i++) {
		os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]),
		    dnode_multilist_index_func);
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	    offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	list_link_init(&os->os_evicting_node);

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
	os->os_obj_next_percpu_len = boot_ncpus;
	os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
	    sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);

	/* Open the special metadata dnodes present in this phys layout. */
	dnode_special_open(os, &os->os_phys->os_meta_dnode,
	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
	if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) {
		dnode_special_open(os, &os->os_phys->os_userused_dnode,
		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
		if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf))
			dnode_special_open(os,
			    &os->os_phys->os_projectused_dnode,
			    DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode);
	}

	mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL);

	*osp = os;
	return (0);
}
646
428870ff
BB
/*
 * Return (opening on first use) the objset cached on the dataset.
 * ds_opening_lock serializes concurrent openers so only one objset is
 * ever instantiated per dataset.
 */
int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
	int err = 0;

	/*
	 * We shouldn't be doing anything with dsl_dataset_t's unless the
	 * pool_config lock is held, or the dataset is long-held.
	 */
	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
	    dsl_dataset_long_held(ds));

	mutex_enter(&ds->ds_opening_lock);
	if (ds->ds_objset == NULL) {
		objset_t *os;
		/* Hold the bp read lock while the root bp is dereferenced. */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
		    ds, dsl_dataset_get_blkptr(ds), &os);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		if (err == 0) {
			mutex_enter(&ds->ds_lock);
			ASSERT(ds->ds_objset == NULL);
			ds->ds_objset = os;
			mutex_exit(&ds->ds_lock);
		}
	}
	*osp = ds->ds_objset;
	mutex_exit(&ds->ds_opening_lock);
	return (err);
}
678
13fe0198
MA
/*
 * Holds the pool while the objset is held. Therefore only one objset
 * can be held at a time.
 *
 * On success *osp refers to the objset and both a dataset and a pool
 * hold (tagged with `tag`) are outstanding; dmu_objset_rele_flags()
 * releases both.
 */
int
dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
    objset_t **osp)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;

	err = dsl_pool_hold(name, tag, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	err = dmu_objset_from_ds(ds, osp);
	if (err != 0) {
		/* Undo both holds so the caller sees a clean failure. */
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
	}

	return (err);
}
709
b5256303
TC
/* Convenience wrapper: hold an objset by name without decryption. */
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
	return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));
}
715
9c43027b
AJ
/*
 * Common validation for owning an objset: open it from the dataset,
 * check the requested type, reject writable ownership of snapshots and
 * of datasets whose encryption version is incompatible, and (when
 * decrypting) authenticate the phys buffer.
 */
static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
	int err;

	err = dmu_objset_from_ds(ds, osp);
	if (err != 0) {
		return (err);
	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
		return (SET_ERROR(EINVAL));
	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
		return (SET_ERROR(EROFS));
	} else if (!readonly && decrypt &&
	    dsl_dir_incompatible_encryption_version(ds->ds_dir)) {
		return (SET_ERROR(EROFS));
	}

	/* if we are decrypting, we can now check MACs in os->os_phys_buf */
	if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,
		    &zb, B_FALSE);
		if (err != 0)
			return (err);

		ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));
	}

	return (0);
}
750
13fe0198
MA
/*
 * dsl_pool must not be held when this is called.
 * Upon successful return, there will be a longhold on the dataset,
 * and the dsl_pool will not be held.
 */
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_own(dp, name, flags, tag, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}
	err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
	if (err != 0) {
		dsl_dataset_disown(ds, flags, tag);
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	/*
	 * User accounting requires the dataset to be decrypted and rw.
	 * We also don't begin user accounting during claiming to help
	 * speed up pool import times and to keep this txg reserved
	 * completely for recovery work.
	 */
	if ((dmu_objset_userobjspace_upgradable(*osp) ||
	    dmu_objset_projectquota_upgradable(*osp)) &&
	    !readonly && !dp->dp_spa->spa_claiming &&
	    (ds->ds_dir->dd_crypto_obj == 0 || decrypt))
		dmu_objset_id_quota_upgrade(*osp);

	dsl_pool_rele(dp, FTAG);
	return (0);
}
795
9c43027b
AJ
/*
 * Like dmu_objset_own(), but looks the dataset up by object number in an
 * already-held pool instead of by name (no pool hold is taken here).
 */
int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;

	err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
	if (err != 0)
		return (err);

	err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
	if (err != 0) {
		dsl_dataset_disown(ds, flags, tag);
		return (err);
	}

	return (0);
}
816
34dc7c2f 817void
b5256303 818dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
34dc7c2f 819{
b5256303
TC
820 ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
821
13fe0198 822 dsl_pool_t *dp = dmu_objset_pool(os);
b5256303 823 dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
13fe0198 824 dsl_pool_rele(dp, tag);
428870ff 825}
b128c09f 826
b5256303
TC
/* Convenience wrapper: release a hold taken without decryption. */
void
dmu_objset_rele(objset_t *os, void *tag)
{
	dmu_objset_rele_flags(os, B_FALSE, tag);
}
832
831baf06
KW
/*
 * When we are called, os MUST refer to an objset associated with a dataset
 * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
 * == tag. We will then release and reacquire ownership of the dataset while
 * holding the pool config_rwlock to avoid intervening namespace or ownership
 * changes may occur.
 *
 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
 * release the hold on its dataset and acquire a new one on the dataset of the
 * same name so that it can be partially torn down and reconstructed.
 */
void
dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
    boolean_t decrypt, void *tag)
{
	dsl_pool_t *dp;
	char name[ZFS_MAX_DATASET_NAME_LEN];

	VERIFY3P(ds, !=, NULL);
	VERIFY3P(ds->ds_owner, ==, tag);
	VERIFY(dsl_dataset_long_held(ds));

	dsl_dataset_name(ds, name);
	dp = ds->ds_dir->dd_pool;
	dsl_pool_config_enter(dp, FTAG);
	/*
	 * NOTE(review): `decrypt` (a boolean_t) is passed where the disown
	 * flags argument goes; this presumably relies on B_TRUE equaling
	 * DS_HOLD_FLAG_DECRYPT — confirm against dsl_dataset_disown().
	 */
	dsl_dataset_disown(ds, decrypt, tag);
	VERIFY0(dsl_dataset_own(dp, name,
	    (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds));
	dsl_pool_config_exit(dp, FTAG);
}
863
428870ff 864void
b5256303 865dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
428870ff 866{
1de321e6
JX
867 /*
868 * Stop upgrading thread
869 */
870 dmu_objset_upgrade_stop(os);
b5256303
TC
871 dsl_dataset_disown(os->os_dsl_dataset,
872 (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag);
34dc7c2f
BB
873}
874
13fe0198 875void
34dc7c2f
BB
876dmu_objset_evict_dbufs(objset_t *os)
877{
0c66c32d 878 dnode_t *dn_marker;
34dc7c2f
BB
879 dnode_t *dn;
880
0c66c32d 881 dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
34dc7c2f 882
0c66c32d
JG
883 mutex_enter(&os->os_lock);
884 dn = list_head(&os->os_dnodes);
885 while (dn != NULL) {
886 /*
887 * Skip dnodes without holds. We have to do this dance
888 * because dnode_add_ref() only works if there is already a
889 * hold. If the dnode has no holds, then it has no dbufs.
890 */
891 if (dnode_add_ref(dn, FTAG)) {
892 list_insert_after(&os->os_dnodes, dn, dn_marker);
893 mutex_exit(&os->os_lock);
34dc7c2f 894
0c66c32d
JG
895 dnode_evict_dbufs(dn);
896 dnode_rele(dn, FTAG);
34dc7c2f 897
0c66c32d
JG
898 mutex_enter(&os->os_lock);
899 dn = list_next(&os->os_dnodes, dn_marker);
900 list_remove(&os->os_dnodes, dn_marker);
901 } else {
902 dn = list_next(&os->os_dnodes, dn);
903 }
904 }
905 mutex_exit(&os->os_lock);
34dc7c2f 906
0c66c32d 907 kmem_free(dn_marker, sizeof (dnode_t));
34dc7c2f 908
0c66c32d 909 if (DMU_USERUSED_DNODE(os) != NULL) {
9c5167d1
NF
910 if (DMU_PROJECTUSED_DNODE(os) != NULL)
911 dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));
0c66c32d
JG
912 dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
913 dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
34dc7c2f 914 }
0c66c32d 915 dnode_evict_dbufs(DMU_META_DNODE(os));
34dc7c2f
BB
916}
917
0c66c32d
JG
/*
 * Objset eviction processing is split into two pieces.
 * The first marks the objset as evicting, evicts any dbufs that
 * have a refcount of zero, and then queues up the objset for the
 * second phase of eviction. Once os->os_dnodes has been cleared by
 * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
 * The second phase closes the special dnodes, dequeues the objset from
 * the list of those undergoing eviction, and finally frees the objset.
 *
 * NOTE: Due to asynchronous eviction processing (invocation of
 * dnode_buf_pageout()), it is possible for the meta dnode for the
 * objset to have no holds even though os->os_dnodes is not empty.
 */
void
dmu_objset_evict(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	/* Nothing dirty may remain in any txg at this point. */
	for (int t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds)
		dsl_prop_unregister_all(ds, os);

	if (os->os_sa)
		sa_tear_down(os);

	dmu_objset_evict_dbufs(os);

	mutex_enter(&os->os_lock);
	spa_evicting_os_register(os->os_spa, os);
	if (list_is_empty(&os->os_dnodes)) {
		/* No dnodes left: finish eviction synchronously. */
		mutex_exit(&os->os_lock);
		dmu_objset_evict_done(os);
	} else {
		/* Phase two runs later, once the dnode list drains. */
		mutex_exit(&os->os_lock);
	}
}
958
/*
 * Second phase of objset eviction (see dmu_objset_evict() comment):
 * close the special dnodes, free the ZIL and phys buffer, tear down
 * locks/lists, deregister from the spa's evicting-objset list, and
 * free the objset itself.
 */
void
dmu_objset_evict_done(objset_t *os)
{
	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	dnode_special_close(&os->os_meta_dnode);
	if (DMU_USERUSED_DNODE(os)) {
		if (DMU_PROJECTUSED_DNODE(os))
			dnode_special_close(&os->os_projectused_dnode);
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use. We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	kmem_free(os->os_obj_next_percpu,
	    os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_userused_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	mutex_destroy(&os->os_upgrade_lock);
	for (int i = 0; i < TXG_SIZE; i++) {
		multilist_destroy(os->os_dirty_dnodes[i]);
	}
	spa_evicting_os_deregister(os->os_spa, os);
	kmem_free(os, sizeof (objset_t));
}
9babb374 998
6413c95f 999inode_timespec_t
428870ff
BB
1000dmu_objset_snap_cmtime(objset_t *os)
1001{
1002 return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
34dc7c2f
BB
1003}
1004
/*
 * Create a new objset in syncing context.  ds == NULL indicates the MOS.
 * levels/blksz/ibs of 0 select the default meta-dnode geometry.  Allocates
 * the meta-dnode, fixes its nlevels up front, records the objset type, and
 * (when enabled) activates user/object/project accounting.  Returns the
 * newly created objset.
 */
objset_t *
dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
{
	objset_t *os;
	dnode_t *mdn;

	ASSERT(dmu_tx_is_syncing(tx));

	/* Zero means "use the default" for the meta-dnode geometry. */
	if (blksz == 0)
		blksz = DNODE_BLOCK_SIZE;
	if (ibs == 0)
		ibs = DN_MAX_INDBLKSHIFT;

	/* The MOS (ds == NULL) has no dataset to obtain the objset from. */
	if (ds != NULL)
		VERIFY0(dmu_objset_from_ds(ds, &os));
	else
		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));

	mdn = DMU_META_DNODE(os);

	dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
	    DNODE_MIN_SLOTS, tx);

	/*
	 * We don't want to have to increase the meta-dnode's nlevels
	 * later, because then we could do it in quiescing context while
	 * we are also accessing it in open context.
	 *
	 * This precaution is not necessary for the MOS (ds == NULL),
	 * because the MOS is only updated in syncing context.
	 * This is most fortunate: the MOS is the only objset that
	 * needs to be synced multiple times as spa_sync() iterates
	 * to convergence, so minimizing its dn_nlevels matters.
	 */
	if (ds != NULL) {
		if (levels == 0) {
			levels = 1;

			/*
			 * Determine the number of levels necessary for the
			 * meta-dnode to contain DN_MAX_OBJECT dnodes. Note
			 * that in order to ensure that we do not overflow
			 * 64 bits, there has to be a nlevels that gives us a
			 * number of blocks > DN_MAX_OBJECT but < 2^64.
			 * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
			 * (10) must be less than (64 - log2(DN_MAX_OBJECT))
			 * (16).
			 */
			while ((uint64_t)mdn->dn_nblkptr <<
			    (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
			    (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
			    DN_MAX_OBJECT)
				levels++;
		}

		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
		    mdn->dn_nlevels = levels;
	}

	ASSERT(type != DMU_OST_NONE);
	ASSERT(type != DMU_OST_ANY);
	ASSERT(type < DMU_OST_NUMTYPES);
	os->os_phys->os_type = type;

	/*
	 * Enable user accounting if it is enabled and this is not an
	 * encrypted receive.
	 */
	if (dmu_objset_userused_enabled(os) &&
	    (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		if (dmu_objset_userobjused_enabled(os)) {
			/* Queue feature activation for this dataset's sync. */
			ds->ds_feature_activation[
			    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
			os->os_phys->os_flags |=
			    OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
		}
		if (dmu_objset_projectquota_enabled(os)) {
			ds->ds_feature_activation[
			    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
			os->os_phys->os_flags |=
			    OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
		}
		os->os_flags = os->os_phys->os_flags;
	}

	/*
	 * NOTE(review): called unconditionally even though ds may be NULL
	 * for the MOS path; presumably dsl_dataset_dirty() tolerates a NULL
	 * dataset -- confirm against dsl_dataset.c.
	 */
	dsl_dataset_dirty(ds, tx);

	return (os);
}
1096
b5256303
TC
1097/* called from dsl for meta-objset */
1098objset_t *
1099dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
1100 dmu_objset_type_t type, dmu_tx_t *tx)
1101{
1102 return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx));
1103}
1104
13fe0198
MA
/* Argument bundle for the dmu_objset_create() check/sync task pair. */
typedef struct dmu_objset_create_arg {
	const char *doca_name;		/* full name of the dataset to create */
	cred_t *doca_cred;		/* credentials of the caller */
	/* optional callback to populate the new objset, run in syncing ctx */
	void (*doca_userfunc)(objset_t *os, void *arg,
	    cred_t *cr, dmu_tx_t *tx);
	void *doca_userarg;		/* opaque argument for doca_userfunc */
	dmu_objset_type_t doca_type;	/* DMU_OST_* type of the new objset */
	uint64_t doca_flags;		/* flags for dsl_dataset_create_sync() */
	dsl_crypto_params_t *doca_dcp;	/* encryption parameters (never NULL) */
} dmu_objset_create_arg_t;
34dc7c2f
BB
1115
/*
 * Check phase of the dataset-creation sync task: validate the name,
 * crypto parameters, and filesystem limits, and verify the parent is a
 * ZFS filesystem (datasets cannot be created below zvols or snapshots).
 */
/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg, dmu_tx_t *tx)
{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	dsl_dataset_t *parentds;
	objset_t *parentos;
	const char *tail;
	int error;

	/* Snapshot names are not valid here. */
	if (strchr(doca->doca_name, '@') != NULL)
		return (SET_ERROR(EINVAL));

	if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	if (dataset_nestcheck(doca->doca_name) != 0)
		return (SET_ERROR(ENAMETOOLONG));

	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
	if (error != 0)
		return (error);
	if (tail == NULL) {
		/* The name fully resolved: the dataset already exists. */
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EEXIST));
	}

	error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}

	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	    doca->doca_cred);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}

	/* can't create below anything but filesystems (eg. no ZVOLs) */
	error = dsl_dataset_hold_obj(pdd->dd_pool,
	    dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}
	error = dmu_objset_from_ds(parentds, &parentos);
	if (error != 0) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}
	if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
	}
	dsl_dataset_rele(parentds, FTAG);
	dsl_dir_rele(pdd, FTAG);

	/* error is 0 here. */
	return (error);
}
1181
/*
 * Sync phase of dataset creation: create the dataset and its objset, run
 * the optional caller-supplied setup function, and -- for encrypted
 * datasets -- force the new data through dsl_dataset_sync() immediately so
 * it is encrypted before the key mapping is dropped.
 */
static void
dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	spa_t *spa = dp->dp_spa;
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *ds;
	uint64_t obj;
	blkptr_t *bp;
	objset_t *os;
	zio_t *rzio;

	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));

	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
	    doca->doca_cred, doca->doca_dcp, tx);

	/* DECRYPT hold keeps the key mapping alive while we populate os. */
	VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	bp = dsl_dataset_get_blkptr(ds);
	os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);

	if (doca->doca_userfunc != NULL) {
		doca->doca_userfunc(os, doca->doca_userarg,
		    doca->doca_cred, tx);
	}

	/*
	 * The doca_userfunc() may write out some data that needs to be
	 * encrypted if the dataset is encrypted (specifically the root
	 * directory). This data must be written out before the encryption
	 * key mapping is removed by dsl_dataset_rele_flags(). Force the
	 * I/O to occur immediately by invoking the relevant sections of
	 * dsl_pool_sync().
	 */
	if (os->os_encrypted) {
		dsl_dataset_t *tmpds = NULL;
		boolean_t need_sync_done = B_FALSE;

		/* Fake an owner so dsl_dataset_sync()'s assertions pass. */
		mutex_enter(&ds->ds_lock);
		ds->ds_owner = FTAG;
		mutex_exit(&ds->ds_lock);

		/* First pass: write the dnodes dirtied by doca_userfunc(). */
		rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
		    tx->tx_txg);
		if (tmpds != NULL) {
			dsl_dataset_sync(ds, rzio, tx);
			need_sync_done = B_TRUE;
		}
		VERIFY0(zio_wait(rzio));

		dmu_objset_do_userquota_updates(os, tx);
		taskq_wait(dp->dp_sync_taskq);
		if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
			ASSERT3P(ds->ds_key_mapping, !=, NULL);
			key_mapping_rele(spa, ds->ds_key_mapping, ds);
		}

		/* Second pass: the quota updates may have re-dirtied ds. */
		rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
		    tx->tx_txg);
		if (tmpds != NULL) {
			dmu_buf_rele(ds->ds_dbuf, ds);
			dsl_dataset_sync(ds, rzio, tx);
		}
		VERIFY0(zio_wait(rzio));

		if (need_sync_done) {
			ASSERT3P(ds->ds_key_mapping, !=, NULL);
			key_mapping_rele(spa, ds->ds_key_mapping, ds);
			dsl_dataset_sync_done(ds, tx);
		}

		mutex_enter(&ds->ds_lock);
		ds->ds_owner = NULL;
		mutex_exit(&ds->ds_lock);
	}

	spa_history_log_internal_ds(ds, "create", tx, " ");

	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
	dsl_dir_rele(pdd, FTAG);
}
1270
1271int
428870ff 1272dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
b5256303 1273 dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
34dc7c2f 1274{
13fe0198 1275 dmu_objset_create_arg_t doca;
b5256303 1276 dsl_crypto_params_t tmp_dcp = { 0 };
34dc7c2f 1277
13fe0198
MA
1278 doca.doca_name = name;
1279 doca.doca_cred = CRED();
1280 doca.doca_flags = flags;
1281 doca.doca_userfunc = func;
1282 doca.doca_userarg = arg;
1283 doca.doca_type = type;
34dc7c2f 1284
b5256303
TC
1285 /*
1286 * Some callers (mostly for testing) do not provide a dcp on their
1287 * own but various code inside the sync task will require it to be
1288 * allocated. Rather than adding NULL checks throughout this code
1289 * or adding dummy dcp's to all of the callers we simply create a
1290 * dummy one here and use that. This zero dcp will have the same
85ce3f4f 1291 * effect as asking for inheritance of all encryption params.
b5256303
TC
1292 */
1293 doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;
1294
ec213971 1295 int rv = dsl_sync_task(name,
3d45fdd6 1296 dmu_objset_create_check, dmu_objset_create_sync, &doca,
ec213971
MA
1297 6, ZFS_SPACE_CHECK_NORMAL);
1298
1299 if (rv == 0)
1300 zvol_create_minor(name);
1301 return (rv);
34dc7c2f
BB
1302}
1303
13fe0198
MA
/* Argument bundle for the dmu_objset_clone() check/sync task pair. */
typedef struct dmu_objset_clone_arg {
	const char *doca_clone;		/* full name of the clone to create */
	const char *doca_origin;	/* name of the origin snapshot */
	cred_t *doca_cred;		/* credentials of the caller */
} dmu_objset_clone_arg_t;
1309
/*
 * Check phase of the clone sync task: validate the clone name, the
 * filesystem limit, and that the origin exists and is a snapshot.
 */
/*ARGSUSED*/
static int
dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_dir_t *pdd;
	const char *tail;
	int error;
	dsl_dataset_t *origin;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	/* The clone itself must not be a snapshot name. */
	if (strchr(doca->doca_clone, '@') != NULL)
		return (SET_ERROR(EINVAL));

	if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
	if (error != 0)
		return (error);
	if (tail == NULL) {
		/* The name fully resolved: the clone already exists. */
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EEXIST));
	}

	/*
	 * NOTE(review): the limit-check error is collapsed to EDQUOT here,
	 * whereas dmu_objset_create_check() propagates it verbatim --
	 * confirm this asymmetry is intentional before changing either.
	 */
	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	    doca->doca_cred);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}

	/* You can only clone snapshots, not the head datasets. */
	if (!origin->ds_is_snapshot) {
		dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EINVAL));
	}

	dsl_dataset_rele(origin, FTAG);
	dsl_dir_rele(pdd, FTAG);

	return (0);
}
34dc7c2f 1360
13fe0198
MA
/*
 * Sync phase of the clone sync task: create the clone dataset from the
 * origin snapshot and log the operation to the pool history.
 */
static void
dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *origin, *ds;
	uint64_t obj;
	char namebuf[ZFS_MAX_DATASET_NAME_LEN];

	/* The check phase already validated these; failure is fatal here. */
	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));

	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
	    doca->doca_cred, NULL, tx);

	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
	dsl_dataset_name(origin, namebuf);
	spa_history_log_internal_ds(ds, "clone", tx,
	    "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object);
	dsl_dataset_rele(ds, FTAG);
	dsl_dataset_rele(origin, FTAG);
	dsl_dir_rele(pdd, FTAG);
}
1386
1387int
13fe0198 1388dmu_objset_clone(const char *clone, const char *origin)
34dc7c2f 1389{
13fe0198 1390 dmu_objset_clone_arg_t doca;
34dc7c2f 1391
13fe0198
MA
1392 doca.doca_clone = clone;
1393 doca.doca_origin = origin;
1394 doca.doca_cred = CRED();
572e2857 1395
ec213971 1396 int rv = dsl_sync_task(clone,
3d45fdd6 1397 dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
ec213971
MA
1398 6, ZFS_SPACE_CHECK_NORMAL);
1399
1400 if (rv == 0)
1401 zvol_create_minor(clone);
1402
1403 return (rv);
6f1ffb06
MA
1404}
1405
1406int
1407dmu_objset_snapshot_one(const char *fsname, const char *snapname)
1408{
1409 int err;
1410 char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
1411 nvlist_t *snaps = fnvlist_alloc();
1412
1413 fnvlist_add_boolean(snaps, longsnap);
e4f5fa12 1414 kmem_strfree(longsnap);
13fe0198
MA
1415 err = dsl_dataset_snapshot(snaps, NULL, NULL);
1416 fnvlist_free(snaps);
6f1ffb06
MA
1417 return (err);
1418}
1419
1de321e6
JX
/*
 * Taskq callback that runs a queued objset upgrade (os_upgrade_cb).
 * Drops the long hold taken by dmu_objset_upgrade() when finished.
 */
static void
dmu_objset_upgrade_task_cb(void *data)
{
	objset_t *os = data;

	mutex_enter(&os->os_upgrade_lock);
	/*
	 * Report EINTR unless the callback below runs and overwrites the
	 * status (i.e. if os_upgrade_exit was already requested).
	 */
	os->os_upgrade_status = EINTR;
	if (!os->os_upgrade_exit) {
		/* Run the (potentially long) upgrade without the lock. */
		mutex_exit(&os->os_upgrade_lock);

		os->os_upgrade_status = os->os_upgrade_cb(os);
		mutex_enter(&os->os_upgrade_lock);
	}
	os->os_upgrade_exit = B_TRUE;
	os->os_upgrade_id = 0;
	mutex_exit(&os->os_upgrade_lock);
	/* Release the hold taken in dmu_objset_upgrade(). */
	dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}
1438
/*
 * Dispatch an objset-upgrade callback to the spa's upgrade taskq.  Takes a
 * long hold on the dataset to keep it from being disowned; the hold is
 * released by the task itself or, on dispatch failure, here.
 */
static void
dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
{
	/* Unlocked fast path: an upgrade is already queued or running. */
	if (os->os_upgrade_id != 0)
		return;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
	dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);

	mutex_enter(&os->os_upgrade_lock);
	/* Re-check under the lock; also skip if a prior upgrade errored. */
	if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
		os->os_upgrade_exit = B_FALSE;
		os->os_upgrade_cb = cb;
		os->os_upgrade_id = taskq_dispatch(
		    os->os_spa->spa_upgrade_taskq,
		    dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
		if (os->os_upgrade_id == TASKQID_INVALID) {
			/* Dispatch failed: the task won't drop the hold. */
			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
			os->os_upgrade_status = ENOMEM;
		}
	}
	/*
	 * NOTE(review): when the guarded dispatch above is skipped (raced
	 * os_upgrade_id, or nonzero os_upgrade_status), the long hold taken
	 * before the lock does not appear to be released here -- verify
	 * whether this leak is possible in practice.
	 */
	mutex_exit(&os->os_upgrade_lock);
}
1462
/*
 * Stop any queued or in-flight objset upgrade: ask the task to exit,
 * cancel it if it has not started, and wait for the pool to sync.
 */
static void
dmu_objset_upgrade_stop(objset_t *os)
{
	mutex_enter(&os->os_upgrade_lock);
	os->os_upgrade_exit = B_TRUE;
	if (os->os_upgrade_id != 0) {
		taskqid_t id = os->os_upgrade_id;

		os->os_upgrade_id = 0;
		mutex_exit(&os->os_upgrade_lock);

		if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
			/*
			 * Cancellation succeeded, so the task will never run
			 * and never drop the long hold -- drop it here.
			 */
			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
		}
		txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
	} else {
		mutex_exit(&os->os_upgrade_lock);
	}
}
1482
/*
 * Sync every dirty dnode on the given sublist for this txg.  When user
 * accounting is enabled, each dnode is moved (with an added hold) onto
 * os_synced_dnodes for later processing by userquota_updates_task().
 */
static void
dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
{
	dnode_t *dn;

	while ((dn = multilist_sublist_head(list)) != NULL) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		ASSERT(dn->dn_dbuf->db_data_pending);
		/*
		 * Initialize dn_zio outside dnode_sync() because the
		 * meta-dnode needs to set it outside dnode_sync().
		 */
		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
		ASSERT(dn->dn_zio);

		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
		multilist_sublist_remove(list, dn);

		/*
		 * If we are not doing useraccounting (os_synced_dnodes == NULL)
		 * we are done with this dnode for this txg. Unset dn_dirty_txg
		 * if later txgs aren't dirtying it so that future holders do
		 * not get a stale value. Otherwise, we will do this in
		 * userquota_updates_task() when processing has completely
		 * finished for this txg.
		 */
		multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
		if (newlist != NULL) {
			(void) dnode_add_ref(dn, newlist);
			multilist_insert(newlist, dn);
		} else {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_dirty_txg == tx->tx_txg)
				dn->dn_dirty_txg = 0;
			mutex_exit(&dn->dn_mtx);
		}

		dnode_sync(dn, tx);
	}
}
1523
/*
 * arc_write() ready callback for the objset root block: compute the
 * block's fill count from the meta-dnode and publish the new block
 * pointer as os_rootbp (under ds_bp_rwlock when there is a dataset).
 */
/* ARGSUSED */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	objset_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
	uint64_t fill = 0;

	ASSERT(!BP_IS_EMBEDDED(bp));
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
	ASSERT0(BP_GET_LEVEL(bp));

	/*
	 * Update rootbp fill count: it should be the number of objects
	 * allocated in the object set (not counting the "special"
	 * objects that are stored in the objset_phys_t -- the meta
	 * dnode and user/group/project accounting objects).
	 */
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		fill += BP_GET_FILL(&dnp->dn_blkptr[i]);

	BP_SET_FILL(bp, fill);

	/* The MOS has no dataset and therefore no ds_bp_rwlock. */
	if (os->os_dsl_dataset != NULL)
		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
	*os->os_rootbp = *bp;
	if (os->os_dsl_dataset != NULL)
		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
}
1554
/*
 * arc_write() done callback for the objset root block: account the old
 * block as freed and the new one as born (unless this was a rewrite of
 * the same block), then free the blkptr copy allocated by
 * dmu_objset_sync().
 */
/* ARGSUSED */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	objset_t *os = arg;

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}
	/* bp is the blkptr_copy kmem_alloc'd in dmu_objset_sync(). */
	kmem_free(bp, sizeof (*bp));
}
1574
64fc7762
MA
/* Per-sublist work item dispatched by dmu_objset_sync(). */
typedef struct sync_dnodes_arg {
	multilist_t *sda_list;		/* dirty-dnode list for this txg */
	int sda_sublist_idx;		/* which sublist to process */
	multilist_t *sda_newlist;	/* not read by sync_dnodes_task() */
	dmu_tx_t *sda_tx;		/* the syncing transaction */
} sync_dnodes_arg_t;

/*
 * Taskq callback: sync all dirty dnodes on one sublist, then free the
 * argument structure (allocated by the dispatcher).
 */
static void
sync_dnodes_task(void *arg)
{
	sync_dnodes_arg_t *sda = arg;

	multilist_sublist_t *ms =
	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);

	dmu_objset_sync_dnodes(ms, sda->sda_tx);

	multilist_sublist_unlock(ms);

	kmem_free(sda, sizeof (*sda));
}
1596
1597
34dc7c2f
BB
/*
 * Called from dsl: write out everything dirty in this objset for this txg.
 * Issues the root-block arc_write(), syncs the special dnodes, fans the
 * dirty-dnode sublists out to the sync taskq, then issues the meta-dnode's
 * dirty records and the ZIL sync.
 */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t *zio;
	list_t *list;
	dbuf_dirty_record_t *dr;
	int num_sublists;
	multilist_t *ml;
	/* Copy of the root bp; freed by dmu_objset_write_done(). */
	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
	*blkptr_copy = *os->os_rootbp;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	arc_release(os->os_phys_buf, &os->os_phys_buf);

	dmu_write_policy(os, NULL, 0, 0, &zp);

	/*
	 * If we are either claiming the ZIL or doing a raw receive, write
	 * out the os_phys_buf raw. Neither of these actions will effect the
	 * MAC at this point.
	 */
	if (os->os_raw_receive ||
	    os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
		ASSERT(os->os_encrypted);
		arc_convert_to_raw(os->os_phys_buf,
		    os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
		    DMU_OT_OBJSET, NULL, NULL, NULL);
	}

	zio = arc_write(pio, os->os_spa, tx->tx_txg,
	    blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
	    &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	DMU_META_DNODE(os)->dn_zio = zio;
	dnode_sync(DMU_META_DNODE(os), tx);

	os->os_phys->os_flags = os->os_flags;

	if (DMU_USERUSED_DNODE(os) &&
	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_USERUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_USERUSED_DNODE(os), tx);
		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
	}

	if (DMU_PROJECTUSED_DNODE(os) &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	if (dmu_objset_userused_enabled(os) &&
	    (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
		/*
		 * We must create the list here because it uses the
		 * dn_dirty_link[] of this txg.  But it may already
		 * exist because we call dsl_dataset_sync() twice per txg.
		 */
		if (os->os_synced_dnodes == NULL) {
			os->os_synced_dnodes =
			    multilist_create(sizeof (dnode_t),
			    offsetof(dnode_t, dn_dirty_link[txgoff]),
			    dnode_multilist_index_func);
		} else {
			ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
			    offsetof(dnode_t, dn_dirty_link[txgoff]));
		}
	}

	/* Sync dirty regular dnodes in parallel, one task per sublist. */
	ml = os->os_dirty_dnodes[txgoff];
	num_sublists = multilist_get_num_sublists(ml);
	for (int i = 0; i < num_sublists; i++) {
		if (multilist_sublist_is_empty_idx(ml, i))
			continue;
		sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
		sda->sda_list = ml;
		sda->sda_sublist_idx = i;
		sda->sda_tx = tx;
		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
		    sync_dnodes_task, sda, 0);
		/* callback frees sda */
	}
	taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);

	/* Issue the meta-dnode's level-0 dirty records. */
	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
	while ((dr = list_head(list)) != NULL) {
		ASSERT0(dr->dr_dbuf->db_level);
		list_remove(list, dr);
		zio_nowait(dr->dr_zio);
	}

	/* Enable dnode backfill if enough objects have been freed. */
	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
		os->os_rescan_dnodes = B_TRUE;
		os->os_freed_dnodes = 0;
	}

	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(zio);
}
1733
428870ff
BB
1734boolean_t
1735dmu_objset_is_dirty(objset_t *os, uint64_t txg)
1736{
64fc7762 1737 return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
428870ff
BB
1738}
1739
/* Per-objset-type callbacks used for user/group/project space accounting. */
static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];

/* Register the space-accounting callback for the given objset type. */
void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
{
	used_cbs[ost] = cb;
}
1747
1748boolean_t
428870ff 1749dmu_objset_userused_enabled(objset_t *os)
9babb374
BB
1750{
1751 return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
572e2857
BB
1752 used_cbs[os->os_phys->os_type] != NULL &&
1753 DMU_USERUSED_DNODE(os) != NULL);
9babb374
BB
1754}
1755
1de321e6
JX
1756boolean_t
1757dmu_objset_userobjused_enabled(objset_t *os)
1758{
1759 return (dmu_objset_userused_enabled(os) &&
1760 spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
1761}
1762
9c5167d1
NF
1763boolean_t
1764dmu_objset_projectquota_enabled(objset_t *os)
1765{
1766 return (used_cbs[os->os_phys->os_type] != NULL &&
1767 DMU_PROJECTUSED_DNODE(os) != NULL &&
1768 spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA));
1769}
1770
9b7a83cb
JX
/* A single accumulated delta for one user/group/project id. */
typedef struct userquota_node {
	/* must be the first field, see userquota_update_cache() */
	char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
	int64_t uqn_delta;	/* accumulated byte or object-count delta */
	avl_node_t uqn_node;	/* linkage in a userquota_cache_t AVL tree */
} userquota_node_t;

/* Per-task cache of deltas, flushed by do_userquota_cacheflush(). */
typedef struct userquota_cache {
	avl_tree_t uqc_user_deltas;
	avl_tree_t uqc_group_deltas;
	avl_tree_t uqc_project_deltas;
} userquota_cache_t;
1783
1784static int
1785userquota_compare(const void *l, const void *r)
1786{
1787 const userquota_node_t *luqn = l;
1788 const userquota_node_t *ruqn = r;
e4ffa98d 1789 int rv;
9b7a83cb
JX
1790
1791 /*
1792 * NB: can only access uqn_id because userquota_update_cache() doesn't
1793 * pass in an entire userquota_node_t.
1794 */
e4ffa98d
BB
1795 rv = strcmp(luqn->uqn_id, ruqn->uqn_id);
1796
ca577779 1797 return (TREE_ISIGN(rv));
9b7a83cb
JX
1798}
1799
/*
 * Flush all accumulated user/group/project deltas from the cache into the
 * corresponding on-disk ZAP accounting objects, freeing the cache nodes
 * and destroying the AVL trees as we go.
 */
static void
do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
{
	void *cookie;
	userquota_node_t *uqn;

	ASSERT(dmu_tx_is_syncing(tx));

	cookie = NULL;
	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
	    &cookie)) != NULL) {
		/*
		 * os_userused_lock protects against concurrent calls to
		 * zap_increment_int(). It's needed because zap_increment_int()
		 * is not thread-safe (i.e. not atomic).
		 */
		mutex_enter(&os->os_userused_lock);
		VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
		    uqn->uqn_id, uqn->uqn_delta, tx));
		mutex_exit(&os->os_userused_lock);
		kmem_free(uqn, sizeof (*uqn));
	}
	avl_destroy(&cache->uqc_user_deltas);

	cookie = NULL;
	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
	    &cookie)) != NULL) {
		mutex_enter(&os->os_userused_lock);
		VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
		    uqn->uqn_id, uqn->uqn_delta, tx));
		mutex_exit(&os->os_userused_lock);
		kmem_free(uqn, sizeof (*uqn));
	}
	avl_destroy(&cache->uqc_group_deltas);

	/* The project tree is only created when project quota is enabled. */
	if (dmu_objset_projectquota_enabled(os)) {
		cookie = NULL;
		while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
		    &cookie)) != NULL) {
			mutex_enter(&os->os_userused_lock);
			VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
			    uqn->uqn_id, uqn->uqn_delta, tx));
			mutex_exit(&os->os_userused_lock);
			kmem_free(uqn, sizeof (*uqn));
		}
		avl_destroy(&cache->uqc_project_deltas);
	}
}
1848
/*
 * Accumulate delta into the cached entry for id in the given AVL tree,
 * allocating the entry on first use.
 */
static void
userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
{
	userquota_node_t *uqn;
	avl_index_t idx;

	ASSERT(strlen(id) < sizeof (uqn->uqn_id));
	/*
	 * Use id directly for searching because uqn_id is the first field of
	 * userquota_node_t and fields after uqn_id won't be accessed in
	 * avl_find().
	 */
	uqn = avl_find(avl, (const void *)id, &idx);
	if (uqn == NULL) {
		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
		strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
		avl_insert(avl, uqn, idx);
	}
	uqn->uqn_delta += delta;
}
1869
428870ff 1870static void
9c5167d1
NF
1871do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
1872 uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
1873 boolean_t subtract)
428870ff 1874{
9c5167d1 1875 if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) {
50c957f7 1876 int64_t delta = DNODE_MIN_SIZE + used;
9b7a83cb
JX
1877 char name[20];
1878
428870ff
BB
1879 if (subtract)
1880 delta = -delta;
9b7a83cb
JX
1881
1882 (void) sprintf(name, "%llx", (longlong_t)user);
1883 userquota_update_cache(&cache->uqc_user_deltas, name, delta);
1884
1885 (void) sprintf(name, "%llx", (longlong_t)group);
1886 userquota_update_cache(&cache->uqc_group_deltas, name, delta);
9c5167d1
NF
1887
1888 if (dmu_objset_projectquota_enabled(os)) {
1889 (void) sprintf(name, "%llx", (longlong_t)project);
1890 userquota_update_cache(&cache->uqc_project_deltas,
1891 name, delta);
1892 }
428870ff
BB
1893 }
1894}
1895
/*
 * Accumulate a +/-1 object-count delta (keyed with the DMU_OBJACCT_PREFIX
 * namespace) for the dnode's user, group, and -- when project quota is
 * enabled -- project ids, for dnodes flagged for user-object accounting.
 */
static void
do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
    uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
{
	if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
		char name[20 + DMU_OBJACCT_PREFIX_LEN];
		int delta = subtract ? -1 : 1;

		(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
		    (longlong_t)user);
		userquota_update_cache(&cache->uqc_user_deltas, name, delta);

		(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
		    (longlong_t)group);
		userquota_update_cache(&cache->uqc_group_deltas, name, delta);

		if (dmu_objset_projectquota_enabled(os)) {
			(void) snprintf(name, sizeof (name),
			    DMU_OBJACCT_PREFIX "%llx", (longlong_t)project);
			userquota_update_cache(&cache->uqc_project_deltas,
			    name, delta);
		}
	}
}
1920
/*
 * Argument bundle for one userquota_updates_task() dispatched to the
 * pool's dp_sync_taskq.  Allocated by dmu_objset_do_userquota_updates()
 * and freed by the task itself.
 */
typedef struct userquota_updates_arg {
	objset_t *uua_os;	/* objset whose synced dnodes we process */
	int uua_sublist_idx;	/* os_synced_dnodes sublist to drain */
	dmu_tx_t *uua_tx;	/* syncing tx the quota ZAP updates join */
} userquota_updates_arg_t;
1926
/*
 * Taskq worker: drain one sublist of os_synced_dnodes, accumulating the
 * user/group/project space and object-count deltas of every synced dnode
 * into local AVL caches, then flush the caches to the quota ZAPs in one
 * pass.  Runs on dp_sync_taskq; the dispatcher taskq_wait()s for us.
 */
static void
userquota_updates_task(void *arg)
{
	userquota_updates_arg_t *uua = arg;
	objset_t *os = uua->uua_os;
	dmu_tx_t *tx = uua->uua_tx;
	dnode_t *dn;
	userquota_cache_t cache = { { 0 } };

	multilist_sublist_t *list =
	    multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);

	ASSERT(multilist_sublist_head(list) == NULL ||
	    dmu_objset_userused_enabled(os));
	avl_create(&cache.uqc_user_deltas, userquota_compare,
	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
	avl_create(&cache.uqc_group_deltas, userquota_compare,
	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
	/* Project deltas are only tracked when the objset supports them. */
	if (dmu_objset_projectquota_enabled(os))
		avl_create(&cache.uqc_project_deltas, userquota_compare,
		    sizeof (userquota_node_t), offsetof(userquota_node_t,
		    uqn_node));

	while ((dn = multilist_sublist_head(list)) != NULL) {
		int flags;
		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
		    dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED);

		flags = dn->dn_id_flags;
		ASSERT(flags);
		/* Un-charge the old ids, if the dnode had any. */
		if (flags & DN_ID_OLD_EXIST) {
			do_userquota_update(os, &cache, dn->dn_oldused,
			    dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
			    dn->dn_oldprojid, B_TRUE);
			do_userobjquota_update(os, &cache, dn->dn_oldflags,
			    dn->dn_olduid, dn->dn_oldgid,
			    dn->dn_oldprojid, B_TRUE);
		}
		/* Charge the new ids with the just-synced on-disk usage. */
		if (flags & DN_ID_NEW_EXIST) {
			do_userquota_update(os, &cache,
			    DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
			    dn->dn_newuid, dn->dn_newgid,
			    dn->dn_newprojid, B_FALSE);
			do_userobjquota_update(os, &cache,
			    dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
			    dn->dn_newprojid, B_FALSE);
		}

		/*
		 * Roll the "new" ids over to "old" for the next sync and
		 * clear the per-txg accounting state under dn_mtx.
		 */
		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = 0;
		dn->dn_oldflags = 0;
		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
			dn->dn_olduid = dn->dn_newuid;
			dn->dn_oldgid = dn->dn_newgid;
			dn->dn_oldprojid = dn->dn_newprojid;
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
			if (dn->dn_bonuslen == 0)
				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
			else
				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
		if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
			dn->dn_dirty_txg = 0;
		mutex_exit(&dn->dn_mtx);

		/* Drop the hold that kept the dnode on the synced list. */
		multilist_sublist_remove(list, dn);
		dnode_rele(dn, os->os_synced_dnodes);
	}
	do_userquota_cacheflush(os, &cache, tx);
	multilist_sublist_unlock(list);
	kmem_free(uua, sizeof (*uua));
}
2002
/*
 * Apply the user/group/project accounting deltas for everything synced
 * in this txg.  The work is fanned out across dp_sync_taskq, one task
 * per non-empty os_synced_dnodes sublist; the caller is responsible for
 * taskq_wait()ing on the sync taskq (see note below).
 */
void
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
	int num_sublists;

	if (!dmu_objset_userused_enabled(os))
		return;

	/*
	 * If this is a raw receive just return and handle accounting
	 * later when we have the keys loaded. We also don't do user
	 * accounting during claiming since the datasets are not owned
	 * for the duration of claiming and this txg should only be
	 * used for recovery.
	 */
	if (os->os_encrypted && dmu_objset_is_receiving(os))
		return;

	if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
		return;

	/* Allocate the user/group/project used objects if necessary. */
	if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
		VERIFY0(zap_create_claim(os,
		    DMU_USERUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
		VERIFY0(zap_create_claim(os,
		    DMU_GROUPUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
	}

	if (dmu_objset_projectquota_enabled(os) &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
		VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
	}

	num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
	for (int i = 0; i < num_sublists; i++) {
		/* Skip sublists with no synced dnodes to process. */
		if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i))
			continue;
		userquota_updates_arg_t *uua =
		    kmem_alloc(sizeof (*uua), KM_SLEEP);
		uua->uua_os = os;
		uua->uua_sublist_idx = i;
		uua->uua_tx = tx;
		/* note: caller does taskq_wait() */
		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
		    userquota_updates_task, uua, 0);
		/* callback frees uua */
	}
}
2055
/*
 * Returns a pointer to data to find uid/gid from
 *
 * If a dirty record for transaction group that is syncing can't
 * be found then NULL is returned.  In the NULL case it is assumed
 * the uid/gid aren't changing.
 *
 * Caller must hold db->db_mtx (we walk the dbuf's dirty records).
 */
static void *
dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;
	void *data;

	if (db->db_dirtycnt == 0)
		return (db->db.db_data); /* Nothing is changing */

	dr = dbuf_find_dirty_eq(db, tx->tx_txg);

	if (dr == NULL) {
		data = NULL;
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(dr->dr_dbuf);
		dn = DB_DNODE(dr->dr_dbuf);

		/*
		 * For a spill block of an SA dnode the dirty data lives
		 * in an arc buf; otherwise dr_data is the data itself.
		 */
		if (dn->dn_bonuslen == 0 &&
		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
			data = dr->dt.dl.dr_data->b_data;
		else
			data = dr->dt.dl.dr_data;

		DB_DNODE_EXIT(dr->dr_dbuf);
	}

	return (data);
}
2093
/*
 * Record a dnode's uid/gid/projid either "before" (old ids, from the
 * on-disk state) or after (new ids, from the pending dirty state).
 * The ids are extracted from the bonus buffer or spill block by the
 * per-objset-type used_cbs[] callback; progress and results are tracked
 * in dn->dn_id_flags (DN_ID_OLD_EXIST / DN_ID_NEW_EXIST / DN_ID_CHKED_*).
 */
void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	void *data = NULL;
	dmu_buf_impl_t *db = NULL;
	uint64_t *user = NULL;
	uint64_t *group = NULL;
	uint64_t *project = NULL;
	int flags = dn->dn_id_flags;
	int error;
	boolean_t have_spill = B_FALSE;

	if (!dmu_objset_userused_enabled(dn->dn_objset))
		return;

	/*
	 * Raw receives introduce a problem with user accounting. Raw
	 * receives cannot update the user accounting info because the
	 * user ids and the sizes are encrypted. To guarantee that we
	 * never end up with bad user accounting, we simply disable it
	 * during raw receives. We also disable this for normal receives
	 * so that an incremental raw receive may be done on top of an
	 * existing non-raw receive.
	 */
	if (os->os_encrypted && dmu_objset_is_receiving(os))
		return;

	/* Old ids already captured (or checked) — nothing to do. */
	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
	    DN_ID_CHKED_SPILL)))
		return;

	/*
	 * Locate the buffer holding the ids: the on-disk bonus for
	 * "before", the dirty bonus dbuf (if any) for "after", or the
	 * spill block for SA dnodes with no bonus data.
	 */
	if (before && dn->dn_bonuslen != 0)
		data = DN_BONUS(dn->dn_phys);
	else if (!before && dn->dn_bonuslen != 0) {
		if (dn->dn_bonus) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
			data = dmu_objset_userquota_find_data(db, tx);
		} else {
			data = DN_BONUS(dn->dn_phys);
		}
	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
		int rf = 0;

		if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
			rf |= DB_RF_HAVESTRUCT;
		error = dmu_spill_hold_by_dnode(dn,
		    rf | DB_RF_MUST_SUCCEED,
		    FTAG, (dmu_buf_t **)&db);
		ASSERT(error == 0);
		mutex_enter(&db->db_mtx);
		data = (before) ? db->db.db_data :
		    dmu_objset_userquota_find_data(db, tx);
		have_spill = B_TRUE;
	} else {
		/* No id-bearing data at all; mark the bonus as checked. */
		mutex_enter(&dn->dn_mtx);
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		mutex_exit(&dn->dn_mtx);
		return;
	}

	if (before) {
		ASSERT(data);
		user = &dn->dn_olduid;
		group = &dn->dn_oldgid;
		project = &dn->dn_oldprojid;
	} else if (data) {
		user = &dn->dn_newuid;
		group = &dn->dn_newgid;
		project = &dn->dn_newprojid;
	}

	/*
	 * Must always call the callback in case the object
	 * type has changed and that type isn't an object type to track
	 */
	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
	    user, group, project);

	/*
	 * Preserve existing uid/gid when the callback can't determine
	 * what the new uid/gid are and the callback returned EEXIST.
	 * The EEXIST error tells us to just use the existing uid/gid.
	 * If we don't know what the old values are then just assign
	 * them to 0, since that is a new file being created.
	 */
	if (!before && data == NULL && error == EEXIST) {
		if (flags & DN_ID_OLD_EXIST) {
			dn->dn_newuid = dn->dn_olduid;
			dn->dn_newgid = dn->dn_oldgid;
			dn->dn_newprojid = dn->dn_oldprojid;
		} else {
			dn->dn_newuid = 0;
			dn->dn_newgid = 0;
			dn->dn_newprojid = ZFS_DEFAULT_PROJID;
		}
		error = 0;
	}

	if (db)
		mutex_exit(&db->db_mtx);

	/* Publish the outcome under dn_mtx. */
	mutex_enter(&dn->dn_mtx);
	if (error == 0 && before)
		dn->dn_id_flags |= DN_ID_OLD_EXIST;
	if (error == 0 && !before)
		dn->dn_id_flags |= DN_ID_NEW_EXIST;

	if (have_spill) {
		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
	} else {
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
	}
	mutex_exit(&dn->dn_mtx);
	/* Drop the spill hold taken above, if we took one. */
	if (have_spill)
		dmu_buf_rele((dmu_buf_t *)db, FTAG);
}
2212
9babb374
BB
2213boolean_t
2214dmu_objset_userspace_present(objset_t *os)
2215{
428870ff 2216 return (os->os_phys->os_flags &
9babb374
BB
2217 OBJSET_FLAG_USERACCOUNTING_COMPLETE);
2218}
2219
1de321e6
JX
2220boolean_t
2221dmu_objset_userobjspace_present(objset_t *os)
2222{
2223 return (os->os_phys->os_flags &
2224 OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
2225}
2226
9c5167d1
NF
2227boolean_t
2228dmu_objset_projectquota_present(objset_t *os)
2229{
2230 return (os->os_phys->os_flags &
2231 OBJSET_FLAG_PROJECTQUOTA_COMPLETE);
2232}
2233
/*
 * Walk every object in the objset and dirty it, forcing its accounting
 * to be recomputed at sync time.  Interruptible: returns EINTR when
 * os_upgrade_exit is set or a signal is pending; per-object errors are
 * skipped (each object's accounted state is tracked independently).
 */
static int
dmu_objset_space_upgrade(objset_t *os)
{
	uint64_t obj;
	int err = 0;

	/*
	 * We simply need to mark every object dirty, so that it will be
	 * synced out and now accounted.  If this is called
	 * concurrently, or if we already did some work before crashing,
	 * that's fine, since we track each object's accounted state
	 * independently.
	 */

	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
		dmu_tx_t *tx;
		dmu_buf_t *db;
		int objerr;

		/* Honor a cancellation request from dmu_objset_upgrade_stop. */
		mutex_enter(&os->os_upgrade_lock);
		if (os->os_upgrade_exit)
			err = SET_ERROR(EINTR);
		mutex_exit(&os->os_upgrade_lock);
		if (err != 0)
			return (err);

		if (issig(JUSTLOOKING) && issig(FORREAL))
			return (SET_ERROR(EINTR));

		/* Skip objects we can't hold; they may have been freed. */
		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
		if (objerr != 0)
			continue;
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		objerr = dmu_tx_assign(tx, TXG_WAIT);
		if (objerr != 0) {
			dmu_buf_rele(db, FTAG);
			dmu_tx_abort(tx);
			continue;
		}
		dmu_buf_will_dirty(db, tx);
		dmu_buf_rele(db, FTAG);
		dmu_tx_commit(tx);
	}
	return (0);
}
2280
/*
 * Synchronously (re)compute user/group space accounting for the whole
 * objset, then mark it complete on disk.  Returns 0 if already present,
 * EINVAL for snapshots, ENOTSUP if accounting is not enabled, or the
 * error from the object walk (e.g. EINTR).
 */
int
dmu_objset_userspace_upgrade(objset_t *os)
{
	int err = 0;

	if (dmu_objset_userspace_present(os))
		return (0);
	if (dmu_objset_is_snapshot(os))
		return (SET_ERROR(EINVAL));
	if (!dmu_objset_userused_enabled(os))
		return (SET_ERROR(ENOTSUP));

	err = dmu_objset_space_upgrade(os);
	if (err)
		return (err);

	/* Persist completion and wait for it to hit disk. */
	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}
2301
/*
 * Upgrade callback (run via dmu_objset_upgrade()) that computes object
 * accounting and, when enabled, project quota accounting.  Activates the
 * corresponding feature flags, dirties every object, then marks the
 * accounting complete on disk.  Returns 0, EINVAL for snapshots, ENOTSUP
 * when the features aren't enabled, or the error from the object walk.
 */
static int
dmu_objset_id_quota_upgrade_cb(objset_t *os)
{
	int err = 0;

	if (dmu_objset_userobjspace_present(os) &&
	    dmu_objset_projectquota_present(os))
		return (0);
	if (dmu_objset_is_snapshot(os))
		return (SET_ERROR(EINVAL));
	if (!dmu_objset_userobjused_enabled(os))
		return (SET_ERROR(ENOTSUP));
	if (!dmu_objset_projectquota_enabled(os) &&
	    dmu_objset_userobjspace_present(os))
		return (SET_ERROR(ENOTSUP));

	/* Activate the features before rewriting any accounting data. */
	dmu_objset_ds(os)->ds_feature_activation[
	    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
	if (dmu_objset_projectquota_enabled(os))
		dmu_objset_ds(os)->ds_feature_activation[
		    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;

	err = dmu_objset_space_upgrade(os);
	if (err)
		return (err);

	os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
	if (dmu_objset_projectquota_enabled(os))
		os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}
2335
/*
 * Kick off the asynchronous object-count/project-quota accounting
 * upgrade for this objset (runs dmu_objset_id_quota_upgrade_cb in the
 * upgrade thread machinery).
 */
void
dmu_objset_id_quota_upgrade(objset_t *os)
{
	dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
}
2341
126ae9f4
JX
2342boolean_t
2343dmu_objset_userobjspace_upgradable(objset_t *os)
2344{
2345 return (dmu_objset_type(os) == DMU_OST_ZFS &&
2346 !dmu_objset_is_snapshot(os) &&
2347 dmu_objset_userobjused_enabled(os) &&
bb1be77a 2348 !dmu_objset_userobjspace_present(os) &&
2349 spa_writeable(dmu_objset_spa(os)));
126ae9f4
JX
2350}
2351
9c5167d1
NF
2352boolean_t
2353dmu_objset_projectquota_upgradable(objset_t *os)
2354{
2355 return (dmu_objset_type(os) == DMU_OST_ZFS &&
2356 !dmu_objset_is_snapshot(os) &&
2357 dmu_objset_projectquota_enabled(os) &&
bb1be77a 2358 !dmu_objset_projectquota_present(os) &&
2359 spa_writeable(dmu_objset_spa(os)));
9c5167d1
NF
2360}
2361
/*
 * Report space usage for this objset: referenced and available bytes,
 * plus used and available object counts, delegated to the underlying
 * dsl_dataset.
 */
void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
	    usedobjsp, availobjsp);
}
2369
/*
 * Return the fsid guid of the objset's dataset (used as the statvfs
 * filesystem id).
 */
uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}
2375
2376void
2377dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
2378{
428870ff
BB
2379 stat->dds_type = os->os_phys->os_type;
2380 if (os->os_dsl_dataset)
2381 dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
34dc7c2f
BB
2382}
2383
/*
 * Add objset statistics to an nvlist: dataset stats (when present),
 * the objset type, and whether userspace accounting is complete.
 * Only the MOS may legitimately lack a dataset.
 */
void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
	ASSERT(os->os_dsl_dataset ||
	    os->os_phys->os_type == DMU_OST_META);

	if (os->os_dsl_dataset != NULL)
		dsl_dataset_stats(os->os_dsl_dataset, nv);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
	    os->os_phys->os_type);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
	    dmu_objset_userspace_present(os));
}
2398
2399int
2400dmu_objset_is_snapshot(objset_t *os)
2401{
428870ff 2402 if (os->os_dsl_dataset != NULL)
0c66c32d 2403 return (os->os_dsl_dataset->ds_is_snapshot);
34dc7c2f
BB
2404 else
2405 return (B_FALSE);
2406}
2407
/*
 * Normalization-aware lookup of a snapshot name: find the on-disk
 * spelling of "name" in the dataset's snapshot ZAP, writing it to
 * "real" (up to maxlen bytes) and flagging normalization conflicts in
 * *conflict.  Returns ENOENT when the dataset has no snapshot ZAP.
 */
int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
    boolean_t *conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	uint64_t ignored;	/* ZAP value (snapshot obj#) — not needed */

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
	    MT_NORMALIZE, real, maxlen, conflict));
}
2422
/*
 * Cursor-style iteration over a dataset's snapshots.  *offp is a
 * serialized ZAP cursor (start at 0); on success the snapshot name is
 * copied to name[namelen], optionally its object number to *idp and
 * its case-conflict flag to *case_conflict, and *offp is advanced.
 * Returns ENOENT at the end of the list, ENAMETOOLONG if the buffer
 * is too small.  Caller must hold the pool config lock.
 */
int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	zap_cursor_init_serialized(&cursor,
	    ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	if (case_conflict)
		*case_conflict = attr.za_normalization_conflict;
	/* Re-serialize the advanced cursor for the next call. */
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}
2461
ebe7e575 2462int
6772fb67 2463dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
ebe7e575 2464{
d1d7e268 2465 return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
ebe7e575
BB
2466}
2467
/*
 * Cursor-style iteration over a filesystem's child directories.  *offp
 * is a serialized ZAP cursor (start at 0); on success the child name is
 * copied to name[namelen], optionally its dir object number to *idp,
 * and *offp is advanced.  Returns ENOENT at the end of the list (or
 * when called on a snapshot), ENAMETOOLONG if the buffer is too small.
 */
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	/* there is no next dir on a snapshot! */
	if (os->os_dsl_dataset->ds_object !=
	    dsl_dir_phys(dd)->dd_head_dataset_obj)
		return (SET_ERROR(ENOENT));

	zap_cursor_init_serialized(&cursor,
	    dd->dd_pool->dp_meta_objset,
	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	/* Re-serialize the advanced cursor for the next call. */
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}
2504
/*
 * Per-dsl_dir work item for the parallel dataset walk in
 * dmu_objset_find_dp().  One context is created per directory visited;
 * children are dispatched as new contexts on dc_tq (or processed
 * synchronously when dc_tq is NULL).  The context is freed by
 * dmu_objset_find_dp_impl().
 */
typedef struct dmu_objset_find_ctx {
	taskq_t *dc_tq;		/* taskq for children; NULL = synchronous */
	dsl_pool_t *dc_dp;	/* pool being walked */
	uint64_t dc_ddobj;	/* dsl_dir object to process */
	char *dc_ddname; /* last component of ddobj's name */
	int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
	void *dc_arg;		/* opaque argument passed to dc_func */
	int dc_flags;		/* DS_FIND_* flags */
	kmutex_t *dc_error_lock;	/* guards *dc_error */
	int *dc_error;		/* first error seen by any worker */
} dmu_objset_find_ctx_t;
2516
/*
 * Worker for dmu_objset_find_dp(): process one dsl_dir — recurse into
 * child dirs (dispatched to the taskq when one exists), visit snapshots
 * when requested, then apply dc_func to the dir's head dataset.  The
 * first error encountered by any worker is latched into *dc_error.
 * The context dcp is consumed (freed) before returning.
 */
static void
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
	dsl_pool_t *dp = dcp->dc_dp;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	uint64_t thisobj;
	int err = 0;

	/* don't process if there already was an error */
	if (*dcp->dc_error != 0)
		goto out;

	/*
	 * Note: passing the name (dc_ddname) here is optional, but it
	 * improves performance because we don't need to call
	 * zap_value_search() to determine the name.
	 */
	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
	if (err != 0)
		goto out;

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		goto out;
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/*
	 * Iterate over all children.
	 */
	if (dcp->dc_flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT3U(attr->za_integer_length, ==,
			    sizeof (uint64_t));
			ASSERT3U(attr->za_num_integers, ==, 1);

			/* Each child gets its own context, freed by worker. */
			dmu_objset_find_ctx_t *child_dcp =
			    kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
			*child_dcp = *dcp;
			child_dcp->dc_ddobj = attr->za_first_integer;
			child_dcp->dc_ddname = spa_strdup(attr->za_name);
			if (dcp->dc_tq != NULL)
				(void) taskq_dispatch(dcp->dc_tq,
				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
			else
				dmu_objset_find_dp_impl(child_dcp);
		}
		zap_cursor_fini(&zc);
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
		dsl_dataset_t *ds;	/* NOTE: shadows the outer ds */
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		if (err == 0) {
			uint64_t snapobj;

			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				err = dsl_dataset_hold_obj(dp,
				    attr->za_first_integer, FTAG, &ds);
				if (err != 0)
					break;
				err = dcp->dc_func(dp, ds, dcp->dc_arg);
				dsl_dataset_rele(ds, FTAG);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	kmem_free(attr, sizeof (zap_attribute_t));

	if (err != 0) {
		dsl_dir_rele(dd, FTAG);
		goto out;
	}

	/*
	 * Apply to self.
	 */
	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

	/*
	 * Note: we hold the dir while calling dsl_dataset_hold_obj() so
	 * that the dir will remain cached, and we won't have to re-instantiate
	 * it (which could be expensive due to finding its name via
	 * zap_value_search()).
	 */
	dsl_dir_rele(dd, FTAG);
	if (err != 0)
		goto out;
	err = dcp->dc_func(dp, ds, dcp->dc_arg);
	dsl_dataset_rele(ds, FTAG);

out:
	if (err != 0) {
		mutex_enter(dcp->dc_error_lock);
		/* only keep first error */
		if (*dcp->dc_error == 0)
			*dcp->dc_error = err;
		mutex_exit(dcp->dc_error_lock);
	}

	if (dcp->dc_ddname != NULL)
		spa_strfree(dcp->dc_ddname);
	kmem_free(dcp, sizeof (*dcp));
}
2646
/*
 * Taskq entry point for one dmu_objset_find_dp() work item: take the
 * pool config lock (with priority, see below) and run the walker.
 */
static void
dmu_objset_find_dp_cb(void *arg)
{
	dmu_objset_find_ctx_t *dcp = arg;
	dsl_pool_t *dp = dcp->dc_dp;

	/*
	 * We need to get a pool_config_lock here, as there are several
	 * assert(pool_config_held) down the stack. Getting a lock via
	 * dsl_pool_config_enter is risky, as it might be stalled by a
	 * pending writer. This would deadlock, as the write lock can
	 * only be granted when our parent thread gives up the lock.
	 * The _prio interface gives us priority over a pending writer.
	 */
	dsl_pool_config_enter_prio(dp, FTAG);

	dmu_objset_find_dp_impl(dcp);

	dsl_pool_config_exit(dp, FTAG);
}
2667
/*
 * Find objsets under and including ddobj, call func(ds) on each.
 * The order for the enumeration is completely undefined.
 * func is called with dsl_pool_config held.
 *
 * The walk runs in parallel on a private taskq unless DS_FIND_SERIALIZE
 * is set or the pool config write lock is held, in which case it falls
 * back to a synchronous recursion.  Returns the first error any worker
 * encountered, or 0.
 */
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
{
	int error = 0;
	taskq_t *tq = NULL;
	int ntasks;
	dmu_objset_find_ctx_t *dcp;
	kmutex_t err_lock;

	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
	dcp->dc_tq = NULL;
	dcp->dc_dp = dp;
	dcp->dc_ddobj = ddobj;
	dcp->dc_ddname = NULL;
	dcp->dc_func = func;
	dcp->dc_arg = arg;
	dcp->dc_flags = flags;
	dcp->dc_error_lock = &err_lock;
	dcp->dc_error = &error;

	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
		/*
		 * In case a write lock is held we can't make use of
		 * parallelism, as down the stack of the worker threads
		 * the lock is asserted via dsl_pool_config_held.
		 * In case of a read lock this is solved by getting a read
		 * lock in each worker thread, which isn't possible in case
		 * of a writer lock. So we fall back to the synchronous path
		 * here.
		 * In the future it might be possible to get some magic into
		 * dsl_pool_config_held in a way that it returns true for
		 * the worker threads so that a single lock held from this
		 * thread suffices. For now, stay single threaded.
		 */
		dmu_objset_find_dp_impl(dcp);
		mutex_destroy(&err_lock);

		return (error);
	}

	/* Size the taskq from the tunable, or from the vdev leaf count. */
	ntasks = dmu_find_threads;
	if (ntasks == 0)
		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
	tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
	    INT_MAX, 0);
	if (tq == NULL) {
		kmem_free(dcp, sizeof (*dcp));
		mutex_destroy(&err_lock);

		return (SET_ERROR(ENOMEM));
	}
	dcp->dc_tq = tq;

	/* dcp will be freed by task */
	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);

	/*
	 * PORTING: this code relies on the property of taskq_wait to wait
	 * until no more tasks are queued and no more tasks are active. As
	 * we always queue new tasks from within other tasks, task_wait
	 * reliably waits for the full recursion to finish, even though we
	 * enqueue new tasks after taskq_wait has been called.
	 * On platforms other than illumos, taskq_wait may not have this
	 * property.
	 */
	taskq_wait(tq);
	taskq_destroy(tq);
	mutex_destroy(&err_lock);

	return (error);
}
2746
/*
 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
 * The dp_config_rwlock must not be held when this is called, and it
 * will not be held when the callback is called.
 * Therefore this function should only be used when the pool is not changing
 * (e.g. in syncing context), or the callback can deal with the possible races.
 */
static int
dmu_objset_find_impl(spa_t *spa, const char *name,
    int func(const char *, void *), void *arg, int flags)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp = spa_get_dsl(spa);
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	char *child;
	uint64_t thisobj;
	int err;

	dsl_pool_config_enter(dp, FTAG);

	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
	if (err != 0) {
		dsl_pool_config_exit(dp, FTAG);
		return (err);
	}

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		dsl_pool_config_exit(dp, FTAG);
		return (0);
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/*
	 * Iterate over all children.
	 */
	if (flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT3U(attr->za_integer_length, ==,
			    sizeof (uint64_t));
			ASSERT3U(attr->za_num_integers, ==, 1);

			/*
			 * Drop the config lock around the recursion so the
			 * child (and the callback) run without it held.
			 */
			child = kmem_asprintf("%s/%s", name, attr->za_name);
			dsl_pool_config_exit(dp, FTAG);
			err = dmu_objset_find_impl(spa, child,
			    func, arg, flags);
			dsl_pool_config_enter(dp, FTAG);
			kmem_strfree(child);
			if (err != 0)
				break;
		}
		zap_cursor_fini(&zc);

		if (err != 0) {
			dsl_dir_rele(dd, FTAG);
			dsl_pool_config_exit(dp, FTAG);
			kmem_free(attr, sizeof (zap_attribute_t));
			return (err);
		}
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (flags & DS_FIND_SNAPSHOTS) {
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		if (err == 0) {
			uint64_t snapobj;

			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				/* Callback runs without the config lock. */
				child = kmem_asprintf("%s@%s",
				    name, attr->za_name);
				dsl_pool_config_exit(dp, FTAG);
				err = func(child, arg);
				dsl_pool_config_enter(dp, FTAG);
				kmem_strfree(child);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	dsl_dir_rele(dd, FTAG);
	kmem_free(attr, sizeof (zap_attribute_t));
	dsl_pool_config_exit(dp, FTAG);

	if (err != 0)
		return (err);

	/* Apply to self. */
	return (func(name, arg));
}
2858
13fe0198
MA
2859/*
2860 * See comment above dmu_objset_find_impl().
2861 */
d164b209 2862int
5df7e9d8 2863dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
13fe0198 2864 int flags)
d164b209 2865{
13fe0198
MA
2866 spa_t *spa;
2867 int error;
d164b209 2868
13fe0198
MA
2869 error = spa_open(name, &spa, FTAG);
2870 if (error != 0)
2871 return (error);
2872 error = dmu_objset_find_impl(spa, name, func, arg, flags);
2873 spa_close(spa, FTAG);
2874 return (error);
d164b209
BB
2875}
2876
ae76f45c
TC
2877boolean_t
2878dmu_objset_incompatible_encryption_version(objset_t *os)
2879{
2880 return (dsl_dir_incompatible_encryption_version(
2881 os->os_dsl_dataset->ds_dir));
2882}
2883
/*
 * Attach an opaque user pointer (e.g. the mounted zfsvfs/zvol state) to
 * the objset.  Caller must hold os_user_ptr_lock.
 */
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	os->os_user_ptr = user_ptr;
}
2890
/*
 * Return the opaque user pointer previously attached with
 * dmu_objset_set_user().  Caller must hold os_user_ptr_lock.
 */
void *
dmu_objset_get_user(objset_t *os)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
}
c28b2279 2897
13fe0198
MA
2898/*
2899 * Determine name of filesystem, given name of snapshot.
eca7b760 2900 * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
13fe0198
MA
2901 */
2902int
2903dmu_fsname(const char *snapname, char *buf)
2904{
2905 char *atp = strchr(snapname, '@');
2906 if (atp == NULL)
2e528b49 2907 return (SET_ERROR(EINVAL));
eca7b760 2908 if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
2e528b49 2909 return (SET_ERROR(ENAMETOOLONG));
13fe0198
MA
2910 (void) strlcpy(buf, snapname, atp - snapname + 1);
2911 return (0);
2912}
2913
3ec3bc21 2914/*
0f8ff49e
SD
2915 * Call when we think we're going to write/free space in open context
2916 * to track the amount of dirty data in the open txg, which is also the
2917 * amount of memory that can not be evicted until this txg syncs.
2918 *
2919 * Note that there are two conditions where this can be called from
2920 * syncing context:
2921 *
2922 * [1] When we just created the dataset, in which case we go on with
2923 * updating any accounting of dirty data as usual.
2924 * [2] When we are dirtying MOS data, in which case we only update the
2925 * pool's accounting of dirty data.
3ec3bc21
BB
2926 */
2927void
2928dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
2929{
2930 dsl_dataset_t *ds = os->os_dsl_dataset;
2931 int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
2932
2933 if (ds != NULL) {
2934 dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
3ec3bc21 2935 }
0f8ff49e
SD
2936
2937 dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
3ec3bc21
BB
2938}
2939
#if defined(_KERNEL)
/* Accessors and hold/own lifecycle. */
EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool);
EXPORT_SYMBOL(dmu_objset_ds);
EXPORT_SYMBOL(dmu_objset_type);
EXPORT_SYMBOL(dmu_objset_name);
EXPORT_SYMBOL(dmu_objset_hold);
EXPORT_SYMBOL(dmu_objset_hold_flags);
EXPORT_SYMBOL(dmu_objset_own);
EXPORT_SYMBOL(dmu_objset_rele);
EXPORT_SYMBOL(dmu_objset_rele_flags);
EXPORT_SYMBOL(dmu_objset_disown);
EXPORT_SYMBOL(dmu_objset_from_ds);
EXPORT_SYMBOL(dmu_objset_create);
EXPORT_SYMBOL(dmu_objset_clone);
EXPORT_SYMBOL(dmu_objset_stats);
EXPORT_SYMBOL(dmu_objset_fast_stat);
EXPORT_SYMBOL(dmu_objset_spa);
EXPORT_SYMBOL(dmu_objset_space);
EXPORT_SYMBOL(dmu_objset_fsid_guid);
EXPORT_SYMBOL(dmu_objset_find);
EXPORT_SYMBOL(dmu_objset_byteswap);
EXPORT_SYMBOL(dmu_objset_evict_dbufs);
EXPORT_SYMBOL(dmu_objset_snap_cmtime);
EXPORT_SYMBOL(dmu_objset_dnodesize);

/* Sync-path internals and user/project quota accounting. */
EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
EXPORT_SYMBOL(dmu_objset_register_type);
EXPORT_SYMBOL(dmu_objset_do_userquota_updates);
EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
EXPORT_SYMBOL(dmu_objset_userused_enabled);
EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
EXPORT_SYMBOL(dmu_objset_userspace_present);
EXPORT_SYMBOL(dmu_objset_userobjused_enabled);
EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable);
EXPORT_SYMBOL(dmu_objset_userobjspace_present);
EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
EXPORT_SYMBOL(dmu_objset_projectquota_present);
EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
#endif