module/zfs/dmu_objset.c
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
9b7b9cd3 21
34dc7c2f 22/*
428870ff 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
ba67d821 24 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
3a17a7a9 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
788eb90c 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
0c66c32d 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
9c43027b 28 * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
a0bd735a 29 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
9b7b9cd3 30 * Copyright 2017 Nexenta Systems, Inc.
c0daec32 31 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
d8d418ff 32 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
33 * Copyright (c) 2019, Klara Inc.
34 * Copyright (c) 2019, Allan Jude
35 */
36
37/* Portions Copyright 2010 Robert Milkowski */
38
39#include <sys/cred.h>
40#include <sys/zfs_context.h>
41#include <sys/dmu_objset.h>
42#include <sys/dsl_dir.h>
43#include <sys/dsl_dataset.h>
44#include <sys/dsl_prop.h>
45#include <sys/dsl_pool.h>
46#include <sys/dsl_synctask.h>
47#include <sys/dsl_deleg.h>
48#include <sys/dnode.h>
49#include <sys/dbuf.h>
50#include <sys/zvol.h>
51#include <sys/dmu_tx.h>
52#include <sys/zap.h>
53#include <sys/zil.h>
54#include <sys/dmu_impl.h>
55#include <sys/zfs_ioctl.h>
428870ff 56#include <sys/sa.h>
572e2857 57#include <sys/zfs_onexit.h>
13fe0198 58#include <sys/dsl_destroy.h>
9c43027b 59#include <sys/vdev.h>
a1d477c2 60#include <sys/zfeature.h>
f74b821a 61#include <sys/policy.h>
1de321e6 62#include <sys/spa_impl.h>
03916905 63#include <sys/dmu_recv.h>
9c5167d1 64#include <sys/zfs_project.h>
a7ed98d8 65#include "zfs_namecheck.h"
66
67/*
68 * Needed to close a window in dnode_move() that allows the objset to be freed
69 * before it can be safely accessed.
70 */
71krwlock_t os_lock;
72
9c43027b 73/*
4e33ba4c 74 * Tunable to override the maximum number of threads for the parallelization
75 * of dmu_objset_find_dp, needed to speed up the import of pools with many
76 * datasets.
77 * Default is 4 times the number of leaf vdevs.
78 */
79int dmu_find_threads = 0;
80
81/*
82 * Backfill lower metadnode objects after this many have been freed.
83 * Backfilling negatively impacts object creation rates, so only do it
84 * if there are enough holes to fill.
85 */
86int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;
87
88static char *upgrade_tag = "upgrade_tag";
89
90static void dmu_objset_find_dp_cb(void *arg);
91
92static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);
93static void dmu_objset_upgrade_stop(objset_t *os);
94
95void
96dmu_objset_init(void)
97{
98 rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
99}
100
101void
102dmu_objset_fini(void)
103{
104 rw_destroy(&os_lock);
105}
106
107spa_t *
108dmu_objset_spa(objset_t *os)
109{
428870ff 110 return (os->os_spa);
34dc7c2f
BB
111}
112
113zilog_t *
114dmu_objset_zil(objset_t *os)
115{
428870ff 116 return (os->os_zil);
34dc7c2f
BB
117}
118
119dsl_pool_t *
120dmu_objset_pool(objset_t *os)
121{
122 dsl_dataset_t *ds;
123
428870ff 124 if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
34dc7c2f
BB
125 return (ds->ds_dir->dd_pool);
126 else
428870ff 127 return (spa_get_dsl(os->os_spa));
34dc7c2f
BB
128}
129
130dsl_dataset_t *
131dmu_objset_ds(objset_t *os)
132{
428870ff 133 return (os->os_dsl_dataset);
34dc7c2f
BB
134}
135
136dmu_objset_type_t
137dmu_objset_type(objset_t *os)
138{
428870ff 139 return (os->os_phys->os_type);
34dc7c2f
BB
140}
141
142void
143dmu_objset_name(objset_t *os, char *buf)
144{
428870ff 145 dsl_dataset_name(os->os_dsl_dataset, buf);
34dc7c2f
BB
146}
147
148uint64_t
149dmu_objset_id(objset_t *os)
150{
428870ff 151 dsl_dataset_t *ds = os->os_dsl_dataset;
34dc7c2f
BB
152
153 return (ds ? ds->ds_object : 0);
154}
155
50c957f7
NB
156uint64_t
157dmu_objset_dnodesize(objset_t *os)
158{
159 return (os->os_dnodesize);
160}
161
faf0f58c 162zfs_sync_type_t
428870ff
BB
163dmu_objset_syncprop(objset_t *os)
164{
165 return (os->os_sync);
166}
167
faf0f58c 168zfs_logbias_op_t
428870ff
BB
169dmu_objset_logbias(objset_t *os)
170{
171 return (os->os_logbias);
172}
173
34dc7c2f
BB
174static void
175checksum_changed_cb(void *arg, uint64_t newval)
176{
428870ff 177 objset_t *os = arg;
34dc7c2f
BB
178
179 /*
180 * Inheritance should have been done by now.
181 */
182 ASSERT(newval != ZIO_CHECKSUM_INHERIT);
183
428870ff 184 os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
34dc7c2f
BB
185}
186
187static void
188compression_changed_cb(void *arg, uint64_t newval)
189{
428870ff 190 objset_t *os = arg;
34dc7c2f
BB
191
192 /*
193 * Inheritance and range checking should have been done by now.
194 */
195 ASSERT(newval != ZIO_COMPRESS_INHERIT);
196
10b3c7f5
MN
197 os->os_compress = zio_compress_select(os->os_spa,
198 ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON);
199 os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress,
200 ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT);
34dc7c2f
BB
201}
202
203static void
204copies_changed_cb(void *arg, uint64_t newval)
205{
428870ff 206 objset_t *os = arg;
34dc7c2f
BB
207
208 /*
209 * Inheritance and range checking should have been done by now.
210 */
211 ASSERT(newval > 0);
428870ff 212 ASSERT(newval <= spa_max_replication(os->os_spa));
34dc7c2f 213
428870ff
BB
214 os->os_copies = newval;
215}
216
217static void
218dedup_changed_cb(void *arg, uint64_t newval)
219{
220 objset_t *os = arg;
221 spa_t *spa = os->os_spa;
222 enum zio_checksum checksum;
223
224 /*
225 * Inheritance should have been done by now.
226 */
227 ASSERT(newval != ZIO_CHECKSUM_INHERIT);
228
229 checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
230
231 os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
232 os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
34dc7c2f
BB
233}
234
b128c09f
BB
235static void
236primary_cache_changed_cb(void *arg, uint64_t newval)
237{
428870ff 238 objset_t *os = arg;
b128c09f
BB
239
240 /*
241 * Inheritance and range checking should have been done by now.
242 */
243 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
244 newval == ZFS_CACHE_METADATA);
245
428870ff 246 os->os_primary_cache = newval;
b128c09f
BB
247}
248
249static void
250secondary_cache_changed_cb(void *arg, uint64_t newval)
251{
428870ff 252 objset_t *os = arg;
b128c09f
BB
253
254 /*
255 * Inheritance and range checking should have been done by now.
256 */
257 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
258 newval == ZFS_CACHE_METADATA);
259
428870ff
BB
260 os->os_secondary_cache = newval;
261}
262
263static void
264sync_changed_cb(void *arg, uint64_t newval)
265{
266 objset_t *os = arg;
267
268 /*
269 * Inheritance and range checking should have been done by now.
270 */
271 ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
272 newval == ZFS_SYNC_DISABLED);
273
274 os->os_sync = newval;
275 if (os->os_zil)
276 zil_set_sync(os->os_zil, newval);
277}
278
faf0f58c
MA
279static void
280redundant_metadata_changed_cb(void *arg, uint64_t newval)
281{
282 objset_t *os = arg;
283
284 /*
285 * Inheritance and range checking should have been done by now.
286 */
287 ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
288 newval == ZFS_REDUNDANT_METADATA_MOST);
289
290 os->os_redundant_metadata = newval;
291}
292
50c957f7
NB
293static void
294dnodesize_changed_cb(void *arg, uint64_t newval)
295{
296 objset_t *os = arg;
297
298 switch (newval) {
299 case ZFS_DNSIZE_LEGACY:
300 os->os_dnodesize = DNODE_MIN_SIZE;
301 break;
302 case ZFS_DNSIZE_AUTO:
303 /*
304 * Choose a dnode size that will work well for most
305 * workloads if the user specified "auto". Future code
306 * improvements could dynamically select a dnode size
307 * based on observed workload patterns.
308 */
309 os->os_dnodesize = DNODE_MIN_SIZE * 2;
310 break;
311 case ZFS_DNSIZE_1K:
312 case ZFS_DNSIZE_2K:
313 case ZFS_DNSIZE_4K:
314 case ZFS_DNSIZE_8K:
315 case ZFS_DNSIZE_16K:
316 os->os_dnodesize = newval;
317 break;
318 }
319}
320
cc99f275
DB
321static void
322smallblk_changed_cb(void *arg, uint64_t newval)
323{
324 objset_t *os = arg;
325
326 /*
327 * Inheritance and range checking should have been done by now.
328 */
329 ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
330 ASSERT(ISP2(newval));
331
332 os->os_zpl_special_smallblock = newval;
333}
334
428870ff
BB
335static void
336logbias_changed_cb(void *arg, uint64_t newval)
337{
338 objset_t *os = arg;
339
340 ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
341 newval == ZFS_LOGBIAS_THROUGHPUT);
342 os->os_logbias = newval;
343 if (os->os_zil)
344 zil_set_logbias(os->os_zil, newval);
b128c09f
BB
345}
346
f1512ee6
MA
347static void
348recordsize_changed_cb(void *arg, uint64_t newval)
349{
350 objset_t *os = arg;
351
352 os->os_recordsize = newval;
353}
354
34dc7c2f
BB
355void
356dmu_objset_byteswap(void *buf, size_t size)
357{
358 objset_phys_t *osp = buf;
359
9c5167d1
NF
360 ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||
361 size == sizeof (objset_phys_t));
34dc7c2f
BB
362 dnode_byteswap(&osp->os_meta_dnode);
363 byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
364 osp->os_type = BSWAP_64(osp->os_type);
9babb374 365 osp->os_flags = BSWAP_64(osp->os_flags);
9c5167d1 366 if (size >= OBJSET_PHYS_SIZE_V2) {
9babb374
BB
367 dnode_byteswap(&osp->os_userused_dnode);
368 dnode_byteswap(&osp->os_groupused_dnode);
9c5167d1
NF
369 if (size >= sizeof (objset_phys_t))
370 dnode_byteswap(&osp->os_projectused_dnode);
9babb374 371 }
34dc7c2f
BB
372}
373
64fc7762
MA
374/*
375 * The hash is a CRC-based hash of the objset_t pointer and the object number.
376 */
377static uint64_t
378dnode_hash(const objset_t *os, uint64_t obj)
379{
380 uintptr_t osv = (uintptr_t)os;
381 uint64_t crc = -1ULL;
382
383 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
384 /*
385 * The low 6 bits of the pointer don't have much entropy, because
386 * the objset_t is larger than 2^6 bytes long.
387 */
388 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
389 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
390 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
391 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
392
393 crc ^= (osv>>14) ^ (obj>>24);
394
395 return (crc);
396}
397
65c7cc49 398static unsigned int
399dnode_multilist_index_func(multilist_t *ml, void *obj)
400{
401 dnode_t *dn = obj;
402 return (dnode_hash(dn->dn_objset, dn->dn_object) %
403 multilist_get_num_sublists(ml));
404}
405
a1d477c2
MA
406/*
407 * Instantiates the objset_t in-memory structure corresponding to the
408 * objset_phys_t that's pointed to by the specified blkptr_t.
409 */
34dc7c2f
BB
410int
411dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
428870ff 412 objset_t **osp)
34dc7c2f 413{
428870ff 414 objset_t *os;
b128c09f 415 int i, err;
34dc7c2f
BB
416
417 ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
30af21b0 418 ASSERT(!BP_IS_REDACTED(bp));
34dc7c2f 419
0fdd6106
MA
420 /*
421 * We need the pool config lock to get properties.
422 */
423 ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));
424
a1d477c2
MA
425 /*
426 * The $ORIGIN dataset (if it exists) doesn't have an associated
427 * objset, so there's no reason to open it. The $ORIGIN dataset
428 * will not exist on pools older than SPA_VERSION_ORIGIN.
429 */
430 if (ds != NULL && spa_get_dsl(spa) != NULL &&
431 spa_get_dsl(spa)->dp_origin_snap != NULL) {
432 ASSERT3P(ds->ds_dir, !=,
433 spa_get_dsl(spa)->dp_origin_snap->ds_dir);
434 }
435
79c76d5b 436 os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
428870ff
BB
437 os->os_dsl_dataset = ds;
438 os->os_spa = spa;
439 os->os_rootbp = bp;
440 if (!BP_IS_HOLE(os->os_rootbp)) {
2a432414 441 arc_flags_t aflags = ARC_FLAG_WAIT;
5dbd68a3 442 zbookmark_phys_t zb;
9c5167d1 443 int size;
b5256303 444 enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
428870ff
BB
445 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
446 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
447
448 if (DMU_OS_IS_L2CACHEABLE(os))
2a432414 449 aflags |= ARC_FLAG_L2CACHE;
34dc7c2f 450
b5256303
TC
451 if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
452 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
453 ASSERT(BP_IS_AUTHENTICATED(bp));
454 zio_flags |= ZIO_FLAG_RAW;
455 }
456
428870ff 457 dprintf_bp(os->os_rootbp, "reading %s", "");
294f6806 458 err = arc_read(NULL, spa, os->os_rootbp,
428870ff 459 arc_getbuf_func, &os->os_phys_buf,
b5256303 460 ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
13fe0198 461 if (err != 0) {
428870ff 462 kmem_free(os, sizeof (objset_t));
b128c09f
BB
463 /* convert checksum errors into IO errors */
464 if (err == ECKSUM)
2e528b49 465 err = SET_ERROR(EIO);
34dc7c2f
BB
466 return (err);
467 }
9babb374 468
9c5167d1
NF
469 if (spa_version(spa) < SPA_VERSION_USERSPACE)
470 size = OBJSET_PHYS_SIZE_V1;
471 else if (!spa_feature_is_enabled(spa,
472 SPA_FEATURE_PROJECT_QUOTA))
473 size = OBJSET_PHYS_SIZE_V2;
474 else
475 size = sizeof (objset_phys_t);
476
9babb374 477 /* Increase the blocksize if we are permitted. */
9c5167d1 478 if (arc_buf_size(os->os_phys_buf) < size) {
2aa34383 479 arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
9c5167d1
NF
480 ARC_BUFC_METADATA, size);
481 bzero(buf->b_data, size);
428870ff
BB
482 bcopy(os->os_phys_buf->b_data, buf->b_data,
483 arc_buf_size(os->os_phys_buf));
d3c2ae1c 484 arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
428870ff 485 os->os_phys_buf = buf;
9babb374
BB
486 }
487
428870ff
BB
488 os->os_phys = os->os_phys_buf->b_data;
489 os->os_flags = os->os_phys->os_flags;
34dc7c2f 490 } else {
9babb374 491 int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
9c5167d1 492 sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1;
2aa34383
DK
493 os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
494 ARC_BUFC_METADATA, size);
428870ff
BB
495 os->os_phys = os->os_phys_buf->b_data;
496 bzero(os->os_phys, size);
34dc7c2f 497 }
2e5dc449
MA
498 /*
499 * These properties will be filled in by the logic in zfs_get_zplprop()
500 * when they are queried for the first time.
501 */
502 os->os_version = OBJSET_PROP_UNINITIALIZED;
503 os->os_normalization = OBJSET_PROP_UNINITIALIZED;
504 os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
505 os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
34dc7c2f
BB
506
507 /*
508 * Note: the changed_cb will be called once before the register
509 * func returns, thus changing the checksum/compression from the
b128c09f
BB
510 * default (fletcher2/off). Snapshots don't need to know about
511 * checksum/compression/copies.
34dc7c2f 512 */
9b67f605 513 if (ds != NULL) {
b5256303
TC
514 os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);
515
13fe0198
MA
516 err = dsl_prop_register(ds,
517 zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
428870ff 518 primary_cache_changed_cb, os);
13fe0198
MA
519 if (err == 0) {
520 err = dsl_prop_register(ds,
521 zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
428870ff 522 secondary_cache_changed_cb, os);
13fe0198 523 }
0c66c32d 524 if (!ds->ds_is_snapshot) {
13fe0198
MA
525 if (err == 0) {
526 err = dsl_prop_register(ds,
527 zfs_prop_to_name(ZFS_PROP_CHECKSUM),
428870ff 528 checksum_changed_cb, os);
13fe0198
MA
529 }
530 if (err == 0) {
531 err = dsl_prop_register(ds,
532 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
428870ff 533 compression_changed_cb, os);
13fe0198
MA
534 }
535 if (err == 0) {
536 err = dsl_prop_register(ds,
537 zfs_prop_to_name(ZFS_PROP_COPIES),
428870ff 538 copies_changed_cb, os);
13fe0198
MA
539 }
540 if (err == 0) {
541 err = dsl_prop_register(ds,
542 zfs_prop_to_name(ZFS_PROP_DEDUP),
428870ff 543 dedup_changed_cb, os);
13fe0198
MA
544 }
545 if (err == 0) {
546 err = dsl_prop_register(ds,
547 zfs_prop_to_name(ZFS_PROP_LOGBIAS),
428870ff 548 logbias_changed_cb, os);
13fe0198
MA
549 }
550 if (err == 0) {
551 err = dsl_prop_register(ds,
552 zfs_prop_to_name(ZFS_PROP_SYNC),
428870ff 553 sync_changed_cb, os);
13fe0198 554 }
faf0f58c
MA
555 if (err == 0) {
556 err = dsl_prop_register(ds,
557 zfs_prop_to_name(
558 ZFS_PROP_REDUNDANT_METADATA),
559 redundant_metadata_changed_cb, os);
560 }
f1512ee6
MA
561 if (err == 0) {
562 err = dsl_prop_register(ds,
563 zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
564 recordsize_changed_cb, os);
565 }
50c957f7
NB
566 if (err == 0) {
567 err = dsl_prop_register(ds,
568 zfs_prop_to_name(ZFS_PROP_DNODESIZE),
569 dnodesize_changed_cb, os);
570 }
cc99f275
DB
571 if (err == 0) {
572 err = dsl_prop_register(ds,
573 zfs_prop_to_name(
574 ZFS_PROP_SPECIAL_SMALL_BLOCKS),
575 smallblk_changed_cb, os);
576 }
b128c09f 577 }
13fe0198 578 if (err != 0) {
d3c2ae1c 579 arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
428870ff 580 kmem_free(os, sizeof (objset_t));
34dc7c2f
BB
581 return (err);
582 }
9b67f605 583 } else {
34dc7c2f 584 /* It's the meta-objset. */
428870ff 585 os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
99197f03 586 os->os_compress = ZIO_COMPRESS_ON;
10b3c7f5 587 os->os_complevel = ZIO_COMPLEVEL_DEFAULT;
b5256303 588 os->os_encrypted = B_FALSE;
428870ff
BB
589 os->os_copies = spa_max_replication(spa);
590 os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
faf0f58c
MA
591 os->os_dedup_verify = B_FALSE;
592 os->os_logbias = ZFS_LOGBIAS_LATENCY;
593 os->os_sync = ZFS_SYNC_STANDARD;
428870ff
BB
594 os->os_primary_cache = ZFS_CACHE_ALL;
595 os->os_secondary_cache = ZFS_CACHE_ALL;
50c957f7 596 os->os_dnodesize = DNODE_MIN_SIZE;
34dc7c2f
BB
597 }
598
0c66c32d 599 if (ds == NULL || !ds->ds_is_snapshot)
572e2857 600 os->os_zil_header = os->os_phys->os_zil_header;
428870ff 601 os->os_zil = zil_alloc(os, &os->os_zil_header);
34dc7c2f
BB
602
603 for (i = 0; i < TXG_SIZE; i++) {
64fc7762
MA
604 os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
605 offsetof(dnode_t, dn_dirty_link[i]),
606 dnode_multilist_index_func);
34dc7c2f 607 }
428870ff 608 list_create(&os->os_dnodes, sizeof (dnode_t),
34dc7c2f 609 offsetof(dnode_t, dn_link));
428870ff 610 list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
34dc7c2f
BB
611 offsetof(dmu_buf_impl_t, db_link));
612
0c66c32d
JG
613 list_link_init(&os->os_evicting_node);
614
428870ff 615 mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
64fc7762 616 mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
428870ff
BB
617 mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
618 mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
dbeb8796
MA
619 os->os_obj_next_percpu_len = boot_ncpus;
620 os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
621 sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
428870ff 622
0c66c32d
JG
623 dnode_special_open(os, &os->os_phys->os_meta_dnode,
624 DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
9c5167d1 625 if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) {
0c66c32d
JG
626 dnode_special_open(os, &os->os_phys->os_userused_dnode,
627 DMU_USERUSED_OBJECT, &os->os_userused_dnode);
628 dnode_special_open(os, &os->os_phys->os_groupused_dnode,
629 DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
9c5167d1
NF
630 if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf))
631 dnode_special_open(os,
632 &os->os_phys->os_projectused_dnode,
633 DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode);
9babb374 634 }
34dc7c2f 635
1de321e6
JX
636 mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL);
637
428870ff 638 *osp = os;
34dc7c2f
BB
639 return (0);
640}
641
428870ff
BB
642int
643dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
34dc7c2f 644{
428870ff 645 int err = 0;
34dc7c2f 646
47dfff3b 647 /*
0fdd6106
MA
648 * We need the pool_config lock to manipulate the dsl_dataset_t.
649 * Even if the dataset is long-held, we need the pool_config lock
650 * to open the objset, as it needs to get properties.
47dfff3b 651 */
0fdd6106 652 ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
47dfff3b 653
34dc7c2f 654 mutex_enter(&ds->ds_opening_lock);
9b67f605
MA
655 if (ds->ds_objset == NULL) {
656 objset_t *os;
cc9bb3e5 657 rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
34dc7c2f 658 err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
9b67f605 659 ds, dsl_dataset_get_blkptr(ds), &os);
cc9bb3e5 660 rrw_exit(&ds->ds_bp_rwlock, FTAG);
9b67f605
MA
661
662 if (err == 0) {
663 mutex_enter(&ds->ds_lock);
664 ASSERT(ds->ds_objset == NULL);
665 ds->ds_objset = os;
666 mutex_exit(&ds->ds_lock);
667 }
34dc7c2f 668 }
9b67f605 669 *osp = ds->ds_objset;
34dc7c2f 670 mutex_exit(&ds->ds_opening_lock);
428870ff 671 return (err);
34dc7c2f
BB
672}
673
13fe0198
MA
674/*
675 * Holds the pool while the objset is held. Therefore only one objset
676 * can be held at a time.
677 */
34dc7c2f 678int
b5256303
TC
679dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
680 objset_t **osp)
34dc7c2f 681{
13fe0198 682 dsl_pool_t *dp;
428870ff 683 dsl_dataset_t *ds;
34dc7c2f 684 int err;
b5256303 685 ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
34dc7c2f 686
13fe0198
MA
687 err = dsl_pool_hold(name, tag, &dp);
688 if (err != 0)
689 return (err);
b5256303 690 err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);
13fe0198
MA
691 if (err != 0) {
692 dsl_pool_rele(dp, tag);
428870ff 693 return (err);
13fe0198 694 }
428870ff
BB
695
696 err = dmu_objset_from_ds(ds, osp);
13fe0198 697 if (err != 0) {
428870ff 698 dsl_dataset_rele(ds, tag);
13fe0198
MA
699 dsl_pool_rele(dp, tag);
700 }
428870ff 701
34dc7c2f
BB
702 return (err);
703}
704
b5256303
TC
705int
706dmu_objset_hold(const char *name, void *tag, objset_t **osp)
707{
708 return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));
709}
710
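For illustration only, a minimal sketch (not part of this file) of the hold/rele pairing described in the comment above dmu_objset_hold_flags(); the function name and dataset name are hypothetical, and error handling is reduced to the essentials:

static int
query_objset_type(const char *name, dmu_objset_type_t *typep)
{
	objset_t *os;
	int err;

	/* Takes the dsl_pool hold and the dataset hold in one call. */
	err = dmu_objset_hold(name, FTAG, &os);
	if (err != 0)
		return (err);

	*typep = dmu_objset_type(os);

	/* Release with the same tag; this also drops the pool hold. */
	dmu_objset_rele(os, FTAG);
	return (0);
}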
9c43027b
AJ
711static int
712dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
b5256303 713 boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
9c43027b
AJ
714{
715 int err;
716
717 err = dmu_objset_from_ds(ds, osp);
718 if (err != 0) {
b5256303 719 return (err);
9c43027b 720 } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
9c43027b
AJ
721 return (SET_ERROR(EINVAL));
722 } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
9c43027b 723 return (SET_ERROR(EROFS));
ae76f45c
TC
724 } else if (!readonly && decrypt &&
725 dsl_dir_incompatible_encryption_version(ds->ds_dir)) {
726 return (SET_ERROR(EROFS));
9c43027b 727 }
b5256303
TC
728
729 /* if we are decrypting, we can now check MACs in os->os_phys_buf */
730 if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {
a2c2ed1b
TC
731 zbookmark_phys_t zb;
732
733 SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
734 ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
b5256303 735 err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,
a2c2ed1b 736 &zb, B_FALSE);
b5256303
TC
737 if (err != 0)
738 return (err);
739
740 ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));
741 }
742
743 return (0);
9c43027b
AJ
744}
745
13fe0198
MA
746/*
747 * dsl_pool must not be held when this is called.
748 * Upon successful return, there will be a longhold on the dataset,
749 * and the dsl_pool will not be held.
750 */
34dc7c2f 751int
428870ff 752dmu_objset_own(const char *name, dmu_objset_type_t type,
b5256303 753 boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
34dc7c2f 754{
13fe0198 755 dsl_pool_t *dp;
34dc7c2f
BB
756 dsl_dataset_t *ds;
757 int err;
b5256303 758 ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
34dc7c2f 759
13fe0198
MA
760 err = dsl_pool_hold(name, FTAG, &dp);
761 if (err != 0)
762 return (err);
b5256303 763 err = dsl_dataset_own(dp, name, flags, tag, &ds);
13fe0198
MA
764 if (err != 0) {
765 dsl_pool_rele(dp, FTAG);
34dc7c2f 766 return (err);
13fe0198 767 }
b5256303
TC
768 err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
769 if (err != 0) {
770 dsl_dataset_disown(ds, flags, tag);
771 dsl_pool_rele(dp, FTAG);
772 return (err);
773 }
774
163a8c28
TC
775 /*
776 * User accounting requires the dataset to be decrypted and rw.
777 * We also don't begin user accounting during claiming to help
778 * speed up pool import times and to keep this txg reserved
779 * completely for recovery work.
780 */
4072f465 781 if (!readonly && !dp->dp_spa->spa_claiming &&
782 (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) {
783 if (dmu_objset_userobjspace_upgradable(*osp) ||
784 dmu_objset_projectquota_upgradable(*osp)) {
785 dmu_objset_id_quota_upgrade(*osp);
786 } else if (dmu_objset_userused_enabled(*osp)) {
787 dmu_objset_userspace_upgrade(*osp);
788 }
789 }
1de321e6 790
c0daec32 791 dsl_pool_rele(dp, FTAG);
b5256303 792 return (0);
34dc7c2f
BB
793}
794
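A hedged usage sketch of the own/disown contract documented above ("tank/fs" is a placeholder dataset name; a real caller would keep the objset owned for the duration of its long-running work):

	objset_t *os;
	int err;

	/* On success the dataset is long-held and the dsl_pool is NOT held. */
	err = dmu_objset_own("tank/fs", DMU_OST_ZFS, B_FALSE, B_FALSE, FTAG, &os);
	if (err == 0) {
		/* ... use os for as long as needed ... */
		dmu_objset_disown(os, B_FALSE, FTAG);
	}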
9c43027b
AJ
795int
796dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
b5256303 797 boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
9c43027b
AJ
798{
799 dsl_dataset_t *ds;
800 int err;
b5256303 801 ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
9c43027b 802
b5256303 803 err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
9c43027b
AJ
804 if (err != 0)
805 return (err);
806
b5256303
TC
807 err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
808 if (err != 0) {
809 dsl_dataset_disown(ds, flags, tag);
810 return (err);
811 }
812
813 return (0);
9c43027b
AJ
814}
815
34dc7c2f 816void
b5256303 817dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
34dc7c2f 818{
b5256303
TC
819 ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
820
13fe0198 821 dsl_pool_t *dp = dmu_objset_pool(os);
b5256303 822 dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
13fe0198 823 dsl_pool_rele(dp, tag);
428870ff 824}
b128c09f 825
b5256303
TC
826void
827dmu_objset_rele(objset_t *os, void *tag)
828{
829 dmu_objset_rele_flags(os, B_FALSE, tag);
830}
831
831baf06
KW
832/*
833 * When we are called, os MUST refer to an objset associated with a dataset
834 * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
835 * == tag. We will then release and reacquire ownership of the dataset while
836 * holding the pool config_rwlock so that no intervening namespace or
837 * ownership changes can occur.
838 *
839 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
840 * release the hold on its dataset and acquire a new one on the dataset of the
841 * same name so that it can be partially torn down and reconstructed.
842 */
843void
5e00213e
AG
844dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
845 boolean_t decrypt, void *tag)
831baf06
KW
846{
847 dsl_pool_t *dp;
eca7b760 848 char name[ZFS_MAX_DATASET_NAME_LEN];
831baf06 849
831baf06
KW
850 VERIFY3P(ds, !=, NULL);
851 VERIFY3P(ds->ds_owner, ==, tag);
852 VERIFY(dsl_dataset_long_held(ds));
853
854 dsl_dataset_name(ds, name);
5e00213e 855 dp = ds->ds_dir->dd_pool;
831baf06 856 dsl_pool_config_enter(dp, FTAG);
5e00213e 857 dsl_dataset_disown(ds, decrypt, tag);
b5256303 858 VERIFY0(dsl_dataset_own(dp, name,
5e00213e 859 (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds));
831baf06
KW
860 dsl_pool_config_exit(dp, FTAG);
861}
862
428870ff 863void
b5256303 864dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
428870ff 865{
1de321e6
JX
866 /*
867 * Stop upgrading thread
868 */
869 dmu_objset_upgrade_stop(os);
b5256303
TC
870 dsl_dataset_disown(os->os_dsl_dataset,
871 (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag);
34dc7c2f
BB
872}
873
13fe0198 874void
34dc7c2f
BB
875dmu_objset_evict_dbufs(objset_t *os)
876{
0c66c32d 877 dnode_t *dn_marker;
34dc7c2f
BB
878 dnode_t *dn;
879
0c66c32d 880 dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
34dc7c2f 881
0c66c32d
JG
882 mutex_enter(&os->os_lock);
883 dn = list_head(&os->os_dnodes);
884 while (dn != NULL) {
885 /*
886 * Skip dnodes without holds. We have to do this dance
887 * because dnode_add_ref() only works if there is already a
888 * hold. If the dnode has no holds, then it has no dbufs.
889 */
890 if (dnode_add_ref(dn, FTAG)) {
891 list_insert_after(&os->os_dnodes, dn, dn_marker);
892 mutex_exit(&os->os_lock);
34dc7c2f 893
0c66c32d
JG
894 dnode_evict_dbufs(dn);
895 dnode_rele(dn, FTAG);
34dc7c2f 896
0c66c32d
JG
897 mutex_enter(&os->os_lock);
898 dn = list_next(&os->os_dnodes, dn_marker);
899 list_remove(&os->os_dnodes, dn_marker);
900 } else {
901 dn = list_next(&os->os_dnodes, dn);
902 }
903 }
904 mutex_exit(&os->os_lock);
34dc7c2f 905
0c66c32d 906 kmem_free(dn_marker, sizeof (dnode_t));
34dc7c2f 907
0c66c32d 908 if (DMU_USERUSED_DNODE(os) != NULL) {
9c5167d1
NF
909 if (DMU_PROJECTUSED_DNODE(os) != NULL)
910 dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));
0c66c32d
JG
911 dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
912 dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
34dc7c2f 913 }
0c66c32d 914 dnode_evict_dbufs(DMU_META_DNODE(os));
34dc7c2f
BB
915}
916
0c66c32d
JG
917/*
918 * Objset eviction processing is split into two pieces.
919 * The first marks the objset as evicting, evicts any dbufs that
920 * have a refcount of zero, and then queues up the objset for the
921 * second phase of eviction. Once os->os_dnodes has been cleared by
922 * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
923 * The second phase closes the special dnodes, dequeues the objset from
924 * the list of those undergoing eviction, and finally frees the objset.
925 *
926 * NOTE: Due to asynchronous eviction processing (invocation of
927 * dnode_buf_pageout()), it is possible for the meta dnode for the
928 * objset to have no holds even though os->os_dnodes is not empty.
929 */
34dc7c2f 930void
428870ff 931dmu_objset_evict(objset_t *os)
34dc7c2f 932{
6f1ffb06
MA
933 dsl_dataset_t *ds = os->os_dsl_dataset;
934
1c27024e 935 for (int t = 0; t < TXG_SIZE; t++)
428870ff 936 ASSERT(!dmu_objset_is_dirty(os, t));
34dc7c2f 937
0eb21616
JG
938 if (ds)
939 dsl_prop_unregister_all(ds, os);
34dc7c2f 940
428870ff
BB
941 if (os->os_sa)
942 sa_tear_down(os);
943
13fe0198 944 dmu_objset_evict_dbufs(os);
34dc7c2f 945
0c66c32d
JG
946 mutex_enter(&os->os_lock);
947 spa_evicting_os_register(os->os_spa, os);
948 if (list_is_empty(&os->os_dnodes)) {
949 mutex_exit(&os->os_lock);
950 dmu_objset_evict_done(os);
951 } else {
952 mutex_exit(&os->os_lock);
953 }
b5256303
TC
954
955
0c66c32d
JG
956}
957
958void
959dmu_objset_evict_done(objset_t *os)
960{
961 ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
962
572e2857
BB
963 dnode_special_close(&os->os_meta_dnode);
964 if (DMU_USERUSED_DNODE(os)) {
9c5167d1
NF
965 if (DMU_PROJECTUSED_DNODE(os))
966 dnode_special_close(&os->os_projectused_dnode);
572e2857
BB
967 dnode_special_close(&os->os_userused_dnode);
968 dnode_special_close(&os->os_groupused_dnode);
9babb374 969 }
428870ff
BB
970 zil_free(os->os_zil);
971
d3c2ae1c 972 arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
572e2857
BB
973
974 /*
975 * This is a barrier to prevent the objset from going away in
976 * dnode_move() until we can safely ensure that the objset is still in
977 * use. We consider the objset valid before the barrier and invalid
978 * after the barrier.
979 */
980 rw_enter(&os_lock, RW_READER);
981 rw_exit(&os_lock);
982
dbeb8796
MA
983 kmem_free(os->os_obj_next_percpu,
984 os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
985
428870ff 986 mutex_destroy(&os->os_lock);
64fc7762 987 mutex_destroy(&os->os_userused_lock);
428870ff
BB
988 mutex_destroy(&os->os_obj_lock);
989 mutex_destroy(&os->os_user_ptr_lock);
c17486b2 990 mutex_destroy(&os->os_upgrade_lock);
64fc7762
MA
991 for (int i = 0; i < TXG_SIZE; i++) {
992 multilist_destroy(os->os_dirty_dnodes[i]);
993 }
0c66c32d 994 spa_evicting_os_deregister(os->os_spa, os);
428870ff
BB
995 kmem_free(os, sizeof (objset_t));
996}
9babb374 997
6413c95f 998inode_timespec_t
428870ff
BB
999dmu_objset_snap_cmtime(objset_t *os)
1000{
1001 return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
34dc7c2f
BB
1002}
1003
428870ff 1004objset_t *
b5256303
TC
1005dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
1006 dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
34dc7c2f 1007{
428870ff 1008 objset_t *os;
34dc7c2f
BB
1009 dnode_t *mdn;
1010
1011 ASSERT(dmu_tx_is_syncing(tx));
13fe0198 1012
b5256303
TC
1013 if (blksz == 0)
1014 blksz = DNODE_BLOCK_SIZE;
4807c0ba 1015 if (ibs == 0)
b5256303
TC
1016 ibs = DN_MAX_INDBLKSHIFT;
1017
572e2857 1018 if (ds != NULL)
13fe0198 1019 VERIFY0(dmu_objset_from_ds(ds, &os));
572e2857 1020 else
13fe0198 1021 VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
572e2857
BB
1022
1023 mdn = DMU_META_DNODE(os);
34dc7c2f 1024
b5256303
TC
1025 dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
1026 DNODE_MIN_SLOTS, tx);
34dc7c2f
BB
1027
1028 /*
1029 * We don't want to have to increase the meta-dnode's nlevels
e1cfd73f 1030 * later, because then we could do it in quiescing context while
34dc7c2f
BB
1031 * we are also accessing it in open context.
1032 *
1033 * This precaution is not necessary for the MOS (ds == NULL),
1034 * because the MOS is only updated in syncing context.
1035 * This is most fortunate: the MOS is the only objset that
1036 * needs to be synced multiple times as spa_sync() iterates
1037 * to convergence, so minimizing its dn_nlevels matters.
1038 */
1039 if (ds != NULL) {
b5256303
TC
1040 if (levels == 0) {
1041 levels = 1;
1042
1043 /*
1044 * Determine the number of levels necessary for the
1045 * meta-dnode to contain DN_MAX_OBJECT dnodes. Note
1046 * that in order to ensure that we do not overflow
1047 * 64 bits, there has to be a nlevels that gives us a
1048 * number of blocks > DN_MAX_OBJECT but < 2^64.
1049 * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
1050 * (10) must be less than (64 - log2(DN_MAX_OBJECT))
1051 * (16).
1052 */
1053 while ((uint64_t)mdn->dn_nblkptr <<
1054 (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
1055 (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
1056 DN_MAX_OBJECT)
1057 levels++;
1058 }
34dc7c2f
BB
1059
1060 mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
1061 mdn->dn_nlevels = levels;
1062 }
1063
1064 ASSERT(type != DMU_OST_NONE);
1065 ASSERT(type != DMU_OST_ANY);
1066 ASSERT(type < DMU_OST_NUMTYPES);
428870ff 1067 os->os_phys->os_type = type;
b5256303
TC
1068
1069 /*
1070 * Enable user accounting if it is enabled and this is not an
1071 * encrypted receive.
1072 */
1073 if (dmu_objset_userused_enabled(os) &&
1074 (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
428870ff 1075 os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
1de321e6 1076 if (dmu_objset_userobjused_enabled(os)) {
d52d80b7
PD
1077 ds->ds_feature_activation[
1078 SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
1de321e6
JX
1079 os->os_phys->os_flags |=
1080 OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
1081 }
9c5167d1 1082 if (dmu_objset_projectquota_enabled(os)) {
d52d80b7
PD
1083 ds->ds_feature_activation[
1084 SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
9c5167d1
NF
1085 os->os_phys->os_flags |=
1086 OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
1087 }
428870ff 1088 os->os_flags = os->os_phys->os_flags;
9babb374 1089 }
34dc7c2f
BB
1090
1091 dsl_dataset_dirty(ds, tx);
1092
428870ff 1093 return (os);
34dc7c2f
BB
1094}
1095
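A rough worked example of the nlevels calculation above, assuming the defaults established earlier in this function (16K meta-dnode data blocks holding 32 512-byte dnodes, dn_nblkptr of 3, and 128K indirect blocks holding 1024 128-byte block pointers): one level addresses 3 * 32 = 96 dnodes and every additional level multiplies that by 1024, so 5 levels cover only about 2^46.6 objects while 6 levels cover about 2^56.6, the first value past DN_MAX_OBJECT (2^48); under these assumptions the meta-dnode therefore ends up with 6 levels.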
b5256303
TC
1096/* called from dsl for meta-objset */
1097objset_t *
1098dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
1099 dmu_objset_type_t type, dmu_tx_t *tx)
1100{
1101 return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx));
1102}
1103
13fe0198
MA
1104typedef struct dmu_objset_create_arg {
1105 const char *doca_name;
1106 cred_t *doca_cred;
e59a377a 1107 proc_t *doca_proc;
13fe0198
MA
1108 void (*doca_userfunc)(objset_t *os, void *arg,
1109 cred_t *cr, dmu_tx_t *tx);
1110 void *doca_userarg;
1111 dmu_objset_type_t doca_type;
1112 uint64_t doca_flags;
b5256303 1113 dsl_crypto_params_t *doca_dcp;
13fe0198 1114} dmu_objset_create_arg_t;
34dc7c2f
BB
1115
1116/*ARGSUSED*/
1117static int
13fe0198 1118dmu_objset_create_check(void *arg, dmu_tx_t *tx)
34dc7c2f 1119{
13fe0198
MA
1120 dmu_objset_create_arg_t *doca = arg;
1121 dsl_pool_t *dp = dmu_tx_pool(tx);
1122 dsl_dir_t *pdd;
d8d418ff 1123 dsl_dataset_t *parentds;
1124 objset_t *parentos;
13fe0198
MA
1125 const char *tail;
1126 int error;
34dc7c2f 1127
13fe0198 1128 if (strchr(doca->doca_name, '@') != NULL)
2e528b49 1129 return (SET_ERROR(EINVAL));
34dc7c2f 1130
eca7b760
IK
1131 if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
1132 return (SET_ERROR(ENAMETOOLONG));
1133
a7ed98d8
SD
1134 if (dataset_nestcheck(doca->doca_name) != 0)
1135 return (SET_ERROR(ENAMETOOLONG));
1136
13fe0198
MA
1137 error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
1138 if (error != 0)
1139 return (error);
1140 if (tail == NULL) {
1141 dsl_dir_rele(pdd, FTAG);
2e528b49 1142 return (SET_ERROR(EEXIST));
34dc7c2f 1143 }
b5256303 1144
1fff937a 1145 error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL);
b5256303
TC
1146 if (error != 0) {
1147 dsl_dir_rele(pdd, FTAG);
1148 return (error);
1149 }
1150
788eb90c 1151 error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
e59a377a 1152 doca->doca_cred, doca->doca_proc);
d8d418ff 1153 if (error != 0) {
1154 dsl_dir_rele(pdd, FTAG);
1155 return (error);
1156 }
b5256303 1157
d8d418ff 1158 /* can't create below anything but filesystems (eg. no ZVOLs) */
1159 error = dsl_dataset_hold_obj(pdd->dd_pool,
1160 dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
1161 if (error != 0) {
1162 dsl_dir_rele(pdd, FTAG);
1163 return (error);
1164 }
1165 error = dmu_objset_from_ds(parentds, &parentos);
1166 if (error != 0) {
1167 dsl_dataset_rele(parentds, FTAG);
1168 dsl_dir_rele(pdd, FTAG);
1169 return (error);
1170 }
1171 if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
1172 dsl_dataset_rele(parentds, FTAG);
1173 dsl_dir_rele(pdd, FTAG);
1174 return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
1175 }
1176 dsl_dataset_rele(parentds, FTAG);
13fe0198 1177 dsl_dir_rele(pdd, FTAG);
34dc7c2f 1178
788eb90c 1179 return (error);
34dc7c2f
BB
1180}
1181
1182static void
13fe0198 1183dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
34dc7c2f 1184{
13fe0198
MA
1185 dmu_objset_create_arg_t *doca = arg;
1186 dsl_pool_t *dp = dmu_tx_pool(tx);
52ce99dd 1187 spa_t *spa = dp->dp_spa;
13fe0198
MA
1188 dsl_dir_t *pdd;
1189 const char *tail;
6f1ffb06 1190 dsl_dataset_t *ds;
13fe0198 1191 uint64_t obj;
6f1ffb06 1192 blkptr_t *bp;
13fe0198 1193 objset_t *os;
b5256303 1194 zio_t *rzio;
34dc7c2f 1195
13fe0198 1196 VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
34dc7c2f 1197
13fe0198 1198 obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
b5256303 1199 doca->doca_cred, doca->doca_dcp, tx);
34dc7c2f 1200
b5256303
TC
1201 VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
1202 DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
cc9bb3e5 1203 rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
6f1ffb06 1204 bp = dsl_dataset_get_blkptr(ds);
52ce99dd 1205 os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx);
cc9bb3e5 1206 rrw_exit(&ds->ds_bp_rwlock, FTAG);
34dc7c2f 1207
13fe0198
MA
1208 if (doca->doca_userfunc != NULL) {
1209 doca->doca_userfunc(os, doca->doca_userarg,
1210 doca->doca_cred, tx);
34dc7c2f
BB
1211 }
1212
b5256303 1213 /*
4807c0ba 1214 * The doca_userfunc() may write out some data that needs to be
b5256303
TC
1215 * encrypted if the dataset is encrypted (specifically the root
1216 * directory). This data must be written out before the encryption
1217 * key mapping is removed by dsl_dataset_rele_flags(). Force the
1218 * I/O to occur immediately by invoking the relevant sections of
1219 * dsl_pool_sync().
1220 */
1221 if (os->os_encrypted) {
1222 dsl_dataset_t *tmpds = NULL;
1223 boolean_t need_sync_done = B_FALSE;
1224
4807c0ba
TC
1225 mutex_enter(&ds->ds_lock);
1226 ds->ds_owner = FTAG;
1227 mutex_exit(&ds->ds_lock);
1228
52ce99dd 1229 rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
4807c0ba
TC
1230 tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
1231 tx->tx_txg);
b5256303 1232 if (tmpds != NULL) {
b5256303
TC
1233 dsl_dataset_sync(ds, rzio, tx);
1234 need_sync_done = B_TRUE;
1235 }
1236 VERIFY0(zio_wait(rzio));
1237
ba67d821 1238 dmu_objset_sync_done(os, tx);
b5256303 1239 taskq_wait(dp->dp_sync_taskq);
52ce99dd
TC
1240 if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
1241 ASSERT3P(ds->ds_key_mapping, !=, NULL);
1242 key_mapping_rele(spa, ds->ds_key_mapping, ds);
1243 }
b5256303 1244
52ce99dd 1245 rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
4807c0ba
TC
1246 tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
1247 tx->tx_txg);
b5256303 1248 if (tmpds != NULL) {
b5256303
TC
1249 dmu_buf_rele(ds->ds_dbuf, ds);
1250 dsl_dataset_sync(ds, rzio, tx);
1251 }
1252 VERIFY0(zio_wait(rzio));
1253
52ce99dd
TC
1254 if (need_sync_done) {
1255 ASSERT3P(ds->ds_key_mapping, !=, NULL);
1256 key_mapping_rele(spa, ds->ds_key_mapping, ds);
b5256303 1257 dsl_dataset_sync_done(ds, tx);
52ce99dd 1258 }
4807c0ba
TC
1259
1260 mutex_enter(&ds->ds_lock);
1261 ds->ds_owner = NULL;
1262 mutex_exit(&ds->ds_lock);
b5256303
TC
1263 }
1264
74756182 1265 spa_history_log_internal_ds(ds, "create", tx, " ");
a0bd735a 1266
b5256303 1267 dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
13fe0198 1268 dsl_dir_rele(pdd, FTAG);
34dc7c2f
BB
1269}
1270
1271int
428870ff 1272dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
b5256303 1273 dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
34dc7c2f 1274{
13fe0198 1275 dmu_objset_create_arg_t doca;
b5256303 1276 dsl_crypto_params_t tmp_dcp = { 0 };
34dc7c2f 1277
13fe0198
MA
1278 doca.doca_name = name;
1279 doca.doca_cred = CRED();
e59a377a 1280 doca.doca_proc = curproc;
13fe0198
MA
1281 doca.doca_flags = flags;
1282 doca.doca_userfunc = func;
1283 doca.doca_userarg = arg;
1284 doca.doca_type = type;
34dc7c2f 1285
b5256303
TC
1286 /*
1287 * Some callers (mostly for testing) do not provide a dcp on their
1288 * own but various code inside the sync task will require it to be
1289 * allocated. Rather than adding NULL checks throughout this code
1290 * or adding dummy dcp's to all of the callers we simply create a
1291 * dummy one here and use that. This zero dcp will have the same
85ce3f4f 1292 * effect as asking for inheritance of all encryption params.
b5256303
TC
1293 */
1294 doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;
1295
ec213971 1296 int rv = dsl_sync_task(name,
3d45fdd6 1297 dmu_objset_create_check, dmu_objset_create_sync, &doca,
ec213971
MA
1298 6, ZFS_SPACE_CHECK_NORMAL);
1299
1300 if (rv == 0)
1301 zvol_create_minor(name);
1302 return (rv);
34dc7c2f
BB
1303}
1304
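For illustration, a minimal call into dmu_objset_create() (a sketch only; "tank/newfs" is a placeholder, and real consumers such as the ZPL pass a creation callback rather than NULL so the new objset gets its initial objects):

	int err = dmu_objset_create("tank/newfs", DMU_OST_ZFS, 0,
	    NULL /* dcp: inherit encryption params */,
	    NULL /* func */, NULL /* arg */);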
13fe0198
MA
1305typedef struct dmu_objset_clone_arg {
1306 const char *doca_clone;
1307 const char *doca_origin;
1308 cred_t *doca_cred;
e59a377a 1309 proc_t *doca_proc;
13fe0198
MA
1310} dmu_objset_clone_arg_t;
1311
1312/*ARGSUSED*/
1313static int
1314dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
34dc7c2f 1315{
13fe0198 1316 dmu_objset_clone_arg_t *doca = arg;
428870ff
BB
1317 dsl_dir_t *pdd;
1318 const char *tail;
13fe0198
MA
1319 int error;
1320 dsl_dataset_t *origin;
1321 dsl_pool_t *dp = dmu_tx_pool(tx);
34dc7c2f 1322
13fe0198 1323 if (strchr(doca->doca_clone, '@') != NULL)
2e528b49 1324 return (SET_ERROR(EINVAL));
13fe0198 1325
eca7b760
IK
1326 if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
1327 return (SET_ERROR(ENAMETOOLONG));
1328
13fe0198
MA
1329 error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
1330 if (error != 0)
1331 return (error);
428870ff 1332 if (tail == NULL) {
13fe0198 1333 dsl_dir_rele(pdd, FTAG);
2e528b49 1334 return (SET_ERROR(EEXIST));
34dc7c2f 1335 }
1cddb8c9 1336
788eb90c 1337 error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
e59a377a 1338 doca->doca_cred, doca->doca_proc);
788eb90c
JJ
1339 if (error != 0) {
1340 dsl_dir_rele(pdd, FTAG);
1341 return (SET_ERROR(EDQUOT));
1342 }
428870ff 1343
13fe0198 1344 error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
b5256303
TC
1345 if (error != 0) {
1346 dsl_dir_rele(pdd, FTAG);
572e2857 1347 return (error);
b5256303 1348 }
572e2857 1349
13fe0198 1350 /* You can only clone snapshots, not the head datasets. */
0c66c32d 1351 if (!origin->ds_is_snapshot) {
13fe0198 1352 dsl_dataset_rele(origin, FTAG);
b5256303 1353 dsl_dir_rele(pdd, FTAG);
2e528b49 1354 return (SET_ERROR(EINVAL));
428870ff 1355 }
b5256303 1356
13fe0198 1357 dsl_dataset_rele(origin, FTAG);
b5256303 1358 dsl_dir_rele(pdd, FTAG);
572e2857 1359
13fe0198 1360 return (0);
9babb374 1361}
34dc7c2f 1362
13fe0198
MA
1363static void
1364dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
34dc7c2f 1365{
13fe0198
MA
1366 dmu_objset_clone_arg_t *doca = arg;
1367 dsl_pool_t *dp = dmu_tx_pool(tx);
1368 dsl_dir_t *pdd;
1369 const char *tail;
1370 dsl_dataset_t *origin, *ds;
1371 uint64_t obj;
eca7b760 1372 char namebuf[ZFS_MAX_DATASET_NAME_LEN];
6f1ffb06 1373
13fe0198
MA
1374 VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
1375 VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
6f1ffb06 1376
13fe0198 1377 obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
b5256303 1378 doca->doca_cred, NULL, tx);
34dc7c2f 1379
13fe0198
MA
1380 VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
1381 dsl_dataset_name(origin, namebuf);
1382 spa_history_log_internal_ds(ds, "clone", tx,
74756182 1383 "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object);
13fe0198
MA
1384 dsl_dataset_rele(ds, FTAG);
1385 dsl_dataset_rele(origin, FTAG);
1386 dsl_dir_rele(pdd, FTAG);
34dc7c2f
BB
1387}
1388
1389int
13fe0198 1390dmu_objset_clone(const char *clone, const char *origin)
34dc7c2f 1391{
13fe0198 1392 dmu_objset_clone_arg_t doca;
34dc7c2f 1393
13fe0198
MA
1394 doca.doca_clone = clone;
1395 doca.doca_origin = origin;
1396 doca.doca_cred = CRED();
e59a377a 1397 doca.doca_proc = curproc;
572e2857 1398
ec213971 1399 int rv = dsl_sync_task(clone,
3d45fdd6 1400 dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
ec213971
MA
1401 6, ZFS_SPACE_CHECK_NORMAL);
1402
1403 if (rv == 0)
1404 zvol_create_minor(clone);
1405
1406 return (rv);
6f1ffb06
MA
1407}
1408
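A corresponding one-line sketch for dmu_objset_clone() (the names are placeholders; as the check function above enforces, the origin must be a snapshot):

	int err = dmu_objset_clone("tank/fs/clone", "tank/fs@snap");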
1409int
1410dmu_objset_snapshot_one(const char *fsname, const char *snapname)
1411{
1412 int err;
1413 char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
1414 nvlist_t *snaps = fnvlist_alloc();
1415
1416 fnvlist_add_boolean(snaps, longsnap);
e4f5fa12 1417 kmem_strfree(longsnap);
13fe0198
MA
1418 err = dsl_dataset_snapshot(snaps, NULL, NULL);
1419 fnvlist_free(snaps);
6f1ffb06
MA
1420 return (err);
1421}
1422
1de321e6
JX
1423static void
1424dmu_objset_upgrade_task_cb(void *data)
1425{
1426 objset_t *os = data;
1427
1428 mutex_enter(&os->os_upgrade_lock);
1429 os->os_upgrade_status = EINTR;
1430 if (!os->os_upgrade_exit) {
39372fa2
AF
1431 int status;
1432
1de321e6
JX
1433 mutex_exit(&os->os_upgrade_lock);
1434
39372fa2
AF
1435 status = os->os_upgrade_cb(os);
1436
1de321e6 1437 mutex_enter(&os->os_upgrade_lock);
39372fa2
AF
1438
1439 os->os_upgrade_status = status;
1de321e6
JX
1440 }
1441 os->os_upgrade_exit = B_TRUE;
1442 os->os_upgrade_id = 0;
1443 mutex_exit(&os->os_upgrade_lock);
c0daec32 1444 dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
1de321e6
JX
1445}
1446
1447static void
1448dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
1449{
1450 if (os->os_upgrade_id != 0)
1451 return;
1452
c0daec32
AB
1453 ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
1454 dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);
1455
1de321e6
JX
1456 mutex_enter(&os->os_upgrade_lock);
1457 if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
1458 os->os_upgrade_exit = B_FALSE;
1459 os->os_upgrade_cb = cb;
1460 os->os_upgrade_id = taskq_dispatch(
1461 os->os_spa->spa_upgrade_taskq,
1462 dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
c0daec32
AB
1463 if (os->os_upgrade_id == TASKQID_INVALID) {
1464 dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
1de321e6 1465 os->os_upgrade_status = ENOMEM;
c0daec32 1466 }
39372fa2
AF
1467 } else {
1468 dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
1de321e6
JX
1469 }
1470 mutex_exit(&os->os_upgrade_lock);
1471}
1472
1473static void
1474dmu_objset_upgrade_stop(objset_t *os)
1475{
1476 mutex_enter(&os->os_upgrade_lock);
1477 os->os_upgrade_exit = B_TRUE;
1478 if (os->os_upgrade_id != 0) {
1479 taskqid_t id = os->os_upgrade_id;
1480
1481 os->os_upgrade_id = 0;
1482 mutex_exit(&os->os_upgrade_lock);
1483
c0daec32
AB
1484 if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
1485 dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
1486 }
4807c0ba 1487 txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
1de321e6
JX
1488 } else {
1489 mutex_exit(&os->os_upgrade_lock);
1490 }
1491}
1492
34dc7c2f 1493static void
64fc7762 1494dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
34dc7c2f
BB
1495{
1496 dnode_t *dn;
1497
64fc7762 1498 while ((dn = multilist_sublist_head(list)) != NULL) {
34dc7c2f
BB
1499 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1500 ASSERT(dn->dn_dbuf->db_data_pending);
1501 /*
9babb374 1502 * Initialize dn_zio outside dnode_sync() because the
93e28d66 1503 * meta-dnode needs to set it outside dnode_sync().
34dc7c2f
BB
1504 */
1505 dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
1506 ASSERT(dn->dn_zio);
1507
1508 ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
64fc7762 1509 multilist_sublist_remove(list, dn);
9babb374 1510
edc1e713 1511 /*
ba67d821
MA
1512 * See the comment above dnode_rele_task() for an explanation
1513 * of why this dnode hold is always needed (even when not
1514 * doing user accounting).
edc1e713 1515 */
64fc7762 1516 multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
ba67d821
MA
1517 (void) dnode_add_ref(dn, newlist);
1518 multilist_insert(newlist, dn);
9babb374 1519
34dc7c2f
BB
1520 dnode_sync(dn, tx);
1521 }
1522}
1523
1524/* ARGSUSED */
1525static void
428870ff 1526dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
34dc7c2f 1527{
b128c09f 1528 blkptr_t *bp = zio->io_bp;
428870ff 1529 objset_t *os = arg;
34dc7c2f 1530 dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
b5256303 1531 uint64_t fill = 0;
34dc7c2f 1532
9b67f605 1533 ASSERT(!BP_IS_EMBEDDED(bp));
13fe0198
MA
1534 ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
1535 ASSERT0(BP_GET_LEVEL(bp));
34dc7c2f
BB
1536
1537 /*
9babb374
BB
1538 * Update rootbp fill count: it should be the number of objects
1539 * allocated in the object set (not counting the "special"
1540 * objects that are stored in the objset_phys_t -- the meta
9c5167d1 1541 * dnode and user/group/project accounting objects).
34dc7c2f 1542 */
1c27024e 1543 for (int i = 0; i < dnp->dn_nblkptr; i++)
b5256303
TC
1544 fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
1545
1546 BP_SET_FILL(bp, fill);
1547
cc9bb3e5
GM
1548 if (os->os_dsl_dataset != NULL)
1549 rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
1550 *os->os_rootbp = *bp;
1551 if (os->os_dsl_dataset != NULL)
1552 rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
428870ff
BB
1553}
1554
1555/* ARGSUSED */
1556static void
1557dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
1558{
1559 blkptr_t *bp = zio->io_bp;
1560 blkptr_t *bp_orig = &zio->io_bp_orig;
1561 objset_t *os = arg;
34dc7c2f 1562
b128c09f 1563 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
428870ff 1564 ASSERT(BP_EQUAL(bp, bp_orig));
b128c09f 1565 } else {
428870ff
BB
1566 dsl_dataset_t *ds = os->os_dsl_dataset;
1567 dmu_tx_t *tx = os->os_synctx;
1568
1569 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
1570 dsl_dataset_block_born(ds, bp, tx);
34dc7c2f 1571 }
cc9bb3e5 1572 kmem_free(bp, sizeof (*bp));
34dc7c2f
BB
1573}
1574
64fc7762
MA
1575typedef struct sync_dnodes_arg {
1576 multilist_t *sda_list;
1577 int sda_sublist_idx;
1578 multilist_t *sda_newlist;
1579 dmu_tx_t *sda_tx;
1580} sync_dnodes_arg_t;
1581
1582static void
1583sync_dnodes_task(void *arg)
1584{
1585 sync_dnodes_arg_t *sda = arg;
1586
1587 multilist_sublist_t *ms =
1588 multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
1589
1590 dmu_objset_sync_dnodes(ms, sda->sda_tx);
1591
1592 multilist_sublist_unlock(ms);
1593
1594 kmem_free(sda, sizeof (*sda));
1595}
1596
1597
34dc7c2f
BB
1598/* called from dsl */
1599void
428870ff 1600dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
34dc7c2f
BB
1601{
1602 int txgoff;
5dbd68a3 1603 zbookmark_phys_t zb;
428870ff 1604 zio_prop_t zp;
34dc7c2f
BB
1605 zio_t *zio;
1606 list_t *list;
1607 dbuf_dirty_record_t *dr;
fc754677
AM
1608 int num_sublists;
1609 multilist_t *ml;
cc9bb3e5
GM
1610 blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
1611 *blkptr_copy = *os->os_rootbp;
34dc7c2f
BB
1612
1613 dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
1614
1615 ASSERT(dmu_tx_is_syncing(tx));
1616 /* XXX the write_done callback should really give us the tx... */
1617 os->os_synctx = tx;
1618
1619 if (os->os_dsl_dataset == NULL) {
1620 /*
1621 * This is the MOS. If we have upgraded,
1622 * spa_max_replication() could change, so reset
1623 * os_copies here.
1624 */
1625 os->os_copies = spa_max_replication(os->os_spa);
1626 }
1627
1628 /*
1629 * Create the root block IO
1630 */
428870ff
BB
1631 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
1632 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
1633 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
294f6806 1634 arc_release(os->os_phys_buf, &os->os_phys_buf);
b128c09f 1635
82644107 1636 dmu_write_policy(os, NULL, 0, 0, &zp);
9babb374 1637
b5256303 1638 /*
0c03d21a
MA
1639 * If we are either claiming the ZIL or doing a raw receive, write
1640 * out the os_phys_buf raw. Neither of these actions will affect the
1641 * MAC at this point.
b5256303 1642 */
0c03d21a
MA
1643 if (os->os_raw_receive ||
1644 os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
b5256303 1645 ASSERT(os->os_encrypted);
b5256303
TC
1646 arc_convert_to_raw(os->os_phys_buf,
1647 os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
1648 DMU_OT_OBJSET, NULL, NULL, NULL);
1649 }
1650
428870ff 1651 zio = arc_write(pio, os->os_spa, tx->tx_txg,
cc9bb3e5 1652 blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
bc77ba73
PD
1653 &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
1654 os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
34dc7c2f
BB
1655
1656 /*
9babb374 1657 * Sync special dnodes - the parent IO for the sync is the root block
34dc7c2f 1658 */
572e2857
BB
1659 DMU_META_DNODE(os)->dn_zio = zio;
1660 dnode_sync(DMU_META_DNODE(os), tx);
34dc7c2f 1661
9babb374
BB
1662 os->os_phys->os_flags = os->os_flags;
1663
572e2857
BB
1664 if (DMU_USERUSED_DNODE(os) &&
1665 DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
1666 DMU_USERUSED_DNODE(os)->dn_zio = zio;
1667 dnode_sync(DMU_USERUSED_DNODE(os), tx);
1668 DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
1669 dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
9babb374
BB
1670 }
1671
9c5167d1
NF
1672 if (DMU_PROJECTUSED_DNODE(os) &&
1673 DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
1674 DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
1675 dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
1676 }
1677
34dc7c2f
BB
1678 txgoff = tx->tx_txg & TXG_MASK;
1679
ba67d821
MA
1680 /*
1681 * We must create the list here because it uses the
1682 * dn_dirty_link[] of this txg. But it may already
1683 * exist because we call dsl_dataset_sync() twice per txg.
1684 */
1685 if (os->os_synced_dnodes == NULL) {
1686 os->os_synced_dnodes =
1687 multilist_create(sizeof (dnode_t),
1688 offsetof(dnode_t, dn_dirty_link[txgoff]),
1689 dnode_multilist_index_func);
1690 } else {
1691 ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
1692 offsetof(dnode_t, dn_dirty_link[txgoff]));
9babb374
BB
1693 }
1694
fc754677
AM
1695 ml = os->os_dirty_dnodes[txgoff];
1696 num_sublists = multilist_get_num_sublists(ml);
1697 for (int i = 0; i < num_sublists; i++) {
1698 if (multilist_sublist_is_empty_idx(ml, i))
1699 continue;
64fc7762 1700 sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
fc754677 1701 sda->sda_list = ml;
64fc7762
MA
1702 sda->sda_sublist_idx = i;
1703 sda->sda_tx = tx;
1704 (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
1705 sync_dnodes_task, sda, 0);
1706 /* callback frees sda */
1707 }
1708 taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
34dc7c2f 1709
572e2857 1710 list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
64fc7762 1711 while ((dr = list_head(list)) != NULL) {
13fe0198 1712 ASSERT0(dr->dr_dbuf->db_level);
34dc7c2f 1713 list_remove(list, dr);
9cdf7b1f 1714 zio_nowait(dr->dr_zio);
34dc7c2f 1715 }
68cbd56e
NB
1716
1717 /* Enable dnode backfill if enough objects have been freed. */
1718 if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
1719 os->os_rescan_dnodes = B_TRUE;
1720 os->os_freed_dnodes = 0;
1721 }
1722
34dc7c2f
BB
1723 /*
1724 * Free intent log blocks up to this tx.
1725 */
1726 zil_sync(os->os_zil, tx);
b128c09f 1727 os->os_phys->os_zil_header = os->os_zil_header;
34dc7c2f
BB
1728 zio_nowait(zio);
1729}
1730
428870ff
BB
1731boolean_t
1732dmu_objset_is_dirty(objset_t *os, uint64_t txg)
1733{
64fc7762 1734 return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
428870ff
BB
1735}
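
A short sketch of how a sync-context caller might use dmu_objset_is_dirty() to gate the work above. The wrapper function is hypothetical, but checking dirtiness before issuing dmu_objset_sync() mirrors how the pool sync path treats an objset such as the MOS.

/* Sketch only: skip the objset sync when nothing was dirtied this txg. */
static void
example_sync_if_dirty(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
        if (dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)))
                dmu_objset_sync(os, pio, tx);
}
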
1736
7bcb7f08 1737static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES];
9babb374
BB
1738
1739void
7bcb7f08 1740dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb)
9babb374 1741{
7bcb7f08
MA
1742 file_cbs[ost] = cb;
1743}
1744
1745int
1746dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data,
1747 zfs_file_info_t *zfi)
1748{
1749 file_info_cb_t *cb = file_cbs[os->os_phys->os_type];
1750 if (cb == NULL)
1751 return (EINVAL);
1752 return (cb(bonustype, data, zfi));
9babb374
BB
1753}
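
The file_cbs[] table is the hook by which each objset type supplies a callback that extracts ownership information from a dnode's bonus/SA data. Below is a minimal sketch of such a callback and its registration; the function names and field values are illustrative only (the in-tree ZPL callback is registered elsewhere), and the EEXIST convention matches the handling in dmu_objset_userquota_get_ids() later in this file.

/* Illustrative sketch, not the in-tree ZPL implementation. */
static int
example_get_file_info(dmu_object_type_t bonustype, const void *data,
    zfs_file_info_t *zfi)
{
        /*
         * Returning EEXIST tells dmu_objset_userquota_get_ids() to keep
         * the object's existing uid/gid/project id.
         */
        if (bonustype != DMU_OT_SA && bonustype != DMU_OT_ZNODE)
                return (EEXIST);

        (void) data;                    /* a real callback decodes 'data' */
        zfi->zfi_user = 0;              /* uid decoded from the bonus/SA data */
        zfi->zfi_group = 0;             /* gid decoded from the bonus/SA data */
        zfi->zfi_project = ZFS_DEFAULT_PROJID;
        return (0);
}

static void
example_register_type(void)
{
        dmu_objset_register_type(DMU_OST_ZFS, example_get_file_info);
}
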
1754
1755boolean_t
428870ff 1756dmu_objset_userused_enabled(objset_t *os)
9babb374
BB
1757{
1758 return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
7bcb7f08 1759 file_cbs[os->os_phys->os_type] != NULL &&
572e2857 1760 DMU_USERUSED_DNODE(os) != NULL);
9babb374
BB
1761}
1762
1de321e6
JX
1763boolean_t
1764dmu_objset_userobjused_enabled(objset_t *os)
1765{
1766 return (dmu_objset_userused_enabled(os) &&
1767 spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
1768}
1769
9c5167d1
NF
1770boolean_t
1771dmu_objset_projectquota_enabled(objset_t *os)
1772{
7bcb7f08 1773 return (file_cbs[os->os_phys->os_type] != NULL &&
9c5167d1
NF
1774 DMU_PROJECTUSED_DNODE(os) != NULL &&
1775 spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA));
1776}
1777
9b7a83cb
JX
1778typedef struct userquota_node {
1779	/* must be the first field; see userquota_update_cache() */
1780 char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
1781 int64_t uqn_delta;
1782 avl_node_t uqn_node;
1783} userquota_node_t;
1784
1785typedef struct userquota_cache {
1786 avl_tree_t uqc_user_deltas;
1787 avl_tree_t uqc_group_deltas;
9c5167d1 1788 avl_tree_t uqc_project_deltas;
9b7a83cb
JX
1789} userquota_cache_t;
1790
1791static int
1792userquota_compare(const void *l, const void *r)
1793{
1794 const userquota_node_t *luqn = l;
1795 const userquota_node_t *ruqn = r;
e4ffa98d 1796 int rv;
9b7a83cb
JX
1797
1798 /*
1799 * NB: can only access uqn_id because userquota_update_cache() doesn't
1800 * pass in an entire userquota_node_t.
1801 */
e4ffa98d
BB
1802 rv = strcmp(luqn->uqn_id, ruqn->uqn_id);
1803
ca577779 1804 return (TREE_ISIGN(rv));
9b7a83cb
JX
1805}
1806
1807static void
1808do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
1809{
1810 void *cookie;
1811 userquota_node_t *uqn;
1812
1813 ASSERT(dmu_tx_is_syncing(tx));
1814
1815 cookie = NULL;
1816 while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
1817 &cookie)) != NULL) {
64fc7762
MA
1818 /*
1819 * os_userused_lock protects against concurrent calls to
1820 * zap_increment_int(). It's needed because zap_increment_int()
1821 * is not thread-safe (i.e. not atomic).
1822 */
1823 mutex_enter(&os->os_userused_lock);
9b7a83cb
JX
1824 VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
1825 uqn->uqn_id, uqn->uqn_delta, tx));
64fc7762 1826 mutex_exit(&os->os_userused_lock);
9b7a83cb
JX
1827 kmem_free(uqn, sizeof (*uqn));
1828 }
1829 avl_destroy(&cache->uqc_user_deltas);
1830
1831 cookie = NULL;
1832 while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
1833 &cookie)) != NULL) {
64fc7762 1834 mutex_enter(&os->os_userused_lock);
9b7a83cb
JX
1835 VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
1836 uqn->uqn_id, uqn->uqn_delta, tx));
64fc7762 1837 mutex_exit(&os->os_userused_lock);
9b7a83cb
JX
1838 kmem_free(uqn, sizeof (*uqn));
1839 }
1840 avl_destroy(&cache->uqc_group_deltas);
9c5167d1
NF
1841
1842 if (dmu_objset_projectquota_enabled(os)) {
1843 cookie = NULL;
1844 while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
1845 &cookie)) != NULL) {
1846 mutex_enter(&os->os_userused_lock);
1847 VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
1848 uqn->uqn_id, uqn->uqn_delta, tx));
1849 mutex_exit(&os->os_userused_lock);
1850 kmem_free(uqn, sizeof (*uqn));
1851 }
1852 avl_destroy(&cache->uqc_project_deltas);
1853 }
9b7a83cb
JX
1854}
1855
1856static void
1857userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
1858{
1859 userquota_node_t *uqn;
1860 avl_index_t idx;
1861
1862 ASSERT(strlen(id) < sizeof (uqn->uqn_id));
1863 /*
1864 * Use id directly for searching because uqn_id is the first field of
1865 * userquota_node_t and fields after uqn_id won't be accessed in
1866 * avl_find().
1867 */
1868 uqn = avl_find(avl, (const void *)id, &idx);
1869 if (uqn == NULL) {
1870 uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
1f51b525 1871 strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
9b7a83cb
JX
1872 avl_insert(avl, uqn, idx);
1873 }
1874 uqn->uqn_delta += delta;
1875}
1876
428870ff 1877static void
9c5167d1
NF
1878do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
1879 uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
1880 boolean_t subtract)
428870ff 1881{
9c5167d1 1882 if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) {
50c957f7 1883 int64_t delta = DNODE_MIN_SIZE + used;
9b7a83cb
JX
1884 char name[20];
1885
428870ff
BB
1886 if (subtract)
1887 delta = -delta;
9b7a83cb 1888
c9e319fa 1889 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)user);
9b7a83cb
JX
1890 userquota_update_cache(&cache->uqc_user_deltas, name, delta);
1891
c9e319fa 1892 (void) snprintf(name, sizeof (name), "%llx", (longlong_t)group);
9b7a83cb 1893 userquota_update_cache(&cache->uqc_group_deltas, name, delta);
9c5167d1
NF
1894
1895 if (dmu_objset_projectquota_enabled(os)) {
c9e319fa
JL
1896 (void) snprintf(name, sizeof (name), "%llx",
1897 (longlong_t)project);
9c5167d1
NF
1898 userquota_update_cache(&cache->uqc_project_deltas,
1899 name, delta);
1900 }
428870ff
BB
1901 }
1902}
1903
1de321e6 1904static void
9c5167d1
NF
1905do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
1906 uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
1de321e6
JX
1907{
1908 if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
1909 char name[20 + DMU_OBJACCT_PREFIX_LEN];
9b7a83cb 1910 int delta = subtract ? -1 : 1;
1de321e6
JX
1911
1912 (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
1913 (longlong_t)user);
9b7a83cb 1914 userquota_update_cache(&cache->uqc_user_deltas, name, delta);
1de321e6
JX
1915
1916 (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
1917 (longlong_t)group);
9b7a83cb 1918 userquota_update_cache(&cache->uqc_group_deltas, name, delta);
9c5167d1
NF
1919
1920 if (dmu_objset_projectquota_enabled(os)) {
1921 (void) snprintf(name, sizeof (name),
1922 DMU_OBJACCT_PREFIX "%llx", (longlong_t)project);
1923 userquota_update_cache(&cache->uqc_project_deltas,
1924 name, delta);
1925 }
1de321e6
JX
1926 }
1927}
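
To make the effect of the two helpers above concrete, here is a worked example with hypothetical ids: an object with 4096 used bytes, uid 1000 (0x3e8) and gid 100 (0x64).

/*
 * Worked example (hypothetical values):
 *
 *   uqc_user_deltas["3e8"]  += DNODE_MIN_SIZE + 4096     (byte accounting)
 *   uqc_group_deltas["64"]  += DNODE_MIN_SIZE + 4096
 *   uqc_user_deltas[DMU_OBJACCT_PREFIX "3e8"]  += 1      (object count)
 *   uqc_group_deltas[DMU_OBJACCT_PREFIX "64"]  += 1
 *
 * With subtract == B_TRUE (e.g. when the object is freed) the same
 * magnitudes are applied as negative deltas, and project deltas are
 * added only when dmu_objset_projectquota_enabled() holds.
 */
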
1928
64fc7762
MA
1929typedef struct userquota_updates_arg {
1930 objset_t *uua_os;
1931 int uua_sublist_idx;
1932 dmu_tx_t *uua_tx;
1933} userquota_updates_arg_t;
1934
1935static void
1936userquota_updates_task(void *arg)
9babb374 1937{
64fc7762
MA
1938 userquota_updates_arg_t *uua = arg;
1939 objset_t *os = uua->uua_os;
1940 dmu_tx_t *tx = uua->uua_tx;
9babb374 1941 dnode_t *dn;
9b7a83cb 1942 userquota_cache_t cache = { { 0 } };
9babb374 1943
64fc7762
MA
1944 multilist_sublist_t *list =
1945 multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
9babb374 1946
64fc7762
MA
1947 ASSERT(multilist_sublist_head(list) == NULL ||
1948 dmu_objset_userused_enabled(os));
9b7a83cb
JX
1949 avl_create(&cache.uqc_user_deltas, userquota_compare,
1950 sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
1951 avl_create(&cache.uqc_group_deltas, userquota_compare,
1952 sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
9c5167d1
NF
1953 if (dmu_objset_projectquota_enabled(os))
1954 avl_create(&cache.uqc_project_deltas, userquota_compare,
1955 sizeof (userquota_node_t), offsetof(userquota_node_t,
1956 uqn_node));
9b7a83cb 1957
64fc7762 1958 while ((dn = multilist_sublist_head(list)) != NULL) {
572e2857 1959 int flags;
9babb374 1960 ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
9babb374
BB
1961 ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
1962 dn->dn_phys->dn_flags &
1963 DNODE_FLAG_USERUSED_ACCOUNTED);
1964
572e2857
BB
1965 flags = dn->dn_id_flags;
1966 ASSERT(flags);
1967 if (flags & DN_ID_OLD_EXIST) {
9c5167d1
NF
1968 do_userquota_update(os, &cache, dn->dn_oldused,
1969 dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
1970 dn->dn_oldprojid, B_TRUE);
1971 do_userobjquota_update(os, &cache, dn->dn_oldflags,
1972 dn->dn_olduid, dn->dn_oldgid,
1973 dn->dn_oldprojid, B_TRUE);
428870ff 1974 }
572e2857 1975 if (flags & DN_ID_NEW_EXIST) {
9c5167d1 1976 do_userquota_update(os, &cache,
9b7a83cb 1977 DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
9c5167d1
NF
1978 dn->dn_newuid, dn->dn_newgid,
1979 dn->dn_newprojid, B_FALSE);
1980 do_userobjquota_update(os, &cache,
1981 dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
1982 dn->dn_newprojid, B_FALSE);
428870ff
BB
1983 }
1984
572e2857 1985 mutex_enter(&dn->dn_mtx);
428870ff
BB
1986 dn->dn_oldused = 0;
1987 dn->dn_oldflags = 0;
1988 if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
1989 dn->dn_olduid = dn->dn_newuid;
1990 dn->dn_oldgid = dn->dn_newgid;
9c5167d1 1991 dn->dn_oldprojid = dn->dn_newprojid;
428870ff
BB
1992 dn->dn_id_flags |= DN_ID_OLD_EXIST;
1993 if (dn->dn_bonuslen == 0)
1994 dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1995 else
1996 dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1997 }
1998 dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
9babb374
BB
1999 mutex_exit(&dn->dn_mtx);
2000
64fc7762
MA
2001 multilist_sublist_remove(list, dn);
2002 dnode_rele(dn, os->os_synced_dnodes);
9babb374 2003 }
9b7a83cb 2004 do_userquota_cacheflush(os, &cache, tx);
64fc7762
MA
2005 multilist_sublist_unlock(list);
2006 kmem_free(uua, sizeof (*uua));
2007}
2008
ba67d821
MA
2009/*
2010 * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being
2011 * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
2012 * evicted because the block containing the dnode can't be evicted until it is
2013 * written out. However, this hold is necessary to prevent the dnode_t from
2014 * being moved (via dnode_move()) while it's still referenced by
2015 * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for
2016 * dirty_lightweight_leaf-type dirty records.
2017 *
2018 * If we are doing user-object accounting, the dnode_rele() happens from
2019 * userquota_updates_task() instead.
2020 */
2021static void
2022dnode_rele_task(void *arg)
64fc7762 2023{
ba67d821
MA
2024 userquota_updates_arg_t *uua = arg;
2025 objset_t *os = uua->uua_os;
2026
2027 multilist_sublist_t *list =
2028 multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
fc754677 2029
ba67d821
MA
2030 dnode_t *dn;
2031 while ((dn = multilist_sublist_head(list)) != NULL) {
2032 multilist_sublist_remove(list, dn);
2033 dnode_rele(dn, os->os_synced_dnodes);
2034 }
2035 multilist_sublist_unlock(list);
2036 kmem_free(uua, sizeof (*uua));
2037}
2038
2039/*
2040 * Return TRUE if userquota updates are needed.
2041 */
2042static boolean_t
2043dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
2044{
64fc7762 2045 if (!dmu_objset_userused_enabled(os))
ba67d821 2046 return (B_FALSE);
64fc7762 2047
163a8c28
TC
2048 /*
2049 * If this is a raw receive just return and handle accounting
2050 * later when we have the keys loaded. We also don't do user
2051 * accounting during claiming since the datasets are not owned
2052 * for the duration of claiming and this txg should only be
2053 * used for recovery.
2054 */
b5256303 2055 if (os->os_encrypted && dmu_objset_is_receiving(os))
ba67d821 2056 return (B_FALSE);
b5256303 2057
163a8c28 2058 if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
ba67d821 2059 return (B_FALSE);
163a8c28 2060
9c5167d1 2061 /* Allocate the user/group/project used objects if necessary. */
64fc7762
MA
2062 if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
2063 VERIFY0(zap_create_claim(os,
2064 DMU_USERUSED_OBJECT,
2065 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
2066 VERIFY0(zap_create_claim(os,
2067 DMU_GROUPUSED_OBJECT,
2068 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
2069 }
2070
9c5167d1
NF
2071 if (dmu_objset_projectquota_enabled(os) &&
2072 DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
2073 VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
2074 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
2075 }
ba67d821
MA
2076 return (B_TRUE);
2077}
9c5167d1 2078
ba67d821
MA
2079/*
2080 * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
2081 * also release the holds on the dnodes from dmu_objset_sync_dnodes().
2082 * The caller must taskq_wait(dp_sync_taskq).
2083 */
2084void
2085dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
2086{
2087 boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);
2088
2089 int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
fc754677 2090 for (int i = 0; i < num_sublists; i++) {
64fc7762
MA
2091 userquota_updates_arg_t *uua =
2092 kmem_alloc(sizeof (*uua), KM_SLEEP);
2093 uua->uua_os = os;
2094 uua->uua_sublist_idx = i;
2095 uua->uua_tx = tx;
ba67d821
MA
2096
2097 /*
2098 * If we don't need to update userquotas, use
2099 * dnode_rele_task() to call dnode_rele()
2100 */
64fc7762 2101 (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
ba67d821
MA
2102 need_userquota ? userquota_updates_task : dnode_rele_task,
2103 uua, 0);
64fc7762
MA
2104 /* callback frees uua */
2105 }
9babb374
BB
2106}
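
A minimal sketch of the calling contract described above: the caller dispatches the per-sublist tasks via dmu_objset_sync_done() and must then drain dp_sync_taskq itself. The wrapper function is hypothetical; the names otherwise mirror the code above.

static void
example_sync_done_and_wait(objset_t *os, dmu_tx_t *tx)
{
        dmu_objset_sync_done(os, tx);   /* queues userquota/rele tasks */
        taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); /* caller's duty */
}
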
2107
ba67d821 2108
428870ff
BB
2109/*
2110 * Returns a pointer to the data from which the uid/gid can be read.
2111 *
2112 * If a dirty record for the transaction group that is syncing can't
2113 * be found, then NULL is returned. In the NULL case it is assumed
2114 * the uid/gid aren't changing.
2115 */
2116static void *
2117dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
2118{
cccbed9f 2119 dbuf_dirty_record_t *dr;
428870ff
BB
2120 void *data;
2121
2122 if (db->db_dirtycnt == 0)
2123 return (db->db.db_data); /* Nothing is changing */
2124
cccbed9f 2125 dr = dbuf_find_dirty_eq(db, tx->tx_txg);
428870ff 2126
572e2857 2127 if (dr == NULL) {
428870ff 2128 data = NULL;
572e2857 2129 } else {
ba67d821 2130 if (dr->dr_dnode->dn_bonuslen == 0 &&
572e2857
BB
2131 dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
2132 data = dr->dt.dl.dr_data->b_data;
2133 else
2134 data = dr->dt.dl.dr_data;
572e2857
BB
2135 }
2136
428870ff
BB
2137 return (data);
2138}
2139
2140void
2141dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
2142{
2143 objset_t *os = dn->dn_objset;
2144 void *data = NULL;
2145 dmu_buf_impl_t *db = NULL;
428870ff
BB
2146 int flags = dn->dn_id_flags;
2147 int error;
2148 boolean_t have_spill = B_FALSE;
2149
2150 if (!dmu_objset_userused_enabled(dn->dn_objset))
2151 return;
2152
b5256303
TC
2153 /*
2154 * Raw receives introduce a problem with user accounting. Raw
2155 * receives cannot update the user accounting info because the
2156 * user ids and the sizes are encrypted. To guarantee that we
2157 * never end up with bad user accounting, we simply disable it
2158 * during raw receives. We also disable this for normal receives
2159 * so that an incremental raw receive may be done on top of an
2160 * existing non-raw receive.
2161 */
2162 if (os->os_encrypted && dmu_objset_is_receiving(os))
2163 return;
2164
428870ff
BB
2165 if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
2166 DN_ID_CHKED_SPILL)))
2167 return;
2168
2169 if (before && dn->dn_bonuslen != 0)
2170 data = DN_BONUS(dn->dn_phys);
2171 else if (!before && dn->dn_bonuslen != 0) {
a3000f93
BB
2172 if (dn->dn_bonus) {
2173 db = dn->dn_bonus;
428870ff
BB
2174 mutex_enter(&db->db_mtx);
2175 data = dmu_objset_userquota_find_data(db, tx);
2176 } else {
2177 data = DN_BONUS(dn->dn_phys);
2178 }
2179 } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
2180 int rf = 0;
2181
2182 if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
2183 rf |= DB_RF_HAVESTRUCT;
572e2857
BB
2184 error = dmu_spill_hold_by_dnode(dn,
2185 rf | DB_RF_MUST_SUCCEED,
428870ff
BB
2186 FTAG, (dmu_buf_t **)&db);
2187 ASSERT(error == 0);
2188 mutex_enter(&db->db_mtx);
2189 data = (before) ? db->db.db_data :
2190 dmu_objset_userquota_find_data(db, tx);
2191 have_spill = B_TRUE;
2192 } else {
2193 mutex_enter(&dn->dn_mtx);
2194 dn->dn_id_flags |= DN_ID_CHKED_BONUS;
2195 mutex_exit(&dn->dn_mtx);
2196 return;
2197 }
2198
428870ff
BB
2199 /*
2200	 * Must always call the callback in case the object
2201	 * type has changed and the new type isn't one that we track.
2202 */
7bcb7f08
MA
2203 zfs_file_info_t zfi;
2204 error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
2205
2206 if (before) {
2207 ASSERT(data);
2208 dn->dn_olduid = zfi.zfi_user;
2209 dn->dn_oldgid = zfi.zfi_group;
2210 dn->dn_oldprojid = zfi.zfi_project;
2211 } else if (data) {
2212 dn->dn_newuid = zfi.zfi_user;
2213 dn->dn_newgid = zfi.zfi_group;
2214 dn->dn_newprojid = zfi.zfi_project;
2215 }
428870ff
BB
2216
2217 /*
2218 * Preserve existing uid/gid when the callback can't determine
2219 * what the new uid/gid are and the callback returned EEXIST.
2220 * The EEXIST error tells us to just use the existing uid/gid.
2221 * If we don't know what the old values are then just assign
2222 * them to 0, since that is a new file being created.
2223 */
2224 if (!before && data == NULL && error == EEXIST) {
2225 if (flags & DN_ID_OLD_EXIST) {
2226 dn->dn_newuid = dn->dn_olduid;
2227 dn->dn_newgid = dn->dn_oldgid;
2705ebf0 2228 dn->dn_newprojid = dn->dn_oldprojid;
428870ff
BB
2229 } else {
2230 dn->dn_newuid = 0;
2231 dn->dn_newgid = 0;
9c5167d1 2232 dn->dn_newprojid = ZFS_DEFAULT_PROJID;
428870ff
BB
2233 }
2234 error = 0;
2235 }
2236
2237 if (db)
2238 mutex_exit(&db->db_mtx);
2239
2240 mutex_enter(&dn->dn_mtx);
2241 if (error == 0 && before)
2242 dn->dn_id_flags |= DN_ID_OLD_EXIST;
2243 if (error == 0 && !before)
2244 dn->dn_id_flags |= DN_ID_NEW_EXIST;
2245
2246 if (have_spill) {
2247 dn->dn_id_flags |= DN_ID_CHKED_SPILL;
2248 } else {
2249 dn->dn_id_flags |= DN_ID_CHKED_BONUS;
2250 }
2251 mutex_exit(&dn->dn_mtx);
a3000f93 2252 if (have_spill)
428870ff
BB
2253 dmu_buf_rele((dmu_buf_t *)db, FTAG);
2254}
2255
9babb374
BB
2256boolean_t
2257dmu_objset_userspace_present(objset_t *os)
2258{
428870ff 2259 return (os->os_phys->os_flags &
9babb374
BB
2260 OBJSET_FLAG_USERACCOUNTING_COMPLETE);
2261}
2262
1de321e6
JX
2263boolean_t
2264dmu_objset_userobjspace_present(objset_t *os)
2265{
2266 return (os->os_phys->os_flags &
2267 OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
2268}
2269
9c5167d1
NF
2270boolean_t
2271dmu_objset_projectquota_present(objset_t *os)
2272{
2273 return (os->os_phys->os_flags &
2274 OBJSET_FLAG_PROJECTQUOTA_COMPLETE);
2275}
2276
1de321e6
JX
2277static int
2278dmu_objset_space_upgrade(objset_t *os)
9babb374
BB
2279{
2280 uint64_t obj;
2281 int err = 0;
2282
9babb374
BB
2283 /*
2284 * We simply need to mark every object dirty, so that it will be
2285	 * synced out and accounted for. If this is called
2286 * concurrently, or if we already did some work before crashing,
2287 * that's fine, since we track each object's accounted state
2288 * independently.
2289 */
2290
2291 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
45d1cae3 2292 dmu_tx_t *tx;
9babb374
BB
2293 dmu_buf_t *db;
2294 int objerr;
2295
1de321e6
JX
2296 mutex_enter(&os->os_upgrade_lock);
2297 if (os->os_upgrade_exit)
2298 err = SET_ERROR(EINTR);
2299 mutex_exit(&os->os_upgrade_lock);
2300 if (err != 0)
2301 return (err);
2302
9babb374 2303 if (issig(JUSTLOOKING) && issig(FORREAL))
2e528b49 2304 return (SET_ERROR(EINTR));
9babb374
BB
2305
2306 objerr = dmu_bonus_hold(os, obj, FTAG, &db);
13fe0198 2307 if (objerr != 0)
9babb374 2308 continue;
45d1cae3 2309 tx = dmu_tx_create(os);
9babb374
BB
2310 dmu_tx_hold_bonus(tx, obj);
2311 objerr = dmu_tx_assign(tx, TXG_WAIT);
13fe0198 2312 if (objerr != 0) {
d22323e8 2313 dmu_buf_rele(db, FTAG);
9babb374
BB
2314 dmu_tx_abort(tx);
2315 continue;
2316 }
2317 dmu_buf_will_dirty(db, tx);
2318 dmu_buf_rele(db, FTAG);
2319 dmu_tx_commit(tx);
2320 }
1de321e6
JX
2321 return (0);
2322}
2323
4072f465 2324static int
2325dmu_objset_userspace_upgrade_cb(objset_t *os)
1de321e6
JX
2326{
2327 int err = 0;
2328
2329 if (dmu_objset_userspace_present(os))
2330 return (0);
2331 if (dmu_objset_is_snapshot(os))
2332 return (SET_ERROR(EINVAL));
2333 if (!dmu_objset_userused_enabled(os))
2334 return (SET_ERROR(ENOTSUP));
2335
2336 err = dmu_objset_space_upgrade(os);
2337 if (err)
2338 return (err);
9babb374 2339
428870ff 2340 os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
9babb374
BB
2341 txg_wait_synced(dmu_objset_pool(os), 0);
2342 return (0);
2343}
2344
4072f465 2345void
2346dmu_objset_userspace_upgrade(objset_t *os)
2347{
2348 dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
2349}
2350
1de321e6 2351static int
9c5167d1 2352dmu_objset_id_quota_upgrade_cb(objset_t *os)
1de321e6
JX
2353{
2354 int err = 0;
2355
9c5167d1
NF
2356 if (dmu_objset_userobjspace_present(os) &&
2357 dmu_objset_projectquota_present(os))
1de321e6
JX
2358 return (0);
2359 if (dmu_objset_is_snapshot(os))
2360 return (SET_ERROR(EINVAL));
4072f465 2361 if (!dmu_objset_userused_enabled(os))
1de321e6 2362 return (SET_ERROR(ENOTSUP));
9c5167d1
NF
2363 if (!dmu_objset_projectquota_enabled(os) &&
2364 dmu_objset_userobjspace_present(os))
2365 return (SET_ERROR(ENOTSUP));
1de321e6 2366
4072f465 2367 if (dmu_objset_userobjused_enabled(os))
2368 dmu_objset_ds(os)->ds_feature_activation[
2369 SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
9c5167d1 2370 if (dmu_objset_projectquota_enabled(os))
d52d80b7
PD
2371 dmu_objset_ds(os)->ds_feature_activation[
2372 SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
1de321e6
JX
2373
2374 err = dmu_objset_space_upgrade(os);
2375 if (err)
2376 return (err);
2377
4072f465 2378 os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
2379 if (dmu_objset_userobjused_enabled(os))
2380 os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
9c5167d1
NF
2381 if (dmu_objset_projectquota_enabled(os))
2382 os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
2383
1de321e6
JX
2384 txg_wait_synced(dmu_objset_pool(os), 0);
2385 return (0);
2386}
2387
2388void
9c5167d1 2389dmu_objset_id_quota_upgrade(objset_t *os)
1de321e6 2390{
9c5167d1 2391 dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
1de321e6
JX
2392}
2393
126ae9f4
JX
2394boolean_t
2395dmu_objset_userobjspace_upgradable(objset_t *os)
2396{
2397 return (dmu_objset_type(os) == DMU_OST_ZFS &&
2398 !dmu_objset_is_snapshot(os) &&
2399 dmu_objset_userobjused_enabled(os) &&
bb1be77a 2400 !dmu_objset_userobjspace_present(os) &&
2401 spa_writeable(dmu_objset_spa(os)));
126ae9f4
JX
2402}
2403
9c5167d1
NF
2404boolean_t
2405dmu_objset_projectquota_upgradable(objset_t *os)
2406{
2407 return (dmu_objset_type(os) == DMU_OST_ZFS &&
2408 !dmu_objset_is_snapshot(os) &&
2409 dmu_objset_projectquota_enabled(os) &&
bb1be77a 2410 !dmu_objset_projectquota_present(os) &&
2411 spa_writeable(dmu_objset_spa(os)));
9c5167d1
NF
2412}
2413
34dc7c2f
BB
2414void
2415dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
2416 uint64_t *usedobjsp, uint64_t *availobjsp)
2417{
428870ff 2418 dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
34dc7c2f
BB
2419 usedobjsp, availobjsp);
2420}
2421
2422uint64_t
2423dmu_objset_fsid_guid(objset_t *os)
2424{
428870ff 2425 return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
34dc7c2f
BB
2426}
2427
2428void
2429dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
2430{
428870ff
BB
2431 stat->dds_type = os->os_phys->os_type;
2432 if (os->os_dsl_dataset)
2433 dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
34dc7c2f
BB
2434}
2435
2436void
2437dmu_objset_stats(objset_t *os, nvlist_t *nv)
2438{
428870ff
BB
2439 ASSERT(os->os_dsl_dataset ||
2440 os->os_phys->os_type == DMU_OST_META);
34dc7c2f 2441
428870ff
BB
2442 if (os->os_dsl_dataset != NULL)
2443 dsl_dataset_stats(os->os_dsl_dataset, nv);
34dc7c2f
BB
2444
2445 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
428870ff 2446 os->os_phys->os_type);
9babb374
BB
2447 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
2448 dmu_objset_userspace_present(os));
34dc7c2f
BB
2449}
2450
2451int
2452dmu_objset_is_snapshot(objset_t *os)
2453{
428870ff 2454 if (os->os_dsl_dataset != NULL)
0c66c32d 2455 return (os->os_dsl_dataset->ds_is_snapshot);
34dc7c2f
BB
2456 else
2457 return (B_FALSE);
2458}
2459
2460int
4d55ea81 2461dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen,
34dc7c2f
BB
2462 boolean_t *conflict)
2463{
428870ff 2464 dsl_dataset_t *ds = os->os_dsl_dataset;
34dc7c2f
BB
2465 uint64_t ignored;
2466
d683ddbb 2467 if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
2e528b49 2468 return (SET_ERROR(ENOENT));
34dc7c2f
BB
2469
2470 return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
d683ddbb 2471 dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
9b7b9cd3 2472 MT_NORMALIZE, real, maxlen, conflict));
34dc7c2f
BB
2473}
2474
2475int
2476dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
2477 uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
2478{
428870ff 2479 dsl_dataset_t *ds = os->os_dsl_dataset;
34dc7c2f
BB
2480 zap_cursor_t cursor;
2481 zap_attribute_t attr;
2482
13fe0198
MA
2483 ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
2484
d683ddbb 2485 if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
2e528b49 2486 return (SET_ERROR(ENOENT));
34dc7c2f
BB
2487
2488 zap_cursor_init_serialized(&cursor,
2489 ds->ds_dir->dd_pool->dp_meta_objset,
d683ddbb 2490 dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
34dc7c2f
BB
2491
2492 if (zap_cursor_retrieve(&cursor, &attr) != 0) {
2493 zap_cursor_fini(&cursor);
2e528b49 2494 return (SET_ERROR(ENOENT));
34dc7c2f
BB
2495 }
2496
2497 if (strlen(attr.za_name) + 1 > namelen) {
2498 zap_cursor_fini(&cursor);
2e528b49 2499 return (SET_ERROR(ENAMETOOLONG));
34dc7c2f
BB
2500 }
2501
c9e319fa 2502 (void) strlcpy(name, attr.za_name, namelen);
34dc7c2f
BB
2503 if (idp)
2504 *idp = attr.za_first_integer;
2505 if (case_conflict)
2506 *case_conflict = attr.za_normalization_conflict;
2507 zap_cursor_advance(&cursor);
2508 *offp = zap_cursor_serialize(&cursor);
2509 zap_cursor_fini(&cursor);
2510
2511 return (0);
2512}
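
The cursor-style interface above is typically driven in a loop, resuming from *offp on each call. A minimal sketch follows (hypothetical wrapper, error handling simplified); it takes the pool config lock that dmu_snapshot_list_next() asserts.

/* Sketch only: walk every snapshot name of an objset. */
static int
example_list_snapshots(objset_t *os)
{
        char name[ZFS_MAX_DATASET_NAME_LEN];
        uint64_t id, cookie = 0;
        boolean_t case_conflict;
        int error;

        dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
        while ((error = dmu_snapshot_list_next(os, sizeof (name), name,
            &id, &cookie, &case_conflict)) == 0) {
                /* use 'name' and 'id' here */
        }
        dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

        return (error == ENOENT ? 0 : error);
}
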
2513
ebe7e575 2514int
6772fb67 2515dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
ebe7e575 2516{
d1d7e268 2517 return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
ebe7e575
BB
2518}
2519
34dc7c2f
BB
2520int
2521dmu_dir_list_next(objset_t *os, int namelen, char *name,
2522 uint64_t *idp, uint64_t *offp)
2523{
428870ff 2524 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
34dc7c2f
BB
2525 zap_cursor_t cursor;
2526 zap_attribute_t attr;
2527
2528 /* there is no next dir on a snapshot! */
428870ff 2529 if (os->os_dsl_dataset->ds_object !=
d683ddbb 2530 dsl_dir_phys(dd)->dd_head_dataset_obj)
2e528b49 2531 return (SET_ERROR(ENOENT));
34dc7c2f
BB
2532
2533 zap_cursor_init_serialized(&cursor,
2534 dd->dd_pool->dp_meta_objset,
d683ddbb 2535 dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
34dc7c2f
BB
2536
2537 if (zap_cursor_retrieve(&cursor, &attr) != 0) {
2538 zap_cursor_fini(&cursor);
2e528b49 2539 return (SET_ERROR(ENOENT));
34dc7c2f
BB
2540 }
2541
2542 if (strlen(attr.za_name) + 1 > namelen) {
2543 zap_cursor_fini(&cursor);
2e528b49 2544 return (SET_ERROR(ENAMETOOLONG));
34dc7c2f
BB
2545 }
2546
c9e319fa 2547 (void) strlcpy(name, attr.za_name, namelen);
34dc7c2f
BB
2548 if (idp)
2549 *idp = attr.za_first_integer;
2550 zap_cursor_advance(&cursor);
2551 *offp = zap_cursor_serialize(&cursor);
2552 zap_cursor_fini(&cursor);
2553
2554 return (0);
2555}
2556
9c43027b
AJ
2557typedef struct dmu_objset_find_ctx {
2558 taskq_t *dc_tq;
2559 dsl_pool_t *dc_dp;
2560 uint64_t dc_ddobj;
1149ba64 2561 char *dc_ddname; /* last component of ddobj's name */
9c43027b
AJ
2562 int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
2563 void *dc_arg;
2564 int dc_flags;
2565 kmutex_t *dc_error_lock;
2566 int *dc_error;
2567} dmu_objset_find_ctx_t;
2568
2569static void
2570dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
b128c09f 2571{
9c43027b 2572 dsl_pool_t *dp = dcp->dc_dp;
13fe0198
MA
2573 dsl_dir_t *dd;
2574 dsl_dataset_t *ds;
2575 zap_cursor_t zc;
2576 zap_attribute_t *attr;
2577 uint64_t thisobj;
9c43027b 2578 int err = 0;
13fe0198 2579
9c43027b
AJ
2580 /* don't process if there already was an error */
2581 if (*dcp->dc_error != 0)
2582 goto out;
13fe0198 2583
1149ba64
GM
2584 /*
2585 * Note: passing the name (dc_ddname) here is optional, but it
2586 * improves performance because we don't need to call
2587 * zap_value_search() to determine the name.
2588 */
2589 err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
13fe0198 2590 if (err != 0)
9c43027b 2591 goto out;
13fe0198
MA
2592
2593 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
2594 if (dd->dd_myname[0] == '$') {
2595 dsl_dir_rele(dd, FTAG);
9c43027b 2596 goto out;
13fe0198
MA
2597 }
2598
d683ddbb 2599 thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
79c76d5b 2600 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
13fe0198
MA
2601
2602 /*
2603 * Iterate over all children.
2604 */
9c43027b 2605 if (dcp->dc_flags & DS_FIND_CHILDREN) {
13fe0198 2606 for (zap_cursor_init(&zc, dp->dp_meta_objset,
d683ddbb 2607 dsl_dir_phys(dd)->dd_child_dir_zapobj);
13fe0198
MA
2608 zap_cursor_retrieve(&zc, attr) == 0;
2609 (void) zap_cursor_advance(&zc)) {
2610 ASSERT3U(attr->za_integer_length, ==,
2611 sizeof (uint64_t));
2612 ASSERT3U(attr->za_num_integers, ==, 1);
2613
1c27024e 2614 dmu_objset_find_ctx_t *child_dcp =
1149ba64 2615 kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
9c43027b
AJ
2616 *child_dcp = *dcp;
2617 child_dcp->dc_ddobj = attr->za_first_integer;
1149ba64 2618 child_dcp->dc_ddname = spa_strdup(attr->za_name);
9c43027b
AJ
2619 if (dcp->dc_tq != NULL)
2620 (void) taskq_dispatch(dcp->dc_tq,
2621 dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
2622 else
2623 dmu_objset_find_dp_impl(child_dcp);
13fe0198
MA
2624 }
2625 zap_cursor_fini(&zc);
13fe0198
MA
2626 }
2627
2628 /*
2629 * Iterate over all snapshots.
2630 */
9c43027b 2631 if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
13fe0198
MA
2632 dsl_dataset_t *ds;
2633 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
2634
2635 if (err == 0) {
d683ddbb
JG
2636 uint64_t snapobj;
2637
2638 snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
13fe0198
MA
2639 dsl_dataset_rele(ds, FTAG);
2640
2641 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
2642 zap_cursor_retrieve(&zc, attr) == 0;
2643 (void) zap_cursor_advance(&zc)) {
2644 ASSERT3U(attr->za_integer_length, ==,
2645 sizeof (uint64_t));
2646 ASSERT3U(attr->za_num_integers, ==, 1);
2647
2648 err = dsl_dataset_hold_obj(dp,
2649 attr->za_first_integer, FTAG, &ds);
2650 if (err != 0)
2651 break;
9c43027b 2652 err = dcp->dc_func(dp, ds, dcp->dc_arg);
13fe0198
MA
2653 dsl_dataset_rele(ds, FTAG);
2654 if (err != 0)
2655 break;
2656 }
2657 zap_cursor_fini(&zc);
2658 }
2659 }
2660
13fe0198
MA
2661 kmem_free(attr, sizeof (zap_attribute_t));
2662
1149ba64
GM
2663 if (err != 0) {
2664 dsl_dir_rele(dd, FTAG);
9c43027b 2665 goto out;
1149ba64 2666 }
13fe0198
MA
2667
2668 /*
2669 * Apply to self.
2670 */
2671 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1149ba64
GM
2672
2673 /*
2674 * Note: we hold the dir while calling dsl_dataset_hold_obj() so
2675 * that the dir will remain cached, and we won't have to re-instantiate
2676 * it (which could be expensive due to finding its name via
2677 * zap_value_search()).
2678 */
2679 dsl_dir_rele(dd, FTAG);
13fe0198 2680 if (err != 0)
9c43027b
AJ
2681 goto out;
2682 err = dcp->dc_func(dp, ds, dcp->dc_arg);
13fe0198 2683 dsl_dataset_rele(ds, FTAG);
9c43027b
AJ
2684
2685out:
2686 if (err != 0) {
2687 mutex_enter(dcp->dc_error_lock);
2688 /* only keep first error */
2689 if (*dcp->dc_error == 0)
2690 *dcp->dc_error = err;
2691 mutex_exit(dcp->dc_error_lock);
2692 }
2693
1149ba64
GM
2694 if (dcp->dc_ddname != NULL)
2695 spa_strfree(dcp->dc_ddname);
9c43027b
AJ
2696 kmem_free(dcp, sizeof (*dcp));
2697}
2698
2699static void
2700dmu_objset_find_dp_cb(void *arg)
2701{
2702 dmu_objset_find_ctx_t *dcp = arg;
2703 dsl_pool_t *dp = dcp->dc_dp;
2704
5e8cd5d1
AJ
2705 /*
2706 * We need to get a pool_config_lock here, as there are several
e1cfd73f 2707 * assert(pool_config_held) down the stack. Getting a lock via
5e8cd5d1
AJ
2708 * dsl_pool_config_enter is risky, as it might be stalled by a
2709 * pending writer. This would deadlock, as the write lock can
2710 * only be granted when our parent thread gives up the lock.
2711 * The _prio interface gives us priority over a pending writer.
2712 */
2713 dsl_pool_config_enter_prio(dp, FTAG);
9c43027b
AJ
2714
2715 dmu_objset_find_dp_impl(dcp);
2716
2717 dsl_pool_config_exit(dp, FTAG);
2718}
2719
2720/*
2721 * Find objsets under and including ddobj, call func(ds) on each.
2722 * The order for the enumeration is completely undefined.
2723 * func is called with dsl_pool_config held.
2724 */
2725int
2726dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
2727 int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
2728{
2729 int error = 0;
2730 taskq_t *tq = NULL;
2731 int ntasks;
2732 dmu_objset_find_ctx_t *dcp;
2733 kmutex_t err_lock;
2734
2735 mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
2736 dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
2737 dcp->dc_tq = NULL;
2738 dcp->dc_dp = dp;
2739 dcp->dc_ddobj = ddobj;
1149ba64 2740 dcp->dc_ddname = NULL;
9c43027b
AJ
2741 dcp->dc_func = func;
2742 dcp->dc_arg = arg;
2743 dcp->dc_flags = flags;
2744 dcp->dc_error_lock = &err_lock;
2745 dcp->dc_error = &error;
2746
2747 if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
2748 /*
2749 * In case a write lock is held we can't make use of
2750 * parallelism, as down the stack of the worker threads
2751 * the lock is asserted via dsl_pool_config_held.
2752 * In case of a read lock this is solved by getting a read
2753 * lock in each worker thread, which isn't possible in case
2754 * of a writer lock. So we fall back to the synchronous path
2755 * here.
2756 * In the future it might be possible to get some magic into
2757 * dsl_pool_config_held in a way that it returns true for
2758 * the worker threads so that a single lock held from this
2759 * thread suffices. For now, stay single threaded.
2760 */
2761 dmu_objset_find_dp_impl(dcp);
e5676636 2762 mutex_destroy(&err_lock);
9c43027b
AJ
2763
2764 return (error);
2765 }
2766
2767 ntasks = dmu_find_threads;
2768 if (ntasks == 0)
2769 ntasks = vdev_count_leaves(dp->dp_spa) * 4;
1229323d 2770 tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
9c43027b
AJ
2771 INT_MAX, 0);
2772 if (tq == NULL) {
2773 kmem_free(dcp, sizeof (*dcp));
e5676636
BB
2774 mutex_destroy(&err_lock);
2775
9c43027b
AJ
2776 return (SET_ERROR(ENOMEM));
2777 }
2778 dcp->dc_tq = tq;
2779
2780 /* dcp will be freed by task */
2781 (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
2782
2783 /*
2784 * PORTING: this code relies on the property of taskq_wait to wait
2785 * until no more tasks are queued and no more tasks are active. As
2786 * we always queue new tasks from within other tasks, taskq_wait
2787 * reliably waits for the full recursion to finish, even though we
2788 * enqueue new tasks after taskq_wait has been called.
2789 * On platforms other than illumos, taskq_wait may not have this
2790 * property.
2791 */
2792 taskq_wait(tq);
2793 taskq_destroy(tq);
2794 mutex_destroy(&err_lock);
2795
2796 return (error);
b128c09f
BB
2797}
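
A compact sketch of how a caller might use dmu_objset_find_dp(); the callback and wrapper names are hypothetical, and the callback runs with dsl_pool_config held as documented above.

/* Sketch only: count all datasets and snapshots under a dsl_dir. */
static int
example_count_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
        (void) dp;
        (void) ds;
        (*(uint64_t *)arg)++;           /* called with dsl_pool_config held */
        return (0);                     /* nonzero would abort the walk */
}

static int
example_count_datasets(dsl_pool_t *dp, uint64_t ddobj, uint64_t *countp)
{
        *countp = 0;
        return (dmu_objset_find_dp(dp, ddobj, example_count_cb, countp,
            DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS));
}
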
2798
2799/*
13fe0198
MA
2800 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
2801 * The dp_config_rwlock must not be held when this is called, and it
2802 * will not be held when the callback is called.
2803 * Therefore this function should only be used when the pool is not changing
2804 * (e.g. in syncing context), or the callback can deal with the possible races.
b128c09f 2805 */
13fe0198
MA
2806static int
2807dmu_objset_find_impl(spa_t *spa, const char *name,
2808 int func(const char *, void *), void *arg, int flags)
34dc7c2f
BB
2809{
2810 dsl_dir_t *dd;
13fe0198 2811 dsl_pool_t *dp = spa_get_dsl(spa);
b128c09f 2812 dsl_dataset_t *ds;
34dc7c2f
BB
2813 zap_cursor_t zc;
2814 zap_attribute_t *attr;
2815 char *child;
b128c09f
BB
2816 uint64_t thisobj;
2817 int err;
34dc7c2f 2818
13fe0198
MA
2819 dsl_pool_config_enter(dp, FTAG);
2820
2821 err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
2822 if (err != 0) {
2823 dsl_pool_config_exit(dp, FTAG);
34dc7c2f 2824 return (err);
13fe0198 2825 }
34dc7c2f 2826
b128c09f
BB
2827 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
2828 if (dd->dd_myname[0] == '$') {
13fe0198
MA
2829 dsl_dir_rele(dd, FTAG);
2830 dsl_pool_config_exit(dp, FTAG);
b128c09f
BB
2831 return (0);
2832 }
2833
d683ddbb 2834 thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
79c76d5b 2835 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
34dc7c2f
BB
2836
2837 /*
2838 * Iterate over all children.
2839 */
2840 if (flags & DS_FIND_CHILDREN) {
b128c09f 2841 for (zap_cursor_init(&zc, dp->dp_meta_objset,
d683ddbb 2842 dsl_dir_phys(dd)->dd_child_dir_zapobj);
34dc7c2f
BB
2843 zap_cursor_retrieve(&zc, attr) == 0;
2844 (void) zap_cursor_advance(&zc)) {
13fe0198
MA
2845 ASSERT3U(attr->za_integer_length, ==,
2846 sizeof (uint64_t));
2847 ASSERT3U(attr->za_num_integers, ==, 1);
34dc7c2f 2848
428870ff 2849 child = kmem_asprintf("%s/%s", name, attr->za_name);
13fe0198
MA
2850 dsl_pool_config_exit(dp, FTAG);
2851 err = dmu_objset_find_impl(spa, child,
2852 func, arg, flags);
2853 dsl_pool_config_enter(dp, FTAG);
e4f5fa12 2854 kmem_strfree(child);
13fe0198 2855 if (err != 0)
34dc7c2f
BB
2856 break;
2857 }
2858 zap_cursor_fini(&zc);
2859
13fe0198
MA
2860 if (err != 0) {
2861 dsl_dir_rele(dd, FTAG);
2862 dsl_pool_config_exit(dp, FTAG);
34dc7c2f
BB
2863 kmem_free(attr, sizeof (zap_attribute_t));
2864 return (err);
2865 }
2866 }
2867
2868 /*
2869 * Iterate over all snapshots.
2870 */
b128c09f 2871 if (flags & DS_FIND_SNAPSHOTS) {
b128c09f 2872 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
b128c09f
BB
2873
2874 if (err == 0) {
d683ddbb
JG
2875 uint64_t snapobj;
2876
2877 snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
b128c09f
BB
2878 dsl_dataset_rele(ds, FTAG);
2879
2880 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
2881 zap_cursor_retrieve(&zc, attr) == 0;
2882 (void) zap_cursor_advance(&zc)) {
13fe0198 2883 ASSERT3U(attr->za_integer_length, ==,
b128c09f 2884 sizeof (uint64_t));
13fe0198 2885 ASSERT3U(attr->za_num_integers, ==, 1);
b128c09f 2886
428870ff
BB
2887 child = kmem_asprintf("%s@%s",
2888 name, attr->za_name);
13fe0198
MA
2889 dsl_pool_config_exit(dp, FTAG);
2890 err = func(child, arg);
2891 dsl_pool_config_enter(dp, FTAG);
e4f5fa12 2892 kmem_strfree(child);
13fe0198 2893 if (err != 0)
b128c09f
BB
2894 break;
2895 }
2896 zap_cursor_fini(&zc);
34dc7c2f 2897 }
34dc7c2f
BB
2898 }
2899
13fe0198 2900 dsl_dir_rele(dd, FTAG);
34dc7c2f 2901 kmem_free(attr, sizeof (zap_attribute_t));
13fe0198 2902 dsl_pool_config_exit(dp, FTAG);
34dc7c2f 2903
13fe0198 2904 if (err != 0)
34dc7c2f
BB
2905 return (err);
2906
13fe0198
MA
2907 /* Apply to self. */
2908 return (func(name, arg));
34dc7c2f
BB
2909}
2910
13fe0198
MA
2911/*
2912 * See comment above dmu_objset_find_impl().
2913 */
d164b209 2914int
5df7e9d8 2915dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
13fe0198 2916 int flags)
d164b209 2917{
13fe0198
MA
2918 spa_t *spa;
2919 int error;
d164b209 2920
13fe0198
MA
2921 error = spa_open(name, &spa, FTAG);
2922 if (error != 0)
2923 return (error);
2924 error = dmu_objset_find_impl(spa, name, func, arg, flags);
2925 spa_close(spa, FTAG);
2926 return (error);
d164b209
BB
2927}
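
The name-based variant takes a callback keyed on the dataset name rather than a dsl_dataset_t, and runs it without the pool config lock held. A hedged sketch (hypothetical callback; dprintf stands in for real work):

static int
example_visit_cb(const char *name, void *arg)
{
        (void) arg;
        dprintf("visited %s\n", name);
        return (0);                     /* nonzero stops the traversal */
}

/*
 * e.g. dmu_objset_find("tank", example_visit_cb, NULL,
 *     DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 */
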
2928
ae76f45c
TC
2929boolean_t
2930dmu_objset_incompatible_encryption_version(objset_t *os)
2931{
2932 return (dsl_dir_incompatible_encryption_version(
2933 os->os_dsl_dataset->ds_dir));
2934}
2935
34dc7c2f
BB
2936void
2937dmu_objset_set_user(objset_t *os, void *user_ptr)
2938{
428870ff
BB
2939 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
2940 os->os_user_ptr = user_ptr;
34dc7c2f
BB
2941}
2942
2943void *
2944dmu_objset_get_user(objset_t *os)
2945{
428870ff
BB
2946 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
2947 return (os->os_user_ptr);
34dc7c2f 2948}
c28b2279 2949
13fe0198
MA
2950/*
2951 * Determine name of filesystem, given name of snapshot.
eca7b760 2952 * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
13fe0198
MA
2953 */
2954int
2955dmu_fsname(const char *snapname, char *buf)
2956{
2957 char *atp = strchr(snapname, '@');
2958 if (atp == NULL)
2e528b49 2959 return (SET_ERROR(EINVAL));
eca7b760 2960 if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
2e528b49 2961 return (SET_ERROR(ENAMETOOLONG));
13fe0198
MA
2962 (void) strlcpy(buf, snapname, atp - snapname + 1);
2963 return (0);
2964}
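
A small usage sketch for dmu_fsname() (hypothetical names): it copies everything before the '@' into buf, so a ZFS_MAX_DATASET_NAME_LEN buffer is always sufficient.

static void
example_fsname(void)
{
        char fsname[ZFS_MAX_DATASET_NAME_LEN];

        /* "tank/home@yesterday" -> fsname == "tank/home" */
        VERIFY0(dmu_fsname("tank/home@yesterday", fsname));
}
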
2965
3ec3bc21 2966/*
0f8ff49e
SD
2967 * Call when we think we're going to write/free space in open context
2968 * to track the amount of dirty data in the open txg, which is also the
2969 * amount of memory that can not be evicted until this txg syncs.
2970 *
2971 * Note that there are two conditions where this can be called from
2972 * syncing context:
2973 *
2974 * [1] When we just created the dataset, in which case we go on with
2975 * updating any accounting of dirty data as usual.
2976 * [2] When we are dirtying MOS data, in which case we only update the
2977 * pool's accounting of dirty data.
3ec3bc21
BB
2978 */
2979void
2980dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
2981{
2982 dsl_dataset_t *ds = os->os_dsl_dataset;
2983 int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
2984
2985 if (ds != NULL) {
2986 dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
3ec3bc21 2987 }
0f8ff49e
SD
2988
2989 dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
3ec3bc21
BB
2990}
2991
93ce2b4c 2992#if defined(_KERNEL)
f0fd83be 2993EXPORT_SYMBOL(dmu_objset_zil);
c28b2279 2994EXPORT_SYMBOL(dmu_objset_pool);
f0fd83be
BB
2995EXPORT_SYMBOL(dmu_objset_ds);
2996EXPORT_SYMBOL(dmu_objset_type);
c28b2279
BB
2997EXPORT_SYMBOL(dmu_objset_name);
2998EXPORT_SYMBOL(dmu_objset_hold);
b5256303 2999EXPORT_SYMBOL(dmu_objset_hold_flags);
c28b2279
BB
3000EXPORT_SYMBOL(dmu_objset_own);
3001EXPORT_SYMBOL(dmu_objset_rele);
b5256303 3002EXPORT_SYMBOL(dmu_objset_rele_flags);
c28b2279
BB
3003EXPORT_SYMBOL(dmu_objset_disown);
3004EXPORT_SYMBOL(dmu_objset_from_ds);
3005EXPORT_SYMBOL(dmu_objset_create);
3006EXPORT_SYMBOL(dmu_objset_clone);
c28b2279
BB
3007EXPORT_SYMBOL(dmu_objset_stats);
3008EXPORT_SYMBOL(dmu_objset_fast_stat);
54a179e7 3009EXPORT_SYMBOL(dmu_objset_spa);
c28b2279
BB
3010EXPORT_SYMBOL(dmu_objset_space);
3011EXPORT_SYMBOL(dmu_objset_fsid_guid);
3012EXPORT_SYMBOL(dmu_objset_find);
c28b2279
BB
3013EXPORT_SYMBOL(dmu_objset_byteswap);
3014EXPORT_SYMBOL(dmu_objset_evict_dbufs);
3015EXPORT_SYMBOL(dmu_objset_snap_cmtime);
50c957f7 3016EXPORT_SYMBOL(dmu_objset_dnodesize);
c28b2279
BB
3017
3018EXPORT_SYMBOL(dmu_objset_sync);
3019EXPORT_SYMBOL(dmu_objset_is_dirty);
b5256303 3020EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
c28b2279
BB
3021EXPORT_SYMBOL(dmu_objset_create_impl);
3022EXPORT_SYMBOL(dmu_objset_open_impl);
3023EXPORT_SYMBOL(dmu_objset_evict);
3024EXPORT_SYMBOL(dmu_objset_register_type);
ba67d821 3025EXPORT_SYMBOL(dmu_objset_sync_done);
c28b2279
BB
3026EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
3027EXPORT_SYMBOL(dmu_objset_userused_enabled);
3028EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
3029EXPORT_SYMBOL(dmu_objset_userspace_present);
1de321e6 3030EXPORT_SYMBOL(dmu_objset_userobjused_enabled);
126ae9f4 3031EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable);
1de321e6 3032EXPORT_SYMBOL(dmu_objset_userobjspace_present);
9c5167d1
NF
3033EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
3034EXPORT_SYMBOL(dmu_objset_projectquota_present);
3035EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
3036EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
c28b2279 3037#endif