module/zfs/dmu_objset.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  27  * Copyright (c) 2015 Nexenta Systems, Inc. All rights reserved.
  28  * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
  29  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  30  */
  31
  32 /* Portions Copyright 2010 Robert Milkowski */
  33
  34 #include <sys/cred.h>
  35 #include <sys/zfs_context.h>
  36 #include <sys/dmu_objset.h>
  37 #include <sys/dsl_dir.h>
  38 #include <sys/dsl_dataset.h>
  39 #include <sys/dsl_prop.h>
  40 #include <sys/dsl_pool.h>
  41 #include <sys/dsl_synctask.h>
  42 #include <sys/dsl_deleg.h>
  43 #include <sys/dnode.h>
  44 #include <sys/dbuf.h>
  45 #include <sys/zvol.h>
  46 #include <sys/dmu_tx.h>
  47 #include <sys/zap.h>
  48 #include <sys/zil.h>
  49 #include <sys/dmu_impl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/sa.h>
  52 #include <sys/zfs_onexit.h>
  53 #include <sys/dsl_destroy.h>
  54 #include <sys/vdev.h>
  55 #include <sys/policy.h>
  56
/*
 * Needed to close a window in dnode_move() that allows the objset to be freed
 * before it can be safely accessed.
 *
 * Initialized in dmu_objset_init() and destroyed in dmu_objset_fini().
 */
krwlock_t os_lock;

/*
 * Tunable to override the maximum number of threads used for the
 * parallelization of dmu_objset_find_dp, needed to speed up the import of
 * pools with many datasets.
 * Default is 4 times the number of leaf vdevs.
 */
int dmu_find_threads = 0;

/*
 * Backfill lower metadnode objects after this many have been freed.
 * Backfilling negatively impacts object creation rates, so only do it
 * if there are enough holes to fill.
 */
int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;

/* Forward declaration; the definition appears later in this file. */
static void dmu_objset_find_dp_cb(void *arg);
  79
  80 void
  81 dmu_objset_init(void)
  82 {
  83         rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
  84 }
  85
  86 void
  87 dmu_objset_fini(void)
  88 {
  89         rw_destroy(&os_lock);
  90 }
  91
  92 spa_t *
  93 dmu_objset_spa(objset_t *os)
  94 {
  95         return (os->os_spa);
  96 }
  97
  98 zilog_t *
  99 dmu_objset_zil(objset_t *os)
 100 {
 101         return (os->os_zil);
 102 }
 103
 104 dsl_pool_t *
 105 dmu_objset_pool(objset_t *os)
 106 {
 107         dsl_dataset_t *ds;
 108
 109         if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
 110                 return (ds->ds_dir->dd_pool);
 111         else
 112                 return (spa_get_dsl(os->os_spa));
 113 }
 114
 115 dsl_dataset_t *
 116 dmu_objset_ds(objset_t *os)
 117 {
 118         return (os->os_dsl_dataset);
 119 }
 120
 121 dmu_objset_type_t
 122 dmu_objset_type(objset_t *os)
 123 {
 124         return (os->os_phys->os_type);
 125 }
 126
 127 void
 128 dmu_objset_name(objset_t *os, char *buf)
 129 {
 130         dsl_dataset_name(os->os_dsl_dataset, buf);
 131 }
 132
 133 uint64_t
 134 dmu_objset_id(objset_t *os)
 135 {
 136         dsl_dataset_t *ds = os->os_dsl_dataset;
 137
 138         return (ds ? ds->ds_object : 0);
 139 }
 140
 141 uint64_t
 142 dmu_objset_dnodesize(objset_t *os)
 143 {
 144         return (os->os_dnodesize);
 145 }
 146
 147 zfs_sync_type_t
 148 dmu_objset_syncprop(objset_t *os)
 149 {
 150         return (os->os_sync);
 151 }
 152
 153 zfs_logbias_op_t
 154 dmu_objset_logbias(objset_t *os)
 155 {
 156         return (os->os_logbias);
 157 }
 158
 159 static void
 160 checksum_changed_cb(void *arg, uint64_t newval)
 161 {
 162         objset_t *os = arg;
 163
 164         /*
 165          * Inheritance should have been done by now.
 166          */
 167         ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 168
 169         os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 170 }
 171
 172 static void
 173 compression_changed_cb(void *arg, uint64_t newval)
 174 {
 175         objset_t *os = arg;
 176
 177         /*
 178          * Inheritance and range checking should have been done by now.
 179          */
 180         ASSERT(newval != ZIO_COMPRESS_INHERIT);
 181
 182         os->os_compress = zio_compress_select(os->os_spa, newval,
 183             ZIO_COMPRESS_ON);
 184 }
 185
 186 static void
 187 copies_changed_cb(void *arg, uint64_t newval)
 188 {
 189         objset_t *os = arg;
 190
 191         /*
 192          * Inheritance and range checking should have been done by now.
 193          */
 194         ASSERT(newval > 0);
 195         ASSERT(newval <= spa_max_replication(os->os_spa));
 196
 197         os->os_copies = newval;
 198 }
 199
 200 static void
 201 dedup_changed_cb(void *arg, uint64_t newval)
 202 {
 203         objset_t *os = arg;
 204         spa_t *spa = os->os_spa;
 205         enum zio_checksum checksum;
 206
 207         /*
 208          * Inheritance should have been done by now.
 209          */
 210         ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 211
 212         checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
 213
 214         os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
 215         os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
 216 }
 217
 218 static void
 219 primary_cache_changed_cb(void *arg, uint64_t newval)
 220 {
 221         objset_t *os = arg;
 222
 223         /*
 224          * Inheritance and range checking should have been done by now.
 225          */
 226         ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 227             newval == ZFS_CACHE_METADATA);
 228
 229         os->os_primary_cache = newval;
 230 }
 231
 232 static void
 233 secondary_cache_changed_cb(void *arg, uint64_t newval)
 234 {
 235         objset_t *os = arg;
 236
 237         /*
 238          * Inheritance and range checking should have been done by now.
 239          */
 240         ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 241             newval == ZFS_CACHE_METADATA);
 242
 243         os->os_secondary_cache = newval;
 244 }
 245
 246 static void
 247 sync_changed_cb(void *arg, uint64_t newval)
 248 {
 249         objset_t *os = arg;
 250
 251         /*
 252          * Inheritance and range checking should have been done by now.
 253          */
 254         ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 255             newval == ZFS_SYNC_DISABLED);
 256
 257         os->os_sync = newval;
 258         if (os->os_zil)
 259                 zil_set_sync(os->os_zil, newval);
 260 }
 261
 262 static void
 263 redundant_metadata_changed_cb(void *arg, uint64_t newval)
 264 {
 265         objset_t *os = arg;
 266
 267         /*
 268          * Inheritance and range checking should have been done by now.
 269          */
 270         ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
 271             newval == ZFS_REDUNDANT_METADATA_MOST);
 272
 273         os->os_redundant_metadata = newval;
 274 }
 275
 276 static void
 277 dnodesize_changed_cb(void *arg, uint64_t newval)
 278 {
 279         objset_t *os = arg;
 280
 281         switch (newval) {
 282         case ZFS_DNSIZE_LEGACY:
 283                 os->os_dnodesize = DNODE_MIN_SIZE;
 284                 break;
 285         case ZFS_DNSIZE_AUTO:
 286                 /*
 287                  * Choose a dnode size that will work well for most
 288                  * workloads if the user specified "auto". Future code
 289                  * improvements could dynamically select a dnode size
 290                  * based on observed workload patterns.
 291                  */
 292                 os->os_dnodesize = DNODE_MIN_SIZE * 2;
 293                 break;
 294         case ZFS_DNSIZE_1K:
 295         case ZFS_DNSIZE_2K:
 296         case ZFS_DNSIZE_4K:
 297         case ZFS_DNSIZE_8K:
 298         case ZFS_DNSIZE_16K:
 299                 os->os_dnodesize = newval;
 300                 break;
 301         }
 302 }
 303
 304 static void
 305 logbias_changed_cb(void *arg, uint64_t newval)
 306 {
 307         objset_t *os = arg;
 308
 309         ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 310             newval == ZFS_LOGBIAS_THROUGHPUT);
 311         os->os_logbias = newval;
 312         if (os->os_zil)
 313                 zil_set_logbias(os->os_zil, newval);
 314 }
 315
 316 static void
 317 recordsize_changed_cb(void *arg, uint64_t newval)
 318 {
 319         objset_t *os = arg;
 320
 321         os->os_recordsize = newval;
 322 }
 323
 324 void
 325 dmu_objset_byteswap(void *buf, size_t size)
 326 {
 327         objset_phys_t *osp = buf;
 328
 329         ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 330         dnode_byteswap(&osp->os_meta_dnode);
 331         byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
 332         osp->os_type = BSWAP_64(osp->os_type);
 333         osp->os_flags = BSWAP_64(osp->os_flags);
 334         if (size == sizeof (objset_phys_t)) {
 335                 dnode_byteswap(&osp->os_userused_dnode);
 336                 dnode_byteswap(&osp->os_groupused_dnode);
 337         }
 338 }
 339
/*
 * Allocate and initialize an in-core objset_t for the objset rooted at 'bp'.
 *
 * spa - pool the objset lives in
 * ds  - dataset the objset belongs to, or NULL for the meta-objset
 * bp  - root block pointer; if it is a hole, a zeroed objset_phys_t buffer
 *       is allocated instead of being read from disk
 * osp - on success, receives the new objset
 *
 * Returns 0 on success.  On failure returns the error from arc_read()
 * (with ECKSUM converted to EIO) or from dsl_prop_register(); in that case
 * everything allocated here is released and *osp is not written.
 *
 * When ds is non-NULL the caller must hold ds->ds_opening_lock (asserted).
 */
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));

	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	if (!BP_IS_HOLE(os->os_rootbp)) {
		/* The objset exists on disk: read its objset_phys_t. */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		zbookmark_phys_t zb;
		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

		if (DMU_OS_IS_L2CACHEABLE(os))
			aflags |= ARC_FLAG_L2CACHE;
		if (DMU_OS_IS_L2COMPRESSIBLE(os))
			aflags |= ARC_FLAG_L2COMPRESS;

		dprintf_bp(os->os_rootbp, "reading %s", "");
		err = arc_read(NULL, spa, os->os_rootbp,
		    arc_getbuf_func, &os->os_phys_buf,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
		if (err != 0) {
			/* Nothing but 'os' itself has been set up yet. */
			kmem_free(os, sizeof (objset_t));
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}

		/* Increase the blocksize if we are permitted. */
		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
			/*
			 * The on-disk buffer is smaller than the current
			 * objset_phys_t: copy it into a zero-filled buffer
			 * of the full size and drop the original.
			 */
			arc_buf_t *buf = arc_buf_alloc(spa,
			    sizeof (objset_phys_t), &os->os_phys_buf,
			    ARC_BUFC_METADATA);
			bzero(buf->b_data, sizeof (objset_phys_t));
			bcopy(os->os_phys_buf->b_data, buf->b_data,
			    arc_buf_size(os->os_phys_buf));
			(void) arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf);
			os->os_phys_buf = buf;
		}

		os->os_phys = os->os_phys_buf->b_data;
		os->os_flags = os->os_phys->os_flags;
	} else {
		/*
		 * Nothing on disk yet: start from an all-zero
		 * objset_phys_t sized according to the pool version.
		 */
		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
		os->os_phys_buf = arc_buf_alloc(spa, size,
		    &os->os_phys_buf, ARC_BUFC_METADATA);
		os->os_phys = os->os_phys_buf->b_data;
		bzero(os->os_phys, size);
	}

	/*
	 * Note: the changed_cb will be called once before the register
	 * func returns, thus changing the checksum/compression from the
	 * default (fletcher2/off).  Snapshots don't need to know about
	 * checksum/compression/copies.
	 */
	if (ds != NULL) {
		boolean_t needlock = B_FALSE;

		/*
		 * Note: it's valid to open the objset if the dataset is
		 * long-held, in which case the pool_config lock will not
		 * be held.
		 */
		if (!dsl_pool_config_held(dmu_objset_pool(os))) {
			needlock = B_TRUE;
			dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		}
		/*
		 * Register the callbacks one by one; after the first
		 * failure the remaining registrations are skipped.
		 */
		err = dsl_prop_register(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os);
		if (err == 0) {
			err = dsl_prop_register(ds,
			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
			    secondary_cache_changed_cb, os);
		}
		if (!ds->ds_is_snapshot) {
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
				    checksum_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    compression_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COPIES),
				    copies_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DEDUP),
				    dedup_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
				    logbias_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_SYNC),
				    sync_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(
				    ZFS_PROP_REDUNDANT_METADATA),
				    redundant_metadata_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    recordsize_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DNODESIZE),
				    dnodesize_changed_cb, os);
			}
		}
		/* Drop the config lock only if we took it above. */
		if (needlock)
			dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		if (err != 0) {
			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf));
			kmem_free(os, sizeof (objset_t));
			return (err);
		}
	} else {
		/* It's the meta-objset. */
		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
		os->os_compress = ZIO_COMPRESS_ON;
		os->os_copies = spa_max_replication(spa);
		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
		os->os_dedup_verify = B_FALSE;
		os->os_logbias = ZFS_LOGBIAS_LATENCY;
		os->os_sync = ZFS_SYNC_STANDARD;
		os->os_primary_cache = ZFS_CACHE_ALL;
		os->os_secondary_cache = ZFS_CACHE_ALL;
		os->os_dnodesize = DNODE_MIN_SIZE;
	}

	/*
	 * Snapshots keep the zeroed os_zil_header from kmem_zalloc();
	 * everything else copies the header from the phys buffer.
	 */
	if (ds == NULL || !ds->ds_is_snapshot)
		os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	/* Per-txg dirty/free dnode lists share the dn_dirty_link[i] node. */
	for (i = 0; i < TXG_SIZE; i++) {
		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	    offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	list_link_init(&os->os_evicting_node);

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

	dnode_special_open(os, &os->os_phys->os_meta_dnode,
	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
	/* The accounting dnodes exist only in the full-size phys layout. */
	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
		dnode_special_open(os, &os->os_phys->os_userused_dnode,
		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
	}

	*osp = os;
	return (0);
}
 530
 531 int
 532 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 533 {
 534         int err = 0;
 535
 536         /*
 537          * We shouldn't be doing anything with dsl_dataset_t's unless the
 538          * pool_config lock is held, or the dataset is long-held.
 539          */
 540         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
 541             dsl_dataset_long_held(ds));
 542
 543         mutex_enter(&ds->ds_opening_lock);
 544         if (ds->ds_objset == NULL) {
 545                 objset_t *os;
 546                 err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
 547                     ds, dsl_dataset_get_blkptr(ds), &os);
 548
 549                 if (err == 0) {
 550                         mutex_enter(&ds->ds_lock);
 551                         ASSERT(ds->ds_objset == NULL);
 552                         ds->ds_objset = os;
 553                         mutex_exit(&ds->ds_lock);
 554                 }
 555         }
 556         *osp = ds->ds_objset;
 557         mutex_exit(&ds->ds_opening_lock);
 558         return (err);
 559 }
 560
 561 /*
 562  * Holds the pool while the objset is held.  Therefore only one objset
 563  * can be held at a time.
 564  */
 565 int
 566 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 567 {
 568         dsl_pool_t *dp;
 569         dsl_dataset_t *ds;
 570         int err;
 571
 572         err = dsl_pool_hold(name, tag, &dp);
 573         if (err != 0)
 574                 return (err);
 575         err = dsl_dataset_hold(dp, name, tag, &ds);
 576         if (err != 0) {
 577                 dsl_pool_rele(dp, tag);
 578                 return (err);
 579         }
 580
 581         err = dmu_objset_from_ds(ds, osp);
 582         if (err != 0) {
 583                 dsl_dataset_rele(ds, tag);
 584                 dsl_pool_rele(dp, tag);
 585         }
 586
 587         return (err);
 588 }
 589
 590 static int
 591 dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
 592     boolean_t readonly, void *tag, objset_t **osp)
 593 {
 594         int err;
 595
 596         err = dmu_objset_from_ds(ds, osp);
 597         if (err != 0) {
 598                 dsl_dataset_disown(ds, tag);
 599         } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 600                 dsl_dataset_disown(ds, tag);
 601                 return (SET_ERROR(EINVAL));
 602         } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
 603                 dsl_dataset_disown(ds, tag);
 604                 return (SET_ERROR(EROFS));
 605         }
 606         return (err);
 607 }
 608
 609 /*
 610  * dsl_pool must not be held when this is called.
 611  * Upon successful return, there will be a longhold on the dataset,
 612  * and the dsl_pool will not be held.
 613  */
 614 int
 615 dmu_objset_own(const char *name, dmu_objset_type_t type,
 616     boolean_t readonly, void *tag, objset_t **osp)
 617 {
 618         dsl_pool_t *dp;
 619         dsl_dataset_t *ds;
 620         int err;
 621
 622         err = dsl_pool_hold(name, FTAG, &dp);
 623         if (err != 0)
 624                 return (err);
 625         err = dsl_dataset_own(dp, name, tag, &ds);
 626         if (err != 0) {
 627                 dsl_pool_rele(dp, FTAG);
 628                 return (err);
 629         }
 630         err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
 631         dsl_pool_rele(dp, FTAG);
 632
 633         return (err);
 634 }
 635
 636 int
 637 dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
 638     boolean_t readonly, void *tag, objset_t **osp)
 639 {
 640         dsl_dataset_t *ds;
 641         int err;
 642
 643         err = dsl_dataset_own_obj(dp, obj, tag, &ds);
 644         if (err != 0)
 645                 return (err);
 646
 647         return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
 648 }
 649
 650 void
 651 dmu_objset_rele(objset_t *os, void *tag)
 652 {
 653         dsl_pool_t *dp = dmu_objset_pool(os);
 654         dsl_dataset_rele(os->os_dsl_dataset, tag);
 655         dsl_pool_rele(dp, tag);
 656 }
 657
 658 /*
 659  * When we are called, os MUST refer to an objset associated with a dataset
 660  * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
 661  * == tag.  We will then release and reacquire ownership of the dataset while
 662  * holding the pool config_rwlock to avoid intervening namespace or ownership
 663  * changes may occur.
 664  *
 665  * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
 666  * release the hold on its dataset and acquire a new one on the dataset of the
 667  * same name so that it can be partially torn down and reconstructed.
 668  */
 669 void
 670 dmu_objset_refresh_ownership(objset_t *os, void *tag)
 671 {
 672         dsl_pool_t *dp;
 673         dsl_dataset_t *ds, *newds;
 674         char name[ZFS_MAX_DATASET_NAME_LEN];
 675
 676         ds = os->os_dsl_dataset;
 677         VERIFY3P(ds, !=, NULL);
 678         VERIFY3P(ds->ds_owner, ==, tag);
 679         VERIFY(dsl_dataset_long_held(ds));
 680
 681         dsl_dataset_name(ds, name);
 682         dp = dmu_objset_pool(os);
 683         dsl_pool_config_enter(dp, FTAG);
 684         dmu_objset_disown(os, tag);
 685         VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
 686         VERIFY3P(newds, ==, os->os_dsl_dataset);
 687         dsl_pool_config_exit(dp, FTAG);
 688 }
 689
 690 void
 691 dmu_objset_disown(objset_t *os, void *tag)
 692 {
 693         dsl_dataset_disown(os->os_dsl_dataset, tag);
 694 }
 695
 696 void
 697 dmu_objset_evict_dbufs(objset_t *os)
 698 {
 699         dnode_t *dn_marker;
 700         dnode_t *dn;
 701
 702         dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
 703
 704         mutex_enter(&os->os_lock);
 705         dn = list_head(&os->os_dnodes);
 706         while (dn != NULL) {
 707                 /*
 708                  * Skip dnodes without holds.  We have to do this dance
 709                  * because dnode_add_ref() only works if there is already a
 710                  * hold.  If the dnode has no holds, then it has no dbufs.
 711                  */
 712                 if (dnode_add_ref(dn, FTAG)) {
 713                         list_insert_after(&os->os_dnodes, dn, dn_marker);
 714                         mutex_exit(&os->os_lock);
 715
 716                         dnode_evict_dbufs(dn);
 717                         dnode_rele(dn, FTAG);
 718
 719                         mutex_enter(&os->os_lock);
 720                         dn = list_next(&os->os_dnodes, dn_marker);
 721                         list_remove(&os->os_dnodes, dn_marker);
 722                 } else {
 723                         dn = list_next(&os->os_dnodes, dn);
 724                 }
 725         }
 726         mutex_exit(&os->os_lock);
 727
 728         kmem_free(dn_marker, sizeof (dnode_t));
 729
 730         if (DMU_USERUSED_DNODE(os) != NULL) {
 731                 dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
 732                 dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
 733         }
 734         dnode_evict_dbufs(DMU_META_DNODE(os));
 735 }
 736
 737 /*
 738  * Objset eviction processing is split into into two pieces.
 739  * The first marks the objset as evicting, evicts any dbufs that
 740  * have a refcount of zero, and then queues up the objset for the
 741  * second phase of eviction.  Once os->os_dnodes has been cleared by
 742  * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
 743  * The second phase closes the special dnodes, dequeues the objset from
 744  * the list of those undergoing eviction, and finally frees the objset.
 745  *
 746  * NOTE: Due to asynchronous eviction processing (invocation of
 747  *       dnode_buf_pageout()), it is possible for the meta dnode for the
 748  *       objset to have no holds even though os->os_dnodes is not empty.
 749  */
 750 void
 751 dmu_objset_evict(objset_t *os)
 752 {
 753         int t;
 754
 755         dsl_dataset_t *ds = os->os_dsl_dataset;
 756
 757         for (t = 0; t < TXG_SIZE; t++)
 758                 ASSERT(!dmu_objset_is_dirty(os, t));
 759
 760         if (ds)
 761                 dsl_prop_unregister_all(ds, os);
 762
 763         if (os->os_sa)
 764                 sa_tear_down(os);
 765
 766         dmu_objset_evict_dbufs(os);
 767
 768         mutex_enter(&os->os_lock);
 769         spa_evicting_os_register(os->os_spa, os);
 770         if (list_is_empty(&os->os_dnodes)) {
 771                 mutex_exit(&os->os_lock);
 772                 dmu_objset_evict_done(os);
 773         } else {
 774                 mutex_exit(&os->os_lock);
 775         }
 776 }
 777
 778 void
 779 dmu_objset_evict_done(objset_t *os)
 780 {
 781         ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 782
 783         dnode_special_close(&os->os_meta_dnode);
 784         if (DMU_USERUSED_DNODE(os)) {
 785                 dnode_special_close(&os->os_userused_dnode);
 786                 dnode_special_close(&os->os_groupused_dnode);
 787         }
 788         zil_free(os->os_zil);
 789
 790         VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 791
 792         /*
 793          * This is a barrier to prevent the objset from going away in
 794          * dnode_move() until we can safely ensure that the objset is still in
 795          * use. We consider the objset valid before the barrier and invalid
 796          * after the barrier.
 797          */
 798         rw_enter(&os_lock, RW_READER);
 799         rw_exit(&os_lock);
 800
 801         mutex_destroy(&os->os_lock);
 802         mutex_destroy(&os->os_obj_lock);
 803         mutex_destroy(&os->os_user_ptr_lock);
 804         spa_evicting_os_deregister(os->os_spa, os);
 805         kmem_free(os, sizeof (objset_t));
 806 }
 807
 808 timestruc_t
 809 dmu_objset_snap_cmtime(objset_t *os)
 810 {
 811         return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
 812 }
 813
 814 /* called from dsl for meta-objset */
 815 objset_t *
 816 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 817     dmu_objset_type_t type, dmu_tx_t *tx)
 818 {
 819         objset_t *os;
 820         dnode_t *mdn;
 821
 822         ASSERT(dmu_tx_is_syncing(tx));
 823
 824         if (ds != NULL)
 825                 VERIFY0(dmu_objset_from_ds(ds, &os));
 826         else
 827                 VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 828
 829         mdn = DMU_META_DNODE(os);
 830
 831         dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
 832             DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
 833
 834         /*
 835          * We don't want to have to increase the meta-dnode's nlevels
 836          * later, because then we could do it in quescing context while
 837          * we are also accessing it in open context.
 838          *
 839          * This precaution is not necessary for the MOS (ds == NULL),
 840          * because the MOS is only updated in syncing context.
 841          * This is most fortunate: the MOS is the only objset that
 842          * needs to be synced multiple times as spa_sync() iterates
 843          * to convergence, so minimizing its dn_nlevels matters.
 844          */
 845         if (ds != NULL) {
 846                 int levels = 1;
 847
 848                 /*
 849                  * Determine the number of levels necessary for the meta-dnode
 850                  * to contain DN_MAX_OBJECT dnodes.
 851                  */
 852                 while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
 853                     (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 854                     DN_MAX_OBJECT * sizeof (dnode_phys_t))
 855                         levels++;
 856
 857                 mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 858                     mdn->dn_nlevels = levels;
 859         }
 860
 861         ASSERT(type != DMU_OST_NONE);
 862         ASSERT(type != DMU_OST_ANY);
 863         ASSERT(type < DMU_OST_NUMTYPES);
 864         os->os_phys->os_type = type;
 865         if (dmu_objset_userused_enabled(os)) {
 866                 os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 867                 os->os_flags = os->os_phys->os_flags;
 868         }
 869
 870         dsl_dataset_dirty(ds, tx);
 871
 872         return (os);
 873 }
 874
 875 typedef struct dmu_objset_create_arg {
 876         const char *doca_name;
 877         cred_t *doca_cred;
 878         void (*doca_userfunc)(objset_t *os, void *arg,
 879             cred_t *cr, dmu_tx_t *tx);
 880         void *doca_userarg;
 881         dmu_objset_type_t doca_type;
 882         uint64_t doca_flags;
 883 } dmu_objset_create_arg_t;
 884
 885 /*ARGSUSED*/
 886 static int
 887 dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 888 {
 889         dmu_objset_create_arg_t *doca = arg;
 890         dsl_pool_t *dp = dmu_tx_pool(tx);
 891         dsl_dir_t *pdd;
 892         const char *tail;
 893         int error;
 894
 895         if (strchr(doca->doca_name, '@') != NULL)
 896                 return (SET_ERROR(EINVAL));
 897
 898         if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
 899                 return (SET_ERROR(ENAMETOOLONG));
 900
 901         error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
 902         if (error != 0)
 903                 return (error);
 904         if (tail == NULL) {
 905                 dsl_dir_rele(pdd, FTAG);
 906                 return (SET_ERROR(EEXIST));
 907         }
 908         error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 909             doca->doca_cred);
 910         dsl_dir_rele(pdd, FTAG);
 911
 912         return (error);
 913 }
 914
 915 static void
 916 dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 917 {
 918         dmu_objset_create_arg_t *doca = arg;
 919         dsl_pool_t *dp = dmu_tx_pool(tx);
 920         dsl_dir_t *pdd;
 921         const char *tail;
 922         dsl_dataset_t *ds;
 923         uint64_t obj;
 924         blkptr_t *bp;
 925         objset_t *os;
 926
 927         VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 928
 929         obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
 930             doca->doca_cred, tx);
 931
 932         VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 933         bp = dsl_dataset_get_blkptr(ds);
 934         os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
 935             ds, bp, doca->doca_type, tx);
 936
 937         if (doca->doca_userfunc != NULL) {
 938                 doca->doca_userfunc(os, doca->doca_userarg,
 939                     doca->doca_cred, tx);
 940         }
 941
 942         spa_history_log_internal_ds(ds, "create", tx, "");
 943         zvol_create_minors(dp->dp_spa, doca->doca_name, B_TRUE);
 944
 945         dsl_dataset_rele(ds, FTAG);
 946         dsl_dir_rele(pdd, FTAG);
 947 }
 948
 949 int
 950 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
 951     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 952 {
 953         dmu_objset_create_arg_t doca;
 954
 955         doca.doca_name = name;
 956         doca.doca_cred = CRED();
 957         doca.doca_flags = flags;
 958         doca.doca_userfunc = func;
 959         doca.doca_userarg = arg;
 960         doca.doca_type = type;
 961
 962         return (dsl_sync_task(name,
 963             dmu_objset_create_check, dmu_objset_create_sync, &doca,
 964             5, ZFS_SPACE_CHECK_NORMAL));
 965 }
 966
 967 typedef struct dmu_objset_clone_arg {
 968         const char *doca_clone;
 969         const char *doca_origin;
 970         cred_t *doca_cred;
 971 } dmu_objset_clone_arg_t;
 972
 973 /*ARGSUSED*/
 974 static int
 975 dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 976 {
 977         dmu_objset_clone_arg_t *doca = arg;
 978         dsl_dir_t *pdd;
 979         const char *tail;
 980         int error;
 981         dsl_dataset_t *origin;
 982         dsl_pool_t *dp = dmu_tx_pool(tx);
 983
 984         if (strchr(doca->doca_clone, '@') != NULL)
 985                 return (SET_ERROR(EINVAL));
 986
 987         if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
 988                 return (SET_ERROR(ENAMETOOLONG));
 989
 990         error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
 991         if (error != 0)
 992                 return (error);
 993         if (tail == NULL) {
 994                 dsl_dir_rele(pdd, FTAG);
 995                 return (SET_ERROR(EEXIST));
 996         }
 997
 998         error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 999             doca->doca_cred);
1000         if (error != 0) {
1001                 dsl_dir_rele(pdd, FTAG);
1002                 return (SET_ERROR(EDQUOT));
1003         }
1004         dsl_dir_rele(pdd, FTAG);
1005
1006         error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
1007         if (error != 0)
1008                 return (error);
1009
1010         /* You can only clone snapshots, not the head datasets. */
1011         if (!origin->ds_is_snapshot) {
1012                 dsl_dataset_rele(origin, FTAG);
1013                 return (SET_ERROR(EINVAL));
1014         }
1015         dsl_dataset_rele(origin, FTAG);
1016
1017         return (0);
1018 }
1019
1020 static void
1021 dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
1022 {
1023         dmu_objset_clone_arg_t *doca = arg;
1024         dsl_pool_t *dp = dmu_tx_pool(tx);
1025         dsl_dir_t *pdd;
1026         const char *tail;
1027         dsl_dataset_t *origin, *ds;
1028         uint64_t obj;
1029         char namebuf[ZFS_MAX_DATASET_NAME_LEN];
1030
1031         VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
1032         VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
1033
1034         obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
1035             doca->doca_cred, tx);
1036
1037         VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
1038         dsl_dataset_name(origin, namebuf);
1039         spa_history_log_internal_ds(ds, "clone", tx,
1040             "origin=%s (%llu)", namebuf, origin->ds_object);
1041         zvol_create_minors(dp->dp_spa, doca->doca_clone, B_TRUE);
1042         dsl_dataset_rele(ds, FTAG);
1043         dsl_dataset_rele(origin, FTAG);
1044         dsl_dir_rele(pdd, FTAG);
1045 }
1046
1047 int
1048 dmu_objset_clone(const char *clone, const char *origin)
1049 {
1050         dmu_objset_clone_arg_t doca;
1051
1052         doca.doca_clone = clone;
1053         doca.doca_origin = origin;
1054         doca.doca_cred = CRED();
1055
1056         return (dsl_sync_task(clone,
1057             dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
1058             5, ZFS_SPACE_CHECK_NORMAL));
1059 }
1060
1061 int
1062 dmu_objset_snapshot_one(const char *fsname, const char *snapname)
1063 {
1064         int err;
1065         char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
1066         nvlist_t *snaps = fnvlist_alloc();
1067
1068         fnvlist_add_boolean(snaps, longsnap);
1069         strfree(longsnap);
1070         err = dsl_dataset_snapshot(snaps, NULL, NULL);
1071         fnvlist_free(snaps);
1072         return (err);
1073 }
1074
1075 static void
1076 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
1077 {
1078         dnode_t *dn;
1079
1080         while ((dn = list_head(list))) {
1081                 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1082                 ASSERT(dn->dn_dbuf->db_data_pending);
1083                 /*
1084                  * Initialize dn_zio outside dnode_sync() because the
1085                  * meta-dnode needs to set it ouside dnode_sync().
1086                  */
1087                 dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
1088                 ASSERT(dn->dn_zio);
1089
1090                 ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
1091                 list_remove(list, dn);
1092
1093                 if (newlist) {
1094                         (void) dnode_add_ref(dn, newlist);
1095                         list_insert_tail(newlist, dn);
1096                 }
1097
1098                 dnode_sync(dn, tx);
1099         }
1100 }
1101
1102 /* ARGSUSED */
1103 static void
1104 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
1105 {
1106         int i;
1107
1108         blkptr_t *bp = zio->io_bp;
1109         objset_t *os = arg;
1110         dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
1111
1112         ASSERT(!BP_IS_EMBEDDED(bp));
1113         ASSERT3P(bp, ==, os->os_rootbp);
1114         ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
1115         ASSERT0(BP_GET_LEVEL(bp));
1116
1117         /*
1118          * Update rootbp fill count: it should be the number of objects
1119          * allocated in the object set (not counting the "special"
1120          * objects that are stored in the objset_phys_t -- the meta
1121          * dnode and user/group accounting objects).
1122          */
1123         bp->blk_fill = 0;
1124         for (i = 0; i < dnp->dn_nblkptr; i++)
1125                 bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
1126 }
1127
1128 /* ARGSUSED */
1129 static void
1130 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
1131 {
1132         blkptr_t *bp = zio->io_bp;
1133         blkptr_t *bp_orig = &zio->io_bp_orig;
1134         objset_t *os = arg;
1135
1136         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
1137                 ASSERT(BP_EQUAL(bp, bp_orig));
1138         } else {
1139                 dsl_dataset_t *ds = os->os_dsl_dataset;
1140                 dmu_tx_t *tx = os->os_synctx;
1141
1142                 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
1143                 dsl_dataset_block_born(ds, bp, tx);
1144         }
1145 }
1146
1147 /* called from dsl */
1148 void
1149 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
1150 {
1151         int txgoff;
1152         zbookmark_phys_t zb;
1153         zio_prop_t zp;
1154         zio_t *zio;
1155         list_t *list;
1156         list_t *newlist = NULL;
1157         dbuf_dirty_record_t *dr;
1158
1159         dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
1160
1161         ASSERT(dmu_tx_is_syncing(tx));
1162         /* XXX the write_done callback should really give us the tx... */
1163         os->os_synctx = tx;
1164
1165         if (os->os_dsl_dataset == NULL) {
1166                 /*
1167                  * This is the MOS.  If we have upgraded,
1168                  * spa_max_replication() could change, so reset
1169                  * os_copies here.
1170                  */
1171                 os->os_copies = spa_max_replication(os->os_spa);
1172         }
1173
1174         /*
1175          * Create the root block IO
1176          */
1177         SET_BOOKMARK(&zb, os->os_dsl_dataset ?
1178             os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
1179             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
1180         arc_release(os->os_phys_buf, &os->os_phys_buf);
1181
1182         dmu_write_policy(os, NULL, 0, 0, &zp);
1183
1184         zio = arc_write(pio, os->os_spa, tx->tx_txg,
1185             os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
1186             DMU_OS_IS_L2COMPRESSIBLE(os),
1187             &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
1188             os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
1189
1190         /*
1191          * Sync special dnodes - the parent IO for the sync is the root block
1192          */
1193         DMU_META_DNODE(os)->dn_zio = zio;
1194         dnode_sync(DMU_META_DNODE(os), tx);
1195
1196         os->os_phys->os_flags = os->os_flags;
1197
1198         if (DMU_USERUSED_DNODE(os) &&
1199             DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
1200                 DMU_USERUSED_DNODE(os)->dn_zio = zio;
1201                 dnode_sync(DMU_USERUSED_DNODE(os), tx);
1202                 DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
1203                 dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
1204         }
1205
1206         txgoff = tx->tx_txg & TXG_MASK;
1207
1208         if (dmu_objset_userused_enabled(os)) {
1209                 newlist = &os->os_synced_dnodes;
1210                 /*
1211                  * We must create the list here because it uses the
1212                  * dn_dirty_link[] of this txg.
1213                  */
1214                 list_create(newlist, sizeof (dnode_t),
1215                     offsetof(dnode_t, dn_dirty_link[txgoff]));
1216         }
1217
1218         dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
1219         dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
1220
1221         list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
1222         while ((dr = list_head(list))) {
1223                 ASSERT0(dr->dr_dbuf->db_level);
1224                 list_remove(list, dr);
1225                 if (dr->dr_zio)
1226                         zio_nowait(dr->dr_zio);
1227         }
1228
1229         /* Enable dnode backfill if enough objects have been freed. */
1230         if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
1231                 os->os_rescan_dnodes = B_TRUE;
1232                 os->os_freed_dnodes = 0;
1233         }
1234
1235         /*
1236          * Free intent log blocks up to this tx.
1237          */
1238         zil_sync(os->os_zil, tx);
1239         os->os_phys->os_zil_header = os->os_zil_header;
1240         zio_nowait(zio);
1241 }
1242
1243 boolean_t
1244 dmu_objset_is_dirty(objset_t *os, uint64_t txg)
1245 {
1246         return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
1247             !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
1248 }
1249
1250 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
1251
1252 void
1253 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
1254 {
1255         used_cbs[ost] = cb;
1256 }
1257
1258 boolean_t
1259 dmu_objset_userused_enabled(objset_t *os)
1260 {
1261         return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
1262             used_cbs[os->os_phys->os_type] != NULL &&
1263             DMU_USERUSED_DNODE(os) != NULL);
1264 }
1265
1266 static void
1267 do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
1268     uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
1269 {
1270         if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
1271                 int64_t delta = DNODE_MIN_SIZE + used;
1272                 if (subtract)
1273                         delta = -delta;
1274                 VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
1275                     user, delta, tx));
1276                 VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
1277                     group, delta, tx));
1278         }
1279 }
1280
1281 void
1282 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
1283 {
1284         dnode_t *dn;
1285         list_t *list = &os->os_synced_dnodes;
1286
1287         ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
1288
1289         while ((dn = list_head(list))) {
1290                 int flags;
1291                 ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
1292                 ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
1293                     dn->dn_phys->dn_flags &
1294                     DNODE_FLAG_USERUSED_ACCOUNTED);
1295
1296                 /* Allocate the user/groupused objects if necessary. */
1297                 if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
1298                         VERIFY(0 == zap_create_claim(os,
1299                             DMU_USERUSED_OBJECT,
1300                             DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1301                         VERIFY(0 == zap_create_claim(os,
1302                             DMU_GROUPUSED_OBJECT,
1303                             DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1304                 }
1305
1306                 /*
1307                  * We intentionally modify the zap object even if the
1308                  * net delta is zero.  Otherwise
1309                  * the block of the zap obj could be shared between
1310                  * datasets but need to be different between them after
1311                  * a bprewrite.
1312                  */
1313
1314                 flags = dn->dn_id_flags;
1315                 ASSERT(flags);
1316                 if (flags & DN_ID_OLD_EXIST)  {
1317                         do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
1318                             dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
1319                 }
1320                 if (flags & DN_ID_NEW_EXIST) {
1321                         do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
1322                             dn->dn_phys->dn_flags,  dn->dn_newuid,
1323                             dn->dn_newgid, B_FALSE, tx);
1324                 }
1325
1326                 mutex_enter(&dn->dn_mtx);
1327                 dn->dn_oldused = 0;
1328                 dn->dn_oldflags = 0;
1329                 if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
1330                         dn->dn_olduid = dn->dn_newuid;
1331                         dn->dn_oldgid = dn->dn_newgid;
1332                         dn->dn_id_flags |= DN_ID_OLD_EXIST;
1333                         if (dn->dn_bonuslen == 0)
1334                                 dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1335                         else
1336                                 dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1337                 }
1338                 dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
1339                 mutex_exit(&dn->dn_mtx);
1340
1341                 list_remove(list, dn);
1342                 dnode_rele(dn, list);
1343         }
1344 }
1345
1346 /*
1347  * Returns a pointer to data to find uid/gid from
1348  *
1349  * If a dirty record for transaction group that is syncing can't
1350  * be found then NULL is returned.  In the NULL case it is assumed
1351  * the uid/gid aren't changing.
1352  */
1353 static void *
1354 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
1355 {
1356         dbuf_dirty_record_t *dr, **drp;
1357         void *data;
1358
1359         if (db->db_dirtycnt == 0)
1360                 return (db->db.db_data);  /* Nothing is changing */
1361
1362         for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1363                 if (dr->dr_txg == tx->tx_txg)
1364                         break;
1365
1366         if (dr == NULL) {
1367                 data = NULL;
1368         } else {
1369                 dnode_t *dn;
1370
1371                 DB_DNODE_ENTER(dr->dr_dbuf);
1372                 dn = DB_DNODE(dr->dr_dbuf);
1373
1374                 if (dn->dn_bonuslen == 0 &&
1375                     dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
1376                         data = dr->dt.dl.dr_data->b_data;
1377                 else
1378                         data = dr->dt.dl.dr_data;
1379
1380                 DB_DNODE_EXIT(dr->dr_dbuf);
1381         }
1382
1383         return (data);
1384 }
1385
1386 void
1387 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
1388 {
1389         objset_t *os = dn->dn_objset;
1390         void *data = NULL;
1391         dmu_buf_impl_t *db = NULL;
1392         uint64_t *user = NULL;
1393         uint64_t *group = NULL;
1394         int flags = dn->dn_id_flags;
1395         int error;
1396         boolean_t have_spill = B_FALSE;
1397
1398         if (!dmu_objset_userused_enabled(dn->dn_objset))
1399                 return;
1400
1401         if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
1402             DN_ID_CHKED_SPILL)))
1403                 return;
1404
1405         if (before && dn->dn_bonuslen != 0)
1406                 data = DN_BONUS(dn->dn_phys);
1407         else if (!before && dn->dn_bonuslen != 0) {
1408                 if (dn->dn_bonus) {
1409                         db = dn->dn_bonus;
1410                         mutex_enter(&db->db_mtx);
1411                         data = dmu_objset_userquota_find_data(db, tx);
1412                 } else {
1413                         data = DN_BONUS(dn->dn_phys);
1414                 }
1415         } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
1416                         int rf = 0;
1417
1418                         if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
1419                                 rf |= DB_RF_HAVESTRUCT;
1420                         error = dmu_spill_hold_by_dnode(dn,
1421                             rf | DB_RF_MUST_SUCCEED,
1422                             FTAG, (dmu_buf_t **)&db);
1423                         ASSERT(error == 0);
1424                         mutex_enter(&db->db_mtx);
1425                         data = (before) ? db->db.db_data :
1426                             dmu_objset_userquota_find_data(db, tx);
1427                         have_spill = B_TRUE;
1428         } else {
1429                 mutex_enter(&dn->dn_mtx);
1430                 dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1431                 mutex_exit(&dn->dn_mtx);
1432                 return;
1433         }
1434
1435         if (before) {
1436                 ASSERT(data);
1437                 user = &dn->dn_olduid;
1438                 group = &dn->dn_oldgid;
1439         } else if (data) {
1440                 user = &dn->dn_newuid;
1441                 group = &dn->dn_newgid;
1442         }
1443
1444         /*
1445          * Must always call the callback in case the object
1446          * type has changed and that type isn't an object type to track
1447          */
1448         error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
1449             user, group);
1450
1451         /*
1452          * Preserve existing uid/gid when the callback can't determine
1453          * what the new uid/gid are and the callback returned EEXIST.
1454          * The EEXIST error tells us to just use the existing uid/gid.
1455          * If we don't know what the old values are then just assign
1456          * them to 0, since that is a new file  being created.
1457          */
1458         if (!before && data == NULL && error == EEXIST) {
1459                 if (flags & DN_ID_OLD_EXIST) {
1460                         dn->dn_newuid = dn->dn_olduid;
1461                         dn->dn_newgid = dn->dn_oldgid;
1462                 } else {
1463                         dn->dn_newuid = 0;
1464                         dn->dn_newgid = 0;
1465                 }
1466                 error = 0;
1467         }
1468
1469         if (db)
1470                 mutex_exit(&db->db_mtx);
1471
1472         mutex_enter(&dn->dn_mtx);
1473         if (error == 0 && before)
1474                 dn->dn_id_flags |= DN_ID_OLD_EXIST;
1475         if (error == 0 && !before)
1476                 dn->dn_id_flags |= DN_ID_NEW_EXIST;
1477
1478         if (have_spill) {
1479                 dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1480         } else {
1481                 dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1482         }
1483         mutex_exit(&dn->dn_mtx);
1484         if (have_spill)
1485                 dmu_buf_rele((dmu_buf_t *)db, FTAG);
1486 }
1487
1488 boolean_t
1489 dmu_objset_userspace_present(objset_t *os)
1490 {
1491         return (os->os_phys->os_flags &
1492             OBJSET_FLAG_USERACCOUNTING_COMPLETE);
1493 }
1494
1495 int
1496 dmu_objset_userspace_upgrade(objset_t *os)
1497 {
1498         uint64_t obj;
1499         int err = 0;
1500
1501         if (dmu_objset_userspace_present(os))
1502                 return (0);
1503         if (!dmu_objset_userused_enabled(os))
1504                 return (SET_ERROR(ENOTSUP));
1505         if (dmu_objset_is_snapshot(os))
1506                 return (SET_ERROR(EINVAL));
1507
1508         /*
1509          * We simply need to mark every object dirty, so that it will be
1510          * synced out and now accounted.  If this is called
1511          * concurrently, or if we already did some work before crashing,
1512          * that's fine, since we track each object's accounted state
1513          * independently.
1514          */
1515
1516         for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
1517                 dmu_tx_t *tx;
1518                 dmu_buf_t *db;
1519                 int objerr;
1520
1521                 if (issig(JUSTLOOKING) && issig(FORREAL))
1522                         return (SET_ERROR(EINTR));
1523
1524                 objerr = dmu_bonus_hold(os, obj, FTAG, &db);
1525                 if (objerr != 0)
1526                         continue;
1527                 tx = dmu_tx_create(os);
1528                 dmu_tx_hold_bonus(tx, obj);
1529                 objerr = dmu_tx_assign(tx, TXG_WAIT);
1530                 if (objerr != 0) {
1531                         dmu_tx_abort(tx);
1532                         continue;
1533                 }
1534                 dmu_buf_will_dirty(db, tx);
1535                 dmu_buf_rele(db, FTAG);
1536                 dmu_tx_commit(tx);
1537         }
1538
1539         os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
1540         txg_wait_synced(dmu_objset_pool(os), 0);
1541         return (0);
1542 }
1543
1544 void
1545 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
1546     uint64_t *usedobjsp, uint64_t *availobjsp)
1547 {
1548         dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
1549             usedobjsp, availobjsp);
1550 }
1551
1552 uint64_t
1553 dmu_objset_fsid_guid(objset_t *os)
1554 {
1555         return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
1556 }
1557
1558 void
1559 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
1560 {
1561         stat->dds_type = os->os_phys->os_type;
1562         if (os->os_dsl_dataset)
1563                 dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
1564 }
1565
1566 void
1567 dmu_objset_stats(objset_t *os, nvlist_t *nv)
1568 {
1569         ASSERT(os->os_dsl_dataset ||
1570             os->os_phys->os_type == DMU_OST_META);
1571
1572         if (os->os_dsl_dataset != NULL)
1573                 dsl_dataset_stats(os->os_dsl_dataset, nv);
1574
1575         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
1576             os->os_phys->os_type);
1577         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
1578             dmu_objset_userspace_present(os));
1579 }
1580
1581 int
1582 dmu_objset_is_snapshot(objset_t *os)
1583 {
1584         if (os->os_dsl_dataset != NULL)
1585                 return (os->os_dsl_dataset->ds_is_snapshot);
1586         else
1587                 return (B_FALSE);
1588 }
1589
1590 int
1591 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
1592     boolean_t *conflict)
1593 {
1594         dsl_dataset_t *ds = os->os_dsl_dataset;
1595         uint64_t ignored;
1596
1597         if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1598                 return (SET_ERROR(ENOENT));
1599
1600         return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
1601             dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
1602             MT_FIRST, real, maxlen, conflict));
1603 }
1604
1605 int
1606 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
1607     uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
1608 {
1609         dsl_dataset_t *ds = os->os_dsl_dataset;
1610         zap_cursor_t cursor;
1611         zap_attribute_t attr;
1612
1613         ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
1614
1615         if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1616                 return (SET_ERROR(ENOENT));
1617
1618         zap_cursor_init_serialized(&cursor,
1619             ds->ds_dir->dd_pool->dp_meta_objset,
1620             dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
1621
1622         if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1623                 zap_cursor_fini(&cursor);
1624                 return (SET_ERROR(ENOENT));
1625         }
1626
1627         if (strlen(attr.za_name) + 1 > namelen) {
1628                 zap_cursor_fini(&cursor);
1629                 return (SET_ERROR(ENAMETOOLONG));
1630         }
1631
1632         (void) strcpy(name, attr.za_name);
1633         if (idp)
1634                 *idp = attr.za_first_integer;
1635         if (case_conflict)
1636                 *case_conflict = attr.za_normalization_conflict;
1637         zap_cursor_advance(&cursor);
1638         *offp = zap_cursor_serialize(&cursor);
1639         zap_cursor_fini(&cursor);
1640
1641         return (0);
1642 }
1643
1644 int
1645 dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
1646 {
1647         return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
1648 }
1649
1650 int
1651 dmu_dir_list_next(objset_t *os, int namelen, char *name,
1652     uint64_t *idp, uint64_t *offp)
1653 {
1654         dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
1655         zap_cursor_t cursor;
1656         zap_attribute_t attr;
1657
1658         /* there is no next dir on a snapshot! */
1659         if (os->os_dsl_dataset->ds_object !=
1660             dsl_dir_phys(dd)->dd_head_dataset_obj)
1661                 return (SET_ERROR(ENOENT));
1662
1663         zap_cursor_init_serialized(&cursor,
1664             dd->dd_pool->dp_meta_objset,
1665             dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
1666
1667         if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1668                 zap_cursor_fini(&cursor);
1669                 return (SET_ERROR(ENOENT));
1670         }
1671
1672         if (strlen(attr.za_name) + 1 > namelen) {
1673                 zap_cursor_fini(&cursor);
1674                 return (SET_ERROR(ENAMETOOLONG));
1675         }
1676
1677         (void) strcpy(name, attr.za_name);
1678         if (idp)
1679                 *idp = attr.za_first_integer;
1680         zap_cursor_advance(&cursor);
1681         *offp = zap_cursor_serialize(&cursor);
1682         zap_cursor_fini(&cursor);
1683
1684         return (0);
1685 }
1686
1687 typedef struct dmu_objset_find_ctx {
1688         taskq_t         *dc_tq;
1689         dsl_pool_t      *dc_dp;
1690         uint64_t        dc_ddobj;
1691         int             (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
1692         void            *dc_arg;
1693         int             dc_flags;
1694         kmutex_t        *dc_error_lock;
1695         int             *dc_error;
1696 } dmu_objset_find_ctx_t;
1697
1698 static void
1699 dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
1700 {
1701         dsl_pool_t *dp = dcp->dc_dp;
1702         dmu_objset_find_ctx_t *child_dcp;
1703         dsl_dir_t *dd;
1704         dsl_dataset_t *ds;
1705         zap_cursor_t zc;
1706         zap_attribute_t *attr;
1707         uint64_t thisobj;
1708         int err = 0;
1709
1710         /* don't process if there already was an error */
1711         if (*dcp->dc_error != 0)
1712                 goto out;
1713
1714         err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
1715         if (err != 0)
1716                 goto out;
1717
1718         /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1719         if (dd->dd_myname[0] == '$') {
1720                 dsl_dir_rele(dd, FTAG);
1721                 goto out;
1722         }
1723
1724         thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1725         attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1726
1727         /*
1728          * Iterate over all children.
1729          */
1730         if (dcp->dc_flags & DS_FIND_CHILDREN) {
1731                 for (zap_cursor_init(&zc, dp->dp_meta_objset,
1732                     dsl_dir_phys(dd)->dd_child_dir_zapobj);
1733                     zap_cursor_retrieve(&zc, attr) == 0;
1734                     (void) zap_cursor_advance(&zc)) {
1735                         ASSERT3U(attr->za_integer_length, ==,
1736                             sizeof (uint64_t));
1737                         ASSERT3U(attr->za_num_integers, ==, 1);
1738
1739                         child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
1740                         *child_dcp = *dcp;
1741                         child_dcp->dc_ddobj = attr->za_first_integer;
1742                         if (dcp->dc_tq != NULL)
1743                                 (void) taskq_dispatch(dcp->dc_tq,
1744                                     dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
1745                         else
1746                                 dmu_objset_find_dp_impl(child_dcp);
1747                 }
1748                 zap_cursor_fini(&zc);
1749         }
1750
1751         /*
1752          * Iterate over all snapshots.
1753          */
1754         if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
1755                 dsl_dataset_t *ds;
1756                 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1757
1758                 if (err == 0) {
1759                         uint64_t snapobj;
1760
1761                         snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1762                         dsl_dataset_rele(ds, FTAG);
1763
1764                         for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1765                             zap_cursor_retrieve(&zc, attr) == 0;
1766                             (void) zap_cursor_advance(&zc)) {
1767                                 ASSERT3U(attr->za_integer_length, ==,
1768                                     sizeof (uint64_t));
1769                                 ASSERT3U(attr->za_num_integers, ==, 1);
1770
1771                                 err = dsl_dataset_hold_obj(dp,
1772                                     attr->za_first_integer, FTAG, &ds);
1773                                 if (err != 0)
1774                                         break;
1775                                 err = dcp->dc_func(dp, ds, dcp->dc_arg);
1776                                 dsl_dataset_rele(ds, FTAG);
1777                                 if (err != 0)
1778                                         break;
1779                         }
1780                         zap_cursor_fini(&zc);
1781                 }
1782         }
1783
1784         dsl_dir_rele(dd, FTAG);
1785         kmem_free(attr, sizeof (zap_attribute_t));
1786
1787         if (err != 0)
1788                 goto out;
1789
1790         /*
1791          * Apply to self.
1792          */
1793         err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1794         if (err != 0)
1795                 goto out;
1796         err = dcp->dc_func(dp, ds, dcp->dc_arg);
1797         dsl_dataset_rele(ds, FTAG);
1798
1799 out:
1800         if (err != 0) {
1801                 mutex_enter(dcp->dc_error_lock);
1802                 /* only keep first error */
1803                 if (*dcp->dc_error == 0)
1804                         *dcp->dc_error = err;
1805                 mutex_exit(dcp->dc_error_lock);
1806         }
1807
1808         kmem_free(dcp, sizeof (*dcp));
1809 }
1810
1811 static void
1812 dmu_objset_find_dp_cb(void *arg)
1813 {
1814         dmu_objset_find_ctx_t *dcp = arg;
1815         dsl_pool_t *dp = dcp->dc_dp;
1816
1817         /*
1818          * We need to get a pool_config_lock here, as there are several
1819          * asssert(pool_config_held) down the stack. Getting a lock via
1820          * dsl_pool_config_enter is risky, as it might be stalled by a
1821          * pending writer. This would deadlock, as the write lock can
1822          * only be granted when our parent thread gives up the lock.
1823          * The _prio interface gives us priority over a pending writer.
1824          */
1825         dsl_pool_config_enter_prio(dp, FTAG);
1826
1827         dmu_objset_find_dp_impl(dcp);
1828
1829         dsl_pool_config_exit(dp, FTAG);
1830 }
1831
1832 /*
1833  * Find objsets under and including ddobj, call func(ds) on each.
1834  * The order for the enumeration is completely undefined.
1835  * func is called with dsl_pool_config held.
1836  */
1837 int
1838 dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
1839     int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
1840 {
1841         int error = 0;
1842         taskq_t *tq = NULL;
1843         int ntasks;
1844         dmu_objset_find_ctx_t *dcp;
1845         kmutex_t err_lock;
1846
1847         mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
1848         dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
1849         dcp->dc_tq = NULL;
1850         dcp->dc_dp = dp;
1851         dcp->dc_ddobj = ddobj;
1852         dcp->dc_func = func;
1853         dcp->dc_arg = arg;
1854         dcp->dc_flags = flags;
1855         dcp->dc_error_lock = &err_lock;
1856         dcp->dc_error = &error;
1857
1858         if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
1859                 /*
1860                  * In case a write lock is held we can't make use of
1861                  * parallelism, as down the stack of the worker threads
1862                  * the lock is asserted via dsl_pool_config_held.
1863                  * In case of a read lock this is solved by getting a read
1864                  * lock in each worker thread, which isn't possible in case
1865                  * of a writer lock. So we fall back to the synchronous path
1866                  * here.
1867                  * In the future it might be possible to get some magic into
1868                  * dsl_pool_config_held in a way that it returns true for
1869                  * the worker threads so that a single lock held from this
1870                  * thread suffices. For now, stay single threaded.
1871                  */
1872                 dmu_objset_find_dp_impl(dcp);
1873                 mutex_destroy(&err_lock);
1874
1875                 return (error);
1876         }
1877
1878         ntasks = dmu_find_threads;
1879         if (ntasks == 0)
1880                 ntasks = vdev_count_leaves(dp->dp_spa) * 4;
1881         tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
1882             INT_MAX, 0);
1883         if (tq == NULL) {
1884                 kmem_free(dcp, sizeof (*dcp));
1885                 mutex_destroy(&err_lock);
1886
1887                 return (SET_ERROR(ENOMEM));
1888         }
1889         dcp->dc_tq = tq;
1890
1891         /* dcp will be freed by task */
1892         (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
1893
1894         /*
1895          * PORTING: this code relies on the property of taskq_wait to wait
1896          * until no more tasks are queued and no more tasks are active. As
1897          * we always queue new tasks from within other tasks, task_wait
1898          * reliably waits for the full recursion to finish, even though we
1899          * enqueue new tasks after taskq_wait has been called.
1900          * On platforms other than illumos, taskq_wait may not have this
1901          * property.
1902          */
1903         taskq_wait(tq);
1904         taskq_destroy(tq);
1905         mutex_destroy(&err_lock);
1906
1907         return (error);
1908 }
1909
1910 /*
1911  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
1912  * The dp_config_rwlock must not be held when this is called, and it
1913  * will not be held when the callback is called.
1914  * Therefore this function should only be used when the pool is not changing
1915  * (e.g. in syncing context), or the callback can deal with the possible races.
1916  */
1917 static int
1918 dmu_objset_find_impl(spa_t *spa, const char *name,
1919     int func(const char *, void *), void *arg, int flags)
1920 {
1921         dsl_dir_t *dd;
1922         dsl_pool_t *dp = spa_get_dsl(spa);
1923         dsl_dataset_t *ds;
1924         zap_cursor_t zc;
1925         zap_attribute_t *attr;
1926         char *child;
1927         uint64_t thisobj;
1928         int err;
1929
1930         dsl_pool_config_enter(dp, FTAG);
1931
1932         err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
1933         if (err != 0) {
1934                 dsl_pool_config_exit(dp, FTAG);
1935                 return (err);
1936         }
1937
1938         /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1939         if (dd->dd_myname[0] == '$') {
1940                 dsl_dir_rele(dd, FTAG);
1941                 dsl_pool_config_exit(dp, FTAG);
1942                 return (0);
1943         }
1944
1945         thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1946         attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1947
1948         /*
1949          * Iterate over all children.
1950          */
1951         if (flags & DS_FIND_CHILDREN) {
1952                 for (zap_cursor_init(&zc, dp->dp_meta_objset,
1953                     dsl_dir_phys(dd)->dd_child_dir_zapobj);
1954                     zap_cursor_retrieve(&zc, attr) == 0;
1955                     (void) zap_cursor_advance(&zc)) {
1956                         ASSERT3U(attr->za_integer_length, ==,
1957                             sizeof (uint64_t));
1958                         ASSERT3U(attr->za_num_integers, ==, 1);
1959
1960                         child = kmem_asprintf("%s/%s", name, attr->za_name);
1961                         dsl_pool_config_exit(dp, FTAG);
1962                         err = dmu_objset_find_impl(spa, child,
1963                             func, arg, flags);
1964                         dsl_pool_config_enter(dp, FTAG);
1965                         strfree(child);
1966                         if (err != 0)
1967                                 break;
1968                 }
1969                 zap_cursor_fini(&zc);
1970
1971                 if (err != 0) {
1972                         dsl_dir_rele(dd, FTAG);
1973                         dsl_pool_config_exit(dp, FTAG);
1974                         kmem_free(attr, sizeof (zap_attribute_t));
1975                         return (err);
1976                 }
1977         }
1978
1979         /*
1980          * Iterate over all snapshots.
1981          */
1982         if (flags & DS_FIND_SNAPSHOTS) {
1983                 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1984
1985                 if (err == 0) {
1986                         uint64_t snapobj;
1987
1988                         snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1989                         dsl_dataset_rele(ds, FTAG);
1990
1991                         for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1992                             zap_cursor_retrieve(&zc, attr) == 0;
1993                             (void) zap_cursor_advance(&zc)) {
1994                                 ASSERT3U(attr->za_integer_length, ==,
1995                                     sizeof (uint64_t));
1996                                 ASSERT3U(attr->za_num_integers, ==, 1);
1997
1998                                 child = kmem_asprintf("%s@%s",
1999                                     name, attr->za_name);
2000                                 dsl_pool_config_exit(dp, FTAG);
2001                                 err = func(child, arg);
2002                                 dsl_pool_config_enter(dp, FTAG);
2003                                 strfree(child);
2004                                 if (err != 0)
2005                                         break;
2006                         }
2007                         zap_cursor_fini(&zc);
2008                 }
2009         }
2010
2011         dsl_dir_rele(dd, FTAG);
2012         kmem_free(attr, sizeof (zap_attribute_t));
2013         dsl_pool_config_exit(dp, FTAG);
2014
2015         if (err != 0)
2016                 return (err);
2017
2018         /* Apply to self. */
2019         return (func(name, arg));
2020 }
2021
2022 /*
2023  * See comment above dmu_objset_find_impl().
2024  */
2025 int
2026 dmu_objset_find(char *name, int func(const char *, void *), void *arg,
2027     int flags)
2028 {
2029         spa_t *spa;
2030         int error;
2031
2032         error = spa_open(name, &spa, FTAG);
2033         if (error != 0)
2034                 return (error);
2035         error = dmu_objset_find_impl(spa, name, func, arg, flags);
2036         spa_close(spa, FTAG);
2037         return (error);
2038 }
2039
/*
 * Store user_ptr as the objset's user pointer.
 * The caller must hold os->os_user_ptr_lock; this is asserted.
 */
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
        ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
        os->os_user_ptr = user_ptr;
}
2046
/*
 * Return the objset's user pointer.
 * The caller must hold os->os_user_ptr_lock; this is asserted.
 */
void *
dmu_objset_get_user(objset_t *os)
{
        ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
        return (os->os_user_ptr);
}
2053
2054 /*
2055  * Determine name of filesystem, given name of snapshot.
2056  * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
2057  */
2058 int
2059 dmu_fsname(const char *snapname, char *buf)
2060 {
2061         char *atp = strchr(snapname, '@');
2062         if (atp == NULL)
2063                 return (SET_ERROR(EINVAL));
2064         if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
2065                 return (SET_ERROR(ENAMETOOLONG));
2066         (void) strlcpy(buf, snapname, atp - snapname + 1);
2067         return (0);
2068 }
2069
/*
 * Symbol exports, compiled only when both _KERNEL and HAVE_SPL are defined.
 * NOTE(review): presumably these make the listed functions available to
 * other kernel modules -- confirm against the EXPORT_SYMBOL definition.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool);
EXPORT_SYMBOL(dmu_objset_ds);
EXPORT_SYMBOL(dmu_objset_type);
EXPORT_SYMBOL(dmu_objset_name);
EXPORT_SYMBOL(dmu_objset_hold);
EXPORT_SYMBOL(dmu_objset_own);
EXPORT_SYMBOL(dmu_objset_rele);
EXPORT_SYMBOL(dmu_objset_disown);
EXPORT_SYMBOL(dmu_objset_from_ds);
EXPORT_SYMBOL(dmu_objset_create);
EXPORT_SYMBOL(dmu_objset_clone);
EXPORT_SYMBOL(dmu_objset_stats);
EXPORT_SYMBOL(dmu_objset_fast_stat);
EXPORT_SYMBOL(dmu_objset_spa);
EXPORT_SYMBOL(dmu_objset_space);
EXPORT_SYMBOL(dmu_objset_fsid_guid);
EXPORT_SYMBOL(dmu_objset_find);
EXPORT_SYMBOL(dmu_objset_byteswap);
EXPORT_SYMBOL(dmu_objset_evict_dbufs);
EXPORT_SYMBOL(dmu_objset_snap_cmtime);
EXPORT_SYMBOL(dmu_objset_dnodesize);

EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
EXPORT_SYMBOL(dmu_objset_register_type);
EXPORT_SYMBOL(dmu_objset_do_userquota_updates);
EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
EXPORT_SYMBOL(dmu_objset_userused_enabled);
EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
EXPORT_SYMBOL(dmu_objset_userspace_present);
#endif