module/zfs/dsl_pool.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25
  26 #include <sys/dsl_pool.h>
  27 #include <sys/dsl_dataset.h>
  28 #include <sys/dsl_prop.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_synctask.h>
  31 #include <sys/dsl_scan.h>
  32 #include <sys/dnode.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/dmu_objset.h>
  35 #include <sys/arc.h>
  36 #include <sys/zap.h>
  37 #include <sys/zio.h>
  38 #include <sys/zfs_context.h>
  39 #include <sys/fs/zfs.h>
  40 #include <sys/zfs_znode.h>
  41 #include <sys/spa_impl.h>
  42 #include <sys/dsl_deadlist.h>
  43 #include <sys/bptree.h>
  44 #include <sys/zfeature.h>
  45 #include <sys/zil_impl.h>
  46 #include <sys/dsl_userhold.h>
  47
  48 int zfs_no_write_throttle = 0;
  49 int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  50 int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
  51
  52 unsigned long zfs_write_limit_min = 32 << 20;   /* min write limit is 32MB */
  53 unsigned long zfs_write_limit_max = 0;          /* max data payload per txg */
  54 unsigned long zfs_write_limit_inflated = 0;
  55 unsigned long zfs_write_limit_override = 0;
  56
  57 kmutex_t zfs_write_limit_lock;
  58
  59 static pgcnt_t old_physmem = 0;
  60
  61 int
  62 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  63 {
  64         uint64_t obj;
  65         int err;
  66
  67         err = zap_lookup(dp->dp_meta_objset,
  68             dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
  69             name, sizeof (obj), 1, &obj);
  70         if (err)
  71                 return (err);
  72
  73         return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
  74 }
  75
  76 static dsl_pool_t *
  77 dsl_pool_open_impl(spa_t *spa, uint64_t txg)
  78 {
  79         dsl_pool_t *dp;
  80         blkptr_t *bp = spa_get_rootblkptr(spa);
  81
  82         dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
  83         dp->dp_spa = spa;
  84         dp->dp_meta_rootbp = *bp;
  85         rrw_init(&dp->dp_config_rwlock, B_TRUE);
  86         dp->dp_write_limit = zfs_write_limit_min;
  87         txg_init(dp, txg);
  88
  89         txg_list_create(&dp->dp_dirty_datasets,
  90             offsetof(dsl_dataset_t, ds_dirty_link));
  91         txg_list_create(&dp->dp_dirty_zilogs,
  92             offsetof(zilog_t, zl_dirty_link));
  93         txg_list_create(&dp->dp_dirty_dirs,
  94             offsetof(dsl_dir_t, dd_dirty_link));
  95         txg_list_create(&dp->dp_sync_tasks,
  96             offsetof(dsl_sync_task_t, dst_node));
  97
  98         mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
  99
 100         dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
 101             1, 4, 0);
 102
 103         return (dp);
 104 }
 105
 106 int
 107 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 108 {
 109         int err;
 110         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 111
 112         err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 113             &dp->dp_meta_objset);
 114         if (err != 0)
 115                 dsl_pool_close(dp);
 116         else
 117                 *dpp = dp;
 118
 119         return (err);
 120 }
 121
 122 int
 123 dsl_pool_open(dsl_pool_t *dp)
 124 {
 125         int err;
 126         dsl_dir_t *dd;
 127         dsl_dataset_t *ds;
 128         uint64_t obj;
 129
 130         rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 131         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 132             DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 133             &dp->dp_root_dir_obj);
 134         if (err)
 135                 goto out;
 136
 137         err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 138             NULL, dp, &dp->dp_root_dir);
 139         if (err)
 140                 goto out;
 141
 142         err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 143         if (err)
 144                 goto out;
 145
 146         if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 147                 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 148                 if (err)
 149                         goto out;
 150                 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 151                     FTAG, &ds);
 152                 if (err == 0) {
 153                         err = dsl_dataset_hold_obj(dp,
 154                             ds->ds_phys->ds_prev_snap_obj, dp,
 155                             &dp->dp_origin_snap);
 156                         dsl_dataset_rele(ds, FTAG);
 157                 }
 158                 dsl_dir_rele(dd, dp);
 159                 if (err)
 160                         goto out;
 161         }
 162
 163         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 164                 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 165                     &dp->dp_free_dir);
 166                 if (err)
 167                         goto out;
 168
 169                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 170                     DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 171                 if (err)
 172                         goto out;
 173                 VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 174                     dp->dp_meta_objset, obj));
 175         }
 176
 177         if (spa_feature_is_active(dp->dp_spa,
 178             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 179                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 180                     DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 181                     &dp->dp_bptree_obj);
 182                 if (err != 0)
 183                         goto out;
 184         }
 185
 186         if (spa_feature_is_active(dp->dp_spa,
 187             &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
 188                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 189                     DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 190                     &dp->dp_empty_bpobj);
 191                 if (err != 0)
 192                         goto out;
 193         }
 194
 195         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 196             DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 197             &dp->dp_tmp_userrefs_obj);
 198         if (err == ENOENT)
 199                 err = 0;
 200         if (err)
 201                 goto out;
 202
 203         err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 204
 205 out:
 206         rrw_exit(&dp->dp_config_rwlock, FTAG);
 207         return (err);
 208 }
 209
 210 void
 211 dsl_pool_close(dsl_pool_t *dp)
 212 {
 213         /* drop our references from dsl_pool_open() */
 214
 215         /*
 216          * Since we held the origin_snap from "syncing" context (which
 217          * includes pool-opening context), it actually only got a "ref"
 218          * and not a hold, so just drop that here.
 219          */
 220         if (dp->dp_origin_snap)
 221                 dsl_dataset_rele(dp->dp_origin_snap, dp);
 222         if (dp->dp_mos_dir)
 223                 dsl_dir_rele(dp->dp_mos_dir, dp);
 224         if (dp->dp_free_dir)
 225                 dsl_dir_rele(dp->dp_free_dir, dp);
 226         if (dp->dp_root_dir)
 227                 dsl_dir_rele(dp->dp_root_dir, dp);
 228
 229         bpobj_close(&dp->dp_free_bpobj);
 230
 231         /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 232         if (dp->dp_meta_objset)
 233                 dmu_objset_evict(dp->dp_meta_objset);
 234
 235         txg_list_destroy(&dp->dp_dirty_datasets);
 236         txg_list_destroy(&dp->dp_dirty_zilogs);
 237         txg_list_destroy(&dp->dp_sync_tasks);
 238         txg_list_destroy(&dp->dp_dirty_dirs);
 239
 240         arc_flush(dp->dp_spa);
 241         txg_fini(dp);
 242         dsl_scan_fini(dp);
 243         rrw_destroy(&dp->dp_config_rwlock);
 244         mutex_destroy(&dp->dp_lock);
 245         taskq_destroy(dp->dp_iput_taskq);
 246         if (dp->dp_blkstats)
 247                 kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 248         kmem_free(dp, sizeof (dsl_pool_t));
 249 }
 250
 251 dsl_pool_t *
 252 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 253 {
 254         int err;
 255         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 256         dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 257         objset_t *os;
 258         dsl_dataset_t *ds;
 259         uint64_t obj;
 260
 261         rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 262
 263         /* create and open the MOS (meta-objset) */
 264         dp->dp_meta_objset = dmu_objset_create_impl(spa,
 265             NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 266
 267         /* create the pool directory */
 268         err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 269             DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 270         ASSERT0(err);
 271
 272         /* Initialize scan structures */
 273         VERIFY0(dsl_scan_init(dp, txg));
 274
 275         /* create and open the root dir */
 276         dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 277         VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 278             NULL, dp, &dp->dp_root_dir));
 279
 280         /* create and open the meta-objset dir */
 281         (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 282         VERIFY0(dsl_pool_open_special_dir(dp,
 283             MOS_DIR_NAME, &dp->dp_mos_dir));
 284
 285         if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 286                 /* create and open the free dir */
 287                 (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 288                     FREE_DIR_NAME, tx);
 289                 VERIFY0(dsl_pool_open_special_dir(dp,
 290                     FREE_DIR_NAME, &dp->dp_free_dir));
 291
 292                 /* create and open the free_bplist */
 293                 obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
 294                 VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 295                     DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 296                 VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 297                     dp->dp_meta_objset, obj));
 298         }
 299
 300         if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 301                 dsl_pool_create_origin(dp, tx);
 302
 303         /* create the root dataset */
 304         obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 305
 306         /* create the root objset */
 307         VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 308         VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds,
 309             dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx)));
 310 #ifdef _KERNEL
 311         zfs_create_fs(os, kcred, zplprops, tx);
 312 #endif
 313         dsl_dataset_rele(ds, FTAG);
 314
 315         dmu_tx_commit(tx);
 316
 317         rrw_exit(&dp->dp_config_rwlock, FTAG);
 318
 319         return (dp);
 320 }
 321
 322 /*
 323  * Account for the meta-objset space in its placeholder dsl_dir.
 324  */
 325 void
 326 dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 327     int64_t used, int64_t comp, int64_t uncomp)
 328 {
 329         ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 330         mutex_enter(&dp->dp_lock);
 331         dp->dp_mos_used_delta += used;
 332         dp->dp_mos_compressed_delta += comp;
 333         dp->dp_mos_uncompressed_delta += uncomp;
 334         mutex_exit(&dp->dp_lock);
 335 }
 336
 337 static int
 338 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 339 {
 340         dsl_deadlist_t *dl = arg;
 341         dsl_deadlist_insert(dl, bp, tx);
 342         return (0);
 343 }
 344
 345 void
 346 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 347 {
 348         zio_t *zio;
 349         dmu_tx_t *tx;
 350         dsl_dir_t *dd;
 351         dsl_dataset_t *ds;
 352         objset_t *mos = dp->dp_meta_objset;
 353         hrtime_t start, write_time;
 354         uint64_t data_written;
 355         int err;
 356         list_t synced_datasets;
 357
 358         list_create(&synced_datasets, sizeof (dsl_dataset_t),
 359             offsetof(dsl_dataset_t, ds_synced_link));
 360
 361         /*
 362          * We need to copy dp_space_towrite() before doing
 363          * dsl_sync_task_sync(), because
 364          * dsl_dataset_snapshot_reserve_space() will increase
 365          * dp_space_towrite but not actually write anything.
 366          */
 367         data_written = dp->dp_space_towrite[txg & TXG_MASK];
 368
 369         tx = dmu_tx_create_assigned(dp, txg);
 370
 371         dp->dp_read_overhead = 0;
 372         start = gethrtime();
 373
 374         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 375         while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
 376                 /*
 377                  * We must not sync any non-MOS datasets twice, because
 378                  * we may have taken a snapshot of them.  However, we
 379                  * may sync newly-created datasets on pass 2.
 380                  */
 381                 ASSERT(!list_link_active(&ds->ds_synced_link));
 382                 list_insert_tail(&synced_datasets, ds);
 383                 dsl_dataset_sync(ds, zio, tx);
 384         }
 385         DTRACE_PROBE(pool_sync__1setup);
 386         err = zio_wait(zio);
 387
 388         write_time = gethrtime() - start;
 389         ASSERT(err == 0);
 390         DTRACE_PROBE(pool_sync__2rootzio);
 391
 392         /*
 393          * After the data blocks have been written (ensured by the zio_wait()
 394          * above), update the user/group space accounting.
 395          */
 396         for (ds = list_head(&synced_datasets); ds;
 397             ds = list_next(&synced_datasets, ds))
 398                 dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 399
 400         /*
 401          * Sync the datasets again to push out the changes due to
 402          * userspace updates.  This must be done before we process the
 403          * sync tasks, so that any snapshots will have the correct
 404          * user accounting information (and we won't get confused
 405          * about which blocks are part of the snapshot).
 406          */
 407         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 408         while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
 409                 ASSERT(list_link_active(&ds->ds_synced_link));
 410                 dmu_buf_rele(ds->ds_dbuf, ds);
 411                 dsl_dataset_sync(ds, zio, tx);
 412         }
 413         err = zio_wait(zio);
 414
 415         /*
 416          * Now that the datasets have been completely synced, we can
 417          * clean up our in-memory structures accumulated while syncing:
 418          *
 419          *  - move dead blocks from the pending deadlist to the on-disk deadlist
 420          *  - clean up zil records
 421          *  - release hold from dsl_dataset_dirty()
 422          */
 423         while ((ds = list_remove_head(&synced_datasets))) {
 424                 ASSERTV(objset_t *os = ds->ds_objset);
 425                 bplist_iterate(&ds->ds_pending_deadlist,
 426                     deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 427                 ASSERT(!dmu_objset_is_dirty(os, txg));
 428                 dmu_buf_rele(ds->ds_dbuf, ds);
 429         }
 430
 431         start = gethrtime();
 432         while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)))
 433                 dsl_dir_sync(dd, tx);
 434         write_time += gethrtime() - start;
 435
 436         /*
 437          * The MOS's space is accounted for in the pool/$MOS
 438          * (dp_mos_dir).  We can't modify the mos while we're syncing
 439          * it, so we remember the deltas and apply them here.
 440          */
 441         if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 442             dp->dp_mos_uncompressed_delta != 0) {
 443                 dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 444                     dp->dp_mos_used_delta,
 445                     dp->dp_mos_compressed_delta,
 446                     dp->dp_mos_uncompressed_delta, tx);
 447                 dp->dp_mos_used_delta = 0;
 448                 dp->dp_mos_compressed_delta = 0;
 449                 dp->dp_mos_uncompressed_delta = 0;
 450         }
 451
 452         start = gethrtime();
 453         if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 454             list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 455                 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 456                 dmu_objset_sync(mos, zio, tx);
 457                 err = zio_wait(zio);
 458                 ASSERT(err == 0);
 459                 dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 460                 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 461         }
 462         write_time += gethrtime() - start;
 463         DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
 464             hrtime_t, dp->dp_read_overhead);
 465         write_time -= dp->dp_read_overhead;
 466
 467         /*
 468          * If we modify a dataset in the same txg that we want to destroy it,
 469          * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 470          * dsl_dir_destroy_check() will fail if there are unexpected holds.
 471          * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 472          * and clearing the hold on it) before we process the sync_tasks.
 473          * The MOS data dirtied by the sync_tasks will be synced on the next
 474          * pass.
 475          */
 476         DTRACE_PROBE(pool_sync__3task);
 477         if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 478                 dsl_sync_task_t *dst;
 479                 /*
 480                  * No more sync tasks should have been added while we
 481                  * were syncing.
 482                  */
 483                 ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 484                 while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)))
 485                         dsl_sync_task_sync(dst, tx);
 486         }
 487
 488         dmu_tx_commit(tx);
 489
 490         dp->dp_space_towrite[txg & TXG_MASK] = 0;
 491         ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 492
 493         /*
 494          * If the write limit max has not been explicitly set, set it
 495          * to a fraction of available physical memory (default 1/8th).
 496          * Note that we must inflate the limit because the spa
 497          * inflates write sizes to account for data replication.
 498          * Check this each sync phase to catch changing memory size.
 499          */
 500         if (physmem != old_physmem && zfs_write_limit_shift) {
 501                 mutex_enter(&zfs_write_limit_lock);
 502                 old_physmem = physmem;
 503                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
 504                 zfs_write_limit_inflated = MAX(zfs_write_limit_min,
 505                     spa_get_asize(dp->dp_spa, zfs_write_limit_max));
 506                 mutex_exit(&zfs_write_limit_lock);
 507         }
 508
 509         /*
 510          * Attempt to keep the sync time consistent by adjusting the
 511          * amount of write traffic allowed into each transaction group.
 512          * Weight the throughput calculation towards the current value:
 513          *      thru = 3/4 old_thru + 1/4 new_thru
 514          *
 515          * Note: write_time is in nanosecs, so write_time/MICROSEC
 516          * yields millisecs
 517          */
 518         ASSERT(zfs_write_limit_min > 0);
 519         if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
 520                 uint64_t throughput = data_written / (write_time / MICROSEC);
 521
 522                 if (dp->dp_throughput)
 523                         dp->dp_throughput = throughput / 4 +
 524                             3 * dp->dp_throughput / 4;
 525                 else
 526                         dp->dp_throughput = throughput;
 527                 dp->dp_write_limit = MIN(zfs_write_limit_inflated,
 528                     MAX(zfs_write_limit_min,
 529                     dp->dp_throughput * zfs_txg_synctime_ms));
 530         }
 531 }
 532
 533 void
 534 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 535 {
 536         zilog_t *zilog;
 537         dsl_dataset_t *ds;
 538
 539         while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
 540                 ds = dmu_objset_ds(zilog->zl_os);
 541                 zil_clean(zilog, txg);
 542                 ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 543                 dmu_buf_rele(ds->ds_dbuf, zilog);
 544         }
 545         ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 546 }
 547
 548 /*
 549  * TRUE if the current thread is the tx_sync_thread or if we
 550  * are being called from SPA context during pool initialization.
 551  */
 552 int
 553 dsl_pool_sync_context(dsl_pool_t *dp)
 554 {
 555         return (curthread == dp->dp_tx.tx_sync_thread ||
 556             spa_is_initializing(dp->dp_spa));
 557 }
 558
 559 uint64_t
 560 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 561 {
 562         uint64_t space, resv;
 563
 564         /*
 565          * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 566          * efficiency.
 567          * XXX The intent log is not accounted for, so it must fit
 568          * within this slop.
 569          *
 570          * If we're trying to assess whether it's OK to do a free,
 571          * cut the reservation in half to allow forward progress
 572          * (e.g. make it possible to rm(1) files from a full pool).
 573          */
 574         space = spa_get_dspace(dp->dp_spa);
 575         resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 576         if (netfree)
 577                 resv >>= 1;
 578
 579         return (space - resv);
 580 }
 581
 582 int
 583 dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
 584 {
 585         uint64_t reserved = 0;
 586         uint64_t write_limit = (zfs_write_limit_override ?
 587             zfs_write_limit_override : dp->dp_write_limit);
 588
 589         if (zfs_no_write_throttle) {
 590                 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
 591                     space);
 592                 return (0);
 593         }
 594
 595         /*
 596          * Check to see if we have exceeded the maximum allowed IO for
 597          * this transaction group.  We can do this without locks since
 598          * a little slop here is ok.  Note that we do the reserved check
 599          * with only half the requested reserve: this is because the
 600          * reserve requests are worst-case, and we really don't want to
 601          * throttle based off of worst-case estimates.
 602          */
 603         if (write_limit > 0) {
 604                 reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
 605                     + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
 606
 607                 if (reserved && reserved > write_limit) {
 608                         DMU_TX_STAT_BUMP(dmu_tx_write_limit);
 609                         return (ERESTART);
 610                 }
 611         }
 612
 613         atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
 614
 615         /*
 616          * If this transaction group is over 7/8ths capacity, delay
 617          * the caller 1 clock tick.  This will slow down the "fill"
 618          * rate until the sync process can catch up with us.
 619          */
 620         if (reserved && reserved > (write_limit - (write_limit >> 3)))
 621                 txg_delay(dp, tx->tx_txg, 1);
 622
 623         return (0);
 624 }
 625
 626 void
 627 dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 628 {
 629         ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
 630         atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
 631 }
 632
 633 void
 634 dsl_pool_memory_pressure(dsl_pool_t *dp)
 635 {
 636         uint64_t space_inuse = 0;
 637         int i;
 638
 639         if (dp->dp_write_limit == zfs_write_limit_min)
 640                 return;
 641
 642         for (i = 0; i < TXG_SIZE; i++) {
 643                 space_inuse += dp->dp_space_towrite[i];
 644                 space_inuse += dp->dp_tempreserved[i];
 645         }
 646         dp->dp_write_limit = MAX(zfs_write_limit_min,
 647             MIN(dp->dp_write_limit, space_inuse / 4));
 648 }
 649
 650 void
 651 dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 652 {
 653         if (space > 0) {
 654                 mutex_enter(&dp->dp_lock);
 655                 dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
 656                 mutex_exit(&dp->dp_lock);
 657         }
 658 }
 659
 660 /* ARGSUSED */
 661 static int
 662 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 663 {
 664         dmu_tx_t *tx = arg;
 665         dsl_dataset_t *ds, *prev = NULL;
 666         int err;
 667
 668         err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 669         if (err)
 670                 return (err);
 671
 672         while (ds->ds_phys->ds_prev_snap_obj != 0) {
 673                 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 674                     FTAG, &prev);
 675                 if (err) {
 676                         dsl_dataset_rele(ds, FTAG);
 677                         return (err);
 678                 }
 679
 680                 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
 681                         break;
 682                 dsl_dataset_rele(ds, FTAG);
 683                 ds = prev;
 684                 prev = NULL;
 685         }
 686
 687         if (prev == NULL) {
 688                 prev = dp->dp_origin_snap;
 689
 690                 /*
 691                  * The $ORIGIN can't have any data, or the accounting
 692                  * will be wrong.
 693                  */
 694                 ASSERT0(prev->ds_phys->ds_bp.blk_birth);
 695
 696                 /* The origin doesn't get attached to itself */
 697                 if (ds->ds_object == prev->ds_object) {
 698                         dsl_dataset_rele(ds, FTAG);
 699                         return (0);
 700                 }
 701
 702                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
 703                 ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
 704                 ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
 705
 706                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 707                 ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
 708
 709                 dmu_buf_will_dirty(prev->ds_dbuf, tx);
 710                 prev->ds_phys->ds_num_children++;
 711
 712                 if (ds->ds_phys->ds_next_snap_obj == 0) {
 713                         ASSERT(ds->ds_prev == NULL);
 714                         VERIFY0(dsl_dataset_hold_obj(dp,
 715                             ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 716                 }
 717         }
 718
 719         ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
 720         ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
 721
 722         if (prev->ds_phys->ds_next_clones_obj == 0) {
 723                 dmu_buf_will_dirty(prev->ds_dbuf, tx);
 724                 prev->ds_phys->ds_next_clones_obj =
 725                     zap_create(dp->dp_meta_objset,
 726                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 727         }
 728         VERIFY0(zap_add_int(dp->dp_meta_objset,
 729             prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
 730
 731         dsl_dataset_rele(ds, FTAG);
 732         if (prev != dp->dp_origin_snap)
 733                 dsl_dataset_rele(prev, FTAG);
 734         return (0);
 735 }
 736
 737 void
 738 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 739 {
 740         ASSERT(dmu_tx_is_syncing(tx));
 741         ASSERT(dp->dp_origin_snap != NULL);
 742
 743         VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 744             tx, DS_FIND_CHILDREN));
 745 }
 746
 747 /* ARGSUSED */
 748 static int
 749 upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 750 {
 751         dmu_tx_t *tx = arg;
 752         objset_t *mos = dp->dp_meta_objset;
 753
 754         if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
 755                 dsl_dataset_t *origin;
 756
 757                 VERIFY0(dsl_dataset_hold_obj(dp,
 758                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 759
 760                 if (origin->ds_dir->dd_phys->dd_clones == 0) {
 761                         dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 762                         origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
 763                             DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 764                 }
 765
 766                 VERIFY0(zap_add_int(dp->dp_meta_objset,
 767                     origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
 768
 769                 dsl_dataset_rele(origin, FTAG);
 770         }
 771         return (0);
 772 }
 773
 774 void
 775 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 776 {
 777         uint64_t obj;
 778
 779         ASSERT(dmu_tx_is_syncing(tx));
 780
 781         (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 782         VERIFY0(dsl_pool_open_special_dir(dp,
 783             FREE_DIR_NAME, &dp->dp_free_dir));
 784
 785         /*
 786          * We can't use bpobj_alloc(), because spa_version() still
 787          * returns the old version, and we need a new-version bpobj with
 788          * subobj support.  So call dmu_object_alloc() directly.
 789          */
 790         obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 791             SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 792         VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 793             DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 794         VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 795
 796         VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 797             upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 798 }
 799
 800 void
 801 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 802 {
 803         uint64_t dsobj;
 804         dsl_dataset_t *ds;
 805
 806         ASSERT(dmu_tx_is_syncing(tx));
 807         ASSERT(dp->dp_origin_snap == NULL);
 808         ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 809
 810         /* create the origin dir, ds, & snap-ds */
 811         dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 812             NULL, 0, kcred, tx);
 813         VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 814         dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 815         VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 816             dp, &dp->dp_origin_snap));
 817         dsl_dataset_rele(ds, FTAG);
 818 }
 819
 820 taskq_t *
 821 dsl_pool_iput_taskq(dsl_pool_t *dp)
 822 {
 823         return (dp->dp_iput_taskq);
 824 }
 825
 826 /*
 827  * Walk through the pool-wide zap object of temporary snapshot user holds
 828  * and release them.
 829  */
 830 void
 831 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 832 {
 833         zap_attribute_t za;
 834         zap_cursor_t zc;
 835         objset_t *mos = dp->dp_meta_objset;
 836         uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 837
 838         if (zapobj == 0)
 839                 return;
 840         ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 841
 842         for (zap_cursor_init(&zc, mos, zapobj);
 843             zap_cursor_retrieve(&zc, &za) == 0;
 844             zap_cursor_advance(&zc)) {
 845                 char *htag;
 846                 uint64_t dsobj;
 847
 848                 htag = strchr(za.za_name, '-');
 849                 *htag = '\0';
 850                 ++htag;
 851                 dsobj = strtonum(za.za_name, NULL);
 852                 dsl_dataset_user_release_tmp(dp, dsobj, htag);
 853         }
 854         zap_cursor_fini(&zc);
 855 }
 856
 857 /*
 858  * Create the pool-wide zap object for storing temporary snapshot holds.
 859  */
 860 void
 861 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 862 {
 863         objset_t *mos = dp->dp_meta_objset;
 864
 865         ASSERT(dp->dp_tmp_userrefs_obj == 0);
 866         ASSERT(dmu_tx_is_syncing(tx));
 867
 868         dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 869             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 870 }
 871
 872 static int
 873 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 874     const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 875 {
 876         objset_t *mos = dp->dp_meta_objset;
 877         uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 878         char *name;
 879         int error;
 880
 881         ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 882         ASSERT(dmu_tx_is_syncing(tx));
 883
 884         /*
 885          * If the pool was created prior to SPA_VERSION_USERREFS, the
 886          * zap object for temporary holds might not exist yet.
 887          */
 888         if (zapobj == 0) {
 889                 if (holding) {
 890                         dsl_pool_user_hold_create_obj(dp, tx);
 891                         zapobj = dp->dp_tmp_userrefs_obj;
 892                 } else {
 893                         return (ENOENT);
 894                 }
 895         }
 896
 897         name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 898         if (holding)
 899                 error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 900         else
 901                 error = zap_remove(mos, zapobj, name, tx);
 902         strfree(name);
 903
 904         return (error);
 905 }
 906
 907 /*
 908  * Add a temporary hold for the given dataset object and tag.
 909  */
 910 int
 911 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 912     uint64_t now, dmu_tx_t *tx)
 913 {
 914         return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 915 }
 916
 917 /*
 918  * Release a temporary hold for the given dataset object and tag.
 919  */
 920 int
 921 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 922     dmu_tx_t *tx)
 923 {
 924         return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
 925             tx, B_FALSE));
 926 }
 927
 928 /*
 929  * DSL Pool Configuration Lock
 930  *
 931  * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 932  * creation / destruction / rename / property setting).  It must be held for
 933  * read to hold a dataset or dsl_dir.  I.e. you must call
 934  * dsl_pool_config_enter() or dsl_pool_hold() before calling
 935  * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 936  * must be held continuously until all datasets and dsl_dirs are released.
 937  *
 938  * The only exception to this rule is that if a "long hold" is placed on
 939  * a dataset, then the dp_config_rwlock may be dropped while the dataset
 940  * is still held.  The long hold will prevent the dataset from being
 941  * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 942  * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 943  * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 944  *
 945  * Legitimate long-holders (including owners) should be long-running, cancelable
 946  * tasks that should cause "zfs destroy" to fail.  This includes DMU
 947  * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 948  * "zfs send", and "zfs diff".  There are several other long-holders whose
 949  * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 950  *
 951  * The usual formula for long-holding would be:
 952  * dsl_pool_hold()
 953  * dsl_dataset_hold()
 954  * ... perform checks ...
 955  * dsl_dataset_long_hold()
 956  * dsl_pool_rele()
 957  * ... perform long-running task ...
 958  * dsl_dataset_long_rele()
 959  * dsl_dataset_rele()
 960  *
 961  * Note that when the long hold is released, the dataset is still held but
 962  * the pool is not held.  The dataset may change arbitrarily during this time
 963  * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 964  * dataset except release it.
 965  *
 966  * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 967  * or modifying operations.
 968  *
 969  * Modifying operations should generally use dsl_sync_task().  The synctask
 970  * infrastructure enforces proper locking strategy with respect to the
 971  * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
 972  *
 973  * Read-only operations will manually hold the pool, then the dataset, obtain
 974  * information from the dataset, then release the pool and dataset.
 975  * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 976  * hold/rele.
 977  */
 978
 979 int
 980 dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
 981 {
 982         spa_t *spa;
 983         int error;
 984
 985         error = spa_open(name, &spa, tag);
 986         if (error == 0) {
 987                 *dp = spa_get_dsl(spa);
 988                 dsl_pool_config_enter(*dp, tag);
 989         }
 990         return (error);
 991 }
 992
/*
 * Undo dsl_pool_hold(): leave the pool config lock and then close the
 * spa, both using 'tag'.
 */
void
dsl_pool_rele(dsl_pool_t *dp, void *tag)
{
        /* Drop the config lock first, while the spa is still open. */
        dsl_pool_config_exit(dp, tag);
        spa_close(dp->dp_spa, tag);
}
 999
/*
 * Acquire dp_config_rwlock as reader on behalf of 'tag'.  The calling
 * thread must not already hold the lock as reader (asserted below).
 */
void
dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
{
        /*
         * We use a "reentrant" reader-writer lock, but not reentrantly.
         *
         * The rrwlock can (with the track_all flag) track all reading threads,
         * which is very useful for debugging which code path failed to release
         * the lock, and for verifying that the *current* thread does hold
         * the lock.
         *
         * (Unlike a rwlock, which knows that N threads hold it for
         * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
         * if any thread holds it for read, even if this thread doesn't).
         */
        ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
        rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}
1018
/*
 * Release dp_config_rwlock on behalf of 'tag'; the counterpart of
 * dsl_pool_config_enter().
 */
void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
        rrw_exit(&dp->dp_config_rwlock, tag);
}
1024
1025 boolean_t
1026 dsl_pool_config_held(dsl_pool_t *dp)
1027 {
1028         return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
1029 }
1030
/*
 * Kernel-module glue, built only when compiling for the kernel with SPL:
 * symbol exports and registration of the write-throttle tunables as
 * module parameters.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Make the config lock enter/exit routines available to other modules. */
EXPORT_SYMBOL(dsl_pool_config_enter);
EXPORT_SYMBOL(dsl_pool_config_exit);

/*
 * Parameters registered with mode 0644 are writable by the owner;
 * those registered with mode 0444 are read-only.
 */
module_param(zfs_no_write_throttle, int, 0644);
MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling");

module_param(zfs_write_limit_shift, int, 0444);
MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");

module_param(zfs_txg_synctime_ms, int, 0644);
MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");

module_param(zfs_write_limit_min, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");

module_param(zfs_write_limit_max, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_max, "Max txg write limit");

module_param(zfs_write_limit_inflated, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_inflated, "Inflated txg write limit");

module_param(zfs_write_limit_override, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_override, "Override txg write limit");
#endif