port async unlinked drain from illumos-nexenta

[mirror_zfs.git] / module / zfs / dsl_pool.c
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c

index f1ab29c6214ce9cb192ec0953b8a465f9522660b..10e967ab91ed9218415c2cfa5db3c3628b20cfd4 100644 (file)
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -20,8 +20,10 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
   * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
   */
  
  #include <sys/dsl_pool.h>
@@ -41,10 +43,14 @@
  #include <sys/zfs_znode.h>
  #include <sys/spa_impl.h>
  #include <sys/dsl_deadlist.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
  #include <sys/bptree.h>
  #include <sys/zfeature.h>
  #include <sys/zil_impl.h>
  #include <sys/dsl_userhold.h>
+#include <sys/trace_txg.h>
+#include <sys/mmp.h>
  
  /*
   * ZFS Write Throttle
@@ -77,7 +83,7 @@
   * zfs_dirty_data_max determines the dirty space limit. Once that value is
   * exceeded, new writes are halted until space frees up.
   *
- * The zfs_dirty_data_sync tunable dictates the threshold at which we
+ * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we
   * ensure that there is a txg syncing (see the comment in txg.c for a full
   * description of transaction group stages).
   *
@@ -100,9 +106,11 @@ int zfs_dirty_data_max_percent = 10;
  int zfs_dirty_data_max_max_percent = 25;
  
  /*
- * If there is at least this much dirty data, push out a txg.
+ * If there's at least this much dirty data (as a percentage of
+ * zfs_dirty_data_max), push out a txg.  This should be less than
+ * zfs_vdev_async_write_active_min_dirty_percent.
   */
-unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024;
+int zfs_dirty_data_sync_percent = 20;
  
  /*
   * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
@@ -126,8 +134,40 @@ int zfs_delay_min_dirty_percent = 60;
   */
  unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
  
-hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
-hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
+/*
+ * This determines the number of threads used by the dp_sync_taskq.
+ */
+int zfs_sync_taskq_batch_pct = 75;
+
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that need to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc").
+ *
+ * The idea being, we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
  
  int
  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
@@ -136,7 +176,7 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
         int err;
  
         err = zap_lookup(dp->dp_meta_objset,
-           dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+           dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
             name, sizeof (obj), 1, &obj);
         if (err)
                 return (err);
@@ -155,21 +195,37 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
         dp->dp_meta_rootbp = *bp;
         rrw_init(&dp->dp_config_rwlock, B_TRUE);
         txg_init(dp, txg);
+       mmp_init(spa);
  
-       txg_list_create(&dp->dp_dirty_datasets,
+       txg_list_create(&dp->dp_dirty_datasets, spa,
             offsetof(dsl_dataset_t, ds_dirty_link));
-       txg_list_create(&dp->dp_dirty_zilogs,
+       txg_list_create(&dp->dp_dirty_zilogs, spa,
             offsetof(zilog_t, zl_dirty_link));
-       txg_list_create(&dp->dp_dirty_dirs,
+       txg_list_create(&dp->dp_dirty_dirs, spa,
             offsetof(dsl_dir_t, dd_dirty_link));
-       txg_list_create(&dp->dp_sync_tasks,
+       txg_list_create(&dp->dp_sync_tasks, spa,
             offsetof(dsl_sync_task_t, dst_node));
+       txg_list_create(&dp->dp_early_sync_tasks, spa,
+           offsetof(dsl_sync_task_t, dst_node));
+
+       dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
+           zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
+           TASKQ_THREADS_CPU_PCT);
+
+       dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+           zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+           zfs_zil_clean_taskq_minalloc,
+           zfs_zil_clean_taskq_maxalloc,
+           TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
  
         mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
         cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
  
-       dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
-           1, 4, 0);
+       dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
+           max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+       dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+           max_ncpus, defclsyspri, max_ncpus, INT_MAX,
+           TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
  
         return (dp);
  }
@@ -180,12 +236,20 @@ dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
         int err;
         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
  
+       /*
+        * Initialize the caller's dsl_pool_t structure before we actually open
+        * the meta objset.  This is done because a self-healing write zio may
+        * be issued as part of dmu_objset_open_impl() and the spa needs its
+        * dsl_pool_t initialized in order to handle the write.
+        */
+       *dpp = dp;
+
         err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
             &dp->dp_meta_objset);
-       if (err != 0)
+       if (err != 0) {
                 dsl_pool_close(dp);
-       else
-               *dpp = dp;
+               *dpp = NULL;
+       }
  
         return (err);
  }
@@ -218,11 +282,11 @@ dsl_pool_open(dsl_pool_t *dp)
                 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
                 if (err)
                         goto out;
-               err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
-                   FTAG, &ds);
+               err = dsl_dataset_hold_obj(dp,
+                   dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
                 if (err == 0) {
                         err = dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj, dp,
+                           dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
                             &dp->dp_origin_snap);
                         dsl_dataset_rele(ds, FTAG);
                 }
@@ -245,9 +309,25 @@ dsl_pool_open(dsl_pool_t *dp)
                     dp->dp_meta_objset, obj));
         }
  
+       if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
+               if (err == 0) {
+                       VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
+                           dp->dp_meta_objset, obj));
+               } else if (err == ENOENT) {
+                       /*
+                        * We might not have created the remap bpobj yet.
+                        */
+                       err = 0;
+               } else {
+                       goto out;
+               }
+       }
+
         /*
-        * Note: errors ignored, because the leak dir will not exist if we
-        * have not encountered a leak yet.
+        * Note: errors ignored, because these special dirs, used for
+        * space accounting, are only created on demand.
          */
         (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
             &dp->dp_leak_dir);
@@ -293,46 +373,99 @@ dsl_pool_close(dsl_pool_t *dp)
          * includes pool-opening context), it actually only got a "ref"
          * and not a hold, so just drop that here.
          */
-       if (dp->dp_origin_snap)
+       if (dp->dp_origin_snap != NULL)
                 dsl_dataset_rele(dp->dp_origin_snap, dp);
-       if (dp->dp_mos_dir)
+       if (dp->dp_mos_dir != NULL)
                 dsl_dir_rele(dp->dp_mos_dir, dp);
-       if (dp->dp_free_dir)
+       if (dp->dp_free_dir != NULL)
                 dsl_dir_rele(dp->dp_free_dir, dp);
-       if (dp->dp_leak_dir)
+       if (dp->dp_leak_dir != NULL)
                 dsl_dir_rele(dp->dp_leak_dir, dp);
-       if (dp->dp_root_dir)
+       if (dp->dp_root_dir != NULL)
                 dsl_dir_rele(dp->dp_root_dir, dp);
  
         bpobj_close(&dp->dp_free_bpobj);
+       bpobj_close(&dp->dp_obsolete_bpobj);
  
         /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
-       if (dp->dp_meta_objset)
+       if (dp->dp_meta_objset != NULL)
                 dmu_objset_evict(dp->dp_meta_objset);
  
         txg_list_destroy(&dp->dp_dirty_datasets);
         txg_list_destroy(&dp->dp_dirty_zilogs);
         txg_list_destroy(&dp->dp_sync_tasks);
+       txg_list_destroy(&dp->dp_early_sync_tasks);
         txg_list_destroy(&dp->dp_dirty_dirs);
  
-       arc_flush(dp->dp_spa);
+       taskq_destroy(dp->dp_zil_clean_taskq);
+       taskq_destroy(dp->dp_sync_taskq);
+
+       /*
+        * We can't set retry to TRUE since we're explicitly specifying
+        * a spa to flush. This is good enough; any missed buffers for
+        * this spa won't cause trouble, and they'll eventually fall
+        * out of the ARC just like any other unused buffer.
+        */
+       arc_flush(dp->dp_spa, FALSE);
+
+       mmp_fini(dp->dp_spa);
         txg_fini(dp);
         dsl_scan_fini(dp);
+       dmu_buf_user_evict_wait();
+
         rrw_destroy(&dp->dp_config_rwlock);
         mutex_destroy(&dp->dp_lock);
+       cv_destroy(&dp->dp_spaceavail_cv);
+       taskq_destroy(dp->dp_unlinked_drain_taskq);
         taskq_destroy(dp->dp_iput_taskq);
-       if (dp->dp_blkstats)
-               kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+       if (dp->dp_blkstats != NULL) {
+               mutex_destroy(&dp->dp_blkstats->zab_lock);
+               vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+       }
         kmem_free(dp, sizeof (dsl_pool_t));
  }
  
+void
+dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       uint64_t obj;
+       /*
+        * Currently, we only create the obsolete_bpobj where there are
+        * indirect vdevs with referenced mappings.
+        */
+       ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
+       /* create and open the obsolete_bpobj */
+       obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+       VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
+       VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+       spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+       VERIFY0(zap_remove(dp->dp_meta_objset,
+           DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_OBSOLETE_BPOBJ, tx));
+       bpobj_free(dp->dp_meta_objset,
+           dp->dp_obsolete_bpobj.bpo_object, tx);
+       bpobj_close(&dp->dp_obsolete_bpobj);
+}
+
  dsl_pool_t *
-dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
+    uint64_t txg)
  {
         int err;
         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
         dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+#ifdef _KERNEL
         objset_t *os;
+#else
+       objset_t *os __attribute__((unused));
+#endif
         dsl_dataset_t *ds;
         uint64_t obj;
  
@@ -341,6 +474,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
         /* create and open the MOS (meta-objset) */
         dp->dp_meta_objset = dmu_objset_create_impl(spa,
             NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
+       spa->spa_meta_objset = dp->dp_meta_objset;
  
         /* create the pool directory */
         err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -368,7 +502,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
                     FREE_DIR_NAME, &dp->dp_free_dir));
  
                 /* create and open the free_bplist */
-               obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
                 VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
                     DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
                 VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -378,17 +512,31 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
         if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
                 dsl_pool_create_origin(dp, tx);
  
+       /*
+        * Some features may be needed when creating the root dataset, so we
+        * create the feature objects here.
+        */
+       if (spa_version(spa) >= SPA_VERSION_FEATURES)
+               spa_feature_create_zap_objects(spa, tx);
+
+       if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
+           dcp->cp_crypt != ZIO_CRYPT_INHERIT)
+               spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);
+
         /* create the root dataset */
-       obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+       obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
  
         /* create the root objset */
-       VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
-       VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds,
-           dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx)));
+       VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
+           DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
+       rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+       os = dmu_objset_create_impl(dp->dp_spa, ds,
+           dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+       rrw_exit(&ds->ds_bp_rwlock, FTAG);
  #ifdef _KERNEL
         zfs_create_fs(os, kcred, zplprops, tx);
  #endif
-       dsl_dataset_rele(ds, FTAG);
+       dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
  
         dmu_tx_commit(tx);
  
@@ -412,14 +560,6 @@ dsl_pool_mos_diduse_space(dsl_pool_t *dp,
         mutex_exit(&dp->dp_lock);
  }
  
-static int
-deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
-{
-       dsl_deadlist_t *dl = arg;
-       dsl_deadlist_insert(dl, bp, tx);
-       return (0);
-}
-
  static void
  dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
  {
@@ -444,10 +584,33 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
          * Note: we signal even when increasing dp_dirty_total.
          * This ensures forward progress -- each thread wakes the next waiter.
          */
-       if (dp->dp_dirty_total <= zfs_dirty_data_max)
+       if (dp->dp_dirty_total < zfs_dirty_data_max)
                 cv_signal(&dp->dp_spaceavail_cv);
  }
  
+#ifdef ZFS_DEBUG
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+       spa_t *spa = dp->dp_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+               txg_list_t *tl = &vd->vdev_ms_list;
+               metaslab_t *ms;
+
+               for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+                   ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+                       VERIFY(range_tree_is_empty(ms->ms_freeing));
+                       VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+               }
+       }
+
+       return (B_TRUE);
+}
+#endif
+
  void
  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
  {
@@ -463,6 +626,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
  
         tx = dmu_tx_create_assigned(dp, txg);
  
+       /*
+        * Run all early sync tasks before writing out any dirty blocks.
+        * For more info on early sync tasks see block comment in
+        * dsl_early_sync_task().
+        */
+       if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+               dsl_sync_task_t *dst;
+
+               ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+               while ((dst =
+                   txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+                       ASSERT(dsl_early_sync_task_verify(dp, txg));
+                       dsl_sync_task_sync(dst, tx);
+               }
+               ASSERT(dsl_early_sync_task_verify(dp, txg));
+       }
+
         /*
          * Write out all dirty blocks of dirty datasets.
          */
@@ -488,14 +668,27 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          */
         dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
  
+       /*
+        * Update the long range free counter after
+        * we're done syncing user data
+        */
+       mutex_enter(&dp->dp_lock);
+       ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
+           dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
+       dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
+       mutex_exit(&dp->dp_lock);
+
         /*
          * After the data blocks have been written (ensured by the zio_wait()
-        * above), update the user/group space accounting.
+        * above), update the user/group/project space accounting.  This happens
+        * in tasks dispatched to dp_sync_taskq, so wait for them before
+        * continuing.
          */
         for (ds = list_head(&synced_datasets); ds != NULL;
             ds = list_next(&synced_datasets, ds)) {
                 dmu_objset_do_userquota_updates(ds->ds_objset, tx);
         }
+       taskq_wait(dp->dp_sync_taskq);
  
         /*
          * Sync the datasets again to push out the changes due to
@@ -506,9 +699,22 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          */
         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
         while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+               objset_t *os = ds->ds_objset;
+
                 ASSERT(list_link_active(&ds->ds_synced_link));
                 dmu_buf_rele(ds->ds_dbuf, ds);
                 dsl_dataset_sync(ds, zio, tx);
+
+               /*
+                * Release any key mappings created by calls to
+                * dsl_dataset_dirty() from the userquota accounting
+                * code paths.
+                */
+               if (os->os_encrypted && !os->os_raw_receive &&
+                   !os->os_next_write_raw[txg & TXG_MASK]) {
+                       ASSERT3P(ds->ds_key_mapping, !=, NULL);
+                       key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+               }
         }
         VERIFY0(zio_wait(zio));
  
@@ -518,13 +724,18 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          *
          *  - move dead blocks from the pending deadlist to the on-disk deadlist
          *  - release hold from dsl_dataset_dirty()
+        *  - release key mapping hold from dsl_dataset_dirty()
          */
         while ((ds = list_remove_head(&synced_datasets)) != NULL) {
-               ASSERTV(objset_t *os = ds->ds_objset);
-               bplist_iterate(&ds->ds_pending_deadlist,
-                   deadlist_enqueue_cb, &ds->ds_deadlist, tx);
-               ASSERT(!dmu_objset_is_dirty(os, txg));
-               dmu_buf_rele(ds->ds_dbuf, ds);
+               objset_t *os = ds->ds_objset;
+
+               if (os->os_encrypted && !os->os_raw_receive &&
+                   !os->os_next_write_raw[txg & TXG_MASK]) {
+                       ASSERT3P(ds->ds_key_mapping, !=, NULL);
+                       key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+               }
+
+               dsl_dataset_sync_done(ds, tx);
         }
  
         while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
@@ -547,8 +758,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                 dp->dp_mos_uncompressed_delta = 0;
         }
  
-       if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
-           list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+       if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
                 dsl_pool_sync_mos(dp, tx);
         }
  
@@ -582,9 +792,16 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
  {
         zilog_t *zilog;
  
-       while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
+       while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) {
                 dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+               /*
+                * We don't remove the zilog from the dp_dirty_zilogs
+                * list until after we've cleaned it. This ensures that
+                * callers of zilog_is_dirty() receive an accurate
+                * answer when they are racing with the spa sync thread.
+                */
                 zil_clean(zilog, txg);
+               (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
                 ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
                 dmu_buf_rele(ds->ds_dbuf, zilog);
         }
@@ -599,30 +816,70 @@ int
  dsl_pool_sync_context(dsl_pool_t *dp)
  {
         return (curthread == dp->dp_tx.tx_sync_thread ||
-           spa_is_initializing(dp->dp_spa));
+           spa_is_initializing(dp->dp_spa) ||
+           taskq_member(dp->dp_sync_taskq, curthread));
  }
  
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter 2 are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. Now the DMU is aware of that extra space tracked by the SPA
+ * without having to maintain a separate special dir (e.g. similar to
+ * $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
  uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
  {
-       uint64_t space, resv;
-
-       /*
-        * Reserve about 1.6% (1/64), or at least 32MB, for allocation
-        * efficiency.
-        * XXX The intent log is not accounted for, so it must fit
-        * within this slop.
-        *
-        * If we're trying to assess whether it's OK to do a free,
-        * cut the reservation in half to allow forward progress
-        * (e.g. make it possible to rm(1) files from a full pool).
-        */
-       space = spa_get_dspace(dp->dp_spa);
-       resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
-       if (netfree)
+       spa_t *spa = dp->dp_spa;
+       uint64_t space, resv, adjustedsize;
+       uint64_t spa_deferred_frees =
+           spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+       space = spa_get_dspace(spa)
+           - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+       resv = spa_get_slop_space(spa);
+
+       switch (slop_policy) {
+       case ZFS_SPACE_CHECK_NORMAL:
+               break;
+       case ZFS_SPACE_CHECK_RESERVED:
                 resv >>= 1;
+               break;
+       case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+               resv >>= 2;
+               break;
+       case ZFS_SPACE_CHECK_NONE:
+               resv = 0;
+               break;
+       default:
+               panic("invalid slop policy value: %d", slop_policy);
+               break;
+       }
+       adjustedsize = (space >= resv) ? (space - resv) : 0;
  
-       return (space - resv);
+       return (adjustedsize);
+}
+
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+       uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+       uint64_t deferred =
+           metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+       uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+       return (quota);
  }
  
  boolean_t
@@ -630,10 +887,12 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
  {
         uint64_t delay_min_bytes =
             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+       uint64_t dirty_min_bytes =
+           zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
         boolean_t rv;
  
         mutex_enter(&dp->dp_lock);
-       if (dp->dp_dirty_total > zfs_dirty_data_sync)
+       if (dp->dp_dirty_total > dirty_min_bytes)
                 txg_kick(dp);
         rv = (dp->dp_dirty_total > delay_min_bytes);
         mutex_exit(&dp->dp_lock);
@@ -682,15 +941,15 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
         if (err)
                 return (err);
  
-       while (ds->ds_phys->ds_prev_snap_obj != 0) {
-               err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-                   FTAG, &prev);
+       while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+               err = dsl_dataset_hold_obj(dp,
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
                 if (err) {
                         dsl_dataset_rele(ds, FTAG);
                         return (err);
                 }
  
-               if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+               if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
                         break;
                 dsl_dataset_rele(ds, FTAG);
                 ds = prev;
@@ -704,7 +963,9 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
                  * The $ORIGIN can't have any data, or the accounting
                  * will be wrong.
                  */
-               ASSERT0(prev->ds_phys->ds_bp.blk_birth);
+               rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+               ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+               rrw_exit(&ds->ds_bp_rwlock, FTAG);
  
                 /* The origin doesn't get attached to itself */
                 if (ds->ds_object == prev->ds_object) {
@@ -713,33 +974,35 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
                 }
  
                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
-               ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
-               ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+               dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+               dsl_dataset_phys(ds)->ds_prev_snap_txg =
+                   dsl_dataset_phys(prev)->ds_creation_txg;
  
                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-               ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+               dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
  
                 dmu_buf_will_dirty(prev->ds_dbuf, tx);
-               prev->ds_phys->ds_num_children++;
+               dsl_dataset_phys(prev)->ds_num_children++;
  
-               if (ds->ds_phys->ds_next_snap_obj == 0) {
+               if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
                         ASSERT(ds->ds_prev == NULL);
                         VERIFY0(dsl_dataset_hold_obj(dp,
-                           ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+                           dsl_dataset_phys(ds)->ds_prev_snap_obj,
+                           ds, &ds->ds_prev));
                 }
         }
  
-       ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
-       ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
+       ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+       ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
  
-       if (prev->ds_phys->ds_next_clones_obj == 0) {
+       if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
                 dmu_buf_will_dirty(prev->ds_dbuf, tx);
-               prev->ds_phys->ds_next_clones_obj =
+               dsl_dataset_phys(prev)->ds_next_clones_obj =
                     zap_create(dp->dp_meta_objset,
                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
         }
         VERIFY0(zap_add_int(dp->dp_meta_objset,
-           prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+           dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
  
         dsl_dataset_rele(ds, FTAG);
         if (prev != dp->dp_origin_snap)
@@ -754,7 +1017,7 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
         ASSERT(dp->dp_origin_snap != NULL);
  
         VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
-           tx, DS_FIND_CHILDREN));
+           tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
  }
  
  /* ARGSUSED */
@@ -764,20 +1027,22 @@ upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
         dmu_tx_t *tx = arg;
         objset_t *mos = dp->dp_meta_objset;
  
-       if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+       if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
                 dsl_dataset_t *origin;
  
                 VERIFY0(dsl_dataset_hold_obj(dp,
-                   ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+                   dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
  
-               if (origin->ds_dir->dd_phys->dd_clones == 0) {
+               if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
                         dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
-                       origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
-                           DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+                       dsl_dir_phys(origin->ds_dir)->dd_clones =
+                           zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+                           0, tx);
                 }
  
                 VERIFY0(zap_add_int(dp->dp_meta_objset,
-                   origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
+                   dsl_dir_phys(origin->ds_dir)->dd_clones,
+                   ds->ds_object, tx));
  
                 dsl_dataset_rele(origin, FTAG);
         }
@@ -801,13 +1066,13 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
          * subobj support.  So call dmu_object_alloc() directly.
          */
         obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
-           SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+           SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
         VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
             DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
         VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
  
         VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-           upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+           upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
  }
  
  void
@@ -822,10 +1087,10 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
  
         /* create the origin dir, ds, & snap-ds */
         dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
-           NULL, 0, kcred, tx);
+           NULL, 0, kcred, NULL, tx);
         VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
         dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
-       VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+       VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
             dp, &dp->dp_origin_snap));
         dsl_dataset_rele(ds, FTAG);
  }
@@ -836,6 +1101,12 @@ dsl_pool_iput_taskq(dsl_pool_t *dp)
         return (dp->dp_iput_taskq);
  }
  
+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+       return (dp->dp_unlinked_drain_taskq);
+}
+
  /*
   * Walk through the pool-wide zap object of temporary snapshot user holds
   * and release them.
@@ -1040,6 +1311,13 @@ dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
         rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
  }
  
+void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+       ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+       rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
  void
  dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
  {
@@ -1052,10 +1330,17 @@ dsl_pool_config_held(dsl_pool_t *dp)
         return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
  }
  
-#if defined(_KERNEL) && defined(HAVE_SPL)
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+       return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
+
+#if defined(_KERNEL)
  EXPORT_SYMBOL(dsl_pool_config_enter);
  EXPORT_SYMBOL(dsl_pool_config_exit);
  
+/* BEGIN CSTYLED */
  /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
  module_param(zfs_dirty_data_max_percent, int, 0444);
  MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty");
@@ -1076,9 +1361,28 @@ module_param(zfs_dirty_data_max_max, ulong, 0444);
  MODULE_PARM_DESC(zfs_dirty_data_max_max,
         "zfs_dirty_data_max upper bound in bytes");
  
-module_param(zfs_dirty_data_sync, ulong, 0644);
-MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");
+module_param(zfs_dirty_data_sync_percent, int, 0644);
+MODULE_PARM_DESC(zfs_dirty_data_sync_percent,
+       "dirty data txg sync threshold as a percentage of zfs_dirty_data_max");
  
  module_param(zfs_delay_scale, ulong, 0644);
  MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");
+
+module_param(zfs_sync_taskq_batch_pct, int, 0644);
+MODULE_PARM_DESC(zfs_sync_taskq_batch_pct,
+       "max percent of CPUs that are used to sync dirty data");
+
+module_param(zfs_zil_clean_taskq_nthr_pct, int, 0644);
+MODULE_PARM_DESC(zfs_zil_clean_taskq_nthr_pct,
+       "max percent of CPUs that are used per dp_zil_clean_taskq");
+
+module_param(zfs_zil_clean_taskq_minalloc, int, 0644);
+MODULE_PARM_DESC(zfs_zil_clean_taskq_minalloc,
+       "number of taskq entries that are pre-populated");
+
+module_param(zfs_zil_clean_taskq_maxalloc, int, 0644);
+MODULE_PARM_DESC(zfs_zil_clean_taskq_maxalloc,
+       "max number of taskq entries that are cached");
+
+/* END CSTYLED */
  #endif