port async unlinked drain from illumos-nexenta

[mirror_zfs.git] / module / zfs / dsl_pool.c
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c

index 1bb49c13ae15cced8c3b625235ee5002c8f8c926..10e967ab91ed9218415c2cfa5db3c3628b20cfd4 100644 (file)
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -43,6 +43,8 @@
  #include <sys/zfs_znode.h>
  #include <sys/spa_impl.h>
  #include <sys/dsl_deadlist.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
  #include <sys/bptree.h>
  #include <sys/zfeature.h>
  #include <sys/zil_impl.h>
@@ -81,7 +83,7 @@
   * zfs_dirty_data_max determines the dirty space limit. Once that value is
   * exceeded, new writes are halted until space frees up.
   *
- * The zfs_dirty_data_sync tunable dictates the threshold at which we
+ * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we
   * ensure that there is a txg syncing (see the comment in txg.c for a full
   * description of transaction group stages).
   *
@@ -104,9 +106,11 @@ int zfs_dirty_data_max_percent = 10;
  int zfs_dirty_data_max_max_percent = 25;
  
  /*
- * If there is at least this much dirty data, push out a txg.
+ * If there's at least this much dirty data (as a percentage of
+ * zfs_dirty_data_max), push out a txg.  This should be less than
+ * zfs_vdev_async_write_active_min_dirty_percent.
   */
-unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024;
+int zfs_dirty_data_sync_percent = 20;
  
  /*
   * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
@@ -201,6 +205,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
             offsetof(dsl_dir_t, dd_dirty_link));
         txg_list_create(&dp->dp_sync_tasks, spa,
             offsetof(dsl_sync_task_t, dst_node));
+       txg_list_create(&dp->dp_early_sync_tasks, spa,
+           offsetof(dsl_sync_task_t, dst_node));
  
         dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
             zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
@@ -217,6 +223,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
  
         dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
             max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+       dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+           max_ncpus, defclsyspri, max_ncpus, INT_MAX,
+           TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
  
         return (dp);
  }
@@ -385,6 +394,7 @@ dsl_pool_close(dsl_pool_t *dp)
         txg_list_destroy(&dp->dp_dirty_datasets);
         txg_list_destroy(&dp->dp_dirty_zilogs);
         txg_list_destroy(&dp->dp_sync_tasks);
+       txg_list_destroy(&dp->dp_early_sync_tasks);
         txg_list_destroy(&dp->dp_dirty_dirs);
  
         taskq_destroy(dp->dp_zil_clean_taskq);
@@ -406,6 +416,7 @@ dsl_pool_close(dsl_pool_t *dp)
         rrw_destroy(&dp->dp_config_rwlock);
         mutex_destroy(&dp->dp_lock);
         cv_destroy(&dp->dp_spaceavail_cv);
+       taskq_destroy(dp->dp_unlinked_drain_taskq);
         taskq_destroy(dp->dp_iput_taskq);
         if (dp->dp_blkstats != NULL) {
                 mutex_destroy(&dp->dp_blkstats->zab_lock);
@@ -450,6 +461,11 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
         int err;
         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
         dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+#ifdef _KERNEL
+       objset_t *os;
+#else
+       objset_t *os __attribute__((unused));
+#endif
         dsl_dataset_t *ds;
         uint64_t obj;
  
@@ -511,18 +527,16 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
         obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
  
         /* create the root objset */
-       VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+       VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
+           DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
+       rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+       os = dmu_objset_create_impl(dp->dp_spa, ds,
+           dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+       rrw_exit(&ds->ds_bp_rwlock, FTAG);
  #ifdef _KERNEL
-       {
-               objset_t *os;
-               rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-               os = dmu_objset_create_impl(dp->dp_spa, ds,
-                   dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
-               rrw_exit(&ds->ds_bp_rwlock, FTAG);
-               zfs_create_fs(os, kcred, zplprops, tx);
-       }
+       zfs_create_fs(os, kcred, zplprops, tx);
  #endif
-       dsl_dataset_rele(ds, FTAG);
+       dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
  
         dmu_tx_commit(tx);
  
@@ -574,6 +588,29 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
                 cv_signal(&dp->dp_spaceavail_cv);
  }
  
+#ifdef ZFS_DEBUG
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+       spa_t *spa = dp->dp_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+               txg_list_t *tl = &vd->vdev_ms_list;
+               metaslab_t *ms;
+
+               for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+                   ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+                       VERIFY(range_tree_is_empty(ms->ms_freeing));
+                       VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+               }
+       }
+
+       return (B_TRUE);
+}
+#endif
+
  void
  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
  {
@@ -589,6 +626,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
  
         tx = dmu_tx_create_assigned(dp, txg);
  
+       /*
+        * Run all early sync tasks before writing out any dirty blocks.
+        * For more info on early sync tasks see block comment in
+        * dsl_early_sync_task().
+        */
+       if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+               dsl_sync_task_t *dst;
+
+               ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+               while ((dst =
+                   txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+                       ASSERT(dsl_early_sync_task_verify(dp, txg));
+                       dsl_sync_task_sync(dst, tx);
+               }
+               ASSERT(dsl_early_sync_task_verify(dp, txg));
+       }
+
         /*
          * Write out all dirty blocks of dirty datasets.
          */
@@ -645,9 +699,22 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          */
         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
         while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+               objset_t *os = ds->ds_objset;
+
                 ASSERT(list_link_active(&ds->ds_synced_link));
                 dmu_buf_rele(ds->ds_dbuf, ds);
                 dsl_dataset_sync(ds, zio, tx);
+
+               /*
+                * Release any key mappings created by calls to
+                * dsl_dataset_dirty() from the userquota accounting
+                * code paths.
+                */
+               if (os->os_encrypted && !os->os_raw_receive &&
+                   !os->os_next_write_raw[txg & TXG_MASK]) {
+                       ASSERT3P(ds->ds_key_mapping, !=, NULL);
+                       key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+               }
         }
         VERIFY0(zio_wait(zio));
  
@@ -657,8 +724,17 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          *
          *  - move dead blocks from the pending deadlist to the on-disk deadlist
          *  - release hold from dsl_dataset_dirty()
+        *  - release key mapping hold from dsl_dataset_dirty()
          */
         while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+               objset_t *os = ds->ds_objset;
+
+               if (os->os_encrypted && !os->os_raw_receive &&
+                   !os->os_next_write_raw[txg & TXG_MASK]) {
+                       ASSERT3P(ds->ds_key_mapping, !=, NULL);
+                       key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+               }
+
                 dsl_dataset_sync_done(ds, tx);
         }
  
@@ -744,22 +820,66 @@ dsl_pool_sync_context(dsl_pool_t *dp)
             taskq_member(dp->dp_sync_taskq, curthread));
  }
  
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter 2 are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. Now the DMU is aware of that extra space tracked by the SPA
+ * without having to maintain a separate special dir (e.g. similar to
+ * $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
  uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
  {
-       uint64_t space, resv;
-
-       /*
-        * If we're trying to assess whether it's OK to do a free,
-        * cut the reservation in half to allow forward progress
-        * (e.g. make it possible to rm(1) files from a full pool).
-        */
-       space = spa_get_dspace(dp->dp_spa);
-       resv = spa_get_slop_space(dp->dp_spa);
-       if (netfree)
+       spa_t *spa = dp->dp_spa;
+       uint64_t space, resv, adjustedsize;
+       uint64_t spa_deferred_frees =
+           spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+       space = spa_get_dspace(spa)
+           - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+       resv = spa_get_slop_space(spa);
+
+       switch (slop_policy) {
+       case ZFS_SPACE_CHECK_NORMAL:
+               break;
+       case ZFS_SPACE_CHECK_RESERVED:
                 resv >>= 1;
+               break;
+       case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+               resv >>= 2;
+               break;
+       case ZFS_SPACE_CHECK_NONE:
+               resv = 0;
+               break;
+       default:
+               panic("invalid slop policy value: %d", slop_policy);
+               break;
+       }
+       adjustedsize = (space >= resv) ? (space - resv) : 0;
+
+       return (adjustedsize);
+}
  
-       return (space - resv);
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+       uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+       uint64_t deferred =
+           metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+       uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+       return (quota);
  }
  
  boolean_t
@@ -767,10 +887,12 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
  {
         uint64_t delay_min_bytes =
             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+       uint64_t dirty_min_bytes =
+           zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
         boolean_t rv;
  
         mutex_enter(&dp->dp_lock);
-       if (dp->dp_dirty_total > zfs_dirty_data_sync)
+       if (dp->dp_dirty_total > dirty_min_bytes)
                 txg_kick(dp);
         rv = (dp->dp_dirty_total > delay_min_bytes);
         mutex_exit(&dp->dp_lock);
@@ -979,6 +1101,12 @@ dsl_pool_iput_taskq(dsl_pool_t *dp)
         return (dp->dp_iput_taskq);
  }
  
+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+       return (dp->dp_unlinked_drain_taskq);
+}
+
  /*
   * Walk through the pool-wide zap object of temporary snapshot user holds
   * and release them.
@@ -1233,8 +1361,9 @@ module_param(zfs_dirty_data_max_max, ulong, 0444);
  MODULE_PARM_DESC(zfs_dirty_data_max_max,
         "zfs_dirty_data_max upper bound in bytes");
  
-module_param(zfs_dirty_data_sync, ulong, 0644);
-MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");
+module_param(zfs_dirty_data_sync_percent, int, 0644);
+MODULE_PARM_DESC(zfs_dirty_data_sync_percent,
+       "dirty data txg sync threshold as a percentage of zfs_dirty_data_max");
  
  module_param(zfs_delay_scale, ulong, 0644);
  MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");