Restrict kstats and print real pointers

[mirror_zfs.git] / module / zfs / metaslab.c
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c

index 9f6f0048fcbad4e6688ed835e6a32c124069b504..ec89810b48abeed5f31355537803f2d9634e03ee 100644 (file)
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -181,7 +181,6 @@ int metaslab_lba_weighting_enabled = B_TRUE;
   */
  int metaslab_bias_enabled = B_TRUE;
  
-
  /*
   * Enable/disable remapping of indirect DVAs to their concrete vdevs.
   */
@@ -219,6 +218,12 @@ boolean_t metaslab_trace_enabled = B_TRUE;
  uint64_t metaslab_trace_max_entries = 5000;
  #endif
  
+/*
+ * Maximum number of metaslabs per group that can be disabled
+ * simultaneously.
+ */
+int max_disabled_ms = 3;
+
  static uint64_t metaslab_weight(metaslab_t *);
  static void metaslab_set_fragmentation(metaslab_t *);
  static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -652,8 +657,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
  
         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
-       cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
+       mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
         mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
             KM_SLEEP);
         mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -700,8 +705,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
         kmem_free(mg->mg_secondaries, mg->mg_allocators *
             sizeof (metaslab_t *));
         mutex_destroy(&mg->mg_lock);
-       mutex_destroy(&mg->mg_ms_initialize_lock);
-       cv_destroy(&mg->mg_ms_initialize_cv);
+       mutex_destroy(&mg->mg_ms_disabled_lock);
+       cv_destroy(&mg->mg_ms_disabled_cv);
  
         for (int i = 0; i < mg->mg_allocators; i++) {
                 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1682,8 +1687,10 @@ metaslab_load_impl(metaslab_t *msp)
         mutex_enter(&msp->ms_lock);
         ASSERT(!msp->ms_condensing);
  
-       if (error != 0)
+       if (error != 0) {
+               mutex_exit(&msp->ms_sync_lock);
                 return (error);
+       }
  
         ASSERT3P(msp->ms_group, !=, NULL);
         msp->ms_loaded = B_TRUE;
@@ -1844,8 +1851,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
          */
         ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
             &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
-       metaslab_group_add(mg, ms);
  
+       ms->ms_trim = range_tree_create(NULL, NULL);
+
+       metaslab_group_add(mg, ms);
         metaslab_set_fragmentation(ms);
  
         /*
@@ -1919,6 +1928,9 @@ metaslab_fini(metaslab_t *msp)
         for (int t = 0; t < TXG_SIZE; t++)
                 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
  
+       range_tree_vacate(msp->ms_trim, NULL, NULL);
+       range_tree_destroy(msp->ms_trim);
+
         mutex_exit(&msp->ms_lock);
         cv_destroy(&msp->ms_load_cv);
         mutex_destroy(&msp->ms_lock);
@@ -2631,7 +2643,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
         ASSERT(msp->ms_loaded);
  
  
-       zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
+       zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
             "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
             msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
             msp->ms_group->mg_vd->vdev_spa->spa_name,
@@ -2725,6 +2737,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
         ASSERT3P(msp->ms_freeing, !=, NULL);
         ASSERT3P(msp->ms_freed, !=, NULL);
         ASSERT3P(msp->ms_checkpointing, !=, NULL);
+       ASSERT3P(msp->ms_trim, !=, NULL);
  
         /*
          * Normally, we don't want to process a metaslab if there are no
@@ -2997,6 +3010,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
          */
         metaslab_load_wait(msp);
  
+       /*
+        * When auto-trimming is enabled, free ranges which are added to
+        * ms_allocatable are also be added to ms_trim.  The ms_trim tree is
+        * periodically consumed by the vdev_autotrim_thread() which issues
+        * trims for all ranges and then vacates the tree.  The ms_trim tree
+        * can be discarded at any time with the sole consequence of recent
+        * frees not being trimmed.
+        */
+       if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
+               range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
+               if (!defer_allowed) {
+                       range_tree_walk(msp->ms_freed, range_tree_add,
+                           msp->ms_trim);
+               }
+       } else {
+               range_tree_vacate(msp->ms_trim, NULL, NULL);
+       }
+
         /*
          * Move the frees from the defer_tree back to the free
          * range tree (if it's loaded). Swap the freed_tree and
@@ -3045,7 +3076,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
          * from it in 'metaslab_unload_delay' txgs, then unload it.
          */
         if (msp->ms_loaded &&
-           msp->ms_initializing == 0 &&
+           msp->ms_disabled == 0 &&
             msp->ms_selected_txg + metaslab_unload_delay < txg) {
  
                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
@@ -3328,7 +3359,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
         metaslab_class_t *mc = msp->ms_group->mg_class;
  
         VERIFY(!msp->ms_condensing);
-       VERIFY0(msp->ms_initializing);
+       VERIFY0(msp->ms_disabled);
  
         start = mc->mc_ops->msop_alloc(msp, size);
         if (start != -1ULL) {
@@ -3339,6 +3370,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
                 range_tree_remove(rt, start, size);
+               range_tree_clear(msp->ms_trim, start, size);
  
                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
@@ -3389,10 +3421,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
                 }
  
                 /*
-                * If the selected metaslab is condensing or being
-                * initialized, skip it.
+                * If the selected metaslab is condensing or disabled,
+                * skip it.
                  */
-               if (msp->ms_condensing || msp->ms_initializing > 0)
+               if (msp->ms_condensing || msp->ms_disabled > 0)
                         continue;
  
                 *was_active = msp->ms_allocator != -1;
@@ -3564,9 +3596,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
                             ~METASLAB_ACTIVE_MASK);
                         mutex_exit(&msp->ms_lock);
                         continue;
-               } else if (msp->ms_initializing > 0) {
+               } else if (msp->ms_disabled > 0) {
                         metaslab_trace_add(zal, mg, msp, asize, d,
-                           TRACE_INITIALIZING, allocator);
+                           TRACE_DISABLED, allocator);
                         metaslab_passivate(msp, msp->ms_weight &
                             ~METASLAB_ACTIVE_MASK);
                         mutex_exit(&msp->ms_lock);
@@ -4292,6 +4324,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
         VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
             msp->ms_size);
         range_tree_remove(msp->ms_allocatable, offset, size);
+       range_tree_clear(msp->ms_trim, offset, size);
  
         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
@@ -4604,6 +4637,7 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
                     offset, size);
         }
  
+       range_tree_verify_not_present(msp->ms_trim, offset, size);
         range_tree_verify_not_present(msp->ms_freeing, offset, size);
         range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
         range_tree_verify_not_present(msp->ms_freed, offset, size);
@@ -4635,6 +4669,89 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
         spa_config_exit(spa, SCL_VDEV, FTAG);
  }
  
+static void
+metaslab_group_disable_wait(metaslab_group_t *mg)
+{
+       ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+       while (mg->mg_disabled_updating) {
+               cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+       }
+}
+
+static void
+metaslab_group_disabled_increment(metaslab_group_t *mg)
+{
+       ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+       ASSERT(mg->mg_disabled_updating);
+
+       while (mg->mg_ms_disabled >= max_disabled_ms) {
+               cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+       }
+       mg->mg_ms_disabled++;
+       ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
+}
+
+/*
+ * Mark the metaslab as disabled to prevent any allocations on this metaslab.
+ * We must also track how many metaslabs are currently disabled within a
+ * metaslab group and limit them to prevent allocation failures from
+ * occurring because all metaslabs are disabled.
+ */
+void
+metaslab_disable(metaslab_t *msp)
+{
+       ASSERT(!MUTEX_HELD(&msp->ms_lock));
+       metaslab_group_t *mg = msp->ms_group;
+
+       mutex_enter(&mg->mg_ms_disabled_lock);
+
+       /*
+        * To keep an accurate count of how many threads have disabled
+        * a specific metaslab group, we only allow one thread to mark
+        * the metaslab group at a time. This ensures that the value of
+        * ms_disabled will be accurate when we decide to mark a metaslab
+        * group as disabled. To do this we force all other threads
+        * to wait till the metaslab's mg_disabled_updating flag is no
+        * longer set.
+        */
+       metaslab_group_disable_wait(mg);
+       mg->mg_disabled_updating = B_TRUE;
+       if (msp->ms_disabled == 0) {
+               metaslab_group_disabled_increment(mg);
+       }
+       mutex_enter(&msp->ms_lock);
+       msp->ms_disabled++;
+       mutex_exit(&msp->ms_lock);
+
+       mg->mg_disabled_updating = B_FALSE;
+       cv_broadcast(&mg->mg_ms_disabled_cv);
+       mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
+void
+metaslab_enable(metaslab_t *msp, boolean_t sync)
+{
+       metaslab_group_t *mg = msp->ms_group;
+       spa_t *spa = mg->mg_vd->vdev_spa;
+
+       /*
+        * Wait for the outstanding IO to be synced to prevent newly
+        * allocated blocks from being overwritten.  This used by
+        * initialize and TRIM which are modifying unallocated space.
+        */
+       if (sync)
+               txg_wait_synced(spa_get_dsl(spa), 0);
+
+       mutex_enter(&mg->mg_ms_disabled_lock);
+       mutex_enter(&msp->ms_lock);
+       if (--msp->ms_disabled == 0) {
+               mg->mg_ms_disabled--;
+               cv_broadcast(&mg->mg_ms_disabled_cv);
+       }
+       mutex_exit(&msp->ms_lock);
+       mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
  #if defined(_KERNEL)
  /* BEGIN CSTYLED */
  module_param(metaslab_aliquot, ulong, 0644);