git.proxmox.com Git - mirror_zfs.git/commitdiff
Cap metaslab memory usage
authorPaul Dagnelie <pcd@delphix.com>
Fri, 16 Aug 2019 15:08:21 +0000 (08:08 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 16 Aug 2019 15:08:21 +0000 (09:08 -0600)
On systems with large amounts of storage and high fragmentation, a huge
amount of space can be used by storing metaslab range trees. Since
metaslabs are only unloaded during a txg sync, and only if they have
been inactive for 8 txgs, it is possible to get into a state where all
of the system's memory is consumed by range trees and metaslabs, and
txgs cannot sync. While ZFS knows how to evict ARC data when needed,
it has no such mechanism for range tree data. This can result in boot
hangs for some system configurations.

First, we add the ability to unload metaslabs outside of syncing
context. Second, we store a multilist of all loaded metaslabs, sorted
by their selection txg, so we can quickly identify the oldest
metaslabs.  We use a multilist to reduce lock contention during heavy
write workloads. Finally, we add logic that will unload a metaslab
when we're loading a new metaslab, if we're using more than a certain
fraction of the available memory on range trees.

Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #9128

include/sys/arc.h
include/sys/metaslab.h
include/sys/metaslab_impl.h
man/man5/zfs-module-parameters.5
module/zfs/arc.c
module/zfs/metaslab.c
module/zfs/spa.c
module/zfs/spa_log_spacemap.c
module/zfs/vdev.c
module/zfs/vdev_initialize.c
module/zfs/vdev_trim.c

index dc2fd03647f3b2d5fd2bd9fec2a576162563d698..59c0bea920229b4db0b153192a3c4d8c0d6a8454 100644 (file)
@@ -291,6 +291,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
 
+uint64_t arc_all_memory(void);
 uint64_t arc_target_bytes(void);
 void arc_init(void);
 void arc_fini(void);
index 7dd5fe2b54c79016768ed4919f64e86fd447c8bc..00b8b475811011452d994188d40294ae7db5eb00 100644 (file)
@@ -57,7 +57,6 @@ int metaslab_sort_by_flushed(const void *, const void *);
 uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
 
 int metaslab_load(metaslab_t *);
-void metaslab_potentially_unload(metaslab_t *, uint64_t);
 void metaslab_unload(metaslab_t *);
 boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);
 
@@ -110,7 +109,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *);
 boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
     zio_t *, int);
 void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
-
+void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
 uint64_t metaslab_class_get_alloc(metaslab_class_t *);
 uint64_t metaslab_class_get_space(metaslab_class_t *);
 uint64_t metaslab_class_get_dspace(metaslab_class_t *);
@@ -133,7 +132,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
 void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
 void metaslab_recalculate_weight_and_sort(metaslab_t *);
 void metaslab_disable(metaslab_t *);
-void metaslab_enable(metaslab_t *, boolean_t);
+void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
+void metaslab_set_selected_txg(metaslab_t *, uint64_t);
 
 extern int metaslab_debug_load;
 
index 08ee8d279ddd66776efee6de27fed1df6c624fb1..07f07c02d1a89dd5794bdd7322ba89badce22c6e 100644 (file)
@@ -36,6 +36,7 @@
 #include <sys/vdev.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
+#include <sys/multilist.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -194,6 +195,12 @@ struct metaslab_class {
        uint64_t                mc_space;       /* total space (alloc + free) */
        uint64_t                mc_dspace;      /* total deflated space */
        uint64_t                mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+       /*
+        * List of all loaded metaslabs in the class, sorted in order of most
+        * recent use.
+        */
+       multilist_t             *mc_metaslab_txg_list;
 };
 
 /*
@@ -378,6 +385,7 @@ struct metaslab {
        range_tree_t    *ms_allocating[TXG_SIZE];
        range_tree_t    *ms_allocatable;
        uint64_t        ms_allocated_this_txg;
+       uint64_t        ms_allocating_total;
 
        /*
         * The following range trees are accessed only from syncing context.
@@ -508,6 +516,10 @@ struct metaslab {
        avl_node_t      ms_group_node;  /* node in metaslab group tree  */
        txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */
        avl_node_t      ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+       /*
+        * Node in metaslab class's selected txg list
+        */
+       multilist_node_t        ms_class_txg_node;
 
        /*
         * Allocs and frees that are committed to the vdev log spacemap but
index c25f2f04678bd0140f28e7c160773e72ae4a18b8..8a1048bee43ed72ea6497a404592a2a7dd2a5ab4 100644 (file)
@@ -386,6 +386,21 @@ considering only the histogram instead.
 Default value: \fB3600 seconds\fR (one hour)
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_metaslab_mem_limit\fR (int)
+.ad
+.RS 12n
+When we are loading a new metaslab, we check the amount of memory being used
+to store metaslab range trees. If it is over a threshold, we attempt to unload
+the least recently used metaslab to prevent the system from clogging all of
+its memory with range trees. This tunable sets the percentage of total system
+memory that is the threshold.
+.sp
+Default value: \fB75 percent\fR
+.RE
+
 .sp
 .ne 2
 .na
index 90a731bffa96eb9a5a2ed15ac32a8c671d800361..b5fca8e263139ab46008e9f244049d7f51e9febe 100644 (file)
@@ -1110,7 +1110,6 @@ static boolean_t arc_is_overflowing(void);
 static void arc_buf_watch(arc_buf_t *);
 static void arc_tuning_update(void);
 static void arc_prune_async(int64_t);
-static uint64_t arc_all_memory(void);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -4828,7 +4827,7 @@ arc_reduce_target_size(int64_t to_free)
  * Return maximum amount of memory that we could possibly use.  Reduced
  * to half of all memory in user space which is primarily used for testing.
  */
-static uint64_t
+uint64_t
 arc_all_memory(void)
 {
 #ifdef _KERNEL
index 9a9a5e0cf8a2dc73b4cfc96aa03cc5f956e4f446..2f92fffa4ec0029de46ed99f66134b8ebfc29b79 100644 (file)
@@ -278,6 +278,13 @@ int max_disabled_ms = 3;
  */
 unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
 
+/*
+ * Maximum percentage of memory to use on storing loaded metaslabs. If loading
+ * a metaslab would take it over this percentage, the oldest selected metaslab
+ * is automatically unloaded.
+ */
+int zfs_metaslab_mem_limit = 75;
+
 static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -286,6 +293,8 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
+static unsigned int metaslab_idx_func(multilist_t *, void *);
+static void metaslab_evict(metaslab_t *, uint64_t);
 #ifdef _METASLAB_TRACING
 kmem_cache_t *metaslab_alloc_trace_cache;
 #endif
@@ -306,6 +315,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
        mc->mc_rotor = NULL;
        mc->mc_ops = ops;
        mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+       mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+           offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
        mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
            sizeof (zfs_refcount_t), KM_SLEEP);
        mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
@@ -332,6 +343,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
        kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
            sizeof (uint64_t));
        mutex_destroy(&mc->mc_lock);
+       multilist_destroy(mc->mc_metaslab_txg_list);
        kmem_free(mc, sizeof (metaslab_class_t));
 }
 
@@ -517,6 +529,47 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
        return (space);
 }
 
+void
+metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
+{
+       multilist_t *ml = mc->mc_metaslab_txg_list;
+       for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
+               multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+               metaslab_t *msp = multilist_sublist_head(mls);
+               multilist_sublist_unlock(mls);
+               while (msp != NULL) {
+                       mutex_enter(&msp->ms_lock);
+                       /*
+                        * Once we've hit a metaslab selected too recently to
+                        * evict, we're done evicting for now.
+                        */
+                       if (msp->ms_selected_txg + metaslab_unload_delay >=
+                           txg) {
+                               mutex_exit(&msp->ms_lock);
+                               break;
+                       }
+
+                       /*
+                        * If the metaslab has been removed from the list
+                        * (which could happen if we were at the memory limit
+                        * and it was evicted during this loop), then we can't
+                        * proceed and we should restart the sublist.
+                        */
+                       if (!multilist_link_active(&msp->ms_class_txg_node)) {
+                               mutex_exit(&msp->ms_lock);
+                               i--;
+                               break;
+                       }
+                       mls = multilist_sublist_lock(ml, i);
+                       metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+                       multilist_sublist_unlock(mls);
+                       metaslab_evict(msp, txg);
+                       mutex_exit(&msp->ms_lock);
+                       msp = next_msp;
+               }
+       }
+}
+
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
@@ -960,6 +1013,14 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
        mutex_enter(&mg->mg_lock);
        ASSERT(msp->ms_group == mg);
        avl_remove(&mg->mg_metaslab_tree, msp);
+
+       metaslab_class_t *mc = msp->ms_group->mg_class;
+       multilist_sublist_t *mls =
+           multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+       if (multilist_link_active(&msp->ms_class_txg_node))
+               multilist_sublist_remove(mls, msp);
+       multilist_sublist_unlock(mls);
+
        msp->ms_group = NULL;
        mutex_exit(&mg->mg_lock);
 }
@@ -1519,6 +1580,13 @@ metaslab_flush_wait(metaslab_t *msp)
                cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
 }
 
+static unsigned int
+metaslab_idx_func(multilist_t *ml, void *arg)
+{
+       metaslab_t *msp = arg;
+       return (msp->ms_id % multilist_get_num_sublists(ml));
+}
+
 uint64_t
 metaslab_allocated_space(metaslab_t *msp)
 {
@@ -1577,6 +1645,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
                allocating +=
                    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
        }
+       ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
+           msp->ms_allocating_total);
 
        ASSERT3U(msp->ms_deferspace, ==,
            range_tree_space(msp->ms_defer[0]) +
@@ -1792,6 +1862,86 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
        VERIFY3U(msp->ms_weight, ==, weight);
 }
 
+/*
+ * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
+ * this class that was used longest ago, and attempt to unload it.  We don't
+ * want to spend too much time in this loop to prevent performance
+ * degredation, and we expect that most of the time this operation will
+ * succeed. Between that and the normal unloading processing during txg sync,
+ * we expect this to keep the metaslab memory usage under control.
+ */
+static void
+metaslab_potentially_evict(metaslab_class_t *mc)
+{
+#ifdef _KERNEL
+       uint64_t allmem = arc_all_memory();
+       extern kmem_cache_t *range_seg_cache;
+       uint64_t inuse = range_seg_cache->skc_obj_total;
+       uint64_t size = range_seg_cache->skc_obj_size;
+       int tries = 0;
+       for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
+           tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
+           tries++) {
+               unsigned int idx = multilist_get_random_index(
+                   mc->mc_metaslab_txg_list);
+               multilist_sublist_t *mls =
+                   multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
+               metaslab_t *msp = multilist_sublist_head(mls);
+               multilist_sublist_unlock(mls);
+               while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
+                   inuse * size) {
+                       VERIFY3P(mls, ==, multilist_sublist_lock(
+                           mc->mc_metaslab_txg_list, idx));
+                       ASSERT3U(idx, ==,
+                           metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
+
+                       if (!multilist_link_active(&msp->ms_class_txg_node)) {
+                               multilist_sublist_unlock(mls);
+                               break;
+                       }
+                       metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+                       multilist_sublist_unlock(mls);
+                       /*
+                        * If the metaslab is currently loading there are two
+                        * cases. If it's the metaslab we're evicting, we
+                        * can't continue on or we'll panic when we attempt to
+                        * recursively lock the mutex. If it's another
+                        * metaslab that's loading, it can be safely skipped,
+                        * since we know it's very new and therefore not a
+                        * good eviction candidate. We check later once the
+                        * lock is held that the metaslab is fully loaded
+                        * before actually unloading it.
+                        */
+                       if (msp->ms_loading) {
+                               msp = next_msp;
+                               inuse = range_seg_cache->skc_obj_total;
+                               continue;
+                       }
+                       /*
+                        * We can't unload metaslabs with no spacemap because
+                        * they're not ready to be unloaded yet. We can't
+                        * unload metaslabs with outstanding allocations
+                        * because doing so could cause the metaslab's weight
+                        * to decrease while it's unloaded, which violates an
+                        * invariant that we use to prevent unnecessary
+                        * loading. We also don't unload metaslabs that are
+                        * currently active because they are high-weight
+                        * metaslabs that are likely to be used in the near
+                        * future.
+                        */
+                       mutex_enter(&msp->ms_lock);
+                       if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
+                           msp->ms_allocating_total == 0) {
+                               metaslab_unload(msp);
+                       }
+                       mutex_exit(&msp->ms_lock);
+                       msp = next_msp;
+                       inuse = range_seg_cache->skc_obj_total;
+               }
+       }
+#endif
+}
+
 static int
 metaslab_load_impl(metaslab_t *msp)
 {
@@ -2024,6 +2174,16 @@ metaslab_load(metaslab_t *msp)
         */
        ASSERT(!msp->ms_loaded);
 
+       /*
+        * If we're loading a metaslab in the normal class, consider evicting
+        * another one to keep our memory usage under the limit defined by the
+        * zfs_metaslab_mem_limit tunable.
+        */
+       if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
+           msp->ms_group->mg_class) {
+               metaslab_potentially_evict(msp->ms_group->mg_class);
+       }
+
        int error = metaslab_load_impl(msp);
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -2038,7 +2198,13 @@ metaslab_unload(metaslab_t *msp)
 {
        ASSERT(MUTEX_HELD(&msp->ms_lock));
 
-       metaslab_verify_weight_and_frag(msp);
+       /*
+        * This can happen if a metaslab is selected for eviction (in
+        * metaslab_potentially_evict) and then unloaded during spa_sync (via
+        * metaslab_class_evict_old).
+        */
+       if (!msp->ms_loaded)
+               return;
 
        range_tree_vacate(msp->ms_allocatable, NULL, NULL);
        msp->ms_loaded = B_FALSE;
@@ -2047,6 +2213,15 @@ metaslab_unload(metaslab_t *msp)
        msp->ms_activation_weight = 0;
        msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 
+       if (msp->ms_group != NULL) {
+               metaslab_class_t *mc = msp->ms_group->mg_class;
+               multilist_sublist_t *mls =
+                   multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+               if (multilist_link_active(&msp->ms_class_txg_node))
+                       multilist_sublist_remove(mls, msp);
+               multilist_sublist_unlock(mls);
+       }
+
        /*
         * We explicitly recalculate the metaslab's weight based on its space
         * map (as it is now not loaded). We want unload metaslabs to always
@@ -2063,6 +2238,20 @@ metaslab_unload(metaslab_t *msp)
                metaslab_recalculate_weight_and_sort(msp);
 }
 
+void
+metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
+{
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+       metaslab_class_t *mc = msp->ms_group->mg_class;
+       multilist_sublist_t *mls =
+           multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+       if (multilist_link_active(&msp->ms_class_txg_node))
+               multilist_sublist_remove(mls, msp);
+       msp->ms_selected_txg = txg;
+       multilist_sublist_insert_tail(mls, msp);
+       multilist_sublist_unlock(mls);
+}
+
 void
 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta)
@@ -2091,6 +2280,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
        mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
+       multilist_link_init(&ms->ms_class_txg_node);
 
        ms->ms_id = id;
        ms->ms_start = id << vd->vdev_ms_shift;
@@ -2703,8 +2893,13 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
         * If we're activating for the claim code, we don't want to actually
         * set the metaslab up for a specific allocator.
         */
-       if (activation_weight == METASLAB_WEIGHT_CLAIM)
+       if (activation_weight == METASLAB_WEIGHT_CLAIM) {
+               ASSERT0(msp->ms_activation_weight);
+               msp->ms_activation_weight = msp->ms_weight;
+               metaslab_group_sort(mg, msp, msp->ms_weight |
+                   activation_weight);
                return (0);
+       }
 
        metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
            mg->mg_primaries : mg->mg_secondaries);
@@ -2719,6 +2914,12 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
        ASSERT3S(msp->ms_allocator, ==, -1);
        msp->ms_allocator = allocator;
        msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+
+       ASSERT0(msp->ms_activation_weight);
+       msp->ms_activation_weight = msp->ms_weight;
+       metaslab_group_sort_impl(mg, msp,
+           msp->ms_weight | activation_weight);
+
        mutex_exit(&mg->mg_lock);
 
        return (0);
@@ -2795,11 +2996,6 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
                return (error);
        }
 
-       ASSERT0(msp->ms_activation_weight);
-       msp->ms_activation_weight = msp->ms_weight;
-       metaslab_group_sort(msp->ms_group, msp,
-           msp->ms_weight | activation_weight);
-
        ASSERT(msp->ms_loaded);
        ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
@@ -2894,14 +3090,15 @@ static void
 metaslab_preload(void *arg)
 {
        metaslab_t *msp = arg;
-       spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+       metaslab_class_t *mc = msp->ms_group->mg_class;
+       spa_t *spa = mc->mc_spa;
        fstrans_cookie_t cookie = spl_fstrans_mark();
 
        ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
        mutex_enter(&msp->ms_lock);
        (void) metaslab_load(msp);
-       msp->ms_selected_txg = spa_syncing_txg(spa);
+       metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
        mutex_exit(&msp->ms_lock);
        spl_fstrans_unmark(cookie);
 }
@@ -3613,28 +3810,21 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        dmu_tx_commit(tx);
 }
 
-void
-metaslab_potentially_unload(metaslab_t *msp, uint64_t txg)
+static void
+metaslab_evict(metaslab_t *msp, uint64_t txg)
 {
-       /*
-        * If the metaslab is loaded and we've not tried to load or allocate
-        * from it in 'metaslab_unload_delay' txgs, then unload it.
-        */
-       if (msp->ms_loaded &&
-           msp->ms_disabled == 0 &&
-           msp->ms_selected_txg + metaslab_unload_delay < txg) {
-               for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
-                       VERIFY0(range_tree_space(
-                           msp->ms_allocating[(txg + t) & TXG_MASK]));
-               }
-               if (msp->ms_allocator != -1) {
-                       metaslab_passivate(msp, msp->ms_weight &
-                           ~METASLAB_ACTIVE_MASK);
-               }
+       if (!msp->ms_loaded || msp->ms_disabled != 0)
+               return;
 
-               if (!metaslab_debug_unload)
-                       metaslab_unload(msp);
+       for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+               VERIFY0(range_tree_space(
+                   msp->ms_allocating[(txg + t) & TXG_MASK]));
        }
+       if (msp->ms_allocator != -1)
+               metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+
+       if (!metaslab_debug_unload)
+               metaslab_unload(msp);
 }
 
 /*
@@ -3791,7 +3981,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
        ASSERT0(range_tree_space(msp->ms_freeing));
        ASSERT0(range_tree_space(msp->ms_freed));
        ASSERT0(range_tree_space(msp->ms_checkpointing));
-
+       msp->ms_allocating_total -= msp->ms_allocated_this_txg;
        msp->ms_allocated_this_txg = 0;
        mutex_exit(&msp->ms_lock);
 }
@@ -4072,6 +4262,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
                        vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
                range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
+               msp->ms_allocating_total += size;
 
                /* Track the last successful allocation */
                msp->ms_alloc_txg = txg;
@@ -4250,6 +4441,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
                        ASSERT(msp->ms_loaded);
 
                        was_active = B_TRUE;
+                       ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
                } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
                    mg->mg_secondaries[allocator] != NULL) {
                        msp = mg->mg_secondaries[allocator];
@@ -4263,6 +4455,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
                        ASSERT(msp->ms_loaded);
 
                        was_active = B_TRUE;
+                       ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
                } else {
                        msp = find_valid_metaslab(mg, activation_weight, dva, d,
                            want_unique, asize, allocator, try_hard, zal,
@@ -4293,7 +4486,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
                 * capable of handling our request. It's possible that
                 * another thread may have changed the weight while we
                 * were blocked on the metaslab lock. We check the
-                * active status first to see if we need to reselect
+                * active status first to see if we need to set_selected_txg
                 * a new metaslab.
                 */
                if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
@@ -4336,7 +4529,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
                        continue;
                }
 
-               msp->ms_selected_txg = txg;
+               metaslab_set_selected_txg(msp, txg);
 
                int activation_error =
                    metaslab_activate(msp, allocator, activation_weight);
@@ -5027,6 +5220,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
        mutex_enter(&msp->ms_lock);
        range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
            offset, size);
+       msp->ms_allocating_total -= size;
 
        VERIFY(!msp->ms_condensing);
        VERIFY3U(offset, >=, msp->ms_start);
@@ -5158,10 +5352,20 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
        range_tree_clear(msp->ms_trim, offset, size);
 
        if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
+               metaslab_class_t *mc = msp->ms_group->mg_class;
+               multilist_sublist_t *mls =
+                   multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+               if (!multilist_link_active(&msp->ms_class_txg_node)) {
+                       msp->ms_selected_txg = txg;
+                       multilist_sublist_insert_head(mls, msp);
+               }
+               multilist_sublist_unlock(mls);
+
                if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
                        vdev_dirty(vd, VDD_METASLAB, msp, txg);
                range_tree_add(msp->ms_allocating[txg & TXG_MASK],
                    offset, size);
+               msp->ms_allocating_total += size;
        }
 
        mutex_exit(&msp->ms_lock);
@@ -5571,7 +5775,7 @@ metaslab_disable(metaslab_t *msp)
 }
 
 void
-metaslab_enable(metaslab_t *msp, boolean_t sync)
+metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
 {
        metaslab_group_t *mg = msp->ms_group;
        spa_t *spa = mg->mg_vd->vdev_spa;
@@ -5589,6 +5793,8 @@ metaslab_enable(metaslab_t *msp, boolean_t sync)
        if (--msp->ms_disabled == 0) {
                mg->mg_ms_disabled--;
                cv_broadcast(&mg->mg_ms_disabled_cv);
+               if (unload)
+                       metaslab_unload(msp);
        }
        mutex_exit(&msp->ms_lock);
        mutex_exit(&mg->mg_ms_disabled_lock);
@@ -5710,6 +5916,10 @@ MODULE_PARM_DESC(metaslab_df_use_largest_segment,
 module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644);
 MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec,
        "how long to trust the cached max chunk size of a metaslab");
+
+module_param(zfs_metaslab_mem_limit, int, 0644);
+MODULE_PARM_DESC(zfs_metaslab_mem_limit,
+       "percentage of memory that can be used to store metaslab range trees");
 /* END CSTYLED */
 
 #endif
index 437efb50f90089ddf32b7beee534e11504af91f3..c404e876b4bc6d78768fba689a196cbee2f0fb01 100644 (file)
@@ -9013,6 +9013,10 @@ spa_sync(spa_t *spa, uint64_t txg)
        while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
            != NULL)
                vdev_sync_done(vd, txg);
+
+       metaslab_class_evict_old(spa->spa_normal_class, txg);
+       metaslab_class_evict_old(spa->spa_log_class, txg);
+
        spa_sync_close_syncing_log_sm(spa);
 
        spa_update_dspace(spa);
index ad82e025e4c7c8be7dd433ecde586376884ac895..550aa1e3a5f595002b7af359fb47f849b2ec4c1f 100644 (file)
@@ -1189,6 +1189,7 @@ out:
 
                if (metaslab_debug_load && m->ms_sm != NULL) {
                        VERIFY0(metaslab_load(m));
+                       metaslab_set_selected_txg(m, 0);
                }
                mutex_exit(&m->ms_lock);
        }
index 5644b9c5b2dbb7820054a812a3cb157c4da2547e..a6280e0112ed15e6ef2baf83d1812d94a1ccc2e4 100644 (file)
@@ -3262,20 +3262,6 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
            != NULL)
                metaslab_sync_done(msp, txg);
 
-       /*
-        * Because this function is only called on dirty vdevs, it's possible
-        * we won't consider all metaslabs for unloading on every
-        * txg. However, unless the system is largely idle it is likely that
-        * we will dirty all vdevs within a few txgs.
-        */
-       for (int i = 0; i < vd->vdev_ms_count; i++) {
-               msp = vd->vdev_ms[i];
-               mutex_enter(&msp->ms_lock);
-               if (msp->ms_sm != NULL)
-                       metaslab_potentially_unload(msp, txg);
-               mutex_exit(&msp->ms_lock);
-       }
-
        if (reassess)
                metaslab_sync_reassess(vd->vdev_mg);
 }
index b1590132636bc72ca1ab2458eaee146b1bb53b42..a355f185cc2e8aedf011490f2ba33a7bfc400346 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -483,6 +483,7 @@ vdev_initialize_thread(void *arg)
        for (uint64_t i = 0; !vd->vdev_detached &&
            i < vd->vdev_top->vdev_ms_count; i++) {
                metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+               boolean_t unload_when_done = B_FALSE;
 
                /*
                 * If we've expanded the top-level vdev or it's our
@@ -496,6 +497,8 @@ vdev_initialize_thread(void *arg)
                spa_config_exit(spa, SCL_CONFIG, FTAG);
                metaslab_disable(msp);
                mutex_enter(&msp->ms_lock);
+               if (!msp->ms_loaded && !msp->ms_loading)
+                       unload_when_done = B_TRUE;
                VERIFY0(metaslab_load(msp));
 
                range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
@@ -503,7 +506,7 @@ vdev_initialize_thread(void *arg)
                mutex_exit(&msp->ms_lock);
 
                error = vdev_initialize_ranges(vd, deadbeef);
-               metaslab_enable(msp, B_TRUE);
+               metaslab_enable(msp, B_TRUE, unload_when_done);
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
                range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
index 5ad47cccdafe476a0d593e68d29cee17750b2fc3..70b122a0a6e0cbd8b0fed8c3e5c9f6bde25b78b1 100644 (file)
@@ -837,7 +837,7 @@ vdev_trim_thread(void *arg)
                 */
                if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
                        mutex_exit(&msp->ms_lock);
-                       metaslab_enable(msp, B_FALSE);
+                       metaslab_enable(msp, B_FALSE, B_FALSE);
                        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
                        vdev_trim_calculate_progress(vd);
                        continue;
@@ -849,7 +849,7 @@ vdev_trim_thread(void *arg)
                mutex_exit(&msp->ms_lock);
 
                error = vdev_trim_ranges(&ta);
-               metaslab_enable(msp, B_TRUE);
+               metaslab_enable(msp, B_TRUE, B_FALSE);
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
                range_tree_vacate(ta.trim_tree, NULL, NULL);
@@ -1154,7 +1154,7 @@ vdev_autotrim_thread(void *arg)
                        if (msp->ms_sm == NULL ||
                            range_tree_is_empty(msp->ms_trim)) {
                                mutex_exit(&msp->ms_lock);
-                               metaslab_enable(msp, B_FALSE);
+                               metaslab_enable(msp, B_FALSE, B_FALSE);
                                continue;
                        }
 
@@ -1170,7 +1170,7 @@ vdev_autotrim_thread(void *arg)
                         */
                        if (msp->ms_disabled > 1) {
                                mutex_exit(&msp->ms_lock);
-                               metaslab_enable(msp, B_FALSE);
+                               metaslab_enable(msp, B_FALSE, B_FALSE);
                                continue;
                        }
 
@@ -1288,7 +1288,7 @@ vdev_autotrim_thread(void *arg)
                        range_tree_vacate(trim_tree, NULL, NULL);
                        range_tree_destroy(trim_tree);
 
-                       metaslab_enable(msp, issued_trim);
+                       metaslab_enable(msp, issued_trim, B_FALSE);
                        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
                        for (uint64_t c = 0; c < children; c++) {