Use a struct to organize metaslab-group-allocator fields

[mirror_zfs.git] / include / sys / metaslab_impl.h
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h

index 676c5dd46bf30bed045996bc6f07f055041b370c..4a7475256a2bb9b03df79c2fa4bb19bfbd36b362 100644 (file)
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
   */
  
  /*
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
   */
  
  #ifndef _SYS_METASLAB_IMPL_H
@@ -36,6 +36,7 @@
  #include <sys/vdev.h>
  #include <sys/txg.h>
  #include <sys/avl.h>
+#include <sys/multilist.h>
  
  #ifdef __cplusplus
  extern "C" {
@@ -69,7 +70,7 @@ typedef enum trace_alloc_type {
         TRACE_ENOSPC            = -6ULL,
         TRACE_CONDENSING        = -7ULL,
         TRACE_VDEV_ERROR        = -8ULL,
-       TRACE_INITIALIZING      = -9ULL
+       TRACE_DISABLED          = -9ULL,
  } trace_alloc_type_t;
  
  #define        METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
@@ -194,8 +195,24 @@ struct metaslab_class {
         uint64_t                mc_space;       /* total space (alloc + free) */
         uint64_t                mc_dspace;      /* total deflated space */
         uint64_t                mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+       /*
+        * List of all loaded metaslabs in the class, sorted in order of most
+        * recent use.
+        */
+       multilist_t             *mc_metaslab_txg_list;
  };
  
+/*
+ * Per-allocator state, kept as an array in metaslab_group (mg_allocator).
+ */
+typedef struct metaslab_group_allocator {
+       uint64_t        mga_cur_max_alloc_queue_depth;
+       zfs_refcount_t  mga_alloc_queue_depth;
+       metaslab_t      *mga_primary;
+       metaslab_t      *mga_secondary;
+} metaslab_group_allocator_t;
+
  /*
   * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
   * of a top-level vdev. They are linked together to form a circular linked
@@ -207,8 +224,6 @@ struct metaslab_class {
   */
  struct metaslab_group {
         kmutex_t                mg_lock;
-       metaslab_t              **mg_primaries;
-       metaslab_t              **mg_secondaries;
         avl_tree_t              mg_metaslab_tree;
         uint64_t                mg_aliquot;
         boolean_t               mg_allocatable;         /* can we allocate? */
@@ -256,9 +271,8 @@ struct metaslab_group {
          * groups are unable to handle their share of allocations.
          */
         uint64_t                mg_max_alloc_queue_depth;
-       uint64_t                *mg_cur_max_alloc_queue_depth;
-       zfs_refcount_t          *mg_alloc_queue_depth;
         int                     mg_allocators;
+       metaslab_group_allocator_t *mg_allocator; /* array */
         /*
          * A metaslab group that can no longer allocate the minimum block
          * size will set mg_no_free_space. Once a metaslab group is out
@@ -272,10 +286,10 @@ struct metaslab_group {
         uint64_t                mg_fragmentation;
         uint64_t                mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
  
-       int                     mg_ms_initializing;
-       boolean_t               mg_initialize_updating;
-       kmutex_t                mg_ms_initialize_lock;
-       kcondvar_t              mg_ms_initialize_cv;
+       int                     mg_ms_disabled;
+       boolean_t               mg_disabled_updating;
+       kmutex_t                mg_ms_disabled_lock;
+       kcondvar_t              mg_ms_disabled_cv;
  };
  
  /*
@@ -357,7 +371,7 @@ struct metaslab {
          * write to metaslab data on-disk (i.e. flushing entries to
          * the metaslab's space map). It helps coordinate readers of
          * the metaslab's space map [see spa_vdev_remove_thread()]
-        * with writers [see metaslab_sync()].
+        * with writers [see metaslab_sync() or metaslab_flush()].
          *
          * Note that metaslab_load(), even though a reader, uses
          * a completely different mechanism to deal with the reading
@@ -378,6 +392,7 @@ struct metaslab {
         range_tree_t    *ms_allocating[TXG_SIZE];
         range_tree_t    *ms_allocatable;
         uint64_t        ms_allocated_this_txg;
+       uint64_t        ms_allocating_total;
  
         /*
          * The following range trees are accessed only from syncing context.
@@ -389,11 +404,23 @@ struct metaslab {
         range_tree_t    *ms_defer[TXG_DEFER_SIZE];
         range_tree_t    *ms_checkpointing; /* to add to the checkpoint */
  
+       /*
+        * The ms_trim tree is the set of allocatable segments which are
+        * eligible for trimming. (When the metaslab is loaded, it's a
+        * subset of ms_allocatable.)  It's kept in-core as long as the
+        * autotrim property is set and is not vacated when the metaslab
+        * is unloaded.  Its purpose is to aggregate freed ranges to
+        * facilitate efficient trimming.
+        */
+       range_tree_t    *ms_trim;
+
         boolean_t       ms_condensing;  /* condensing? */
         boolean_t       ms_condense_wanted;
-       uint64_t        ms_condense_checked_txg;
  
-       uint64_t        ms_initializing; /* leaves initializing this ms */
+       /*
+        * The number of consumers which have disabled the metaslab.
+        */
+       uint64_t        ms_disabled;
  
         /*
          * We must always hold the ms_lock when modifying ms_loaded
@@ -401,6 +428,8 @@ struct metaslab {
          */
         boolean_t       ms_loaded;
         boolean_t       ms_loading;
+       kcondvar_t      ms_flush_cv;
+       boolean_t       ms_flushing;
  
         /*
          * The following histograms count entries that are in the
@@ -461,6 +490,13 @@ struct metaslab {
          * stay cached.
          */
         uint64_t        ms_selected_txg;
+       /*
+        * ms_load/unload_time can be used for performance monitoring
+        * (e.g. by dtrace or mdb).
+        */
+       hrtime_t        ms_load_time;   /* time last loaded */
+       hrtime_t        ms_unload_time; /* time last unloaded */
+       hrtime_t        ms_selected_time; /* time last allocated from */
  
         uint64_t        ms_alloc_txg;   /* last successful alloc (debug only) */
         uint64_t        ms_max_size;    /* maximum allocatable size     */
@@ -480,12 +516,33 @@ struct metaslab {
          * only difference is that the ms_allocatable_by_size is ordered by
          * segment sizes.
          */
-       avl_tree_t      ms_allocatable_by_size;
+       zfs_btree_t             ms_allocatable_by_size;
+       zfs_btree_t             ms_unflushed_frees_by_size;
         uint64_t        ms_lbas[MAX_LBAS];
  
         metaslab_group_t *ms_group;     /* metaslab group               */
         avl_node_t      ms_group_node;  /* node in metaslab group tree  */
         txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */
+       avl_node_t      ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+       /*
+        * Node in metaslab class's selected txg list
+        */
+       multilist_node_t        ms_class_txg_node;
+
+       /*
+        * Allocs and frees that are committed to the vdev log spacemap but
+        * not yet to this metaslab's spacemap.
+        */
+       range_tree_t    *ms_unflushed_allocs;
+       range_tree_t    *ms_unflushed_frees;
+
+       /*
+        * We have flushed entries up to but not including this TXG. In
+        * other words, all changes from this TXG and onward should not
+        * be in this metaslab's space map and must be read from the
+        * log space maps.
+        */
+       uint64_t        ms_unflushed_txg;
  
         /* updated every time we are done syncing the metaslab's space map */
         uint64_t        ms_synced_length;
@@ -493,6 +550,11 @@ struct metaslab {
         boolean_t       ms_new;
  };
  
+typedef struct metaslab_unflushed_phys {
+       /* on-disk counterpart of ms_unflushed_txg (first unflushed TXG) */
+       uint64_t        msp_unflushed_txg;
+} metaslab_unflushed_phys_t;
+
  #ifdef __cplusplus
  }
  #endif