Use a struct to organize metaslab-group-allocator fields

[mirror_zfs.git] / include / sys / metaslab_impl.h
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h

index cc6e8b796d40baeeba104482f71338efc2abe267..4a7475256a2bb9b03df79c2fa4bb19bfbd36b362 100644 (file)
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
   */
  
  /*
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
   */
  
  #ifndef _SYS_METASLAB_IMPL_H
@@ -36,6 +36,7 @@
  #include <sys/vdev.h>
  #include <sys/txg.h>
  #include <sys/avl.h>
+#include <sys/multilist.h>
  
  #ifdef __cplusplus
  extern "C" {
@@ -68,7 +69,8 @@ typedef enum trace_alloc_type {
         TRACE_GROUP_FAILURE     = -5ULL,
         TRACE_ENOSPC            = -6ULL,
         TRACE_CONDENSING        = -7ULL,
-       TRACE_VDEV_ERROR        = -8ULL
+       TRACE_VDEV_ERROR        = -8ULL,
+       TRACE_DISABLED          = -9ULL,
  } trace_alloc_type_t;
  
  #define        METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
@@ -184,7 +186,7 @@ struct metaslab_class {
          * number of allocations allowed.
          */
         uint64_t                *mc_alloc_max_slots;
-       refcount_t              *mc_alloc_slots;
+       zfs_refcount_t          *mc_alloc_slots;
  
         uint64_t                mc_alloc_groups; /* # of allocatable groups */
  
@@ -193,8 +195,24 @@ struct metaslab_class {
         uint64_t                mc_space;       /* total space (alloc + free) */
         uint64_t                mc_dspace;      /* total deflated space */
         uint64_t                mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+       /*
+        * List of all loaded metaslabs in the class, sorted in order of most
+        * recent use.
+        */
+       multilist_t             *mc_metaslab_txg_list;
  };
  
+/*
+ * Per-allocator data structure.
+ */
+typedef struct metaslab_group_allocator {
+       uint64_t        mga_cur_max_alloc_queue_depth;
+       zfs_refcount_t  mga_alloc_queue_depth;
+       metaslab_t      *mga_primary;
+       metaslab_t      *mga_secondary;
+} metaslab_group_allocator_t;
+
  /*
   * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
   * of a top-level vdev. They are linked together to form a circular linked
@@ -206,8 +224,6 @@ struct metaslab_class {
   */
  struct metaslab_group {
         kmutex_t                mg_lock;
-       metaslab_t              **mg_primaries;
-       metaslab_t              **mg_secondaries;
         avl_tree_t              mg_metaslab_tree;
         uint64_t                mg_aliquot;
         boolean_t               mg_allocatable;         /* can we allocate? */
@@ -255,9 +271,8 @@ struct metaslab_group {
          * groups are unable to handle their share of allocations.
          */
         uint64_t                mg_max_alloc_queue_depth;
-       uint64_t                *mg_cur_max_alloc_queue_depth;
-       refcount_t              *mg_alloc_queue_depth;
         int                     mg_allocators;
+       metaslab_group_allocator_t *mg_allocator; /* array */
         /*
          * A metalab group that can no longer allocate the minimum block
          * size will set mg_no_free_space. Once a metaslab group is out
@@ -270,6 +285,11 @@ struct metaslab_group {
         uint64_t                mg_failed_allocations;
         uint64_t                mg_fragmentation;
         uint64_t                mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+       int                     mg_ms_disabled;
+       boolean_t               mg_disabled_updating;
+       kmutex_t                mg_ms_disabled_lock;
+       kcondvar_t              mg_ms_disabled_cv;
  };
  
  /*
@@ -334,8 +354,34 @@ struct metaslab_group {
   * being written.
   */
  struct metaslab {
+       /*
+        * This is the main lock of the metaslab and its purpose is to
+        * coordinate our allocations and frees [e.g metaslab_block_alloc(),
+        * metaslab_free_concrete(), ..etc] with our various syncing
+        * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
+        *
+        * The lock is also used during some miscellaneous operations like
+        * using the metaslab's histogram for the metaslab group's histogram
+        * aggregation, or marking the metaslab for initialization.
+        */
         kmutex_t        ms_lock;
+
+       /*
+        * Acquired together with the ms_lock whenever we expect to
+        * write to metaslab data on-disk (i.e flushing entries to
+        * the metaslab's space map). It helps coordinate readers of
+        * the metaslab's space map [see spa_vdev_remove_thread()]
+        * with writers [see metaslab_sync() or metaslab_flush()].
+        *
+        * Note that metaslab_load(), even though a reader, uses
+        * a completely different mechanism to deal with the reading
+        * of the metaslab's space map based on ms_synced_length. That
+        * said, the function still uses the ms_sync_lock after it
+        * has read the ms_sm [see relevant comment in metaslab_load()
+        * as to why].
+        */
         kmutex_t        ms_sync_lock;
+
         kcondvar_t      ms_load_cv;
         space_map_t     *ms_sm;
         uint64_t        ms_id;
@@ -345,6 +391,8 @@ struct metaslab {
  
         range_tree_t    *ms_allocating[TXG_SIZE];
         range_tree_t    *ms_allocatable;
+       uint64_t        ms_allocated_this_txg;
+       uint64_t        ms_allocating_total;
  
         /*
          * The following range trees are accessed only from syncing context.
@@ -356,17 +404,82 @@ struct metaslab {
         range_tree_t    *ms_defer[TXG_DEFER_SIZE];
         range_tree_t    *ms_checkpointing; /* to add to the checkpoint */
  
+       /*
+        * The ms_trim tree is the set of allocatable segments which are
+        * eligible for trimming. (When the metaslab is loaded, it's a
+        * subset of ms_allocatable.)  It's kept in-core as long as the
+        * autotrim property is set and is not vacated when the metaslab
+        * is unloaded.  Its purpose is to aggregate freed ranges to
+        * facilitate efficient trimming.
+        */
+       range_tree_t    *ms_trim;
+
         boolean_t       ms_condensing;  /* condensing? */
         boolean_t       ms_condense_wanted;
-       uint64_t        ms_condense_checked_txg;
  
         /*
-        * We must hold both ms_lock and ms_group->mg_lock in order to
-        * modify ms_loaded.
+        * The number of consumers which have disabled the metaslab.
+        */
+       uint64_t        ms_disabled;
+
+       /*
+        * We must always hold the ms_lock when modifying ms_loaded
+        * and ms_loading.
          */
         boolean_t       ms_loaded;
         boolean_t       ms_loading;
+       kcondvar_t      ms_flush_cv;
+       boolean_t       ms_flushing;
  
+       /*
+        * The following histograms count entries that are in the
+        * metaslab's space map (and its histogram) but are not in
+        * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+        * or ms_defer[].
+        *
+        * When the metaslab is not loaded, its ms_weight needs to
+        * reflect what is allocatable (i.e. what will be part of
+        * ms_allocatable if it is loaded).  The weight is computed from
+        * the spacemap histogram, but that includes ranges that are
+        * not yet allocatable (because they are in ms_freed,
+        * ms_freeing, or ms_defer[]).  Therefore, when calculating the
+        * weight, we need to remove those ranges.
+        *
+        * The ranges in the ms_freed and ms_defer[] range trees are all
+        * present in the spacemap.  However, the spacemap may have
+        * multiple entries to represent a contiguous range, because it
+        * is written across multiple sync passes, but the changes of
+        * all sync passes are consolidated into the range trees.
+        * Adjacent ranges that are freed in different sync passes of
+        * one txg will be represented separately (as 2 or more entries)
+        * in the space map (and its histogram), but these adjacent
+        * ranges will be consolidated (represented as one entry) in the
+        * ms_freed/ms_defer[] range trees (and their histograms).
+        *
+        * When calculating the weight, we can not simply subtract the
+        * range trees' histograms from the spacemap's histogram,
+        * because the range trees' histograms may have entries in
+        * higher buckets than the spacemap, due to consolidation.
+        * Instead we must subtract the exact entries that were added to
+        * the spacemap's histogram.  ms_synchist and ms_deferhist[]
+        * represent these exact entries, so we can subtract them from
+        * the spacemap's histogram when calculating ms_weight.
+        *
+        * ms_synchist represents the same ranges as ms_freeing +
+        * ms_freed, but without consolidation across sync passes.
+        *
+        * ms_deferhist[i] represents the same ranges as ms_defer[i],
+        * but without consolidation across sync passes.
+        */
+       uint64_t        ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+       uint64_t        ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+       /*
+        * Tracks the exact amount of allocated space of this metaslab
+        * (and specifically the metaslab's space map) up to the most
+        * recently completed sync pass [see usage in metaslab_sync()].
+        */
+       uint64_t        ms_allocated_space;
         int64_t         ms_deferspace;  /* sum of ms_defermap[] space   */
         uint64_t        ms_weight;      /* weight vs. others in group   */
         uint64_t        ms_activation_weight;   /* activation weight    */
@@ -377,6 +490,13 @@ struct metaslab {
          * stay cached.
          */
         uint64_t        ms_selected_txg;
+       /*
+        * ms_load/unload_time can be used for performance monitoring
+        * (e.g. by dtrace or mdb).
+        */
+       hrtime_t        ms_load_time;   /* time last loaded */
+       hrtime_t        ms_unload_time; /* time last unloaded */
+       hrtime_t        ms_selected_time; /* time last allocated from */
  
         uint64_t        ms_alloc_txg;   /* last successful alloc (debug only) */
         uint64_t        ms_max_size;    /* maximum allocatable size     */
@@ -396,16 +516,45 @@ struct metaslab {
          * only difference is that the ms_allocatable_by_size is ordered by
          * segment sizes.
          */
-       avl_tree_t      ms_allocatable_by_size;
+       zfs_btree_t             ms_allocatable_by_size;
+       zfs_btree_t             ms_unflushed_frees_by_size;
         uint64_t        ms_lbas[MAX_LBAS];
  
         metaslab_group_t *ms_group;     /* metaslab group               */
         avl_node_t      ms_group_node;  /* node in metaslab group tree  */
         txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */
+       avl_node_t      ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+       /*
+        * Node in metaslab class's selected txg list
+        */
+       multilist_node_t        ms_class_txg_node;
+
+       /*
+        * Allocs and frees that are committed to the vdev log spacemap but
+        * not yet to this metaslab's spacemap.
+        */
+       range_tree_t    *ms_unflushed_allocs;
+       range_tree_t    *ms_unflushed_frees;
+
+       /*
+        * We have flushed entries up to but not including this TXG. In
+        * other words, all changes from this TXG and onward should not
+        * be in this metaslab's space map and must be read from the
+        * log space maps.
+        */
+       uint64_t        ms_unflushed_txg;
+
+       /* updated every time we are done syncing the metaslab's space map */
+       uint64_t        ms_synced_length;
  
         boolean_t       ms_new;
  };
  
+typedef struct metaslab_unflushed_phys {
+       /* on-disk counterpart of ms_unflushed_txg */
+       uint64_t        msp_unflushed_txg;
+} metaslab_unflushed_phys_t;
+
  #ifdef __cplusplus
  }
  #endif