*/
/*
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
+#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
TRACE_GROUP_FAILURE = -5ULL,
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
- TRACE_VDEV_ERROR = -8ULL
+ TRACE_VDEV_ERROR = -8ULL,
+ TRACE_DISABLED = -9ULL,
} trace_alloc_type_t;
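+
+/*
+ * Illustrative sketch (not part of this header): the negative values
+ * above are recorded in place of a real device offset in metaslab
+ * allocation trace records, so a consumer can tell a failure reason
+ * from a successful allocation. The helper name is hypothetical.
+ */
+static const char *
+trace_offset_name_sketch(uint64_t offset)
+{
+	if (offset == TRACE_DISABLED)
+		return ("disabled");
+	if (offset == TRACE_CONDENSING)
+		return ("condensing");
+	if (offset == TRACE_ENOSPC)
+		return ("enospc");
+	if (offset == TRACE_VDEV_ERROR)
+		return ("vdev error");
+	return ("offset");	/* an actual device offset */
+}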
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
* number of allocations allowed.
*/
uint64_t *mc_alloc_max_slots;
- refcount_t *mc_alloc_slots;
+ zfs_refcount_t *mc_alloc_slots;
uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ /*
+ * List of all loaded metaslabs in the class, sorted in order of most
+ * recent use.
+ */
+ multilist_t *mc_metaslab_txg_list;
};
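+
+/*
+ * Illustrative sketch (not part of this header): mc_metaslab_txg_list
+ * can be walked with the sys/multilist.h interfaces, here to count the
+ * loaded metaslabs in the class. The function name is hypothetical;
+ * eviction code walks the list the same way, unloading metaslabs that
+ * have not been allocated from recently.
+ */
+static uint64_t
+metaslab_class_loaded_count_sketch(metaslab_class_t *mc)
+{
+	multilist_t *ml = mc->mc_metaslab_txg_list;
+	uint64_t count = 0;
+
+	for (unsigned int i = 0; i < multilist_get_num_sublists(ml); i++) {
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		for (metaslab_t *msp = multilist_sublist_head(mls);
+		    msp != NULL; msp = multilist_sublist_next(mls, msp))
+			count++;
+		multilist_sublist_unlock(mls);
+	}
+	return (count);
+}
+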
+/*
+ * Per-allocator data structure.
+ */
+typedef struct metaslab_group_allocator {
+ uint64_t mga_cur_max_alloc_queue_depth;
+ zfs_refcount_t mga_alloc_queue_depth;
+ metaslab_t *mga_primary;
+ metaslab_t *mga_secondary;
+} metaslab_group_allocator_t;
+
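+/*
+ * Illustrative sketch (not part of this header): the per-allocator
+ * throttle admits a new allocation only while the allocator's queue
+ * depth is below its current maximum. The helper name and tag are
+ * hypothetical; zfs_refcount_count()/zfs_refcount_add() are the
+ * standard zfs_refcount interfaces.
+ */
+static boolean_t
+mga_reserve_sketch(metaslab_group_allocator_t *mga, const void *tag)
+{
+	if (zfs_refcount_count(&mga->mga_alloc_queue_depth) >=
+	    (int64_t)mga->mga_cur_max_alloc_queue_depth)
+		return (B_FALSE);	/* allocator is saturated */
+	(void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
+	return (B_TRUE);
+}
+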
/*
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
* of a top-level vdev. They are linked together to form a circular linked
*/
struct metaslab_group {
kmutex_t mg_lock;
- metaslab_t **mg_primaries;
- metaslab_t **mg_secondaries;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
* groups are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
- uint64_t *mg_cur_max_alloc_queue_depth;
- refcount_t *mg_alloc_queue_depth;
int mg_allocators;
+ metaslab_group_allocator_t *mg_allocator; /* array */
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ int mg_ms_disabled;
+ boolean_t mg_disabled_updating;
+ kmutex_t mg_ms_disabled_lock;
+ kcondvar_t mg_ms_disabled_cv;
};
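+
+/*
+ * Illustrative sketch (not part of this header): a consumer that wants
+ * to disable a metaslab blocks on mg_ms_disabled_cv until no updater
+ * is active and the group's disabled count is below a cap. The
+ * function name and cap parameter are hypothetical; see
+ * metaslab_disable() in metaslab.c for the real logic.
+ */
+static void
+metaslab_group_disable_wait_sketch(metaslab_group_t *mg, int max_disabled)
+{
+	mutex_enter(&mg->mg_ms_disabled_lock);
+	while (mg->mg_disabled_updating || mg->mg_ms_disabled >= max_disabled)
+		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+	mg->mg_ms_disabled++;
+	mutex_exit(&mg->mg_ms_disabled_lock);
+}
+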
/*
* being written.
*/
struct metaslab {
+ /*
+ * This is the main lock of the metaslab and its purpose is to
+ * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
+ * metaslab_free_concrete(), etc.] with our various syncing
+ * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
+ *
+ * The lock is also used during some miscellaneous operations like
+ * using the metaslab's histogram for the metaslab group's histogram
+ * aggregation, or marking the metaslab for initialization.
+ */
kmutex_t ms_lock;
+
+ /*
+ * Acquired together with the ms_lock whenever we expect to
+ * write to metaslab data on-disk (i.e. flushing entries to
+ * the metaslab's space map). It helps coordinate readers of
+ * the metaslab's space map [see spa_vdev_remove_thread()]
+ * with writers [see metaslab_sync() or metaslab_flush()].
+ *
+ * Note that metaslab_load(), even though a reader, uses
+ * a completely different mechanism to deal with the reading
+ * of the metaslab's space map based on ms_synced_length. That
+ * said, the function still uses the ms_sync_lock after it
+ * has read the ms_sm [see relevant comment in metaslab_load()
+ * as to why].
+ */
kmutex_t ms_sync_lock;
+
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
+ uint64_t ms_allocated_this_txg;
+ uint64_t ms_allocating_total;
/*
* The following range trees are accessed only from syncing context.
range_tree_t *ms_defer[TXG_DEFER_SIZE];
range_tree_t *ms_checkpointing; /* to add to the checkpoint */
+ /*
+ * The ms_trim tree is the set of allocatable segments which are
+ * eligible for trimming. (When the metaslab is loaded, it's a
+ * subset of ms_allocatable.) It's kept in-core as long as the
+ * autotrim property is set and is not vacated when the metaslab
+ * is unloaded. Its purpose is to aggregate freed ranges to
+ * facilitate efficient trimming.
+ */
+ range_tree_t *ms_trim;
+
boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
- uint64_t ms_condense_checked_txg;
/*
- * We must hold both ms_lock and ms_group->mg_lock in order to
- * modify ms_loaded.
+ * The number of consumers which have disabled the metaslab.
+ */
+ uint64_t ms_disabled;
+
+ /*
+ * We must always hold the ms_lock when modifying ms_loaded
+ * and ms_loading.
*/
boolean_t ms_loaded;
boolean_t ms_loading;
+ kcondvar_t ms_flush_cv;
+ boolean_t ms_flushing;
+ /*
+ * The following histograms count entries that are in the
+ * metaslab's space map (and its histogram) but are not in
+ * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+ * or ms_defer[].
+ *
+ * When the metaslab is not loaded, its ms_weight needs to
+ * reflect what is allocatable (i.e. what will be part of
+ * ms_allocatable if it is loaded). The weight is computed from
+ * the spacemap histogram, but that includes ranges that are
+ * not yet allocatable (because they are in ms_freed,
+ * ms_freeing, or ms_defer[]). Therefore, when calculating the
+ * weight, we need to remove those ranges.
+ *
+ * The ranges in the ms_freed and ms_defer[] range trees are all
+ * present in the spacemap. However, the spacemap may have
+ * multiple entries to represent a contiguous range, because it
+ * is written across multiple sync passes, but the changes of
+ * all sync passes are consolidated into the range trees.
+ * Adjacent ranges that are freed in different sync passes of
+ * one txg will be represented separately (as 2 or more entries)
+ * in the space map (and its histogram), but these adjacent
+ * ranges will be consolidated (represented as one entry) in the
+ * ms_freed/ms_defer[] range trees (and their histograms).
+ *
+ * When calculating the weight, we cannot simply subtract the
+ * range trees' histograms from the spacemap's histogram,
+ * because the range trees' histograms may have entries in
+ * higher buckets than the spacemap, due to consolidation.
+ * Instead we must subtract the exact entries that were added to
+ * the spacemap's histogram. ms_synchist and ms_deferhist[]
+ * represent these exact entries, so we can subtract them from
+ * the spacemap's histogram when calculating ms_weight.
+ *
+ * ms_synchist represents the same ranges as ms_freeing +
+ * ms_freed, but without consolidation across sync passes.
+ *
+ * ms_deferhist[i] represents the same ranges as ms_defer[i],
+ * but without consolidation across sync passes.
+ */
+ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+ uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+ /*
+ * Tracks the exact amount of allocated space of this metaslab
+ * (and specifically the metaslab's space map) up to the most
+ * recently completed sync pass [see usage in metaslab_sync()].
+ */
+ uint64_t ms_allocated_space;
int64_t ms_deferspace; /* sum of ms_defer[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
* stay cached.
*/
uint64_t ms_selected_txg;
+ /*
+ * ms_load/unload_time can be used for performance monitoring
+ * (e.g. by dtrace or mdb).
+ */
+ hrtime_t ms_load_time; /* time last loaded */
+ hrtime_t ms_unload_time; /* time last unloaded */
+ hrtime_t ms_selected_time; /* time last allocated from */
uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */
* only difference is that the ms_allocatable_by_size is ordered by
* segment sizes.
*/
- avl_tree_t ms_allocatable_by_size;
+ zfs_btree_t ms_allocatable_by_size;
+ zfs_btree_t ms_unflushed_frees_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+ avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+ /*
+ * Node in metaslab class's selected txg list
+ */
+ multilist_node_t ms_class_txg_node;
+
+ /*
+ * Allocs and frees that are committed to the vdev log spacemap but
+ * not yet to this metaslab's spacemap.
+ */
+ range_tree_t *ms_unflushed_allocs;
+ range_tree_t *ms_unflushed_frees;
+
+ /*
+ * We have flushed entries up to but not including this TXG. In
+ * other words, all changes from this TXG and onward should not
+ * be in this metaslab's space map and must be read from the
+ * log space maps.
+ */
+ uint64_t ms_unflushed_txg;
+
+ /* updated every time we are done syncing the metaslab's space map */
+ uint64_t ms_synced_length;
boolean_t ms_new;
};
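+
+/*
+ * Illustrative sketch (not part of this header) of the unloaded-weight
+ * bookkeeping described above: start from the space map's histogram
+ * and subtract the exact entries tracked in ms_synchist and
+ * ms_deferhist[], leaving only what will be allocatable once the
+ * metaslab is loaded. The function name is hypothetical; compare the
+ * weight code in metaslab.c.
+ */
+static void
+ms_allocatable_histogram_sketch(metaslab_t *msp,
+    uint64_t hist[SPACE_MAP_HISTOGRAM_SIZE])
+{
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		hist[i] = msp->ms_sm->sm_phys->smp_histogram[i] -
+		    msp->ms_synchist[i];
+		for (int t = 0; t < TXG_DEFER_SIZE; t++)
+			hist[i] -= msp->ms_deferhist[t][i];
+	}
+}
+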
+typedef struct metaslab_unflushed_phys {
+ /* on-disk counterpart of ms_unflushed_txg */
+ uint64_t msp_unflushed_txg;
+} metaslab_unflushed_phys_t;
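+
+/*
+ * Illustrative sketch (not part of this header): when replaying log
+ * space maps, an entry from txg entry_txg applies to this metaslab
+ * only if it has not already been flushed into the metaslab's own
+ * space map. The function name is hypothetical.
+ */
+static boolean_t
+metaslab_entry_is_unflushed_sketch(const metaslab_t *msp,
+    uint64_t entry_txg)
+{
+	return (entry_txg >= msp->ms_unflushed_txg ? B_TRUE : B_FALSE);
+}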
+
#ifdef __cplusplus
}
#endif