#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
/* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
/* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
/* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
int vdev_validate_skip = B_FALSE;
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
+ list_link_init(&vd->vdev_initialize_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);
- mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
vdev_metaslab_set_size(vdev_t *vd)
{
uint64_t asize = vd->vdev_asize;
- uint64_t ms_count = asize >> vdev_default_ms_shift;
+ uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
uint64_t ms_shift;
/*
* There are two dimensions to the metaslab sizing calculation:
* the size of the metaslab and the count of metaslabs per vdev.
- * In general, we aim for vdev_max_ms_count (200) metaslabs. The
- * range of the dimensions are as follows:
*
- * 2^29 <= ms_size <= 2^38
+ * The default values used below are a good balance between memory
+ * usage (larger metaslab size means more memory needed for loaded
+ * metaslabs; more metaslabs means more memory needed for the
+ * metaslab_t structs), metaslab load time (larger metaslabs take
+ * longer to load), and metaslab sync time (more metaslabs means
+ * more time spent syncing all of them).
+ *
+ * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+ * The ranges of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^34
* 16 <= ms_count <= 131,072
*
* On the lower end of vdev sizes, we aim for metaslab sizes of
* at least 512MB (2^29). However, the requirement of having
* at least 16 metaslabs will override this minimum size goal.
*
* On the upper end of vdev sizes, we aim for a maximum metaslab
- * size of 256GB. However, we will cap the total count to 2^17
- * metaslabs to keep our memory footprint in check.
+ * size of 16GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check and let the
+ * metaslab size grow from there if that limit is hit.
*
* The net effect of applying the above constraints is summarized below.
*
- * vdev size metaslab count
- * -------------|-----------------
- * < 8GB ~16
- * 8GB - 100GB one per 512MB
- * 100GB - 50TB ~200
- * 50TB - 32PB one per 256GB
- * > 32PB ~131,072
- * -------------------------------
+ * vdev size metaslab count
+ * --------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
+ * --------------------------------
+ *
+ * Finally, note that all of the above only determines the initial
+ * number of metaslabs. Expanding a top-level vdev will result
+ * in additional metaslabs being allocated, making it possible
+ * to exceed the zfs_vdev_ms_count_limit.
*/
- if (ms_count < vdev_min_ms_count)
- ms_shift = highbit64(asize / vdev_min_ms_count);
- else if (ms_count > vdev_max_ms_count)
- ms_shift = highbit64(asize / vdev_max_ms_count);
+ if (ms_count < zfs_vdev_min_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+ else if (ms_count > zfs_vdev_default_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
else
- ms_shift = vdev_default_ms_shift;
+ ms_shift = zfs_vdev_default_ms_shift;
if (ms_shift < SPA_MAXBLOCKSHIFT) {
ms_shift = SPA_MAXBLOCKSHIFT;
- } else if (ms_shift > vdev_max_ms_shift) {
- ms_shift = vdev_max_ms_shift;
+ } else if (ms_shift > zfs_vdev_max_ms_shift) {
+ ms_shift = zfs_vdev_max_ms_shift;
/* cap the total count to constrain memory footprint */
- if ((asize >> ms_shift) > vdev_ms_count_limit)
- ms_shift = highbit64(asize / vdev_ms_count_limit);
+ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+ ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
}
vd->vdev_ms_shift = ms_shift;
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
-
- /*
- * Now that we've opened the space_map we need to update
- * the in-core DTL.
- */
- space_map_update(vd->vdev_dtl_sm);
-
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
}
dmu_tx_commit(tx);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_update(vd->vdev_dtl_sm);
- mutex_exit(&vd->vdev_dtl_lock);
}
/*
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
- space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
- * exclusively we can use sm_alloc to indicate the
- * cumulative checkpointed space that has been freed.
+ * exclusively we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
- -vd->vdev_checkpoint_sm->sm_alloc;
+ -space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
} else if (error != 0) {
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
- space_map_update(vd->vdev_obsolete_sm);
} else if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
"space map object from vdev ZAP [error=%d]", error);
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
- if (vd->vdev_ms != NULL) {
- metaslab_group_t *mg = vd->vdev_mg;
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- if (msp == NULL || msp->ms_sm == NULL)
- continue;
-
- mutex_enter(&msp->ms_lock);
- /*
- * If the metaslab was not loaded when the vdev
- * was removed then the histogram accounting may
- * not be accurate. Update the histogram information
- * here so that we ensure that the metaslab group
- * and metaslab class are up-to-date.
- */
- metaslab_group_histogram_remove(mg, msp);
-
- VERIFY0(space_map_allocated(msp->ms_sm));
- space_map_close(msp->ms_sm);
- msp->ms_sm = NULL;
- mutex_exit(&msp->ms_lock);
- }
-
- if (vd->vdev_checkpoint_sm != NULL) {
- ASSERT(spa_has_checkpoint(spa));
- space_map_close(vd->vdev_checkpoint_sm);
- vd->vdev_checkpoint_sm = NULL;
- }
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- ASSERT0(mg->mg_histogram[i]);
- }
-
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
vdev_destroy_spacemaps(vd, tx);
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
- dmu_tx_t *tx;
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
- dmu_tx_t *tx;
-
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
- dmu_tx_commit(tx);
/*
* If the vdev is indirect, it can't have dirty
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
return;
}
}
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
- dmu_tx_commit(tx);
}
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
vdev_remove_empty_log(vd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
}
uint64_t
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
- ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
- !=, 0);
+ ASSERT3U(space_map_allocated(
+ tvd->vdev_checkpoint_sm), !=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
boolean_t
vdev_is_spacemap_addressable(vdev_t *vd)
{
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+ return (B_TRUE);
+
/*
- * Assuming 47 bits of the space map entry dedicated for the entry's
- * offset (see description in space_map.h), we calculate the maximum
- * address that can be described by a space map entry for the given
- * device.
+ * If double-word space map entries are not enabled we assume
+ * 47 bits of the space map entry are dedicated to the entry's
+ * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+ * to calculate the maximum address that can be described by a
+ * space map entry for the given device.
*/
- uint64_t shift = vd->vdev_ashift + 47;
+ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
if (shift >= 63) /* detect potential overflow */
return (B_TRUE);
dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
+ /* ensure we won't underflow */
+ if (alloc_delta < 0) {
+ ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
+ }
+
vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
/* every class but log contributes to root space stats */
if (vd->vdev_mg != NULL && !vd->vdev_islog) {
+ ASSERT(!vd->vdev_isl2cache);
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
/* BEGIN CSTYLED */
-module_param(vdev_max_ms_count, int, 0644);
-MODULE_PARM_DESC(vdev_max_ms_count,
+module_param(zfs_vdev_default_ms_count, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_default_ms_count,
"Target number of metaslabs per top-level vdev");
-module_param(vdev_min_ms_count, int, 0644);
-MODULE_PARM_DESC(vdev_min_ms_count,
+module_param(zfs_vdev_min_ms_count, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_min_ms_count,
"Minimum number of metaslabs per top-level vdev");
-module_param(vdev_ms_count_limit, int, 0644);
-MODULE_PARM_DESC(vdev_ms_count_limit,
+module_param(zfs_vdev_ms_count_limit, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_ms_count_limit,
"Practical upper limit of total metaslabs per top-level vdev");
module_param(zfs_slow_io_events_per_second, uint, 0644);