#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
/* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
/* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
/* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
int vdev_validate_skip = B_FALSE;
int vdev_dtl_sm_blksz = (1 << 12);
/*
- * Rate limit delay events to this many IO delays per second.
+ * Rate limit slow IO (delay) events to this many per second.
*/
-unsigned int zfs_delays_per_second = 20;
+unsigned int zfs_slow_io_events_per_second = 20;
/*
* Rate limit checksum events after this many checksum errors per second.
*/
-unsigned int zfs_checksums_per_second = 20;
+unsigned int zfs_checksum_events_per_second = 20;
/*
* Ignore errors during scrub/resilver. Allows to work around resilver
*/
int vdev_standard_sm_blksz = (1 << 17);
+/*
+ * Tunable parameter for debugging or performance analysis. Setting this
+ * will cause pool corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zfs_nocacheflush = 0;
+
/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
return (ops);
}
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+ res->rs_start = in->rs_start;
+ res->rs_end = in->rs_end;
+}
+
/*
* Derive the enumerated alloction bias from string input.
* String origin is either the per-vdev zap or zpool(1M).
* and checksum events so that we don't overwhelm ZED with thousands
* of events when a disk is acting up.
*/
- zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1);
- zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1);
+ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
+ 1);
+ zfs_ratelimit_init(&vd->vdev_checksum_rl,
+ &zfs_checksum_events_per_second, 1);
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
+ list_link_init(&vd->vdev_initialize_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
/*
* Scan queues are normally destroyed at the end of a scan. If the
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ ASSERT(vd->vdev_initialize_thread == NULL);
/*
* Discard allocation state.
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);
- mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
vdev_metaslab_set_size(vdev_t *vd)
{
uint64_t asize = vd->vdev_asize;
- uint64_t ms_count = asize >> vdev_default_ms_shift;
+ uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
uint64_t ms_shift;
/*
* There are two dimensions to the metaslab sizing calculation:
* the size of the metaslab and the count of metaslabs per vdev.
- * In general, we aim for vdev_max_ms_count (200) metaslabs. The
- * range of the dimensions are as follows:
*
- * 2^29 <= ms_size <= 2^38
+ * The default values used below are a good balance between memory
+ * usage (larger metaslab size means more memory needed for loaded
+ * metaslabs; more metaslabs means more memory needed for the
+ * metaslab_t structs), metaslab load time (larger metaslabs take
+ * longer to load), and metaslab sync time (more metaslabs means
+ * more time spent syncing all of them).
+ *
+ * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+ * The range of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^34
* 16 <= ms_count <= 131,072
*
* On the lower end of vdev sizes, we aim for metaslabs sizes of
* of at least 16 metaslabs will override this minimum size goal.
*
* On the upper end of vdev sizes, we aim for a maximum metaslab
- * size of 256GB. However, we will cap the total count to 2^17
- * metaslabs to keep our memory footprint in check.
+ * size of 16GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check and let the
+ * metaslab size grow from there if that limit is hit.
*
* The net effect of applying above constrains is summarized below.
*
- * vdev size metaslab count
- * -------------|-----------------
- * < 8GB ~16
- * 8GB - 100GB one per 512MB
- * 100GB - 50TB ~200
- * 50TB - 32PB one per 256GB
- * > 32PB ~131,072
- * -------------------------------
+ * vdev size metaslab count
+ * --------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
+ * --------------------------------
+ *
+ * Finally, note that all of the above calculate the initial
+ * number of metaslabs. Expanding a top-level vdev will result
+ * in additional metaslabs being allocated making it possible
+ * to exceed the zfs_vdev_ms_count_limit.
*/
- if (ms_count < vdev_min_ms_count)
- ms_shift = highbit64(asize / vdev_min_ms_count);
- else if (ms_count > vdev_max_ms_count)
- ms_shift = highbit64(asize / vdev_max_ms_count);
+ if (ms_count < zfs_vdev_min_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+ else if (ms_count > zfs_vdev_default_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
else
- ms_shift = vdev_default_ms_shift;
+ ms_shift = zfs_vdev_default_ms_shift;
if (ms_shift < SPA_MAXBLOCKSHIFT) {
ms_shift = SPA_MAXBLOCKSHIFT;
- } else if (ms_shift > vdev_max_ms_shift) {
- ms_shift = vdev_max_ms_shift;
+ } else if (ms_shift > zfs_vdev_max_ms_shift) {
+ ms_shift = zfs_vdev_max_ms_shift;
/* cap the total count to constrain memory footprint */
- if ((asize >> ms_shift) > vdev_ms_count_limit)
- ms_shift = highbit64(asize / vdev_ms_count_limit);
+ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+ ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
}
vd->vdev_ms_shift = ms_shift;
}
/*
- * Reassess DTLs after a config change or scrub completion.
+ * Reassess DTLs after a config change or scrub completion. If txg == 0 no
+ * write operations will be issued to the pool.
*/
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
* DTLs then reset its resilvering flag and dirty
* the top level so that we persist the change.
*/
- if (vd->vdev_resilver_txg != 0 &&
+ if (txg != 0 && vd->vdev_resilver_txg != 0 &&
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
vd->vdev_resilver_txg = 0;
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
-
- /*
- * Now that we've opened the space_map we need to update
- * the in-core DTL.
- */
- space_map_update(vd->vdev_dtl_sm);
-
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
}
dmu_tx_commit(tx);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_update(vd->vdev_dtl_sm);
- mutex_exit(&vd->vdev_dtl_lock);
}
/*
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
- space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
- * exclusively we can use sm_alloc to indicate the
- * cumulative checkpointed space that has been freed.
+ * exclusively we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
- -vd->vdev_checkpoint_sm->sm_alloc;
+ -space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
} else if (error != 0) {
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
- space_map_update(vd->vdev_obsolete_sm);
} else if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
"space map object from vdev ZAP [error=%d]", error);
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
- if (vd->vdev_ms != NULL) {
- metaslab_group_t *mg = vd->vdev_mg;
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- if (msp == NULL || msp->ms_sm == NULL)
- continue;
-
- mutex_enter(&msp->ms_lock);
- /*
- * If the metaslab was not loaded when the vdev
- * was removed then the histogram accounting may
- * not be accurate. Update the histogram information
- * here so that we ensure that the metaslab group
- * and metaslab class are up-to-date.
- */
- metaslab_group_histogram_remove(mg, msp);
-
- VERIFY0(space_map_allocated(msp->ms_sm));
- space_map_close(msp->ms_sm);
- msp->ms_sm = NULL;
- mutex_exit(&msp->ms_lock);
- }
-
- if (vd->vdev_checkpoint_sm != NULL) {
- ASSERT(spa_has_checkpoint(spa));
- space_map_close(vd->vdev_checkpoint_sm);
- vd->vdev_checkpoint_sm = NULL;
- }
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- ASSERT0(mg->mg_histogram[i]);
- }
-
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
vdev_destroy_spacemaps(vd, tx);
ASSERT(vdev_is_concrete(vd));
- while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))))
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
metaslab_sync_done(msp, txg);
if (reassess)
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
- dmu_tx_t *tx;
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
- dmu_tx_t *tx;
-
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
- dmu_tx_commit(tx);
/*
* If the vdev is indirect, it can't have dirty
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
return;
}
}
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
- dmu_tx_commit(tx);
}
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
vdev_remove_empty_log(vd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
}
uint64_t
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
spa->spa_autoexpand);
+ vd->vdev_expansion_time = gethrestime_sec();
}
vdev_reopen(tvd);
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
- ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
- !=, 0);
+ ASSERT3U(space_map_allocated(
+ tvd->vdev_checkpoint_sm), !=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_slow_ios = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
boolean_t
vdev_is_spacemap_addressable(vdev_t *vd)
{
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+ return (B_TRUE);
+
/*
- * Assuming 47 bits of the space map entry dedicated for the entry's
- * offset (see description in space_map.h), we calculate the maximum
- * address that can be described by a space map entry for the given
- * device.
+ * If double-word space map entries are not enabled we assume
+ * 47 bits of the space map entry are dedicated to the entry's
+ * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+ * to calculate the maximum address that can be described by a
+ * space map entry for the given device.
*/
- uint64_t shift = vd->vdev_ashift + 47;
+ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
if (shift >= 63) /* detect potential overflow */
return (B_TRUE);
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
+ /*
+ * Report intializing progress. Since we don't
+ * have the initializing locks held, this is only
+ * an estimate (although a fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done =
+ vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est =
+ vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time =
+ vd->vdev_initialize_action_time;
+ }
/*
* Report expandable space on top-level, non-auxillary devices
* only. The expandable space is reported in terms of metaslab
dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
+ /* ensure we won't underflow */
+ if (alloc_delta < 0) {
+ ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
+ }
+
vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
/* every class but log contributes to root space stats */
if (vd->vdev_mg != NULL && !vd->vdev_islog) {
+ ASSERT(!vd->vdev_isl2cache);
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
/* BEGIN CSTYLED */
-module_param(vdev_max_ms_count, int, 0644);
-MODULE_PARM_DESC(vdev_max_ms_count,
+module_param(zfs_vdev_default_ms_count, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_default_ms_count,
"Target number of metaslabs per top-level vdev");
-module_param(vdev_min_ms_count, int, 0644);
-MODULE_PARM_DESC(vdev_min_ms_count,
+module_param(zfs_vdev_min_ms_count, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_min_ms_count,
"Minimum number of metaslabs per top-level vdev");
-module_param(vdev_ms_count_limit, int, 0644);
-MODULE_PARM_DESC(vdev_ms_count_limit,
+module_param(zfs_vdev_ms_count_limit, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_ms_count_limit,
"Practical upper limit of total metaslabs per top-level vdev");
-module_param(zfs_delays_per_second, uint, 0644);
-MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
- "IO delays per second");
+module_param(zfs_slow_io_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_slow_io_events_per_second,
+ "Rate limit slow IO (delay) events to this many per second");
-module_param(zfs_checksums_per_second, uint, 0644);
- MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events "
+module_param(zfs_checksum_events_per_second, uint, 0644);
+MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
"to this many checksum errors per second (do not set below zed"
"threshold).");
module_param(vdev_validate_skip, int, 0644);
MODULE_PARM_DESC(vdev_validate_skip,
"Bypass vdev_validate()");
+
+module_param(zfs_nocacheflush, int, 0644);
+MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes");
/* END CSTYLED */
#endif