X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=module%2Fzfs%2Fvdev.c;h=81c34da074fd0a2218b42984bacfaaec0add29ab;hb=425d3237ee88abc53d8522a7139c926d278b4b7f;hp=6db6fbacb73568d2f7b204c55e824a8e80ecd23e;hpb=e902ddb0f8c57ac8d5b7d64edc687a022d0b9b7f;p=mirror_zfs.git diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 6db6fbacb..81c34da07 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. */ #include @@ -49,17 +50,24 @@ #include #include #include +#include #include #include -/* maximum number of metaslabs per top-level vdev */ -int vdev_max_ms_count = 200; +/* default target for number of metaslabs per top-level vdev */ +int zfs_vdev_default_ms_count = 200; -/* minimum amount of metaslabs per top-level vdev */ -int vdev_min_ms_count = 16; +/* minimum number of metaslabs per top-level vdev */ +int zfs_vdev_min_ms_count = 16; -/* see comment in vdev_metaslab_set_size() */ -int vdev_default_ms_shift = 29; +/* practical upper limit of total metaslabs per top-level vdev */ +int zfs_vdev_ms_count_limit = 1ULL << 17; + +/* lower limit for metaslab size (512M) */ +int zfs_vdev_default_ms_shift = 29; + +/* upper limit for metaslab size (16G) */ +int zfs_vdev_max_ms_shift = 34; int vdev_validate_skip = B_FALSE; @@ -70,14 +78,14 @@ int vdev_validate_skip = B_FALSE; int vdev_dtl_sm_blksz = (1 << 12); /* - * Rate limit delay events to this many IO delays per second. + * Rate limit slow IO (delay) events to this many per second. */ -unsigned int zfs_delays_per_second = 20; +unsigned int zfs_slow_io_events_per_second = 20; /* * Rate limit checksum events after this many checksum errors per second. */ -unsigned int zfs_checksums_per_second = 20; +unsigned int zfs_checksum_events_per_second = 20; /* * Ignore errors during scrub/resilver. Allows to work around resilver @@ -92,6 +100,13 @@ int zfs_scan_ignore_errors = 0; */ int vdev_standard_sm_blksz = (1 << 17); +/* + * Tunable parameter for debugging or performance analysis. Setting this + * will cause pool corruption on power loss if a volatile out-of-order + * write cache is enabled. + */ +int zfs_nocacheflush = 0; + /*PRINTFLIKE2*/ void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) @@ -198,6 +213,33 @@ vdev_getops(const char *type) return (ops); } +/* ARGSUSED */ +void +vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +{ + res->rs_start = in->rs_start; + res->rs_end = in->rs_end; +} + +/* + * Derive the enumerated alloction bias from string input. + * String origin is either the per-vdev zap or zpool(1M). + */ +static vdev_alloc_bias_t +vdev_derive_alloc_bias(const char *bias) +{ + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + + if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) + alloc_bias = VDEV_BIAS_LOG; + else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) + alloc_bias = VDEV_BIAS_SPECIAL; + else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) + alloc_bias = VDEV_BIAS_DEDUP; + + return (alloc_bias); +} + /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. @@ -481,16 +523,22 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. 
*/ - zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1); - zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1); + zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, + 1); + zfs_ratelimit_init(&vd->vdev_checksum_rl, + &zfs_checksum_events_per_second, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); + list_link_init(&vd->vdev_initialize_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); @@ -522,6 +570,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vdev_indirect_config_t *vic; char *tmp = NULL; int rc; + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -608,11 +658,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } ASSERT(nparity != -1ULL); + /* + * If creating a top-level vdev, check for allocation classes input + */ + if (top_level && alloctype == VDEV_ALLOC_ADD) { + char *bias; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + alloc_bias = vdev_derive_alloc_bias(bias); + + /* spa_vdev_add() expects feature to be enabled */ + if (spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, + SPA_FEATURE_ALLOCATION_CLASSES)) { + return (SET_ERROR(ENOTSUP)); + } + } + } + vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) + vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -681,7 +752,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && + if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); @@ -697,13 +768,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, ASSERT0(vd->vdev_top_zap); } - if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { + if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); - vd->vdev_mg = metaslab_group_create(islog ? 
- spa_log_class(spa) : spa_normal_class(spa), vd); + /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && @@ -742,6 +812,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); + if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) + vdev_set_deferred_resilver(spa, vd); + /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another @@ -790,6 +863,7 @@ void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -820,6 +894,7 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + ASSERT(vd->vdev_initialize_thread == NULL); /* * Discard allocation state. @@ -892,11 +967,14 @@ vdev_free(vdev_t *vd) rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); - mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); + mutex_destroy(&vd->vdev_initialize_io_lock); + cv_destroy(&vd->vdev_initialize_io_cv); + cv_destroy(&vd->vdev_initialize_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -945,6 +1023,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; + tvd->vdev_alloc_bias = svd->vdev_alloc_bias; + svd->vdev_alloc_bias = VDEV_BIAS_NONE; + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; @@ -1107,6 +1188,55 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +static void +vdev_metaslab_group_create(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + /* + * metaslab_group_create was delayed until allocation bias was available + */ + if (vd->vdev_mg == NULL) { + metaslab_class_t *mc; + + if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) + vd->vdev_alloc_bias = VDEV_BIAS_LOG; + + ASSERT3U(vd->vdev_islog, ==, + (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + mc = spa_log_class(spa); + break; + case VDEV_BIAS_SPECIAL: + mc = spa_special_class(spa); + break; + case VDEV_BIAS_DEDUP: + mc = spa_dedup_class(spa); + break; + default: + mc = spa_normal_class(spa); + } + + vd->vdev_mg = metaslab_group_create(mc, vd, + spa->spa_alloc_count); + + /* + * The spa ashift values currently only reflect the + * general vdev classes. 
Class destination is late + * binding so ashift checking had to wait until now + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + } +} + int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { @@ -1117,6 +1247,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; + boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); @@ -1132,14 +1263,13 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); - if (oldc != 0) { + if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { uint64_t object = 0; @@ -1159,6 +1289,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } +#ifndef _KERNEL + /* + * To accomodate zdb_leak_init() fake indirect + * metaslabs, we allocate a metaslab group for + * indirect vdevs which normally don't have one. + */ + if (vd->vdev_mg == NULL) { + ASSERT0(vdev_is_concrete(vd)); + vdev_metaslab_group_create(vd); + } +#endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { @@ -1176,8 +1317,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ - if (oldc == 0 && !vd->vdev_removing) + if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); + } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -1667,9 +1809,13 @@ vdev_open(vdev_t *vd) /* * Track the min and max ashift values for normal data devices. + * + * DJB - TBD these should perhaps be tracked per allocation class + * (e.g. spa_min_ashift is used to round up post compression buffers) */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - !vd->vdev_islog && vd->vdev_aux == NULL) { + vd->vdev_alloc_bias == VDEV_BIAS_NONE && + vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) @@ -1682,8 +1828,13 @@ vdev_open(vdev_t *vd) * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && - vdev_resilver_needed(vd, NULL, NULL)) - spa_async_request(spa, SPA_ASYNC_RESILVER); + vdev_resilver_needed(vd, NULL, NULL)) { + if (dsl_scan_resilvering(spa->spa_dsl_pool) && + spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) + vdev_set_deferred_resilver(spa, vd); + else + spa_async_request(spa, SPA_ASYNC_RESILVER); + } return (0); } @@ -2130,34 +2281,67 @@ void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; - uint64_t ms_shift = 0; + uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; + uint64_t ms_shift; /* - * For vdevs that are bigger than 8G the metaslab size varies in - * a way that the number of metaslabs increases in powers of two, - * linearly in terms of vdev_asize, starting from 16 metaslabs. - * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32, - * and so on, until we hit the maximum metaslab count limit - * [vdev_max_ms_count] from which point the metaslab count stays - * the same. 
+ * There are two dimensions to the metaslab sizing calculation: + * the size of the metaslab and the count of metaslabs per vdev. + * + * The default values used below are a good balance between memory + * usage (larger metaslab size means more memory needed for loaded + * metaslabs; more metaslabs means more memory needed for the + * metaslab_t structs), metaslab load time (larger metaslabs take + * longer to load), and metaslab sync time (more metaslabs means + * more time spent syncing all of them). + * + * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. + * The range of the dimensions are as follows: + * + * 2^29 <= ms_size <= 2^34 + * 16 <= ms_count <= 131,072 + * + * On the lower end of vdev sizes, we aim for metaslabs sizes of + * at least 512MB (2^29) to minimize fragmentation effects when + * testing with smaller devices. However, the count constraint + * of at least 16 metaslabs will override this minimum size goal. + * + * On the upper end of vdev sizes, we aim for a maximum metaslab + * size of 16GB. However, we will cap the total count to 2^17 + * metaslabs to keep our memory footprint in check and let the + * metaslab size grow from there if that limit is hit. + * + * The net effect of applying above constrains is summarized below. + * + * vdev size metaslab count + * --------------|----------------- + * < 8GB ~16 + * 8GB - 100GB one per 512MB + * 100GB - 3TB ~200 + * 3TB - 2PB one per 16GB + * > 2PB ~131,072 + * -------------------------------- + * + * Finally, note that all of the above calculate the initial + * number of metaslabs. Expanding a top-level vdev will result + * in additional metaslabs being allocated making it possible + * to exceed the zfs_vdev_ms_count_limit. */ - ms_shift = vdev_default_ms_shift; - if ((asize >> ms_shift) < vdev_min_ms_count) { - /* - * For devices that are less than 8G we want to have - * exactly 16 metaslabs. We don't want less as integer - * division rounds down, so less metaslabs mean more - * wasted space. We don't want more as these vdevs are - * small and in the likely event that we are running - * out of space, the SPA will have a hard time finding - * space due to fragmentation. - */ - ms_shift = highbit64(asize / vdev_min_ms_count); - ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT); - - } else if ((asize >> ms_shift) > vdev_max_ms_count) { - ms_shift = highbit64(asize / vdev_max_ms_count); + if (ms_count < zfs_vdev_min_ms_count) + ms_shift = highbit64(asize / zfs_vdev_min_ms_count); + else if (ms_count > zfs_vdev_default_ms_count) + ms_shift = highbit64(asize / zfs_vdev_default_ms_count); + else + ms_shift = zfs_vdev_default_ms_shift; + + if (ms_shift < SPA_MAXBLOCKSHIFT) { + ms_shift = SPA_MAXBLOCKSHIFT; + } else if (ms_shift > zfs_vdev_max_ms_shift) { + ms_shift = zfs_vdev_max_ms_shift; + /* cap the total count to constrain memory footprint */ + if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) + ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; @@ -2353,6 +2537,9 @@ vdev_dtl_should_excise(vdev_t *vd) if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); + if (vd->vdev_resilver_deferred) + return (B_FALSE); + if (vd->vdev_resilver_txg == 0 || range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); @@ -2374,7 +2561,8 @@ vdev_dtl_should_excise(vdev_t *vd) } /* - * Reassess DTLs after a config change or scrub completion. + * Reassess DTLs after a config change or scrub completion. If txg == 0 no + * write operations will be issued to the pool. 
*/ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) @@ -2457,7 +2645,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) * DTLs then reset its resilvering flag and dirty * the top level so that we persist the change. */ - if (vd->vdev_resilver_txg != 0 && + if (txg != 0 && vd->vdev_resilver_txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { vd->vdev_resilver_txg = 0; @@ -2513,13 +2701,6 @@ vdev_dtl_load(vdev_t *vd) ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); - - /* - * Now that we've opened the space_map we need to update - * the in-core DTL. - */ - space_map_update(vd->vdev_dtl_sm); - error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); @@ -2536,6 +2717,30 @@ vdev_dtl_load(vdev_t *vd) return (error); } +static void +vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *string; + + ASSERT(alloc_bias != VDEV_BIAS_NONE); + + string = + (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : + (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : + (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; + + ASSERT(string != NULL); + VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, + 1, strlen(string) + 1, string, tx)); + + if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { + spa_activate_allocation_classes(spa, tx); + } +} + void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { @@ -2572,8 +2777,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) + vdev_zap_allocation_data(vd, tx); } } + for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } @@ -2652,10 +2860,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) } dmu_tx_commit(tx); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_update(vd->vdev_dtl_sm); - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -2733,25 +2937,28 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) } /* - * Gets the checkpoint space map object from the vdev's ZAP. - * Returns the spacemap object, or 0 if it wasn't in the ZAP - * or the ZAP doesn't exist yet. + * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj + * will contain either the checkpoint spacemap object or zero if none exists. + * All other errors are returned to the caller. 
*/ int -vdev_checkpoint_sm_object(vdev_t *vd) +vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + *sm_obj = 0; return (0); } - uint64_t sm_obj = 0; - int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); - - VERIFY(err == 0 || err == ENOENT); + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); + if (error == ENOENT) { + *sm_obj = 0; + error = 0; + } - return (sm_obj); + return (error); } int @@ -2771,10 +2978,27 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + /* + * On spa_load path, grab the allocation bias from our zap + */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + char bias_str[64]; + + if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), + bias_str) == 0) { + ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); + vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); + } + } + /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); + if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -2790,8 +3014,9 @@ vdev_load(vdev_t *vd) return (error); } - uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); - if (checkpoint_sm_obj != 0) { + uint64_t checkpoint_sm_obj; + error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); + if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); @@ -2806,17 +3031,22 @@ vdev_load(vdev_t *vd) return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - space_map_update(vd->vdev_checkpoint_sm); /* * Since the checkpoint_sm contains free entries - * exclusively we can use sm_alloc to indicate the - * culmulative checkpointed space that has been freed. + * exclusively we can use space_map_allocated() to + * indicate the cumulative checkpointed space that + * has been freed. 
*/ vd->vdev_stat.vs_checkpoint_space = - -vd->vdev_checkpoint_sm->sm_alloc; + -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; + } else if (error != 0) { + vdev_dbgmsg(vd, "vdev_load: failed to retrieve " + "checkpoint space map object from vdev ZAP " + "[error=%d]", error); + return (error); } } @@ -2831,8 +3061,9 @@ vdev_load(vdev_t *vd) return (error); } - uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); - if (obsolete_sm_object != 0) { + uint64_t obsolete_sm_object; + error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); + if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); @@ -2846,7 +3077,10 @@ vdev_load(vdev_t *vd) (u_longlong_t)obsolete_sm_object, error); return (error); } - space_map_update(vd->vdev_obsolete_sm); + } else if (error != 0) { + vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " + "space map object from vdev ZAP [error=%d]", error); + return (error); } return (0); @@ -2925,61 +3159,22 @@ vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) } static void -vdev_remove_empty(vdev_t *vd, uint64_t txg) +vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - dmu_tx_t *tx; + ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); - if (vd->vdev_ms != NULL) { - metaslab_group_t *mg = vd->vdev_mg; - - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - if (msp == NULL || msp->ms_sm == NULL) - continue; - - mutex_enter(&msp->ms_lock); - /* - * If the metaslab was not loaded when the vdev - * was removed then the histogram accounting may - * not be accurate. Update the histogram information - * here so that we ensure that the metaslab group - * and metaslab class are up-to-date. 
- */ - metaslab_group_histogram_remove(mg, msp); - - VERIFY0(space_map_allocated(msp->ms_sm)); - space_map_close(msp->ms_sm); - msp->ms_sm = NULL; - mutex_exit(&msp->ms_lock); - } - - if (vd->vdev_checkpoint_sm != NULL) { - ASSERT(spa_has_checkpoint(spa)); - space_map_close(vd->vdev_checkpoint_sm); - vd->vdev_checkpoint_sm = NULL; - } - - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - ASSERT0(mg->mg_histogram[i]); - } + dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); - - if (vd->vdev_islog && vd->vdev_top_zap != 0) { + if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } + dmu_tx_commit(tx); } @@ -2991,7 +3186,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) ASSERT(vdev_is_concrete(vd)); - while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))) + while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + != NULL) metaslab_sync_done(msp, txg); if (reassess) @@ -3004,17 +3200,14 @@ vdev_sync(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; - dmu_tx_t *tx; + ASSERT3U(txg, ==, spa->spa_syncing_txg); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { - dmu_tx_t *tx; - ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vdev_indirect_sync_obsolete(vd, tx); - dmu_tx_commit(tx); /* * If the vdev is indirect, it can't have dirty @@ -3023,6 +3216,7 @@ vdev_sync(vdev_t *vd, uint64_t txg) if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); + dmu_tx_commit(tx); return; } } @@ -3033,12 +3227,10 @@ vdev_sync(vdev_t *vd, uint64_t txg) !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); - dmu_tx_commit(tx); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { @@ -3050,16 +3242,14 @@ vdev_sync(vdev_t *vd, uint64_t txg) vdev_dtl_sync(lvd, txg); /* - * Remove the metadata associated with this vdev once it's empty. - * Note that this is typically used for log/cache device removal; - * we don't empty toplevel vdevs when removing them. But if - * a toplevel happens to be emptied, this is not harmful. + * If this is an empty log device being removed, destroy the + * metadata associated with it. 
*/ - if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { - vdev_remove_empty(vd, txg); - } + if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) + vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); + dmu_tx_commit(tx); } uint64_t @@ -3216,7 +3406,9 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || + spa->spa_autoexpand); + vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); @@ -3243,6 +3435,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } + /* Restart initializing if necessary */ + mutex_enter(&vd->vdev_initialize_lock); + if (vdev_writeable(vd) && + vd->vdev_initialize_thread == NULL && + vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) vdev_initialize(vd); + } + mutex_exit(&vd->vdev_initialize_lock); + if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) @@ -3306,8 +3507,8 @@ top: */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { - ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, - !=, 0); + ASSERT3U(space_map_allocated( + tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } @@ -3386,6 +3587,7 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -3423,8 +3625,14 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); - if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) - spa_async_request(spa, SPA_ASYNC_RESILVER); + if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) { + if (dsl_scan_resilvering(spa->spa_dsl_pool) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) + vdev_set_deferred_resilver(spa, vd); + else + spa_async_request(spa, SPA_ASYNC_RESILVER); + } spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } @@ -3552,13 +3760,17 @@ vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { + if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) + return (B_TRUE); + /* - * Assuming 47 bits of the space map entry dedicated for the entry's - * offset (see description in space_map.h), we calculate the maximum - * address that can be described by a space map entry for the given - * device. + * If double-word space map entries are not enabled we assume + * 47 bits of the space map entry are dedicated to the entry's + * offset (see SM_OFFSET_BITS in space_map.h). We then use that + * to calculate the maximum address that can be described by a + * space map entry for the given device. 
*/ - uint64_t shift = vd->vdev_ashift + 47; + uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); @@ -3626,9 +3838,22 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + /* + * Report intializing progress. Since we don't + * have the initializing locks held, this is only + * an estimate (although a fairly accurate one). + */ + vs->vs_initialize_bytes_done = + vd->vdev_initialize_bytes_done; + vs->vs_initialize_bytes_est = + vd->vdev_initialize_bytes_est; + vs->vs_initialize_state = vd->vdev_initialize_state; + vs->vs_initialize_action_time = + vd->vdev_initialize_action_time; + } /* * Report expandable space on top-level, non-auxillary devices * only. The expandable space is reported in terms of metaslab @@ -3642,8 +3867,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) } if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { - vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; + vs->vs_fragmentation = (vd->vdev_mg != NULL) ? + vd->vdev_mg->mg_fragmentation : 0; } + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0); @@ -3847,19 +4075,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } +int64_t +vdev_deflated_space(vdev_t *vd, int64_t space) +{ + ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); + + return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); +} + /* - * Update the in-core space usage stats for this vdev, its metaslab class, - * and the root vdev. + * Update the in-core space usage stats for this vdev and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { - int64_t dspace_delta = space_delta; + int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - metaslab_group_t *mg = vd->vdev_mg; - metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -3869,32 +4103,29 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. 
*/ - ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); - ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); - dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; + dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); + /* ensure we won't underflow */ + if (alloc_delta < 0) { + ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); + } + vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (mc == spa_normal_class(spa)) { + /* every class but log contributes to root space stats */ + if (vd->vdev_mg != NULL && !vd->vdev_islog) { + ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } - - if (mc != NULL) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - metaslab_class_space_update(mc, - alloc_delta, defer_delta, space_delta, dspace_delta); - } + /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* @@ -4323,7 +4554,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } @@ -4383,6 +4616,21 @@ vdev_deadman(vdev_t *vd, char *tag) } } +void +vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) +{ + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_set_deferred_resilver(spa, vd->vdev_child[i]); + + if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) || + range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + return; + } + + vd->vdev_resilver_deferred = B_TRUE; + spa->spa_resilver_deferred = B_TRUE; +} + #if defined(_KERNEL) EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); @@ -4390,21 +4638,24 @@ EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ -module_param(vdev_max_ms_count, int, 0644); -MODULE_PARM_DESC(vdev_max_ms_count, - "Divide added vdev into approximately (but no more than) this number " - "of metaslabs"); +module_param(zfs_vdev_default_ms_count, int, 0644); +MODULE_PARM_DESC(zfs_vdev_default_ms_count, + "Target number of metaslabs per top-level vdev"); -module_param(vdev_min_ms_count, int, 0644); -MODULE_PARM_DESC(vdev_min_ms_count, +module_param(zfs_vdev_min_ms_count, int, 0644); +MODULE_PARM_DESC(zfs_vdev_min_ms_count, "Minimum number of metaslabs per top-level vdev"); -module_param(zfs_delays_per_second, uint, 0644); -MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many " - "IO delays per second"); +module_param(zfs_vdev_ms_count_limit, int, 0644); +MODULE_PARM_DESC(zfs_vdev_ms_count_limit, + "Practical upper limit of total metaslabs per top-level vdev"); -module_param(zfs_checksums_per_second, uint, 0644); - MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events " +module_param(zfs_slow_io_events_per_second, uint, 0644); +MODULE_PARM_DESC(zfs_slow_io_events_per_second, + "Rate limit slow IO (delay) events to this many per second"); + +module_param(zfs_checksum_events_per_second, uint, 0644); +MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events " "to this many checksum errors per second (do not set below 
zed" "threshold)."); @@ -4415,5 +4666,8 @@ MODULE_PARM_DESC(zfs_scan_ignore_errors, module_param(vdev_validate_skip, int, 0644); MODULE_PARM_DESC(vdev_validate_skip, "Bypass vdev_validate()"); + +module_param(zfs_nocacheflush, int, 0644); +MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes"); /* END CSTYLED */ #endif
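
---

Editor's note (not part of the patch): the large comment added to vdev_metaslab_set_size() above describes the new two-dimensional sizing rule (metaslab size between 2^29 and 2^34 bytes, count between 16 and 2^17) and summarizes it in a table. The sketch below is an illustration only; it copies the tunable defaults introduced by the patch, assumes SPA_MAXBLOCKSHIFT is 24 as in the spa headers, and uses a local stand-in for the kernel highbit64() helper (1-based index of the highest set bit). It reproduces the shift selection logic so the table rows can be checked with concrete vdev sizes; it is not code from vdev.c.

/*
 * Minimal user-space sketch of the metaslab sizing rules from
 * vdev_metaslab_set_size().  Compile with: cc -o ms_size ms_size.c
 */
#include <stdio.h>
#include <stdint.h>

#define	SPA_MAXBLOCKSHIFT	24	/* assumed 16M max block size */

static const uint64_t zfs_vdev_default_ms_count = 200;
static const uint64_t zfs_vdev_min_ms_count = 16;
static const uint64_t zfs_vdev_ms_count_limit = 1ULL << 17;
static const uint64_t zfs_vdev_default_ms_shift = 29;	/* 512M */
static const uint64_t zfs_vdev_max_ms_shift = 34;	/* 16G */

/* Local stand-in for the kernel highbit64(): 1-based highest set bit. */
static uint64_t
highbit64(uint64_t v)
{
	uint64_t h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

/* Mirrors the shift selection performed by vdev_metaslab_set_size(). */
static uint64_t
example_ms_shift(uint64_t asize)
{
	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
	uint64_t ms_shift;

	if (ms_count < zfs_vdev_min_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
	else if (ms_count > zfs_vdev_default_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
	else
		ms_shift = zfs_vdev_default_ms_shift;

	if (ms_shift < SPA_MAXBLOCKSHIFT) {
		ms_shift = SPA_MAXBLOCKSHIFT;
	} else if (ms_shift > zfs_vdev_max_ms_shift) {
		ms_shift = zfs_vdev_max_ms_shift;
		/* cap the total count to constrain memory footprint */
		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
	}
	return (ms_shift);
}

int
main(void)
{
	/* Sample asize values, one per row of the table in the comment. */
	uint64_t sizes[] = {
		4ULL << 30,	/* 4G:   "< 8GB" row          */
		50ULL << 30,	/* 50G:  "8GB - 100GB" row    */
		1ULL << 40,	/* 1T:   "100GB - 3TB" row    */
		100ULL << 40,	/* 100T: "3TB - 2PB" row      */
	};

	for (unsigned i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++) {
		uint64_t shift = example_ms_shift(sizes[i]);
		printf("asize %8llu GiB -> ms_size 2^%llu, ms_count %llu\n",
		    (unsigned long long)(sizes[i] >> 30),
		    (unsigned long long)shift,
		    (unsigned long long)(sizes[i] >> shift));
	}
	return (0);
}

Because highbit64() rounds the shift up, the resulting counts land between half of and the full target (for example, a power-of-two 1T vdev yields 128 metaslabs of 8G rather than exactly 200), which matches the "~" qualifiers in the patch's table.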