X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=module%2Fzfs%2Fspa.c;h=bbe2f89629a5ab1a905522dbf396e53728f3e8a9;hb=8dc2197b7b1e4d7ebc1420ea30e51c6541f1d834;hp=53b5aabf02fd738eca3b809d13616b6d729c970f;hpb=533ea0415bd788530ff894bc0edbc3cea20fbe4c;p=mirror_zfs.git diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 53b5aabf0..bbe2f8962 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -22,15 +22,16 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright (c) 2017 Datto Inc. * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. */ /* @@ -52,6 +53,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -59,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -81,11 +87,7 @@ #ifdef _KERNEL #include #include -#include #include -#include -#include -#include #include #endif /* _KERNEL */ @@ -96,7 +98,7 @@ * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ -static int zfs_ccw_retry_interval = 300; +int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ @@ -150,30 +152,84 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; -static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, - const char *name); -static void spa_event_post(sysevent_t *ev); static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); -static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, - char **ereport); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ -id_t zio_taskq_psrset_bind = PS_NONE; boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +/* + * Report any spa_load_verify errors found, but do not fail spa_load. + * This is used by zdb to analyze non-idle pools. + */ +boolean_t spa_load_verify_dryrun = B_FALSE; + /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" +/* + * For debugging purposes: print out vdev tree during pool import. + */ +int spa_load_print_vdev_tree = B_FALSE; + +/* + * A non-zero value for zfs_max_missing_tvds means that we allow importing + * pools with missing top-level vdevs. This is strictly intended for advanced + * pool recovery cases since missing data is almost inevitable. 
Pools with + * missing devices can only be imported read-only for safety reasons, and their + * fail-mode will be automatically set to "continue". + * + * With 1 missing vdev we should be able to import the pool and mount all + * datasets. User data that was not modified after the missing device has been + * added should be recoverable. This means that snapshots created prior to the + * addition of that device should be completely intact. + * + * With 2 missing vdevs, some datasets may fail to mount since there are + * dataset statistics that are stored as regular metadata. Some data might be + * recoverable if those vdevs were added recently. + * + * With 3 or more missing vdevs, the pool is severely damaged and MOS entries + * may be missing entirely. Chances of data recovery are very low. Note that + * there are also risks of performing an inadvertent rewind as we might be + * missing all the vdevs with the latest uberblocks. + */ +unsigned long zfs_max_missing_tvds = 0; + +/* + * The parameters below are similar to zfs_max_missing_tvds but are only + * intended for a preliminary open of the pool with an untrusted config which + * might be incomplete or out-dated. + * + * We are more tolerant for pools opened from a cachefile since we could have + * an out-dated cachefile where a device removal was not registered. + * We could have set the limit arbitrarily high but in the case where devices + * are really missing we would want to return the proper error codes; we chose + * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available + * and we get a chance to retrieve the trusted config. + */ +uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; + +/* + * In the case where config was assembled by scanning device paths (/dev/dsks + * by default) we are less tolerant since all the existing devices should have + * been detected and we want spa_load to return the right error codes. + */ +uint64_t zfs_max_missing_tvds_scan = 0; + +/* + * Debugging aid that pauses spa_sync() towards the end. 
+ */ +boolean_t zfs_pause_spa_sync = B_FALSE; + /* * ========================================================================== * SPA properties routines @@ -218,13 +274,21 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { - alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - size = metaslab_class_get_space(spa_normal_class(spa)); + alloc = metaslab_class_get_alloc(mc); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + + size = metaslab_class_get_space(mc); + size += metaslab_class_get_space(spa_special_class(spa)); + size += metaslab_class_get_space(spa_dedup_class(spa)); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, + spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); @@ -250,6 +314,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } + spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, + NULL, spa_load_guid(spa), src); } if (pool != NULL) { @@ -369,8 +435,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); - if ((err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds))) { + err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds); + if (err != 0) { dsl_pool_config_exit(dp, FTAG); break; } @@ -536,7 +603,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) } error = dmu_objset_hold(strval, FTAG, &os); - if (error) + if (error != 0) break; /* @@ -762,6 +829,12 @@ spa_change_guid_check(void *arg, dmu_tx_t *tx) vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + int error = (spa_has_checkpoint(spa)) ? 
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (SET_ERROR(error)); + } + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); @@ -817,7 +890,7 @@ spa_change_guid(spa_t *spa) spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } @@ -874,7 +947,6 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - char name[32]; uint_t flags = 0; boolean_t batch = B_FALSE; @@ -911,14 +983,10 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) for (uint_t i = 0; i < count; i++) { taskq_t *tq; + char name[32]; - if (count > 1) { - (void) snprintf(name, sizeof (name), "%s_%s_%u", - zio_type_name[t], zio_taskq_types[q], i); - } else { - (void) snprintf(name, sizeof (name), "%s_%s", - zio_type_name[t], zio_taskq_types[q]); - } + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) @@ -1034,6 +1102,7 @@ spa_create_zio_taskqs(spa_t *spa) static void spa_thread(void *arg) { + psetid_t zio_taskq_psrset_bind = PS_NONE; callb_cpr_t cprinfo; spa_t *spa = arg; @@ -1113,6 +1182,8 @@ spa_activate(spa_t *spa, int mode) spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); @@ -1149,6 +1220,11 @@ spa_activate(spa_t *spa, int mode) spa_create_zio_taskqs(spa); } + for (size_t i = 0; i < TXG_SIZE; i++) { + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), @@ -1245,12 +1321,24 @@ spa_deactivate(spa_t *spa) } } + for (size_t i = 0; i < TXG_SIZE; i++) { + ASSERT3P(spa->spa_txg_zio[i], !=, NULL); + VERIFY0(zio_wait(spa->spa_txg_zio[i])); + spa->spa_txg_zio[i] = NULL; + } + metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; + metaslab_class_destroy(spa->spa_special_class); + spa->spa_special_class = NULL; + + metaslab_class_destroy(spa->spa_dedup_class); + spa->spa_dedup_class = NULL; + /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. @@ -1346,11 +1434,18 @@ spa_unload(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + spa_load_note(spa, "UNLOADING"); + /* * Stop async tasks. */ spa_async_suspend(spa); + if (spa->spa_root_vdev) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + /* * Stop syncing. */ @@ -1366,10 +1461,10 @@ spa_unload(spa_t *spa) * calling taskq_wait(mg_taskq). 
*/ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } if (spa->spa_mmp.mmp_thread) @@ -1385,9 +1480,26 @@ spa_unload(spa_t *spa) spa->spa_async_zio_root = NULL; } + if (spa->spa_vdev_removal != NULL) { + spa_vdev_removal_destroy(spa->spa_vdev_removal); + spa->spa_vdev_removal = NULL; + } + + if (spa->spa_condense_zthr != NULL) { + zthr_destroy(spa->spa_condense_zthr); + spa->spa_condense_zthr = NULL; + } + + if (spa->spa_checkpoint_discard_zthr != NULL) { + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + + spa_condense_fini(spa); + bpobj_close(&spa->spa_deferred_bpobj); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. @@ -1442,12 +1554,14 @@ spa_unload(spa_t *spa) spa->spa_async_suspended = 0; + spa->spa_indirect_vdevs_loaded = B_FALSE; + if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -1456,7 +1570,7 @@ spa_unload(spa_t *spa) * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ -static void +void spa_load_spares(spa_t *spa) { nvlist_t **spares; @@ -1464,6 +1578,18 @@ spa_load_spares(spa_t *spa) int i; vdev_t *vd, *tvd; +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As spare vdevs are shared among open pools, we skip loading + * them when we load the checkpointed state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* @@ -1573,7 +1699,7 @@ spa_load_spares(spa_t *spa) * Devices which are already active have their details maintained, and are * not re-opened. */ -static void +void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; @@ -1583,6 +1709,19 @@ spa_load_l2cache(spa_t *spa) vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As L2 caches are part of the ARC which is shared among open + * pools, we skip loading them when we load the checkpointed + * state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; @@ -1723,6 +1862,27 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) return (error); } +/* + * Concrete top-level vdevs that are not missing and are not logs. At every + * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 
+ */ +static uint64_t +spa_healthy_core_tvds(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t tvds = 0; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + if (vd->vdev_islog) + continue; + if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) + tvds++; + } + + return (tvds); +} + /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. @@ -1730,48 +1890,24 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - for (int c = 0; c < vd->vdev_children; c++) + for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && - !vd->vdev_ishole) { + vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } -static void -spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd) -{ - ASSERT3U(vd->vdev_children, ==, mvd->vdev_children); - - vd->vdev_top_zap = mvd->vdev_top_zap; - vd->vdev_leaf_zap = mvd->vdev_leaf_zap; - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]); - } -} - -/* - * Validate the current config against the MOS config - */ -static boolean_t -spa_config_valid(spa_t *spa, nvlist_t *config) +static int +spa_check_for_missing_logs(spa_t *spa) { - vdev_t *mrvd, *rvd = spa->spa_root_vdev; - nvlist_t *nv; - - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); - - ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); + vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional - * diagnostic information about missing devices in this config. + * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { @@ -1782,106 +1918,54 @@ spa_config_valid(spa_t *spa, nvlist_t *config) KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - for (int c = 0; c < rvd->vdev_children; c++) { + for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - vdev_t *mtvd = mrvd->vdev_child[c]; - if (tvd->vdev_ops == &vdev_missing_ops && - mtvd->vdev_ops != &vdev_missing_ops && - mtvd->vdev_islog) - child[idx++] = vdev_config_generate(spa, mtvd, - B_FALSE, 0); + /* + * We consider a device as missing only if it failed + * to open (i.e. offline or faulted is not considered + * as missing). + */ + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + child[idx++] = vdev_config_generate(spa, tvd, + B_FALSE, VDEV_CONFIG_MISSING); + } } - if (idx) { - VERIFY(nvlist_add_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, child, idx) == 0); - VERIFY(nvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); + if (idx > 0) { + fnvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv); - for (int i = 0; i < idx; i++) + for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); - } - - /* - * Compare the root vdev tree with the information we have - * from the MOS config (mrvd). 
Check each top-level vdev - * with the corresponding MOS config top-level (mtvd). - */ - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - vdev_t *mtvd = mrvd->vdev_child[c]; - - /* - * Resolve any "missing" vdevs in the current configuration. - * If we find that the MOS config has more accurate information - * about the top-level vdev then use that vdev instead. - */ - if (tvd->vdev_ops == &vdev_missing_ops && - mtvd->vdev_ops != &vdev_missing_ops) { - if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) - continue; + if (idx > 0) { + spa_load_failed(spa, "some log devices are missing"); + vdev_dbgmsg_print_tree(rvd, 2); + return (SET_ERROR(ENXIO)); + } + } else { + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; - /* - * Device specific actions. - */ - if (mtvd->vdev_islog) { + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - /* - * XXX - once we have 'readonly' pool - * support we should be able to handle - * missing data devices by transitioning - * the pool to readonly. - */ - continue; - } - - /* - * Swap the missing vdev with the data we were - * able to obtain from the MOS config. - */ - vdev_remove_child(rvd, tvd); - vdev_remove_child(mrvd, mtvd); - - vdev_add_child(rvd, mtvd); - vdev_add_child(mrvd, tvd); - - spa_config_exit(spa, SCL_ALL, FTAG); - vdev_load(mtvd); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - vdev_reopen(rvd); - } else { - if (mtvd->vdev_islog) { - /* - * Load the slog device's state from the MOS - * config since it's possible that the label - * does not contain the most up-to-date - * information. - */ - vdev_load_log_state(tvd, mtvd); - vdev_reopen(tvd); + spa_load_note(spa, "some log devices are " + "missing, ZIL is dropped."); + vdev_dbgmsg_print_tree(rvd, 2); + break; } - - /* - * Per-vdev ZAP info is stored exclusively in the MOS. - */ - spa_config_valid_zaps(tvd, mtvd); } } - vdev_free(mrvd); - spa_config_exit(spa, SCL_ALL, FTAG); - - /* - * Ensure we were able to validate the config. - */ - return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); + return (0); } /* @@ -1949,11 +2033,11 @@ spa_activate_log(spa_t *spa) } int -spa_offline_log(spa_t *spa) +spa_reset_logs(spa_t *spa) { int error; - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* @@ -2072,13 +2156,13 @@ spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; - zpool_rewind_policy_t policy; + zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; - zpool_get_rewind_policy(spa->spa_config, &policy); + zpool_get_load_policy(spa->spa_config, &policy); - if (policy.zrp_request & ZPOOL_NEVER_REWIND) + if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); @@ -2093,6 +2177,13 @@ spa_load_verify(spa_t *spa) ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { + if (spa->spa_extreme_rewind) { + spa_load_note(spa, "performing a complete scan of the " + "pool since extreme rewind is on. 
This may take " + "a very long time.\n (spa_load_verify_data=%u, " + "spa_load_verify_metadata=%u)", + spa_load_verify_data, spa_load_verify_metadata); + } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); @@ -2103,8 +2194,15 @@ spa_load_verify(spa_t *spa) spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; - if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && - sle.sle_data_count <= policy.zrp_maxdata) { + if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { + spa_load_note(spa, "spa_load_verify found %llu metadata errors " + "and %llu data errors", (u_longlong_t)sle.sle_meta_count, + (u_longlong_t)sle.sle_data_count); + } + + if (spa_load_verify_dryrun || + (!error && sle.sle_meta_count <= policy.zlp_maxmeta && + sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; @@ -2122,6 +2220,9 @@ spa_load_verify(spa_t *spa) spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } + if (spa_load_verify_dryrun) + return (0); + if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); @@ -2145,17 +2246,39 @@ spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) * Find a value in the pool directory object. */ static int -spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { - return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - name, sizeof (uint64_t), 1, val)); + int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val); + + if (error != 0 && (error != ENOENT || log_enoent)) { + spa_load_failed(spa, "couldn't get '%s' value in MOS directory " + "[error=%d]", name, error); + } + + return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); - return (err); + return (SET_ERROR(err)); +} + +static void +spa_spawn_aux_threads(spa_t *spa) +{ + ASSERT(spa_writeable(spa)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_start_indirect_condensing_thread(spa); + + ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); + spa->spa_checkpoint_discard_zthr = + zthr_create(spa_checkpoint_discard_thread_check, + spa_checkpoint_discard_thread, spa); } /* @@ -2241,60 +2364,22 @@ spa_try_repair(spa_t *spa, nvlist_t *config) } static int -spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, - boolean_t mosconfig) +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { - nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; - char *comment; int error; - uint64_t pool_guid; - nvlist_t *nvl; - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) - return (SET_ERROR(EINVAL)); - - ASSERT(spa->spa_comment == NULL); - if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) - spa->spa_comment = spa_strdup(comment); - - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. 
- */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); - - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = SET_ERROR(EEXIST); - } else { - spa->spa_config_guid = pool_guid; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, - &nvl) == 0) { - VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, - KM_SLEEP) == 0); - } - nvlist_free(spa->spa_load_info); - spa->spa_load_info = fnvlist_alloc(); + spa->spa_load_state = state; - gethrestime(&spa->spa_loaded_ts); - error = spa_load_impl(spa, pool_guid, config, state, type, - mosconfig, &ereport); - } + gethrestime(&spa->spa_loaded_ts); + error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); - spa->spa_minref = refcount_count(&spa->spa_refcount); + spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; @@ -2540,44 +2625,104 @@ out: return (error); } -/* - * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. - */ -__attribute__((always_inline)) -static inline int -spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, - char **ereport) +static int +spa_verify_host(spa_t *spa, nvlist_t *mos_config) +{ + uint64_t hostid; + char *hostname; + uint64_t myhostid = 0; + + if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + hostname = fnvlist_lookup_string(mos_config, + ZPOOL_CONFIG_HOSTNAME); + + myhostid = zone_get_hostid(NULL); + + if (hostid != 0 && myhostid != 0 && hostid != myhostid) { + cmn_err(CE_WARN, "pool '%s' could not be " + "loaded as it was last accessed by " + "another system (host: %s hostid: 0x%llx). " + "See: http://illumos.org/msg/ZFS-8000-EY", + spa_name(spa), hostname, (u_longlong_t)hostid); + spa_load_failed(spa, "hostid verification failed: pool " + "last accessed by host: %s (hostid: 0x%llx)", + hostname, (u_longlong_t)hostid); + return (SET_ERROR(EBADF)); + } + } + + return (0); +} + +static int +spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; - nvlist_t *nvroot = NULL; - nvlist_t *label; - vdev_t *rvd; - uberblock_t *ub = &spa->spa_uberblock; - uint64_t children, config_cache_txg = spa->spa_config_txg; - int orig_mode = spa->spa_mode; + nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; - uint64_t obj; - boolean_t missing_feat_write = B_FALSE; - boolean_t activity_check = B_FALSE; + vdev_t *rvd; + uint64_t pool_guid; + char *comment; /* - * If this is an untrusted config, access the pool in read-only mode. - * This prevents things like resilvering recently removed devices. + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. 
*/ - if (!mosconfig) - spa->spa_mode = FREAD; + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { + spa_load_failed(spa, "invalid config provided: '%s' missing", + ZPOOL_CONFIG_POOL_GUID); + return (SET_ERROR(EINVAL)); + } - spa->spa_load_state = state; + /* + * If we are doing an import, ensure that the pool is not already + * imported by checking if its pool guid already exists in the + * spa namespace. + * + * The only case that we allow an already imported pool to be + * imported again, is when the pool is checkpointed and we want to + * look at its checkpointed state from userland tools like zdb. + */ +#ifdef _KERNEL + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { +#else + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0) && + !spa_importing_readonly_checkpoint(spa)) { +#endif + spa_load_failed(spa, "a pool with guid %llu is already open", + (u_longlong_t)pool_guid); + return (SET_ERROR(EEXIST)); + } - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) - return (SET_ERROR(EINVAL)); + spa->spa_config_guid = pool_guid; - parse = (type == SPA_IMPORT_EXISTING ? - VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + + ASSERT(spa->spa_comment == NULL); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + spa->spa_comment = spa_strdup(comment); + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) + spa->spa_config_splitting = fnvlist_dup(nvl); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { + spa_load_failed(spa, "invalid config provided: '%s' missing", + ZPOOL_CONFIG_VDEV_TREE); + return (SET_ERROR(EINVAL)); + } /* * Create "The Godfather" zio to hold all async IOs @@ -2596,11 +2741,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) + if (error != 0) { + spa_load_failed(spa, "unable to parse config [error=%d]", + error); return (error); + } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); @@ -2610,38 +2760,144 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, ASSERT(spa_guid(spa) == pool_guid); } + return (0); +} + +/* + * Recursively open all vdevs in the vdev tree. This function is called twice: + * first with the untrusted config, then with the trusted config. + */ +static int +spa_ld_open_vdevs(spa_t *spa) +{ + int error = 0; + /* - * Try to open all vdevs, loading each label in the process. + * spa_missing_tvds_allowed defines how many top-level vdevs can be + * missing/unopenable for the root vdev to be still considered openable. 
*/ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) + if (spa->spa_trust_config) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; + } else { + spa->spa_missing_tvds_allowed = 0; + } + + spa->spa_missing_tvds_allowed = + MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(spa->spa_root_vdev); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (spa->spa_missing_tvds != 0) { + spa_load_note(spa, "vdev tree has %lld missing top-level " + "vdevs.", (u_longlong_t)spa->spa_missing_tvds); + if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { + /* + * Although theoretically we could allow users to open + * incomplete pools in RW mode, we'd need to add a lot + * of extra logic (e.g. adjust pool space to account + * for missing vdevs). + * This limitation also prevents users from accidentally + * opening the pool in RW mode during data recovery and + * damaging it further. + */ + spa_load_note(spa, "pools with missing top-level " + "vdevs can only be opened in read-only mode."); + error = SET_ERROR(ENXIO); + } else { + spa_load_note(spa, "current settings allow for maximum " + "%lld missing top-level vdevs at this stage.", + (u_longlong_t)spa->spa_missing_tvds_allowed); + } + } + if (error != 0) { + spa_load_failed(spa, "unable to open vdev tree [error=%d]", + error); + } + if (spa->spa_missing_tvds != 0 || error != 0) + vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); + + return (error); +} + +/* + * We need to validate the vdev labels against the configuration that + * we have in hand. This function is called twice: first with an untrusted + * config, then with a trusted config. The validation is more strict when the + * config is trusted. + */ +static int +spa_ld_validate_vdevs(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); + } - /* - * We need to validate the vdev labels against the configuration that - * we have in hand, which is dependent on the setting of mosconfig. If - * mosconfig is true then we're validating the vdev labels based on - * that config. Otherwise, we're validating against the cached config - * (zpool.cache) that was read when we loaded the zfs module, and then - * later we will recursively call spa_load() and validate against - * the vdev config. - * - * If we're assembling a new pool that's been split off from an - * existing pool, the labels haven't yet been updated so we skip - * validation for now. 
- */ - if (type != SPA_IMPORT_ASSEMBLE) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd, mosconfig); - spa_config_exit(spa, SCL_ALL, FTAG); + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + spa_load_failed(spa, "cannot open vdev tree after invalidating " + "some vdevs"); + vdev_dbgmsg_print_tree(rvd, 2); + return (SET_ERROR(ENXIO)); + } - if (error != 0) - return (error); + return (0); +} - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (SET_ERROR(ENXIO)); +static void +spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) +{ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; +} + +static int +spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) +{ + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *label; + uberblock_t *ub = &spa->spa_uberblock; + boolean_t activity_check = B_FALSE; + + /* + * If we are opening the checkpointed state of the pool by + * rewinding to it, at this point we will have written the + * checkpointed uberblock to the vdev labels, so searching + * the labels will find the right uberblock. However, if + * we are opening the checkpointed state read-only, we have + * not modified the labels. Therefore, we must ignore the + * labels and continue using the spa_uberblock that was set + * by spa_ld_checkpoint_rewind. + * + * Note that it would be fine to ignore the labels when + * rewinding (opening writeable) as well. However, if we + * crash just after writing the labels, we will end up + * searching the labels. Doing so in the common case means + * that this code path gets exercised normally, rather than + * just in the edge case. + */ + if (ub->ub_checkpoint_txg != 0 && + spa_importing_readonly_checkpoint(spa)) { + spa_ld_select_uberblock_done(spa, ub); + return (0); } /* @@ -2654,15 +2910,21 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ if (ub->ub_txg == 0) { nvlist_free(label); + spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } + spa_load_note(spa, "using uberblock with txg=%llu", + (u_longlong_t)ub->ub_txg); + + /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. 
*/ - activity_check = spa_activity_check_required(spa, ub, label, config); + activity_check = spa_activity_check_required(spa, ub, label, + spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid() == 0) { @@ -2672,7 +2934,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - error = spa_activity_check(spa, ub, config); + int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); @@ -2689,6 +2951,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); + spa_load_failed(spa, "version %llu is not supported", + (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } @@ -2699,9 +2963,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ - if (label == NULL || nvlist_lookup_nvlist(label, - ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { + if (label == NULL) { + spa_load_failed(spa, "label config unavailable"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) != 0) { nvlist_free(label); + spa_load_failed(spa, "invalid label: '%s' missing", + ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } @@ -2740,6 +3012,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); + spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } @@ -2747,21 +3020,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, nvlist_free(unsup_feat); } - /* - * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. We first check to see if the pool - * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). - * If it is, defer the vdev_guid_sum check till later so we - * can handle missing vdevs. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && - rvd->vdev_guid_sum != ub->ub_guid_sum) - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); - if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_try_repair(spa, config); + spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; @@ -2770,39 +3031,245 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Initialize internal SPA structures. */ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_verify_min_txg = spa->spa_extreme_rewind ? - TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; - spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
- spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; - spa->spa_claim_max_txg = spa->spa_first_txg; - spa->spa_prev_software_version = ub->ub_software_version; + spa_ld_select_uberblock_done(spa, ub); + + return (0); +} + +static int +spa_ld_open_rootbp(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error) + if (error != 0) { + spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " + "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) + return (0); +} + +static int +spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t reloading) +{ + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv, *mos_config, *policy; + int error = 0, copy_error; + uint64_t healthy_tvds, healthy_tvds_mos; + uint64_t mos_config_txg; + + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) + != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * If we're assembling a pool from a split, the config provided is + * already trusted so there is nothing to do. + */ + if (type == SPA_IMPORT_ASSEMBLE) + return (0); + + healthy_tvds = spa_healthy_core_tvds(spa); + + if (load_nvlist(spa, spa->spa_config_object, &mos_config) + != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * If we are doing an open, pool owner wasn't verified yet, thus do + * the verification here. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + error = spa_verify_host(spa, mos_config); + if (error != 0) { + nvlist_free(mos_config); + return (error); + } + } + + nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Build a new vdev tree from the trusted config + */ + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + /* + * Vdev paths in the MOS may be obsolete. If the untrusted config was + * obtained by scanning /dev/dsk, then it will have the right vdev + * paths. We update the trusted MOS config with this information. + * We first try to copy the paths with vdev_copy_path_strict, which + * succeeds only when both configs have exactly the same vdev tree. + * If that fails, we fall back to a more flexible method that has a + * best effort policy. + */ + copy_error = vdev_copy_path_strict(rvd, mrvd); + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "provided vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + spa_load_note(spa, "MOS vdev tree:"); + vdev_dbgmsg_print_tree(mrvd, 2); + } + if (copy_error != 0) { + spa_load_note(spa, "vdev_copy_path_strict failed, falling " + "back to vdev_copy_path_relaxed"); + vdev_copy_path_relaxed(rvd, mrvd); + } + + vdev_close(rvd); + vdev_free(rvd); + spa->spa_root_vdev = mrvd; + rvd = mrvd; + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * We will use spa_config if we decide to reload the spa or if spa_load + * fails and we rewind. We must thus regenerate the config using the + * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to + * pass settings on how to load the pool and is not stored in the MOS. + * We copy it over to our new, trusted config. 
+ */ + mos_config_txg = fnvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_POOL_TXG); + nvlist_free(mos_config); + mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); + if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, + &policy) == 0) + fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); + spa_config_set(spa, mos_config); + spa->spa_config_source = SPA_CONFIG_SRC_MOS; + + /* + * Now that we got the config from the MOS, we should be more strict + * in checking blkptrs and can make assumptions about the consistency + * of the vdev tree. spa_trust_config must be set to true before opening + * vdevs in order for them to be writeable. + */ + spa->spa_trust_config = B_TRUE; + + /* + * Open and validate the new vdev tree + */ + error = spa_ld_open_vdevs(spa); + if (error != 0) + return (error); + + error = spa_ld_validate_vdevs(spa); + if (error != 0) + return (error); + + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "final vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + } + + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && + !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { + /* + * Sanity check to make sure that we are indeed loading the + * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds + * in the config provided and they happened to be the only ones + * to have the latest uberblock, we could involuntarily perform + * an extreme rewind. + */ + healthy_tvds_mos = spa_healthy_core_tvds(spa); + if (healthy_tvds_mos - healthy_tvds >= + SPA_SYNC_MIN_VDEVS) { + spa_load_note(spa, "config provided misses too many " + "top-level vdevs compared to MOS (%lld vs %lld). ", + (u_longlong_t)healthy_tvds, + (u_longlong_t)healthy_tvds_mos); + spa_load_note(spa, "vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + if (reloading) { + spa_load_failed(spa, "config was already " + "provided from MOS. Aborting."); + return (spa_vdev_err(rvd, + VDEV_AUX_CORRUPT_DATA, EIO)); + } + spa_load_note(spa, "spa must be reloaded using MOS " + "config"); + return (SET_ERROR(EAGAIN)); + } + } + + error = spa_check_for_missing_logs(spa); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { + spa_load_failed(spa, "uberblock guid sum doesn't match MOS " + "guid sum (%llu != %llu)", + (u_longlong_t)spa->spa_uberblock.ub_guid_sum, + (u_longlong_t)rvd->vdev_guid_sum); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } + + return (0); +} + +static int +spa_ld_open_indirect_vdev_metadata(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * Everything that we read before spa_remove_init() must be stored + * on concreted vdevs. Therefore we do this as early as possible. + */ + error = spa_remove_init(spa); + if (error != 0) { + spa_load_failed(spa, "spa_remove_init failed [error=%d]", + error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * Retrieve information needed to condense indirect vdev mappings. 
+ */ + error = spa_condense_init(spa); + if (error != 0) { + spa_load_failed(spa, "spa_condense_init failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + return (0); +} + +static int +spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, - &spa->spa_feat_for_read_obj) != 0) { + &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, - &spa->spa_feat_for_write_obj) != 0) { + &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, - &spa->spa_feat_desc_obj) != 0) { + &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } @@ -2813,10 +3280,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; - if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { + if (spa_writeable(spa) || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { - missing_feat_write = B_TRUE; + *missing_feat_writep = B_TRUE; } } @@ -2855,8 +3323,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * userland in order to know whether to display the * abovementioned note. */ - if (missing_feat_read || (missing_feat_write && + if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { + spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } @@ -2876,6 +3345,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { + spa_load_failed(spa, "error getting refcount " + "for feature %s [error=%d]", + spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } @@ -2884,49 +3356,36 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, - &spa->spa_feat_enabled_txg_obj) != 0) + &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } + return (0); +} + +static int +spa_ld_load_special_directories(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; - if (error != 0) + if (error != 0) { + spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } - if (!mosconfig) { - uint64_t hostid; - nvlist_t *policy = NULL, *nvconfig; - - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, - ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - char *hostname; - unsigned long myhostid = 0; - - VERIFY(nvlist_lookup_string(nvconfig, - ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); - - myhostid = spa_get_hostid(); - if (hostid && myhostid && hostid != myhostid) { - nvlist_free(nvconfig); - return (SET_ERROR(EBADF)); - } - } - if 
(nvlist_lookup_nvlist(spa->spa_config, - ZPOOL_REWIND_POLICY, &policy) == 0) - VERIFY(nvlist_add_nvlist(nvconfig, - ZPOOL_REWIND_POLICY, policy) == 0); - - spa_config_set(spa, nvconfig); - spa_unload(spa); - spa_deactivate(spa); - spa_activate(spa, orig_mode); + return (0); +} - return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); - } +static int +spa_ld_get_props(spa_t *spa) +{ + int error = 0; + uint64_t obj; + vdev_t *rvd = spa->spa_root_vdev; /* Grab the checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -2938,26 +3397,31 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { + spa_load_failed(spa, "unable to retrieve checksum salt from " + "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } - if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); - if (error != 0) + if (error != 0) { + spa_load_failed(spa, "error opening deferred-frees bpobj " + "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ - error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, - &spa->spa_creation_version); + &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); @@ -2965,12 +3429,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Load the persistent error log. If we have an older pool, this will * not be present. */ - error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, + B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, - &spa->spa_errlog_scrub); + &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); @@ -2978,7 +3443,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Load the history object. If we have an older pool, this * will not be present. */ - error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); @@ -2991,11 +3456,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* The sentinel is only available in the MOS config. 
*/ nvlist_t *mos_config; - if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, - &spa->spa_all_vdev_zaps); + &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, @@ -3019,6 +3486,50 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, } nvlist_free(mos_config); + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, + B_FALSE); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (error == 0) { + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); + spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, + &spa->spa_dedup_ditto); + + spa->spa_autoreplace = (autoreplace != 0); + } + + /* + * If we are importing a pool with missing top-level vdevs, + * we enforce that the pool doesn't panic or get suspended on + * error since the likelihood of missing data is extremely high. + */ + if (spa->spa_missing_tvds > 0 && + spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && + spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + spa_load_note(spa, "forcing failmode to 'continue' " + "as some top level vdevs are missing"); + spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; + } + + return (0); +} + +static int +spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache @@ -3028,14 +3539,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Load any hot spares for this pool. */ - error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, + B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) + &spa->spa_spares.sav_config) != 0) { + spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); @@ -3048,14 +3562,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Load any level 2 ARC devices for this pool. 
*/ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, - &spa->spa_l2cache.sav_object); + &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) + &spa->spa_l2cache.sav_config) != 0) { + spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); @@ -3064,26 +3580,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_l2cache.sav_sync = B_TRUE; } - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - - error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); - if (error && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - if (error == 0) { - uint64_t autoreplace = 0; - - spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); - spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); - spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); - spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); - spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); - spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); - spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, - &spa->spa_dedup_ditto); + return (0); +} - spa->spa_autoreplace = (autoreplace != 0); - } +static int +spa_ld_load_vdev_metadata(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to @@ -3104,120 +3608,597 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * unopenable vdevs so that the normal autoreplace handler can take * over. */ - if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { + if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ - if (state != SPA_LOAD_IMPORT) { + if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* - * Load the vdev state for all toplevel vdevs. + * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ - vdev_load(rvd); + error = vdev_load(rvd); + if (error != 0) { + spa_load_failed(spa, "vdev_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* - * Propagate the leaf DTLs we just loaded all the way up the tree. + * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
*/ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); + return (0); +} + +static int +spa_ld_load_dedup_tables(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = ddt_load(spa); + if (error != 0) { + spa_load_failed(spa, "ddt_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int +spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) +{ + vdev_t *rvd = spa->spa_root_vdev; + + if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { + boolean_t missing = spa_check_logs(spa); + if (missing) { + if (spa->spa_missing_tvds != 0) { + spa_load_note(spa, "spa_check_logs failed " + "so dropping the logs"); + } else { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + spa_load_failed(spa, "spa_check_logs failed"); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, + ENXIO)); + } + } + } + + return (0); +} + +static int +spa_ld_verify_pool_data(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + /* - * Load the DDTs (dedup tables). + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. */ - error = ddt_load(spa); + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + error = spa_load_verify(spa); + if (error != 0) { + spa_load_failed(spa, "spa_load_verify failed " + "[error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + } + + return (0); +} + +static void +spa_ld_claim_log_blocks(spa_t *spa) +{ + dmu_tx_t *tx; + dsl_pool_t *dp = spa_get_dsl(spa); + + /* + * Claim log blocks that haven't been committed yet. + * This must all happen in a single txg. + * Note: spa_claim_max_txg is updated by spa_claim_notify(), + * invoked from zil_claim_log_block()'s i/o done callback. + * Price of rollback is that we abandon the log. + */ + spa->spa_claiming = B_TRUE; + + tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); + (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + zil_claim, tx, DS_FIND_CHILDREN); + dmu_tx_commit(tx); + + spa->spa_claiming = B_FALSE; + + spa_set_log_state(spa, SPA_LOG_GOOD); +} + +static void +spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, + boolean_t update_config_cache) +{ + vdev_t *rvd = spa->spa_root_vdev; + int need_update = B_FALSE; + + /* + * If the config cache is stale, or we have uninitialized + * metaslabs (see spa_vdev_add()), then update the config. + * + * If this is a verbatim import, trust the current + * in-core spa_config and update the disk labels. + */ + if (update_config_cache || config_cache_txg != spa->spa_config_txg || + spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) + need_update = B_TRUE; + + for (int c = 0; c < rvd->vdev_children; c++) + if (rvd->vdev_child[c]->vdev_ms_array == 0) + need_update = B_TRUE; + + /* + * Update the config cache asychronously in case we're the + * root pool, in which case the config cache isn't writable yet. + */ + if (need_update) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + +static void +spa_ld_prepare_for_reload(spa_t *spa) +{ + int mode = spa->spa_mode; + int async_suspended = spa->spa_async_suspended; + + spa_unload(spa); + spa_deactivate(spa); + spa_activate(spa, mode); + + /* + * We save the value of spa_async_suspended as it gets reset to 0 by + * spa_unload(). 
We want to restore it back to the original value before + * returning as we might be calling spa_async_resume() later. + */ + spa->spa_async_suspended = async_suspended; +} + +static int +spa_ld_read_checkpoint_txg(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT0(spa->spa_checkpoint_txg); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT) + return (0); + if (error != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + return (error); - spa_update_dspace(spa); + ASSERT3U(checkpoint.ub_txg, !=, 0); + ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); + ASSERT3U(checkpoint.ub_timestamp, !=, 0); + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + return (0); +} + +static int +spa_ld_mos_init(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* - * Validate the config, using the MOS config to fill in any - * information which might be missing. If we fail to validate - * the config then declare the pool unfit for use. If we're - * assembling a pool from a split, the log is not transferred - * over. + * Never trust the config that is provided unless we are assembling + * a pool following a split. + * This means don't trust blkptrs and the vdev tree in general. This + * also effectively puts the spa in read-only mode since + * spa_writeable() checks for spa_trust_config to be true. + * We will later load a trusted config from the MOS. + */ + if (type != SPA_IMPORT_ASSEMBLE) + spa->spa_trust_config = B_FALSE; + + /* + * Parse the config provided to create a vdev tree. + */ + error = spa_ld_parse_config(spa, type); + if (error != 0) + return (error); + + /* + * Now that we have the vdev tree, try to open each vdev. This involves + * opening the underlying physical device, retrieving its geometry and + * probing the vdev with a dummy I/O. The state of each vdev will be set + * based on the success of those operations. After this we'll be ready + * to read from the vdevs. + */ + error = spa_ld_open_vdevs(spa); + if (error != 0) + return (error); + + /* + * Read the label of each vdev and make sure that the GUIDs stored + * there match the GUIDs in the config provided. + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { - nvlist_t *nvconfig; + error = spa_ld_validate_vdevs(spa); + if (error != 0) + return (error); + } - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* + * Read all vdev labels to find the best uberblock (i.e. latest, + * unless spa_load_max_txg is set) and store it in spa_uberblock. We + * get the list of features required to read blkptrs in the MOS from + * the vdev label with the best uberblock and verify that our version + * of zfs supports them all. + */ + error = spa_ld_select_uberblock(spa, type); + if (error != 0) + return (error); - if (!spa_config_valid(spa, nvconfig)) { - nvlist_free(nvconfig); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, - ENXIO)); + /* + * Pass that uberblock to the dsl_pool layer which will open the root + * blkptr. 
This blkptr points to the latest version of the MOS and will + * allow us to read its contents. + */ + error = spa_ld_open_rootbp(spa); + if (error != 0) + return (error); + + return (0); +} + +static int +spa_ld_checkpoint_rewind(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error != 0) { + spa_load_failed(spa, "unable to retrieve checkpointed " + "uberblock from the MOS config [error=%d]", error); + + if (error == ENOENT) + error = ZFS_ERR_NO_CHECKPOINT; + + return (error); + } + + ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); + ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); + + /* + * We need to update the txg and timestamp of the checkpointed + * uberblock to be higher than the latest one. This ensures that + * the checkpointed uberblock is selected if we were to close and + * reopen the pool right after we've written it in the vdev labels. + * (also see block comment in vdev_uberblock_compare) + */ + checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; + checkpoint.ub_timestamp = gethrestime_sec(); + + /* + * Set current uberblock to be the checkpointed uberblock. + */ + spa->spa_uberblock = checkpoint; + + /* + * If we are doing a normal rewind, then the pool is open for + * writing and we sync the "updated" checkpointed uberblock to + * disk. Once this is done, we've basically rewound the whole + * pool and there is no way back. + * + * There are cases when we don't want to attempt and sync the + * checkpointed uberblock to disk because we are opening a + * pool as read-only. Specifically, verifying the checkpointed + * state with zdb, and importing the checkpointed state to get + * a "preview" of its content. + */ + if (spa_writeable(spa)) { + vdev_t *rvd = spa->spa_root_vdev; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + + for (int c = 0; c < children; c++) { + vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + + if (vd->vdev_ms_array == 0 || vd->vdev_islog || + !vdev_is_concrete(vd)) + continue; + + svd[svdcount++] = vd; + if (svdcount == SPA_SYNC_MIN_VDEVS) + break; } - nvlist_free(nvconfig); + error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "failed to write checkpointed " + "uberblock to the vdev labels [error=%d]", error); + return (error); + } + } + + return (0); +} + +static int +spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t *update_config_cache) +{ + int error; + + /* + * Parse the config for pool, open and validate vdevs, + * select an uberblock, and use that uberblock to open + * the MOS. + */ + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); + + /* + * Retrieve the trusted config stored in the MOS and use it to create + * a new, exact version of the vdev tree, then reopen all vdevs. 
+ */ + error = spa_ld_trusted_config(spa, type, B_FALSE); + if (error == EAGAIN) { + if (update_config_cache != NULL) + *update_config_cache = B_TRUE; /* - * Now that we've validated the config, check the state of the - * root vdev. If it can't be opened, it indicates one or - * more toplevel vdevs are faulted. + * Redo the loading process with the trusted config if it is + * too different from the untrusted config. */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (SET_ERROR(ENXIO)); + spa_ld_prepare_for_reload(spa); + spa_load_note(spa, "RELOADING"); + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); - if (spa_writeable(spa) && spa_check_logs(spa)) { - *ereport = FM_EREPORT_ZFS_LOG_REPLAY; - return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); - } + error = spa_ld_trusted_config(spa, type, B_TRUE); + if (error != 0) + return (error); + + } else if (error != 0) { + return (error); } + return (0); +} + +/* + * Load an existing storage pool, using the config provided. This config + * describes which vdevs are part of the pool and is later validated against + * partial configs present in each vdev's label and an entire copy of the + * config stored in the MOS. + */ +static int +spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) +{ + int error = 0; + boolean_t missing_feat_write = B_FALSE; + boolean_t checkpoint_rewind = + (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + boolean_t update_config_cache = B_FALSE; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); + + spa_load_note(spa, "LOADING"); + + error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); + if (error != 0) + return (error); + + /* + * If we are rewinding to the checkpoint then we need to repeat + * everything we've done so far in this function but this time + * selecting the checkpointed uberblock and using that to open + * the MOS. + */ + if (checkpoint_rewind) { + /* + * If we are rewinding to the checkpoint update config cache + * anyway. + */ + update_config_cache = B_TRUE; + + /* + * Extract the checkpointed uberblock from the current MOS + * and use this as the pool's uberblock from now on. If the + * pool is imported as writeable we also write the checkpoint + * uberblock to the labels, making the rewind permanent. + */ + error = spa_ld_checkpoint_rewind(spa); + if (error != 0) + return (error); + + /* + * Redo the loading process process again with the + * checkpointed uberblock. + */ + spa_ld_prepare_for_reload(spa); + spa_load_note(spa, "LOADING checkpointed uberblock"); + error = spa_ld_mos_with_trusted_config(spa, type, NULL); + if (error != 0) + return (error); + } + + /* + * Retrieve the checkpoint txg if the pool has a checkpoint. + */ + error = spa_ld_read_checkpoint_txg(spa); + if (error != 0) + return (error); + + /* + * Retrieve the mapping of indirect vdevs. Those vdevs were removed + * from the pool and their contents were re-mapped to other vdevs. Note + * that everything that we read before this step must have been + * rewritten on concrete vdevs after the last device removal was + * initiated. Otherwise we could be reading from indirect vdevs before + * we have loaded their mappings. + */ + error = spa_ld_open_indirect_vdev_metadata(spa); + if (error != 0) + return (error); + + /* + * Retrieve the full list of active features from the MOS and check if + * they are all supported. 
+ */ + error = spa_ld_check_features(spa, &missing_feat_write); + if (error != 0) + return (error); + + /* + * Load several special directories from the MOS needed by the dsl_pool + * layer. + */ + error = spa_ld_load_special_directories(spa); + if (error != 0) + return (error); + + /* + * Retrieve pool properties from the MOS. + */ + error = spa_ld_get_props(spa); + if (error != 0) + return (error); + + /* + * Retrieve the list of auxiliary devices - cache devices and spares - + * and open them. + */ + error = spa_ld_open_aux_vdevs(spa, type); + if (error != 0) + return (error); + + /* + * Load the metadata for all vdevs. Also check if unopenable devices + * should be autoreplaced. + */ + error = spa_ld_load_vdev_metadata(spa); + if (error != 0) + return (error); + + error = spa_ld_load_dedup_tables(spa); + if (error != 0) + return (error); + + /* + * Verify the logs now to make sure we don't have any unexpected errors + * when we claim log blocks later. + */ + error = spa_ld_verify_logs(spa, type, ereport); + if (error != 0) + return (error); + if (missing_feat_write) { - ASSERT(state == SPA_LOAD_TRYIMPORT); + ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ - return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); + return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); } /* - * We've successfully opened the pool, verify that we're ready - * to start pushing transactions. + * Traverse the last txgs to make sure the pool was left off in a safe + * state. When performing an extreme rewind, we verify the whole pool, + * which can take a very long time. */ - if (state != SPA_LOAD_TRYIMPORT) { - if ((error = spa_load_verify(spa))) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } + error = spa_ld_verify_pool_data(spa); + if (error != 0) + return (error); + + /* + * Calculate the deflated space for the pool. This must be done before + * we write anything to the pool because we'd need to update the space + * accounting using the deflated sizes. + */ + spa_update_dspace(spa); - if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || + /* + * We have now retrieved all the information we needed to open the + * pool. If we are importing the pool in read-write mode, a few + * additional steps must be performed to finish the import. + */ + if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { - dmu_tx_t *tx; - int need_update = B_FALSE; - dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t config_cache_txg = spa->spa_config_txg; - ASSERT(state != SPA_LOAD_TRYIMPORT); + ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* - * Claim log blocks that haven't been committed yet. - * This must all happen in a single txg. - * Note: spa_claim_max_txg is updated by spa_claim_notify(), - * invoked from zil_claim_log_block()'s i/o done callback. - * Price of rollback is that we abandon the log. + * In case of a checkpoint rewind, log the original txg + * of the checkpointed uberblock. 
*/ - spa->spa_claiming = B_TRUE; - - tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); - (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - zil_claim, tx, DS_FIND_CHILDREN); - dmu_tx_commit(tx); + if (checkpoint_rewind) { + spa_history_log_internal(spa, "checkpoint rewind", + NULL, "rewound state to txg=%llu", + (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); + } - spa->spa_claiming = B_FALSE; + /* + * Traverse the ZIL and claim all blocks. + */ + spa_ld_claim_log_blocks(spa); - spa_set_log_state(spa, SPA_LOG_GOOD); + /* + * Kick-off the syncing thread. + */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); @@ -3226,40 +4207,24 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg - * will have been set for us by either zil_check_log_chain() - * (invoked from spa_check_logs()) or zil_claim() above. + * will have been set for us by ZIL traversal operations + * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* - * If the config cache is stale, or we have uninitialized - * metaslabs (see spa_vdev_add()), then update the config. - * - * If this is a verbatim import, trust the current - * in-core spa_config and update the disk labels. - */ - if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || - state == SPA_LOAD_RECOVER || - (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) - need_update = B_TRUE; - - for (int c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_ms_array == 0) - need_update = B_TRUE; - - /* - * Update the config cache asychronously in case we're the - * root pool, in which case the config cache isn't writable yet. + * Check if we need to request an update of the config. On the + * next sync, we would update the config stored in vdev labels + * and the cachefile (by default /etc/zfs/zpool.cache). */ - if (need_update) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + spa_ld_check_for_config_update(spa, config_cache_txg, + update_config_cache); /* * Check all DTLs to see if anything needs resilvering. */ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(rvd, NULL, NULL)) + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* @@ -3268,8 +4233,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ spa_history_log_version(spa, "open", NULL); + spa_restart_removal(spa); + spa_spawn_aux_threads(spa); + /* * Delete any inconsistent datasets. + * + * Note: + * Since we may be issuing deletes for clones here, + * we make sure to do so after we've spawned all the + * auxiliary threads above (from which the livelist + * deletion zthr is part of). */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); @@ -3278,13 +4252,19 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Clean up any stale temporary dataset userrefs. 
*/ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); } + spa_load_note(spa, "LOADED"); + return (0); } static int -spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) +spa_load_retry(spa_t *spa, spa_load_state_t state) { int mode = spa->spa_mode; @@ -3296,7 +4276,10 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) spa_activate(spa, mode); spa_async_suspend(spa); - return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); + spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", + (u_longlong_t)spa->spa_load_max_txg); + + return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* @@ -3307,8 +4290,8 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) * spa_load(). */ static int -spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, - uint64_t max_request, int rewind_flags) +spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, + int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; @@ -3325,10 +4308,18 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, spa->spa_extreme_rewind = B_TRUE; } - load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, - mosconfig); + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); + if (load_error == ZFS_ERR_NO_CHECKPOINT) { + /* + * When attempting checkpoint-rewind on a pool with no + * checkpoint, we should not attempt to load uberblocks + * from previous txgs when spa_load fails. + */ + ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + return (load_error); + } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); @@ -3367,7 +4358,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; - rewind_error = spa_load_retry(spa, state, mosconfig); + rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; @@ -3436,22 +4427,24 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - zpool_rewind_policy_t policy; + zpool_load_policy_t policy; firstopen = B_TRUE; - zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, + zpool_get_load_policy(nvpolicy ? 
nvpolicy : spa->spa_config, &policy); - if (policy.zrp_request & ZPOOL_DO_REWIND) + if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; - error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, - policy.zrp_request); + zfs_dbgmsg("spa_open_common: opening %s", pool); + error = spa_load_best(spa, state, policy.zlp_txg, + policy.zlp_rewind); if (error == EBADF) { /* @@ -3463,7 +4456,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, */ spa_unload(spa); spa_deactivate(spa); - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); @@ -3999,7 +4992,7 @@ spa_create_check_encryption_params(dsl_crypto_params_t *dcp, !has_encryption) return (SET_ERROR(ENOTSUP)); - return (dmu_objset_create_crypt_check(NULL, dcp)); + return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* @@ -4018,7 +5011,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj, root_dsobj = 0; + uint64_t version, obj; boolean_t has_features; boolean_t has_encryption; spa_feature_t feat; @@ -4026,7 +5019,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, char *poolname; nvlist_t *nvl; - if (nvlist_lookup_string(props, "tname", &poolname) != 0) + if (props == NULL || + nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; /* @@ -4098,6 +5092,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; /* * Create "The Godfather" zio to hold all async IOs @@ -4127,9 +5124,15 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_metaslab_set_size(rvd->vdev_child[c]); - vdev_expand(rvd->vdev_child[c], txg); + /* + * instantiate the metaslab groups (this will dirty the vdevs) + * we can no longer error exit past this point + */ + for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + vdev_metaslab_set_size(vd); + vdev_expand(vd, txg); } } @@ -4262,35 +5265,21 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, dmu_tx_commit(tx); - /* - * If the root dataset is encrypted we will need to create key mappings - * for the zio layer before we start to write any data to disk and hold - * them until after the first txg has been synced. Waiting for the first - * transaction to complete also ensures that our bean counters are - * appropriately updated. 
- */ - if (dp->dp_root_dir->dd_crypto_obj != 0) { - root_dsobj = dsl_dir_phys(dp->dp_root_dir)->dd_head_dataset_obj; - VERIFY0(spa_keystore_create_mapping_impl(spa, root_dsobj, - dp->dp_root_dir, FTAG)); - } - spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); - if (dp->dp_root_dir->dd_crypto_obj != 0) - VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG)); + spa_spawn_aux_threads(spa); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); - spa->spa_minref = refcount_count(&spa->spa_refcount); + spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); @@ -4307,7 +5296,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; - zpool_rewind_policy_t policy; + zpool_load_policy_t policy; uint64_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; @@ -4344,9 +5333,9 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (props != NULL) spa_configfile_set(spa, props, B_FALSE); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); - + zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } @@ -4358,20 +5347,20 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) */ spa_async_suspend(spa); - zpool_get_rewind_policy(config, &policy); - if (policy.zrp_request & ZPOOL_DO_REWIND) + zpool_get_load_policy(config, &policy); + if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; - /* - * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig - * because the user-supplied config is actually the one to trust when - * doing an import. - */ - if (state != SPA_LOAD_RECOVER) - spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; - error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, - policy.zrp_request); + if (state != SPA_LOAD_RECOVER) { + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + zfs_dbgmsg("spa_import: importing %s", pool); + } else { + zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " + "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); + } + error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it @@ -4485,10 +5474,11 @@ nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; - char *poolname; + char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; + zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); @@ -4504,11 +5494,27 @@ spa_tryimport(nvlist_t *tryconfig) spa_activate(spa, FREAD); /* - * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig because the user-supplied config - * is actually the one to trust when doing an import. + * Rewind pool if a max txg was provided. 
*/ - error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); + zpool_get_load_policy(spa->spa_config, &policy); + if (policy.zlp_txg != UINT64_MAX) { + spa->spa_load_max_txg = policy.zlp_txg; + spa->spa_extreme_rewind = B_TRUE; + zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", + poolname, (longlong_t)policy.zlp_txg); + } else { + zfs_dbgmsg("spa_tryimport: importing %s", poolname); + } + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) + == 0) { + zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); + spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; + } else { + spa->spa_config_source = SPA_CONFIG_SRC_SCAN; + } + + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. @@ -4657,6 +5663,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, return (SET_ERROR(EXDEV)); } + /* + * We're about to export or destroy this pool. Make sure + * we stop all initializtion activity here before we + * set the spa_final_txg. This will ensure that all + * dirty data resulting from the initialization is + * committed to disk before we unload the pool. + */ + if (spa->spa_root_vdev != NULL) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the @@ -4688,7 +5706,7 @@ export_spa: if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); @@ -4780,8 +5798,40 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, error)); /* - * Transfer each new top-level vdev from vd to rvd. + * If we are in the middle of a device removal, we can only add + * devices which match the existing devices in the pool. + * If we are in the middle of a removal, or have some indirect + * vdevs, we can not add raidz toplevels. */ + if (spa->spa_vdev_removal != NULL || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (spa->spa_vdev_removal != NULL && + tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* Fail if top level vdev is raidz */ + if (tvd->vdev_ops == &vdev_raidz_ops) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* + * Need the top level mirror to be + * a mirror of leaf vdevs only + */ + if (tvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < tvd->vdev_children; cid++) { + vdev_t *cvd = tvd->vdev_child[cid]; + if (!cvd->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, vd, + txg, EINVAL)); + } + } + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { /* @@ -4867,6 +5917,16 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? 
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + if (spa->spa_vdev_removal != NULL) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -5027,9 +6087,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. + * respective datasets. We do not do this if resilvers have been + * deferred. */ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) + vdev_set_deferred_resilver(spa, newvd); + else + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -5076,6 +6141,27 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vd = spa_lookup_by_guid(spa, guid, B_FALSE); + /* + * Besides being called directly from the userland through the + * ioctl interface, spa_vdev_detach() can be potentially called + * at the end of spa_vdev_resilver_done(). + * + * In the regular case, when we have a checkpoint this shouldn't + * happen as we never empty the DTLs of a vdev during the scrub + * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() + * should never get here when we have a checkpoint. + * + * That said, even in a case when we checkpoint the pool exactly + * as spa_vdev_resilver_done() calls this function everything + * should be fine as the resilver will return right away. + */ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -5293,6 +6379,117 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (error); } +static int +spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, + list_t *vd_list) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + /* Look up vdev and ensure it's a leaf. */ + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_detached) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (SET_ERROR(ENODEV)); + } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (SET_ERROR(EINVAL)); + } else if (!vdev_writeable(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (SET_ERROR(EROFS)); + } + mutex_enter(&vd->vdev_initialize_lock); + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + /* + * When we activate an initialize action we check to see + * if the vdev_initialize_thread is NULL. We do this instead + * of using the vdev_initialize_state since there might be + * a previous initialization process which has completed but + * the thread is not exited. 
+ */ + if (cmd_type == POOL_INITIALIZE_DO && + (vd->vdev_initialize_thread != NULL || + vd->vdev_top->vdev_removing)) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(EBUSY)); + } else if (cmd_type == POOL_INITIALIZE_CANCEL && + (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && + vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_SUSPEND && + vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(ESRCH)); + } + + switch (cmd_type) { + case POOL_INITIALIZE_DO: + vdev_initialize(vd); + break; + case POOL_INITIALIZE_CANCEL: + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); + break; + case POOL_INITIALIZE_SUSPEND: + vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); + break; + default: + panic("invalid cmd_type %llu", (unsigned long long)cmd_type); + } + mutex_exit(&vd->vdev_initialize_lock); + + return (0); +} + +int +spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, + nvlist_t *vdev_errlist) +{ + int total_errors = 0; + list_t vd_list; + + list_create(&vd_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_initialize_node)); + + /* + * We hold the namespace lock through the whole function + * to prevent any changes to the pool while we're starting or + * stopping initialization. The config and state locks are held so that + * we can properly assess the vdev state before we commit to + * the initializing operation. + */ + mutex_enter(&spa_namespace_lock); + + for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { + uint64_t vdev_guid = fnvpair_value_uint64(pair); + + int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, + &vd_list); + if (error != 0) { + char guid_as_str[MAXNAMELEN]; + + (void) snprintf(guid_as_str, sizeof (guid_as_str), + "%llu", (unsigned long long)vdev_guid); + fnvlist_add_int64(vdev_errlist, guid_as_str, error); + total_errors++; + } + } + + /* Wait for all initialize threads to stop. */ + vdev_initialize_stop_wait(spa, &vd_list); + + /* Sync out the initializing state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + list_destroy(&vd_list); + + return (total_errors); +} + /* * Split a set of devices from their mirrors, and create a new pool from them. */ @@ -5314,10 +6511,17 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, txg = spa_vdev_enter(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? 
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - error = spa_offline_log(spa); + error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) @@ -5345,7 +6549,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ - if (vd->vdev_islog || vd->vdev_ishole) { + if (vd->vdev_islog || !vdev_is_concrete(vd)) { if (lastlog == 0) lastlog = c; continue; @@ -5398,7 +6602,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || - vml[c]->vdev_ishole || + !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || @@ -5409,7 +6613,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, break; } - if (vdev_dtl_required(vml[c])) { + if (vdev_dtl_required(vml[c]) || + vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } @@ -5493,8 +6698,30 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); + /* + * Temporarily stop the initializing activity. We set the state to + * ACTIVE so that we know to resume the initializing once the split + * has completed. + */ + list_t vd_list; + list_create(&vd_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_initialize_node)); + + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + mutex_enter(&vml[c]->vdev_initialize_lock); + vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, + &vd_list); + mutex_exit(&vml[c]->vdev_initialize_lock); + } + } + vdev_initialize_stop_wait(spa, &vd_list); + list_destroy(&vd_list); + + newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; + /* create the new pool from the disks of the original pool */ - error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; @@ -5578,6 +6805,10 @@ out: if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } + + /* restart initializing disks as necessary */ + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); @@ -5588,257 +6819,6 @@ out: return (error); } -static nvlist_t * -spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) -{ - for (int i = 0; i < count; i++) { - uint64_t guid; - - VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, - &guid) == 0); - - if (guid == target_guid) - return (nvpp[i]); - } - - return (NULL); -} - -static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) -{ - nvlist_t **newdev = NULL; - - if (count > 1) - newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); - - for (int i = 0, j = 0; i < count; i++) { - if (dev[i] == dev_to_remove) - continue; - VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); - } - - VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); - - for (int i = 0; i < count - 1; i++) - nvlist_free(newdev[i]); - - if (count > 1) - kmem_free(newdev, (count - 1) * sizeof (void 
*)); -} - -/* - * Evacuate the device. - */ -static int -spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) -{ - uint64_t txg; - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - ASSERT(vd == vd->vdev_top); - - /* - * Evacuate the device. We don't hold the config lock as writer - * since we need to do I/O but we do keep the - * spa_namespace_lock held. Once this completes the device - * should no longer have any blocks allocated on it. - */ - if (vd->vdev_islog) { - if (vd->vdev_stat.vs_alloc != 0) - error = spa_offline_log(spa); - } else { - error = SET_ERROR(ENOTSUP); - } - - if (error) - return (error); - - /* - * The evacuation succeeded. Remove any remaining MOS metadata - * associated with this vdev, and wait for these changes to sync. - */ - ASSERT0(vd->vdev_stat.vs_alloc); - txg = spa_vdev_config_enter(spa); - vd->vdev_removing = B_TRUE; - vdev_dirty_leaves(vd, VDD_DTL, txg); - vdev_config_dirty(vd); - spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - - return (0); -} - -/* - * Complete the removal by cleaning up the namespace. - */ -static void -spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t id = vd->vdev_id; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(vd == vd->vdev_top); - - /* - * Only remove any devices which are empty. - */ - if (vd->vdev_stat.vs_alloc != 0) - return; - - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - if (list_link_active(&vd->vdev_state_dirty_node)) - vdev_state_clean(vd); - if (list_link_active(&vd->vdev_config_dirty_node)) - vdev_config_clean(vd); - - vdev_free(vd); - - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } - vdev_config_dirty(rvd); - - /* - * Reassess the health of our root vdev. - */ - vdev_reopen(rvd); -} - -/* - * Remove a device from the pool - - * - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. - * - * Currently, this supports removing only hot spares, slogs, and level 2 ARC - * devices. - */ -int -spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) -{ - vdev_t *vd; - sysevent_t *ev = NULL; - metaslab_group_t *mg; - nvlist_t **spares, **l2cache, *nv; - uint64_t txg = 0; - uint_t nspares, nl2cache; - int error = 0; - boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - - ASSERT(spa_writeable(spa)); - - if (!locked) - txg = spa_vdev_enter(spa); - - vd = spa_lookup_by_guid(spa, guid, B_FALSE); - - if (spa->spa_spares.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && - (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { - /* - * Only remove the hot spare if it's not currently in use - * in this pool. 
- */ - if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - } else { - error = SET_ERROR(EBUSY); - } - } else if (spa->spa_l2cache.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && - (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { - /* - * Cache devices can always be removed. - */ - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); - spa_load_l2cache(spa); - spa->spa_l2cache.sav_sync = B_TRUE; - } else if (vd != NULL && vd->vdev_islog) { - ASSERT(!locked); - ASSERT(vd == vd->vdev_top); - - mg = vd->vdev_mg; - - /* - * Stop allocating from this vdev. - */ - metaslab_group_passivate(mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - - /* - * Attempt to evacuate the vdev. - */ - error = spa_vdev_remove_evacuate(spa, vd); - - txg = spa_vdev_config_enter(spa); - - /* - * If we couldn't evacuate the vdev, unwind. - */ - if (error) { - metaslab_group_activate(mg); - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - /* - * Clean up the vdev namespace. - */ - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); - spa_vdev_remove_from_namespace(spa, vd); - - } else if (vd != NULL) { - /* - * Normal vdevs cannot be removed (yet). - */ - error = SET_ERROR(ENOTSUP); - } else { - /* - * There is no vdev of any kind with the specified guid. - */ - error = SET_ERROR(ENOENT); - } - - if (!locked) - error = spa_vdev_exit(spa, NULL, txg, error); - - if (ev) - spa_event_post(ev); - - return (error); -} - /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. @@ -5876,6 +6856,7 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) /* * Check for a completed resilver with the 'unspare' flag set. + * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; @@ -5897,6 +6878,8 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) !vdev_dtl_required(oldvd)) return (oldvd); + vdev_propagate_state(vd); + /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to @@ -6118,6 +7101,7 @@ static void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; + dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); @@ -6135,8 +7119,14 @@ spa_async_thread(void *arg) mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); + old_space += metaslab_class_get_space(spa_special_class(spa)); + old_space += metaslab_class_get_space(spa_dedup_class(spa)); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + new_space = metaslab_class_get_space(spa_normal_class(spa)); + new_space += metaslab_class_get_space(spa_special_class(spa)); + new_space += metaslab_class_get_space(spa_dedup_class(spa)); mutex_exit(&spa_namespace_lock); /* @@ -6187,8 +7177,18 @@ spa_async_thread(void *arg) /* * Kick off a resilver. 
*/ - if (tasks & SPA_ASYNC_RESILVER) - dsl_resilver_restart(spa->spa_dsl_pool, 0); + if (tasks & SPA_ASYNC_RESILVER && + (!dsl_scan_resilvering(dp) || + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) + dsl_resilver_restart(dp, 0); + + if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } /* * Let the world know that we're done. @@ -6208,6 +7208,16 @@ spa_async_suspend(spa_t *spa) while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); + + spa_vdev_remove_suspend(spa); + + zthr_t *condense_thread = spa->spa_condense_zthr; + if (condense_thread != NULL) + zthr_cancel(condense_thread); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL) + zthr_cancel(discard_thread); } void @@ -6217,6 +7227,15 @@ spa_async_resume(spa_t *spa) ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); + spa_restart_removal(spa); + + zthr_t *condense_thread = spa->spa_condense_zthr; + if (condense_thread != NULL) + zthr_resume(condense_thread); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL) + zthr_resume(discard_thread); } static boolean_t @@ -6304,6 +7323,9 @@ spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { + if (spa_sync_pass(spa) != 1) + return; + zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); @@ -6699,10 +7721,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx) static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { - dsl_pool_t *dp = spa->spa_dsl_pool; - - ASSERT(spa->spa_sync_pass == 1); + if (spa_sync_pass(spa) != 1) + return; + dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && @@ -6763,103 +7785,67 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) rrw_exit(&dp->dp_config_rwlock, FTAG); } -/* - * Sync the specified transaction group. New blocks may be dirtied as - * part of the process, so we iterate until it converges. - */ -void -spa_sync(spa_t *spa, uint64_t txg) +static void +vdev_indirect_state_sync_verify(vdev_t *vd) { - dsl_pool_t *dp = spa->spa_dsl_pool; - objset_t *mos = spa->spa_meta_objset; - bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd; - dmu_tx_t *tx; - int error; - uint32_t max_queue_depth = zfs_vdev_async_write_max_active * - zfs_vdev_queue_depth_pct / 100; - - VERIFY(spa_writeable(spa)); - - /* - * Lock out configuration changes. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + ASSERTV(vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping); + ASSERTV(vdev_indirect_births_t *vib = vd->vdev_indirect_births); - spa->spa_syncing_txg = txg; - spa->spa_sync_pass = 0; - - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); - - /* - * If there are any pending vdev state changes, convert them - * into config changes that go out with this transaction group. 
- */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while (list_head(&spa->spa_state_dirty_list) != NULL) { - /* - * We need the write lock here because, for aux vdevs, - * calling vdev_config_dirty() modifies sav_config. - * This is ugly and will become unnecessary when we - * eliminate the aux vdev wart by integrating all vdevs - * into the root vdev tree. - */ - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); - } - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(vim != NULL); + ASSERT(vib != NULL); } - spa_config_exit(spa, SCL_STATE, FTAG); - - tx = dmu_tx_create_assigned(dp, txg); - spa->spa_sync_starttime = gethrtime(); - taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); - spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, - spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + - NSEC_TO_TICK(spa->spa_deadman_synctime)); + uint64_t obsolete_sm_object = 0; + ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); + ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); + ASSERT3U(obsolete_sm_object, ==, + space_map_object(vd->vdev_obsolete_sm)); + ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, + space_map_allocated(vd->vdev_obsolete_sm)); + } + ASSERT(vd->vdev_obsolete_segments != NULL); /* - * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, - * set spa_deflate if we have no raid-z vdevs. + * Since frees / remaps to an indirect vdev can only + * happen in syncing context, the obsolete segments + * tree must be empty when we start syncing. */ - if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && - spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { - int i; + ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); +} - for (i = 0; i < rvd->vdev_children; i++) { - vd = rvd->vdev_child[i]; - if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) - break; - } - if (i == rvd->vdev_children) { - spa->spa_deflate = TRUE; - VERIFY(0 == zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx)); - } - } +/* + * Set the top-level vdev's max queue depth. Evaluate each top-level's + * async write queue depth in case it changed. The max queue depth will + * not change in the middle of syncing out this txg. + */ +static void +spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) +{ + ASSERT(spa_writeable(spa)); - /* - * Set the top-level vdev's max queue depth. Evaluate each - * top-level's async write queue depth in case it changed. - * The max queue depth will not change in the middle of syncing - * out this txg. 
- */ - uint64_t queue_depth_total = 0; + vdev_t *rvd = spa->spa_root_vdev; + uint32_t max_queue_depth = zfs_vdev_async_write_max_active * + zfs_vdev_queue_depth_pct / 100; + metaslab_class_t *normal = spa_normal_class(spa); + metaslab_class_t *special = spa_special_class(spa); + metaslab_class_t *dedup = spa_dedup_class(spa); + + uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + if (mg == NULL || !metaslab_group_initialized(mg)) + continue; - if (mg == NULL || mg->mg_class != spa_normal_class(spa) || - !metaslab_group_initialized(mg)) + metaslab_class_t *mc = mg->mg_class; + if (mc != normal && mc != special && mc != dedup) continue; /* @@ -6867,21 +7853,56 @@ spa_sync(spa_t *spa, uint64_t txg) * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ - ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT0(zfs_refcount_count( + &(mg->mg_alloc_queue_depth[i]))); mg->mg_max_alloc_queue_depth = max_queue_depth; - queue_depth_total += mg->mg_max_alloc_queue_depth; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mg->mg_cur_max_alloc_queue_depth[i] = + zfs_vdev_def_queue_depth; + } + slots_per_allocator += zfs_vdev_def_queue_depth; } - metaslab_class_t *mc = spa_normal_class(spa); - ASSERT0(refcount_count(&mc->mc_alloc_slots)); - mc->mc_alloc_max_slots = queue_depth_total; - mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - ASSERT3U(mc->mc_alloc_max_slots, <=, - max_queue_depth * rvd->vdev_children); + for (int i = 0; i < spa->spa_alloc_count; i++) { + ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); + ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); + ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); + normal->mc_alloc_max_slots[i] = slots_per_allocator; + special->mc_alloc_max_slots[i] = slots_per_allocator; + dedup->mc_alloc_max_slots[i] = slots_per_allocator; + } + normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; +} + +static void +spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) +{ + ASSERT(spa_writeable(spa)); + + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + vdev_indirect_state_sync_verify(vd); + + if (vdev_indirect_should_condense(vd)) { + spa_condense_indirect_start_sync(vd, tx); + break; + } + } +} + +static void +spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) +{ + objset_t *mos = spa->spa_meta_objset; + dsl_pool_t *dp = spa->spa_dsl_pool; + uint64_t txg = tx->tx_txg; + bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; - /* - * Iterate to convergence. - */ do { int pass = ++spa->spa_sync_pass; @@ -6907,71 +7928,60 @@ spa_sync(spa_t *spa, uint64_t txg) ddt_sync(spa, txg); dsl_scan_sync(dp, tx); + svr_sync(spa, tx); + spa_sync_upgrades(spa, tx); - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))) + vdev_t *vd = NULL; + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + != NULL) vdev_sync(vd, txg); - if (pass == 1) { - spa_sync_upgrades(spa, tx); - ASSERT3U(txg, >=, - spa->spa_uberblock.ub_rootbp.blk_birth); + /* + * Note: We need to check if the MOS is dirty because we could + * have marked the MOS dirty without updating the uberblock + * (e.g. 
if we have sync tasks but no dirty user data). We need + * to check the uberblock's rootbp because it is updated if we + * have synced out dirty data (though in this case the MOS will + * most likely also be dirty due to second order effects, we + * don't want to rely on that here). + */ + if (pass == 1 && + spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(mos, txg)) { /* - * Note: We need to check if the MOS is dirty - * because we could have marked the MOS dirty - * without updating the uberblock (e.g. if we - * have sync tasks but no dirty user data). We - * need to check the uberblock's rootbp because - * it is updated if we have synced out dirty - * data (though in this case the MOS will most - * likely also be dirty due to second order - * effects, we don't want to rely on that here). + * Nothing changed on the first pass, therefore this + * TXG is a no-op. Avoid syncing deferred frees, so + * that we can keep this TXG as a no-op. */ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && - !dmu_objset_is_dirty(mos, txg)) { - /* - * Nothing changed on the first pass, - * therefore this TXG is a no-op. Avoid - * syncing deferred frees, so that we - * can keep this TXG as a no-op. - */ - ASSERT(txg_list_empty(&dp->dp_dirty_datasets, - txg)); - ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); - ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); - break; - } - spa_sync_deferred_frees(spa, tx); + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); + ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); + break; } + spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); +} -#ifdef ZFS_DEBUG - if (!list_is_empty(&spa->spa_config_dirty_list)) { - /* - * Make sure that the number of ZAPs for all the vdevs matches - * the number of ZAPs in the per-vdev ZAP list. This only gets - * called if the config is dirty; otherwise there may be - * outstanding AVZ operations that weren't completed in - * spa_sync_config_object. - */ - uint64_t all_vdev_zap_entry_count; - ASSERT0(zap_count(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); - ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, - all_vdev_zap_entry_count); - } -#endif +/* + * Rewrite the vdev configuration (which includes the uberblock) to + * commit the transaction group. + * + * If there are no dirty vdevs, we sync the uberblock to a few random + * top-level vdevs that are known to be visible in the config cache + * (see spa_vdev_add() for a complete description). If there *are* dirty + * vdevs, sync the uberblock to all vdevs. + */ +static void +spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg = tx->tx_txg; - /* - * Rewrite the vdev configuration (which includes the uberblock) - * to commit the transaction group. - * - * If there are no dirty vdevs, we sync the uberblock to a few - * random top-level vdevs that are known to be visible in the - * config cache (see spa_vdev_add() for a complete description). - * If there *are* dirty vdevs, sync the uberblock to all vdevs. - */ for (;;) { + int error = 0; + /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. 
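
The control flow that spa_sync_iterate_to_convergence() introduces above can be summarized in isolation. Below is a minimal user-space sketch of that loop only; pool_state_t and sync_one_pass() are stand-ins invented for illustration (not ZFS types or functions), and the two flags model dmu_objset_is_dirty(mos, txg) and the pass-1 uberblock rootbp check.

#include <stdbool.h>

/* Stand-in pool state; not a ZFS type. */
typedef struct {
	int	sync_pass;
	bool	mos_dirty;	/* models dmu_objset_is_dirty(mos, txg) */
	bool	rootbp_written;	/* models ub_rootbp.blk_birth >= txg */
} pool_state_t;

/* One pass of syncing dirty data; stub for illustration only. */
static void
sync_one_pass(pool_state_t *ps)
{
	(void) ps;
}

/*
 * Re-run sync passes until the MOS stops being dirtied.  If nothing
 * changed on the first pass, the txg is a no-op and we return early so
 * it stays a no-op (deferred frees are skipped).
 */
static void
sync_to_convergence(pool_state_t *ps)
{
	do {
		int pass = ++ps->sync_pass;

		sync_one_pass(ps);

		if (pass == 1 && !ps->rootbp_written && !ps->mos_dirty)
			break;
	} while (ps->mos_dirty);
}

The early break mirrors the comment in the hunk above: skipping the deferred frees on an unchanged first pass is what keeps an idle txg from dirtying new blocks purely as a side effect of syncing.
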
@@ -6979,17 +7989,26 @@ spa_sync(spa_t *spa, uint64_t txg) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { - vdev_t *svd[SPA_DVAS_PER_BP]; + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { - vd = rvd->vdev_child[(c0 + c) % children]; - if (vd->vdev_ms_array == 0 || vd->vdev_islog) + vdev_t *vd = + rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + + if (vd->vdev_ms_array == 0 || + vd->vdev_islog || + !vdev_is_concrete(vd)) continue; + svd[svdcount++] = vd; - if (svdcount == SPA_DVAS_PER_BP) + if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); @@ -7008,6 +8027,124 @@ spa_sync(spa_t *spa, uint64_t txg) zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } +} + +/* + * Sync the specified transaction group. New blocks may be dirtied as + * part of the process, so we iterate until it converges. + */ +void +spa_sync(spa_t *spa, uint64_t txg) +{ + vdev_t *vd = NULL; + + VERIFY(spa_writeable(spa)); + + /* + * Wait for i/os issued in open context that need to complete + * before this txg syncs. + */ + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + + /* + * Lock out configuration changes. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa->spa_syncing_txg = txg; + spa->spa_sync_pass = 0; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } + + /* + * If there are any pending vdev state changes, convert them + * into config changes that go out with this transaction group. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. + * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. + */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + spa_config_exit(spa, SCL_STATE, FTAG); + + dsl_pool_t *dp = spa->spa_dsl_pool; + dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + + spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + + NSEC_TO_TICK(spa->spa_deadman_synctime)); + + /* + * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, + * set spa_deflate if we have no raid-z vdevs. 
+ */ + if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { + vdev_t *rvd = spa->spa_root_vdev; + + int i; + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) + break; + } + if (i == rvd->vdev_children) { + spa->spa_deflate = TRUE; + VERIFY0(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx)); + } + } + + spa_sync_adjust_vdev_max_queue_depth(spa); + + spa_sync_condense_indirect(spa, tx); + + spa_sync_iterate_to_convergence(spa, tx); + +#ifdef ZFS_DEBUG + if (!list_is_empty(&spa->spa_config_dirty_list)) { + /* + * Make sure that the number of ZAPs for all the vdevs matches + * the number of ZAPs in the per-vdev ZAP list. This only gets + * called if the config is dirty; otherwise there may be + * outstanding AVZ operations that weren't completed in + * spa_sync_config_object. + */ + uint64_t all_vdev_zap_entry_count; + ASSERT0(zap_count(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); + ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, + all_vdev_zap_entry_count); + } +#endif + + if (spa->spa_vdev_removal != NULL) { + ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); + } + + spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); @@ -7031,14 +8168,17 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * Update usable space statistics. 
*/ - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))) + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + != NULL) vdev_sync_done(vd, txg); spa_update_dspace(spa); @@ -7051,6 +8191,9 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + while (zfs_pause_spa_sync) + delay(1); + spa->spa_sync_pass = 0; /* @@ -7223,7 +8366,7 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } -static sysevent_t * +sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; @@ -7239,7 +8382,7 @@ spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) return (ev); } -static void +void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL @@ -7263,7 +8406,7 @@ spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } -#if defined(_KERNEL) && defined(HAVE_SPL) +#if defined(_KERNEL) /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); @@ -7286,7 +8429,6 @@ EXPORT_SYMBOL(spa_scan_get_stats); EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); -EXPORT_SYMBOL(spa_vdev_remove); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); @@ -7321,7 +8463,7 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs); EXPORT_SYMBOL(spa_event_notify); #endif -#if defined(_KERNEL) && defined(HAVE_SPL) +#if defined(_KERNEL) module_param(spa_load_verify_maxinflight, int, 0644); MODULE_PARM_DESC(spa_load_verify_maxinflight, "Max concurrent traversal I/Os while verifying pool during import -X"); @@ -7334,9 +8476,20 @@ module_param(spa_load_verify_data, int, 0644); MODULE_PARM_DESC(spa_load_verify_data, "Set to traverse data on pool import"); +module_param(spa_load_print_vdev_tree, int, 0644); +MODULE_PARM_DESC(spa_load_print_vdev_tree, + "Print vdev tree to zfs_dbgmsg during pool import"); + /* CSTYLED */ module_param(zio_taskq_batch_pct, uint, 0444); MODULE_PARM_DESC(zio_taskq_batch_pct, "Percentage of CPUs to run an IO worker thread"); +/* BEGIN CSTYLED */ +module_param(zfs_max_missing_tvds, ulong, 0644); +MODULE_PARM_DESC(zfs_max_missing_tvds, + "Allow importing pool with up to this number of missing top-level vdevs" + " (in read-only mode)"); +/* END CSTYLED */ + #endif
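
When no vdev configs are dirty, spa_sync_rewrite_vdev_config() above writes the uberblock to a small set of top-level vdevs chosen round-robin from a random starting child, skipping log vdevs, non-concrete vdevs, and vdevs without metaslabs. The following is a self-contained sketch of just that selection; tvd_t, MIN_SYNC_VDEVS, and rand() are stand-ins for the vdev structure, SPA_SYNC_MIN_VDEVS, and spa_get_random(), and the value 3 is arbitrary for the sketch.

#include <stdbool.h>
#include <stdlib.h>

/* Stand-in for a top-level vdev; not a ZFS type. */
typedef struct {
	bool	is_log;
	bool	is_concrete;
	bool	has_metaslabs;	/* models vd->vdev_ms_array != 0 */
} tvd_t;

#define	MIN_SYNC_VDEVS	3	/* stand-in for SPA_SYNC_MIN_VDEVS */

/*
 * Starting from a random child, walk the top-level vdevs round-robin
 * and collect up to MIN_SYNC_VDEVS concrete, non-log vdevs that can
 * receive the uberblock.  Returns the number collected.
 */
static int
pick_sync_vdevs(tvd_t *children, int nchildren, tvd_t **out)
{
	int count = 0;

	if (nchildren == 0)
		return (0);

	int c0 = rand() % nchildren;

	for (int c = 0; c < nchildren; c++) {
		tvd_t *vd = &children[(c0 + c) % nchildren];

		/* Stop when we come back around to the first pick. */
		if (count > 0 && out[0] == vd)
			break;

		if (!vd->has_metaslabs || vd->is_log || !vd->is_concrete)
			continue;

		out[count++] = vd;
		if (count == MIN_SYNC_VDEVS)
			break;
	}
	return (count);
}
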
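
The final hunks also drop the static qualifier from spa_event_create() and spa_event_post(), so callers outside spa.c can split event creation from delivery. Below is a minimal sketch of that pattern using only the two signatures visible in this diff; the surrounding context, the NULL history nvlist, and the ESC_ZFS_VDEV_REMOVE_DEV event name are assumptions for illustration, not taken from this change.

/*
 * Capture the event while the relevant state is still at hand, then
 * post it only once the change has been made durable.
 */
static void
example_deferred_event(spa_t *spa, vdev_t *vd)
{
	/* Event name is an assumed example. */
	sysevent_t *ev = spa_event_create(spa, vd, NULL,
	    ESC_ZFS_VDEV_REMOVE_DEV);

	/* ... perform the change and wait for it to sync ... */

	spa_event_post(ev);
}
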