X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=module%2Fzfs%2Fspa.c;h=80f0c6f368f3f92022ce9c37f58884c1a3d3e6df;hb=afd2f7b7117ff8bf23afa70ecae86ec0c1a1461e;hp=b5e024c3f2911b069b3c1b46bb5e16f13662576c;hpb=aa9af22cdf8d16c197974c3a478d2053b3bed498;p=mirror_zfs.git diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b5e024c3f..80f0c6f36 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,9 +21,16 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright 2017 Joyent, Inc. */ /* @@ -45,12 +52,17 @@ #include #include #include +#include +#include +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -71,6 +83,8 @@ #include #ifdef _KERNEL +#include +#include #include #include #include @@ -82,6 +96,12 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +/* + * The interval, in seconds, at which failed configuration cache file writes + * should be retried. + */ +int zfs_ccw_retry_interval = 300; + typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ @@ -138,7 +158,7 @@ static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); @@ -149,6 +169,12 @@ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +/* + * Report any spa_load_verify errors found, but do not fail spa_load. + * This is used by zdb to analyze non-idle pools. + */ +boolean_t spa_load_verify_dryrun = B_FALSE; + /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. 
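The zti_modes table above determines how many threads each ZIO taskq receives: ZTI_MODE_FIXED uses its value directly (minimum 1), while ZTI_MODE_BATCH ignores the value and sizes the taskq as a percentage of the CPUs via zio_taskq_batch_pct (a later hunk in this diff caps that percentage at 100). A minimal userspace sketch of that resolution logic, using hypothetical *_model names rather than the kernel types from spa.c:

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's zti_modes values. */
typedef enum {
	ZTI_MODEL_FIXED,	/* value is # of threads (min 1) */
	ZTI_MODEL_BATCH		/* cpu-intensive; value is ignored */
} zti_mode_model_t;

static unsigned int
zti_thread_count_model(zti_mode_model_t mode, unsigned int value,
    unsigned int ncpus, unsigned int batch_pct)
{
	switch (mode) {
	case ZTI_MODEL_FIXED:
		return (value < 1 ? 1 : value);
	case ZTI_MODEL_BATCH:
		/* Mirrors value = MIN(zio_taskq_batch_pct, 100). */
		if (batch_pct > 100)
			batch_pct = 100;
		/* TASKQ_THREADS_CPU_PCT: a percentage of ncpus, rounded up. */
		return ((ncpus * batch_pct + 99) / 100);
	}
	return (1);
}

int
main(void)
{
	printf("FIXED(8) -> %u threads\n",
	    zti_thread_count_model(ZTI_MODEL_FIXED, 8, 16, 75));
	printf("BATCH    -> %u threads\n",
	    zti_thread_count_model(ZTI_MODEL_BATCH, 0, 16, 75));
	return (0);
}

With 16 CPUs and a 75% batch percentage this prints 8 and 12 threads respectively.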
@@ -192,7 +218,7 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; - zprop_source_t src = ZPROP_SRC_NONE; + const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); @@ -224,11 +250,13 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) rvd->vdev_state, src); version = spa_version(spa); - if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) - src = ZPROP_SRC_DEFAULT; - else - src = ZPROP_SRC_LOCAL; - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + version, ZPROP_SRC_DEFAULT); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + version, ZPROP_SRC_LOCAL); + } } if (pool != NULL) { @@ -274,6 +302,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MAX_SIZE, ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MIN_SIZE, ZPROP_SRC_NONE); + } + if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, @@ -324,7 +360,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; - if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) + if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { @@ -346,8 +382,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) break; } - strval = kmem_alloc( - MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, + strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); @@ -360,8 +395,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) - kmem_free(strval, - MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; @@ -413,8 +447,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); - switch ((int)prop) { - case ZPROP_INVAL: + switch (prop) { + case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; @@ -465,6 +499,16 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(EINVAL); break; + case ZPOOL_PROP_MULTIHOST: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = SET_ERROR(EINVAL); + + if (!error && !spa_get_hostid()) + error = SET_ERROR(ENOTSUP); + + break; + case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, @@ -505,7 +549,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) /* * Must be ZPL, and its property settings * must be supported by GRUB (compression - * is not gzip, and large blocks are not used). + * is not gzip, and large blocks or large + * dnodes are not used). 
*/ if (dmu_objset_type(os) != DMU_OST_ZFS) { @@ -518,9 +563,9 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(ENOTSUP); } else if ((error = dsl_prop_get_int_ds(dmu_objset_ds(os), - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + zfs_prop_to_name(ZFS_PROP_DNODESIZE), &propval)) == 0 && - propval > SPA_OLD_MAXBLOCKSIZE) { + propval != ZFS_DNSIZE_LEGACY) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); @@ -531,8 +576,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); - if (!error && (intval < ZIO_FAILURE_MODE_WAIT || - intval > ZIO_FAILURE_MODE_PANIC)) + if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* @@ -582,7 +626,6 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(EINVAL); break; } - check++; } if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); @@ -662,7 +705,7 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) prop == ZPOOL_PROP_READONLY) continue; - if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { + if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { @@ -721,10 +764,10 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { + ASSERTV(uint64_t *newguid = arg); spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; - ASSERTV(uint64_t *newguid = arg); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; @@ -781,8 +824,8 @@ spa_change_guid(spa_t *spa) spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { - spa_config_sync(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID); + spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); @@ -800,19 +843,14 @@ spa_change_guid(spa_t *spa) static int spa_error_entry_compare(const void *a, const void *b) { - spa_error_entry_t *sa = (spa_error_entry_t *)a; - spa_error_entry_t *sb = (spa_error_entry_t *)b; + const spa_error_entry_t *sa = (const spa_error_entry_t *)a; + const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; - ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); + return (AVL_ISIGN(ret)); } /* @@ -844,7 +882,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; char name[32]; - uint_t i, flags = TASKQ_DYNAMIC; + uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { @@ -862,12 +900,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); + flags |= TASKQ_DYNAMIC; break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; - value = zio_taskq_batch_pct; + value = MIN(zio_taskq_batch_pct, 100); break; default: @@ -877,7 +916,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) break; } - for (i = 0; i < count; i++) { + for (uint_t i = 0; i < count; i++) { taskq_t *tq; if (count > 1) { @@ -898,11 +937,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU - * intensive. 
Run it at slightly lower priority - * than the other taskqs. + * intensive. Run it at slightly less important + * priority than the other taskqs. Under Linux this + * means incrementing the priority value; on platforms + * like illumos it should be decremented. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) - pri--; + pri++; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); @@ -916,14 +957,13 @@ static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - uint_t i; if (tqs->stqs_taskq == NULL) { ASSERT3U(tqs->stqs_count, ==, 0); return; } - for (i = 0; i < tqs->stqs_count; i++) { + for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } @@ -985,15 +1025,18 @@ spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, static void spa_create_zio_taskqs(spa_t *spa) { - int t, q; - - for (t = 0; t < ZIO_TYPES; t++) { - for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } +/* + * Disabled until spa_thread() can be adapted for Linux. + */ +#undef HAVE_SPA_THREAD + #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) @@ -1113,6 +1156,9 @@ spa_activate(spa_t *spa, int mode) spa_create_zio_taskqs(spa); } + for (size_t i = 0; i < TXG_SIZE; i++) + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), @@ -1120,7 +1166,7 @@ spa_activate(spa_t *spa, int mode) list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); - txg_list_create(&spa->spa_vdev_txg_list, + txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, @@ -1129,6 +1175,41 @@ spa_activate(spa_t *spa, int mode) avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); + + spa_keystore_init(&spa->spa_keystore); + + /* + * This taskq is used to perform zvol-minor-related tasks + * asynchronously. This has several advantages, including easy + * resolution of various deadlocks (zfsonlinux bug #3681). + * + * The taskq must be single threaded to ensure tasks are always + * processed in the order in which they were dispatched. + * + * A taskq per pool allows one to keep the pools independent. + * This way if one pool is suspended, it will not impact another. + * + * The preferred location to dispatch a zvol minor task is a sync + * task. In this context, there is easy access to the spa_t and minimal + * error handling is required because the sync task must succeed. + */ + spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, + 1, INT_MAX, 0); + + /* + * Taskq dedicated to prefetcher threads: this is used to prevent the + * pool traverse code from monopolizing the global (and limited) + * system_taskq by inappropriately scheduling long running tasks on it. + */ + spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + + /* + * The taskq to upgrade datasets in this pool. Currently used by + * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
+ */ + spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); } /* @@ -1137,8 +1218,6 @@ spa_activate(spa_t *spa, int mode) static void spa_deactivate(spa_t *spa) { - int t, q; - ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); @@ -1147,20 +1226,41 @@ spa_deactivate(spa_t *spa) spa_evicting_os_wait(spa); + if (spa->spa_zvol_taskq) { + taskq_destroy(spa->spa_zvol_taskq); + spa->spa_zvol_taskq = NULL; + } + + if (spa->spa_prefetch_taskq) { + taskq_destroy(spa->spa_prefetch_taskq); + spa->spa_prefetch_taskq = NULL; + } + + if (spa->spa_upgrade_taskq) { + taskq_destroy(spa->spa_upgrade_taskq); + spa->spa_upgrade_taskq = NULL; + } + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); - taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); - for (t = 0; t < ZIO_TYPES; t++) { - for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } + for (size_t i = 0; i < TXG_SIZE; i++) { + ASSERT3P(spa->spa_txg_zio[i], !=, NULL); + VERIFY0(zio_wait(spa->spa_txg_zio[i])); + spa->spa_txg_zio[i] = NULL; + } + metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; @@ -1172,10 +1272,11 @@ spa_deactivate(spa_t *spa) * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); - avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); + spa_keystore_fini(&spa->spa_keystore); + spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); @@ -1217,7 +1318,6 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, nvlist_t **child; uint_t children; int error; - int c; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); @@ -1237,7 +1337,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (SET_ERROR(EINVAL)); } - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { @@ -1262,6 +1362,8 @@ spa_unload(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + spa_load_note(spa, "UNLOADING"); + /* * Stop async tasks. */ @@ -1275,16 +1377,45 @@ spa_unload(spa_t *spa) spa->spa_sync_on = B_FALSE; } + /* + * Even though vdev_free() also calls vdev_metaslab_fini, we need + * to call it earlier, before we wait for async i/o to complete. + * This ensures that there is no async metaslab prefetching, by + * calling taskq_wait(mg_taskq). + */ + if (spa->spa_root_vdev != NULL) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) + vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + if (spa->spa_mmp.mmp_thread) + mmp_thread_stop(spa); + /* * Wait for any outstanding async I/O to complete. 
*/ if (spa->spa_async_zio_root != NULL) { - for (i = 0; i < max_ncpus; i++) + for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } + if (spa->spa_vdev_removal != NULL) { + spa_vdev_removal_destroy(spa->spa_vdev_removal); + spa->spa_vdev_removal = NULL; + } + + if (spa->spa_condense_zthr != NULL) { + ASSERT(!zthr_isrunning(spa->spa_condense_zthr)); + zthr_destroy(spa->spa_condense_zthr); + spa->spa_condense_zthr = NULL; + } + + spa_condense_fini(spa); + bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -1307,7 +1438,6 @@ spa_unload(spa_t *spa) ddt_unload(spa); - /* * Drop and purge level 2 cache */ @@ -1343,6 +1473,8 @@ spa_unload(spa_t *spa) spa->spa_async_suspended = 0; + spa->spa_indirect_vdevs_loaded = B_FALSE; + if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; @@ -1357,7 +1489,7 @@ spa_unload(spa_t *spa) * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ -static void +void spa_load_spares(spa_t *spa) { nvlist_t **spares; @@ -1474,10 +1606,10 @@ spa_load_spares(spa_t *spa) * Devices which are already active have their details maintained, and are * not re-opened. */ -static void +void spa_load_l2cache(spa_t *spa) { - nvlist_t **l2cache; + nvlist_t **l2cache = NULL; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; @@ -1486,20 +1618,21 @@ spa_load_l2cache(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - if (sav->sav_config != NULL) { - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); - } else { - nl2cache = 0; - newvdevs = NULL; - } - oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; + if (sav->sav_config == NULL) { + nl2cache = 0; + newvdevs = NULL; + goto out; + } + + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); + /* * Process new nvlist of vdevs. */ @@ -1550,6 +1683,26 @@ spa_load_l2cache(spa_t *spa) } } + sav->sav_vdevs = newvdevs; + sav->sav_count = (int)nl2cache; + + /* + * Recompute the stashed list of l2cache devices, with status + * information this time. + */ + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + DATA_TYPE_NVLIST_ARRAY) == 0); + + if (sav->sav_count > 0) + l2cache = kmem_alloc(sav->sav_count * sizeof (void *), + KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + l2cache[i] = vdev_config_generate(spa, + sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); + +out: /* * Purge vdevs that were dropped */ @@ -1571,26 +1724,6 @@ spa_load_l2cache(spa_t *spa) if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); - if (sav->sav_config == NULL) - goto out; - - sav->sav_vdevs = newvdevs; - sav->sav_count = (int)nl2cache; - - /* - * Recompute the stashed list of l2cache devices, with status - * information this time. 
- */ - VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, - DATA_TYPE_NVLIST_ARRAY) == 0); - - l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); - for (i = 0; i < sav->sav_count; i++) - l2cache[i] = vdev_config_generate(spa, - sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); -out: for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) @@ -1630,16 +1763,26 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && - !vd->vdev_ishole) { - zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE, - vd->vdev_spa, vd, NULL, 0, 0); - spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_CHECK); + vdev_is_concrete(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); + } +} + +static void +spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd) +{ + ASSERT3U(vd->vdev_children, ==, mvd->vdev_children); + + vd->vdev_top_zap = mvd->vdev_top_zap; + vd->vdev_leaf_zap = mvd->vdev_leaf_zap; + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]); } } @@ -1651,7 +1794,6 @@ spa_config_valid(spa_t *spa, nvlist_t *config) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv; - int c, i; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); @@ -1669,11 +1811,11 @@ spa_config_valid(spa_t *spa, nvlist_t *config) nvlist_t **child, *nv; uint64_t idx = 0; - child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), + child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - for (c = 0; c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; @@ -1690,7 +1832,7 @@ spa_config_valid(spa_t *spa, nvlist_t *config) VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); - for (i = 0; i < idx; i++) + for (int i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); @@ -1702,33 +1844,32 @@ spa_config_valid(spa_t *spa, nvlist_t *config) * from the MOS config (mrvd). Check each top-level vdev * with the corresponding MOS config top-level (mtvd). */ - for (c = 0; c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; /* * Resolve any "missing" vdevs in the current configuration. + * Also trust the MOS config about any "indirect" vdevs. * If we find that the MOS config has more accurate information * about the top-level vdev then use that vdev instead. */ - if (tvd->vdev_ops == &vdev_missing_ops && - mtvd->vdev_ops != &vdev_missing_ops) { - - if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) - continue; + if ((tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) || + (mtvd->vdev_ops == &vdev_indirect_ops && + tvd->vdev_ops != &vdev_indirect_ops)) { /* * Device specific actions. 
*/ if (mtvd->vdev_islog) { + if (!(spa->spa_import_flags & + ZFS_IMPORT_MISSING_LOG)) { + continue; + } + spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - /* - * XXX - once we have 'readonly' pool - * support we should be able to handle - * missing data devices by transitioning - * the pool to readonly. - */ + } else if (mtvd->vdev_ops != &vdev_indirect_ops) { continue; } @@ -1742,21 +1883,34 @@ spa_config_valid(spa_t *spa, nvlist_t *config) vdev_add_child(rvd, mtvd); vdev_add_child(mrvd, tvd); - spa_config_exit(spa, SCL_ALL, FTAG); - vdev_load(mtvd); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_reopen(rvd); - } else if (mtvd->vdev_islog) { + } else { + if (mtvd->vdev_islog) { + /* + * Load the slog device's state from the MOS + * config since it's possible that the label + * does not contain the most up-to-date + * information. + */ + vdev_load_log_state(tvd, mtvd); + vdev_reopen(tvd); + } + /* - * Load the slog device's state from the MOS config - * since it's possible that the label does not - * contain the most up-to-date information. + * Per-vdev ZAP info is stored exclusively in the MOS. */ - vdev_load_log_state(tvd, mtvd); - vdev_reopen(tvd); + spa_config_valid_zaps(tvd, mtvd); } + + /* + * Never trust this info from userland; always use what's + * in the MOS. This prevents it from getting out of sync + * with the rest of the info in the MOS. + */ + tvd->vdev_removing = mtvd->vdev_removing; + tvd->vdev_indirect_config = mtvd->vdev_indirect_config; } + vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); @@ -1795,14 +1949,13 @@ spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; - int c; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); - for (c = 0; c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -1819,11 +1972,10 @@ static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; - int c; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - for (c = 0; c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -1833,11 +1985,11 @@ spa_activate_log(spa_t *spa) } int -spa_offline_log(spa_t *spa) +spa_reset_logs(spa_t *spa) { int error; - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* @@ -1853,9 +2005,7 @@ spa_offline_log(spa_t *spa) static void spa_aux_check_removed(spa_aux_vdev_t *sav) { - int i; - - for (i = 0; i < sav->sav_count; i++) + for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } @@ -1887,17 +2037,17 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; spa_t *spa = zio->io_spa; + abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) - atomic_add_64(&sle->sle_meta_count, 1); + atomic_inc_64(&sle->sle_meta_count); else - atomic_add_64(&sle->sle_data_count, 1); + atomic_inc_64(&sle->sle_data_count); } - zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; + spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } @@ -1915,11 +2065,7 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const 
dnode_phys_t *dnp, void *arg) { - zio_t *rio; - size_t size; - void *data; - - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if @@ -1928,26 +2074,35 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (!spa_load_verify_metadata) return (0); - if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) + if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); - rio = arg; - size = BP_GET_PSIZE(bp); - data = zio_data_buf_alloc(size); + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) + while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(rio, spa, bp, data, size, + zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } +/* ARGSUSED */ +int +verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + return (0); +} + static int spa_load_verify(spa_t *spa) { @@ -1962,13 +2117,28 @@ spa_load_verify(spa_t *spa) if (policy.zrp_request & ZPOOL_NEVER_REWIND) return (0); + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + error = dmu_objset_find_dp(spa->spa_dsl_pool, + spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, + DS_FIND_CHILDREN); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + if (error != 0) + return (error); + rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { + if (spa->spa_extreme_rewind) { + spa_load_note(spa, "performing a complete scan of the " + "pool since extreme rewind is on. This may take " + "a very long time.\n (spa_load_verify_data=%u, " + "spa_load_verify_metadata=%u)", + spa_load_verify_data, spa_load_verify_metadata); + } error = traverse_pool(spa, spa->spa_verify_min_txg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, - spa_load_verify_cb, rio); + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); @@ -1976,8 +2146,15 @@ spa_load_verify(spa_t *spa) spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; - if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && - sle.sle_data_count <= policy.zrp_maxdata) { + if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { + spa_load_note(spa, "spa_load_verify found %llu metadata errors " + "and %llu data errors", (u_longlong_t)sle.sle_meta_count, + (u_longlong_t)sle.sle_data_count); + } + + if (spa_load_verify_dryrun || + (!error && sle.sle_meta_count <= policy.zrp_maxmeta && + sle.sle_data_count <= policy.zrp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; @@ -1995,6 +2172,9 @@ spa_load_verify(spa_t *spa) spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } + if (spa_load_verify_dryrun) + return (0); + if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); @@ -2018,17 +2198,34 @@ spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) * Find a value in the pool directory object. 
*/ static int -spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { - return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - name, sizeof (uint64_t), 1, val)); + int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val); + + if (error != 0 && (error != ENOENT || log_enoent)) { + spa_load_failed(spa, "couldn't get '%s' value in MOS directory " + "[error=%d]", name, error); + } + + return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); - return (err); + return (SET_ERROR(err)); +} + +static void +spa_spawn_aux_threads(spa_t *spa) +{ + ASSERT(spa_writeable(spa)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_start_indirect_condensing_thread(spa); } /* @@ -2115,7 +2312,7 @@ spa_try_repair(spa_t *spa, nvlist_t *config) static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, - boolean_t mosconfig) + boolean_t trust_config) { nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; @@ -2159,7 +2356,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, - mosconfig, &ereport); + trust_config, &ereport); } /* @@ -2174,7 +2371,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; @@ -2183,109 +2380,358 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, return (error); } +#ifdef ZFS_DEBUG /* - * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. + * Count the number of per-vdev ZAPs associated with all of the vdevs in the + * vdev tree rooted in the given vd, and ensure that each ZAP is present in the + * spa's per-vdev ZAP list. */ -__attribute__((always_inline)) -static inline int -spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, - char **ereport) +static uint64_t +vdev_count_verify_zaps(vdev_t *vd) { - int error = 0; - nvlist_t *nvroot = NULL; - nvlist_t *label; - vdev_t *rvd; - uberblock_t *ub = &spa->spa_uberblock; - uint64_t children, config_cache_txg = spa->spa_config_txg; - int orig_mode = spa->spa_mode; - int parse, i; - uint64_t obj; - boolean_t missing_feat_write = B_FALSE; + spa_t *spa = vd->vdev_spa; + uint64_t total = 0; - /* - * If this is an untrusted config, access the pool in read-only mode. - * This prevents things like resilvering recently removed devices. 
- */ - if (!mosconfig) - spa->spa_mode = FREAD; + if (vd->vdev_top_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_top_zap)); + } + if (vd->vdev_leaf_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); + } - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + for (uint64_t i = 0; i < vd->vdev_children; i++) { + total += vdev_count_verify_zaps(vd->vdev_child[i]); + } - spa->spa_load_state = state; + return (total); +} +#endif - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) - return (SET_ERROR(EINVAL)); +/* + * Determine whether the activity check is required. + */ +static boolean_t +spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, + nvlist_t *config) +{ + uint64_t state = 0; + uint64_t hostid = 0; + uint64_t tryconfig_txg = 0; + uint64_t tryconfig_timestamp = 0; + nvlist_t *nvinfo; - parse = (type == SPA_IMPORT_EXISTING ? - VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, + &tryconfig_txg); + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, + &tryconfig_timestamp); + } + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* - * Create "The Godfather" zio to hold all async IOs + * Disable the MMP activity check - This is used by zdb which + * is intended to be used on potentially active pools. */ - spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), - KM_SLEEP); - for (i = 0; i < max_ncpus; i++) { - spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - } + if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) + return (B_FALSE); /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. + * Skip the activity check when the MMP feature is disabled. */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) - return (error); - - ASSERT(spa->spa_root_vdev == rvd); - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); - - if (type != SPA_IMPORT_ASSEMBLE) { - ASSERT(spa_guid(spa) == pool_guid); - } - + if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) + return (B_FALSE); /* - * Try to open all vdevs, loading each label in the process. + * If the tryconfig_* values are nonzero, they are the results of an + * earlier tryimport. If they match the uberblock we just found, then + * the pool has not changed and we return false so we do not test a + * second time. */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - return (error); + if (tryconfig_txg && tryconfig_txg == ub->ub_txg && + tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp) + return (B_FALSE); /* - * We need to validate the vdev labels against the configuration that - * we have in hand, which is dependent on the setting of mosconfig. If - * mosconfig is true then we're validating the vdev labels based on - * that config. 
Otherwise, we're validating against the cached config - * (zpool.cache) that was read when we loaded the zfs module, and then - * later we will recursively call spa_load() and validate against - * the vdev config. - * - * If we're assembling a new pool that's been split off from an - * existing pool, the labels haven't yet been updated so we skip - * validation for now. + * Allow the activity check to be skipped when importing the pool + * on the same host which last imported it. Since the hostid from + * configuration may be stale use the one read from the label. */ - if (type != SPA_IMPORT_ASSEMBLE) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd, mosconfig); - spa_config_exit(spa, SCL_ALL, FTAG); + if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); - if (error != 0) + if (hostid == spa_get_hostid()) + return (B_FALSE); + + /* + * Skip the activity test when the pool was cleanly exported. + */ + if (state != POOL_STATE_ACTIVE) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Perform the import activity check. If the user canceled the import or + * we detected activity then fail. + */ +static int +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +{ + uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); + uint64_t txg = ub->ub_txg; + uint64_t timestamp = ub->ub_timestamp; + uint64_t import_delay = NANOSEC; + hrtime_t import_expire; + nvlist_t *mmp_label = NULL; + vdev_t *rvd = spa->spa_root_vdev; + kcondvar_t cv; + kmutex_t mtx; + int error = 0; + + cv_init(&cv, NULL, CV_DEFAULT, NULL); + mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_enter(&mtx); + + /* + * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed + * during the earlier tryimport. If the txg recorded there is 0 then + * the pool is known to be active on another host. + * + * Otherwise, the pool might be in use on another node. Check for + * changes in the uberblocks on disk if necessary. + */ + if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { + nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_LOAD_INFO); + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && + fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { + vdev_uberblock_load(rvd, ub, &mmp_label); + error = SET_ERROR(EREMOTEIO); + goto out; + } + } + + /* + * Preferentially use the zfs_multihost_interval from the node which + * last imported the pool. This value is stored in an MMP uberblock as. + * + * ub_mmp_delay * vdev_count_leaves() == zfs_multihost_interval + */ + if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay) + import_delay = MAX(import_delay, import_intervals * + ub->ub_mmp_delay * MAX(vdev_count_leaves(spa), 1)); + + /* Apply a floor using the local default values. 
*/ + import_delay = MAX(import_delay, import_intervals * + MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL))); + + zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu import_intervals=%u " + "leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals, + vdev_count_leaves(spa)); + + /* Add a small random factor in case of simultaneous imports (0-25%) */ + import_expire = gethrtime() + import_delay + + (import_delay * spa_get_random(250) / 1000); + + while (gethrtime() < import_expire) { + vdev_uberblock_load(rvd, ub, &mmp_label); + + if (txg != ub->ub_txg || timestamp != ub->ub_timestamp) { + error = SET_ERROR(EREMOTEIO); + break; + } + + if (mmp_label) { + nvlist_free(mmp_label); + mmp_label = NULL; + } + + error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); + if (error != -1) { + error = SET_ERROR(EINTR); + break; + } + error = 0; + } + +out: + mutex_exit(&mtx); + mutex_destroy(&mtx); + cv_destroy(&cv); + + /* + * If the pool is determined to be active store the status in the + * spa->spa_load_info nvlist. If the remote hostname or hostid are + * available from configuration read from disk store them as well. + * This allows 'zpool import' to generate a more useful message. + * + * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) + * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool + * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool + */ + if (error == EREMOTEIO) { + char *hostname = ""; + uint64_t hostid = 0; + + if (mmp_label) { + if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { + hostname = fnvlist_lookup_string(mmp_label, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(spa->spa_load_info, + ZPOOL_CONFIG_MMP_HOSTNAME, hostname); + } + + if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { + hostid = fnvlist_lookup_uint64(mmp_label, + ZPOOL_CONFIG_HOSTID); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_HOSTID, hostid); + } + } + + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_TXG, 0); + + error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); + } + + if (mmp_label) + nvlist_free(mmp_label); + + return (error); +} + +static int +spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, + spa_import_type_t type) +{ + int error = 0; + nvlist_t *nvtree = NULL; + int parse; + vdev_t *rvd; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { + spa_load_failed(spa, "invalid config provided: '%s' missing", + ZPOOL_CONFIG_VDEV_TREE); + return (SET_ERROR(EINVAL)); + } + + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), + KM_SLEEP); + for (int i = 0; i < max_ncpus; i++) { + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. 
+ */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "unable to parse config [error=%d]", + error); + return (error); + } + + ASSERT(spa->spa_root_vdev == rvd); + ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } + + return (0); +} + +static int +spa_ld_open_vdevs(spa_t *spa) +{ + int error = 0; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(spa->spa_root_vdev); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) { + spa_load_failed(spa, "unable to open vdev tree [error=%d]", + error); + } + + return (error); +} + +static int +spa_ld_validate_vdevs(spa_t *spa, spa_import_type_t type, + boolean_t trust_config) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * We need to validate the vdev labels against the configuration that + * we have in hand, which is dependent on the setting of trust_config. + * If trust_config is true then we're validating the vdev labels based + * on that config. Otherwise, we're validating against the cached + * config (zpool.cache) that was read when we loaded the zfs module, and + * then later we will recursively call spa_load() and validate against + * the vdev config. + * + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd, trust_config); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "vdev_validate failed [error=%d]", + error); return (error); + } - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + spa_load_failed(spa, "cannot open vdev tree after " + "invalidating some vdevs"); return (SET_ERROR(ENXIO)); + } } + return (0); +} + +static int +spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, + boolean_t trust_config) +{ + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *label; + uberblock_t *ub = &spa->spa_uberblock; + uint64_t children; + boolean_t activity_check = B_FALSE; + /* * Find the best uberblock. */ @@ -2296,14 +2742,48 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ if (ub->ub_txg == 0) { nvlist_free(label); + spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } + spa_load_note(spa, "using uberblock with txg=%llu", + (u_longlong_t)ub->ub_txg); + + + /* + * For pools which have the multihost property on determine if the + * pool is truly inactive and can be safely imported. Prevent + * hosts which don't have a hostid set from importing the pool. 
+ */ + activity_check = spa_activity_check_required(spa, ub, label, config); + if (activity_check) { + if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && + spa_get_hostid() == 0) { + nvlist_free(label); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + } + + int error = spa_activity_check(spa, ub, config); + if (error) { + nvlist_free(label); + return (error); + } + + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); + } + /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); + spa_load_failed(spa, "version %llu is not supported", + (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } @@ -2314,9 +2794,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ - if (label == NULL || nvlist_lookup_nvlist(label, - ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { + if (label == NULL) { + spa_load_failed(spa, "label config unavailable"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) != 0) { nvlist_free(label); + spa_load_failed(spa, "invalid label: '%s' missing", + ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } @@ -2338,13 +2826,12 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; - nvpair_t *nvp; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); - for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); - nvp != NULL; + for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, + NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, @@ -2356,6 +2843,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); + spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } @@ -2371,9 +2859,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * can handle missing vdevs. 
*/ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && - rvd->vdev_guid_sum != ub->ub_guid_sum) + &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && + rvd->vdev_guid_sum != ub->ub_guid_sum) { + spa_load_failed(spa, "guid sum in config doesn't match guid " + "sum in uberblock (%llu != %llu)", + (u_longlong_t)rvd->vdev_guid_sum, + (u_longlong_t)ub->ub_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -2395,31 +2888,125 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; + return (0); +} + +static int +spa_ld_open_rootbp(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error) + if (error != 0) { + spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " + "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) + return (0); +} + +static int +spa_ld_validate_config(spa_t *spa, spa_import_type_t type) +{ + vdev_t *rvd = spa->spa_root_vdev; + + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) + != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + nvlist_t *mos_config; + if (load_nvlist(spa, spa->spa_config_object, &mos_config) + != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (!spa_config_valid(spa, mos_config)) { + nvlist_free(mos_config); + spa_load_failed(spa, "mismatch between config provided " + "and config stored in MOS"); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } + nvlist_free(mos_config); + + /* + * Now that we've validated the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + spa_load_failed(spa, "some top vdevs are unavailable"); + return (SET_ERROR(ENXIO)); + } + } + + return (0); +} + +static int +spa_ld_open_indirect_vdev_metadata(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * Everything that we read before spa_remove_init() must be stored + * on concreted vdevs. Therefore we do this as early as possible. + */ + error = spa_remove_init(spa); + if (error != 0) { + spa_load_failed(spa, "spa_remove_init failed [error=%d]", + error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * Retrieve information needed to condense indirect vdev mappings. 
+ */ + error = spa_condense_init(spa); + if (error != 0) { + spa_load_failed(spa, "spa_condense_init failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + return (0); +} + +static int +spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; - spa_feature_t i; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, - &spa->spa_feat_for_read_obj) != 0) { + &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, - &spa->spa_feat_for_write_obj) != 0) { + &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, - &spa->spa_feat_desc_obj) != 0) { + &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } @@ -2430,10 +3017,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; - if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { + if (spa_writeable(spa) || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { - missing_feat_write = B_TRUE; + *missing_feat_writep = B_TRUE; } } @@ -2472,8 +3060,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * userland in order to know whether to display the * abovementioned note. */ - if (missing_feat_read || (missing_feat_write && + if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { + spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } @@ -2482,7 +3071,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. 
*/ - for (i = 0; i < SPA_FEATURES; i++) { + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, @@ -2493,6 +3082,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { + spa_load_failed(spa, "error getting refcount " + "for feature %s [error=%d]", + spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } @@ -2501,82 +3093,113 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, - &spa->spa_feat_enabled_txg_obj) != 0) + &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } + return (0); +} + +static int +spa_ld_load_special_directories(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; - if (error != 0) + if (error != 0) { + spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } - if (!mosconfig) { - uint64_t hostid; - nvlist_t *policy = NULL, *nvconfig; + return (0); +} - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); +static int +spa_ld_prepare_for_reload(spa_t *spa, int orig_mode) +{ + vdev_t *rvd = spa->spa_root_vdev; - if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, - ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - char *hostname; - unsigned long myhostid = 0; + uint64_t hostid; + nvlist_t *policy = NULL; + nvlist_t *mos_config; - VERIFY(nvlist_lookup_string(nvconfig, - ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } -#ifdef _KERNEL - myhostid = zone_get_hostid(NULL); -#else /* _KERNEL */ - /* - * We're emulating the system's hostid in userland, so - * we can't use zone_get_hostid(). - */ - (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); -#endif /* _KERNEL */ - if (hostid != 0 && myhostid != 0 && - hostid != myhostid) { - nvlist_free(nvconfig); - cmn_err(CE_WARN, "pool '%s' could not be " - "loaded as it was last accessed by another " - "system (host: %s hostid: 0x%lx). 
See: " - "http://zfsonlinux.org/msg/ZFS-8000-EY", - spa_name(spa), hostname, - (unsigned long)hostid); - return (SET_ERROR(EBADF)); - } + if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + char *hostname; + unsigned long myhostid = 0; + + VERIFY(nvlist_lookup_string(mos_config, + ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); + + myhostid = spa_get_hostid(); + if (hostid && myhostid && hostid != myhostid) { + nvlist_free(mos_config); + return (SET_ERROR(EBADF)); } - if (nvlist_lookup_nvlist(spa->spa_config, - ZPOOL_REWIND_POLICY, &policy) == 0) - VERIFY(nvlist_add_nvlist(nvconfig, - ZPOOL_REWIND_POLICY, policy) == 0); + } + if (nvlist_lookup_nvlist(spa->spa_config, + ZPOOL_REWIND_POLICY, &policy) == 0) + VERIFY(nvlist_add_nvlist(mos_config, + ZPOOL_REWIND_POLICY, policy) == 0); - spa_config_set(spa, nvconfig); - spa_unload(spa); - spa_deactivate(spa); - spa_activate(spa, orig_mode); + spa_config_set(spa, mos_config); + spa_unload(spa); + spa_deactivate(spa); + spa_activate(spa, orig_mode); - return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); + return (0); +} + +static int +spa_ld_get_props(spa_t *spa) +{ + int error = 0; + uint64_t obj; + vdev_t *rvd = spa->spa_root_vdev; + + /* Grab the checksum salt from the MOS. */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes); + if (error == ENOENT) { + /* Generate a new salt for subsequent use */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + } else if (error != 0) { + spa_load_failed(spa, "unable to retrieve checksum salt from " + "MOS [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } - if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); - if (error != 0) + if (error != 0) { + spa_load_failed(spa, "error opening deferred-frees bpobj " + "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ - error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, - &spa->spa_creation_version); + &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); @@ -2584,12 +3207,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Load the persistent error log. If we have an older pool, this will * not be present. 
*/ - error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, + B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, - &spa->spa_errlog_scrub); + &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); @@ -2597,10 +3221,80 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Load the history object. If we have an older pool, this * will not be present. */ - error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* + * Load the per-vdev ZAP map. If we have an older pool, this will not + * be present; in this case, defer its creation to a later time to + * avoid dirtying the MOS this early, outside of sync context. See + * spa_sync_config_object. + */ + + /* The sentinel is only available in the MOS config. */ + nvlist_t *mos_config; + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, + &spa->spa_all_vdev_zaps, B_FALSE); + + if (error == ENOENT) { + VERIFY(!nvlist_exists(mos_config, + ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); + spa->spa_avz_action = AVZ_ACTION_INITIALIZE; + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } else if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { + /* + * An older version of ZFS overwrote the sentinel value, so + * we have orphaned per-vdev ZAPs in the MOS. Defer their + * destruction to later; see spa_sync_config_object. + */ + spa->spa_avz_action = AVZ_ACTION_DESTROY; + /* + * We're assuming that no vdevs have had their ZAPs created + * before this. Better be sure of it. + */ + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } + nvlist_free(mos_config); + + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, + B_FALSE); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (error == 0) { + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); + spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, + &spa->spa_dedup_ditto); + + spa->spa_autoreplace = (autoreplace != 0); + } + + return (0); +} + +static int +spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache @@ -2610,14 +3304,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Load any hot spares for this pool. 
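* The spare list is kept in the MOS as a packed nvlist; load_nvlist() * below unpacks it into sav_config so that spa_load_spares() can * construct the in-core spare vdevs.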
*/ - error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, + B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) + &spa->spa_spares.sav_config) != 0) { + spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); @@ -2630,14 +3327,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, - &spa->spa_l2cache.sav_object); + &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) + &spa->spa_l2cache.sav_config) != 0) { + spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); @@ -2646,100 +3345,336 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_l2cache.sav_sync = B_TRUE; } - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + return (0); +} - error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); - if (error && error != ENOENT) +static int +spa_ld_load_vdev_metadata(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * If the 'multihost' property is set, then never allow a pool to + * be imported when the system hostid is zero. The exception to + * this rule is zdb which is always allowed to access pools. + */ + if (spa_multihost(spa) && spa_get_hostid() == 0 && + (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + } + + /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + spa_check_removed(spa->spa_root_vdev); + /* + * For the import case, this is done in spa_import(), because + * at this point we're using the spare definitions from + * the MOS config, not necessarily from the userland config. + */ + if (spa->spa_load_state != SPA_LOAD_IMPORT) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + } + + /* + * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. + */ + error = vdev_load(rvd); + if (error != 0) { + spa_load_failed(spa, "vdev_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + /* + * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
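+ * A DTL ("dirty time log") records the txgs for which a vdev has + * missing data. vdev_dtl_reassess() recomputes each interior vdev's + * DTL from its children, and the result drives later resilver + * decisions.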
+ */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + spa_config_exit(spa, SCL_ALL, FTAG); + + return (0); +} + +static int +spa_ld_load_dedup_tables(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = ddt_load(spa); + if (error != 0) { + spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } - if (error == 0) { - uint64_t autoreplace = 0; + return (0); +} - spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); - spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); - spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); - spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); - spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); - spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, - &spa->spa_dedup_ditto); +static int +spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) +{ + vdev_t *rvd = spa->spa_root_vdev; - spa->spa_autoreplace = (autoreplace != 0); + if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { + boolean_t missing = spa_check_logs(spa); + if (missing) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + spa_load_failed(spa, "spa_check_logs failed"); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); + } } + return (0); +} + +static int +spa_ld_verify_pool_data(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. + */ + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + error = spa_load_verify(spa); + if (error != 0) { + spa_load_failed(spa, "spa_load_verify failed " + "[error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + } + + return (0); +} + +static void +spa_ld_claim_log_blocks(spa_t *spa) +{ + dmu_tx_t *tx; + dsl_pool_t *dp = spa_get_dsl(spa); + + /* + * Claim log blocks that haven't been committed yet. + * This must all happen in a single txg. + * Note: spa_claim_max_txg is updated by spa_claim_notify(), + * invoked from zil_claim_log_block()'s i/o done callback. + * Price of rollback is that we abandon the log. + */ + spa->spa_claiming = B_TRUE; + + tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); + (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + zil_claim, tx, DS_FIND_CHILDREN); + dmu_tx_commit(tx); + + spa->spa_claiming = B_FALSE; + + spa_set_log_state(spa, SPA_LOG_GOOD); +} + +static void +spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) +{ + vdev_t *rvd = spa->spa_root_vdev; + int need_update = B_FALSE; + + /* + * If the config cache is stale, or we have uninitialized + * metaslabs (see spa_vdev_add()), then update the config. + * + * If this is a verbatim import, trust the current + * in-core spa_config and update the disk labels. + */ + if (config_cache_txg != spa->spa_config_txg || + spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) + need_update = B_TRUE; + + for (int c = 0; c < rvd->vdev_children; c++) + if (rvd->vdev_child[c]->vdev_ms_array == 0) + need_update = B_TRUE; + + /* + * Update the config cache asynchronously in case we're the + * root pool, in which case the config cache isn't writable yet. + */ + if (need_update) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + +/* + * Load an existing storage pool, using the config provided. 
This config + * describes which vdevs are part of the pool and is later validated against + * partial configs present in each vdev's label and an entire copy of the + * config stored in the MOS. + */ +static int +spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, + char **ereport) +{ + int error = 0; + uint64_t config_cache_txg = spa->spa_config_txg; + int orig_mode = spa->spa_mode; + boolean_t missing_feat_write = B_FALSE; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa->spa_load_state = state; + spa_load_note(spa, "LOADING"); + + /* + * If this is an untrusted config, first access the pool in read-only + * mode. We will then retrieve a trusted copy of the config from the MOS + * and use it to reopen the pool in read-write mode. + */ + if (!trust_config) + spa->spa_mode = FREAD; + + /* + * Parse the config provided to create a vdev tree. + */ + error = spa_ld_parse_config(spa, pool_guid, config, type); + if (error != 0) + return (error); + + /* + * Now that we have the vdev tree, try to open each vdev. This involves + * opening the underlying physical device, retrieving its geometry and + * probing the vdev with a dummy I/O. The state of each vdev will be set + * based on the success of those operations. After this we'll be ready + * to read from the vdevs. + */ + error = spa_ld_open_vdevs(spa); + if (error != 0) + return (error); + + /* + * Read the label of each vdev and make sure that the GUIDs stored + * there match the GUIDs in the config provided. + */ + error = spa_ld_validate_vdevs(spa, type, trust_config); + if (error != 0) + return (error); + + /* + * Read vdev labels to find the best uberblock (i.e. latest, unless + * spa_load_max_txg is set) and store it in spa_uberblock. We get the + * list of features required to read blkptrs in the MOS from the vdev + * label with the best uberblock and verify that our version of zfs + * supports them all. + */ + error = spa_ld_select_uberblock(spa, config, type, trust_config); + if (error != 0) + return (error); + + /* + * Pass that uberblock to the dsl_pool layer which will open the root + * blkptr. This blkptr points to the latest version of the MOS and will + * allow us to read its contents. + */ + error = spa_ld_open_rootbp(spa); + if (error != 0) + return (error); + + /* + * Retrieve the config stored in the MOS and use it to validate the + * config provided. Also extract some information from the MOS config + * to update our vdev tree. + */ + error = spa_ld_validate_config(spa, type); + if (error != 0) + return (error); + + /* + * Retrieve the mapping of indirect vdevs. Those vdevs were removed + * from the pool and their contents were re-mapped to other vdevs. Note + * that everything that we read before this step must have been + * rewritten on concrete vdevs after the last device removal was + * initiated. Otherwise we could be reading from indirect vdevs before + * we have loaded their mappings. + */ + error = spa_ld_open_indirect_vdev_metadata(spa); + if (error != 0) + return (error); + /* - * If the 'autoreplace' property is set, then post a resource notifying - * the ZFS DE that it should not issue any faults for unopenable - * devices. We also iterate over the vdevs, and post a sysevent for any - * unopenable vdevs so that the normal autoreplace handler can take - * over. + * Retrieve the full list of active features from the MOS and check if + * they are all supported. 
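+ * Unsupported features that are marked read-only compatible do not + * fail the load here; they are reported via missing_feat_write and + * only prevent opening the pool read-write.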
*/ - if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { - spa_check_removed(spa->spa_root_vdev); - /* - * For the import case, this is done in spa_import(), because - * at this point we're using the spare definitions from - * the MOS config, not necessarily from the userland config. - */ - if (state != SPA_LOAD_IMPORT) { - spa_aux_check_removed(&spa->spa_spares); - spa_aux_check_removed(&spa->spa_l2cache); - } - } + error = spa_ld_check_features(spa, &missing_feat_write); + if (error != 0) + return (error); /* - * Load the vdev state for all toplevel vdevs. + * Load several special directories from the MOS needed by the dsl_pool + * layer. */ - vdev_load(rvd); + error = spa_ld_load_special_directories(spa); + if (error != 0) + return (error); /* - * Propagate the leaf DTLs we just loaded all the way up the tree. + * If the config provided is not trusted, discard it and use the config + * from the MOS to reload the pool. */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); - spa_config_exit(spa, SCL_ALL, FTAG); + if (!trust_config) { + error = spa_ld_prepare_for_reload(spa, orig_mode); + if (error != 0) + return (error); + + spa_load_note(spa, "RELOADING"); + return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); + } /* - * Load the DDTs (dedup tables). + * Retrieve pool properties from the MOS. */ - error = ddt_load(spa); + error = spa_ld_get_props(spa); if (error != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - spa_update_dspace(spa); + return (error); /* - * Validate the config, using the MOS config to fill in any - * information which might be missing. If we fail to validate - * the config then declare the pool unfit for use. If we're - * assembling a pool from a split, the log is not transferred - * over. + * Retrieve the list of auxiliary devices - cache devices and spares - + * and open them. */ - if (type != SPA_IMPORT_ASSEMBLE) { - nvlist_t *nvconfig; - - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = spa_ld_open_aux_vdevs(spa, type); + if (error != 0) + return (error); - if (!spa_config_valid(spa, nvconfig)) { - nvlist_free(nvconfig); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, - ENXIO)); - } - nvlist_free(nvconfig); + /* + * Load the metadata for all vdevs. Also check if unopenable devices + * should be autoreplaced. + */ + error = spa_ld_load_vdev_metadata(spa); + if (error != 0) + return (error); - /* - * Now that we've validated the config, check the state of the - * root vdev. If it can't be opened, it indicates one or - * more toplevel vdevs are faulted. - */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (SET_ERROR(ENXIO)); + error = spa_ld_load_dedup_tables(spa); + if (error != 0) + return (error); - if (spa_writeable(spa) && spa_check_logs(spa)) { - *ereport = FM_EREPORT_ZFS_LOG_REPLAY; - return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); - } - } + /* + * Verify the logs now to make sure we don't have any unexpected errors + * when we claim log blocks later. + */ + error = spa_ld_verify_logs(spa, type, ereport); + if (error != 0) + return (error); if (missing_feat_write) { ASSERT(state == SPA_LOAD_TRYIMPORT); @@ -2749,93 +3684,75 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * read-only mode but not read-write mode. We now have enough * information and can return to userland. 
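* spa_ld_check_features() has already recorded the unsupported * features in spa_load_info (ZPOOL_CONFIG_UNSUP_FEAT), so userland * can report exactly which features are missing.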
*/ - return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); + return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); } /* - * We've successfully opened the pool, verify that we're ready - * to start pushing transactions. + * Traverse the last txgs to make sure the pool was left off in a safe + * state. When performing an extreme rewind, we verify the whole pool, + * which can take a very long time. */ - if (state != SPA_LOAD_TRYIMPORT) { - if ((error = spa_load_verify(spa))) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } + error = spa_ld_verify_pool_data(spa); + if (error != 0) + return (error); + /* + * Calculate the deflated space for the pool. This must be done before + * we write anything to the pool because we'd need to update the space + * accounting using the deflated sizes. + */ + spa_update_dspace(spa); + + /* + * We have now retrieved all the information we needed to open the + * pool. If we are importing the pool in read-write mode, a few + * additional steps must be performed to finish the import. + */ if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { - dmu_tx_t *tx; - int need_update = B_FALSE; - dsl_pool_t *dp = spa_get_dsl(spa); - int c; - ASSERT(state != SPA_LOAD_TRYIMPORT); /* - * Claim log blocks that haven't been committed yet. - * This must all happen in a single txg. - * Note: spa_claim_max_txg is updated by spa_claim_notify(), - * invoked from zil_claim_log_block()'s i/o done callback. - * Price of rollback is that we abandon the log. + * Traverse the ZIL and claim all blocks. */ - spa->spa_claiming = B_TRUE; - - tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); - (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - zil_claim, tx, DS_FIND_CHILDREN); - dmu_tx_commit(tx); - - spa->spa_claiming = B_FALSE; + spa_ld_claim_log_blocks(spa); - spa_set_log_state(spa, SPA_LOG_GOOD); + /* + * Kick off the syncing thread. + */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); + mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg - * will have been set for us by either zil_check_log_chain() - * (invoked from spa_check_logs()) or zil_claim() above. + * will have been set for us by ZIL traversal operations + * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* - * If the config cache is stale, or we have uninitialized - * metaslabs (see spa_vdev_add()), then update the config. - * - * If this is a verbatim import, trust the current - * in-core spa_config and update the disk labels. - */ - if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || - state == SPA_LOAD_RECOVER || - (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) - need_update = B_TRUE; - - for (c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_ms_array == 0) - need_update = B_TRUE; - - /* - * Update the config cache asychronously in case we're the - * root pool, in which case the config cache isn't writable yet. + * Check if we need to request an update of the config. On the + * next sync, we will update the config stored in vdev labels + * and the cachefile (by default /etc/zfs/zpool.cache). */ - if (need_update) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + spa_ld_check_for_config_update(spa, config_cache_txg); /* * Check all DTLs to see if anything needs resilvering. 
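* The resilver itself is requested through the async framework * (SPA_ASYNC_RESILVER below) so that opening the pool is not held * up behind it.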
*/ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(rvd, NULL, NULL)) + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ - spa_history_log_version(spa, "open"); + spa_history_log_version(spa, "open", NULL); /* * Delete any inconsistent datasets. @@ -2847,13 +3764,19 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); + + spa_restart_removal(spa); + + spa_spawn_aux_threads(spa); } + spa_load_note(spa, "LOADED"); + return (0); } static int -spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) +spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config) { int mode = spa->spa_mode; @@ -2865,7 +3788,10 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) spa_activate(spa, mode); spa_async_suspend(spa); - return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); + spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", + (u_longlong_t)spa->spa_load_max_txg); + + return (spa_load(spa, state, SPA_IMPORT_EXISTING, trust_config)); } /* @@ -2876,7 +3802,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) * spa_load(). */ static int -spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, +spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; @@ -2895,7 +3821,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, - mosconfig); + trust_config); if (load_error == 0) return (0); @@ -2936,7 +3862,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; - rewind_error = spa_load_retry(spa, state, mosconfig); + rewind_error = spa_load_retry(spa, state, trust_config); } spa->spa_extreme_rewind = B_FALSE; @@ -2944,6 +3870,8 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); + else + nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); @@ -2991,7 +3919,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. 
*/ - if (mutex_owner(&spa_namespace_lock) != curthread) { + if (MUTEX_NOT_HELD(&spa_namespace_lock)) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } @@ -3017,6 +3945,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, policy.zrp_request); @@ -3030,7 +3959,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, */ spa_unload(spa); spa_deactivate(spa); - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); @@ -3081,10 +4010,8 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, mutex_exit(&spa_namespace_lock); } -#ifdef _KERNEL if (firstopen) - zvol_create_minors(spa->spa_name); -#endif + zvol_create_minors(spa, spa_name(spa), B_TRUE); *spapp = spa; @@ -3232,6 +4159,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); + vdev_config_generate_stats(vd, l2cache[i]); + } } } @@ -3350,10 +4279,14 @@ spa_get_stats(const char *name, nvlist_t **config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); - if (spa_suspended(spa)) + if (spa_suspended(spa)) { VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED_REASON, + spa->spa_suspended) == 0); + } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); @@ -3440,18 +4373,6 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, goto out; } - /* - * The L2ARC currently only supports disk devices in - * kernel context. For user-level testing, we allow it. - */ -#ifdef _KERNEL - if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && - strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { - error = SET_ERROR(ENOTBLK); - vdev_free(vd); - goto out; - } -#endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && @@ -3505,7 +4426,7 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, nvlist_t **newdevs; /* - * Generate new dev list by concatentating with the + * Generate new dev list by concatenating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, @@ -3561,12 +4482,28 @@ spa_l2cache_drop(spa_t *spa) } } +/* + * Verify encryption parameters for spa creation. If we are encrypting, we must + * have the encryption feature flag enabled. 
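+ * ZIO_CRYPT_OFF and ZIO_CRYPT_INHERIT are exempt because neither + * actually enables encryption on the new pool.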
+ */ +static int +spa_create_check_encryption_params(dsl_crypto_params_t *dcp, + boolean_t has_encryption) +{ + if (dcp->cp_crypt != ZIO_CRYPT_OFF && + dcp->cp_crypt != ZIO_CRYPT_INHERIT && + !has_encryption) + return (SET_ERROR(ENOTSUP)); + + return (dmu_objset_create_crypt_check(NULL, dcp)); +} + /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - nvlist_t *zplprops) + nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; char *altroot = NULL; @@ -3577,10 +4514,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj; + uint64_t version, obj, root_dsobj = 0; boolean_t has_features; - nvpair_t *elem; - int c, i; + boolean_t has_encryption; + spa_feature_t feat; + char *feat_name; char *poolname; nvlist_t *nvl; @@ -3621,10 +4559,28 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; - for (elem = nvlist_next_nvpair(props, NULL); + has_encryption = B_FALSE; + for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { - if (zpool_prop_feature(nvpair_name(elem))) + if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; + + feat_name = strchr(nvpair_name(elem), '@') + 1; + VERIFY0(zfeature_lookup_name(feat_name, &feat)); + if (feat == SPA_FEATURE_ENCRYPTION) + has_encryption = B_TRUE; + } + } + + /* verify encryption params, if they were provided */ + if (dcp != NULL) { + error = spa_create_check_encryption_params(dcp, has_encryption); + if (error != 0) { + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } } if (has_features || nvlist_lookup_uint64(props, @@ -3637,13 +4593,17 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; + spa->spa_load_state = SPA_LOAD_CREATE; + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); - for (i = 0; i < max_ncpus; i++) { + for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); @@ -3666,7 +4626,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (c = 0; c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } @@ -3713,8 +4673,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, } spa->spa_is_initializing = B_TRUE; - spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); - spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* @@ -3726,6 +4685,15 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, tx = dmu_tx_create_assigned(dp, txg); + /* + * Create the pool's history object. 
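+ * It must exist before spa_history_log_version() below writes the + * initial "create" entry.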
+ */ + if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) + spa_history_create_obj(spa, tx); + + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); + spa_history_log_version(spa, "create", tx); + /* * Create the pool config object. */ @@ -3739,9 +4707,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } - if (spa_version(spa) >= SPA_VERSION_FEATURES) - spa_feature_create_zap_objects(spa, tx); - if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { @@ -3752,279 +4717,88 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { - cmn_err(CE_PANIC, "failed to add deflate"); - } - } - - /* - * Create the deferred-free bpobj. Turn off compression - * because sync-to-convergence takes longer if the blocksize - * keeps changing. - */ - obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, obj, - ZIO_COMPRESS_OFF, tx); - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, - sizeof (uint64_t), 1, &obj, tx) != 0) { - cmn_err(CE_PANIC, "failed to add bpobj"); - } - VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, - spa->spa_meta_objset, obj)); - - /* - * Create the pool's history object. - */ - if (version >= SPA_VERSION_ZPOOL_HISTORY) - spa_history_create_obj(spa, tx); - - /* - * Set pool properties. - */ - spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); - spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); - - if (props != NULL) { - spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(props, tx); - } - - dmu_tx_commit(tx); - - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); - - /* - * We explicitly wait for the first transaction to complete so that our - * bean counters are appropriately updated. - */ - txg_wait_synced(spa->spa_dsl_pool, txg); - - spa_config_sync(spa, B_FALSE, B_TRUE); - - spa_history_log_version(spa, "create"); - - /* - * Don't count references from objsets that are already closed - * and are making their way through the eviction process. - */ - spa_evicting_os_wait(spa); - spa->spa_minref = refcount_count(&spa->spa_refcount); - - mutex_exit(&spa_namespace_lock); - - return (0); -} - -#ifdef _KERNEL -/* - * Get the root pool information from the root disk, then import the root pool - * during the system boot up time. - */ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - -static nvlist_t * -spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) -{ - nvlist_t *config; - nvlist_t *nvtop, *nvroot; - uint64_t pgid; - - if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) - return (NULL); - - /* - * Add this top-level vdev to the child array. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pgid) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); - - /* - * Put this pool's top-level vdevs into a root vdev. 
- */ - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &nvtop, 1) == 0); - - /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). - */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - nvlist_free(nvroot); - return (config); -} - -/* - * Walk the vdev tree and see if we can find a device with "better" - * configuration. A configuration is "better" if the label on that - * device has a more recent txg. - */ -static void -spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) -{ - int c; - - for (c = 0; c < vd->vdev_children; c++) - spa_alt_rootvdev(vd->vdev_child[c], avd, txg); - - if (vd->vdev_ops->vdev_op_leaf) { - nvlist_t *label; - uint64_t label_txg; - - if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, - &label) != 0) - return; - - VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, - &label_txg) == 0); - - /* - * Do we have a better boot device? - */ - if (label_txg > *txg) { - *txg = label_txg; - *avd = vd; - } - nvlist_free(label); - } -} - -/* - * Import a root pool. - * - * For x86. devpath_list will consist of devid and/or physpath name of - * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). - * The GRUB "findroot" command will return the vdev we should boot. - * - * For Sparc, devpath_list consists the physpath name of the booting device - * no matter the rootpool is a single device pool or a mirrored pool. - * e.g. - * "/pci@1f,0/ide@d/disk@0,0:a" - */ -int -spa_import_rootpool(char *devpath, char *devid) -{ - spa_t *spa; - vdev_t *rvd, *bvd, *avd = NULL; - nvlist_t *config, *nvtop; - uint64_t guid, txg; - char *pname; - int error; - - /* - * Read the label from the boot device and generate a configuration. - */ - config = spa_generate_rootconf(devpath, devid, &guid); -#if defined(_OBP) && defined(_KERNEL) - if (config == NULL) { - if (strstr(devpath, "/iscsi/ssd") != NULL) { - /* iscsi boot */ - get_iscsi_bootpath_phy(devpath); - config = spa_generate_rootconf(devpath, devid, &guid); - } - } -#endif - if (config == NULL) { - cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", - devpath); - return (SET_ERROR(EIO)); - } - - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &pname) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pname)) != NULL) { - /* - * Remove the existing root pool from the namespace so that we - * can replace it with the correct config we just read in. - */ - spa_remove(spa); + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } } - spa = spa_add(pname, config, NULL); - spa->spa_is_root = B_TRUE; - spa->spa_import_flags = ZFS_IMPORT_VERBATIM; - /* - * Build up a vdev tree based on the boot device's label config. + * Create the deferred-free bpobj. Turn off compression + * because sync-to-convergence takes longer if the blocksize + * keeps changing. 
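+ * (The 1 << 14 passed to bpobj_alloc() below is the object's 16K + * block size.)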
*/ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, - VDEV_ALLOC_ROOTPOOL); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", - pname); - return (error); + obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, obj, + ZIO_COMPRESS_OFF, tx); + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, + sizeof (uint64_t), 1, &obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bpobj"); } + VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, + spa->spa_meta_objset, obj)); /* - * Get the boot vdev. + * Generate some random noise for salted checksums to operate on. */ - if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { - cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", - (u_longlong_t)guid); - error = SET_ERROR(ENOENT); - goto out; - } + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); /* - * Determine if there is a better boot device. + * Set pool properties. */ - avd = bvd; - spa_alt_rootvdev(rvd, &avd, &txg); - if (avd != bvd) { - cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " - "try booting from '%s'", avd->vdev_path); - error = SET_ERROR(EINVAL); - goto out; + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); + spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); + + if (props != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_sync_props(props, tx); } + dmu_tx_commit(tx); + /* - * If the boot device is part of a spare vdev then ensure that - * we're booting off the active spare. + * If the root dataset is encrypted we will need to create key mappings + * for the zio layer before we start to write any data to disk and hold + * them until after the first txg has been synced. Waiting for the first + * transaction to complete also ensures that our bean counters are + * appropriately updated. */ - if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && - !bvd->vdev_isspare) { - cmn_err(CE_NOTE, "The boot device is currently spared. Please " - "try booting from '%s'", - bvd->vdev_parent-> - vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); - error = SET_ERROR(EINVAL); - goto out; + if (dp->dp_root_dir->dd_crypto_obj != 0) { + root_dsobj = dsl_dir_phys(dp->dp_root_dir)->dd_head_dataset_obj; + VERIFY0(spa_keystore_create_mapping_impl(spa, root_dsobj, + dp->dp_root_dir, FTAG)); } - error = 0; -out: - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_free(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_sync_on = B_TRUE; + txg_sync_start(dp); + mmp_thread_start(spa); + txg_wait_synced(dp, txg); + + if (dp->dp_root_dir->dd_crypto_obj != 0) + VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG)); + + spa_spawn_aux_threads(spa); + + spa_write_cachefile(spa, B_FALSE, B_TRUE); + + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. 
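+ * The count captured in spa_minref becomes the baseline that + * spa_refcount_zero() compares against to decide when the pool's + * last external hold has been dropped.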
+ */ + spa_evicting_os_wait(spa); + spa->spa_minref = refcount_count(&spa->spa_refcount); + spa->spa_load_state = SPA_LOAD_NONE; + mutex_exit(&spa_namespace_lock); - nvlist_free(config); - return (error); + return (0); } -#endif - /* * Import a non-root pool into the system. */ @@ -4071,8 +4845,9 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (props != NULL) spa_configfile_set(spa, props, B_FALSE); - spa_config_sync(spa, B_FALSE, B_TRUE); - + spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); + zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } @@ -4089,13 +4864,15 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) state = SPA_LOAD_RECOVER; /* - * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig + * Pass off the heavy lifting to spa_load(). Pass TRUE for trust_config * because the user-supplied config is actually the one to trust when * doing an import. */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + zfs_dbgmsg("spa_import: importing %s%s", pool, + (state == SPA_LOAD_RECOVER) ? " (RECOVERY MODE)" : ""); error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); @@ -4124,12 +4901,6 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_SPARE); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) @@ -4202,12 +4973,13 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); - mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, "import"); + spa_history_log_version(spa, "import", NULL); -#ifdef _KERNEL - zvol_create_minors(pool); -#endif + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); + + zvol_create_minors(spa, pool, B_TRUE); + + mutex_exit(&spa_namespace_lock); return (0); } @@ -4234,9 +5006,11 @@ spa_tryimport(nvlist_t *tryconfig) spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); + zfs_dbgmsg("spa_tryimport: importing %s", poolname); + /* * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig because the user-supplied config + * Pass TRUE for trust_config because the user-supplied config * is actually the one to trust when doing an import. 
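* A TRYIMPORT load stops short of claiming log blocks or starting * the sync thread; the spa is torn down again once a config has * been generated for userland.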
*/ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); @@ -4342,6 +5116,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); + if (spa->spa_zvol_taskq) { + zvol_remove_minors(spa, spa_name(spa), B_TRUE); + taskq_wait(spa->spa_zvol_taskq); + } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); @@ -4400,7 +5178,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, } export_spa: - spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_DESTROY); + if (new_state == POOL_STATE_DESTROYED) + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); + else if (new_state == POOL_STATE_EXPORTED) + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); @@ -4412,7 +5193,7 @@ export_spa: if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); @@ -4470,7 +5251,6 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - int c; ASSERT(spa_writeable(spa)); @@ -4505,9 +5285,41 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, error)); /* - * Transfer each new top-level vdev from vd to rvd. + * If we are in the middle of a device removal, we can only add + * devices which match the existing devices in the pool. + * If we are in the middle of a removal, or have some indirect + * vdevs, we cannot add raidz toplevels. */ - for (c = 0; c < vd->vdev_children; c++) { + if (spa->spa_vdev_removal != NULL || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (spa->spa_vdev_removal != NULL && + tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* Fail if top level vdev is raidz */ + if (tvd->vdev_ops == &vdev_raidz_ops) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* + * Need the top level mirror to be + * a mirror of leaf vdevs only + */ + if (tvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < tvd->vdev_children; cid++) { + vdev_t *cvd = tvd->vdev_child[cid]; + if (!cvd->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, vd, + txg, EINVAL)); + } + } + } + } + } + + for (int c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. 
@@ -4556,6 +5368,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); @@ -4578,12 +5391,12 @@ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, dtl_max_txg; + ASSERTV(vdev_t *rvd = spa->spa_root_vdev); vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; - ASSERTV(vdev_t *rvd = spa->spa_root_vdev); ASSERT(spa_writeable(spa)); @@ -4591,6 +5404,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (spa->spa_vdev_removal != NULL) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -4713,6 +5529,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(pvd); + tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); @@ -4731,7 +5552,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (newvd->vdev_isspare) { spa_spare_activate(newvd); - spa_event_notify(spa, newvd, FM_EREPORT_ZFS_DEVICE_SPARE); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); @@ -4750,6 +5571,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + if (spa->spa_bootfs) + spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); + + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); + /* * Commit the config */ @@ -4764,9 +5590,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) spa_strfree(oldvdpath); spa_strfree(newvdpath); - if (spa->spa_bootfs) - spa_event_notify(spa, newvd, FM_EREPORT_ZFS_BOOTFS_VDEV_ATTACH); - return (0); } @@ -4781,12 +5604,12 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; + ASSERTV(vdev_t *rvd = spa->spa_root_vdev); vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; - int c, t; - ASSERTV(vdev_t *rvd = spa->spa_root_vdev); + ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); @@ -4853,7 +5676,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); - for (c = 0; c < pvd->vdev_children; c++) { + for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) @@ -4959,13 +5782,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ - vdpath = spa_strdup(vd->vdev_path); - for (t = 0; t < TXG_SIZE; t++) + vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); + for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); - spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_REMOVE); + spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); @@ -5034,7 +5857,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - error = spa_offline_log(spa); + error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) @@ -5062,7 +5885,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ - if (vd->vdev_islog || vd->vdev_ishole) { + if (vd->vdev_islog || !vdev_is_concrete(vd)) { if (lastlog == 0) lastlog = c; continue; @@ -5115,7 +5938,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || - vml[c]->vdev_ishole || + !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || @@ -5140,6 +5963,16 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); + + /* transfer per-vdev ZAPs */ + ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); + + ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_TOP_ZAP, + vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { @@ -5181,11 +6014,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); + VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); + newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); @@ -5195,342 +6030,101 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); - spa_activate(newspa, spa_mode_global); - spa_async_suspend(newspa); - - /* create the new pool from the disks of the original pool */ - error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); - if (error) - goto out; - - /* if that worked, generate a real config for the new pool */ - if (newspa->spa_root_vdev != NULL) { - VERIFY(nvlist_alloc(&newspa->spa_config_splitting, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, - ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); - spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, - B_TRUE)); - } - - /* set the props */ - if (props != NULL) { - spa_configfile_set(newspa, props, B_FALSE); - error = spa_prop_set(newspa, props); - if (error) - goto out; - } - - /* flush everything */ - txg = spa_vdev_config_enter(newspa); - 
vdev_config_dirty(newspa->spa_root_vdev); - (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 2); - - spa_async_resume(newspa); - - /* finally, update the original pool's config */ - txg = spa_vdev_config_enter(spa); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) - dmu_tx_abort(tx); - for (c = 0; c < children; c++) { - if (vml[c] != NULL) { - vdev_split(vml[c]); - if (error == 0) - spa_history_log_internal(spa, "detach", tx, - "vdev=%s", vml[c]->vdev_path); - vdev_free(vml[c]); - } - } - vdev_config_dirty(spa->spa_root_vdev); - spa->spa_config_splitting = NULL; - nvlist_free(nvl); - if (error == 0) - dmu_tx_commit(tx); - (void) spa_vdev_exit(spa, NULL, txg, 0); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 3); - - /* split is complete; log a history record */ - spa_history_log_internal(newspa, "split", NULL, - "from pool %s", spa_name(spa)); - - kmem_free(vml, children * sizeof (vdev_t *)); - - /* if we're not going to mount the filesystems in userland, export */ - if (exp) - error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, - B_FALSE, B_FALSE); - - return (error); - -out: - spa_unload(newspa); - spa_deactivate(newspa); - spa_remove(newspa); - - txg = spa_vdev_config_enter(spa); - - /* re-online all offlined disks */ - for (c = 0; c < children; c++) { - if (vml[c] != NULL) - vml[c]->vdev_offline = B_FALSE; - } - vdev_reopen(spa->spa_root_vdev); - - nvlist_free(spa->spa_config_splitting); - spa->spa_config_splitting = NULL; - (void) spa_vdev_exit(spa, NULL, txg, error); - - kmem_free(vml, children * sizeof (vdev_t *)); - return (error); -} - -static nvlist_t * -spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) -{ - int i; - - for (i = 0; i < count; i++) { - uint64_t guid; - - VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, - &guid) == 0); - - if (guid == target_guid) - return (nvpp[i]); - } - - return (NULL); -} - -static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) -{ - nvlist_t **newdev = NULL; - int i, j; - - if (count > 1) - newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); - - for (i = 0, j = 0; i < count; i++) { - if (dev[i] == dev_to_remove) - continue; - VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); - } - - VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); - - for (i = 0; i < count - 1; i++) - nvlist_free(newdev[i]); - - if (count > 1) - kmem_free(newdev, (count - 1) * sizeof (void *)); -} - -/* - * Evacuate the device. - */ -static int -spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) -{ - uint64_t txg; - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - ASSERT(vd == vd->vdev_top); - - /* - * Evacuate the device. We don't hold the config lock as writer - * since we need to do I/O but we do keep the - * spa_namespace_lock held. Once this completes the device - * should no longer have any blocks allocated on it. - */ - if (vd->vdev_islog) { - if (vd->vdev_stat.vs_alloc != 0) - error = spa_offline_log(spa); - } else { - error = SET_ERROR(ENOTSUP); - } - - if (error) - return (error); - - /* - * The evacuation succeeded. Remove any remaining MOS metadata - * associated with this vdev, and wait for these changes to sync. 
- */ - ASSERT0(vd->vdev_stat.vs_alloc); - txg = spa_vdev_config_enter(spa); - vd->vdev_removing = B_TRUE; - vdev_dirty_leaves(vd, VDD_DTL, txg); - vdev_config_dirty(vd); - spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - - return (0); -} - -/* - * Complete the removal by cleaning up the namespace. - */ -static void -spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t id = vd->vdev_id; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(vd == vd->vdev_top); - - /* - * Only remove any devices which are empty. - */ - if (vd->vdev_stat.vs_alloc != 0) - return; - - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - if (list_link_active(&vd->vdev_state_dirty_node)) - vdev_state_clean(vd); - if (list_link_active(&vd->vdev_config_dirty_node)) - vdev_config_clean(vd); + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); - vdev_free(vd); + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); + if (error) + goto out; - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); } - vdev_config_dirty(rvd); - /* - * Reassess the health of our root vdev. - */ - vdev_reopen(rvd); -} + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } -/* - * Remove a device from the pool - - * - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. - * - * Currently, this supports removing only hot spares, slogs, and level 2 ARC - * devices. 
- */
-int
-spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
-{
-	vdev_t *vd;
-	metaslab_group_t *mg;
-	nvlist_t **spares, **l2cache, *nv;
-	uint64_t txg = 0;
-	uint_t nspares, nl2cache;
-	int error = 0;
-	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+	/* flush everything */
+	txg = spa_vdev_config_enter(newspa);
+	vdev_config_dirty(newspa->spa_root_vdev);
+	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
-	ASSERT(spa_writeable(spa));
+	if (zio_injection_enabled)
+		zio_handle_panic_injection(spa, FTAG, 2);
 
-	if (!locked)
-		txg = spa_vdev_enter(spa);
+	spa_async_resume(newspa);
 
-	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+	/* finally, update the original pool's config */
+	txg = spa_vdev_config_enter(spa);
+	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error != 0)
+		dmu_tx_abort(tx);
+	for (c = 0; c < children; c++) {
+		if (vml[c] != NULL) {
+			vdev_split(vml[c]);
+			if (error == 0)
+				spa_history_log_internal(spa, "detach", tx,
+				    "vdev=%s", vml[c]->vdev_path);
 
-	if (spa->spa_spares.sav_vdevs != NULL &&
-	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
-	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
-	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
-		/*
-		 * Only remove the hot spare if it's not currently in use
-		 * in this pool.
-		 */
-		if (vd == NULL || unspare) {
-			spa_vdev_remove_aux(spa->spa_spares.sav_config,
-			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
-			spa_load_spares(spa);
-			spa->spa_spares.sav_sync = B_TRUE;
-		} else {
-			error = SET_ERROR(EBUSY);
+			vdev_free(vml[c]);
 		}
-	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
-	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
-	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
-	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
-		/*
-		 * Cache devices can always be removed.
-		 */
-		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
-		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
-		spa_load_l2cache(spa);
-		spa->spa_l2cache.sav_sync = B_TRUE;
-	} else if (vd != NULL && vd->vdev_islog) {
-		ASSERT(!locked);
-		ASSERT(vd == vd->vdev_top);
+	}
+	spa->spa_avz_action = AVZ_ACTION_REBUILD;
+	vdev_config_dirty(spa->spa_root_vdev);
+	spa->spa_config_splitting = NULL;
+	nvlist_free(nvl);
+	if (error == 0)
+		dmu_tx_commit(tx);
+	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
-		mg = vd->vdev_mg;
+	if (zio_injection_enabled)
+		zio_handle_panic_injection(spa, FTAG, 3);
 
-		/*
-		 * Stop allocating from this vdev.
-		 */
-		metaslab_group_passivate(mg);
+	/* split is complete; log a history record */
+	spa_history_log_internal(newspa, "split", NULL,
+	    "from pool %s", spa_name(spa));
 
-		/*
-		 * Wait for the youngest allocations and frees to sync,
-		 * and then wait for the deferral of those frees to finish.
-		 */
-		spa_vdev_config_exit(spa, NULL,
-		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+	kmem_free(vml, children * sizeof (vdev_t *));
 
-		/*
-		 * Attempt to evacuate the vdev.
-		 */
-		error = spa_vdev_remove_evacuate(spa, vd);
+	/* if we're not going to mount the filesystems in userland, export */
+	if (exp)
+		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+		    B_FALSE, B_FALSE);
 
-		txg = spa_vdev_config_enter(spa);
+	return (error);
 
-		/*
-		 * If we couldn't evacuate the vdev, unwind.
-		 */
-		if (error) {
-			metaslab_group_activate(mg);
-			return (spa_vdev_exit(spa, NULL, txg, error));
-		}
+out:
+	spa_unload(newspa);
+	spa_deactivate(newspa);
+	spa_remove(newspa);
 
-		/*
-		 * Clean up the vdev namespace.
-		 */
-		spa_vdev_remove_from_namespace(spa, vd);
+	txg = spa_vdev_config_enter(spa);
 
-	} else if (vd != NULL) {
-		/*
-		 * Normal vdevs cannot be removed (yet).
-		 */
-		error = SET_ERROR(ENOTSUP);
-	} else {
-		/*
-		 * There is no vdev of any kind with the specified guid.
-		 */
-		error = SET_ERROR(ENOENT);
+	/* re-online all offlined disks */
+	for (c = 0; c < children; c++) {
+		if (vml[c] != NULL)
+			vml[c]->vdev_offline = B_FALSE;
 	}
+	vdev_reopen(spa->spa_root_vdev);
 
-	if (!locked)
-		return (spa_vdev_exit(spa, NULL, txg, error));
+	nvlist_free(spa->spa_config_splitting);
+	spa->spa_config_splitting = NULL;
+	(void) spa_vdev_exit(spa, NULL, txg, error);
 
+	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
@@ -5542,9 +6136,8 @@ static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd, *oldvd;
-	int c;
 
-	for (c = 0; c < vd->vdev_children; c++) {
+	for (int c = 0; c < vd->vdev_children; c++) {
 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
@@ -5709,6 +6302,16 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
  * SPA Scanning
  * ==========================================================================
  */
+int
+spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
+{
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+	if (dsl_scan_resilvering(spa->spa_dsl_pool))
+		return (SET_ERROR(EBUSY));
+
+	return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
+}
 
 int
 spa_scan_stop(spa_t *spa)
@@ -5749,8 +6352,6 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
-	int c;
-
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_remove_wanted = B_FALSE;
 		vd->vdev_delayed_close = B_FALSE;
@@ -5769,33 +6370,29 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
 		vdev_state_dirty(vd->vdev_top);
 	}
 
-	for (c = 0; c < vd->vdev_children; c++)
+	for (int c = 0; c < vd->vdev_children; c++)
 		spa_async_remove(spa, vd->vdev_child[c]);
 }
 
 static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
-	int c;
-
 	if (vd->vdev_probe_wanted) {
 		vd->vdev_probe_wanted = B_FALSE;
 		vdev_reopen(vd);	/* vdev_open() does the actual probe */
 	}
 
-	for (c = 0; c < vd->vdev_children; c++)
+	for (int c = 0; c < vd->vdev_children; c++)
 		spa_async_probe(spa, vd->vdev_child[c]);
 }
 
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
-	int c;
-
 	if (!spa->spa_autoexpand)
 		return;
 
-	for (c = 0; c < vd->vdev_children; c++) {
+	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 		spa_async_autoexpand(spa, cvd);
 	}
@@ -5803,13 +6400,14 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
 		return;
 
-	spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_AUTOEXPAND);
+	spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
 }
 
 static void
-spa_async_thread(spa_t *spa)
+spa_async_thread(void *arg)
 {
-	int tasks, i;
+	spa_t *spa = (spa_t *)arg;
+	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
@@ -5847,9 +6445,9 @@ spa_async_thread(spa_t *spa)
 	if (tasks & SPA_ASYNC_REMOVE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
-		for (i = 0; i < spa->spa_l2cache.sav_count; i++)
+		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
-		for (i = 0; i < spa->spa_spares.sav_count; i++)
+		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
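/*
 * Editor's note: a minimal sketch of how a caller might drive the new
 * spa_scrub_pause_resume() entry point added above.  The wrapper name and
 * any ioctl plumbing around it are hypothetical; only the function itself
 * and pool_scrub_cmd_t appear in this diff, and POOL_SCRUB_PAUSE is assumed
 * to be one of that enum's values.
 */
static int
example_pool_scrub_pause(spa_t *spa)
{
	/* EBUSY is returned while a resilver (rather than a scrub) runs. */
	return (spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE));
}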
@@ -5899,6 +6497,12 @@ spa_async_suspend(spa_t *spa)
 	while (spa->spa_async_thread != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
+
+	spa_vdev_remove_suspend(spa);
+
+	zthr_t *condense_thread = spa->spa_condense_zthr;
+	if (condense_thread != NULL && zthr_isrunning(condense_thread))
+		VERIFY0(zthr_cancel(condense_thread));
 }
 
 void
@@ -5908,15 +6512,41 @@ spa_async_resume(spa_t *spa)
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended--;
 	mutex_exit(&spa->spa_async_lock);
+	spa_restart_removal(spa);
+
+	zthr_t *condense_thread = spa->spa_condense_zthr;
+	if (condense_thread != NULL && !zthr_isrunning(condense_thread))
+		zthr_resume(condense_thread);
+}
+
+static boolean_t
+spa_async_tasks_pending(spa_t *spa)
+{
+	uint_t non_config_tasks;
+	uint_t config_task;
+	boolean_t config_task_suspended;
+
+	non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
+	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
+	if (spa->spa_ccw_fail_time == 0) {
+		config_task_suspended = B_FALSE;
+	} else {
+		config_task_suspended =
+		    (gethrtime() - spa->spa_ccw_fail_time) <
+		    ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
+	}
+
+	return (non_config_tasks || (config_task && !config_task_suspended));
 }
 
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
-	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+	if (spa_async_tasks_pending(spa) &&
+	    !spa->spa_async_suspended &&
 	    spa->spa_async_thread == NULL &&
-	    rootdir != NULL && !vn_is_readonly(rootdir))
+	    rootdir != NULL)
 		spa->spa_async_thread = thread_create(NULL, 0,
 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
 	mutex_exit(&spa->spa_async_lock);
@@ -6058,16 +6688,120 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
 	sav->sav_sync = B_FALSE;
 }
 
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+	spa_t *spa = vd->vdev_spa;
+
+	if (vd->vdev_top_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_top_zap, tx));
+	}
+	if (vd->vdev_leaf_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_leaf_zap, tx));
+	}
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		spa_avz_build(vd->vdev_child[i], avz, tx);
+	}
+}
+
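/*
 * Editor's note: worked numbers for the config-write retry gate in
 * spa_async_tasks_pending() above, assuming zfs_ccw_retry_interval is 300
 * seconds and the last cache-file write failed 120 seconds ago:
 *
 *	gethrtime() - spa->spa_ccw_fail_time == 120 * NANOSEC
 *	120 * NANOSEC < 300 * NANOSEC, so config_task_suspended == B_TRUE
 *
 * and a pending SPA_ASYNC_CONFIG_UPDATE by itself will not wake the async
 * thread until the interval has elapsed.
 */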
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *config;
 
-	if (list_is_empty(&spa->spa_config_dirty_list))
+	/*
+	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+	 * its config may not be dirty but we still need to build per-vdev ZAPs.
+	 * Similarly, if the pool is being assembled (e.g. after a split), we
+	 * need to rebuild the AVZ although the config may not be dirty.
+	 */
+	if (list_is_empty(&spa->spa_config_dirty_list) &&
+	    spa->spa_avz_action == AVZ_ACTION_NONE)
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
+	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
+	    spa->spa_all_vdev_zaps != 0);
+
+	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+		/* Make and build the new AVZ */
+		uint64_t new_avz = zap_create(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+		/* Diff old AVZ with new one */
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t vdzap = za.za_first_integer;
+			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+			    vdzap) == ENOENT) {
+				/*
+				 * ZAP is listed in old AVZ but not in new one;
+				 * destroy it
+				 */
+				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
+				    tx));
+			}
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy the old AVZ */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+
+		/* Replace the old AVZ in the dir obj with the new one */
+		VERIFY0(zap_update(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+		    sizeof (new_avz), 1, &new_avz, tx));
+
+		spa->spa_all_vdev_zaps = new_avz;
+	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		/* Walk through the AVZ and destroy all listed ZAPs */
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t zap = za.za_first_integer;
+			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy and unlink the AVZ itself */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+		VERIFY0(zap_remove(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+		spa->spa_all_vdev_zaps = 0;
+	}
+
+	if (spa->spa_all_vdev_zaps == 0) {
+		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_VDEV_ZAP_MAP, tx);
+	}
+	spa->spa_avz_action = AVZ_ACTION_NONE;
+
+	/* Create ZAPs for vdevs that don't have them. */
+	vdev_construct_zaps(spa->spa_root_vdev, tx);
+
 	config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
@@ -6081,8 +6815,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
-	if (spa->spa_config_syncing)
-		nvlist_free(spa->spa_config_syncing);
+	nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = config;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 }
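/*
 * Editor's note: a minimal sketch of walking the all-vdev ZAP rebuilt above,
 * using only the zap_cursor_* calls already shown in this hunk.  The helper
 * name is hypothetical; zap_count(), used later in the ZFS_DEBUG block of
 * spa_sync(), is the one-call equivalent.
 */
static uint64_t
example_avz_entry_count(spa_t *spa)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	uint64_t count = 0;

	/* Each entry names one per-vdev (top-level or leaf) ZAP object. */
	for (zap_cursor_init(&zc, spa->spa_meta_objset,
	    spa->spa_all_vdev_zaps);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc))
		count++;
	zap_cursor_fini(&zc);

	return (count);
}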
@@ -6129,9 +6862,8 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
 		zprop_type_t proptype;
 		spa_feature_t fid;
 
-		prop = zpool_name_to_prop(nvpair_name(elem));
-		switch ((int)prop) {
-		case ZPROP_INVAL:
+		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+		case ZPOOL_PROP_INVAL:
 			/*
 			 * We checked this earlier in spa_prop_validate().
 			 */
@@ -6148,7 +6880,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
 		case ZPOOL_PROP_VERSION:
 			intval = fnvpair_value_uint64(elem);
 			/*
-			 * The version is synced seperatly before other
+			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
@@ -6178,7 +6910,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  It's unnecessary
			 * to do this for pool creation since the vdev's
-			 * configuratoin has already been dirtied.
+			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
@@ -6241,6 +6973,9 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
			spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
			break;
+		case ZPOOL_PROP_MULTIHOST:
+			spa->spa_multihost = intval;
+			break;
		case ZPOOL_PROP_DEDUPDITTO:
			spa->spa_dedup_ditto = intval;
			break;
@@ -6311,9 +7046,56 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
	if (lz4_en && !lz4_ac)
		spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}
+
+	/*
+	 * If we haven't written the salt, do so now.  Note that the
+	 * feature may not be activated yet, but that's fine since
+	 * the presence of this ZAP entry is backwards compatible.
+	 */
+	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+		VERIFY0(zap_add(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes),
+		    spa->spa_cksum_salt.zcs_bytes, tx));
+	}
+
	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
+static void
+vdev_indirect_state_sync_verify(vdev_t *vd)
+{
+	ASSERTV(vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping);
+	ASSERTV(vdev_indirect_births_t *vib = vd->vdev_indirect_births);
+
+	if (vd->vdev_ops == &vdev_indirect_ops) {
+		ASSERT(vim != NULL);
+		ASSERT(vib != NULL);
+	}
+
+	if (vdev_obsolete_sm_object(vd) != 0) {
+		ASSERT(vd->vdev_obsolete_sm != NULL);
+		ASSERT(vd->vdev_removing ||
+		    vd->vdev_ops == &vdev_indirect_ops);
+		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
+		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
+
+		ASSERT3U(vdev_obsolete_sm_object(vd), ==,
+		    space_map_object(vd->vdev_obsolete_sm));
+		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
+		    space_map_allocated(vd->vdev_obsolete_sm));
+	}
+	ASSERT(vd->vdev_obsolete_segments != NULL);
+
+	/*
+	 * Since frees / remaps to an indirect vdev can only
+	 * happen in syncing context, the obsolete segments
+	 * tree must be empty when we start syncing.
+	 */
+	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+}
+
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
@@ -6328,10 +7110,18 @@ spa_sync(spa_t *spa, uint64_t txg)
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;
-	int c;
+	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+	    zfs_vdev_queue_depth_pct / 100;
 
	VERIFY(spa_writeable(spa));
 
+	/*
+	 * Wait for i/os issued in open context that need to complete
+	 * before this txg syncs.
+	 */
+	VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
+	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
+
	/*
	 * Lock out configuration changes.
	 */
@@ -6340,6 +7130,10 @@ spa_sync(spa_t *spa, uint64_t txg)
	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;
 
+	mutex_enter(&spa->spa_alloc_lock);
+	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+	mutex_exit(&spa->spa_alloc_lock);
+
	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
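/*
 * Editor's note: spa_txg_zio[] above is indexed by (txg & TXG_MASK), a small
 * ring keyed by the low bits of the txg.  Assuming the usual TXG_SIZE of 4
 * (so TXG_MASK == 3), txgs 100..103 land in slots 0..3 and txg 104 reuses
 * slot 0; the reuse is safe because the zio_wait() has already drained the
 * zio_root that txg 100 parked in that slot.
 */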
@@ -6367,8 +7161,8 @@ spa_sync(spa_t *spa, uint64_t txg)
	tx = dmu_tx_create_assigned(dp, txg);
 
	spa->spa_sync_starttime = gethrtime();
-	taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
-	spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
+	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
	    NSEC_TO_TICK(spa->spa_deadman_synctime));
 
@@ -6394,18 +7188,45 @@ spa_sync(spa_t *spa, uint64_t txg)
	}
 
	/*
-	 * If anything has changed in this txg, or if someone is waiting
-	 * for this txg to sync (eg, spa_vdev_remove()), push the
-	 * deferred frees from the previous txg.  If not, leave them
-	 * alone so that we don't generate work on an otherwise idle
-	 * system.
+	 * Set the top-level vdev's max queue depth. Evaluate each
+	 * top-level's async write queue depth in case it changed.
+	 * The max queue depth will not change in the middle of syncing
+	 * out this txg.
	 */
-	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
-	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
-	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
-	    ((dsl_scan_active(dp->dp_scan) ||
-	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
-		spa_sync_deferred_frees(spa, tx);
+	uint64_t queue_depth_total = 0;
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
+		    !metaslab_group_initialized(mg))
+			continue;
+
+		/*
+		 * It is safe to do a lock-free check here because only async
+		 * allocations look at mg_max_alloc_queue_depth, and async
+		 * allocations all happen from spa_sync().
+		 */
+		ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+		mg->mg_max_alloc_queue_depth = max_queue_depth;
+		queue_depth_total += mg->mg_max_alloc_queue_depth;
+	}
+	metaslab_class_t *mc = spa_normal_class(spa);
+	ASSERT0(refcount_count(&mc->mc_alloc_slots));
+	mc->mc_alloc_max_slots = queue_depth_total;
+	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+
+	ASSERT3U(mc->mc_alloc_max_slots, <=,
+	    max_queue_depth * rvd->vdev_children);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+		vdev_indirect_state_sync_verify(vd);
+
+		if (vdev_indirect_should_condense(vd)) {
+			spa_condense_indirect_start_sync(vd, tx);
+			break;
+		}
	}
 
	/*
@@ -6425,6 +7246,11 @@ spa_sync(spa_t *spa, uint64_t txg)
		if (pass < zfs_sync_pass_deferred_free) {
			spa_sync_frees(spa, free_bpl, tx);
		} else {
+			/*
+			 * We can not defer frees in pass 1, because
+			 * we sync the deferred frees later in pass 1.
+			 */
+			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    &spa->spa_deferred_bpobj, tx);
		}
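/*
 * Editor's note: worked numbers for the queue-depth setup at the top of
 * spa_sync(), assuming the contemporaneous defaults of
 * zfs_vdev_async_write_max_active == 10 and zfs_vdev_queue_depth_pct == 1000:
 *
 *	max_queue_depth = 10 * 1000 / 100 = 100 slots per top-level vdev
 *
 * so a normal-class pool with 8 top-level vdevs ends up with
 * mc_alloc_max_slots = 8 * 100 = 800 concurrent allocation slots for the
 * DVA throttle.
 */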
@@ -6432,14 +7258,68 @@
		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);
 
-		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)))
+		if (spa->spa_vdev_removal != NULL)
+			svr_sync(spa, tx);
+
+		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+		    != NULL)
			vdev_sync(vd, txg);
 
-		if (pass == 1)
+		if (pass == 1) {
			spa_sync_upgrades(spa, tx);
+			ASSERT3U(txg, >=,
+			    spa->spa_uberblock.ub_rootbp.blk_birth);
+			/*
+			 * Note: We need to check if the MOS is dirty
+			 * because we could have marked the MOS dirty
+			 * without updating the uberblock (e.g. if we
+			 * have sync tasks but no dirty user data).  We
+			 * need to check the uberblock's rootbp because
+			 * it is updated if we have synced out dirty
+			 * data (though in this case the MOS will most
+			 * likely also be dirty due to second order
+			 * effects, we don't want to rely on that here).
+			 */
+			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+			    !dmu_objset_is_dirty(mos, txg)) {
+				/*
+				 * Nothing changed on the first pass,
+				 * therefore this TXG is a no-op.  Avoid
+				 * syncing deferred frees, so that we
+				 * can keep this TXG as a no-op.
+				 */
+				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
+				    txg));
+				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+				break;
+			}
+			spa_sync_deferred_frees(spa, tx);
+		}
 
	} while (dmu_objset_is_dirty(mos, txg));
 
+#ifdef ZFS_DEBUG
+	if (!list_is_empty(&spa->spa_config_dirty_list)) {
+		/*
+		 * Make sure that the number of ZAPs for all the vdevs matches
+		 * the number of ZAPs in the per-vdev ZAP list. This only gets
+		 * called if the config is dirty; otherwise there may be
+		 * outstanding AVZ operations that weren't completed in
+		 * spa_sync_config_object.
+		 */
+		uint64_t all_vdev_zap_entry_count;
+		ASSERT0(zap_count(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+		    all_vdev_zap_entry_count);
+	}
+#endif
+
+	if (spa->spa_vdev_removal != NULL) {
+		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+	}
+
	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
@@ -6462,24 +7342,19 @@
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);
 
-			for (c = 0; c < children; c++) {
+			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
-				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
+				if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+				    !vdev_is_concrete(vd))
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
-			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
-			if (error != 0)
-				error = vdev_config_sync(svd, svdcount, txg,
-				    B_TRUE);
+			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
-			    rvd->vdev_children, txg, B_FALSE);
-			if (error != 0)
-				error = vdev_config_sync(rvd->vdev_child,
-				    rvd->vdev_children, txg, B_TRUE);
+			    rvd->vdev_children, txg);
		}
 
		if (error == 0)
@@ -6489,12 +7364,12 @@
		if (error == 0)
			break;
-		zio_suspend(spa, NULL);
+		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);
 
-	taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
+	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
	spa->spa_deadman_tqid = 0;
 
	/*
@@ -6513,10 +7388,12 @@
		spa->spa_config_syncing = NULL;
	}
 
-	spa->spa_ubsync = spa->spa_uberblock;
-
	dsl_pool_sync_done(dp, txg);
 
+	mutex_enter(&spa->spa_alloc_lock);
+	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+	mutex_exit(&spa->spa_alloc_lock);
+
	/*
	 * Update usable space statistics.
	 */
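/*
 * Editor's note: a minimal sketch of the consumer pattern that the relocated
 * spa_ubsync assignment (next hunk) protects.  The wrapper is hypothetical;
 * txg_wait_synced() is the existing DSL entry point that blocks until
 * spa_last_synced_txg() reaches the requested txg.
 */
static void
example_wait_for_txg(spa_t *spa, uint64_t txg)
{
	/*
	 * Returns only once txg has fully synced, so every side effect of
	 * that txg's spa_sync() pass is visible to the caller.
	 */
	txg_wait_synced(spa_get_dsl(spa), txg);
}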
@@ -6535,6 +7412,13 @@
 
	spa->spa_sync_pass = 0;
 
+	/*
+	 * Update the last synced uberblock here. We want to do this at
+	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
+	 * will be guaranteed that all the processing associated with
+	 * that txg has been completed.
+	 */
+	spa->spa_ubsync = spa->spa_uberblock;
	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
	spa_handle_ignored_writes(spa);
@@ -6698,18 +7582,44 @@ spa_has_active_shared_spare(spa_t *spa)
	return (B_FALSE);
 }
 
+sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+	sysevent_t *ev = NULL;
+#ifdef _KERNEL
+	nvlist_t *resource;
+
+	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
+	if (resource) {
+		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
+		ev->resource = resource;
+	}
+#endif
+	return (ev);
+}
+
+void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+	if (ev) {
+		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
+		kmem_free(ev, sizeof (*ev));
+	}
+#endif
+}
+
 /*
- * Post a FM_EREPORT_ZFS_* event from sys/fm/fs/zfs.h.  The payload will be
+ * Post a zevent corresponding to the given sysevent.  The 'name' must be one
+ * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
  * in the userland libzpool, as we don't want consumers to misinterpret ztest
  * or zdb as real changes.
  */
 void
-spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
-#ifdef _KERNEL
-	zfs_ereport_post(name, spa, vd, NULL, 0, 0);
-#endif
+	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
@@ -6718,7 +7628,6 @@ EXPORT_SYMBOL(spa_open);
 EXPORT_SYMBOL(spa_open_rewind);
 EXPORT_SYMBOL(spa_get_stats);
 EXPORT_SYMBOL(spa_create);
-EXPORT_SYMBOL(spa_import_rootpool);
 EXPORT_SYMBOL(spa_import);
 EXPORT_SYMBOL(spa_tryimport);
 EXPORT_SYMBOL(spa_destroy);
@@ -6736,7 +7645,6 @@ EXPORT_SYMBOL(spa_scan_get_stats);
 EXPORT_SYMBOL(spa_vdev_add);
 EXPORT_SYMBOL(spa_vdev_attach);
 EXPORT_SYMBOL(spa_vdev_detach);
-EXPORT_SYMBOL(spa_vdev_remove);
 EXPORT_SYMBOL(spa_vdev_setpath);
 EXPORT_SYMBOL(spa_vdev_setfru);
 EXPORT_SYMBOL(spa_vdev_split_mirror);
@@ -6783,4 +7691,10 @@ MODULE_PARM_DESC(spa_load_verify_metadata,
 module_param(spa_load_verify_data, int, 0644);
 MODULE_PARM_DESC(spa_load_verify_data,
	"Set to traverse data on pool import");
+
+/* CSTYLED */
+module_param(zio_taskq_batch_pct, uint, 0444);
+MODULE_PARM_DESC(zio_taskq_batch_pct,
+	"Percentage of CPUs to run an IO worker thread");
+
 #endif
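/*
 * Editor's note: on Linux builds the read-only module parameter exported
 * above surfaces under sysfs; a typical inspection from userland would look
 * like the following (the value shown is illustrative, not authoritative):
 *
 *	# cat /sys/module/zfs/parameters/zio_taskq_batch_pct
 *	75
 */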