module/zfs/spa.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
  26  * Copyright (c) 2013, 2014, Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  28  * Copyright 2013 Saso Kiselkov. All rights reserved.
  29  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  30  */
  31
  32 /*
  33  * SPA: Storage Pool Allocator
  34  *
  35  * This file contains all the routines used when modifying on-disk SPA state.
  36  * This includes opening, importing, destroying, exporting a pool, and syncing a
  37  * pool.
  38  */
  39
  40 #include <sys/zfs_context.h>
  41 #include <sys/fm/fs/zfs.h>
  42 #include <sys/spa_impl.h>
  43 #include <sys/zio.h>
  44 #include <sys/zio_checksum.h>
  45 #include <sys/dmu.h>
  46 #include <sys/dmu_tx.h>
  47 #include <sys/zap.h>
  48 #include <sys/zil.h>
  49 #include <sys/ddt.h>
  50 #include <sys/vdev_impl.h>
  51 #include <sys/vdev_disk.h>
  52 #include <sys/metaslab.h>
  53 #include <sys/metaslab_impl.h>
  54 #include <sys/uberblock_impl.h>
  55 #include <sys/txg.h>
  56 #include <sys/avl.h>
  57 #include <sys/dmu_traverse.h>
  58 #include <sys/dmu_objset.h>
  59 #include <sys/unique.h>
  60 #include <sys/dsl_pool.h>
  61 #include <sys/dsl_dataset.h>
  62 #include <sys/dsl_dir.h>
  63 #include <sys/dsl_prop.h>
  64 #include <sys/dsl_synctask.h>
  65 #include <sys/fs/zfs.h>
  66 #include <sys/arc.h>
  67 #include <sys/callb.h>
  68 #include <sys/systeminfo.h>
  69 #include <sys/spa_boot.h>
  70 #include <sys/zfs_ioctl.h>
  71 #include <sys/dsl_scan.h>
  72 #include <sys/zfeature.h>
  73 #include <sys/dsl_destroy.h>
  74 #include <sys/zvol.h>
  75
  76 #ifdef  _KERNEL
  77 #include <sys/bootprops.h>
  78 #include <sys/callb.h>
  79 #include <sys/cpupart.h>
  80 #include <sys/pool.h>
  81 #include <sys/sysdc.h>
  82 #include <sys/zone.h>
  83 #endif  /* _KERNEL */
  84
  85 #include "zfs_prop.h"
  86 #include "zfs_comutil.h"
  87
  88 /*
  89  * The interval, in seconds, at which failed configuration cache file writes
  90  * should be retried.
  91  */
  92 static int zfs_ccw_retry_interval = 300;
  93
  94 typedef enum zti_modes {
  95         ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
  96         ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
  97         ZTI_MODE_NULL,                  /* don't create a taskq */
  98         ZTI_NMODES
  99 } zti_modes_t;
 100
 101 #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
 102 #define ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
 103 #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
 104 #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
 105
 106 #define ZTI_N(n)        ZTI_P(n, 1)
 107 #define ZTI_ONE         ZTI_N(1)
 108
 109 typedef struct zio_taskq_info {
 110         zti_modes_t zti_mode;
 111         uint_t zti_value;
 112         uint_t zti_count;
 113 } zio_taskq_info_t;
 114
 115 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 116         "iss", "iss_h", "int", "int_h"
 117 };
 118
 119 /*
 120  * This table defines the taskq settings for each ZFS I/O type. When
 121  * initializing a pool, we use this table to create an appropriately sized
 122  * taskq. Some operations are low volume and therefore have a small, static
 123  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 124  * macros. Other operations process a large amount of data; the ZTI_BATCH
 125  * macro causes us to create a taskq oriented for throughput. Some operations
 126  * are so high frequency and short-lived that the taskq itself can become a a
 127  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 128  * additional degree of parallelism specified by the number of threads per-
 129  * taskq and the number of taskqs; when dispatching an event in this case, the
 130  * particular taskq is chosen at random.
 131  *
 132  * The different taskq priorities are to handle the different contexts (issue
 133  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 134  * need to be handled with minimum delay.
 135  */
 136 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 137         /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 138         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
 139         { ZTI_N(8),     ZTI_NULL,       ZTI_P(12, 8),   ZTI_NULL }, /* READ */
 140         { ZTI_BATCH,    ZTI_N(5),       ZTI_P(12, 8),   ZTI_N(5) }, /* WRITE */
 141         { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
 142         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
 143         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
 144 };
 145
 146 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 147 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 148 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 149 static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 150     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 151     char **ereport);
 152 static void spa_vdev_resilver_done(spa_t *spa);
 153
 154 uint_t          zio_taskq_batch_pct = 75;       /* 1 thread per cpu in pset */
 155 id_t            zio_taskq_psrset_bind = PS_NONE;
 156 boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 157 uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 158
 159 boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 160
 161 /*
 162  * This (illegal) pool name is used when temporarily importing a spa_t in order
 163  * to get the vdev stats associated with the imported devices.
 164  */
 165 #define TRYIMPORT_NAME  "$import"
 166
 167 /*
 168  * ==========================================================================
 169  * SPA properties routines
 170  * ==========================================================================
 171  */
 172
 173 /*
 174  * Add a (source=src, propname=propval) list to an nvlist.
 175  */
 176 static void
 177 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
 178     uint64_t intval, zprop_source_t src)
 179 {
 180         const char *propname = zpool_prop_to_name(prop);
 181         nvlist_t *propval;
 182
 183         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 184         VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 185
 186         if (strval != NULL)
 187                 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 188         else
 189                 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
 190
 191         VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 192         nvlist_free(propval);
 193 }
 194
 195 /*
 196  * Get property values from the spa configuration.
 197  */
 198 static void
 199 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 200 {
 201         vdev_t *rvd = spa->spa_root_vdev;
 202         dsl_pool_t *pool = spa->spa_dsl_pool;
 203         uint64_t size, alloc, cap, version;
 204         const zprop_source_t src = ZPROP_SRC_NONE;
 205         spa_config_dirent_t *dp;
 206         metaslab_class_t *mc = spa_normal_class(spa);
 207
 208         ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 209
 210         if (rvd != NULL) {
 211                 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 212                 size = metaslab_class_get_space(spa_normal_class(spa));
 213                 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 214                 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 215                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 216                 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 217                     size - alloc, src);
 218
 219                 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
 220                     metaslab_class_fragmentation(mc), src);
 221                 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
 222                     metaslab_class_expandable_space(mc), src);
 223                 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 224                     (spa_mode(spa) == FREAD), src);
 225
 226                 cap = (size == 0) ? 0 : (alloc * 100 / size);
 227                 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 228
 229                 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 230                     ddt_get_pool_dedup_ratio(spa), src);
 231
 232                 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 233                     rvd->vdev_state, src);
 234
 235                 version = spa_version(spa);
 236                 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
 237                         spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
 238                             version, ZPROP_SRC_DEFAULT);
 239                 } else {
 240                         spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
 241                             version, ZPROP_SRC_LOCAL);
 242                 }
 243         }
 244
 245         if (pool != NULL) {
 246                 /*
 247                  * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 248                  * when opening pools before this version freedir will be NULL.
 249                  */
 250                 if (pool->dp_free_dir != NULL) {
 251                         spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 252                             dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
 253                             src);
 254                 } else {
 255                         spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 256                             NULL, 0, src);
 257                 }
 258
 259                 if (pool->dp_leak_dir != NULL) {
 260                         spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
 261                             dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
 262                             src);
 263                 } else {
 264                         spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
 265                             NULL, 0, src);
 266                 }
 267         }
 268
 269         spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 270
 271         if (spa->spa_comment != NULL) {
 272                 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 273                     0, ZPROP_SRC_LOCAL);
 274         }
 275
 276         if (spa->spa_root != NULL)
 277                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 278                     0, ZPROP_SRC_LOCAL);
 279
 280         if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 281                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 282                     MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
 283         } else {
 284                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 285                     SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
 286         }
 287
 288         if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
 289                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
 290                     DNODE_MAX_SIZE, ZPROP_SRC_NONE);
 291         } else {
 292                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
 293                     DNODE_MIN_SIZE, ZPROP_SRC_NONE);
 294         }
 295
 296         if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 297                 if (dp->scd_path == NULL) {
 298                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 299                             "none", 0, ZPROP_SRC_LOCAL);
 300                 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 301                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 302                             dp->scd_path, 0, ZPROP_SRC_LOCAL);
 303                 }
 304         }
 305 }
 306
 307 /*
 308  * Get zpool property values.
 309  */
 310 int
 311 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 312 {
 313         objset_t *mos = spa->spa_meta_objset;
 314         zap_cursor_t zc;
 315         zap_attribute_t za;
 316         int err;
 317
 318         err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
 319         if (err)
 320                 return (err);
 321
 322         mutex_enter(&spa->spa_props_lock);
 323
 324         /*
 325          * Get properties from the spa config.
 326          */
 327         spa_prop_get_config(spa, nvp);
 328
 329         /* If no pool property object, no more prop to get. */
 330         if (mos == NULL || spa->spa_pool_props_object == 0) {
 331                 mutex_exit(&spa->spa_props_lock);
 332                 goto out;
 333         }
 334
 335         /*
 336          * Get properties from the MOS pool property object.
 337          */
 338         for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 339             (err = zap_cursor_retrieve(&zc, &za)) == 0;
 340             zap_cursor_advance(&zc)) {
 341                 uint64_t intval = 0;
 342                 char *strval = NULL;
 343                 zprop_source_t src = ZPROP_SRC_DEFAULT;
 344                 zpool_prop_t prop;
 345
 346                 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
 347                         continue;
 348
 349                 switch (za.za_integer_length) {
 350                 case 8:
 351                         /* integer property */
 352                         if (za.za_first_integer !=
 353                             zpool_prop_default_numeric(prop))
 354                                 src = ZPROP_SRC_LOCAL;
 355
 356                         if (prop == ZPOOL_PROP_BOOTFS) {
 357                                 dsl_pool_t *dp;
 358                                 dsl_dataset_t *ds = NULL;
 359
 360                                 dp = spa_get_dsl(spa);
 361                                 dsl_pool_config_enter(dp, FTAG);
 362                                 if ((err = dsl_dataset_hold_obj(dp,
 363                                     za.za_first_integer, FTAG, &ds))) {
 364                                         dsl_pool_config_exit(dp, FTAG);
 365                                         break;
 366                                 }
 367
 368                                 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
 369                                     KM_SLEEP);
 370                                 dsl_dataset_name(ds, strval);
 371                                 dsl_dataset_rele(ds, FTAG);
 372                                 dsl_pool_config_exit(dp, FTAG);
 373                         } else {
 374                                 strval = NULL;
 375                                 intval = za.za_first_integer;
 376                         }
 377
 378                         spa_prop_add_list(*nvp, prop, strval, intval, src);
 379
 380                         if (strval != NULL)
 381                                 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
 382
 383                         break;
 384
 385                 case 1:
 386                         /* string property */
 387                         strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 388                         err = zap_lookup(mos, spa->spa_pool_props_object,
 389                             za.za_name, 1, za.za_num_integers, strval);
 390                         if (err) {
 391                                 kmem_free(strval, za.za_num_integers);
 392                                 break;
 393                         }
 394                         spa_prop_add_list(*nvp, prop, strval, 0, src);
 395                         kmem_free(strval, za.za_num_integers);
 396                         break;
 397
 398                 default:
 399                         break;
 400                 }
 401         }
 402         zap_cursor_fini(&zc);
 403         mutex_exit(&spa->spa_props_lock);
 404 out:
 405         if (err && err != ENOENT) {
 406                 nvlist_free(*nvp);
 407                 *nvp = NULL;
 408                 return (err);
 409         }
 410
 411         return (0);
 412 }
 413
 414 /*
 415  * Validate the given pool properties nvlist and modify the list
 416  * for the property values to be set.
 417  */
 418 static int
 419 spa_prop_validate(spa_t *spa, nvlist_t *props)
 420 {
 421         nvpair_t *elem;
 422         int error = 0, reset_bootfs = 0;
 423         uint64_t objnum = 0;
 424         boolean_t has_feature = B_FALSE;
 425
 426         elem = NULL;
 427         while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 428                 uint64_t intval;
 429                 char *strval, *slash, *check, *fname;
 430                 const char *propname = nvpair_name(elem);
 431                 zpool_prop_t prop = zpool_name_to_prop(propname);
 432
 433                 switch ((int)prop) {
 434                 case ZPROP_INVAL:
 435                         if (!zpool_prop_feature(propname)) {
 436                                 error = SET_ERROR(EINVAL);
 437                                 break;
 438                         }
 439
 440                         /*
 441                          * Sanitize the input.
 442                          */
 443                         if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 444                                 error = SET_ERROR(EINVAL);
 445                                 break;
 446                         }
 447
 448                         if (nvpair_value_uint64(elem, &intval) != 0) {
 449                                 error = SET_ERROR(EINVAL);
 450                                 break;
 451                         }
 452
 453                         if (intval != 0) {
 454                                 error = SET_ERROR(EINVAL);
 455                                 break;
 456                         }
 457
 458                         fname = strchr(propname, '@') + 1;
 459                         if (zfeature_lookup_name(fname, NULL) != 0) {
 460                                 error = SET_ERROR(EINVAL);
 461                                 break;
 462                         }
 463
 464                         has_feature = B_TRUE;
 465                         break;
 466
 467                 case ZPOOL_PROP_VERSION:
 468                         error = nvpair_value_uint64(elem, &intval);
 469                         if (!error &&
 470                             (intval < spa_version(spa) ||
 471                             intval > SPA_VERSION_BEFORE_FEATURES ||
 472                             has_feature))
 473                                 error = SET_ERROR(EINVAL);
 474                         break;
 475
 476                 case ZPOOL_PROP_DELEGATION:
 477                 case ZPOOL_PROP_AUTOREPLACE:
 478                 case ZPOOL_PROP_LISTSNAPS:
 479                 case ZPOOL_PROP_AUTOEXPAND:
 480                         error = nvpair_value_uint64(elem, &intval);
 481                         if (!error && intval > 1)
 482                                 error = SET_ERROR(EINVAL);
 483                         break;
 484
 485                 case ZPOOL_PROP_BOOTFS:
 486                         /*
 487                          * If the pool version is less than SPA_VERSION_BOOTFS,
 488                          * or the pool is still being created (version == 0),
 489                          * the bootfs property cannot be set.
 490                          */
 491                         if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 492                                 error = SET_ERROR(ENOTSUP);
 493                                 break;
 494                         }
 495
 496                         /*
 497                          * Make sure the vdev config is bootable
 498                          */
 499                         if (!vdev_is_bootable(spa->spa_root_vdev)) {
 500                                 error = SET_ERROR(ENOTSUP);
 501                                 break;
 502                         }
 503
 504                         reset_bootfs = 1;
 505
 506                         error = nvpair_value_string(elem, &strval);
 507
 508                         if (!error) {
 509                                 objset_t *os;
 510                                 uint64_t propval;
 511
 512                                 if (strval == NULL || strval[0] == '\0') {
 513                                         objnum = zpool_prop_default_numeric(
 514                                             ZPOOL_PROP_BOOTFS);
 515                                         break;
 516                                 }
 517
 518                                 error = dmu_objset_hold(strval, FTAG, &os);
 519                                 if (error)
 520                                         break;
 521
 522                                 /*
 523                                  * Must be ZPL, and its property settings
 524                                  * must be supported by GRUB (compression
 525                                  * is not gzip, and large blocks or large
 526                                  * dnodes are not used).
 527                                  */
 528
 529                                 if (dmu_objset_type(os) != DMU_OST_ZFS) {
 530                                         error = SET_ERROR(ENOTSUP);
 531                                 } else if ((error =
 532                                     dsl_prop_get_int_ds(dmu_objset_ds(os),
 533                                     zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 534                                     &propval)) == 0 &&
 535                                     !BOOTFS_COMPRESS_VALID(propval)) {
 536                                         error = SET_ERROR(ENOTSUP);
 537                                 } else if ((error =
 538                                     dsl_prop_get_int_ds(dmu_objset_ds(os),
 539                                     zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 540                                     &propval)) == 0 &&
 541                                     propval > SPA_OLD_MAXBLOCKSIZE) {
 542                                         error = SET_ERROR(ENOTSUP);
 543                                 } else if ((error =
 544                                     dsl_prop_get_int_ds(dmu_objset_ds(os),
 545                                     zfs_prop_to_name(ZFS_PROP_DNODESIZE),
 546                                     &propval)) == 0 &&
 547                                     propval != ZFS_DNSIZE_LEGACY) {
 548                                         error = SET_ERROR(ENOTSUP);
 549                                 } else {
 550                                         objnum = dmu_objset_id(os);
 551                                 }
 552                                 dmu_objset_rele(os, FTAG);
 553                         }
 554                         break;
 555
 556                 case ZPOOL_PROP_FAILUREMODE:
 557                         error = nvpair_value_uint64(elem, &intval);
 558                         if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 559                             intval > ZIO_FAILURE_MODE_PANIC))
 560                                 error = SET_ERROR(EINVAL);
 561
 562                         /*
 563                          * This is a special case which only occurs when
 564                          * the pool has completely failed. This allows
 565                          * the user to change the in-core failmode property
 566                          * without syncing it out to disk (I/Os might
 567                          * currently be blocked). We do this by returning
 568                          * EIO to the caller (spa_prop_set) to trick it
 569                          * into thinking we encountered a property validation
 570                          * error.
 571                          */
 572                         if (!error && spa_suspended(spa)) {
 573                                 spa->spa_failmode = intval;
 574                                 error = SET_ERROR(EIO);
 575                         }
 576                         break;
 577
 578                 case ZPOOL_PROP_CACHEFILE:
 579                         if ((error = nvpair_value_string(elem, &strval)) != 0)
 580                                 break;
 581
 582                         if (strval[0] == '\0')
 583                                 break;
 584
 585                         if (strcmp(strval, "none") == 0)
 586                                 break;
 587
 588                         if (strval[0] != '/') {
 589                                 error = SET_ERROR(EINVAL);
 590                                 break;
 591                         }
 592
 593                         slash = strrchr(strval, '/');
 594                         ASSERT(slash != NULL);
 595
 596                         if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 597                             strcmp(slash, "/..") == 0)
 598                                 error = SET_ERROR(EINVAL);
 599                         break;
 600
 601                 case ZPOOL_PROP_COMMENT:
 602                         if ((error = nvpair_value_string(elem, &strval)) != 0)
 603                                 break;
 604                         for (check = strval; *check != '\0'; check++) {
 605                                 if (!isprint(*check)) {
 606                                         error = SET_ERROR(EINVAL);
 607                                         break;
 608                                 }
 609                         }
 610                         if (strlen(strval) > ZPROP_MAX_COMMENT)
 611                                 error = SET_ERROR(E2BIG);
 612                         break;
 613
 614                 case ZPOOL_PROP_DEDUPDITTO:
 615                         if (spa_version(spa) < SPA_VERSION_DEDUP)
 616                                 error = SET_ERROR(ENOTSUP);
 617                         else
 618                                 error = nvpair_value_uint64(elem, &intval);
 619                         if (error == 0 &&
 620                             intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 621                                 error = SET_ERROR(EINVAL);
 622                         break;
 623
 624                 default:
 625                         break;
 626                 }
 627
 628                 if (error)
 629                         break;
 630         }
 631
 632         if (!error && reset_bootfs) {
 633                 error = nvlist_remove(props,
 634                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 635
 636                 if (!error) {
 637                         error = nvlist_add_uint64(props,
 638                             zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 639                 }
 640         }
 641
 642         return (error);
 643 }
 644
 645 void
 646 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 647 {
 648         char *cachefile;
 649         spa_config_dirent_t *dp;
 650
 651         if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
 652             &cachefile) != 0)
 653                 return;
 654
 655         dp = kmem_alloc(sizeof (spa_config_dirent_t),
 656             KM_SLEEP);
 657
 658         if (cachefile[0] == '\0')
 659                 dp->scd_path = spa_strdup(spa_config_path);
 660         else if (strcmp(cachefile, "none") == 0)
 661                 dp->scd_path = NULL;
 662         else
 663                 dp->scd_path = spa_strdup(cachefile);
 664
 665         list_insert_head(&spa->spa_config_list, dp);
 666         if (need_sync)
 667                 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 668 }
 669
 670 int
 671 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 672 {
 673         int error;
 674         nvpair_t *elem = NULL;
 675         boolean_t need_sync = B_FALSE;
 676
 677         if ((error = spa_prop_validate(spa, nvp)) != 0)
 678                 return (error);
 679
 680         while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 681                 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 682
 683                 if (prop == ZPOOL_PROP_CACHEFILE ||
 684                     prop == ZPOOL_PROP_ALTROOT ||
 685                     prop == ZPOOL_PROP_READONLY)
 686                         continue;
 687
 688                 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
 689                         uint64_t ver;
 690
 691                         if (prop == ZPOOL_PROP_VERSION) {
 692                                 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 693                         } else {
 694                                 ASSERT(zpool_prop_feature(nvpair_name(elem)));
 695                                 ver = SPA_VERSION_FEATURES;
 696                                 need_sync = B_TRUE;
 697                         }
 698
 699                         /* Save time if the version is already set. */
 700                         if (ver == spa_version(spa))
 701                                 continue;
 702
 703                         /*
 704                          * In addition to the pool directory object, we might
 705                          * create the pool properties object, the features for
 706                          * read object, the features for write object, or the
 707                          * feature descriptions object.
 708                          */
 709                         error = dsl_sync_task(spa->spa_name, NULL,
 710                             spa_sync_version, &ver,
 711                             6, ZFS_SPACE_CHECK_RESERVED);
 712                         if (error)
 713                                 return (error);
 714                         continue;
 715                 }
 716
 717                 need_sync = B_TRUE;
 718                 break;
 719         }
 720
 721         if (need_sync) {
 722                 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 723                     nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 724         }
 725
 726         return (0);
 727 }
 728
 729 /*
 730  * If the bootfs property value is dsobj, clear it.
 731  */
 732 void
 733 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 734 {
 735         if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 736                 VERIFY(zap_remove(spa->spa_meta_objset,
 737                     spa->spa_pool_props_object,
 738                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 739                 spa->spa_bootfs = 0;
 740         }
 741 }
 742
 743 /*ARGSUSED*/
 744 static int
 745 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 746 {
 747         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 748         vdev_t *rvd = spa->spa_root_vdev;
 749         uint64_t vdev_state;
 750         ASSERTV(uint64_t *newguid = arg);
 751
 752         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 753         vdev_state = rvd->vdev_state;
 754         spa_config_exit(spa, SCL_STATE, FTAG);
 755
 756         if (vdev_state != VDEV_STATE_HEALTHY)
 757                 return (SET_ERROR(ENXIO));
 758
 759         ASSERT3U(spa_guid(spa), !=, *newguid);
 760
 761         return (0);
 762 }
 763
 764 static void
 765 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 766 {
 767         uint64_t *newguid = arg;
 768         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 769         uint64_t oldguid;
 770         vdev_t *rvd = spa->spa_root_vdev;
 771
 772         oldguid = spa_guid(spa);
 773
 774         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 775         rvd->vdev_guid = *newguid;
 776         rvd->vdev_guid_sum += (*newguid - oldguid);
 777         vdev_config_dirty(rvd);
 778         spa_config_exit(spa, SCL_STATE, FTAG);
 779
 780         spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 781             oldguid, *newguid);
 782 }
 783
 784 /*
 785  * Change the GUID for the pool.  This is done so that we can later
 786  * re-import a pool built from a clone of our own vdevs.  We will modify
 787  * the root vdev's guid, our own pool guid, and then mark all of our
 788  * vdevs dirty.  Note that we must make sure that all our vdevs are
 789  * online when we do this, or else any vdevs that weren't present
 790  * would be orphaned from our pool.  We are also going to issue a
 791  * sysevent to update any watchers.
 792  */
 793 int
 794 spa_change_guid(spa_t *spa)
 795 {
 796         int error;
 797         uint64_t guid;
 798
 799         mutex_enter(&spa->spa_vdev_top_lock);
 800         mutex_enter(&spa_namespace_lock);
 801         guid = spa_generate_guid(NULL);
 802
 803         error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 804             spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 805
 806         if (error == 0) {
 807                 spa_config_sync(spa, B_FALSE, B_TRUE);
 808                 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
 809         }
 810
 811         mutex_exit(&spa_namespace_lock);
 812         mutex_exit(&spa->spa_vdev_top_lock);
 813
 814         return (error);
 815 }
 816
 817 /*
 818  * ==========================================================================
 819  * SPA state manipulation (open/create/destroy/import/export)
 820  * ==========================================================================
 821  */
 822
 823 static int
 824 spa_error_entry_compare(const void *a, const void *b)
 825 {
 826         const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
 827         const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
 828         int ret;
 829
 830         ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
 831             sizeof (zbookmark_phys_t));
 832
 833         return (AVL_ISIGN(ret));
 834 }
 835
 836 /*
 837  * Utility function which retrieves copies of the current logs and
 838  * re-initializes them in the process.
 839  */
 840 void
 841 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 842 {
 843         ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 844
 845         bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
 846         bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
 847
 848         avl_create(&spa->spa_errlist_scrub,
 849             spa_error_entry_compare, sizeof (spa_error_entry_t),
 850             offsetof(spa_error_entry_t, se_avl));
 851         avl_create(&spa->spa_errlist_last,
 852             spa_error_entry_compare, sizeof (spa_error_entry_t),
 853             offsetof(spa_error_entry_t, se_avl));
 854 }
 855
 856 static void
 857 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 858 {
 859         const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 860         enum zti_modes mode = ztip->zti_mode;
 861         uint_t value = ztip->zti_value;
 862         uint_t count = ztip->zti_count;
 863         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 864         char name[32];
 865         uint_t i, flags = TASKQ_DYNAMIC;
 866         boolean_t batch = B_FALSE;
 867
 868         if (mode == ZTI_MODE_NULL) {
 869                 tqs->stqs_count = 0;
 870                 tqs->stqs_taskq = NULL;
 871                 return;
 872         }
 873
 874         ASSERT3U(count, >, 0);
 875
 876         tqs->stqs_count = count;
 877         tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 878
 879         switch (mode) {
 880         case ZTI_MODE_FIXED:
 881                 ASSERT3U(value, >=, 1);
 882                 value = MAX(value, 1);
 883                 break;
 884
 885         case ZTI_MODE_BATCH:
 886                 batch = B_TRUE;
 887                 flags |= TASKQ_THREADS_CPU_PCT;
 888                 value = MIN(zio_taskq_batch_pct, 100);
 889                 break;
 890
 891         default:
 892                 panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 893                     "spa_activate()",
 894                     zio_type_name[t], zio_taskq_types[q], mode, value);
 895                 break;
 896         }
 897
 898         for (i = 0; i < count; i++) {
 899                 taskq_t *tq;
 900
 901                 if (count > 1) {
 902                         (void) snprintf(name, sizeof (name), "%s_%s_%u",
 903                             zio_type_name[t], zio_taskq_types[q], i);
 904                 } else {
 905                         (void) snprintf(name, sizeof (name), "%s_%s",
 906                             zio_type_name[t], zio_taskq_types[q]);
 907                 }
 908
 909                 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 910                         if (batch)
 911                                 flags |= TASKQ_DC_BATCH;
 912
 913                         tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 914                             spa->spa_proc, zio_taskq_basedc, flags);
 915                 } else {
 916                         pri_t pri = maxclsyspri;
 917                         /*
 918                          * The write issue taskq can be extremely CPU
 919                          * intensive.  Run it at slightly less important
 920                          * priority than the other taskqs.  Under Linux this
 921                          * means incrementing the priority value on platforms
 922                          * like illumos it should be decremented.
 923                          */
 924                         if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
 925                                 pri++;
 926
 927                         tq = taskq_create_proc(name, value, pri, 50,
 928                             INT_MAX, spa->spa_proc, flags);
 929                 }
 930
 931                 tqs->stqs_taskq[i] = tq;
 932         }
 933 }
 934
 935 static void
 936 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 937 {
 938         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 939         uint_t i;
 940
 941         if (tqs->stqs_taskq == NULL) {
 942                 ASSERT3U(tqs->stqs_count, ==, 0);
 943                 return;
 944         }
 945
 946         for (i = 0; i < tqs->stqs_count; i++) {
 947                 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 948                 taskq_destroy(tqs->stqs_taskq[i]);
 949         }
 950
 951         kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 952         tqs->stqs_taskq = NULL;
 953 }
 954
 955 /*
 956  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 957  * Note that a type may have multiple discrete taskqs to avoid lock contention
 958  * on the taskq itself. In that case we choose which taskq at random by using
 959  * the low bits of gethrtime().
 960  */
 961 void
 962 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 963     task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
 964 {
 965         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 966         taskq_t *tq;
 967
 968         ASSERT3P(tqs->stqs_taskq, !=, NULL);
 969         ASSERT3U(tqs->stqs_count, !=, 0);
 970
 971         if (tqs->stqs_count == 1) {
 972                 tq = tqs->stqs_taskq[0];
 973         } else {
 974                 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 975         }
 976
 977         taskq_dispatch_ent(tq, func, arg, flags, ent);
 978 }
 979
 980 /*
 981  * Same as spa_taskq_dispatch_ent() but block on the task until completion.
 982  */
 983 void
 984 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 985     task_func_t *func, void *arg, uint_t flags)
 986 {
 987         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 988         taskq_t *tq;
 989         taskqid_t id;
 990
 991         ASSERT3P(tqs->stqs_taskq, !=, NULL);
 992         ASSERT3U(tqs->stqs_count, !=, 0);
 993
 994         if (tqs->stqs_count == 1) {
 995                 tq = tqs->stqs_taskq[0];
 996         } else {
 997                 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 998         }
 999
1000         id = taskq_dispatch(tq, func, arg, flags);
1001         if (id)
1002                 taskq_wait_id(tq, id);
1003 }
1004
1005 static void
1006 spa_create_zio_taskqs(spa_t *spa)
1007 {
1008         int t, q;
1009
1010         for (t = 0; t < ZIO_TYPES; t++) {
1011                 for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1012                         spa_taskqs_init(spa, t, q);
1013                 }
1014         }
1015 }
1016
1017 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
1018 static void
1019 spa_thread(void *arg)
1020 {
1021         callb_cpr_t cprinfo;
1022
1023         spa_t *spa = arg;
1024         user_t *pu = PTOU(curproc);
1025
1026         CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1027             spa->spa_name);
1028
1029         ASSERT(curproc != &p0);
1030         (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1031             "zpool-%s", spa->spa_name);
1032         (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1033
1034         /* bind this thread to the requested psrset */
1035         if (zio_taskq_psrset_bind != PS_NONE) {
1036                 pool_lock();
1037                 mutex_enter(&cpu_lock);
1038                 mutex_enter(&pidlock);
1039                 mutex_enter(&curproc->p_lock);
1040
1041                 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1042                     0, NULL, NULL) == 0)  {
1043                         curthread->t_bind_pset = zio_taskq_psrset_bind;
1044                 } else {
1045                         cmn_err(CE_WARN,
1046                             "Couldn't bind process for zfs pool \"%s\" to "
1047                             "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1048                 }
1049
1050                 mutex_exit(&curproc->p_lock);
1051                 mutex_exit(&pidlock);
1052                 mutex_exit(&cpu_lock);
1053                 pool_unlock();
1054         }
1055
1056         if (zio_taskq_sysdc) {
1057                 sysdc_thread_enter(curthread, 100, 0);
1058         }
1059
1060         spa->spa_proc = curproc;
1061         spa->spa_did = curthread->t_did;
1062
1063         spa_create_zio_taskqs(spa);
1064
1065         mutex_enter(&spa->spa_proc_lock);
1066         ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1067
1068         spa->spa_proc_state = SPA_PROC_ACTIVE;
1069         cv_broadcast(&spa->spa_proc_cv);
1070
1071         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1072         while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1073                 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1074         CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1075
1076         ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1077         spa->spa_proc_state = SPA_PROC_GONE;
1078         spa->spa_proc = &p0;
1079         cv_broadcast(&spa->spa_proc_cv);
1080         CALLB_CPR_EXIT(&cprinfo);       /* drops spa_proc_lock */
1081
1082         mutex_enter(&curproc->p_lock);
1083         lwp_exit();
1084 }
1085 #endif
1086
1087 /*
1088  * Activate an uninitialized pool.
1089  */
1090 static void
1091 spa_activate(spa_t *spa, int mode)
1092 {
1093         ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1094
1095         spa->spa_state = POOL_STATE_ACTIVE;
1096         spa->spa_mode = mode;
1097
1098         spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1099         spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1100
1101         /* Try to create a covering process */
1102         mutex_enter(&spa->spa_proc_lock);
1103         ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1104         ASSERT(spa->spa_proc == &p0);
1105         spa->spa_did = 0;
1106
1107 #ifdef HAVE_SPA_THREAD
1108         /* Only create a process if we're going to be around a while. */
1109         if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1110                 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1111                     NULL, 0) == 0) {
1112                         spa->spa_proc_state = SPA_PROC_CREATED;
1113                         while (spa->spa_proc_state == SPA_PROC_CREATED) {
1114                                 cv_wait(&spa->spa_proc_cv,
1115                                     &spa->spa_proc_lock);
1116                         }
1117                         ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1118                         ASSERT(spa->spa_proc != &p0);
1119                         ASSERT(spa->spa_did != 0);
1120                 } else {
1121 #ifdef _KERNEL
1122                         cmn_err(CE_WARN,
1123                             "Couldn't create process for zfs pool \"%s\"\n",
1124                             spa->spa_name);
1125 #endif
1126                 }
1127         }
1128 #endif /* HAVE_SPA_THREAD */
1129         mutex_exit(&spa->spa_proc_lock);
1130
1131         /* If we didn't create a process, we need to create our taskqs. */
1132         if (spa->spa_proc == &p0) {
1133                 spa_create_zio_taskqs(spa);
1134         }
1135
1136         list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1137             offsetof(vdev_t, vdev_config_dirty_node));
1138         list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1139             offsetof(objset_t, os_evicting_node));
1140         list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1141             offsetof(vdev_t, vdev_state_dirty_node));
1142
1143         txg_list_create(&spa->spa_vdev_txg_list,
1144             offsetof(struct vdev, vdev_txg_node));
1145
1146         avl_create(&spa->spa_errlist_scrub,
1147             spa_error_entry_compare, sizeof (spa_error_entry_t),
1148             offsetof(spa_error_entry_t, se_avl));
1149         avl_create(&spa->spa_errlist_last,
1150             spa_error_entry_compare, sizeof (spa_error_entry_t),
1151             offsetof(spa_error_entry_t, se_avl));
1152
1153         /*
1154          * This taskq is used to perform zvol-minor-related tasks
1155          * asynchronously. This has several advantages, including easy
1156          * resolution of various deadlocks (zfsonlinux bug #3681).
1157          *
1158          * The taskq must be single threaded to ensure tasks are always
1159          * processed in the order in which they were dispatched.
1160          *
1161          * A taskq per pool allows one to keep the pools independent.
1162          * This way if one pool is suspended, it will not impact another.
1163          *
1164          * The preferred location to dispatch a zvol minor task is a sync
1165          * task. In this context, there is easy access to the spa_t and minimal
1166          * error handling is required because the sync task must succeed.
1167          */
1168         spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
1169             1, INT_MAX, 0);
1170 }
1171
1172 /*
1173  * Opposite of spa_activate().
1174  */
1175 static void
1176 spa_deactivate(spa_t *spa)
1177 {
1178         int t, q;
1179
1180         ASSERT(spa->spa_sync_on == B_FALSE);
1181         ASSERT(spa->spa_dsl_pool == NULL);
1182         ASSERT(spa->spa_root_vdev == NULL);
1183         ASSERT(spa->spa_async_zio_root == NULL);
1184         ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1185
1186         spa_evicting_os_wait(spa);
1187
1188         if (spa->spa_zvol_taskq) {
1189                 taskq_destroy(spa->spa_zvol_taskq);
1190                 spa->spa_zvol_taskq = NULL;
1191         }
1192
1193         txg_list_destroy(&spa->spa_vdev_txg_list);
1194
1195         list_destroy(&spa->spa_config_dirty_list);
1196         list_destroy(&spa->spa_evicting_os_list);
1197         list_destroy(&spa->spa_state_dirty_list);
1198
1199         taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
1200
1201         for (t = 0; t < ZIO_TYPES; t++) {
1202                 for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1203                         spa_taskqs_fini(spa, t, q);
1204                 }
1205         }
1206
1207         metaslab_class_destroy(spa->spa_normal_class);
1208         spa->spa_normal_class = NULL;
1209
1210         metaslab_class_destroy(spa->spa_log_class);
1211         spa->spa_log_class = NULL;
1212
1213         /*
1214          * If this was part of an import or the open otherwise failed, we may
1215          * still have errors left in the queues.  Empty them just in case.
1216          */
1217         spa_errlog_drain(spa);
1218
1219         avl_destroy(&spa->spa_errlist_scrub);
1220         avl_destroy(&spa->spa_errlist_last);
1221
1222         spa->spa_state = POOL_STATE_UNINITIALIZED;
1223
1224         mutex_enter(&spa->spa_proc_lock);
1225         if (spa->spa_proc_state != SPA_PROC_NONE) {
1226                 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1227                 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1228                 cv_broadcast(&spa->spa_proc_cv);
1229                 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1230                         ASSERT(spa->spa_proc != &p0);
1231                         cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1232                 }
1233                 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1234                 spa->spa_proc_state = SPA_PROC_NONE;
1235         }
1236         ASSERT(spa->spa_proc == &p0);
1237         mutex_exit(&spa->spa_proc_lock);
1238
1239         /*
1240          * We want to make sure spa_thread() has actually exited the ZFS
1241          * module, so that the module can't be unloaded out from underneath
1242          * it.
1243          */
1244         if (spa->spa_did != 0) {
1245                 thread_join(spa->spa_did);
1246                 spa->spa_did = 0;
1247         }
1248 }
1249
1250 /*
1251  * Verify a pool configuration, and construct the vdev tree appropriately.  This
1252  * will create all the necessary vdevs in the appropriate layout, with each vdev
1253  * in the CLOSED state.  This will prep the pool before open/creation/import.
1254  * All vdev validation is done by the vdev_alloc() routine.
1255  */
1256 static int
1257 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1258     uint_t id, int atype)
1259 {
1260         nvlist_t **child;
1261         uint_t children;
1262         int error;
1263         int c;
1264
1265         if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1266                 return (error);
1267
1268         if ((*vdp)->vdev_ops->vdev_op_leaf)
1269                 return (0);
1270
1271         error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1272             &child, &children);
1273
1274         if (error == ENOENT)
1275                 return (0);
1276
1277         if (error) {
1278                 vdev_free(*vdp);
1279                 *vdp = NULL;
1280                 return (SET_ERROR(EINVAL));
1281         }
1282
1283         for (c = 0; c < children; c++) {
1284                 vdev_t *vd;
1285                 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1286                     atype)) != 0) {
1287                         vdev_free(*vdp);
1288                         *vdp = NULL;
1289                         return (error);
1290                 }
1291         }
1292
1293         ASSERT(*vdp != NULL);
1294
1295         return (0);
1296 }
1297
1298 /*
1299  * Opposite of spa_load().
1300  */
1301 static void
1302 spa_unload(spa_t *spa)
1303 {
1304         int i;
1305
1306         ASSERT(MUTEX_HELD(&spa_namespace_lock));
1307
1308         /*
1309          * Stop async tasks.
1310          */
1311         spa_async_suspend(spa);
1312
1313         /*
1314          * Stop syncing.
1315          */
1316         if (spa->spa_sync_on) {
1317                 txg_sync_stop(spa->spa_dsl_pool);
1318                 spa->spa_sync_on = B_FALSE;
1319         }
1320
1321         /*
1322          * Wait for any outstanding async I/O to complete.
1323          */
1324         if (spa->spa_async_zio_root != NULL) {
1325                 for (i = 0; i < max_ncpus; i++)
1326                         (void) zio_wait(spa->spa_async_zio_root[i]);
1327                 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1328                 spa->spa_async_zio_root = NULL;
1329         }
1330
1331         bpobj_close(&spa->spa_deferred_bpobj);
1332
1333         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1334
1335         /*
1336          * Close all vdevs.
1337          */
1338         if (spa->spa_root_vdev)
1339                 vdev_free(spa->spa_root_vdev);
1340         ASSERT(spa->spa_root_vdev == NULL);
1341
1342         /*
1343          * Close the dsl pool.
1344          */
1345         if (spa->spa_dsl_pool) {
1346                 dsl_pool_close(spa->spa_dsl_pool);
1347                 spa->spa_dsl_pool = NULL;
1348                 spa->spa_meta_objset = NULL;
1349         }
1350
1351         ddt_unload(spa);
1352
1353
1354         /*
1355          * Drop and purge level 2 cache
1356          */
1357         spa_l2cache_drop(spa);
1358
1359         for (i = 0; i < spa->spa_spares.sav_count; i++)
1360                 vdev_free(spa->spa_spares.sav_vdevs[i]);
1361         if (spa->spa_spares.sav_vdevs) {
1362                 kmem_free(spa->spa_spares.sav_vdevs,
1363                     spa->spa_spares.sav_count * sizeof (void *));
1364                 spa->spa_spares.sav_vdevs = NULL;
1365         }
1366         if (spa->spa_spares.sav_config) {
1367                 nvlist_free(spa->spa_spares.sav_config);
1368                 spa->spa_spares.sav_config = NULL;
1369         }
1370         spa->spa_spares.sav_count = 0;
1371
1372         for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1373                 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1374                 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1375         }
1376         if (spa->spa_l2cache.sav_vdevs) {
1377                 kmem_free(spa->spa_l2cache.sav_vdevs,
1378                     spa->spa_l2cache.sav_count * sizeof (void *));
1379                 spa->spa_l2cache.sav_vdevs = NULL;
1380         }
1381         if (spa->spa_l2cache.sav_config) {
1382                 nvlist_free(spa->spa_l2cache.sav_config);
1383                 spa->spa_l2cache.sav_config = NULL;
1384         }
1385         spa->spa_l2cache.sav_count = 0;
1386
1387         spa->spa_async_suspended = 0;
1388
1389         if (spa->spa_comment != NULL) {
1390                 spa_strfree(spa->spa_comment);
1391                 spa->spa_comment = NULL;
1392         }
1393
1394         spa_config_exit(spa, SCL_ALL, FTAG);
1395 }
1396
1397 /*
1398  * Load (or re-load) the current list of vdevs describing the active spares for
1399  * this pool.  When this is called, we have some form of basic information in
1400  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1401  * then re-generate a more complete list including status information.
1402  */
1403 static void
1404 spa_load_spares(spa_t *spa)
1405 {
1406         nvlist_t **spares;
1407         uint_t nspares;
1408         int i;
1409         vdev_t *vd, *tvd;
1410
1411         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1412
1413         /*
1414          * First, close and free any existing spare vdevs.
1415          */
1416         for (i = 0; i < spa->spa_spares.sav_count; i++) {
1417                 vd = spa->spa_spares.sav_vdevs[i];
1418
1419                 /* Undo the call to spa_activate() below */
1420                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1421                     B_FALSE)) != NULL && tvd->vdev_isspare)
1422                         spa_spare_remove(tvd);
1423                 vdev_close(vd);
1424                 vdev_free(vd);
1425         }
1426
1427         if (spa->spa_spares.sav_vdevs)
1428                 kmem_free(spa->spa_spares.sav_vdevs,
1429                     spa->spa_spares.sav_count * sizeof (void *));
1430
1431         if (spa->spa_spares.sav_config == NULL)
1432                 nspares = 0;
1433         else
1434                 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1435                     ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1436
1437         spa->spa_spares.sav_count = (int)nspares;
1438         spa->spa_spares.sav_vdevs = NULL;
1439
1440         if (nspares == 0)
1441                 return;
1442
1443         /*
1444          * Construct the array of vdevs, opening them to get status in the
1445          * process.   For each spare, there is potentially two different vdev_t
1446          * structures associated with it: one in the list of spares (used only
1447          * for basic validation purposes) and one in the active vdev
1448          * configuration (if it's spared in).  During this phase we open and
1449          * validate each vdev on the spare list.  If the vdev also exists in the
1450          * active configuration, then we also mark this vdev as an active spare.
1451          */
1452         spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
1453             KM_SLEEP);
1454         for (i = 0; i < spa->spa_spares.sav_count; i++) {
1455                 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1456                     VDEV_ALLOC_SPARE) == 0);
1457                 ASSERT(vd != NULL);
1458
1459                 spa->spa_spares.sav_vdevs[i] = vd;
1460
1461                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1462                     B_FALSE)) != NULL) {
1463                         if (!tvd->vdev_isspare)
1464                                 spa_spare_add(tvd);
1465
1466                         /*
1467                          * We only mark the spare active if we were successfully
1468                          * able to load the vdev.  Otherwise, importing a pool
1469                          * with a bad active spare would result in strange
1470                          * behavior, because multiple pool would think the spare
1471                          * is actively in use.
1472                          *
1473                          * There is a vulnerability here to an equally bizarre
1474                          * circumstance, where a dead active spare is later
1475                          * brought back to life (onlined or otherwise).  Given
1476                          * the rarity of this scenario, and the extra complexity
1477                          * it adds, we ignore the possibility.
1478                          */
1479                         if (!vdev_is_dead(tvd))
1480                                 spa_spare_activate(tvd);
1481                 }
1482
1483                 vd->vdev_top = vd;
1484                 vd->vdev_aux = &spa->spa_spares;
1485
1486                 if (vdev_open(vd) != 0)
1487                         continue;
1488
1489                 if (vdev_validate_aux(vd) == 0)
1490                         spa_spare_add(vd);
1491         }
1492
1493         /*
1494          * Recompute the stashed list of spares, with status information
1495          * this time.
1496          */
1497         VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1498             DATA_TYPE_NVLIST_ARRAY) == 0);
1499
1500         spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1501             KM_SLEEP);
1502         for (i = 0; i < spa->spa_spares.sav_count; i++)
1503                 spares[i] = vdev_config_generate(spa,
1504                     spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1505         VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1506             ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1507         for (i = 0; i < spa->spa_spares.sav_count; i++)
1508                 nvlist_free(spares[i]);
1509         kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1510 }
1511
1512 /*
1513  * Load (or re-load) the current list of vdevs describing the active l2cache for
1514  * this pool.  When this is called, we have some form of basic information in
1515  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1516  * then re-generate a more complete list including status information.
1517  * Devices which are already active have their details maintained, and are
1518  * not re-opened.
1519  */
1520 static void
1521 spa_load_l2cache(spa_t *spa)
1522 {
1523         nvlist_t **l2cache;
1524         uint_t nl2cache;
1525         int i, j, oldnvdevs;
1526         uint64_t guid;
1527         vdev_t *vd, **oldvdevs, **newvdevs;
1528         spa_aux_vdev_t *sav = &spa->spa_l2cache;
1529
1530         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1531
1532         oldvdevs = sav->sav_vdevs;
1533         oldnvdevs = sav->sav_count;
1534         sav->sav_vdevs = NULL;
1535         sav->sav_count = 0;
1536
1537         if (sav->sav_config == NULL) {
1538                 nl2cache = 0;
1539                 newvdevs = NULL;
1540                 goto out;
1541         }
1542
1543         VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1544             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1545         newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1546
1547         /*
1548          * Process new nvlist of vdevs.
1549          */
1550         for (i = 0; i < nl2cache; i++) {
1551                 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1552                     &guid) == 0);
1553
1554                 newvdevs[i] = NULL;
1555                 for (j = 0; j < oldnvdevs; j++) {
1556                         vd = oldvdevs[j];
1557                         if (vd != NULL && guid == vd->vdev_guid) {
1558                                 /*
1559                                  * Retain previous vdev for add/remove ops.
1560                                  */
1561                                 newvdevs[i] = vd;
1562                                 oldvdevs[j] = NULL;
1563                                 break;
1564                         }
1565                 }
1566
1567                 if (newvdevs[i] == NULL) {
1568                         /*
1569                          * Create new vdev
1570                          */
1571                         VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1572                             VDEV_ALLOC_L2CACHE) == 0);
1573                         ASSERT(vd != NULL);
1574                         newvdevs[i] = vd;
1575
1576                         /*
1577                          * Commit this vdev as an l2cache device,
1578                          * even if it fails to open.
1579                          */
1580                         spa_l2cache_add(vd);
1581
1582                         vd->vdev_top = vd;
1583                         vd->vdev_aux = sav;
1584
1585                         spa_l2cache_activate(vd);
1586
1587                         if (vdev_open(vd) != 0)
1588                                 continue;
1589
1590                         (void) vdev_validate_aux(vd);
1591
1592                         if (!vdev_is_dead(vd))
1593                                 l2arc_add_vdev(spa, vd);
1594                 }
1595         }
1596
1597         sav->sav_vdevs = newvdevs;
1598         sav->sav_count = (int)nl2cache;
1599
1600         /*
1601          * Recompute the stashed list of l2cache devices, with status
1602          * information this time.
1603          */
1604         VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1605             DATA_TYPE_NVLIST_ARRAY) == 0);
1606
1607         l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1608         for (i = 0; i < sav->sav_count; i++)
1609                 l2cache[i] = vdev_config_generate(spa,
1610                     sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1611         VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1612             ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1613
1614 out:
1615         /*
1616          * Purge vdevs that were dropped
1617          */
1618         for (i = 0; i < oldnvdevs; i++) {
1619                 uint64_t pool;
1620
1621                 vd = oldvdevs[i];
1622                 if (vd != NULL) {
1623                         ASSERT(vd->vdev_isl2cache);
1624
1625                         if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1626                             pool != 0ULL && l2arc_vdev_present(vd))
1627                                 l2arc_remove_vdev(vd);
1628                         vdev_clear_stats(vd);
1629                         vdev_free(vd);
1630                 }
1631         }
1632
1633         if (oldvdevs)
1634                 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1635
1636         for (i = 0; i < sav->sav_count; i++)
1637                 nvlist_free(l2cache[i]);
1638         if (sav->sav_count)
1639                 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1640 }
1641
1642 static int
1643 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1644 {
1645         dmu_buf_t *db;
1646         char *packed = NULL;
1647         size_t nvsize = 0;
1648         int error;
1649         *value = NULL;
1650
1651         error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1652         if (error)
1653                 return (error);
1654
1655         nvsize = *(uint64_t *)db->db_data;
1656         dmu_buf_rele(db, FTAG);
1657
1658         packed = vmem_alloc(nvsize, KM_SLEEP);
1659         error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1660             DMU_READ_PREFETCH);
1661         if (error == 0)
1662                 error = nvlist_unpack(packed, nvsize, value, 0);
1663         vmem_free(packed, nvsize);
1664
1665         return (error);
1666 }
1667
1668 /*
1669  * Checks to see if the given vdev could not be opened, in which case we post a
1670  * sysevent to notify the autoreplace code that the device has been removed.
1671  */
1672 static void
1673 spa_check_removed(vdev_t *vd)
1674 {
1675         int c;
1676
1677         for (c = 0; c < vd->vdev_children; c++)
1678                 spa_check_removed(vd->vdev_child[c]);
1679
1680         if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1681             !vd->vdev_ishole) {
1682                 zfs_post_autoreplace(vd->vdev_spa, vd);
1683                 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1684         }
1685 }
1686
1687 static void
1688 spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
1689 {
1690         uint64_t i;
1691
1692         ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
1693
1694         vd->vdev_top_zap = mvd->vdev_top_zap;
1695         vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
1696
1697         for (i = 0; i < vd->vdev_children; i++) {
1698                 spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
1699         }
1700 }
1701
1702 /*
1703  * Validate the current config against the MOS config
1704  */
1705 static boolean_t
1706 spa_config_valid(spa_t *spa, nvlist_t *config)
1707 {
1708         vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1709         nvlist_t *nv;
1710         int c, i;
1711
1712         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1713
1714         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1715         VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1716
1717         ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1718
1719         /*
1720          * If we're doing a normal import, then build up any additional
1721          * diagnostic information about missing devices in this config.
1722          * We'll pass this up to the user for further processing.
1723          */
1724         if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1725                 nvlist_t **child, *nv;
1726                 uint64_t idx = 0;
1727
1728                 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
1729                     KM_SLEEP);
1730                 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1731
1732                 for (c = 0; c < rvd->vdev_children; c++) {
1733                         vdev_t *tvd = rvd->vdev_child[c];
1734                         vdev_t *mtvd  = mrvd->vdev_child[c];
1735
1736                         if (tvd->vdev_ops == &vdev_missing_ops &&
1737                             mtvd->vdev_ops != &vdev_missing_ops &&
1738                             mtvd->vdev_islog)
1739                                 child[idx++] = vdev_config_generate(spa, mtvd,
1740                                     B_FALSE, 0);
1741                 }
1742
1743                 if (idx) {
1744                         VERIFY(nvlist_add_nvlist_array(nv,
1745                             ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1746                         VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1747                             ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1748
1749                         for (i = 0; i < idx; i++)
1750                                 nvlist_free(child[i]);
1751                 }
1752                 nvlist_free(nv);
1753                 kmem_free(child, rvd->vdev_children * sizeof (char **));
1754         }
1755
1756         /*
1757          * Compare the root vdev tree with the information we have
1758          * from the MOS config (mrvd). Check each top-level vdev
1759          * with the corresponding MOS config top-level (mtvd).
1760          */
1761         for (c = 0; c < rvd->vdev_children; c++) {
1762                 vdev_t *tvd = rvd->vdev_child[c];
1763                 vdev_t *mtvd  = mrvd->vdev_child[c];
1764
1765                 /*
1766                  * Resolve any "missing" vdevs in the current configuration.
1767                  * If we find that the MOS config has more accurate information
1768                  * about the top-level vdev then use that vdev instead.
1769                  */
1770                 if (tvd->vdev_ops == &vdev_missing_ops &&
1771                     mtvd->vdev_ops != &vdev_missing_ops) {
1772
1773                         if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1774                                 continue;
1775
1776                         /*
1777                          * Device specific actions.
1778                          */
1779                         if (mtvd->vdev_islog) {
1780                                 spa_set_log_state(spa, SPA_LOG_CLEAR);
1781                         } else {
1782                                 /*
1783                                  * XXX - once we have 'readonly' pool
1784                                  * support we should be able to handle
1785                                  * missing data devices by transitioning
1786                                  * the pool to readonly.
1787                                  */
1788                                 continue;
1789                         }
1790
1791                         /*
1792                          * Swap the missing vdev with the data we were
1793                          * able to obtain from the MOS config.
1794                          */
1795                         vdev_remove_child(rvd, tvd);
1796                         vdev_remove_child(mrvd, mtvd);
1797
1798                         vdev_add_child(rvd, mtvd);
1799                         vdev_add_child(mrvd, tvd);
1800
1801                         spa_config_exit(spa, SCL_ALL, FTAG);
1802                         vdev_load(mtvd);
1803                         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1804
1805                         vdev_reopen(rvd);
1806                 } else {
1807                         if (mtvd->vdev_islog) {
1808                                 /*
1809                                  * Load the slog device's state from the MOS
1810                                  * config since it's possible that the label
1811                                  * does not contain the most up-to-date
1812                                  * information.
1813                                  */
1814                                 vdev_load_log_state(tvd, mtvd);
1815                                 vdev_reopen(tvd);
1816                         }
1817
1818                         /*
1819                          * Per-vdev ZAP info is stored exclusively in the MOS.
1820                          */
1821                         spa_config_valid_zaps(tvd, mtvd);
1822                 }
1823         }
1824
1825         vdev_free(mrvd);
1826         spa_config_exit(spa, SCL_ALL, FTAG);
1827
1828         /*
1829          * Ensure we were able to validate the config.
1830          */
1831         return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1832 }
1833
1834 /*
1835  * Check for missing log devices
1836  */
1837 static boolean_t
1838 spa_check_logs(spa_t *spa)
1839 {
1840         boolean_t rv = B_FALSE;
1841         dsl_pool_t *dp = spa_get_dsl(spa);
1842
1843         switch (spa->spa_log_state) {
1844         default:
1845                 break;
1846         case SPA_LOG_MISSING:
1847                 /* need to recheck in case slog has been restored */
1848         case SPA_LOG_UNKNOWN:
1849                 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1850                     zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1851                 if (rv)
1852                         spa_set_log_state(spa, SPA_LOG_MISSING);
1853                 break;
1854         }
1855         return (rv);
1856 }
1857
1858 static boolean_t
1859 spa_passivate_log(spa_t *spa)
1860 {
1861         vdev_t *rvd = spa->spa_root_vdev;
1862         boolean_t slog_found = B_FALSE;
1863         int c;
1864
1865         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1866
1867         if (!spa_has_slogs(spa))
1868                 return (B_FALSE);
1869
1870         for (c = 0; c < rvd->vdev_children; c++) {
1871                 vdev_t *tvd = rvd->vdev_child[c];
1872                 metaslab_group_t *mg = tvd->vdev_mg;
1873
1874                 if (tvd->vdev_islog) {
1875                         metaslab_group_passivate(mg);
1876                         slog_found = B_TRUE;
1877                 }
1878         }
1879
1880         return (slog_found);
1881 }
1882
1883 static void
1884 spa_activate_log(spa_t *spa)
1885 {
1886         vdev_t *rvd = spa->spa_root_vdev;
1887         int c;
1888
1889         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1890
1891         for (c = 0; c < rvd->vdev_children; c++) {
1892                 vdev_t *tvd = rvd->vdev_child[c];
1893                 metaslab_group_t *mg = tvd->vdev_mg;
1894
1895                 if (tvd->vdev_islog)
1896                         metaslab_group_activate(mg);
1897         }
1898 }
1899
1900 int
1901 spa_offline_log(spa_t *spa)
1902 {
1903         int error;
1904
1905         error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1906             NULL, DS_FIND_CHILDREN);
1907         if (error == 0) {
1908                 /*
1909                  * We successfully offlined the log device, sync out the
1910                  * current txg so that the "stubby" block can be removed
1911                  * by zil_sync().
1912                  */
1913                 txg_wait_synced(spa->spa_dsl_pool, 0);
1914         }
1915         return (error);
1916 }
1917
1918 static void
1919 spa_aux_check_removed(spa_aux_vdev_t *sav)
1920 {
1921         int i;
1922
1923         for (i = 0; i < sav->sav_count; i++)
1924                 spa_check_removed(sav->sav_vdevs[i]);
1925 }
1926
1927 void
1928 spa_claim_notify(zio_t *zio)
1929 {
1930         spa_t *spa = zio->io_spa;
1931
1932         if (zio->io_error)
1933                 return;
1934
1935         mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
1936         if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1937                 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1938         mutex_exit(&spa->spa_props_lock);
1939 }
1940
1941 typedef struct spa_load_error {
1942         uint64_t        sle_meta_count;
1943         uint64_t        sle_data_count;
1944 } spa_load_error_t;
1945
1946 static void
1947 spa_load_verify_done(zio_t *zio)
1948 {
1949         blkptr_t *bp = zio->io_bp;
1950         spa_load_error_t *sle = zio->io_private;
1951         dmu_object_type_t type = BP_GET_TYPE(bp);
1952         int error = zio->io_error;
1953         spa_t *spa = zio->io_spa;
1954
1955         if (error) {
1956                 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1957                     type != DMU_OT_INTENT_LOG)
1958                         atomic_inc_64(&sle->sle_meta_count);
1959                 else
1960                         atomic_inc_64(&sle->sle_data_count);
1961         }
1962         zio_data_buf_free(zio->io_data, zio->io_size);
1963
1964         mutex_enter(&spa->spa_scrub_lock);
1965         spa->spa_scrub_inflight--;
1966         cv_broadcast(&spa->spa_scrub_io_cv);
1967         mutex_exit(&spa->spa_scrub_lock);
1968 }
1969
1970 /*
1971  * Maximum number of concurrent scrub i/os to create while verifying
1972  * a pool while importing it.
1973  */
1974 int spa_load_verify_maxinflight = 10000;
1975 int spa_load_verify_metadata = B_TRUE;
1976 int spa_load_verify_data = B_TRUE;
1977
1978 /*ARGSUSED*/
1979 static int
1980 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1981     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
1982 {
1983         zio_t *rio;
1984         size_t size;
1985         void *data;
1986
1987         if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
1988                 return (0);
1989         /*
1990          * Note: normally this routine will not be called if
1991          * spa_load_verify_metadata is not set.  However, it may be useful
1992          * to manually set the flag after the traversal has begun.
1993          */
1994         if (!spa_load_verify_metadata)
1995                 return (0);
1996         if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
1997                 return (0);
1998
1999         rio = arg;
2000         size = BP_GET_PSIZE(bp);
2001         data = zio_data_buf_alloc(size);
2002
2003         mutex_enter(&spa->spa_scrub_lock);
2004         while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
2005                 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2006         spa->spa_scrub_inflight++;
2007         mutex_exit(&spa->spa_scrub_lock);
2008
2009         zio_nowait(zio_read(rio, spa, bp, data, size,
2010             spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2011             ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2012             ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2013         return (0);
2014 }
2015
2016 /* ARGSUSED */
2017 int
2018 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2019 {
2020         if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2021                 return (SET_ERROR(ENAMETOOLONG));
2022
2023         return (0);
2024 }
2025
2026 static int
2027 spa_load_verify(spa_t *spa)
2028 {
2029         zio_t *rio;
2030         spa_load_error_t sle = { 0 };
2031         zpool_rewind_policy_t policy;
2032         boolean_t verify_ok = B_FALSE;
2033         int error = 0;
2034
2035         zpool_get_rewind_policy(spa->spa_config, &policy);
2036
2037         if (policy.zrp_request & ZPOOL_NEVER_REWIND)
2038                 return (0);
2039
2040         dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2041         error = dmu_objset_find_dp(spa->spa_dsl_pool,
2042             spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2043             DS_FIND_CHILDREN);
2044         dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2045         if (error != 0)
2046                 return (error);
2047
2048         rio = zio_root(spa, NULL, &sle,
2049             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2050
2051         if (spa_load_verify_metadata) {
2052                 error = traverse_pool(spa, spa->spa_verify_min_txg,
2053                     TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
2054                     spa_load_verify_cb, rio);
2055         }
2056
2057         (void) zio_wait(rio);
2058
2059         spa->spa_load_meta_errors = sle.sle_meta_count;
2060         spa->spa_load_data_errors = sle.sle_data_count;
2061
2062         if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
2063             sle.sle_data_count <= policy.zrp_maxdata) {
2064                 int64_t loss = 0;
2065
2066                 verify_ok = B_TRUE;
2067                 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2068                 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2069
2070                 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2071                 VERIFY(nvlist_add_uint64(spa->spa_load_info,
2072                     ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2073                 VERIFY(nvlist_add_int64(spa->spa_load_info,
2074                     ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2075                 VERIFY(nvlist_add_uint64(spa->spa_load_info,
2076                     ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2077         } else {
2078                 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2079         }
2080
2081         if (error) {
2082                 if (error != ENXIO && error != EIO)
2083                         error = SET_ERROR(EIO);
2084                 return (error);
2085         }
2086
2087         return (verify_ok ? 0 : EIO);
2088 }
2089
2090 /*
2091  * Find a value in the pool props object.
2092  */
2093 static void
2094 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2095 {
2096         (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2097             zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2098 }
2099
2100 /*
2101  * Find a value in the pool directory object.
2102  */
2103 static int
2104 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
2105 {
2106         return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2107             name, sizeof (uint64_t), 1, val));
2108 }
2109
2110 static int
2111 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2112 {
2113         vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2114         return (err);
2115 }
2116
2117 /*
2118  * Fix up config after a partly-completed split.  This is done with the
2119  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
2120  * pool have that entry in their config, but only the splitting one contains
2121  * a list of all the guids of the vdevs that are being split off.
2122  *
2123  * This function determines what to do with that list: either rejoin
2124  * all the disks to the pool, or complete the splitting process.  To attempt
2125  * the rejoin, each disk that is offlined is marked online again, and
2126  * we do a reopen() call.  If the vdev label for every disk that was
2127  * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2128  * then we call vdev_split() on each disk, and complete the split.
2129  *
2130  * Otherwise we leave the config alone, with all the vdevs in place in
2131  * the original pool.
2132  */
2133 static void
2134 spa_try_repair(spa_t *spa, nvlist_t *config)
2135 {
2136         uint_t extracted;
2137         uint64_t *glist;
2138         uint_t i, gcount;
2139         nvlist_t *nvl;
2140         vdev_t **vd;
2141         boolean_t attempt_reopen;
2142
2143         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2144                 return;
2145
2146         /* check that the config is complete */
2147         if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2148             &glist, &gcount) != 0)
2149                 return;
2150
2151         vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2152
2153         /* attempt to online all the vdevs & validate */
2154         attempt_reopen = B_TRUE;
2155         for (i = 0; i < gcount; i++) {
2156                 if (glist[i] == 0)      /* vdev is hole */
2157                         continue;
2158
2159                 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2160                 if (vd[i] == NULL) {
2161                         /*
2162                          * Don't bother attempting to reopen the disks;
2163                          * just do the split.
2164                          */
2165                         attempt_reopen = B_FALSE;
2166                 } else {
2167                         /* attempt to re-online it */
2168                         vd[i]->vdev_offline = B_FALSE;
2169                 }
2170         }
2171
2172         if (attempt_reopen) {
2173                 vdev_reopen(spa->spa_root_vdev);
2174
2175                 /* check each device to see what state it's in */
2176                 for (extracted = 0, i = 0; i < gcount; i++) {
2177                         if (vd[i] != NULL &&
2178                             vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2179                                 break;
2180                         ++extracted;
2181                 }
2182         }
2183
2184         /*
2185          * If every disk has been moved to the new pool, or if we never
2186          * even attempted to look at them, then we split them off for
2187          * good.
2188          */
2189         if (!attempt_reopen || gcount == extracted) {
2190                 for (i = 0; i < gcount; i++)
2191                         if (vd[i] != NULL)
2192                                 vdev_split(vd[i]);
2193                 vdev_reopen(spa->spa_root_vdev);
2194         }
2195
2196         kmem_free(vd, gcount * sizeof (vdev_t *));
2197 }
2198
2199 static int
2200 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
2201     boolean_t mosconfig)
2202 {
2203         nvlist_t *config = spa->spa_config;
2204         char *ereport = FM_EREPORT_ZFS_POOL;
2205         char *comment;
2206         int error;
2207         uint64_t pool_guid;
2208         nvlist_t *nvl;
2209
2210         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
2211                 return (SET_ERROR(EINVAL));
2212
2213         ASSERT(spa->spa_comment == NULL);
2214         if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2215                 spa->spa_comment = spa_strdup(comment);
2216
2217         /*
2218          * Versioning wasn't explicitly added to the label until later, so if
2219          * it's not present treat it as the initial version.
2220          */
2221         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2222             &spa->spa_ubsync.ub_version) != 0)
2223                 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2224
2225         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2226             &spa->spa_config_txg);
2227
2228         if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2229             spa_guid_exists(pool_guid, 0)) {
2230                 error = SET_ERROR(EEXIST);
2231         } else {
2232                 spa->spa_config_guid = pool_guid;
2233
2234                 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2235                     &nvl) == 0) {
2236                         VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2237                             KM_SLEEP) == 0);
2238                 }
2239
2240                 nvlist_free(spa->spa_load_info);
2241                 spa->spa_load_info = fnvlist_alloc();
2242
2243                 gethrestime(&spa->spa_loaded_ts);
2244                 error = spa_load_impl(spa, pool_guid, config, state, type,
2245                     mosconfig, &ereport);
2246         }
2247
2248         /*
2249          * Don't count references from objsets that are already closed
2250          * and are making their way through the eviction process.
2251          */
2252         spa_evicting_os_wait(spa);
2253         spa->spa_minref = refcount_count(&spa->spa_refcount);
2254         if (error) {
2255                 if (error != EEXIST) {
2256                         spa->spa_loaded_ts.tv_sec = 0;
2257                         spa->spa_loaded_ts.tv_nsec = 0;
2258                 }
2259                 if (error != EBADF) {
2260                         zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2261                 }
2262         }
2263         spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2264         spa->spa_ena = 0;
2265
2266         return (error);
2267 }
2268
2269 #ifdef ZFS_DEBUG
2270 /*
2271  * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2272  * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2273  * spa's per-vdev ZAP list.
2274  */
2275 static uint64_t
2276 vdev_count_verify_zaps(vdev_t *vd)
2277 {
2278         spa_t *spa = vd->vdev_spa;
2279         uint64_t total = 0;
2280         uint64_t i;
2281
2282         if (vd->vdev_top_zap != 0) {
2283                 total++;
2284                 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2285                     spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2286         }
2287         if (vd->vdev_leaf_zap != 0) {
2288                 total++;
2289                 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2290                     spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2291         }
2292
2293         for (i = 0; i < vd->vdev_children; i++) {
2294                 total += vdev_count_verify_zaps(vd->vdev_child[i]);
2295         }
2296
2297         return (total);
2298 }
2299 #endif
2300
2301 /*
2302  * Load an existing storage pool, using the pool's builtin spa_config as a
2303  * source of configuration information.
2304  */
2305 __attribute__((always_inline))
2306 static inline int
2307 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2308     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2309     char **ereport)
2310 {
2311         int error = 0;
2312         nvlist_t *nvroot = NULL;
2313         nvlist_t *label;
2314         vdev_t *rvd;
2315         uberblock_t *ub = &spa->spa_uberblock;
2316         uint64_t children, config_cache_txg = spa->spa_config_txg;
2317         int orig_mode = spa->spa_mode;
2318         int parse, i;
2319         uint64_t obj;
2320         boolean_t missing_feat_write = B_FALSE;
2321         nvlist_t *mos_config;
2322
2323         /*
2324          * If this is an untrusted config, access the pool in read-only mode.
2325          * This prevents things like resilvering recently removed devices.
2326          */
2327         if (!mosconfig)
2328                 spa->spa_mode = FREAD;
2329
2330         ASSERT(MUTEX_HELD(&spa_namespace_lock));
2331
2332         spa->spa_load_state = state;
2333
2334         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2335                 return (SET_ERROR(EINVAL));
2336
2337         parse = (type == SPA_IMPORT_EXISTING ?
2338             VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2339
2340         /*
2341          * Create "The Godfather" zio to hold all async IOs
2342          */
2343         spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2344             KM_SLEEP);
2345         for (i = 0; i < max_ncpus; i++) {
2346                 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2347                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2348                     ZIO_FLAG_GODFATHER);
2349         }
2350
2351         /*
2352          * Parse the configuration into a vdev tree.  We explicitly set the
2353          * value that will be returned by spa_version() since parsing the
2354          * configuration requires knowing the version number.
2355          */
2356         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2357         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2358         spa_config_exit(spa, SCL_ALL, FTAG);
2359
2360         if (error != 0)
2361                 return (error);
2362
2363         ASSERT(spa->spa_root_vdev == rvd);
2364         ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2365         ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2366
2367         if (type != SPA_IMPORT_ASSEMBLE) {
2368                 ASSERT(spa_guid(spa) == pool_guid);
2369         }
2370
2371         /*
2372          * Try to open all vdevs, loading each label in the process.
2373          */
2374         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2375         error = vdev_open(rvd);
2376         spa_config_exit(spa, SCL_ALL, FTAG);
2377         if (error != 0)
2378                 return (error);
2379
2380         /*
2381          * We need to validate the vdev labels against the configuration that
2382          * we have in hand, which is dependent on the setting of mosconfig. If
2383          * mosconfig is true then we're validating the vdev labels based on
2384          * that config.  Otherwise, we're validating against the cached config
2385          * (zpool.cache) that was read when we loaded the zfs module, and then
2386          * later we will recursively call spa_load() and validate against
2387          * the vdev config.
2388          *
2389          * If we're assembling a new pool that's been split off from an
2390          * existing pool, the labels haven't yet been updated so we skip
2391          * validation for now.
2392          */
2393         if (type != SPA_IMPORT_ASSEMBLE) {
2394                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2395                 error = vdev_validate(rvd, mosconfig);
2396                 spa_config_exit(spa, SCL_ALL, FTAG);
2397
2398                 if (error != 0)
2399                         return (error);
2400
2401                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2402                         return (SET_ERROR(ENXIO));
2403         }
2404
2405         /*
2406          * Find the best uberblock.
2407          */
2408         vdev_uberblock_load(rvd, ub, &label);
2409
2410         /*
2411          * If we weren't able to find a single valid uberblock, return failure.
2412          */
2413         if (ub->ub_txg == 0) {
2414                 nvlist_free(label);
2415                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2416         }
2417
2418         /*
2419          * If the pool has an unsupported version we can't open it.
2420          */
2421         if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2422                 nvlist_free(label);
2423                 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2424         }
2425
2426         if (ub->ub_version >= SPA_VERSION_FEATURES) {
2427                 nvlist_t *features;
2428
2429                 /*
2430                  * If we weren't able to find what's necessary for reading the
2431                  * MOS in the label, return failure.
2432                  */
2433                 if (label == NULL || nvlist_lookup_nvlist(label,
2434                     ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2435                         nvlist_free(label);
2436                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2437                             ENXIO));
2438                 }
2439
2440                 /*
2441                  * Update our in-core representation with the definitive values
2442                  * from the label.
2443                  */
2444                 nvlist_free(spa->spa_label_features);
2445                 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2446         }
2447
2448         nvlist_free(label);
2449
2450         /*
2451          * Look through entries in the label nvlist's features_for_read. If
2452          * there is a feature listed there which we don't understand then we
2453          * cannot open a pool.
2454          */
2455         if (ub->ub_version >= SPA_VERSION_FEATURES) {
2456                 nvlist_t *unsup_feat;
2457                 nvpair_t *nvp;
2458
2459                 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2460                     0);
2461
2462                 for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL);
2463                     nvp != NULL;
2464                     nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2465                         if (!zfeature_is_supported(nvpair_name(nvp))) {
2466                                 VERIFY(nvlist_add_string(unsup_feat,
2467                                     nvpair_name(nvp), "") == 0);
2468                         }
2469                 }
2470
2471                 if (!nvlist_empty(unsup_feat)) {
2472                         VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2473                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2474                         nvlist_free(unsup_feat);
2475                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2476                             ENOTSUP));
2477                 }
2478
2479                 nvlist_free(unsup_feat);
2480         }
2481
2482         /*
2483          * If the vdev guid sum doesn't match the uberblock, we have an
2484          * incomplete configuration.  We first check to see if the pool
2485          * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
2486          * If it is, defer the vdev_guid_sum check till later so we
2487          * can handle missing vdevs.
2488          */
2489         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2490             &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2491             rvd->vdev_guid_sum != ub->ub_guid_sum)
2492                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2493
2494         if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2495                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2496                 spa_try_repair(spa, config);
2497                 spa_config_exit(spa, SCL_ALL, FTAG);
2498                 nvlist_free(spa->spa_config_splitting);
2499                 spa->spa_config_splitting = NULL;
2500         }
2501
2502         /*
2503          * Initialize internal SPA structures.
2504          */
2505         spa->spa_state = POOL_STATE_ACTIVE;
2506         spa->spa_ubsync = spa->spa_uberblock;
2507         spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2508             TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2509         spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2510             spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2511         spa->spa_claim_max_txg = spa->spa_first_txg;
2512         spa->spa_prev_software_version = ub->ub_software_version;
2513
2514         error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2515         if (error)
2516                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2517         spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2518
2519         if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2520                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2521
2522         if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2523                 boolean_t missing_feat_read = B_FALSE;
2524                 nvlist_t *unsup_feat, *enabled_feat;
2525                 spa_feature_t i;
2526
2527                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2528                     &spa->spa_feat_for_read_obj) != 0) {
2529                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2530                 }
2531
2532                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2533                     &spa->spa_feat_for_write_obj) != 0) {
2534                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2535                 }
2536
2537                 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2538                     &spa->spa_feat_desc_obj) != 0) {
2539                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2540                 }
2541
2542                 enabled_feat = fnvlist_alloc();
2543                 unsup_feat = fnvlist_alloc();
2544
2545                 if (!spa_features_check(spa, B_FALSE,
2546                     unsup_feat, enabled_feat))
2547                         missing_feat_read = B_TRUE;
2548
2549                 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2550                         if (!spa_features_check(spa, B_TRUE,
2551                             unsup_feat, enabled_feat)) {
2552                                 missing_feat_write = B_TRUE;
2553                         }
2554                 }
2555
2556                 fnvlist_add_nvlist(spa->spa_load_info,
2557                     ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2558
2559                 if (!nvlist_empty(unsup_feat)) {
2560                         fnvlist_add_nvlist(spa->spa_load_info,
2561                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2562                 }
2563
2564                 fnvlist_free(enabled_feat);
2565                 fnvlist_free(unsup_feat);
2566
2567                 if (!missing_feat_read) {
2568                         fnvlist_add_boolean(spa->spa_load_info,
2569                             ZPOOL_CONFIG_CAN_RDONLY);
2570                 }
2571
2572                 /*
2573                  * If the state is SPA_LOAD_TRYIMPORT, our objective is
2574                  * twofold: to determine whether the pool is available for
2575                  * import in read-write mode and (if it is not) whether the
2576                  * pool is available for import in read-only mode. If the pool
2577                  * is available for import in read-write mode, it is displayed
2578                  * as available in userland; if it is not available for import
2579                  * in read-only mode, it is displayed as unavailable in
2580                  * userland. If the pool is available for import in read-only
2581                  * mode but not read-write mode, it is displayed as unavailable
2582                  * in userland with a special note that the pool is actually
2583                  * available for open in read-only mode.
2584                  *
2585                  * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2586                  * missing a feature for write, we must first determine whether
2587                  * the pool can be opened read-only before returning to
2588                  * userland in order to know whether to display the
2589                  * abovementioned note.
2590                  */
2591                 if (missing_feat_read || (missing_feat_write &&
2592                     spa_writeable(spa))) {
2593                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2594                             ENOTSUP));
2595                 }
2596
2597                 /*
2598                  * Load refcounts for ZFS features from disk into an in-memory
2599                  * cache during SPA initialization.
2600                  */
2601                 for (i = 0; i < SPA_FEATURES; i++) {
2602                         uint64_t refcount;
2603
2604                         error = feature_get_refcount_from_disk(spa,
2605                             &spa_feature_table[i], &refcount);
2606                         if (error == 0) {
2607                                 spa->spa_feat_refcount_cache[i] = refcount;
2608                         } else if (error == ENOTSUP) {
2609                                 spa->spa_feat_refcount_cache[i] =
2610                                     SPA_FEATURE_DISABLED;
2611                         } else {
2612                                 return (spa_vdev_err(rvd,
2613                                     VDEV_AUX_CORRUPT_DATA, EIO));
2614                         }
2615                 }
2616         }
2617
2618         if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
2619                 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
2620                     &spa->spa_feat_enabled_txg_obj) != 0)
2621                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2622         }
2623
2624         spa->spa_is_initializing = B_TRUE;
2625         error = dsl_pool_open(spa->spa_dsl_pool);
2626         spa->spa_is_initializing = B_FALSE;
2627         if (error != 0)
2628                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2629
2630         if (!mosconfig) {
2631                 uint64_t hostid;
2632                 nvlist_t *policy = NULL, *nvconfig;
2633
2634                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2635                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2636
2637                 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2638                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2639                         char *hostname;
2640                         unsigned long myhostid = 0;
2641
2642                         VERIFY(nvlist_lookup_string(nvconfig,
2643                             ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2644
2645 #ifdef  _KERNEL
2646                         myhostid = zone_get_hostid(NULL);
2647 #else   /* _KERNEL */
2648                         /*
2649                          * We're emulating the system's hostid in userland, so
2650                          * we can't use zone_get_hostid().
2651                          */
2652                         (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2653 #endif  /* _KERNEL */
2654                         if (hostid != 0 && myhostid != 0 &&
2655                             hostid != myhostid) {
2656                                 nvlist_free(nvconfig);
2657                                 cmn_err(CE_WARN, "pool '%s' could not be "
2658                                     "loaded as it was last accessed by another "
2659                                     "system (host: %s hostid: 0x%lx). See: "
2660                                     "http://zfsonlinux.org/msg/ZFS-8000-EY",
2661                                     spa_name(spa), hostname,
2662                                     (unsigned long)hostid);
2663                                 return (SET_ERROR(EBADF));
2664                         }
2665                 }
2666                 if (nvlist_lookup_nvlist(spa->spa_config,
2667                     ZPOOL_REWIND_POLICY, &policy) == 0)
2668                         VERIFY(nvlist_add_nvlist(nvconfig,
2669                             ZPOOL_REWIND_POLICY, policy) == 0);
2670
2671                 spa_config_set(spa, nvconfig);
2672                 spa_unload(spa);
2673                 spa_deactivate(spa);
2674                 spa_activate(spa, orig_mode);
2675
2676                 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2677         }
2678
2679         /* Grab the checksum salt from the MOS. */
2680         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2681             DMU_POOL_CHECKSUM_SALT, 1,
2682             sizeof (spa->spa_cksum_salt.zcs_bytes),
2683             spa->spa_cksum_salt.zcs_bytes);
2684         if (error == ENOENT) {
2685                 /* Generate a new salt for subsequent use */
2686                 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
2687                     sizeof (spa->spa_cksum_salt.zcs_bytes));
2688         } else if (error != 0) {
2689                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2690         }
2691
2692         if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2693                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2694         error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2695         if (error != 0)
2696                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2697
2698         /*
2699          * Load the bit that tells us to use the new accounting function
2700          * (raid-z deflation).  If we have an older pool, this will not
2701          * be present.
2702          */
2703         error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2704         if (error != 0 && error != ENOENT)
2705                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2706
2707         error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2708             &spa->spa_creation_version);
2709         if (error != 0 && error != ENOENT)
2710                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2711
2712         /*
2713          * Load the persistent error log.  If we have an older pool, this will
2714          * not be present.
2715          */
2716         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2717         if (error != 0 && error != ENOENT)
2718                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2719
2720         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2721             &spa->spa_errlog_scrub);
2722         if (error != 0 && error != ENOENT)
2723                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2724
2725         /*
2726          * Load the history object.  If we have an older pool, this
2727          * will not be present.
2728          */
2729         error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2730         if (error != 0 && error != ENOENT)
2731                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2732
2733         /*
2734          * Load the per-vdev ZAP map. If we have an older pool, this will not
2735          * be present; in this case, defer its creation to a later time to
2736          * avoid dirtying the MOS this early / out of sync context. See
2737          * spa_sync_config_object.
2738          */
2739
2740         /* The sentinel is only available in the MOS config. */
2741         if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
2742                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2743
2744         error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
2745             &spa->spa_all_vdev_zaps);
2746
2747         if (error != ENOENT && error != 0) {
2748                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2749         } else if (error == 0 && !nvlist_exists(mos_config,
2750             ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
2751                 /*
2752                  * An older version of ZFS overwrote the sentinel value, so
2753                  * we have orphaned per-vdev ZAPs in the MOS. Defer their
2754                  * destruction to later; see spa_sync_config_object.
2755                  */
2756                 spa->spa_avz_action = AVZ_ACTION_DESTROY;
2757                 /*
2758                  * We're assuming that no vdevs have had their ZAPs created
2759                  * before this. Better be sure of it.
2760                  */
2761                 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
2762         }
2763         nvlist_free(mos_config);
2764
2765         /*
2766          * If we're assembling the pool from the split-off vdevs of
2767          * an existing pool, we don't want to attach the spares & cache
2768          * devices.
2769          */
2770
2771         /*
2772          * Load any hot spares for this pool.
2773          */
2774         error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2775         if (error != 0 && error != ENOENT)
2776                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2777         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2778                 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2779                 if (load_nvlist(spa, spa->spa_spares.sav_object,
2780                     &spa->spa_spares.sav_config) != 0)
2781                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2782
2783                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2784                 spa_load_spares(spa);
2785                 spa_config_exit(spa, SCL_ALL, FTAG);
2786         } else if (error == 0) {
2787                 spa->spa_spares.sav_sync = B_TRUE;
2788         }
2789
2790         /*
2791          * Load any level 2 ARC devices for this pool.
2792          */
2793         error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2794             &spa->spa_l2cache.sav_object);
2795         if (error != 0 && error != ENOENT)
2796                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2797         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2798                 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2799                 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2800                     &spa->spa_l2cache.sav_config) != 0)
2801                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2802
2803                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2804                 spa_load_l2cache(spa);
2805                 spa_config_exit(spa, SCL_ALL, FTAG);
2806         } else if (error == 0) {
2807                 spa->spa_l2cache.sav_sync = B_TRUE;
2808         }
2809
2810         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2811
2812         error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2813         if (error && error != ENOENT)
2814                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2815
2816         if (error == 0) {
2817                 uint64_t autoreplace = 0;
2818
2819                 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2820                 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2821                 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2822                 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2823                 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2824                 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2825                     &spa->spa_dedup_ditto);
2826
2827                 spa->spa_autoreplace = (autoreplace != 0);
2828         }
2829
2830         /*
2831          * If the 'autoreplace' property is set, then post a resource notifying
2832          * the ZFS DE that it should not issue any faults for unopenable
2833          * devices.  We also iterate over the vdevs, and post a sysevent for any
2834          * unopenable vdevs so that the normal autoreplace handler can take
2835          * over.
2836          */
2837         if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2838                 spa_check_removed(spa->spa_root_vdev);
2839                 /*
2840                  * For the import case, this is done in spa_import(), because
2841                  * at this point we're using the spare definitions from
2842                  * the MOS config, not necessarily from the userland config.
2843                  */
2844                 if (state != SPA_LOAD_IMPORT) {
2845                         spa_aux_check_removed(&spa->spa_spares);
2846                         spa_aux_check_removed(&spa->spa_l2cache);
2847                 }
2848         }
2849
2850         /*
2851          * Load the vdev state for all toplevel vdevs.
2852          */
2853         vdev_load(rvd);
2854
2855         /*
2856          * Propagate the leaf DTLs we just loaded all the way up the tree.
2857          */
2858         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2859         vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2860         spa_config_exit(spa, SCL_ALL, FTAG);
2861
2862         /*
2863          * Load the DDTs (dedup tables).
2864          */
2865         error = ddt_load(spa);
2866         if (error != 0)
2867                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2868
2869         spa_update_dspace(spa);
2870
2871         /*
2872          * Validate the config, using the MOS config to fill in any
2873          * information which might be missing.  If we fail to validate
2874          * the config then declare the pool unfit for use. If we're
2875          * assembling a pool from a split, the log is not transferred
2876          * over.
2877          */
2878         if (type != SPA_IMPORT_ASSEMBLE) {
2879                 nvlist_t *nvconfig;
2880
2881                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2882                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2883
2884                 if (!spa_config_valid(spa, nvconfig)) {
2885                         nvlist_free(nvconfig);
2886                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2887                             ENXIO));
2888                 }
2889                 nvlist_free(nvconfig);
2890
2891                 /*
2892                  * Now that we've validated the config, check the state of the
2893                  * root vdev.  If it can't be opened, it indicates one or
2894                  * more toplevel vdevs are faulted.
2895                  */
2896                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2897                         return (SET_ERROR(ENXIO));
2898
2899                 if (spa_writeable(spa) && spa_check_logs(spa)) {
2900                         *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2901                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2902                 }
2903         }
2904
2905         if (missing_feat_write) {
2906                 ASSERT(state == SPA_LOAD_TRYIMPORT);
2907
2908                 /*
2909                  * At this point, we know that we can open the pool in
2910                  * read-only mode but not read-write mode. We now have enough
2911                  * information and can return to userland.
2912                  */
2913                 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2914         }
2915
2916         /*
2917          * We've successfully opened the pool, verify that we're ready
2918          * to start pushing transactions.
2919          */
2920         if (state != SPA_LOAD_TRYIMPORT) {
2921                 if ((error = spa_load_verify(spa)))
2922                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2923                             error));
2924         }
2925
2926         if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2927             spa->spa_load_max_txg == UINT64_MAX)) {
2928                 dmu_tx_t *tx;
2929                 int need_update = B_FALSE;
2930                 dsl_pool_t *dp = spa_get_dsl(spa);
2931                 int c;
2932
2933                 ASSERT(state != SPA_LOAD_TRYIMPORT);
2934
2935                 /*
2936                  * Claim log blocks that haven't been committed yet.
2937                  * This must all happen in a single txg.
2938                  * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2939                  * invoked from zil_claim_log_block()'s i/o done callback.
2940                  * Price of rollback is that we abandon the log.
2941                  */
2942                 spa->spa_claiming = B_TRUE;
2943
2944                 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
2945                 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2946                     zil_claim, tx, DS_FIND_CHILDREN);
2947                 dmu_tx_commit(tx);
2948
2949                 spa->spa_claiming = B_FALSE;
2950
2951                 spa_set_log_state(spa, SPA_LOG_GOOD);
2952                 spa->spa_sync_on = B_TRUE;
2953                 txg_sync_start(spa->spa_dsl_pool);
2954
2955                 /*
2956                  * Wait for all claims to sync.  We sync up to the highest
2957                  * claimed log block birth time so that claimed log blocks
2958                  * don't appear to be from the future.  spa_claim_max_txg
2959                  * will have been set for us by either zil_check_log_chain()
2960                  * (invoked from spa_check_logs()) or zil_claim() above.
2961                  */
2962                 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2963
2964                 /*
2965                  * If the config cache is stale, or we have uninitialized
2966                  * metaslabs (see spa_vdev_add()), then update the config.
2967                  *
2968                  * If this is a verbatim import, trust the current
2969                  * in-core spa_config and update the disk labels.
2970                  */
2971                 if (config_cache_txg != spa->spa_config_txg ||
2972                     state == SPA_LOAD_IMPORT ||
2973                     state == SPA_LOAD_RECOVER ||
2974                     (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2975                         need_update = B_TRUE;
2976
2977                 for (c = 0; c < rvd->vdev_children; c++)
2978                         if (rvd->vdev_child[c]->vdev_ms_array == 0)
2979                                 need_update = B_TRUE;
2980
2981                 /*
2982                  * Update the config cache asychronously in case we're the
2983                  * root pool, in which case the config cache isn't writable yet.
2984                  */
2985                 if (need_update)
2986                         spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2987
2988                 /*
2989                  * Check all DTLs to see if anything needs resilvering.
2990                  */
2991                 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2992                     vdev_resilver_needed(rvd, NULL, NULL))
2993                         spa_async_request(spa, SPA_ASYNC_RESILVER);
2994
2995                 /*
2996                  * Log the fact that we booted up (so that we can detect if
2997                  * we rebooted in the middle of an operation).
2998                  */
2999                 spa_history_log_version(spa, "open");
3000
3001                 /*
3002                  * Delete any inconsistent datasets.
3003                  */
3004                 (void) dmu_objset_find(spa_name(spa),
3005                     dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
3006
3007                 /*
3008                  * Clean up any stale temporary dataset userrefs.
3009                  */
3010                 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
3011         }
3012
3013         return (0);
3014 }
3015
3016 static int
3017 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
3018 {
3019         int mode = spa->spa_mode;
3020
3021         spa_unload(spa);
3022         spa_deactivate(spa);
3023
3024         spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
3025
3026         spa_activate(spa, mode);
3027         spa_async_suspend(spa);
3028
3029         return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
3030 }
3031
3032 /*
3033  * If spa_load() fails this function will try loading prior txg's. If
3034  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
3035  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
3036  * function will not rewind the pool and will return the same error as
3037  * spa_load().
3038  */
3039 static int
3040 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
3041     uint64_t max_request, int rewind_flags)
3042 {
3043         nvlist_t *loadinfo = NULL;
3044         nvlist_t *config = NULL;
3045         int load_error, rewind_error;
3046         uint64_t safe_rewind_txg;
3047         uint64_t min_txg;
3048
3049         if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
3050                 spa->spa_load_max_txg = spa->spa_load_txg;
3051                 spa_set_log_state(spa, SPA_LOG_CLEAR);
3052         } else {
3053                 spa->spa_load_max_txg = max_request;
3054                 if (max_request != UINT64_MAX)
3055                         spa->spa_extreme_rewind = B_TRUE;
3056         }
3057
3058         load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
3059             mosconfig);
3060         if (load_error == 0)
3061                 return (0);
3062
3063         if (spa->spa_root_vdev != NULL)
3064                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3065
3066         spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
3067         spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
3068
3069         if (rewind_flags & ZPOOL_NEVER_REWIND) {
3070                 nvlist_free(config);
3071                 return (load_error);
3072         }
3073
3074         if (state == SPA_LOAD_RECOVER) {
3075                 /* Price of rolling back is discarding txgs, including log */
3076                 spa_set_log_state(spa, SPA_LOG_CLEAR);
3077         } else {
3078                 /*
3079                  * If we aren't rolling back save the load info from our first
3080                  * import attempt so that we can restore it after attempting
3081                  * to rewind.
3082                  */
3083                 loadinfo = spa->spa_load_info;
3084                 spa->spa_load_info = fnvlist_alloc();
3085         }
3086
3087         spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
3088         safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
3089         min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
3090             TXG_INITIAL : safe_rewind_txg;
3091
3092         /*
3093          * Continue as long as we're finding errors, we're still within
3094          * the acceptable rewind range, and we're still finding uberblocks
3095          */
3096         while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
3097             spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
3098                 if (spa->spa_load_max_txg < safe_rewind_txg)
3099                         spa->spa_extreme_rewind = B_TRUE;
3100                 rewind_error = spa_load_retry(spa, state, mosconfig);
3101         }
3102
3103         spa->spa_extreme_rewind = B_FALSE;
3104         spa->spa_load_max_txg = UINT64_MAX;
3105
3106         if (config && (rewind_error || state != SPA_LOAD_RECOVER))
3107                 spa_config_set(spa, config);
3108         else
3109                 nvlist_free(config);
3110
3111         if (state == SPA_LOAD_RECOVER) {
3112                 ASSERT3P(loadinfo, ==, NULL);
3113                 return (rewind_error);
3114         } else {
3115                 /* Store the rewind info as part of the initial load info */
3116                 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
3117                     spa->spa_load_info);
3118
3119                 /* Restore the initial load info */
3120                 fnvlist_free(spa->spa_load_info);
3121                 spa->spa_load_info = loadinfo;
3122
3123                 return (load_error);
3124         }
3125 }
3126
3127 /*
3128  * Pool Open/Import
3129  *
3130  * The import case is identical to an open except that the configuration is sent
3131  * down from userland, instead of grabbed from the configuration cache.  For the
3132  * case of an open, the pool configuration will exist in the
3133  * POOL_STATE_UNINITIALIZED state.
3134  *
3135  * The stats information (gen/count/ustats) is used to gather vdev statistics at
3136  * the same time open the pool, without having to keep around the spa_t in some
3137  * ambiguous state.
3138  */
3139 static int
3140 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
3141     nvlist_t **config)
3142 {
3143         spa_t *spa;
3144         spa_load_state_t state = SPA_LOAD_OPEN;
3145         int error;
3146         int locked = B_FALSE;
3147         int firstopen = B_FALSE;
3148
3149         *spapp = NULL;
3150
3151         /*
3152          * As disgusting as this is, we need to support recursive calls to this
3153          * function because dsl_dir_open() is called during spa_load(), and ends
3154          * up calling spa_open() again.  The real fix is to figure out how to
3155          * avoid dsl_dir_open() calling this in the first place.
3156          */
3157         if (mutex_owner(&spa_namespace_lock) != curthread) {
3158                 mutex_enter(&spa_namespace_lock);
3159                 locked = B_TRUE;
3160         }
3161
3162         if ((spa = spa_lookup(pool)) == NULL) {
3163                 if (locked)
3164                         mutex_exit(&spa_namespace_lock);
3165                 return (SET_ERROR(ENOENT));
3166         }
3167
3168         if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
3169                 zpool_rewind_policy_t policy;
3170
3171                 firstopen = B_TRUE;
3172
3173                 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
3174                     &policy);
3175                 if (policy.zrp_request & ZPOOL_DO_REWIND)
3176                         state = SPA_LOAD_RECOVER;
3177
3178                 spa_activate(spa, spa_mode_global);
3179
3180                 if (state != SPA_LOAD_RECOVER)
3181                         spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3182
3183                 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
3184                     policy.zrp_request);
3185
3186                 if (error == EBADF) {
3187                         /*
3188                          * If vdev_validate() returns failure (indicated by
3189                          * EBADF), it indicates that one of the vdevs indicates
3190                          * that the pool has been exported or destroyed.  If
3191                          * this is the case, the config cache is out of sync and
3192                          * we should remove the pool from the namespace.
3193                          */
3194                         spa_unload(spa);
3195                         spa_deactivate(spa);
3196                         spa_config_sync(spa, B_TRUE, B_TRUE);
3197                         spa_remove(spa);
3198                         if (locked)
3199                                 mutex_exit(&spa_namespace_lock);
3200                         return (SET_ERROR(ENOENT));
3201                 }
3202
3203                 if (error) {
3204                         /*
3205                          * We can't open the pool, but we still have useful
3206                          * information: the state of each vdev after the
3207                          * attempted vdev_open().  Return this to the user.
3208                          */
3209                         if (config != NULL && spa->spa_config) {
3210                                 VERIFY(nvlist_dup(spa->spa_config, config,
3211                                     KM_SLEEP) == 0);
3212                                 VERIFY(nvlist_add_nvlist(*config,
3213                                     ZPOOL_CONFIG_LOAD_INFO,
3214                                     spa->spa_load_info) == 0);
3215                         }
3216                         spa_unload(spa);
3217                         spa_deactivate(spa);
3218                         spa->spa_last_open_failed = error;
3219                         if (locked)
3220                                 mutex_exit(&spa_namespace_lock);
3221                         *spapp = NULL;
3222                         return (error);
3223                 }
3224         }
3225
3226         spa_open_ref(spa, tag);
3227
3228         if (config != NULL)
3229                 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3230
3231         /*
3232          * If we've recovered the pool, pass back any information we
3233          * gathered while doing the load.
3234          */
3235         if (state == SPA_LOAD_RECOVER) {
3236                 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
3237                     spa->spa_load_info) == 0);
3238         }
3239
3240         if (locked) {
3241                 spa->spa_last_open_failed = 0;
3242                 spa->spa_last_ubsync_txg = 0;
3243                 spa->spa_load_txg = 0;
3244                 mutex_exit(&spa_namespace_lock);
3245         }
3246
3247         if (firstopen)
3248                 zvol_create_minors(spa, spa_name(spa), B_TRUE);
3249
3250         *spapp = spa;
3251
3252         return (0);
3253 }
3254
3255 int
3256 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
3257     nvlist_t **config)
3258 {
3259         return (spa_open_common(name, spapp, tag, policy, config));
3260 }
3261
3262 int
3263 spa_open(const char *name, spa_t **spapp, void *tag)
3264 {
3265         return (spa_open_common(name, spapp, tag, NULL, NULL));
3266 }
3267
3268 /*
3269  * Lookup the given spa_t, incrementing the inject count in the process,
3270  * preventing it from being exported or destroyed.
3271  */
3272 spa_t *
3273 spa_inject_addref(char *name)
3274 {
3275         spa_t *spa;
3276
3277         mutex_enter(&spa_namespace_lock);
3278         if ((spa = spa_lookup(name)) == NULL) {
3279                 mutex_exit(&spa_namespace_lock);
3280                 return (NULL);
3281         }
3282         spa->spa_inject_ref++;
3283         mutex_exit(&spa_namespace_lock);
3284
3285         return (spa);
3286 }
3287
3288 void
3289 spa_inject_delref(spa_t *spa)
3290 {
3291         mutex_enter(&spa_namespace_lock);
3292         spa->spa_inject_ref--;
3293         mutex_exit(&spa_namespace_lock);
3294 }
3295
3296 /*
3297  * Add spares device information to the nvlist.
3298  */
3299 static void
3300 spa_add_spares(spa_t *spa, nvlist_t *config)
3301 {
3302         nvlist_t **spares;
3303         uint_t i, nspares;
3304         nvlist_t *nvroot;
3305         uint64_t guid;
3306         vdev_stat_t *vs;
3307         uint_t vsc;
3308         uint64_t pool;
3309
3310         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3311
3312         if (spa->spa_spares.sav_count == 0)
3313                 return;
3314
3315         VERIFY(nvlist_lookup_nvlist(config,
3316             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3317         VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3318             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3319         if (nspares != 0) {
3320                 VERIFY(nvlist_add_nvlist_array(nvroot,
3321                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3322                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3323                     ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3324
3325                 /*
3326                  * Go through and find any spares which have since been
3327                  * repurposed as an active spare.  If this is the case, update
3328                  * their status appropriately.
3329                  */
3330                 for (i = 0; i < nspares; i++) {
3331                         VERIFY(nvlist_lookup_uint64(spares[i],
3332                             ZPOOL_CONFIG_GUID, &guid) == 0);
3333                         if (spa_spare_exists(guid, &pool, NULL) &&
3334                             pool != 0ULL) {
3335                                 VERIFY(nvlist_lookup_uint64_array(
3336                                     spares[i], ZPOOL_CONFIG_VDEV_STATS,
3337                                     (uint64_t **)&vs, &vsc) == 0);
3338                                 vs->vs_state = VDEV_STATE_CANT_OPEN;
3339                                 vs->vs_aux = VDEV_AUX_SPARED;
3340                         }
3341                 }
3342         }
3343 }
3344
3345 /*
3346  * Add l2cache device information to the nvlist, including vdev stats.
3347  */
3348 static void
3349 spa_add_l2cache(spa_t *spa, nvlist_t *config)
3350 {
3351         nvlist_t **l2cache;
3352         uint_t i, j, nl2cache;
3353         nvlist_t *nvroot;
3354         uint64_t guid;
3355         vdev_t *vd;
3356         vdev_stat_t *vs;
3357         uint_t vsc;
3358
3359         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3360
3361         if (spa->spa_l2cache.sav_count == 0)
3362                 return;
3363
3364         VERIFY(nvlist_lookup_nvlist(config,
3365             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3366         VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3367             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3368         if (nl2cache != 0) {
3369                 VERIFY(nvlist_add_nvlist_array(nvroot,
3370                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3371                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3372                     ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3373
3374                 /*
3375                  * Update level 2 cache device stats.
3376                  */
3377
3378                 for (i = 0; i < nl2cache; i++) {
3379                         VERIFY(nvlist_lookup_uint64(l2cache[i],
3380                             ZPOOL_CONFIG_GUID, &guid) == 0);
3381
3382                         vd = NULL;
3383                         for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3384                                 if (guid ==
3385                                     spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3386                                         vd = spa->spa_l2cache.sav_vdevs[j];
3387                                         break;
3388                                 }
3389                         }
3390                         ASSERT(vd != NULL);
3391
3392                         VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3393                             ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3394                             == 0);
3395                         vdev_get_stats(vd, vs);
3396                         vdev_config_generate_stats(vd, l2cache[i]);
3397
3398                 }
3399         }
3400 }
3401
3402 static void
3403 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
3404 {
3405         zap_cursor_t zc;
3406         zap_attribute_t za;
3407
3408         if (spa->spa_feat_for_read_obj != 0) {
3409                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3410                     spa->spa_feat_for_read_obj);
3411                     zap_cursor_retrieve(&zc, &za) == 0;
3412                     zap_cursor_advance(&zc)) {
3413                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3414                             za.za_num_integers == 1);
3415                         VERIFY0(nvlist_add_uint64(features, za.za_name,
3416                             za.za_first_integer));
3417                 }
3418                 zap_cursor_fini(&zc);
3419         }
3420
3421         if (spa->spa_feat_for_write_obj != 0) {
3422                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3423                     spa->spa_feat_for_write_obj);
3424                     zap_cursor_retrieve(&zc, &za) == 0;
3425                     zap_cursor_advance(&zc)) {
3426                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3427                             za.za_num_integers == 1);
3428                         VERIFY0(nvlist_add_uint64(features, za.za_name,
3429                             za.za_first_integer));
3430                 }
3431                 zap_cursor_fini(&zc);
3432         }
3433 }
3434
3435 static void
3436 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
3437 {
3438         int i;
3439
3440         for (i = 0; i < SPA_FEATURES; i++) {
3441                 zfeature_info_t feature = spa_feature_table[i];
3442                 uint64_t refcount;
3443
3444                 if (feature_get_refcount(spa, &feature, &refcount) != 0)
3445                         continue;
3446
3447                 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
3448         }
3449 }
3450
3451 /*
3452  * Store a list of pool features and their reference counts in the
3453  * config.
3454  *
3455  * The first time this is called on a spa, allocate a new nvlist, fetch
3456  * the pool features and reference counts from disk, then save the list
3457  * in the spa. In subsequent calls on the same spa use the saved nvlist
3458  * and refresh its values from the cached reference counts.  This
3459  * ensures we don't block here on I/O on a suspended pool so 'zpool
3460  * clear' can resume the pool.
3461  */
3462 static void
3463 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3464 {
3465         nvlist_t *features;
3466
3467         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3468
3469         mutex_enter(&spa->spa_feat_stats_lock);
3470         features = spa->spa_feat_stats;
3471
3472         if (features != NULL) {
3473                 spa_feature_stats_from_cache(spa, features);
3474         } else {
3475                 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
3476                 spa->spa_feat_stats = features;
3477                 spa_feature_stats_from_disk(spa, features);
3478         }
3479
3480         VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3481             features));
3482
3483         mutex_exit(&spa->spa_feat_stats_lock);
3484 }
3485
3486 int
3487 spa_get_stats(const char *name, nvlist_t **config,
3488     char *altroot, size_t buflen)
3489 {
3490         int error;
3491         spa_t *spa;
3492
3493         *config = NULL;
3494         error = spa_open_common(name, &spa, FTAG, NULL, config);
3495
3496         if (spa != NULL) {
3497                 /*
3498                  * This still leaves a window of inconsistency where the spares
3499                  * or l2cache devices could change and the config would be
3500                  * self-inconsistent.
3501                  */
3502                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3503
3504                 if (*config != NULL) {
3505                         uint64_t loadtimes[2];
3506
3507                         loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3508                         loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3509                         VERIFY(nvlist_add_uint64_array(*config,
3510                             ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3511
3512                         VERIFY(nvlist_add_uint64(*config,
3513                             ZPOOL_CONFIG_ERRCOUNT,
3514                             spa_get_errlog_size(spa)) == 0);
3515
3516                         if (spa_suspended(spa))
3517                                 VERIFY(nvlist_add_uint64(*config,
3518                                     ZPOOL_CONFIG_SUSPENDED,
3519                                     spa->spa_failmode) == 0);
3520
3521                         spa_add_spares(spa, *config);
3522                         spa_add_l2cache(spa, *config);
3523                         spa_add_feature_stats(spa, *config);
3524                 }
3525         }
3526
3527         /*
3528          * We want to get the alternate root even for faulted pools, so we cheat
3529          * and call spa_lookup() directly.
3530          */
3531         if (altroot) {
3532                 if (spa == NULL) {
3533                         mutex_enter(&spa_namespace_lock);
3534                         spa = spa_lookup(name);
3535                         if (spa)
3536                                 spa_altroot(spa, altroot, buflen);
3537                         else
3538                                 altroot[0] = '\0';
3539                         spa = NULL;
3540                         mutex_exit(&spa_namespace_lock);
3541                 } else {
3542                         spa_altroot(spa, altroot, buflen);
3543                 }
3544         }
3545
3546         if (spa != NULL) {
3547                 spa_config_exit(spa, SCL_CONFIG, FTAG);
3548                 spa_close(spa, FTAG);
3549         }
3550
3551         return (error);
3552 }
3553
3554 /*
3555  * Validate that the auxiliary device array is well formed.  We must have an
3556  * array of nvlists, each which describes a valid leaf vdev.  If this is an
3557  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3558  * specified, as long as they are well-formed.
3559  */
3560 static int
3561 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3562     spa_aux_vdev_t *sav, const char *config, uint64_t version,
3563     vdev_labeltype_t label)
3564 {
3565         nvlist_t **dev;
3566         uint_t i, ndev;
3567         vdev_t *vd;
3568         int error;
3569
3570         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3571
3572         /*
3573          * It's acceptable to have no devs specified.
3574          */
3575         if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3576                 return (0);
3577
3578         if (ndev == 0)
3579                 return (SET_ERROR(EINVAL));
3580
3581         /*
3582          * Make sure the pool is formatted with a version that supports this
3583          * device type.
3584          */
3585         if (spa_version(spa) < version)
3586                 return (SET_ERROR(ENOTSUP));
3587
3588         /*
3589          * Set the pending device list so we correctly handle device in-use
3590          * checking.
3591          */
3592         sav->sav_pending = dev;
3593         sav->sav_npending = ndev;
3594
3595         for (i = 0; i < ndev; i++) {
3596                 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3597                     mode)) != 0)
3598                         goto out;
3599
3600                 if (!vd->vdev_ops->vdev_op_leaf) {
3601                         vdev_free(vd);
3602                         error = SET_ERROR(EINVAL);
3603                         goto out;
3604                 }
3605
3606                 /*
3607                  * The L2ARC currently only supports disk devices in
3608                  * kernel context.  For user-level testing, we allow it.
3609                  */
3610 #ifdef _KERNEL
3611                 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3612                     strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3613                         error = SET_ERROR(ENOTBLK);
3614                         vdev_free(vd);
3615                         goto out;
3616                 }
3617 #endif
3618                 vd->vdev_top = vd;
3619
3620                 if ((error = vdev_open(vd)) == 0 &&
3621                     (error = vdev_label_init(vd, crtxg, label)) == 0) {
3622                         VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3623                             vd->vdev_guid) == 0);
3624                 }
3625
3626                 vdev_free(vd);
3627
3628                 if (error &&
3629                     (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3630                         goto out;
3631                 else
3632                         error = 0;
3633         }
3634
3635 out:
3636         sav->sav_pending = NULL;
3637         sav->sav_npending = 0;
3638         return (error);
3639 }
3640
3641 static int
3642 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3643 {
3644         int error;
3645
3646         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3647
3648         if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3649             &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3650             VDEV_LABEL_SPARE)) != 0) {
3651                 return (error);
3652         }
3653
3654         return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3655             &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3656             VDEV_LABEL_L2CACHE));
3657 }
3658
3659 static void
3660 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3661     const char *config)
3662 {
3663         int i;
3664
3665         if (sav->sav_config != NULL) {
3666                 nvlist_t **olddevs;
3667                 uint_t oldndevs;
3668                 nvlist_t **newdevs;
3669
3670                 /*
3671                  * Generate new dev list by concatentating with the
3672                  * current dev list.
3673                  */
3674                 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3675                     &olddevs, &oldndevs) == 0);
3676
3677                 newdevs = kmem_alloc(sizeof (void *) *
3678                     (ndevs + oldndevs), KM_SLEEP);
3679                 for (i = 0; i < oldndevs; i++)
3680                         VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3681                             KM_SLEEP) == 0);
3682                 for (i = 0; i < ndevs; i++)
3683                         VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3684                             KM_SLEEP) == 0);
3685
3686                 VERIFY(nvlist_remove(sav->sav_config, config,
3687                     DATA_TYPE_NVLIST_ARRAY) == 0);
3688
3689                 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3690                     config, newdevs, ndevs + oldndevs) == 0);
3691                 for (i = 0; i < oldndevs + ndevs; i++)
3692                         nvlist_free(newdevs[i]);
3693                 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3694         } else {
3695                 /*
3696                  * Generate a new dev list.
3697                  */
3698                 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3699                     KM_SLEEP) == 0);
3700                 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3701                     devs, ndevs) == 0);
3702         }
3703 }
3704
3705 /*
3706  * Stop and drop level 2 ARC devices
3707  */
3708 void
3709 spa_l2cache_drop(spa_t *spa)
3710 {
3711         vdev_t *vd;
3712         int i;
3713         spa_aux_vdev_t *sav = &spa->spa_l2cache;
3714
3715         for (i = 0; i < sav->sav_count; i++) {
3716                 uint64_t pool;
3717
3718                 vd = sav->sav_vdevs[i];
3719                 ASSERT(vd != NULL);
3720
3721                 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3722                     pool != 0ULL && l2arc_vdev_present(vd))
3723                         l2arc_remove_vdev(vd);
3724         }
3725 }
3726
3727 /*
3728  * Pool Creation
3729  */
3730 int
3731 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3732     nvlist_t *zplprops)
3733 {
3734         spa_t *spa;
3735         char *altroot = NULL;
3736         vdev_t *rvd;
3737         dsl_pool_t *dp;
3738         dmu_tx_t *tx;
3739         int error = 0;
3740         uint64_t txg = TXG_INITIAL;
3741         nvlist_t **spares, **l2cache;
3742         uint_t nspares, nl2cache;
3743         uint64_t version, obj;
3744         boolean_t has_features;
3745         nvpair_t *elem;
3746         int c, i;
3747         char *poolname;
3748         nvlist_t *nvl;
3749
3750         if (nvlist_lookup_string(props, "tname", &poolname) != 0)
3751                 poolname = (char *)pool;
3752
3753         /*
3754          * If this pool already exists, return failure.
3755          */
3756         mutex_enter(&spa_namespace_lock);
3757         if (spa_lookup(poolname) != NULL) {
3758                 mutex_exit(&spa_namespace_lock);
3759                 return (SET_ERROR(EEXIST));
3760         }
3761
3762         /*
3763          * Allocate a new spa_t structure.
3764          */
3765         nvl = fnvlist_alloc();
3766         fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
3767         (void) nvlist_lookup_string(props,
3768             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3769         spa = spa_add(poolname, nvl, altroot);
3770         fnvlist_free(nvl);
3771         spa_activate(spa, spa_mode_global);
3772
3773         if (props && (error = spa_prop_validate(spa, props))) {
3774                 spa_deactivate(spa);
3775                 spa_remove(spa);
3776                 mutex_exit(&spa_namespace_lock);
3777                 return (error);
3778         }
3779
3780         /*
3781          * Temporary pool names should never be written to disk.
3782          */
3783         if (poolname != pool)
3784                 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
3785
3786         has_features = B_FALSE;
3787         for (elem = nvlist_next_nvpair(props, NULL);
3788             elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3789                 if (zpool_prop_feature(nvpair_name(elem)))
3790                         has_features = B_TRUE;
3791         }
3792
3793         if (has_features || nvlist_lookup_uint64(props,
3794             zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3795                 version = SPA_VERSION;
3796         }
3797         ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3798
3799         spa->spa_first_txg = txg;
3800         spa->spa_uberblock.ub_txg = txg - 1;
3801         spa->spa_uberblock.ub_version = version;
3802         spa->spa_ubsync = spa->spa_uberblock;
3803
3804         /*
3805          * Create "The Godfather" zio to hold all async IOs
3806          */
3807         spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
3808             KM_SLEEP);
3809         for (i = 0; i < max_ncpus; i++) {
3810                 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
3811                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
3812                     ZIO_FLAG_GODFATHER);
3813         }
3814
3815         /*
3816          * Create the root vdev.
3817          */
3818         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3819
3820         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3821
3822         ASSERT(error != 0 || rvd != NULL);
3823         ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3824
3825         if (error == 0 && !zfs_allocatable_devs(nvroot))
3826                 error = SET_ERROR(EINVAL);
3827
3828         if (error == 0 &&
3829             (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3830             (error = spa_validate_aux(spa, nvroot, txg,
3831             VDEV_ALLOC_ADD)) == 0) {
3832                 for (c = 0; c < rvd->vdev_children; c++) {
3833                         vdev_metaslab_set_size(rvd->vdev_child[c]);
3834                         vdev_expand(rvd->vdev_child[c], txg);
3835                 }
3836         }
3837
3838         spa_config_exit(spa, SCL_ALL, FTAG);
3839
3840         if (error != 0) {
3841                 spa_unload(spa);
3842                 spa_deactivate(spa);
3843                 spa_remove(spa);
3844                 mutex_exit(&spa_namespace_lock);
3845                 return (error);
3846         }
3847
3848         /*
3849          * Get the list of spares, if specified.
3850          */
3851         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3852             &spares, &nspares) == 0) {
3853                 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3854                     KM_SLEEP) == 0);
3855                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3856                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3857                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3858                 spa_load_spares(spa);
3859                 spa_config_exit(spa, SCL_ALL, FTAG);
3860                 spa->spa_spares.sav_sync = B_TRUE;
3861         }
3862
3863         /*
3864          * Get the list of level 2 cache devices, if specified.
3865          */
3866         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3867             &l2cache, &nl2cache) == 0) {
3868                 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3869                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
3870                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3871                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3872                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3873                 spa_load_l2cache(spa);
3874                 spa_config_exit(spa, SCL_ALL, FTAG);
3875                 spa->spa_l2cache.sav_sync = B_TRUE;
3876         }
3877
3878         spa->spa_is_initializing = B_TRUE;
3879         spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3880         spa->spa_meta_objset = dp->dp_meta_objset;
3881         spa->spa_is_initializing = B_FALSE;
3882
3883         /*
3884          * Create DDTs (dedup tables).
3885          */
3886         ddt_create(spa);
3887
3888         spa_update_dspace(spa);
3889
3890         tx = dmu_tx_create_assigned(dp, txg);
3891
3892         /*
3893          * Create the pool config object.
3894          */
3895         spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3896             DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3897             DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3898
3899         if (zap_add(spa->spa_meta_objset,
3900             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3901             sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3902                 cmn_err(CE_PANIC, "failed to add pool config");
3903         }
3904
3905         if (spa_version(spa) >= SPA_VERSION_FEATURES)
3906                 spa_feature_create_zap_objects(spa, tx);
3907
3908         if (zap_add(spa->spa_meta_objset,
3909             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3910             sizeof (uint64_t), 1, &version, tx) != 0) {
3911                 cmn_err(CE_PANIC, "failed to add pool version");
3912         }
3913
3914         /* Newly created pools with the right version are always deflated. */
3915         if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3916                 spa->spa_deflate = TRUE;
3917                 if (zap_add(spa->spa_meta_objset,
3918                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3919                     sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3920                         cmn_err(CE_PANIC, "failed to add deflate");
3921                 }
3922         }
3923
3924         /*
3925          * Create the deferred-free bpobj.  Turn off compression
3926          * because sync-to-convergence takes longer if the blocksize
3927          * keeps changing.
3928          */
3929         obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3930         dmu_object_set_compress(spa->spa_meta_objset, obj,
3931             ZIO_COMPRESS_OFF, tx);
3932         if (zap_add(spa->spa_meta_objset,
3933             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3934             sizeof (uint64_t), 1, &obj, tx) != 0) {
3935                 cmn_err(CE_PANIC, "failed to add bpobj");
3936         }
3937         VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3938             spa->spa_meta_objset, obj));
3939
3940         /*
3941          * Create the pool's history object.
3942          */
3943         if (version >= SPA_VERSION_ZPOOL_HISTORY)
3944                 spa_history_create_obj(spa, tx);
3945
3946         /*
3947          * Generate some random noise for salted checksums to operate on.
3948          */
3949         (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
3950             sizeof (spa->spa_cksum_salt.zcs_bytes));
3951
3952         /*
3953          * Set pool properties.
3954          */
3955         spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3956         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3957         spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3958         spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3959
3960         if (props != NULL) {
3961                 spa_configfile_set(spa, props, B_FALSE);
3962                 spa_sync_props(props, tx);
3963         }
3964
3965         dmu_tx_commit(tx);
3966
3967         spa->spa_sync_on = B_TRUE;
3968         txg_sync_start(spa->spa_dsl_pool);
3969
3970         /*
3971          * We explicitly wait for the first transaction to complete so that our
3972          * bean counters are appropriately updated.
3973          */
3974         txg_wait_synced(spa->spa_dsl_pool, txg);
3975
3976         spa_config_sync(spa, B_FALSE, B_TRUE);
3977         spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE);
3978
3979         spa_history_log_version(spa, "create");
3980
3981         /*
3982          * Don't count references from objsets that are already closed
3983          * and are making their way through the eviction process.
3984          */
3985         spa_evicting_os_wait(spa);
3986         spa->spa_minref = refcount_count(&spa->spa_refcount);
3987
3988         mutex_exit(&spa_namespace_lock);
3989
3990         return (0);
3991 }
3992
3993 /*
3994  * Import a non-root pool into the system.
3995  */
3996 int
3997 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3998 {
3999         spa_t *spa;
4000         char *altroot = NULL;
4001         spa_load_state_t state = SPA_LOAD_IMPORT;
4002         zpool_rewind_policy_t policy;
4003         uint64_t mode = spa_mode_global;
4004         uint64_t readonly = B_FALSE;
4005         int error;
4006         nvlist_t *nvroot;
4007         nvlist_t **spares, **l2cache;
4008         uint_t nspares, nl2cache;
4009
4010         /*
4011          * If a pool with this name exists, return failure.
4012          */
4013         mutex_enter(&spa_namespace_lock);
4014         if (spa_lookup(pool) != NULL) {
4015                 mutex_exit(&spa_namespace_lock);
4016                 return (SET_ERROR(EEXIST));
4017         }
4018
4019         /*
4020          * Create and initialize the spa structure.
4021          */
4022         (void) nvlist_lookup_string(props,
4023             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4024         (void) nvlist_lookup_uint64(props,
4025             zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
4026         if (readonly)
4027                 mode = FREAD;
4028         spa = spa_add(pool, config, altroot);
4029         spa->spa_import_flags = flags;
4030
4031         /*
4032          * Verbatim import - Take a pool and insert it into the namespace
4033          * as if it had been loaded at boot.
4034          */
4035         if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
4036                 if (props != NULL)
4037                         spa_configfile_set(spa, props, B_FALSE);
4038
4039                 spa_config_sync(spa, B_FALSE, B_TRUE);
4040                 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
4041
4042                 mutex_exit(&spa_namespace_lock);
4043                 return (0);
4044         }
4045
4046         spa_activate(spa, mode);
4047
4048         /*
4049          * Don't start async tasks until we know everything is healthy.
4050          */
4051         spa_async_suspend(spa);
4052
4053         zpool_get_rewind_policy(config, &policy);
4054         if (policy.zrp_request & ZPOOL_DO_REWIND)
4055                 state = SPA_LOAD_RECOVER;
4056
4057         /*
4058          * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
4059          * because the user-supplied config is actually the one to trust when
4060          * doing an import.
4061          */
4062         if (state != SPA_LOAD_RECOVER)
4063                 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4064
4065         error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
4066             policy.zrp_request);
4067
4068         /*
4069          * Propagate anything learned while loading the pool and pass it
4070          * back to caller (i.e. rewind info, missing devices, etc).
4071          */
4072         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4073             spa->spa_load_info) == 0);
4074
4075         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4076         /*
4077          * Toss any existing sparelist, as it doesn't have any validity
4078          * anymore, and conflicts with spa_has_spare().
4079          */
4080         if (spa->spa_spares.sav_config) {
4081                 nvlist_free(spa->spa_spares.sav_config);
4082                 spa->spa_spares.sav_config = NULL;
4083                 spa_load_spares(spa);
4084         }
4085         if (spa->spa_l2cache.sav_config) {
4086                 nvlist_free(spa->spa_l2cache.sav_config);
4087                 spa->spa_l2cache.sav_config = NULL;
4088                 spa_load_l2cache(spa);
4089         }
4090
4091         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4092             &nvroot) == 0);
4093         if (error == 0)
4094                 error = spa_validate_aux(spa, nvroot, -1ULL,
4095                     VDEV_ALLOC_SPARE);
4096         if (error == 0)
4097                 error = spa_validate_aux(spa, nvroot, -1ULL,
4098                     VDEV_ALLOC_L2CACHE);
4099         spa_config_exit(spa, SCL_ALL, FTAG);
4100
4101         if (props != NULL)
4102                 spa_configfile_set(spa, props, B_FALSE);
4103
4104         if (error != 0 || (props && spa_writeable(spa) &&
4105             (error = spa_prop_set(spa, props)))) {
4106                 spa_unload(spa);
4107                 spa_deactivate(spa);
4108                 spa_remove(spa);
4109                 mutex_exit(&spa_namespace_lock);
4110                 return (error);
4111         }
4112
4113         spa_async_resume(spa);
4114
4115         /*
4116          * Override any spares and level 2 cache devices as specified by
4117          * the user, as these may have correct device names/devids, etc.
4118          */
4119         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4120             &spares, &nspares) == 0) {
4121                 if (spa->spa_spares.sav_config)
4122                         VERIFY(nvlist_remove(spa->spa_spares.sav_config,
4123                             ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
4124                 else
4125                         VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
4126                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
4127                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4128                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4129                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4130                 spa_load_spares(spa);
4131                 spa_config_exit(spa, SCL_ALL, FTAG);
4132                 spa->spa_spares.sav_sync = B_TRUE;
4133         }
4134         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4135             &l2cache, &nl2cache) == 0) {
4136                 if (spa->spa_l2cache.sav_config)
4137                         VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
4138                             ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
4139                 else
4140                         VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4141                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
4142                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4143                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4144                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4145                 spa_load_l2cache(spa);
4146                 spa_config_exit(spa, SCL_ALL, FTAG);
4147                 spa->spa_l2cache.sav_sync = B_TRUE;
4148         }
4149
4150         /*
4151          * Check for any removed devices.
4152          */
4153         if (spa->spa_autoreplace) {
4154                 spa_aux_check_removed(&spa->spa_spares);
4155                 spa_aux_check_removed(&spa->spa_l2cache);
4156         }
4157
4158         if (spa_writeable(spa)) {
4159                 /*
4160                  * Update the config cache to include the newly-imported pool.
4161                  */
4162                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4163         }
4164
4165         /*
4166          * It's possible that the pool was expanded while it was exported.
4167          * We kick off an async task to handle this for us.
4168          */
4169         spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4170
4171         spa_history_log_version(spa, "import");
4172
4173         spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
4174
4175         zvol_create_minors(spa, pool, B_TRUE);
4176
4177         mutex_exit(&spa_namespace_lock);
4178
4179         return (0);
4180 }
4181
4182 nvlist_t *
4183 spa_tryimport(nvlist_t *tryconfig)
4184 {
4185         nvlist_t *config = NULL;
4186         char *poolname;
4187         spa_t *spa;
4188         uint64_t state;
4189         int error;
4190
4191         if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4192                 return (NULL);
4193
4194         if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4195                 return (NULL);
4196
4197         /*
4198          * Create and initialize the spa structure.
4199          */
4200         mutex_enter(&spa_namespace_lock);
4201         spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
4202         spa_activate(spa, FREAD);
4203
4204         /*
4205          * Pass off the heavy lifting to spa_load().
4206          * Pass TRUE for mosconfig because the user-supplied config
4207          * is actually the one to trust when doing an import.
4208          */
4209         error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
4210
4211         /*
4212          * If 'tryconfig' was at least parsable, return the current config.
4213          */
4214         if (spa->spa_root_vdev != NULL) {
4215                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4216                 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4217                     poolname) == 0);
4218                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4219                     state) == 0);
4220                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4221                     spa->spa_uberblock.ub_timestamp) == 0);
4222                 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4223                     spa->spa_load_info) == 0);
4224                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
4225                     spa->spa_errata) == 0);
4226
4227                 /*
4228                  * If the bootfs property exists on this pool then we
4229                  * copy it out so that external consumers can tell which
4230                  * pools are bootable.
4231                  */
4232                 if ((!error || error == EEXIST) && spa->spa_bootfs) {
4233                         char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4234
4235                         /*
4236                          * We have to play games with the name since the
4237                          * pool was opened as TRYIMPORT_NAME.
4238                          */
4239                         if (dsl_dsobj_to_dsname(spa_name(spa),
4240                             spa->spa_bootfs, tmpname) == 0) {
4241                                 char *cp;
4242                                 char *dsname;
4243
4244                                 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4245
4246                                 cp = strchr(tmpname, '/');
4247                                 if (cp == NULL) {
4248                                         (void) strlcpy(dsname, tmpname,
4249                                             MAXPATHLEN);
4250                                 } else {
4251                                         (void) snprintf(dsname, MAXPATHLEN,
4252                                             "%s/%s", poolname, ++cp);
4253                                 }
4254                                 VERIFY(nvlist_add_string(config,
4255                                     ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4256                                 kmem_free(dsname, MAXPATHLEN);
4257                         }
4258                         kmem_free(tmpname, MAXPATHLEN);
4259                 }
4260
4261                 /*
4262                  * Add the list of hot spares and level 2 cache devices.
4263                  */
4264                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4265                 spa_add_spares(spa, config);
4266                 spa_add_l2cache(spa, config);
4267                 spa_config_exit(spa, SCL_CONFIG, FTAG);
4268         }
4269
4270         spa_unload(spa);
4271         spa_deactivate(spa);
4272         spa_remove(spa);
4273         mutex_exit(&spa_namespace_lock);
4274
4275         return (config);
4276 }
4277
4278 /*
4279  * Pool export/destroy
4280  *
4281  * The act of destroying or exporting a pool is very simple.  We make sure there
4282  * is no more pending I/O and any references to the pool are gone.  Then, we
4283  * update the pool state and sync all the labels to disk, removing the
4284  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4285  * we don't sync the labels or remove the configuration cache.
4286  */
4287 static int
4288 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4289     boolean_t force, boolean_t hardforce)
4290 {
4291         spa_t *spa;
4292
4293         if (oldconfig)
4294                 *oldconfig = NULL;
4295
4296         if (!(spa_mode_global & FWRITE))
4297                 return (SET_ERROR(EROFS));
4298
4299         mutex_enter(&spa_namespace_lock);
4300         if ((spa = spa_lookup(pool)) == NULL) {
4301                 mutex_exit(&spa_namespace_lock);
4302                 return (SET_ERROR(ENOENT));
4303         }
4304
4305         /*
4306          * Put a hold on the pool, drop the namespace lock, stop async tasks,
4307          * reacquire the namespace lock, and see if we can export.
4308          */
4309         spa_open_ref(spa, FTAG);
4310         mutex_exit(&spa_namespace_lock);
4311         spa_async_suspend(spa);
4312         if (spa->spa_zvol_taskq) {
4313                 zvol_remove_minors(spa, spa_name(spa), B_TRUE);
4314                 taskq_wait(spa->spa_zvol_taskq);
4315         }
4316         mutex_enter(&spa_namespace_lock);
4317         spa_close(spa, FTAG);
4318
4319         if (spa->spa_state == POOL_STATE_UNINITIALIZED)
4320                 goto export_spa;
4321         /*
4322          * The pool will be in core if it's openable, in which case we can
4323          * modify its state.  Objsets may be open only because they're dirty,
4324          * so we have to force it to sync before checking spa_refcnt.
4325          */
4326         if (spa->spa_sync_on) {
4327                 txg_wait_synced(spa->spa_dsl_pool, 0);
4328                 spa_evicting_os_wait(spa);
4329         }
4330
4331         /*
4332          * A pool cannot be exported or destroyed if there are active
4333          * references.  If we are resetting a pool, allow references by
4334          * fault injection handlers.
4335          */
4336         if (!spa_refcount_zero(spa) ||
4337             (spa->spa_inject_ref != 0 &&
4338             new_state != POOL_STATE_UNINITIALIZED)) {
4339                 spa_async_resume(spa);
4340                 mutex_exit(&spa_namespace_lock);
4341                 return (SET_ERROR(EBUSY));
4342         }
4343
4344         if (spa->spa_sync_on) {
4345                 /*
4346                  * A pool cannot be exported if it has an active shared spare.
4347                  * This is to prevent other pools stealing the active spare
4348                  * from an exported pool. At user's own will, such pool can
4349                  * be forcedly exported.
4350                  */
4351                 if (!force && new_state == POOL_STATE_EXPORTED &&
4352                     spa_has_active_shared_spare(spa)) {
4353                         spa_async_resume(spa);
4354                         mutex_exit(&spa_namespace_lock);
4355                         return (SET_ERROR(EXDEV));
4356                 }
4357
4358                 /*
4359                  * We want this to be reflected on every label,
4360                  * so mark them all dirty.  spa_unload() will do the
4361                  * final sync that pushes these changes out.
4362                  */
4363                 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4364                         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4365                         spa->spa_state = new_state;
4366                         spa->spa_final_txg = spa_last_synced_txg(spa) +
4367                             TXG_DEFER_SIZE + 1;
4368                         vdev_config_dirty(spa->spa_root_vdev);
4369                         spa_config_exit(spa, SCL_ALL, FTAG);
4370                 }
4371         }
4372
4373 export_spa:
4374         spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4375
4376         if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4377                 spa_unload(spa);
4378                 spa_deactivate(spa);
4379         }
4380
4381         if (oldconfig && spa->spa_config)
4382                 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4383
4384         if (new_state != POOL_STATE_UNINITIALIZED) {
4385                 if (!hardforce)
4386                         spa_config_sync(spa, B_TRUE, B_TRUE);
4387                 spa_remove(spa);
4388         }
4389         mutex_exit(&spa_namespace_lock);
4390
4391         return (0);
4392 }
4393
4394 /*
4395  * Destroy a storage pool.
4396  */
4397 int
4398 spa_destroy(char *pool)
4399 {
4400         return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4401             B_FALSE, B_FALSE));
4402 }
4403
4404 /*
4405  * Export a storage pool.
4406  */
4407 int
4408 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4409     boolean_t hardforce)
4410 {
4411         return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4412             force, hardforce));
4413 }
4414
4415 /*
4416  * Similar to spa_export(), this unloads the spa_t without actually removing it
4417  * from the namespace in any way.
4418  */
4419 int
4420 spa_reset(char *pool)
4421 {
4422         return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4423             B_FALSE, B_FALSE));
4424 }
4425
4426 /*
4427  * ==========================================================================
4428  * Device manipulation
4429  * ==========================================================================
4430  */
4431
4432 /*
4433  * Add a device to a storage pool.
4434  */
4435 int
4436 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4437 {
4438         uint64_t txg, id;
4439         int error;
4440         vdev_t *rvd = spa->spa_root_vdev;
4441         vdev_t *vd, *tvd;
4442         nvlist_t **spares, **l2cache;
4443         uint_t nspares, nl2cache;
4444         int c;
4445
4446         ASSERT(spa_writeable(spa));
4447
4448         txg = spa_vdev_enter(spa);
4449
4450         if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4451             VDEV_ALLOC_ADD)) != 0)
4452                 return (spa_vdev_exit(spa, NULL, txg, error));
4453
4454         spa->spa_pending_vdev = vd;     /* spa_vdev_exit() will clear this */
4455
4456         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4457             &nspares) != 0)
4458                 nspares = 0;
4459
4460         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4461             &nl2cache) != 0)
4462                 nl2cache = 0;
4463
4464         if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4465                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
4466
4467         if (vd->vdev_children != 0 &&
4468             (error = vdev_create(vd, txg, B_FALSE)) != 0)
4469                 return (spa_vdev_exit(spa, vd, txg, error));
4470
4471         /*
4472          * We must validate the spares and l2cache devices after checking the
4473          * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
4474          */
4475         if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4476                 return (spa_vdev_exit(spa, vd, txg, error));
4477
4478         /*
4479          * Transfer each new top-level vdev from vd to rvd.
4480          */
4481         for (c = 0; c < vd->vdev_children; c++) {
4482
4483                 /*
4484                  * Set the vdev id to the first hole, if one exists.
4485                  */
4486                 for (id = 0; id < rvd->vdev_children; id++) {
4487                         if (rvd->vdev_child[id]->vdev_ishole) {
4488                                 vdev_free(rvd->vdev_child[id]);
4489                                 break;
4490                         }
4491                 }
4492                 tvd = vd->vdev_child[c];
4493                 vdev_remove_child(vd, tvd);
4494                 tvd->vdev_id = id;
4495                 vdev_add_child(rvd, tvd);
4496                 vdev_config_dirty(tvd);
4497         }
4498
4499         if (nspares != 0) {
4500                 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4501                     ZPOOL_CONFIG_SPARES);
4502                 spa_load_spares(spa);
4503                 spa->spa_spares.sav_sync = B_TRUE;
4504         }
4505
4506         if (nl2cache != 0) {
4507                 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4508                     ZPOOL_CONFIG_L2CACHE);
4509                 spa_load_l2cache(spa);
4510                 spa->spa_l2cache.sav_sync = B_TRUE;
4511         }
4512
4513         /*
4514          * We have to be careful when adding new vdevs to an existing pool.
4515          * If other threads start allocating from these vdevs before we
4516          * sync the config cache, and we lose power, then upon reboot we may
4517          * fail to open the pool because there are DVAs that the config cache
4518          * can't translate.  Therefore, we first add the vdevs without
4519          * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4520          * and then let spa_config_update() initialize the new metaslabs.
4521          *
4522          * spa_load() checks for added-but-not-initialized vdevs, so that
4523          * if we lose power at any point in this sequence, the remaining
4524          * steps will be completed the next time we load the pool.
4525          */
4526         (void) spa_vdev_exit(spa, vd, txg, 0);
4527
4528         mutex_enter(&spa_namespace_lock);
4529         spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4530         spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD);
4531         mutex_exit(&spa_namespace_lock);
4532
4533         return (0);
4534 }
4535
4536 /*
4537  * Attach a device to a mirror.  The arguments are the path to any device
4538  * in the mirror, and the nvroot for the new device.  If the path specifies
4539  * a device that is not mirrored, we automatically insert the mirror vdev.
4540  *
4541  * If 'replacing' is specified, the new device is intended to replace the
4542  * existing device; in this case the two devices are made into their own
4543  * mirror using the 'replacing' vdev, which is functionally identical to
4544  * the mirror vdev (it actually reuses all the same ops) but has a few
4545  * extra rules: you can't attach to it after it's been created, and upon
4546  * completion of resilvering, the first disk (the one being replaced)
4547  * is automatically detached.
4548  */
4549 int
4550 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4551 {
4552         uint64_t txg, dtl_max_txg;
4553         vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4554         vdev_ops_t *pvops;
4555         char *oldvdpath, *newvdpath;
4556         int newvd_isspare;
4557         int error;
4558         ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
4559
4560         ASSERT(spa_writeable(spa));
4561
4562         txg = spa_vdev_enter(spa);
4563
4564         oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4565
4566         if (oldvd == NULL)
4567                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4568
4569         if (!oldvd->vdev_ops->vdev_op_leaf)
4570                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4571
4572         pvd = oldvd->vdev_parent;
4573
4574         if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4575             VDEV_ALLOC_ATTACH)) != 0)
4576                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4577
4578         if (newrootvd->vdev_children != 1)
4579                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4580
4581         newvd = newrootvd->vdev_child[0];
4582
4583         if (!newvd->vdev_ops->vdev_op_leaf)
4584                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4585
4586         if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4587                 return (spa_vdev_exit(spa, newrootvd, txg, error));
4588
4589         /*
4590          * Spares can't replace logs
4591          */
4592         if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4593                 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4594
4595         if (!replacing) {
4596                 /*
4597                  * For attach, the only allowable parent is a mirror or the root
4598                  * vdev.
4599                  */
4600                 if (pvd->vdev_ops != &vdev_mirror_ops &&
4601                     pvd->vdev_ops != &vdev_root_ops)
4602                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4603
4604                 pvops = &vdev_mirror_ops;
4605         } else {
4606                 /*
4607                  * Active hot spares can only be replaced by inactive hot
4608                  * spares.
4609                  */
4610                 if (pvd->vdev_ops == &vdev_spare_ops &&
4611                     oldvd->vdev_isspare &&
4612                     !spa_has_spare(spa, newvd->vdev_guid))
4613                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4614
4615                 /*
4616                  * If the source is a hot spare, and the parent isn't already a
4617                  * spare, then we want to create a new hot spare.  Otherwise, we
4618                  * want to create a replacing vdev.  The user is not allowed to
4619                  * attach to a spared vdev child unless the 'isspare' state is
4620                  * the same (spare replaces spare, non-spare replaces
4621                  * non-spare).
4622                  */
4623                 if (pvd->vdev_ops == &vdev_replacing_ops &&
4624                     spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4625                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4626                 } else if (pvd->vdev_ops == &vdev_spare_ops &&
4627                     newvd->vdev_isspare != oldvd->vdev_isspare) {
4628                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4629                 }
4630
4631                 if (newvd->vdev_isspare)
4632                         pvops = &vdev_spare_ops;
4633                 else
4634                         pvops = &vdev_replacing_ops;
4635         }
4636
4637         /*
4638          * Make sure the new device is big enough.
4639          */
4640         if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4641                 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4642
4643         /*
4644          * The new device cannot have a higher alignment requirement
4645          * than the top-level vdev.
4646          */
4647         if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4648                 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4649
4650         /*
4651          * If this is an in-place replacement, update oldvd's path and devid
4652          * to make it distinguishable from newvd, and unopenable from now on.
4653          */
4654         if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4655                 spa_strfree(oldvd->vdev_path);
4656                 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4657                     KM_SLEEP);
4658                 (void) sprintf(oldvd->vdev_path, "%s/%s",
4659                     newvd->vdev_path, "old");
4660                 if (oldvd->vdev_devid != NULL) {
4661                         spa_strfree(oldvd->vdev_devid);
4662                         oldvd->vdev_devid = NULL;
4663                 }
4664         }
4665
4666         /* mark the device being resilvered */
4667         newvd->vdev_resilver_txg = txg;
4668
4669         /*
4670          * If the parent is not a mirror, or if we're replacing, insert the new
4671          * mirror/replacing/spare vdev above oldvd.
4672          */
4673         if (pvd->vdev_ops != pvops)
4674                 pvd = vdev_add_parent(oldvd, pvops);
4675
4676         ASSERT(pvd->vdev_top->vdev_parent == rvd);
4677         ASSERT(pvd->vdev_ops == pvops);
4678         ASSERT(oldvd->vdev_parent == pvd);
4679
4680         /*
4681          * Extract the new device from its root and add it to pvd.
4682          */
4683         vdev_remove_child(newrootvd, newvd);
4684         newvd->vdev_id = pvd->vdev_children;
4685         newvd->vdev_crtxg = oldvd->vdev_crtxg;
4686         vdev_add_child(pvd, newvd);
4687
4688         tvd = newvd->vdev_top;
4689         ASSERT(pvd->vdev_top == tvd);
4690         ASSERT(tvd->vdev_parent == rvd);
4691
4692         vdev_config_dirty(tvd);
4693
4694         /*
4695          * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4696          * for any dmu_sync-ed blocks.  It will propagate upward when
4697          * spa_vdev_exit() calls vdev_dtl_reassess().
4698          */
4699         dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4700
4701         vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4702             dtl_max_txg - TXG_INITIAL);
4703
4704         if (newvd->vdev_isspare) {
4705                 spa_spare_activate(newvd);
4706                 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4707         }
4708
4709         oldvdpath = spa_strdup(oldvd->vdev_path);
4710         newvdpath = spa_strdup(newvd->vdev_path);
4711         newvd_isspare = newvd->vdev_isspare;
4712
4713         /*
4714          * Mark newvd's DTL dirty in this txg.
4715          */
4716         vdev_dirty(tvd, VDD_DTL, newvd, txg);
4717
4718         /*
4719          * Schedule the resilver to restart in the future. We do this to
4720          * ensure that dmu_sync-ed blocks have been stitched into the
4721          * respective datasets.
4722          */
4723         dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4724
4725         if (spa->spa_bootfs)
4726                 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4727
4728         spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH);
4729
4730         /*
4731          * Commit the config
4732          */
4733         (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4734
4735         spa_history_log_internal(spa, "vdev attach", NULL,
4736             "%s vdev=%s %s vdev=%s",
4737             replacing && newvd_isspare ? "spare in" :
4738             replacing ? "replace" : "attach", newvdpath,
4739             replacing ? "for" : "to", oldvdpath);
4740
4741         spa_strfree(oldvdpath);
4742         spa_strfree(newvdpath);
4743
4744         return (0);
4745 }
4746
4747 /*
4748  * Detach a device from a mirror or replacing vdev.
4749  *
4750  * If 'replace_done' is specified, only detach if the parent
4751  * is a replacing vdev.
4752  */
4753 int
4754 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4755 {
4756         uint64_t txg;
4757         int error;
4758         vdev_t *vd, *pvd, *cvd, *tvd;
4759         boolean_t unspare = B_FALSE;
4760         uint64_t unspare_guid = 0;
4761         char *vdpath;
4762         int c, t;
4763         ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
4764         ASSERT(spa_writeable(spa));
4765
4766         txg = spa_vdev_enter(spa);
4767
4768         vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4769
4770         if (vd == NULL)
4771                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4772
4773         if (!vd->vdev_ops->vdev_op_leaf)
4774                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4775
4776         pvd = vd->vdev_parent;
4777
4778         /*
4779          * If the parent/child relationship is not as expected, don't do it.
4780          * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4781          * vdev that's replacing B with C.  The user's intent in replacing
4782          * is to go from M(A,B) to M(A,C).  If the user decides to cancel
4783          * the replace by detaching C, the expected behavior is to end up
4784          * M(A,B).  But suppose that right after deciding to detach C,
4785          * the replacement of B completes.  We would have M(A,C), and then
4786          * ask to detach C, which would leave us with just A -- not what
4787          * the user wanted.  To prevent this, we make sure that the
4788          * parent/child relationship hasn't changed -- in this example,
4789          * that C's parent is still the replacing vdev R.
4790          */
4791         if (pvd->vdev_guid != pguid && pguid != 0)
4792                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4793
4794         /*
4795          * Only 'replacing' or 'spare' vdevs can be replaced.
4796          */
4797         if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4798             pvd->vdev_ops != &vdev_spare_ops)
4799                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4800
4801         ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4802             spa_version(spa) >= SPA_VERSION_SPARES);
4803
4804         /*
4805          * Only mirror, replacing, and spare vdevs support detach.
4806          */
4807         if (pvd->vdev_ops != &vdev_replacing_ops &&
4808             pvd->vdev_ops != &vdev_mirror_ops &&
4809             pvd->vdev_ops != &vdev_spare_ops)
4810                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4811
4812         /*
4813          * If this device has the only valid copy of some data,
4814          * we cannot safely detach it.
4815          */
4816         if (vdev_dtl_required(vd))
4817                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4818
4819         ASSERT(pvd->vdev_children >= 2);
4820
4821         /*
4822          * If we are detaching the second disk from a replacing vdev, then
4823          * check to see if we changed the original vdev's path to have "/old"
4824          * at the end in spa_vdev_attach().  If so, undo that change now.
4825          */
4826         if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4827             vd->vdev_path != NULL) {
4828                 size_t len = strlen(vd->vdev_path);
4829
4830                 for (c = 0; c < pvd->vdev_children; c++) {
4831                         cvd = pvd->vdev_child[c];
4832
4833                         if (cvd == vd || cvd->vdev_path == NULL)
4834                                 continue;
4835
4836                         if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4837                             strcmp(cvd->vdev_path + len, "/old") == 0) {
4838                                 spa_strfree(cvd->vdev_path);
4839                                 cvd->vdev_path = spa_strdup(vd->vdev_path);
4840                                 break;
4841                         }
4842                 }
4843         }
4844
4845         /*
4846          * If we are detaching the original disk from a spare, then it implies
4847          * that the spare should become a real disk, and be removed from the
4848          * active spare list for the pool.
4849          */
4850         if (pvd->vdev_ops == &vdev_spare_ops &&
4851             vd->vdev_id == 0 &&
4852             pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4853                 unspare = B_TRUE;
4854
4855         /*
4856          * Erase the disk labels so the disk can be used for other things.
4857          * This must be done after all other error cases are handled,
4858          * but before we disembowel vd (so we can still do I/O to it).
4859          * But if we can't do it, don't treat the error as fatal --
4860          * it may be that the unwritability of the disk is the reason
4861          * it's being detached!
4862          */
4863         error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4864
4865         /*
4866          * Remove vd from its parent and compact the parent's children.
4867          */
4868         vdev_remove_child(pvd, vd);
4869         vdev_compact_children(pvd);
4870
4871         /*
4872          * Remember one of the remaining children so we can get tvd below.
4873          */
4874         cvd = pvd->vdev_child[pvd->vdev_children - 1];
4875
4876         /*
4877          * If we need to remove the remaining child from the list of hot spares,
4878          * do it now, marking the vdev as no longer a spare in the process.
4879          * We must do this before vdev_remove_parent(), because that can
4880          * change the GUID if it creates a new toplevel GUID.  For a similar
4881          * reason, we must remove the spare now, in the same txg as the detach;
4882          * otherwise someone could attach a new sibling, change the GUID, and
4883          * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4884          */
4885         if (unspare) {
4886                 ASSERT(cvd->vdev_isspare);
4887                 spa_spare_remove(cvd);
4888                 unspare_guid = cvd->vdev_guid;
4889                 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4890                 cvd->vdev_unspare = B_TRUE;
4891         }
4892
4893         /*
4894          * If the parent mirror/replacing vdev only has one child,
4895          * the parent is no longer needed.  Remove it from the tree.
4896          */
4897         if (pvd->vdev_children == 1) {
4898                 if (pvd->vdev_ops == &vdev_spare_ops)
4899                         cvd->vdev_unspare = B_FALSE;
4900                 vdev_remove_parent(cvd);
4901         }
4902
4903
4904         /*
4905          * We don't set tvd until now because the parent we just removed
4906          * may have been the previous top-level vdev.
4907          */
4908         tvd = cvd->vdev_top;
4909         ASSERT(tvd->vdev_parent == rvd);
4910
4911         /*
4912          * Reevaluate the parent vdev state.
4913          */
4914         vdev_propagate_state(cvd);
4915
4916         /*
4917          * If the 'autoexpand' property is set on the pool then automatically
4918          * try to expand the size of the pool. For example if the device we
4919          * just detached was smaller than the others, it may be possible to
4920          * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4921          * first so that we can obtain the updated sizes of the leaf vdevs.
4922          */
4923         if (spa->spa_autoexpand) {
4924                 vdev_reopen(tvd);
4925                 vdev_expand(tvd, txg);
4926         }
4927
4928         vdev_config_dirty(tvd);
4929
4930         /*
4931          * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
4932          * vd->vdev_detached is set and free vd's DTL object in syncing context.
4933          * But first make sure we're not on any *other* txg's DTL list, to
4934          * prevent vd from being accessed after it's freed.
4935          */
4936         vdpath = spa_strdup(vd->vdev_path);
4937         for (t = 0; t < TXG_SIZE; t++)
4938                 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4939         vd->vdev_detached = B_TRUE;
4940         vdev_dirty(tvd, VDD_DTL, vd, txg);
4941
4942         spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4943
4944         /* hang on to the spa before we release the lock */
4945         spa_open_ref(spa, FTAG);
4946
4947         error = spa_vdev_exit(spa, vd, txg, 0);
4948
4949         spa_history_log_internal(spa, "detach", NULL,
4950             "vdev=%s", vdpath);
4951         spa_strfree(vdpath);
4952
4953         /*
4954          * If this was the removal of the original device in a hot spare vdev,
4955          * then we want to go through and remove the device from the hot spare
4956          * list of every other pool.
4957          */
4958         if (unspare) {
4959                 spa_t *altspa = NULL;
4960
4961                 mutex_enter(&spa_namespace_lock);
4962                 while ((altspa = spa_next(altspa)) != NULL) {
4963                         if (altspa->spa_state != POOL_STATE_ACTIVE ||
4964                             altspa == spa)
4965                                 continue;
4966
4967                         spa_open_ref(altspa, FTAG);
4968                         mutex_exit(&spa_namespace_lock);
4969                         (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4970                         mutex_enter(&spa_namespace_lock);
4971                         spa_close(altspa, FTAG);
4972                 }
4973                 mutex_exit(&spa_namespace_lock);
4974
4975                 /* search the rest of the vdevs for spares to remove */
4976                 spa_vdev_resilver_done(spa);
4977         }
4978
4979         /* all done with the spa; OK to release */
4980         mutex_enter(&spa_namespace_lock);
4981         spa_close(spa, FTAG);
4982         mutex_exit(&spa_namespace_lock);
4983
4984         return (error);
4985 }
4986
4987 /*
4988  * Split a set of devices from their mirrors, and create a new pool from them.
4989  */
4990 int
4991 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4992     nvlist_t *props, boolean_t exp)
4993 {
4994         int error = 0;
4995         uint64_t txg, *glist;
4996         spa_t *newspa;
4997         uint_t c, children, lastlog;
4998         nvlist_t **child, *nvl, *tmp;
4999         dmu_tx_t *tx;
5000         char *altroot = NULL;
5001         vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
5002         boolean_t activate_slog;
5003
5004         ASSERT(spa_writeable(spa));
5005
5006         txg = spa_vdev_enter(spa);
5007
5008         /* clear the log and flush everything up to now */
5009         activate_slog = spa_passivate_log(spa);
5010         (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5011         error = spa_offline_log(spa);
5012         txg = spa_vdev_config_enter(spa);
5013
5014         if (activate_slog)
5015                 spa_activate_log(spa);
5016
5017         if (error != 0)
5018                 return (spa_vdev_exit(spa, NULL, txg, error));
5019
5020         /* check new spa name before going any further */
5021         if (spa_lookup(newname) != NULL)
5022                 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
5023
5024         /*
5025          * scan through all the children to ensure they're all mirrors
5026          */
5027         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
5028             nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
5029             &children) != 0)
5030                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5031
5032         /* first, check to ensure we've got the right child count */
5033         rvd = spa->spa_root_vdev;
5034         lastlog = 0;
5035         for (c = 0; c < rvd->vdev_children; c++) {
5036                 vdev_t *vd = rvd->vdev_child[c];
5037
5038                 /* don't count the holes & logs as children */
5039                 if (vd->vdev_islog || vd->vdev_ishole) {
5040                         if (lastlog == 0)
5041                                 lastlog = c;
5042                         continue;
5043                 }
5044
5045                 lastlog = 0;
5046         }
5047         if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
5048                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5049
5050         /* next, ensure no spare or cache devices are part of the split */
5051         if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
5052             nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
5053                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5054
5055         vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
5056         glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
5057
5058         /* then, loop over each vdev and validate it */
5059         for (c = 0; c < children; c++) {
5060                 uint64_t is_hole = 0;
5061
5062                 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
5063                     &is_hole);
5064
5065                 if (is_hole != 0) {
5066                         if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
5067                             spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
5068                                 continue;
5069                         } else {
5070                                 error = SET_ERROR(EINVAL);
5071                                 break;
5072                         }
5073                 }
5074
5075                 /* which disk is going to be split? */
5076                 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
5077                     &glist[c]) != 0) {
5078                         error = SET_ERROR(EINVAL);
5079                         break;
5080                 }
5081
5082                 /* look it up in the spa */
5083                 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
5084                 if (vml[c] == NULL) {
5085                         error = SET_ERROR(ENODEV);
5086                         break;
5087                 }
5088
5089                 /* make sure there's nothing stopping the split */
5090                 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
5091                     vml[c]->vdev_islog ||
5092                     vml[c]->vdev_ishole ||
5093                     vml[c]->vdev_isspare ||
5094                     vml[c]->vdev_isl2cache ||
5095                     !vdev_writeable(vml[c]) ||
5096                     vml[c]->vdev_children != 0 ||
5097                     vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
5098                     c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
5099                         error = SET_ERROR(EINVAL);
5100                         break;
5101                 }
5102
5103                 if (vdev_dtl_required(vml[c])) {
5104                         error = SET_ERROR(EBUSY);
5105                         break;
5106                 }
5107
5108                 /* we need certain info from the top level */
5109                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
5110                     vml[c]->vdev_top->vdev_ms_array) == 0);
5111                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
5112                     vml[c]->vdev_top->vdev_ms_shift) == 0);
5113                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
5114                     vml[c]->vdev_top->vdev_asize) == 0);
5115                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
5116                     vml[c]->vdev_top->vdev_ashift) == 0);
5117
5118                 /* transfer per-vdev ZAPs */
5119                 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
5120                 VERIFY0(nvlist_add_uint64(child[c],
5121                     ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
5122
5123                 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
5124                 VERIFY0(nvlist_add_uint64(child[c],
5125                     ZPOOL_CONFIG_VDEV_TOP_ZAP,
5126                     vml[c]->vdev_parent->vdev_top_zap));
5127         }
5128
5129         if (error != 0) {
5130                 kmem_free(vml, children * sizeof (vdev_t *));
5131                 kmem_free(glist, children * sizeof (uint64_t));
5132                 return (spa_vdev_exit(spa, NULL, txg, error));
5133         }
5134
5135         /* stop writers from using the disks */
5136         for (c = 0; c < children; c++) {
5137                 if (vml[c] != NULL)
5138                         vml[c]->vdev_offline = B_TRUE;
5139         }
5140         vdev_reopen(spa->spa_root_vdev);
5141
5142         /*
5143          * Temporarily record the splitting vdevs in the spa config.  This
5144          * will disappear once the config is regenerated.
5145          */
5146         VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5147         VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
5148             glist, children) == 0);
5149         kmem_free(glist, children * sizeof (uint64_t));
5150
5151         mutex_enter(&spa->spa_props_lock);
5152         VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
5153             nvl) == 0);
5154         mutex_exit(&spa->spa_props_lock);
5155         spa->spa_config_splitting = nvl;
5156         vdev_config_dirty(spa->spa_root_vdev);
5157
5158         /* configure and create the new pool */
5159         VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
5160         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5161             exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
5162         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5163             spa_version(spa)) == 0);
5164         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5165             spa->spa_config_txg) == 0);
5166         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5167             spa_generate_guid(NULL)) == 0);
5168         VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
5169         (void) nvlist_lookup_string(props,
5170             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5171
5172         /* add the new pool to the namespace */
5173         newspa = spa_add(newname, config, altroot);
5174         newspa->spa_avz_action = AVZ_ACTION_REBUILD;
5175         newspa->spa_config_txg = spa->spa_config_txg;
5176         spa_set_log_state(newspa, SPA_LOG_CLEAR);
5177
5178         /* release the spa config lock, retaining the namespace lock */
5179         spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5180
5181         if (zio_injection_enabled)
5182                 zio_handle_panic_injection(spa, FTAG, 1);
5183
5184         spa_activate(newspa, spa_mode_global);
5185         spa_async_suspend(newspa);
5186
5187         /* create the new pool from the disks of the original pool */
5188         error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
5189         if (error)
5190                 goto out;
5191
5192         /* if that worked, generate a real config for the new pool */
5193         if (newspa->spa_root_vdev != NULL) {
5194                 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
5195                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
5196                 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5197                     ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5198                 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5199                     B_TRUE));
5200         }
5201
5202         /* set the props */
5203         if (props != NULL) {
5204                 spa_configfile_set(newspa, props, B_FALSE);
5205                 error = spa_prop_set(newspa, props);
5206                 if (error)
5207                         goto out;
5208         }
5209
5210         /* flush everything */
5211         txg = spa_vdev_config_enter(newspa);
5212         vdev_config_dirty(newspa->spa_root_vdev);
5213         (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5214
5215         if (zio_injection_enabled)
5216                 zio_handle_panic_injection(spa, FTAG, 2);
5217
5218         spa_async_resume(newspa);
5219
5220         /* finally, update the original pool's config */
5221         txg = spa_vdev_config_enter(spa);
5222         tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5223         error = dmu_tx_assign(tx, TXG_WAIT);
5224         if (error != 0)
5225                 dmu_tx_abort(tx);
5226         for (c = 0; c < children; c++) {
5227                 if (vml[c] != NULL) {
5228                         vdev_split(vml[c]);
5229                         if (error == 0)
5230                                 spa_history_log_internal(spa, "detach", tx,
5231                                     "vdev=%s", vml[c]->vdev_path);
5232
5233                         vdev_free(vml[c]);
5234                 }
5235         }
5236         spa->spa_avz_action = AVZ_ACTION_REBUILD;
5237         vdev_config_dirty(spa->spa_root_vdev);
5238         spa->spa_config_splitting = NULL;
5239         nvlist_free(nvl);
5240         if (error == 0)
5241                 dmu_tx_commit(tx);
5242         (void) spa_vdev_exit(spa, NULL, txg, 0);
5243
5244         if (zio_injection_enabled)
5245                 zio_handle_panic_injection(spa, FTAG, 3);
5246
5247         /* split is complete; log a history record */
5248         spa_history_log_internal(newspa, "split", NULL,
5249             "from pool %s", spa_name(spa));
5250
5251         kmem_free(vml, children * sizeof (vdev_t *));
5252
5253         /* if we're not going to mount the filesystems in userland, export */
5254         if (exp)
5255                 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5256                     B_FALSE, B_FALSE);
5257
5258         return (error);
5259
5260 out:
5261         spa_unload(newspa);
5262         spa_deactivate(newspa);
5263         spa_remove(newspa);
5264
5265         txg = spa_vdev_config_enter(spa);
5266
5267         /* re-online all offlined disks */
5268         for (c = 0; c < children; c++) {
5269                 if (vml[c] != NULL)
5270                         vml[c]->vdev_offline = B_FALSE;
5271         }
5272         vdev_reopen(spa->spa_root_vdev);
5273
5274         nvlist_free(spa->spa_config_splitting);
5275         spa->spa_config_splitting = NULL;
5276         (void) spa_vdev_exit(spa, NULL, txg, error);
5277
5278         kmem_free(vml, children * sizeof (vdev_t *));
5279         return (error);
5280 }
5281
5282 static nvlist_t *
5283 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5284 {
5285         int i;
5286
5287         for (i = 0; i < count; i++) {
5288                 uint64_t guid;
5289
5290                 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5291                     &guid) == 0);
5292
5293                 if (guid == target_guid)
5294                         return (nvpp[i]);
5295         }
5296
5297         return (NULL);
5298 }
5299
5300 static void
5301 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5302         nvlist_t *dev_to_remove)
5303 {
5304         nvlist_t **newdev = NULL;
5305         int i, j;
5306
5307         if (count > 1)
5308                 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5309
5310         for (i = 0, j = 0; i < count; i++) {
5311                 if (dev[i] == dev_to_remove)
5312                         continue;
5313                 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5314         }
5315
5316         VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5317         VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5318
5319         for (i = 0; i < count - 1; i++)
5320                 nvlist_free(newdev[i]);
5321
5322         if (count > 1)
5323                 kmem_free(newdev, (count - 1) * sizeof (void *));
5324 }
5325
5326 /*
5327  * Evacuate the device.
5328  */
5329 static int
5330 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5331 {
5332         uint64_t txg;
5333         int error = 0;
5334
5335         ASSERT(MUTEX_HELD(&spa_namespace_lock));
5336         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5337         ASSERT(vd == vd->vdev_top);
5338
5339         /*
5340          * Evacuate the device.  We don't hold the config lock as writer
5341          * since we need to do I/O but we do keep the
5342          * spa_namespace_lock held.  Once this completes the device
5343          * should no longer have any blocks allocated on it.
5344          */
5345         if (vd->vdev_islog) {
5346                 if (vd->vdev_stat.vs_alloc != 0)
5347                         error = spa_offline_log(spa);
5348         } else {
5349                 error = SET_ERROR(ENOTSUP);
5350         }
5351
5352         if (error)
5353                 return (error);
5354
5355         /*
5356          * The evacuation succeeded.  Remove any remaining MOS metadata
5357          * associated with this vdev, and wait for these changes to sync.
5358          */
5359         ASSERT0(vd->vdev_stat.vs_alloc);
5360         txg = spa_vdev_config_enter(spa);
5361         vd->vdev_removing = B_TRUE;
5362         vdev_dirty_leaves(vd, VDD_DTL, txg);
5363         vdev_config_dirty(vd);
5364         spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5365
5366         return (0);
5367 }
5368
5369 /*
5370  * Complete the removal by cleaning up the namespace.
5371  */
5372 static void
5373 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5374 {
5375         vdev_t *rvd = spa->spa_root_vdev;
5376         uint64_t id = vd->vdev_id;
5377         boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5378
5379         ASSERT(MUTEX_HELD(&spa_namespace_lock));
5380         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5381         ASSERT(vd == vd->vdev_top);
5382
5383         /*
5384          * Only remove any devices which are empty.
5385          */
5386         if (vd->vdev_stat.vs_alloc != 0)
5387                 return;
5388
5389         (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5390
5391         if (list_link_active(&vd->vdev_state_dirty_node))
5392                 vdev_state_clean(vd);
5393         if (list_link_active(&vd->vdev_config_dirty_node))
5394                 vdev_config_clean(vd);
5395
5396         vdev_free(vd);
5397
5398         if (last_vdev) {
5399                 vdev_compact_children(rvd);
5400         } else {
5401                 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5402                 vdev_add_child(rvd, vd);
5403         }
5404         vdev_config_dirty(rvd);
5405
5406         /*
5407          * Reassess the health of our root vdev.
5408          */
5409         vdev_reopen(rvd);
5410 }
5411
5412 /*
5413  * Remove a device from the pool -
5414  *
5415  * Removing a device from the vdev namespace requires several steps
5416  * and can take a significant amount of time.  As a result we use
5417  * the spa_vdev_config_[enter/exit] functions which allow us to
5418  * grab and release the spa_config_lock while still holding the namespace
5419  * lock.  During each step the configuration is synced out.
5420  *
5421  * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5422  * devices.
5423  */
5424 int
5425 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5426 {
5427         vdev_t *vd;
5428         metaslab_group_t *mg;
5429         nvlist_t **spares, **l2cache, *nv;
5430         uint64_t txg = 0;
5431         uint_t nspares, nl2cache;
5432         int error = 0;
5433         boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5434
5435         ASSERT(spa_writeable(spa));
5436
5437         if (!locked)
5438                 txg = spa_vdev_enter(spa);
5439
5440         vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5441
5442         if (spa->spa_spares.sav_vdevs != NULL &&
5443             nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5444             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5445             (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5446                 /*
5447                  * Only remove the hot spare if it's not currently in use
5448                  * in this pool.
5449                  */
5450                 if (vd == NULL || unspare) {
5451                         spa_vdev_remove_aux(spa->spa_spares.sav_config,
5452                             ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5453                         spa_load_spares(spa);
5454                         spa->spa_spares.sav_sync = B_TRUE;
5455                 } else {
5456                         error = SET_ERROR(EBUSY);
5457                 }
5458                 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
5459         } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5460             nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5461             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5462             (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5463                 /*
5464                  * Cache devices can always be removed.
5465                  */
5466                 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5467                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5468                 spa_load_l2cache(spa);
5469                 spa->spa_l2cache.sav_sync = B_TRUE;
5470                 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
5471         } else if (vd != NULL && vd->vdev_islog) {
5472                 ASSERT(!locked);
5473                 ASSERT(vd == vd->vdev_top);
5474
5475                 mg = vd->vdev_mg;
5476
5477                 /*
5478                  * Stop allocating from this vdev.
5479                  */
5480                 metaslab_group_passivate(mg);
5481
5482                 /*
5483                  * Wait for the youngest allocations and frees to sync,
5484                  * and then wait for the deferral of those frees to finish.
5485                  */
5486                 spa_vdev_config_exit(spa, NULL,
5487                     txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5488
5489                 /*
5490                  * Attempt to evacuate the vdev.
5491                  */
5492                 error = spa_vdev_remove_evacuate(spa, vd);
5493
5494                 txg = spa_vdev_config_enter(spa);
5495
5496                 /*
5497                  * If we couldn't evacuate the vdev, unwind.
5498                  */
5499                 if (error) {
5500                         metaslab_group_activate(mg);
5501                         return (spa_vdev_exit(spa, NULL, txg, error));
5502                 }
5503
5504                 /*
5505                  * Clean up the vdev namespace.
5506                  */
5507                 spa_vdev_remove_from_namespace(spa, vd);
5508
5509                 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV);
5510         } else if (vd != NULL) {
5511                 /*
5512                  * Normal vdevs cannot be removed (yet).
5513                  */
5514                 error = SET_ERROR(ENOTSUP);
5515         } else {
5516                 /*
5517                  * There is no vdev of any kind with the specified guid.
5518                  */
5519                 error = SET_ERROR(ENOENT);
5520         }
5521
5522         if (!locked)
5523                 return (spa_vdev_exit(spa, NULL, txg, error));
5524
5525         return (error);
5526 }
5527
5528 /*
5529  * Find any device that's done replacing, or a vdev marked 'unspare' that's
5530  * currently spared, so we can detach it.
5531  */
5532 static vdev_t *
5533 spa_vdev_resilver_done_hunt(vdev_t *vd)
5534 {
5535         vdev_t *newvd, *oldvd;
5536         int c;
5537
5538         for (c = 0; c < vd->vdev_children; c++) {
5539                 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5540                 if (oldvd != NULL)
5541                         return (oldvd);
5542         }
5543
5544         /*
5545          * Check for a completed replacement.  We always consider the first
5546          * vdev in the list to be the oldest vdev, and the last one to be
5547          * the newest (see spa_vdev_attach() for how that works).  In
5548          * the case where the newest vdev is faulted, we will not automatically
5549          * remove it after a resilver completes.  This is OK as it will require
5550          * user intervention to determine which disk the admin wishes to keep.
5551          */
5552         if (vd->vdev_ops == &vdev_replacing_ops) {
5553                 ASSERT(vd->vdev_children > 1);
5554
5555                 newvd = vd->vdev_child[vd->vdev_children - 1];
5556                 oldvd = vd->vdev_child[0];
5557
5558                 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5559                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5560                     !vdev_dtl_required(oldvd))
5561                         return (oldvd);
5562         }
5563
5564         /*
5565          * Check for a completed resilver with the 'unspare' flag set.
5566          */
5567         if (vd->vdev_ops == &vdev_spare_ops) {
5568                 vdev_t *first = vd->vdev_child[0];
5569                 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5570
5571                 if (last->vdev_unspare) {
5572                         oldvd = first;
5573                         newvd = last;
5574                 } else if (first->vdev_unspare) {
5575                         oldvd = last;
5576                         newvd = first;
5577                 } else {
5578                         oldvd = NULL;
5579                 }
5580
5581                 if (oldvd != NULL &&
5582                     vdev_dtl_empty(newvd, DTL_MISSING) &&
5583                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5584                     !vdev_dtl_required(oldvd))
5585                         return (oldvd);
5586
5587                 /*
5588                  * If there are more than two spares attached to a disk,
5589                  * and those spares are not required, then we want to
5590                  * attempt to free them up now so that they can be used
5591                  * by other pools.  Once we're back down to a single
5592                  * disk+spare, we stop removing them.
5593                  */
5594                 if (vd->vdev_children > 2) {
5595                         newvd = vd->vdev_child[1];
5596
5597                         if (newvd->vdev_isspare && last->vdev_isspare &&
5598                             vdev_dtl_empty(last, DTL_MISSING) &&
5599                             vdev_dtl_empty(last, DTL_OUTAGE) &&
5600                             !vdev_dtl_required(newvd))
5601                                 return (newvd);
5602                 }
5603         }
5604
5605         return (NULL);
5606 }
5607
5608 static void
5609 spa_vdev_resilver_done(spa_t *spa)
5610 {
5611         vdev_t *vd, *pvd, *ppvd;
5612         uint64_t guid, sguid, pguid, ppguid;
5613
5614         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5615
5616         while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5617                 pvd = vd->vdev_parent;
5618                 ppvd = pvd->vdev_parent;
5619                 guid = vd->vdev_guid;
5620                 pguid = pvd->vdev_guid;
5621                 ppguid = ppvd->vdev_guid;
5622                 sguid = 0;
5623                 /*
5624                  * If we have just finished replacing a hot spared device, then
5625                  * we need to detach the parent's first child (the original hot
5626                  * spare) as well.
5627                  */
5628                 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5629                     ppvd->vdev_children == 2) {
5630                         ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5631                         sguid = ppvd->vdev_child[1]->vdev_guid;
5632                 }
5633                 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
5634
5635                 spa_config_exit(spa, SCL_ALL, FTAG);
5636                 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5637                         return;
5638                 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5639                         return;
5640                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5641         }
5642
5643         spa_config_exit(spa, SCL_ALL, FTAG);
5644 }
5645
5646 /*
5647  * Update the stored path or FRU for this vdev.
5648  */
5649 int
5650 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5651     boolean_t ispath)
5652 {
5653         vdev_t *vd;
5654         boolean_t sync = B_FALSE;
5655
5656         ASSERT(spa_writeable(spa));
5657
5658         spa_vdev_state_enter(spa, SCL_ALL);
5659
5660         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5661                 return (spa_vdev_state_exit(spa, NULL, ENOENT));
5662
5663         if (!vd->vdev_ops->vdev_op_leaf)
5664                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5665
5666         if (ispath) {
5667                 if (strcmp(value, vd->vdev_path) != 0) {
5668                         spa_strfree(vd->vdev_path);
5669                         vd->vdev_path = spa_strdup(value);
5670                         sync = B_TRUE;
5671                 }
5672         } else {
5673                 if (vd->vdev_fru == NULL) {
5674                         vd->vdev_fru = spa_strdup(value);
5675                         sync = B_TRUE;
5676                 } else if (strcmp(value, vd->vdev_fru) != 0) {
5677                         spa_strfree(vd->vdev_fru);
5678                         vd->vdev_fru = spa_strdup(value);
5679                         sync = B_TRUE;
5680                 }
5681         }
5682
5683         return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5684 }
5685
5686 int
5687 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5688 {
5689         return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5690 }
5691
5692 int
5693 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5694 {
5695         return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5696 }
5697
5698 /*
5699  * ==========================================================================
5700  * SPA Scanning
5701  * ==========================================================================
5702  */
5703
5704 int
5705 spa_scan_stop(spa_t *spa)
5706 {
5707         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5708         if (dsl_scan_resilvering(spa->spa_dsl_pool))
5709                 return (SET_ERROR(EBUSY));
5710         return (dsl_scan_cancel(spa->spa_dsl_pool));
5711 }
5712
5713 int
5714 spa_scan(spa_t *spa, pool_scan_func_t func)
5715 {
5716         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5717
5718         if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5719                 return (SET_ERROR(ENOTSUP));
5720
5721         /*
5722          * If a resilver was requested, but there is no DTL on a
5723          * writeable leaf device, we have nothing to do.
5724          */
5725         if (func == POOL_SCAN_RESILVER &&
5726             !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5727                 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5728                 return (0);
5729         }
5730
5731         return (dsl_scan(spa->spa_dsl_pool, func));
5732 }
5733
5734 /*
5735  * ==========================================================================
5736  * SPA async task processing
5737  * ==========================================================================
5738  */
5739
5740 static void
5741 spa_async_remove(spa_t *spa, vdev_t *vd)
5742 {
5743         int c;
5744
5745         if (vd->vdev_remove_wanted) {
5746                 vd->vdev_remove_wanted = B_FALSE;
5747                 vd->vdev_delayed_close = B_FALSE;
5748                 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5749
5750                 /*
5751                  * We want to clear the stats, but we don't want to do a full
5752                  * vdev_clear() as that will cause us to throw away
5753                  * degraded/faulted state as well as attempt to reopen the
5754                  * device, all of which is a waste.
5755                  */
5756                 vd->vdev_stat.vs_read_errors = 0;
5757                 vd->vdev_stat.vs_write_errors = 0;
5758                 vd->vdev_stat.vs_checksum_errors = 0;
5759
5760                 vdev_state_dirty(vd->vdev_top);
5761         }
5762
5763         for (c = 0; c < vd->vdev_children; c++)
5764                 spa_async_remove(spa, vd->vdev_child[c]);
5765 }
5766
5767 static void
5768 spa_async_probe(spa_t *spa, vdev_t *vd)
5769 {
5770         int c;
5771
5772         if (vd->vdev_probe_wanted) {
5773                 vd->vdev_probe_wanted = B_FALSE;
5774                 vdev_reopen(vd);        /* vdev_open() does the actual probe */
5775         }
5776
5777         for (c = 0; c < vd->vdev_children; c++)
5778                 spa_async_probe(spa, vd->vdev_child[c]);
5779 }
5780
5781 static void
5782 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5783 {
5784         int c;
5785
5786         if (!spa->spa_autoexpand)
5787                 return;
5788
5789         for (c = 0; c < vd->vdev_children; c++) {
5790                 vdev_t *cvd = vd->vdev_child[c];
5791                 spa_async_autoexpand(spa, cvd);
5792         }
5793
5794         if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5795                 return;
5796
5797         spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_AUTOEXPAND);
5798 }
5799
5800 static void
5801 spa_async_thread(spa_t *spa)
5802 {
5803         int tasks, i;
5804
5805         ASSERT(spa->spa_sync_on);
5806
5807         mutex_enter(&spa->spa_async_lock);
5808         tasks = spa->spa_async_tasks;
5809         spa->spa_async_tasks = 0;
5810         mutex_exit(&spa->spa_async_lock);
5811
5812         /*
5813          * See if the config needs to be updated.
5814          */
5815         if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5816                 uint64_t old_space, new_space;
5817
5818                 mutex_enter(&spa_namespace_lock);
5819                 old_space = metaslab_class_get_space(spa_normal_class(spa));
5820                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5821                 new_space = metaslab_class_get_space(spa_normal_class(spa));
5822                 mutex_exit(&spa_namespace_lock);
5823
5824                 /*
5825                  * If the pool grew as a result of the config update,
5826                  * then log an internal history event.
5827                  */
5828                 if (new_space != old_space) {
5829                         spa_history_log_internal(spa, "vdev online", NULL,
5830                             "pool '%s' size: %llu(+%llu)",
5831                             spa_name(spa), new_space, new_space - old_space);
5832                 }
5833         }
5834
5835         /*
5836          * See if any devices need to be marked REMOVED.
5837          */
5838         if (tasks & SPA_ASYNC_REMOVE) {
5839                 spa_vdev_state_enter(spa, SCL_NONE);
5840                 spa_async_remove(spa, spa->spa_root_vdev);
5841                 for (i = 0; i < spa->spa_l2cache.sav_count; i++)
5842                         spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5843                 for (i = 0; i < spa->spa_spares.sav_count; i++)
5844                         spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5845                 (void) spa_vdev_state_exit(spa, NULL, 0);
5846         }
5847
5848         if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5849                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5850                 spa_async_autoexpand(spa, spa->spa_root_vdev);
5851                 spa_config_exit(spa, SCL_CONFIG, FTAG);
5852         }
5853
5854         /*
5855          * See if any devices need to be probed.
5856          */
5857         if (tasks & SPA_ASYNC_PROBE) {
5858                 spa_vdev_state_enter(spa, SCL_NONE);
5859                 spa_async_probe(spa, spa->spa_root_vdev);
5860                 (void) spa_vdev_state_exit(spa, NULL, 0);
5861         }
5862
5863         /*
5864          * If any devices are done replacing, detach them.
5865          */
5866         if (tasks & SPA_ASYNC_RESILVER_DONE)
5867                 spa_vdev_resilver_done(spa);
5868
5869         /*
5870          * Kick off a resilver.
5871          */
5872         if (tasks & SPA_ASYNC_RESILVER)
5873                 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5874
5875         /*
5876          * Let the world know that we're done.
5877          */
5878         mutex_enter(&spa->spa_async_lock);
5879         spa->spa_async_thread = NULL;
5880         cv_broadcast(&spa->spa_async_cv);
5881         mutex_exit(&spa->spa_async_lock);
5882         thread_exit();
5883 }
5884
5885 void
5886 spa_async_suspend(spa_t *spa)
5887 {
5888         mutex_enter(&spa->spa_async_lock);
5889         spa->spa_async_suspended++;
5890         while (spa->spa_async_thread != NULL)
5891                 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5892         mutex_exit(&spa->spa_async_lock);
5893 }
5894
5895 void
5896 spa_async_resume(spa_t *spa)
5897 {
5898         mutex_enter(&spa->spa_async_lock);
5899         ASSERT(spa->spa_async_suspended != 0);
5900         spa->spa_async_suspended--;
5901         mutex_exit(&spa->spa_async_lock);
5902 }
5903
5904 static boolean_t
5905 spa_async_tasks_pending(spa_t *spa)
5906 {
5907         uint_t non_config_tasks;
5908         uint_t config_task;
5909         boolean_t config_task_suspended;
5910
5911         non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
5912         config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
5913         if (spa->spa_ccw_fail_time == 0) {
5914                 config_task_suspended = B_FALSE;
5915         } else {
5916                 config_task_suspended =
5917                     (gethrtime() - spa->spa_ccw_fail_time) <
5918                     (zfs_ccw_retry_interval * NANOSEC);
5919         }
5920
5921         return (non_config_tasks || (config_task && !config_task_suspended));
5922 }
5923
5924 static void
5925 spa_async_dispatch(spa_t *spa)
5926 {
5927         mutex_enter(&spa->spa_async_lock);
5928         if (spa_async_tasks_pending(spa) &&
5929             !spa->spa_async_suspended &&
5930             spa->spa_async_thread == NULL &&
5931             rootdir != NULL)
5932                 spa->spa_async_thread = thread_create(NULL, 0,
5933                     spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5934         mutex_exit(&spa->spa_async_lock);
5935 }
5936
5937 void
5938 spa_async_request(spa_t *spa, int task)
5939 {
5940         zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5941         mutex_enter(&spa->spa_async_lock);
5942         spa->spa_async_tasks |= task;
5943         mutex_exit(&spa->spa_async_lock);
5944 }
5945
5946 /*
5947  * ==========================================================================
5948  * SPA syncing routines
5949  * ==========================================================================
5950  */
5951
5952 static int
5953 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5954 {
5955         bpobj_t *bpo = arg;
5956         bpobj_enqueue(bpo, bp, tx);
5957         return (0);
5958 }
5959
5960 static int
5961 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5962 {
5963         zio_t *zio = arg;
5964
5965         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5966             zio->io_flags));
5967         return (0);
5968 }
5969
5970 /*
5971  * Note: this simple function is not inlined to make it easier to dtrace the
5972  * amount of time spent syncing frees.
5973  */
5974 static void
5975 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
5976 {
5977         zio_t *zio = zio_root(spa, NULL, NULL, 0);
5978         bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
5979         VERIFY(zio_wait(zio) == 0);
5980 }
5981
5982 /*
5983  * Note: this simple function is not inlined to make it easier to dtrace the
5984  * amount of time spent syncing deferred frees.
5985  */
5986 static void
5987 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
5988 {
5989         zio_t *zio = zio_root(spa, NULL, NULL, 0);
5990         VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
5991             spa_free_sync_cb, zio, tx), ==, 0);
5992         VERIFY0(zio_wait(zio));
5993 }
5994
5995 static void
5996 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5997 {
5998         char *packed = NULL;
5999         size_t bufsize;
6000         size_t nvsize = 0;
6001         dmu_buf_t *db;
6002
6003         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
6004
6005         /*
6006          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
6007          * information.  This avoids the dmu_buf_will_dirty() path and
6008          * saves us a pre-read to get data we don't actually care about.
6009          */
6010         bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
6011         packed = vmem_alloc(bufsize, KM_SLEEP);
6012
6013         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
6014             KM_SLEEP) == 0);
6015         bzero(packed + nvsize, bufsize - nvsize);
6016
6017         dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
6018
6019         vmem_free(packed, bufsize);
6020
6021         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
6022         dmu_buf_will_dirty(db, tx);
6023         *(uint64_t *)db->db_data = nvsize;
6024         dmu_buf_rele(db, FTAG);
6025 }
6026
6027 static void
6028 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
6029     const char *config, const char *entry)
6030 {
6031         nvlist_t *nvroot;
6032         nvlist_t **list;
6033         int i;
6034
6035         if (!sav->sav_sync)
6036                 return;
6037
6038         /*
6039          * Update the MOS nvlist describing the list of available devices.
6040          * spa_validate_aux() will have already made sure this nvlist is
6041          * valid and the vdevs are labeled appropriately.
6042          */
6043         if (sav->sav_object == 0) {
6044                 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
6045                     DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
6046                     sizeof (uint64_t), tx);
6047                 VERIFY(zap_update(spa->spa_meta_objset,
6048                     DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
6049                     &sav->sav_object, tx) == 0);
6050         }
6051
6052         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6053         if (sav->sav_count == 0) {
6054                 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
6055         } else {
6056                 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
6057                 for (i = 0; i < sav->sav_count; i++)
6058                         list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
6059                             B_FALSE, VDEV_CONFIG_L2CACHE);
6060                 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
6061                     sav->sav_count) == 0);
6062                 for (i = 0; i < sav->sav_count; i++)
6063                         nvlist_free(list[i]);
6064                 kmem_free(list, sav->sav_count * sizeof (void *));
6065         }
6066
6067         spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
6068         nvlist_free(nvroot);
6069
6070         sav->sav_sync = B_FALSE;
6071 }
6072
6073 /*
6074  * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
6075  * The all-vdev ZAP must be empty.
6076  */
6077 static void
6078 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
6079 {
6080         spa_t *spa = vd->vdev_spa;
6081         uint64_t i;
6082
6083         if (vd->vdev_top_zap != 0) {
6084                 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
6085                     vd->vdev_top_zap, tx));
6086         }
6087         if (vd->vdev_leaf_zap != 0) {
6088                 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
6089                     vd->vdev_leaf_zap, tx));
6090         }
6091         for (i = 0; i < vd->vdev_children; i++) {
6092                 spa_avz_build(vd->vdev_child[i], avz, tx);
6093         }
6094 }
6095
6096 static void
6097 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
6098 {
6099         nvlist_t *config;
6100
6101         /*
6102          * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
6103          * its config may not be dirty but we still need to build per-vdev ZAPs.
6104          * Similarly, if the pool is being assembled (e.g. after a split), we
6105          * need to rebuild the AVZ although the config may not be dirty.
6106          */
6107         if (list_is_empty(&spa->spa_config_dirty_list) &&
6108             spa->spa_avz_action == AVZ_ACTION_NONE)
6109                 return;
6110
6111         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6112
6113         ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
6114             spa->spa_all_vdev_zaps != 0);
6115
6116         if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
6117                 zap_cursor_t zc;
6118                 zap_attribute_t za;
6119
6120                 /* Make and build the new AVZ */
6121                 uint64_t new_avz = zap_create(spa->spa_meta_objset,
6122                     DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
6123                 spa_avz_build(spa->spa_root_vdev, new_avz, tx);
6124
6125                 /* Diff old AVZ with new one */
6126                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
6127                     spa->spa_all_vdev_zaps);
6128                     zap_cursor_retrieve(&zc, &za) == 0;
6129                     zap_cursor_advance(&zc)) {
6130                         uint64_t vdzap = za.za_first_integer;
6131                         if (zap_lookup_int(spa->spa_meta_objset, new_avz,
6132                             vdzap) == ENOENT) {
6133                                 /*
6134                                  * ZAP is listed in old AVZ but not in new one;
6135                                  * destroy it
6136                                  */
6137                                 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
6138                                     tx));
6139                         }
6140                 }
6141
6142                 zap_cursor_fini(&zc);
6143
6144                 /* Destroy the old AVZ */
6145                 VERIFY0(zap_destroy(spa->spa_meta_objset,
6146                     spa->spa_all_vdev_zaps, tx));
6147
6148                 /* Replace the old AVZ in the dir obj with the new one */
6149                 VERIFY0(zap_update(spa->spa_meta_objset,
6150                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
6151                     sizeof (new_avz), 1, &new_avz, tx));
6152
6153                 spa->spa_all_vdev_zaps = new_avz;
6154         } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
6155                 zap_cursor_t zc;
6156                 zap_attribute_t za;
6157
6158                 /* Walk through the AVZ and destroy all listed ZAPs */
6159                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
6160                     spa->spa_all_vdev_zaps);
6161                     zap_cursor_retrieve(&zc, &za) == 0;
6162                     zap_cursor_advance(&zc)) {
6163                         uint64_t zap = za.za_first_integer;
6164                         VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
6165                 }
6166
6167                 zap_cursor_fini(&zc);
6168
6169                 /* Destroy and unlink the AVZ itself */
6170                 VERIFY0(zap_destroy(spa->spa_meta_objset,
6171                     spa->spa_all_vdev_zaps, tx));
6172                 VERIFY0(zap_remove(spa->spa_meta_objset,
6173                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
6174                 spa->spa_all_vdev_zaps = 0;
6175         }
6176
6177         if (spa->spa_all_vdev_zaps == 0) {
6178                 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
6179                     DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
6180                     DMU_POOL_VDEV_ZAP_MAP, tx);
6181         }
6182         spa->spa_avz_action = AVZ_ACTION_NONE;
6183
6184         /* Create ZAPs for vdevs that don't have them. */
6185         vdev_construct_zaps(spa->spa_root_vdev, tx);
6186
6187         config = spa_config_generate(spa, spa->spa_root_vdev,
6188             dmu_tx_get_txg(tx), B_FALSE);
6189
6190         /*
6191          * If we're upgrading the spa version then make sure that
6192          * the config object gets updated with the correct version.
6193          */
6194         if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
6195                 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6196                     spa->spa_uberblock.ub_version);
6197
6198         spa_config_exit(spa, SCL_STATE, FTAG);
6199
6200         nvlist_free(spa->spa_config_syncing);
6201         spa->spa_config_syncing = config;
6202
6203         spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
6204 }
6205
6206 static void
6207 spa_sync_version(void *arg, dmu_tx_t *tx)
6208 {
6209         uint64_t *versionp = arg;
6210         uint64_t version = *versionp;
6211         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6212
6213         /*
6214          * Setting the version is special cased when first creating the pool.
6215          */
6216         ASSERT(tx->tx_txg != TXG_INITIAL);
6217
6218         ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6219         ASSERT(version >= spa_version(spa));
6220
6221         spa->spa_uberblock.ub_version = version;
6222         vdev_config_dirty(spa->spa_root_vdev);
6223         spa_history_log_internal(spa, "set", tx, "version=%lld", version);
6224 }
6225
6226 /*
6227  * Set zpool properties.
6228  */
6229 static void
6230 spa_sync_props(void *arg, dmu_tx_t *tx)
6231 {
6232         nvlist_t *nvp = arg;
6233         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6234         objset_t *mos = spa->spa_meta_objset;
6235         nvpair_t *elem = NULL;
6236
6237         mutex_enter(&spa->spa_props_lock);
6238
6239         while ((elem = nvlist_next_nvpair(nvp, elem))) {
6240                 uint64_t intval;
6241                 char *strval, *fname;
6242                 zpool_prop_t prop;
6243                 const char *propname;
6244                 zprop_type_t proptype;
6245                 spa_feature_t fid;
6246
6247                 prop = zpool_name_to_prop(nvpair_name(elem));
6248                 switch ((int)prop) {
6249                 case ZPROP_INVAL:
6250                         /*
6251                          * We checked this earlier in spa_prop_validate().
6252                          */
6253                         ASSERT(zpool_prop_feature(nvpair_name(elem)));
6254
6255                         fname = strchr(nvpair_name(elem), '@') + 1;
6256                         VERIFY0(zfeature_lookup_name(fname, &fid));
6257
6258                         spa_feature_enable(spa, fid, tx);
6259                         spa_history_log_internal(spa, "set", tx,
6260                             "%s=enabled", nvpair_name(elem));
6261                         break;
6262
6263                 case ZPOOL_PROP_VERSION:
6264                         intval = fnvpair_value_uint64(elem);
6265                         /*
6266                          * The version is synced seperatly before other
6267                          * properties and should be correct by now.
6268                          */
6269                         ASSERT3U(spa_version(spa), >=, intval);
6270                         break;
6271
6272                 case ZPOOL_PROP_ALTROOT:
6273                         /*
6274                          * 'altroot' is a non-persistent property. It should
6275                          * have been set temporarily at creation or import time.
6276                          */
6277                         ASSERT(spa->spa_root != NULL);
6278                         break;
6279
6280                 case ZPOOL_PROP_READONLY:
6281                 case ZPOOL_PROP_CACHEFILE:
6282                         /*
6283                          * 'readonly' and 'cachefile' are also non-persisitent
6284                          * properties.
6285                          */
6286                         break;
6287                 case ZPOOL_PROP_COMMENT:
6288                         strval = fnvpair_value_string(elem);
6289                         if (spa->spa_comment != NULL)
6290                                 spa_strfree(spa->spa_comment);
6291                         spa->spa_comment = spa_strdup(strval);
6292                         /*
6293                          * We need to dirty the configuration on all the vdevs
6294                          * so that their labels get updated.  It's unnecessary
6295                          * to do this for pool creation since the vdev's
6296                          * configuratoin has already been dirtied.
6297                          */
6298                         if (tx->tx_txg != TXG_INITIAL)
6299                                 vdev_config_dirty(spa->spa_root_vdev);
6300                         spa_history_log_internal(spa, "set", tx,
6301                             "%s=%s", nvpair_name(elem), strval);
6302                         break;
6303                 default:
6304                         /*
6305                          * Set pool property values in the poolprops mos object.
6306                          */
6307                         if (spa->spa_pool_props_object == 0) {
6308                                 spa->spa_pool_props_object =
6309                                     zap_create_link(mos, DMU_OT_POOL_PROPS,
6310                                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6311                                     tx);
6312                         }
6313
6314                         /* normalize the property name */
6315                         propname = zpool_prop_to_name(prop);
6316                         proptype = zpool_prop_get_type(prop);
6317
6318                         if (nvpair_type(elem) == DATA_TYPE_STRING) {
6319                                 ASSERT(proptype == PROP_TYPE_STRING);
6320                                 strval = fnvpair_value_string(elem);
6321                                 VERIFY0(zap_update(mos,
6322                                     spa->spa_pool_props_object, propname,
6323                                     1, strlen(strval) + 1, strval, tx));
6324                                 spa_history_log_internal(spa, "set", tx,
6325                                     "%s=%s", nvpair_name(elem), strval);
6326                         } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6327                                 intval = fnvpair_value_uint64(elem);
6328
6329                                 if (proptype == PROP_TYPE_INDEX) {
6330                                         const char *unused;
6331                                         VERIFY0(zpool_prop_index_to_string(
6332                                             prop, intval, &unused));
6333                                 }
6334                                 VERIFY0(zap_update(mos,
6335                                     spa->spa_pool_props_object, propname,
6336                                     8, 1, &intval, tx));
6337                                 spa_history_log_internal(spa, "set", tx,
6338                                     "%s=%lld", nvpair_name(elem), intval);
6339                         } else {
6340                                 ASSERT(0); /* not allowed */
6341                         }
6342
6343                         switch (prop) {
6344                         case ZPOOL_PROP_DELEGATION:
6345                                 spa->spa_delegation = intval;
6346                                 break;
6347                         case ZPOOL_PROP_BOOTFS:
6348                                 spa->spa_bootfs = intval;
6349                                 break;
6350                         case ZPOOL_PROP_FAILUREMODE:
6351                                 spa->spa_failmode = intval;
6352                                 break;
6353                         case ZPOOL_PROP_AUTOEXPAND:
6354                                 spa->spa_autoexpand = intval;
6355                                 if (tx->tx_txg != TXG_INITIAL)
6356                                         spa_async_request(spa,
6357                                             SPA_ASYNC_AUTOEXPAND);
6358                                 break;
6359                         case ZPOOL_PROP_DEDUPDITTO:
6360                                 spa->spa_dedup_ditto = intval;
6361                                 break;
6362                         default:
6363                                 break;
6364                         }
6365                 }
6366
6367         }
6368
6369         mutex_exit(&spa->spa_props_lock);
6370 }
6371
6372 /*
6373  * Perform one-time upgrade on-disk changes.  spa_version() does not
6374  * reflect the new version this txg, so there must be no changes this
6375  * txg to anything that the upgrade code depends on after it executes.
6376  * Therefore this must be called after dsl_pool_sync() does the sync
6377  * tasks.
6378  */
6379 static void
6380 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6381 {
6382         dsl_pool_t *dp = spa->spa_dsl_pool;
6383
6384         ASSERT(spa->spa_sync_pass == 1);
6385
6386         rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6387
6388         if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6389             spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6390                 dsl_pool_create_origin(dp, tx);
6391
6392                 /* Keeping the origin open increases spa_minref */
6393                 spa->spa_minref += 3;
6394         }
6395
6396         if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6397             spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6398                 dsl_pool_upgrade_clones(dp, tx);
6399         }
6400
6401         if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6402             spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6403                 dsl_pool_upgrade_dir_clones(dp, tx);
6404
6405                 /* Keeping the freedir open increases spa_minref */
6406                 spa->spa_minref += 3;
6407         }
6408
6409         if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6410             spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6411                 spa_feature_create_zap_objects(spa, tx);
6412         }
6413
6414         /*
6415          * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
6416          * when possibility to use lz4 compression for metadata was added
6417          * Old pools that have this feature enabled must be upgraded to have
6418          * this feature active
6419          */
6420         if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6421                 boolean_t lz4_en = spa_feature_is_enabled(spa,
6422                     SPA_FEATURE_LZ4_COMPRESS);
6423                 boolean_t lz4_ac = spa_feature_is_active(spa,
6424                     SPA_FEATURE_LZ4_COMPRESS);
6425
6426                 if (lz4_en && !lz4_ac)
6427                         spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
6428         }
6429
6430         /*
6431          * If we haven't written the salt, do so now.  Note that the
6432          * feature may not be activated yet, but that's fine since
6433          * the presence of this ZAP entry is backwards compatible.
6434          */
6435         if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
6436             DMU_POOL_CHECKSUM_SALT) == ENOENT) {
6437                 VERIFY0(zap_add(spa->spa_meta_objset,
6438                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
6439                     sizeof (spa->spa_cksum_salt.zcs_bytes),
6440                     spa->spa_cksum_salt.zcs_bytes, tx));
6441         }
6442
6443         rrw_exit(&dp->dp_config_rwlock, FTAG);
6444 }
6445
6446 /*
6447  * Sync the specified transaction group.  New blocks may be dirtied as
6448  * part of the process, so we iterate until it converges.
6449  */
6450 void
6451 spa_sync(spa_t *spa, uint64_t txg)
6452 {
6453         dsl_pool_t *dp = spa->spa_dsl_pool;
6454         objset_t *mos = spa->spa_meta_objset;
6455         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6456         vdev_t *rvd = spa->spa_root_vdev;
6457         vdev_t *vd;
6458         dmu_tx_t *tx;
6459         int error;
6460         int c;
6461
6462         VERIFY(spa_writeable(spa));
6463
6464         /*
6465          * Lock out configuration changes.
6466          */
6467         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6468
6469         spa->spa_syncing_txg = txg;
6470         spa->spa_sync_pass = 0;
6471
6472         /*
6473          * If there are any pending vdev state changes, convert them
6474          * into config changes that go out with this transaction group.
6475          */
6476         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6477         while (list_head(&spa->spa_state_dirty_list) != NULL) {
6478                 /*
6479                  * We need the write lock here because, for aux vdevs,
6480                  * calling vdev_config_dirty() modifies sav_config.
6481                  * This is ugly and will become unnecessary when we
6482                  * eliminate the aux vdev wart by integrating all vdevs
6483                  * into the root vdev tree.
6484                  */
6485                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6486                 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6487                 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6488                         vdev_state_clean(vd);
6489                         vdev_config_dirty(vd);
6490                 }
6491                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6492                 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6493         }
6494         spa_config_exit(spa, SCL_STATE, FTAG);
6495
6496         tx = dmu_tx_create_assigned(dp, txg);
6497
6498         spa->spa_sync_starttime = gethrtime();
6499         taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
6500         spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
6501             spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
6502             NSEC_TO_TICK(spa->spa_deadman_synctime));
6503
6504         /*
6505          * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6506          * set spa_deflate if we have no raid-z vdevs.
6507          */
6508         if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6509             spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6510                 int i;
6511
6512                 for (i = 0; i < rvd->vdev_children; i++) {
6513                         vd = rvd->vdev_child[i];
6514                         if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6515                                 break;
6516                 }
6517                 if (i == rvd->vdev_children) {
6518                         spa->spa_deflate = TRUE;
6519                         VERIFY(0 == zap_add(spa->spa_meta_objset,
6520                             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6521                             sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6522                 }
6523         }
6524
6525         /*
6526          * Iterate to convergence.
6527          */
6528         do {
6529                 int pass = ++spa->spa_sync_pass;
6530
6531                 spa_sync_config_object(spa, tx);
6532                 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6533                     ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6534                 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6535                     ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6536                 spa_errlog_sync(spa, txg);
6537                 dsl_pool_sync(dp, txg);
6538
6539                 if (pass < zfs_sync_pass_deferred_free) {
6540                         spa_sync_frees(spa, free_bpl, tx);
6541                 } else {
6542                         /*
6543                          * We can not defer frees in pass 1, because
6544                          * we sync the deferred frees later in pass 1.
6545                          */
6546                         ASSERT3U(pass, >, 1);
6547                         bplist_iterate(free_bpl, bpobj_enqueue_cb,
6548                             &spa->spa_deferred_bpobj, tx);
6549                 }
6550
6551                 ddt_sync(spa, txg);
6552                 dsl_scan_sync(dp, tx);
6553
6554                 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)))
6555                         vdev_sync(vd, txg);
6556
6557                 if (pass == 1) {
6558                         spa_sync_upgrades(spa, tx);
6559                         ASSERT3U(txg, >=,
6560                             spa->spa_uberblock.ub_rootbp.blk_birth);
6561                         /*
6562                          * Note: We need to check if the MOS is dirty
6563                          * because we could have marked the MOS dirty
6564                          * without updating the uberblock (e.g. if we
6565                          * have sync tasks but no dirty user data).  We
6566                          * need to check the uberblock's rootbp because
6567                          * it is updated if we have synced out dirty
6568                          * data (though in this case the MOS will most
6569                          * likely also be dirty due to second order
6570                          * effects, we don't want to rely on that here).
6571                          */
6572                         if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
6573                             !dmu_objset_is_dirty(mos, txg)) {
6574                                 /*
6575                                  * Nothing changed on the first pass,
6576                                  * therefore this TXG is a no-op.  Avoid
6577                                  * syncing deferred frees, so that we
6578                                  * can keep this TXG as a no-op.
6579                                  */
6580                                 ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
6581                                     txg));
6582                                 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6583                                 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
6584                                 break;
6585                         }
6586                         spa_sync_deferred_frees(spa, tx);
6587                 }
6588
6589         } while (dmu_objset_is_dirty(mos, txg));
6590
6591 #ifdef ZFS_DEBUG
6592         if (!list_is_empty(&spa->spa_config_dirty_list)) {
6593                 /*
6594                  * Make sure that the number of ZAPs for all the vdevs matches
6595                  * the number of ZAPs in the per-vdev ZAP list. This only gets
6596                  * called if the config is dirty; otherwise there may be
6597                  * outstanding AVZ operations that weren't completed in
6598                  * spa_sync_config_object.
6599                  */
6600                 uint64_t all_vdev_zap_entry_count;
6601                 ASSERT0(zap_count(spa->spa_meta_objset,
6602                     spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
6603                 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
6604                     all_vdev_zap_entry_count);
6605         }
6606 #endif
6607
6608         /*
6609          * Rewrite the vdev configuration (which includes the uberblock)
6610          * to commit the transaction group.
6611          *
6612          * If there are no dirty vdevs, we sync the uberblock to a few
6613          * random top-level vdevs that are known to be visible in the
6614          * config cache (see spa_vdev_add() for a complete description).
6615          * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6616          */
6617         for (;;) {
6618                 /*
6619                  * We hold SCL_STATE to prevent vdev open/close/etc.
6620                  * while we're attempting to write the vdev labels.
6621                  */
6622                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6623
6624                 if (list_is_empty(&spa->spa_config_dirty_list)) {
6625                         vdev_t *svd[SPA_DVAS_PER_BP];
6626                         int svdcount = 0;
6627                         int children = rvd->vdev_children;
6628                         int c0 = spa_get_random(children);
6629
6630                         for (c = 0; c < children; c++) {
6631                                 vd = rvd->vdev_child[(c0 + c) % children];
6632                                 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6633                                         continue;
6634                                 svd[svdcount++] = vd;
6635                                 if (svdcount == SPA_DVAS_PER_BP)
6636                                         break;
6637                         }
6638                         error = vdev_config_sync(svd, svdcount, txg);
6639                 } else {
6640                         error = vdev_config_sync(rvd->vdev_child,
6641                             rvd->vdev_children, txg);
6642                 }
6643
6644                 if (error == 0)
6645                         spa->spa_last_synced_guid = rvd->vdev_guid;
6646
6647                 spa_config_exit(spa, SCL_STATE, FTAG);
6648
6649                 if (error == 0)
6650                         break;
6651                 zio_suspend(spa, NULL);
6652                 zio_resume_wait(spa);
6653         }
6654         dmu_tx_commit(tx);
6655
6656         taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
6657         spa->spa_deadman_tqid = 0;
6658
6659         /*
6660          * Clear the dirty config list.
6661          */
6662         while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
6663                 vdev_config_clean(vd);
6664
6665         /*
6666          * Now that the new config has synced transactionally,
6667          * let it become visible to the config cache.
6668          */
6669         if (spa->spa_config_syncing != NULL) {
6670                 spa_config_set(spa, spa->spa_config_syncing);
6671                 spa->spa_config_txg = txg;
6672                 spa->spa_config_syncing = NULL;
6673         }
6674
6675         spa->spa_ubsync = spa->spa_uberblock;
6676
6677         dsl_pool_sync_done(dp, txg);
6678
6679         /*
6680          * Update usable space statistics.
6681          */
6682         while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))))
6683                 vdev_sync_done(vd, txg);
6684
6685         spa_update_dspace(spa);
6686
6687         /*
6688          * It had better be the case that we didn't dirty anything
6689          * since vdev_config_sync().
6690          */
6691         ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6692         ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6693         ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
6694
6695         spa->spa_sync_pass = 0;
6696
6697         spa_config_exit(spa, SCL_CONFIG, FTAG);
6698
6699         spa_handle_ignored_writes(spa);
6700
6701         /*
6702          * If any async tasks have been requested, kick them off.
6703          */
6704         spa_async_dispatch(spa);
6705 }
6706
6707 /*
6708  * Sync all pools.  We don't want to hold the namespace lock across these
6709  * operations, so we take a reference on the spa_t and drop the lock during the
6710  * sync.
6711  */
6712 void
6713 spa_sync_allpools(void)
6714 {
6715         spa_t *spa = NULL;
6716         mutex_enter(&spa_namespace_lock);
6717         while ((spa = spa_next(spa)) != NULL) {
6718                 if (spa_state(spa) != POOL_STATE_ACTIVE ||
6719                     !spa_writeable(spa) || spa_suspended(spa))
6720                         continue;
6721                 spa_open_ref(spa, FTAG);
6722                 mutex_exit(&spa_namespace_lock);
6723                 txg_wait_synced(spa_get_dsl(spa), 0);
6724                 mutex_enter(&spa_namespace_lock);
6725                 spa_close(spa, FTAG);
6726         }
6727         mutex_exit(&spa_namespace_lock);
6728 }
6729
6730 /*
6731  * ==========================================================================
6732  * Miscellaneous routines
6733  * ==========================================================================
6734  */
6735
6736 /*
6737  * Remove all pools in the system.
6738  */
6739 void
6740 spa_evict_all(void)
6741 {
6742         spa_t *spa;
6743
6744         /*
6745          * Remove all cached state.  All pools should be closed now,
6746          * so every spa in the AVL tree should be unreferenced.
6747          */
6748         mutex_enter(&spa_namespace_lock);
6749         while ((spa = spa_next(NULL)) != NULL) {
6750                 /*
6751                  * Stop async tasks.  The async thread may need to detach
6752                  * a device that's been replaced, which requires grabbing
6753                  * spa_namespace_lock, so we must drop it here.
6754                  */
6755                 spa_open_ref(spa, FTAG);
6756                 mutex_exit(&spa_namespace_lock);
6757                 spa_async_suspend(spa);
6758                 mutex_enter(&spa_namespace_lock);
6759                 spa_close(spa, FTAG);
6760
6761                 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6762                         spa_unload(spa);
6763                         spa_deactivate(spa);
6764                 }
6765                 spa_remove(spa);
6766         }
6767         mutex_exit(&spa_namespace_lock);
6768 }
6769
6770 vdev_t *
6771 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
6772 {
6773         vdev_t *vd;
6774         int i;
6775
6776         if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6777                 return (vd);
6778
6779         if (aux) {
6780                 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6781                         vd = spa->spa_l2cache.sav_vdevs[i];
6782                         if (vd->vdev_guid == guid)
6783                                 return (vd);
6784                 }
6785
6786                 for (i = 0; i < spa->spa_spares.sav_count; i++) {
6787                         vd = spa->spa_spares.sav_vdevs[i];
6788                         if (vd->vdev_guid == guid)
6789                                 return (vd);
6790                 }
6791         }
6792
6793         return (NULL);
6794 }
6795
6796 void
6797 spa_upgrade(spa_t *spa, uint64_t version)
6798 {
6799         ASSERT(spa_writeable(spa));
6800
6801         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6802
6803         /*
6804          * This should only be called for a non-faulted pool, and since a
6805          * future version would result in an unopenable pool, this shouldn't be
6806          * possible.
6807          */
6808         ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6809         ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
6810
6811         spa->spa_uberblock.ub_version = version;
6812         vdev_config_dirty(spa->spa_root_vdev);
6813
6814         spa_config_exit(spa, SCL_ALL, FTAG);
6815
6816         txg_wait_synced(spa_get_dsl(spa), 0);
6817 }
6818
6819 boolean_t
6820 spa_has_spare(spa_t *spa, uint64_t guid)
6821 {
6822         int i;
6823         uint64_t spareguid;
6824         spa_aux_vdev_t *sav = &spa->spa_spares;
6825
6826         for (i = 0; i < sav->sav_count; i++)
6827                 if (sav->sav_vdevs[i]->vdev_guid == guid)
6828                         return (B_TRUE);
6829
6830         for (i = 0; i < sav->sav_npending; i++) {
6831                 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6832                     &spareguid) == 0 && spareguid == guid)
6833                         return (B_TRUE);
6834         }
6835
6836         return (B_FALSE);
6837 }
6838
6839 /*
6840  * Check if a pool has an active shared spare device.
6841  * Note: reference count of an active spare is 2, as a spare and as a replace
6842  */
6843 static boolean_t
6844 spa_has_active_shared_spare(spa_t *spa)
6845 {
6846         int i, refcnt;
6847         uint64_t pool;
6848         spa_aux_vdev_t *sav = &spa->spa_spares;
6849
6850         for (i = 0; i < sav->sav_count; i++) {
6851                 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6852                     &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6853                     refcnt > 2)
6854                         return (B_TRUE);
6855         }
6856
6857         return (B_FALSE);
6858 }
6859
6860 /*
6861  * Post a zevent corresponding to the given sysevent.   The 'name' must be one
6862  * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
6863  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
6864  * in the userland libzpool, as we don't want consumers to misinterpret ztest
6865  * or zdb as real changes.
6866  */
6867 void
6868 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6869 {
6870         zfs_post_sysevent(spa, vd, name);
6871 }
6872
6873 #if defined(_KERNEL) && defined(HAVE_SPL)
6874 /* state manipulation functions */
6875 EXPORT_SYMBOL(spa_open);
6876 EXPORT_SYMBOL(spa_open_rewind);
6877 EXPORT_SYMBOL(spa_get_stats);
6878 EXPORT_SYMBOL(spa_create);
6879 EXPORT_SYMBOL(spa_import);
6880 EXPORT_SYMBOL(spa_tryimport);
6881 EXPORT_SYMBOL(spa_destroy);
6882 EXPORT_SYMBOL(spa_export);
6883 EXPORT_SYMBOL(spa_reset);
6884 EXPORT_SYMBOL(spa_async_request);
6885 EXPORT_SYMBOL(spa_async_suspend);
6886 EXPORT_SYMBOL(spa_async_resume);
6887 EXPORT_SYMBOL(spa_inject_addref);
6888 EXPORT_SYMBOL(spa_inject_delref);
6889 EXPORT_SYMBOL(spa_scan_stat_init);
6890 EXPORT_SYMBOL(spa_scan_get_stats);
6891
6892 /* device maniion */
6893 EXPORT_SYMBOL(spa_vdev_add);
6894 EXPORT_SYMBOL(spa_vdev_attach);
6895 EXPORT_SYMBOL(spa_vdev_detach);
6896 EXPORT_SYMBOL(spa_vdev_remove);
6897 EXPORT_SYMBOL(spa_vdev_setpath);
6898 EXPORT_SYMBOL(spa_vdev_setfru);
6899 EXPORT_SYMBOL(spa_vdev_split_mirror);
6900
6901 /* spare statech is global across all pools) */
6902 EXPORT_SYMBOL(spa_spare_add);
6903 EXPORT_SYMBOL(spa_spare_remove);
6904 EXPORT_SYMBOL(spa_spare_exists);
6905 EXPORT_SYMBOL(spa_spare_activate);
6906
6907 /* L2ARC statech is global across all pools) */
6908 EXPORT_SYMBOL(spa_l2cache_add);
6909 EXPORT_SYMBOL(spa_l2cache_remove);
6910 EXPORT_SYMBOL(spa_l2cache_exists);
6911 EXPORT_SYMBOL(spa_l2cache_activate);
6912 EXPORT_SYMBOL(spa_l2cache_drop);
6913
6914 /* scanning */
6915 EXPORT_SYMBOL(spa_scan);
6916 EXPORT_SYMBOL(spa_scan_stop);
6917
6918 /* spa syncing */
6919 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
6920 EXPORT_SYMBOL(spa_sync_allpools);
6921
6922 /* properties */
6923 EXPORT_SYMBOL(spa_prop_set);
6924 EXPORT_SYMBOL(spa_prop_get);
6925 EXPORT_SYMBOL(spa_prop_clear_bootfs);
6926
6927 /* asynchronous event notification */
6928 EXPORT_SYMBOL(spa_event_notify);
6929 #endif
6930
6931 #if defined(_KERNEL) && defined(HAVE_SPL)
6932 module_param(spa_load_verify_maxinflight, int, 0644);
6933 MODULE_PARM_DESC(spa_load_verify_maxinflight,
6934         "Max concurrent traversal I/Os while verifying pool during import -X");
6935
6936 module_param(spa_load_verify_metadata, int, 0644);
6937 MODULE_PARM_DESC(spa_load_verify_metadata,
6938         "Set to traverse metadata on pool import");
6939
6940 module_param(spa_load_verify_data, int, 0644);
6941 MODULE_PARM_DESC(spa_load_verify_data,
6942         "Set to traverse data on pool import");
6943
6944 module_param(zio_taskq_batch_pct, uint, 0444);
6945 MODULE_PARM_DESC(zio_taskq_batch_pct,
6946         "Percentage of CPUs to run an IO worker thread");
6947
6948 #endif