module/zfs/vdev.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  25  * Copyright 2017 Nexenta Systems, Inc.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2016 Toomas Soome <tsoome@me.com>
  28  * Copyright 2017 Joyent, Inc.
  29  * Copyright (c) 2017, Intel Corporation.
  30  * Copyright (c) 2019, Datto Inc. All rights reserved.
  31  * Copyright (c) 2021, Klara Inc.
  32  * Copyright [2021] Hewlett Packard Enterprise Development LP
  33  */
  34
  35 #include <sys/zfs_context.h>
  36 #include <sys/fm/fs/zfs.h>
  37 #include <sys/spa.h>
  38 #include <sys/spa_impl.h>
  39 #include <sys/bpobj.h>
  40 #include <sys/dmu.h>
  41 #include <sys/dmu_tx.h>
  42 #include <sys/dsl_dir.h>
  43 #include <sys/vdev_impl.h>
  44 #include <sys/vdev_rebuild.h>
  45 #include <sys/vdev_draid.h>
  46 #include <sys/uberblock_impl.h>
  47 #include <sys/metaslab.h>
  48 #include <sys/metaslab_impl.h>
  49 #include <sys/space_map.h>
  50 #include <sys/space_reftree.h>
  51 #include <sys/zio.h>
  52 #include <sys/zap.h>
  53 #include <sys/fs/zfs.h>
  54 #include <sys/arc.h>
  55 #include <sys/zil.h>
  56 #include <sys/dsl_scan.h>
  57 #include <sys/vdev_raidz.h>
  58 #include <sys/abd.h>
  59 #include <sys/vdev_initialize.h>
  60 #include <sys/vdev_trim.h>
  61 #include <sys/zvol.h>
  62 #include <sys/zfs_ratelimit.h>
  63 #include "zfs_prop.h"
  64
  65 /*
  66  * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
  67  * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
  68  * part of the spa_embedded_log_class.  The metaslab with the most free space
  69  * in each vdev is selected for this purpose when the pool is opened (or a
  70  * vdev is added).  See vdev_metaslab_init().
  71  *
  72  * Log blocks can be allocated from the following locations.  Each one is tried
  73  * in order until the allocation succeeds:
  74  * 1. dedicated log vdevs, aka "slog" (spa_log_class)
  75  * 2. embedded slog metaslabs (spa_embedded_log_class)
  76  * 3. other metaslabs in normal vdevs (spa_normal_class)
  77  *
  78  * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
  79  * than this number of metaslabs in the vdev.  This ensures that we don't set
  80  * aside an unreasonable amount of space for the ZIL.  If set to less than
  81  * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
  82  * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
  83  */
  84 static int zfs_embedded_slog_min_ms = 64;
  85
  86 /* default target for number of metaslabs per top-level vdev */
  87 static int zfs_vdev_default_ms_count = 200;
  88
  89 /* minimum number of metaslabs per top-level vdev */
  90 static int zfs_vdev_min_ms_count = 16;
  91
  92 /* practical upper limit of total metaslabs per top-level vdev */
  93 static int zfs_vdev_ms_count_limit = 1ULL << 17;
  94
  95 /* lower limit for metaslab size (512M) */
  96 static int zfs_vdev_default_ms_shift = 29;
  97
  98 /* upper limit for metaslab size (16G) */
  99 static const int zfs_vdev_max_ms_shift = 34;
 100
 101 int vdev_validate_skip = B_FALSE;
 102
 103 /*
 104  * Since the DTL space map of a vdev is not expected to have a lot of
 105  * entries, we default its block size to 4K.
 106  */
 107 int zfs_vdev_dtl_sm_blksz = (1 << 12);
 108
 109 /*
 110  * Rate limit slow IO (delay) events to this many per second.
 111  */
 112 static unsigned int zfs_slow_io_events_per_second = 20;
 113
 114 /*
 115  * Rate limit checksum events after this many checksum errors per second.
 116  */
 117 static unsigned int zfs_checksum_events_per_second = 20;
 118
 119 /*
 120  * Ignore errors during scrub/resilver.  Allows to work around resilver
 121  * upon import when there are pool errors.
 122  */
 123 static int zfs_scan_ignore_errors = 0;
 124
 125 /*
 126  * vdev-wide space maps that have lots of entries written to them at
 127  * the end of each transaction can benefit from a higher I/O bandwidth
 128  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
 129  */
 130 int zfs_vdev_standard_sm_blksz = (1 << 17);
 131
 132 /*
 133  * Tunable parameter for debugging or performance analysis. Setting this
 134  * will cause pool corruption on power loss if a volatile out-of-order
 135  * write cache is enabled.
 136  */
 137 int zfs_nocacheflush = 0;
 138
 139 uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
 140 uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
 141
 142 void
 143 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 144 {
 145         va_list adx;
 146         char buf[256];
 147
 148         va_start(adx, fmt);
 149         (void) vsnprintf(buf, sizeof (buf), fmt, adx);
 150         va_end(adx);
 151
 152         if (vd->vdev_path != NULL) {
 153                 zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
 154                     vd->vdev_path, buf);
 155         } else {
 156                 zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
 157                     vd->vdev_ops->vdev_op_type,
 158                     (u_longlong_t)vd->vdev_id,
 159                     (u_longlong_t)vd->vdev_guid, buf);
 160         }
 161 }
 162
 163 void
 164 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 165 {
 166         char state[20];
 167
 168         if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
 169                 zfs_dbgmsg("%*svdev %llu: %s", indent, "",
 170                     (u_longlong_t)vd->vdev_id,
 171                     vd->vdev_ops->vdev_op_type);
 172                 return;
 173         }
 174
 175         switch (vd->vdev_state) {
 176         case VDEV_STATE_UNKNOWN:
 177                 (void) snprintf(state, sizeof (state), "unknown");
 178                 break;
 179         case VDEV_STATE_CLOSED:
 180                 (void) snprintf(state, sizeof (state), "closed");
 181                 break;
 182         case VDEV_STATE_OFFLINE:
 183                 (void) snprintf(state, sizeof (state), "offline");
 184                 break;
 185         case VDEV_STATE_REMOVED:
 186                 (void) snprintf(state, sizeof (state), "removed");
 187                 break;
 188         case VDEV_STATE_CANT_OPEN:
 189                 (void) snprintf(state, sizeof (state), "can't open");
 190                 break;
 191         case VDEV_STATE_FAULTED:
 192                 (void) snprintf(state, sizeof (state), "faulted");
 193                 break;
 194         case VDEV_STATE_DEGRADED:
 195                 (void) snprintf(state, sizeof (state), "degraded");
 196                 break;
 197         case VDEV_STATE_HEALTHY:
 198                 (void) snprintf(state, sizeof (state), "healthy");
 199                 break;
 200         default:
 201                 (void) snprintf(state, sizeof (state), "<state %u>",
 202                     (uint_t)vd->vdev_state);
 203         }
 204
 205         zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
 206             "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
 207             vd->vdev_islog ? " (log)" : "",
 208             (u_longlong_t)vd->vdev_guid,
 209             vd->vdev_path ? vd->vdev_path : "N/A", state);
 210
 211         for (uint64_t i = 0; i < vd->vdev_children; i++)
 212                 vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 213 }
 214
 215 /*
 216  * Virtual device management.
 217  */
 218
 219 static const vdev_ops_t *const vdev_ops_table[] = {
 220         &vdev_root_ops,
 221         &vdev_raidz_ops,
 222         &vdev_draid_ops,
 223         &vdev_draid_spare_ops,
 224         &vdev_mirror_ops,
 225         &vdev_replacing_ops,
 226         &vdev_spare_ops,
 227         &vdev_disk_ops,
 228         &vdev_file_ops,
 229         &vdev_missing_ops,
 230         &vdev_hole_ops,
 231         &vdev_indirect_ops,
 232         NULL
 233 };
 234
 235 /*
 236  * Given a vdev type, return the appropriate ops vector.
 237  */
 238 static vdev_ops_t *
 239 vdev_getops(const char *type)
 240 {
 241         const vdev_ops_t *ops, *const *opspp;
 242
 243         for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
 244                 if (strcmp(ops->vdev_op_type, type) == 0)
 245                         break;
 246
 247         return (ops);
 248 }
 249
 250 /*
 251  * Given a vdev and a metaslab class, find which metaslab group we're
 252  * interested in. All vdevs may belong to two different metaslab classes.
 253  * Dedicated slog devices use only the primary metaslab group, rather than a
 254  * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
 255  */
 256 metaslab_group_t *
 257 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
 258 {
 259         if (mc == spa_embedded_log_class(vd->vdev_spa) &&
 260             vd->vdev_log_mg != NULL)
 261                 return (vd->vdev_log_mg);
 262         else
 263                 return (vd->vdev_mg);
 264 }
 265
 266 void
 267 vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
 268     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 269 {
 270         (void) vd, (void) remain_rs;
 271
 272         physical_rs->rs_start = logical_rs->rs_start;
 273         physical_rs->rs_end = logical_rs->rs_end;
 274 }
 275
 276 /*
 277  * Derive the enumerated allocation bias from string input.
 278  * String origin is either the per-vdev zap or zpool(8).
 279  */
 280 static vdev_alloc_bias_t
 281 vdev_derive_alloc_bias(const char *bias)
 282 {
 283         vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
 284
 285         if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
 286                 alloc_bias = VDEV_BIAS_LOG;
 287         else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
 288                 alloc_bias = VDEV_BIAS_SPECIAL;
 289         else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
 290                 alloc_bias = VDEV_BIAS_DEDUP;
 291
 292         return (alloc_bias);
 293 }
 294
 295 /*
 296  * Default asize function: return the MAX of psize with the asize of
 297  * all children.  This is what's used by anything other than RAID-Z.
 298  */
 299 uint64_t
 300 vdev_default_asize(vdev_t *vd, uint64_t psize)
 301 {
 302         uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 303         uint64_t csize;
 304
 305         for (int c = 0; c < vd->vdev_children; c++) {
 306                 csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
 307                 asize = MAX(asize, csize);
 308         }
 309
 310         return (asize);
 311 }
 312
 313 uint64_t
 314 vdev_default_min_asize(vdev_t *vd)
 315 {
 316         return (vd->vdev_min_asize);
 317 }
 318
 319 /*
 320  * Get the minimum allocatable size. We define the allocatable size as
 321  * the vdev's asize rounded to the nearest metaslab. This allows us to
 322  * replace or attach devices which don't have the same physical size but
 323  * can still satisfy the same number of allocations.
 324  */
 325 uint64_t
 326 vdev_get_min_asize(vdev_t *vd)
 327 {
 328         vdev_t *pvd = vd->vdev_parent;
 329
 330         /*
 331          * If our parent is NULL (inactive spare or cache) or is the root,
 332          * just return our own asize.
 333          */
 334         if (pvd == NULL)
 335                 return (vd->vdev_asize);
 336
 337         /*
 338          * The top-level vdev just returns the allocatable size rounded
 339          * to the nearest metaslab.
 340          */
 341         if (vd == vd->vdev_top)
 342                 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
 343
 344         return (pvd->vdev_ops->vdev_op_min_asize(pvd));
 345 }
 346
 347 void
 348 vdev_set_min_asize(vdev_t *vd)
 349 {
 350         vd->vdev_min_asize = vdev_get_min_asize(vd);
 351
 352         for (int c = 0; c < vd->vdev_children; c++)
 353                 vdev_set_min_asize(vd->vdev_child[c]);
 354 }
 355
 356 /*
 357  * Get the minimal allocation size for the top-level vdev.
 358  */
 359 uint64_t
 360 vdev_get_min_alloc(vdev_t *vd)
 361 {
 362         uint64_t min_alloc = 1ULL << vd->vdev_ashift;
 363
 364         if (vd->vdev_ops->vdev_op_min_alloc != NULL)
 365                 min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
 366
 367         return (min_alloc);
 368 }
 369
 370 /*
 371  * Get the parity level for a top-level vdev.
 372  */
 373 uint64_t
 374 vdev_get_nparity(vdev_t *vd)
 375 {
 376         uint64_t nparity = 0;
 377
 378         if (vd->vdev_ops->vdev_op_nparity != NULL)
 379                 nparity = vd->vdev_ops->vdev_op_nparity(vd);
 380
 381         return (nparity);
 382 }
 383
 384 /*
 385  * Get the number of data disks for a top-level vdev.
 386  */
 387 uint64_t
 388 vdev_get_ndisks(vdev_t *vd)
 389 {
 390         uint64_t ndisks = 1;
 391
 392         if (vd->vdev_ops->vdev_op_ndisks != NULL)
 393                 ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
 394
 395         return (ndisks);
 396 }
 397
 398 vdev_t *
 399 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 400 {
 401         vdev_t *rvd = spa->spa_root_vdev;
 402
 403         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 404
 405         if (vdev < rvd->vdev_children) {
 406                 ASSERT(rvd->vdev_child[vdev] != NULL);
 407                 return (rvd->vdev_child[vdev]);
 408         }
 409
 410         return (NULL);
 411 }
 412
 413 vdev_t *
 414 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 415 {
 416         vdev_t *mvd;
 417
 418         if (vd->vdev_guid == guid)
 419                 return (vd);
 420
 421         for (int c = 0; c < vd->vdev_children; c++)
 422                 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
 423                     NULL)
 424                         return (mvd);
 425
 426         return (NULL);
 427 }
 428
 429 static int
 430 vdev_count_leaves_impl(vdev_t *vd)
 431 {
 432         int n = 0;
 433
 434         if (vd->vdev_ops->vdev_op_leaf)
 435                 return (1);
 436
 437         for (int c = 0; c < vd->vdev_children; c++)
 438                 n += vdev_count_leaves_impl(vd->vdev_child[c]);
 439
 440         return (n);
 441 }
 442
 443 int
 444 vdev_count_leaves(spa_t *spa)
 445 {
 446         int rc;
 447
 448         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 449         rc = vdev_count_leaves_impl(spa->spa_root_vdev);
 450         spa_config_exit(spa, SCL_VDEV, FTAG);
 451
 452         return (rc);
 453 }
 454
 455 void
 456 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 457 {
 458         size_t oldsize, newsize;
 459         uint64_t id = cvd->vdev_id;
 460         vdev_t **newchild;
 461
 462         ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 463         ASSERT(cvd->vdev_parent == NULL);
 464
 465         cvd->vdev_parent = pvd;
 466
 467         if (pvd == NULL)
 468                 return;
 469
 470         ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 471
 472         oldsize = pvd->vdev_children * sizeof (vdev_t *);
 473         pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 474         newsize = pvd->vdev_children * sizeof (vdev_t *);
 475
 476         newchild = kmem_alloc(newsize, KM_SLEEP);
 477         if (pvd->vdev_child != NULL) {
 478                 memcpy(newchild, pvd->vdev_child, oldsize);
 479                 kmem_free(pvd->vdev_child, oldsize);
 480         }
 481
 482         pvd->vdev_child = newchild;
 483         pvd->vdev_child[id] = cvd;
 484
 485         cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 486         ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 487
 488         /*
 489          * Walk up all ancestors to update guid sum.
 490          */
 491         for (; pvd != NULL; pvd = pvd->vdev_parent)
 492                 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 493
 494         if (cvd->vdev_ops->vdev_op_leaf) {
 495                 list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
 496                 cvd->vdev_spa->spa_leaf_list_gen++;
 497         }
 498 }
 499
 500 void
 501 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 502 {
 503         int c;
 504         uint_t id = cvd->vdev_id;
 505
 506         ASSERT(cvd->vdev_parent == pvd);
 507
 508         if (pvd == NULL)
 509                 return;
 510
 511         ASSERT(id < pvd->vdev_children);
 512         ASSERT(pvd->vdev_child[id] == cvd);
 513
 514         pvd->vdev_child[id] = NULL;
 515         cvd->vdev_parent = NULL;
 516
 517         for (c = 0; c < pvd->vdev_children; c++)
 518                 if (pvd->vdev_child[c])
 519                         break;
 520
 521         if (c == pvd->vdev_children) {
 522                 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
 523                 pvd->vdev_child = NULL;
 524                 pvd->vdev_children = 0;
 525         }
 526
 527         if (cvd->vdev_ops->vdev_op_leaf) {
 528                 spa_t *spa = cvd->vdev_spa;
 529                 list_remove(&spa->spa_leaf_list, cvd);
 530                 spa->spa_leaf_list_gen++;
 531         }
 532
 533         /*
 534          * Walk up all ancestors to update guid sum.
 535          */
 536         for (; pvd != NULL; pvd = pvd->vdev_parent)
 537                 pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
 538 }
 539
 540 /*
 541  * Remove any holes in the child array.
 542  */
 543 void
 544 vdev_compact_children(vdev_t *pvd)
 545 {
 546         vdev_t **newchild, *cvd;
 547         int oldc = pvd->vdev_children;
 548         int newc;
 549
 550         ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 551
 552         if (oldc == 0)
 553                 return;
 554
 555         for (int c = newc = 0; c < oldc; c++)
 556                 if (pvd->vdev_child[c])
 557                         newc++;
 558
 559         if (newc > 0) {
 560                 newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);
 561
 562                 for (int c = newc = 0; c < oldc; c++) {
 563                         if ((cvd = pvd->vdev_child[c]) != NULL) {
 564                                 newchild[newc] = cvd;
 565                                 cvd->vdev_id = newc++;
 566                         }
 567                 }
 568         } else {
 569                 newchild = NULL;
 570         }
 571
 572         kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
 573         pvd->vdev_child = newchild;
 574         pvd->vdev_children = newc;
 575 }
 576
 577 /*
 578  * Allocate and minimally initialize a vdev_t.
 579  */
 580 vdev_t *
 581 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 582 {
 583         vdev_t *vd;
 584         vdev_indirect_config_t *vic;
 585
 586         vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 587         vic = &vd->vdev_indirect_config;
 588
 589         if (spa->spa_root_vdev == NULL) {
 590                 ASSERT(ops == &vdev_root_ops);
 591                 spa->spa_root_vdev = vd;
 592                 spa->spa_load_guid = spa_generate_guid(NULL);
 593         }
 594
 595         if (guid == 0 && ops != &vdev_hole_ops) {
 596                 if (spa->spa_root_vdev == vd) {
 597                         /*
 598                          * The root vdev's guid will also be the pool guid,
 599                          * which must be unique among all pools.
 600                          */
 601                         guid = spa_generate_guid(NULL);
 602                 } else {
 603                         /*
 604                          * Any other vdev's guid must be unique within the pool.
 605                          */
 606                         guid = spa_generate_guid(spa);
 607                 }
 608                 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 609         }
 610
 611         vd->vdev_spa = spa;
 612         vd->vdev_id = id;
 613         vd->vdev_guid = guid;
 614         vd->vdev_guid_sum = guid;
 615         vd->vdev_ops = ops;
 616         vd->vdev_state = VDEV_STATE_CLOSED;
 617         vd->vdev_ishole = (ops == &vdev_hole_ops);
 618         vic->vic_prev_indirect_vdev = UINT64_MAX;
 619
 620         rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 621         mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 622         vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
 623             0, 0);
 624
 625         /*
 626          * Initialize rate limit structs for events.  We rate limit ZIO delay
 627          * and checksum events so that we don't overwhelm ZED with thousands
 628          * of events when a disk is acting up.
 629          */
 630         zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
 631             1);
 632         zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second,
 633             1);
 634         zfs_ratelimit_init(&vd->vdev_checksum_rl,
 635             &zfs_checksum_events_per_second, 1);
 636
 637         list_link_init(&vd->vdev_config_dirty_node);
 638         list_link_init(&vd->vdev_state_dirty_node);
 639         list_link_init(&vd->vdev_initialize_node);
 640         list_link_init(&vd->vdev_leaf_node);
 641         list_link_init(&vd->vdev_trim_node);
 642
 643         mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 644         mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 645         mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 646         mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 647
 648         mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 649         mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 650         cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 651         cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 652
 653         mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
 654         mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
 655         mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
 656         cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
 657         cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
 658         cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 659
 660         mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
 661         cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
 662
 663         for (int t = 0; t < DTL_TYPES; t++) {
 664                 vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
 665                     0);
 666         }
 667
 668         txg_list_create(&vd->vdev_ms_list, spa,
 669             offsetof(struct metaslab, ms_txg_node));
 670         txg_list_create(&vd->vdev_dtl_list, spa,
 671             offsetof(struct vdev, vdev_dtl_node));
 672         vd->vdev_stat.vs_timestamp = gethrtime();
 673         vdev_queue_init(vd);
 674         vdev_cache_init(vd);
 675
 676         return (vd);
 677 }
 678
 679 /*
 680  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 681  * creating a new vdev or loading an existing one - the behavior is slightly
 682  * different for each case.
 683  */
 684 int
 685 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 686     int alloctype)
 687 {
 688         vdev_ops_t *ops;
 689         char *type;
 690         uint64_t guid = 0, islog;
 691         vdev_t *vd;
 692         vdev_indirect_config_t *vic;
 693         char *tmp = NULL;
 694         int rc;
 695         vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
 696         boolean_t top_level = (parent && !parent->vdev_parent);
 697
 698         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 699
 700         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 701                 return (SET_ERROR(EINVAL));
 702
 703         if ((ops = vdev_getops(type)) == NULL)
 704                 return (SET_ERROR(EINVAL));
 705
 706         /*
 707          * If this is a load, get the vdev guid from the nvlist.
 708          * Otherwise, vdev_alloc_common() will generate one for us.
 709          */
 710         if (alloctype == VDEV_ALLOC_LOAD) {
 711                 uint64_t label_id;
 712
 713                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 714                     label_id != id)
 715                         return (SET_ERROR(EINVAL));
 716
 717                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 718                         return (SET_ERROR(EINVAL));
 719         } else if (alloctype == VDEV_ALLOC_SPARE) {
 720                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 721                         return (SET_ERROR(EINVAL));
 722         } else if (alloctype == VDEV_ALLOC_L2CACHE) {
 723                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 724                         return (SET_ERROR(EINVAL));
 725         } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 726                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 727                         return (SET_ERROR(EINVAL));
 728         }
 729
 730         /*
 731          * The first allocated vdev must be of type 'root'.
 732          */
 733         if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 734                 return (SET_ERROR(EINVAL));
 735
 736         /*
 737          * Determine whether we're a log vdev.
 738          */
 739         islog = 0;
 740         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 741         if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 742                 return (SET_ERROR(ENOTSUP));
 743
 744         if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 745                 return (SET_ERROR(ENOTSUP));
 746
 747         if (top_level && alloctype == VDEV_ALLOC_ADD) {
 748                 char *bias;
 749
 750                 /*
 751                  * If creating a top-level vdev, check for allocation
 752                  * classes input.
 753                  */
 754                 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 755                     &bias) == 0) {
 756                         alloc_bias = vdev_derive_alloc_bias(bias);
 757
 758                         /* spa_vdev_add() expects feature to be enabled */
 759                         if (spa->spa_load_state != SPA_LOAD_CREATE &&
 760                             !spa_feature_is_enabled(spa,
 761                             SPA_FEATURE_ALLOCATION_CLASSES)) {
 762                                 return (SET_ERROR(ENOTSUP));
 763                         }
 764                 }
 765
 766                 /* spa_vdev_add() expects feature to be enabled */
 767                 if (ops == &vdev_draid_ops &&
 768                     spa->spa_load_state != SPA_LOAD_CREATE &&
 769                     !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
 770                         return (SET_ERROR(ENOTSUP));
 771                 }
 772         }
 773
 774         /*
 775          * Initialize the vdev specific data.  This is done before calling
 776          * vdev_alloc_common() since it may fail and this simplifies the
 777          * error reporting and cleanup code paths.
 778          */
 779         void *tsd = NULL;
 780         if (ops->vdev_op_init != NULL) {
 781                 rc = ops->vdev_op_init(spa, nv, &tsd);
 782                 if (rc != 0) {
 783                         return (rc);
 784                 }
 785         }
 786
 787         vd = vdev_alloc_common(spa, id, guid, ops);
 788         vd->vdev_tsd = tsd;
 789         vd->vdev_islog = islog;
 790
 791         if (top_level && alloc_bias != VDEV_BIAS_NONE)
 792                 vd->vdev_alloc_bias = alloc_bias;
 793
 794         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
 795                 vd->vdev_path = spa_strdup(vd->vdev_path);
 796
 797         /*
 798          * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
 799          * fault on a vdev and want it to persist across imports (like with
 800          * zpool offline -f).
 801          */
 802         rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
 803         if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
 804                 vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 805                 vd->vdev_faulted = 1;
 806                 vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 807         }
 808
 809         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
 810                 vd->vdev_devid = spa_strdup(vd->vdev_devid);
 811         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 812             &vd->vdev_physpath) == 0)
 813                 vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 814
 815         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 816             &vd->vdev_enc_sysfs_path) == 0)
 817                 vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);
 818
 819         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
 820                 vd->vdev_fru = spa_strdup(vd->vdev_fru);
 821
 822         /*
 823          * Set the whole_disk property.  If it's not specified, leave the value
 824          * as -1.
 825          */
 826         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 827             &vd->vdev_wholedisk) != 0)
 828                 vd->vdev_wholedisk = -1ULL;
 829
 830         vic = &vd->vdev_indirect_config;
 831
 832         ASSERT0(vic->vic_mapping_object);
 833         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 834             &vic->vic_mapping_object);
 835         ASSERT0(vic->vic_births_object);
 836         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 837             &vic->vic_births_object);
 838         ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
 839         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 840             &vic->vic_prev_indirect_vdev);
 841
 842         /*
 843          * Look for the 'not present' flag.  This will only be set if the device
 844          * was not present at the time of import.
 845          */
 846         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 847             &vd->vdev_not_present);
 848
 849         /*
 850          * Get the alignment requirement.
 851          */
 852         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
 853
 854         /*
 855          * Retrieve the vdev creation time.
 856          */
 857         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 858             &vd->vdev_crtxg);
 859
 860         /*
 861          * If we're a top-level vdev, try to load the allocation parameters.
 862          */
 863         if (top_level &&
 864             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 865                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 866                     &vd->vdev_ms_array);
 867                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 868                     &vd->vdev_ms_shift);
 869                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 870                     &vd->vdev_asize);
 871                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
 872                     &vd->vdev_noalloc);
 873                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 874                     &vd->vdev_removing);
 875                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 876                     &vd->vdev_top_zap);
 877         } else {
 878                 ASSERT0(vd->vdev_top_zap);
 879         }
 880
 881         if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
 882                 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 883                     alloctype == VDEV_ALLOC_ADD ||
 884                     alloctype == VDEV_ALLOC_SPLIT ||
 885                     alloctype == VDEV_ALLOC_ROOTPOOL);
 886                 /* Note: metaslab_group_create() is now deferred */
 887         }
 888
 889         if (vd->vdev_ops->vdev_op_leaf &&
 890             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 891                 (void) nvlist_lookup_uint64(nv,
 892                     ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
 893         } else {
 894                 ASSERT0(vd->vdev_leaf_zap);
 895         }
 896
 897         /*
 898          * If we're a leaf vdev, try to load the DTL object and other state.
 899          */
 900
 901         if (vd->vdev_ops->vdev_op_leaf &&
 902             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 903             alloctype == VDEV_ALLOC_ROOTPOOL)) {
 904                 if (alloctype == VDEV_ALLOC_LOAD) {
 905                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 906                             &vd->vdev_dtl_object);
 907                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 908                             &vd->vdev_unspare);
 909                 }
 910
 911                 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 912                         uint64_t spare = 0;
 913
 914                         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 915                             &spare) == 0 && spare)
 916                                 spa_spare_add(vd);
 917                 }
 918
 919                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 920                     &vd->vdev_offline);
 921
 922                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 923                     &vd->vdev_resilver_txg);
 924
 925                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
 926                     &vd->vdev_rebuild_txg);
 927
 928                 if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
 929                         vdev_defer_resilver(vd);
 930
 931                 /*
 932                  * In general, when importing a pool we want to ignore the
 933                  * persistent fault state, as the diagnosis made on another
 934                  * system may not be valid in the current context.  The only
 935                  * exception is if we forced a vdev to a persistently faulted
 936                  * state with 'zpool offline -f'.  The persistent fault will
 937                  * remain across imports until cleared.
 938                  *
 939                  * Local vdevs will remain in the faulted state.
 940                  */
 941                 if (spa_load_state(spa) == SPA_LOAD_OPEN ||
 942                     spa_load_state(spa) == SPA_LOAD_IMPORT) {
 943                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 944                             &vd->vdev_faulted);
 945                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 946                             &vd->vdev_degraded);
 947                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 948                             &vd->vdev_removed);
 949
 950                         if (vd->vdev_faulted || vd->vdev_degraded) {
 951                                 char *aux;
 952
 953                                 vd->vdev_label_aux =
 954                                     VDEV_AUX_ERR_EXCEEDED;
 955                                 if (nvlist_lookup_string(nv,
 956                                     ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 957                                     strcmp(aux, "external") == 0)
 958                                         vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 959                                 else
 960                                         vd->vdev_faulted = 0ULL;
 961                         }
 962                 }
 963         }
 964
 965         /*
 966          * Add ourselves to the parent's list of children.
 967          */
 968         vdev_add_child(parent, vd);
 969
 970         *vdp = vd;
 971
 972         return (0);
 973 }
 974
/*
 * Tear down and free a vdev together with all of its children.
 *
 * The vdev is closed, each child is freed recursively, allocation state
 * (metaslabs and metaslab groups) is discarded, the vdev is unlinked from
 * its parent, and then the per-vdev strings, DTL range trees, indirect and
 * obsolete state, locks, condition variables and rate limiters are released
 * before the vdev_t itself is handed back to kmem_free().
 *
 * The initialize, trim, autotrim and rebuild thread pointers must already
 * be NULL, and the vdev must not be on the config or state dirty lists once
 * it has been closed (both are asserted).
 */
void
vdev_free(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;

        /* No initialize/trim/autotrim/rebuild thread may still be attached. */
        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
        ASSERT3P(vd->vdev_trim_thread, ==, NULL);
        ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
        ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);

        /*
         * Scan queues are normally destroyed at the end of a scan. If the
         * queue exists here, that implies the vdev is being removed while
         * the scan is still running.
         */
        if (vd->vdev_scan_io_queue != NULL) {
                mutex_enter(&vd->vdev_scan_io_queue_lock);
                dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
                vd->vdev_scan_io_queue = NULL;
                mutex_exit(&vd->vdev_scan_io_queue_lock);
        }

        /*
         * vdev_free() implies closing the vdev first.  This is simpler than
         * trying to ensure complicated semantics for all callers.
         */
        vdev_close(vd);

        ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
        ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

        /*
         * Free all children.  Each child's own vdev_free() call unlinks it
         * from this vdev via vdev_remove_child(); the assertions below
         * expect the child array to be gone and the guid sum to be back to
         * this vdev's own guid afterwards.
         */
        for (int c = 0; c < vd->vdev_children; c++)
                vdev_free(vd->vdev_child[c]);

        ASSERT(vd->vdev_child == NULL);
        ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

        /* Give the vdev type a chance to release its private state. */
        if (vd->vdev_ops->vdev_op_fini != NULL)
                vd->vdev_ops->vdev_op_fini(vd);

        /*
         * Discard allocation state.
         */
        if (vd->vdev_mg != NULL) {
                vdev_metaslab_fini(vd);
                metaslab_group_destroy(vd->vdev_mg);
                vd->vdev_mg = NULL;
        }
        if (vd->vdev_log_mg != NULL) {
                ASSERT0(vd->vdev_ms_count);
                metaslab_group_destroy(vd->vdev_log_mg);
                vd->vdev_log_mg = NULL;
        }

        /* All space accounting must have been released by now. */
        ASSERT0(vd->vdev_stat.vs_space);
        ASSERT0(vd->vdev_stat.vs_dspace);
        ASSERT0(vd->vdev_stat.vs_alloc);

        /*
         * Remove this vdev from its parent's child list.
         */
        vdev_remove_child(vd->vdev_parent, vd);

        ASSERT(vd->vdev_parent == NULL);
        ASSERT(!list_link_active(&vd->vdev_leaf_node));

        /*
         * Clean up vdev structure.
         */
        vdev_queue_fini(vd);
        vdev_cache_fini(vd);

        /* Release the strings owned by this vdev. */
        if (vd->vdev_path)
                spa_strfree(vd->vdev_path);
        if (vd->vdev_devid)
                spa_strfree(vd->vdev_devid);
        if (vd->vdev_physpath)
                spa_strfree(vd->vdev_physpath);

        if (vd->vdev_enc_sysfs_path)
                spa_strfree(vd->vdev_enc_sysfs_path);

        if (vd->vdev_fru)
                spa_strfree(vd->vdev_fru);

        /* Drop the vdev from the spare / l2cache bookkeeping if listed. */
        if (vd->vdev_isspare)
                spa_spare_remove(vd);
        if (vd->vdev_isl2cache)
                spa_l2cache_remove(vd);

        txg_list_destroy(&vd->vdev_ms_list);
        txg_list_destroy(&vd->vdev_dtl_list);

        /* Close the DTL space map and empty/destroy every DTL range tree. */
        mutex_enter(&vd->vdev_dtl_lock);
        space_map_close(vd->vdev_dtl_sm);
        for (int t = 0; t < DTL_TYPES; t++) {
                range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
                range_tree_destroy(vd->vdev_dtl[t]);
        }
        mutex_exit(&vd->vdev_dtl_lock);

        /* The indirect births and mapping objects exist only as a pair. */
        EQUIV(vd->vdev_indirect_births != NULL,
            vd->vdev_indirect_mapping != NULL);
        if (vd->vdev_indirect_births != NULL) {
                vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
                vdev_indirect_births_close(vd->vdev_indirect_births);
        }

        if (vd->vdev_obsolete_sm != NULL) {
                ASSERT(vd->vdev_removing ||
                    vd->vdev_ops == &vdev_indirect_ops);
                space_map_close(vd->vdev_obsolete_sm);
                vd->vdev_obsolete_sm = NULL;
        }
        range_tree_destroy(vd->vdev_obsolete_segments);
        rw_destroy(&vd->vdev_indirect_rwlock);
        mutex_destroy(&vd->vdev_obsolete_lock);

        /* Destroy the remaining synchronization primitives. */
        mutex_destroy(&vd->vdev_dtl_lock);
        mutex_destroy(&vd->vdev_stat_lock);
        mutex_destroy(&vd->vdev_probe_lock);
        mutex_destroy(&vd->vdev_scan_io_queue_lock);

        mutex_destroy(&vd->vdev_initialize_lock);
        mutex_destroy(&vd->vdev_initialize_io_lock);
        cv_destroy(&vd->vdev_initialize_io_cv);
        cv_destroy(&vd->vdev_initialize_cv);

        mutex_destroy(&vd->vdev_trim_lock);
        mutex_destroy(&vd->vdev_autotrim_lock);
        mutex_destroy(&vd->vdev_trim_io_lock);
        cv_destroy(&vd->vdev_trim_cv);
        cv_destroy(&vd->vdev_autotrim_cv);
        cv_destroy(&vd->vdev_trim_io_cv);

        mutex_destroy(&vd->vdev_rebuild_lock);
        cv_destroy(&vd->vdev_rebuild_cv);

        zfs_ratelimit_fini(&vd->vdev_delay_rl);
        zfs_ratelimit_fini(&vd->vdev_deadman_rl);
        zfs_ratelimit_fini(&vd->vdev_checksum_rl);

        /* Don't leave the spa pointing at the vdev we are about to free. */
        if (vd == spa->spa_root_vdev)
                spa->spa_root_vdev = NULL;

        kmem_free(vd, sizeof (vdev_t));
}
1125
/*
 * Transfer top-level vdev state from svd to tvd.
 *
 * tvd must already be its own top-level vdev (asserted).  The metaslab
 * array parameters, metaslab groups, checkpoint space map, allocation bias,
 * space statistics, removal/rebuild/indirect state, per-txg dirty lists,
 * config/state dirty status, deflate ratio, log flag and scan I/O queue are
 * handed over to tvd, and the corresponding fields on svd are reset so that
 * the state has a single owner afterwards.
 *
 * NOTE(review): vdev_pending_fastwrite is copied to tvd but, unlike the
 * other fields, is not cleared on svd here.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
        spa_t *spa = svd->vdev_spa;
        metaslab_t *msp;
        vdev_t *vd;
        int t;

        ASSERT(tvd == tvd->vdev_top);

        /* Metaslab array description and top-level ZAP object. */
        tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
        tvd->vdev_ms_array = svd->vdev_ms_array;
        tvd->vdev_ms_shift = svd->vdev_ms_shift;
        tvd->vdev_ms_count = svd->vdev_ms_count;
        tvd->vdev_top_zap = svd->vdev_top_zap;

        svd->vdev_ms_array = 0;
        svd->vdev_ms_shift = 0;
        svd->vdev_ms_count = 0;
        svd->vdev_top_zap = 0;

        /*
         * If tvd already has metaslab groups they must be the very same
         * ones svd holds; otherwise overwriting them below would lose them.
         */
        if (tvd->vdev_mg)
                ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
        if (tvd->vdev_log_mg)
                ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
        tvd->vdev_mg = svd->vdev_mg;
        tvd->vdev_log_mg = svd->vdev_log_mg;
        tvd->vdev_ms = svd->vdev_ms;

        svd->vdev_mg = NULL;
        svd->vdev_log_mg = NULL;
        svd->vdev_ms = NULL;

        /* Re-point the groups' back-references at their new owner. */
        if (tvd->vdev_mg != NULL)
                tvd->vdev_mg->mg_vd = tvd;
        if (tvd->vdev_log_mg != NULL)
                tvd->vdev_log_mg->mg_vd = tvd;

        tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
        svd->vdev_checkpoint_sm = NULL;

        tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
        svd->vdev_alloc_bias = VDEV_BIAS_NONE;

        /* Space accounting follows the metaslabs. */
        tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
        tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
        tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

        svd->vdev_stat.vs_alloc = 0;
        svd->vdev_stat.vs_space = 0;
        svd->vdev_stat.vs_dspace = 0;

        /*
         * State which may be set on a top-level vdev that's in the
         * process of being removed.
         *
         * tvd is expected to carry none of this state yet; it is moved
         * over from svd and svd is returned to the "unset" values.
         */
        ASSERT0(tvd->vdev_indirect_config.vic_births_object);
        ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
        ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
        ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
        ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
        ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
        ASSERT0(tvd->vdev_noalloc);
        ASSERT0(tvd->vdev_removing);
        ASSERT0(tvd->vdev_rebuilding);
        tvd->vdev_noalloc = svd->vdev_noalloc;
        tvd->vdev_removing = svd->vdev_removing;
        tvd->vdev_rebuilding = svd->vdev_rebuilding;
        tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
        tvd->vdev_indirect_config = svd->vdev_indirect_config;
        tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
        tvd->vdev_indirect_births = svd->vdev_indirect_births;
        range_tree_swap(&svd->vdev_obsolete_segments,
            &tvd->vdev_obsolete_segments);
        tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
        svd->vdev_indirect_config.vic_mapping_object = 0;
        svd->vdev_indirect_config.vic_births_object = 0;
        svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
        svd->vdev_indirect_mapping = NULL;
        svd->vdev_indirect_births = NULL;
        svd->vdev_obsolete_sm = NULL;
        svd->vdev_noalloc = 0;
        svd->vdev_removing = 0;
        svd->vdev_rebuilding = 0;

        /*
         * Move every entry of svd's per-txg metaslab and DTL lists to tvd,
         * and if svd itself is on the spa's per-txg vdev list, replace it
         * with tvd.
         */
        for (t = 0; t < TXG_SIZE; t++) {
                while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
                        (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
                while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
                        (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
                if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
                        (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
        }

        /* Carry over config-dirty and state-dirty status. */
        if (list_link_active(&svd->vdev_config_dirty_node)) {
                vdev_config_clean(svd);
                vdev_config_dirty(tvd);
        }

        if (list_link_active(&svd->vdev_state_dirty_node)) {
                vdev_state_clean(svd);
                vdev_state_dirty(tvd);
        }

        tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
        svd->vdev_deflate_ratio = 0;

        tvd->vdev_islog = svd->vdev_islog;
        svd->vdev_islog = 0;

        /* Finally hand over any in-progress scan I/O queue. */
        dsl_scan_io_queue_vdev_xfer(svd, tvd);
}
1241
1242 static void
1243 vdev_top_update(vdev_t *tvd, vdev_t *vd)
1244 {
1245         if (vd == NULL)
1246                 return;
1247
1248         vd->vdev_top = tvd;
1249
1250         for (int c = 0; c < vd->vdev_children; c++)
1251                 vdev_top_update(tvd, vd->vdev_child[c]);
1252 }
1253
1254 /*
1255  * Add a mirror/replacing vdev above an existing vdev.  There is no need to
1256  * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
1257  */
1258 vdev_t *
1259 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
1260 {
1261         spa_t *spa = cvd->vdev_spa;
1262         vdev_t *pvd = cvd->vdev_parent;
1263         vdev_t *mvd;
1264
1265         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1266
1267         mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
1268
1269         mvd->vdev_asize = cvd->vdev_asize;
1270         mvd->vdev_min_asize = cvd->vdev_min_asize;
1271         mvd->vdev_max_asize = cvd->vdev_max_asize;
1272         mvd->vdev_psize = cvd->vdev_psize;
1273         mvd->vdev_ashift = cvd->vdev_ashift;
1274         mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
1275         mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
1276         mvd->vdev_state = cvd->vdev_state;
1277         mvd->vdev_crtxg = cvd->vdev_crtxg;
1278
1279         vdev_remove_child(pvd, cvd);
1280         vdev_add_child(pvd, mvd);
1281         cvd->vdev_id = mvd->vdev_children;
1282         vdev_add_child(mvd, cvd);
1283         vdev_top_update(cvd->vdev_top, cvd->vdev_top);
1284
1285         if (mvd == mvd->vdev_top)
1286                 vdev_top_transfer(cvd, mvd);
1287
1288         return (mvd);
1289 }
1290
/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 *
 * cvd is the sole child of the interior vdev being removed (its parent,
 * mvd).  cvd is re-attached to mvd's parent under mvd's id, inherits mvd's
 * ashift values, and - when mvd was a top-level vdev - also takes over
 * mvd's guid and top-level state.  mvd is freed before returning.
 *
 * The caller must hold all config locks as writer.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
        vdev_t *mvd = cvd->vdev_parent;
        vdev_t *pvd = mvd->vdev_parent;

        ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

        ASSERT(mvd->vdev_children == 1);
        ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
            mvd->vdev_ops == &vdev_replacing_ops ||
            mvd->vdev_ops == &vdev_spare_ops);
        /* The surviving child keeps the alignment the parent was using. */
        cvd->vdev_ashift = mvd->vdev_ashift;
        cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
        cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
        /* Detach cvd from mvd, and mvd from the rest of the tree. */
        vdev_remove_child(mvd, cvd);
        vdev_remove_child(pvd, mvd);

        /*
         * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
         * Otherwise, we could have detached an offline device, and when we
         * go to import the pool we'll think we have two top-level vdevs,
         * instead of a different version of the same top-level vdev.
         */
        if (mvd->vdev_top == mvd) {
                /*
                 * Adding the delta makes cvd's guid equal to mvd's and
                 * shifts cvd's guid sum by the same amount; the original
                 * guid is remembered in vdev_orig_guid.
                 */
                uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
                cvd->vdev_orig_guid = cvd->vdev_guid;
                cvd->vdev_guid += guid_delta;
                cvd->vdev_guid_sum += guid_delta;

                /*
                 * If pool not set for autoexpand, we need to also preserve
                 * mvd's asize to prevent automatic expansion of cvd.
                 * Otherwise if we are adjusting the mirror by attaching and
                 * detaching children of non-uniform sizes, the mirror could
                 * autoexpand, unexpectedly requiring larger devices to
                 * re-establish the mirror.
                 */
                if (!cvd->vdev_spa->spa_autoexpand)
                        cvd->vdev_asize = mvd->vdev_asize;
        }
        /* Put cvd into the slot mvd used to occupy under pvd. */
        cvd->vdev_id = mvd->vdev_id;
        vdev_add_child(pvd, cvd);
        vdev_top_update(cvd->vdev_top, cvd->vdev_top);

        /* If cvd is now top-level, it inherits mvd's top-level state. */
        if (cvd == cvd->vdev_top)
                vdev_top_transfer(mvd, cvd);

        ASSERT(mvd->vdev_children == 0);
        vdev_free(mvd);
}
1345
1346 void
1347 vdev_metaslab_group_create(vdev_t *vd)
1348 {
1349         spa_t *spa = vd->vdev_spa;
1350
1351         /*
1352          * metaslab_group_create was delayed until allocation bias was available
1353          */
1354         if (vd->vdev_mg == NULL) {
1355                 metaslab_class_t *mc;
1356
1357                 if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
1358                         vd->vdev_alloc_bias = VDEV_BIAS_LOG;
1359
1360                 ASSERT3U(vd->vdev_islog, ==,
1361                     (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
1362
1363                 switch (vd->vdev_alloc_bias) {
1364                 case VDEV_BIAS_LOG:
1365                         mc = spa_log_class(spa);
1366                         break;
1367                 case VDEV_BIAS_SPECIAL:
1368                         mc = spa_special_class(spa);
1369                         break;
1370                 case VDEV_BIAS_DEDUP:
1371                         mc = spa_dedup_class(spa);
1372                         break;
1373                 default:
1374                         mc = spa_normal_class(spa);
1375                 }
1376
1377                 vd->vdev_mg = metaslab_group_create(mc, vd,
1378                     spa->spa_alloc_count);
1379
1380                 if (!vd->vdev_islog) {
1381                         vd->vdev_log_mg = metaslab_group_create(
1382                             spa_embedded_log_class(spa), vd, 1);
1383                 }
1384
1385                 /*
1386                  * The spa ashift min/max only apply for the normal metaslab
1387                  * class. Class destination is late binding so ashift boundary
1388                  * setting had to wait until now.
1389                  */
1390                 if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
1391                     mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
1392                         if (vd->vdev_ashift > spa->spa_max_ashift)
1393                                 spa->spa_max_ashift = vd->vdev_ashift;
1394                         if (vd->vdev_ashift < spa->spa_min_ashift)
1395                                 spa->spa_min_ashift = vd->vdev_ashift;
1396
1397                         uint64_t min_alloc = vdev_get_min_alloc(vd);
1398                         if (min_alloc < spa->spa_min_alloc)
1399                                 spa->spa_min_alloc = min_alloc;
1400                 }
1401         }
1402 }
1403
/*
 * Create the metaslabs for a vdev, or add the additional ones needed after
 * the vdev has grown.
 *
 * The target metaslab count is vdev_asize >> vdev_ms_shift.  A new pointer
 * array of that size is allocated, any existing metaslab pointers are copied
 * over, and metaslab_init() is called for each new slot.  One of the new
 * metaslabs may then be moved to the embedded log metaslab group.
 *
 * txg: when 0, each new metaslab's object number is read from the
 *      vdev_ms_array object (if there is one) and SCL_ALLOC is taken as
 *      writer around the activation step; when non-zero the caller must
 *      already hold SCL_ALLOC as writer (asserted).
 *
 * Returns 0 on success (including the case where vdev_ms_shift is 0 and
 * nothing is done), or the error from dmu_read() / metaslab_init().  On
 * such an error vdev_ms and vdev_ms_count have already been updated and are
 * not unwound here.
 */
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
        spa_t *spa = vd->vdev_spa;
        uint64_t oldc = vd->vdev_ms_count;
        uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
        metaslab_t **mspp;
        int error;
        /* A non-zero existing count means we are growing, not creating. */
        boolean_t expanding = (oldc != 0);

        ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

        /*
         * This vdev is not being allocated from yet or is a hole.
         */
        if (vd->vdev_ms_shift == 0)
                return (0);

        ASSERT(!vd->vdev_ishole);

        ASSERT(oldc <= newc);

        mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

        /* Carry the existing metaslab pointers over into the new array. */
        if (expanding) {
                memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
                vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
        }

        vd->vdev_ms = mspp;
        vd->vdev_ms_count = newc;

        /* Initialize only the newly added slots [oldc, newc). */
        for (uint64_t m = oldc; m < newc; m++) {
                uint64_t object = 0;
                /*
                 * vdev_ms_array may be 0 if we are creating the "fake"
                 * metaslabs for an indirect vdev for zdb's leak detection.
                 * See zdb_leak_init().
                 */
                if (txg == 0 && vd->vdev_ms_array != 0) {
                        error = dmu_read(spa->spa_meta_objset,
                            vd->vdev_ms_array,
                            m * sizeof (uint64_t), sizeof (uint64_t), &object,
                            DMU_READ_PREFETCH);
                        if (error != 0) {
                                vdev_dbgmsg(vd, "unable to read the metaslab "
                                    "array [error=%d]", error);
                                return (error);
                        }
                }

                error = metaslab_init(vd->vdev_mg, m, object, txg,
                    &(vd->vdev_ms[m]));
                if (error != 0) {
                        vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
                            error);
                        return (error);
                }
        }

        /*
         * Find the emptiest metaslab on the vdev and mark it for use for
         * embedded slog by moving it from the regular to the log metaslab
         * group.
         *
         * This is only done for vdevs in the normal class that have more
         * than zfs_embedded_slog_min_ms metaslabs and whose log group does
         * not contain a metaslab yet.
         */
        if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
            vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
            avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
                uint64_t slog_msid = 0;
                uint64_t smallest = UINT64_MAX;

                /*
                 * Note, we only search the new metaslabs, because the old
                 * (pre-existing) ones may be active (e.g. have non-empty
                 * range_tree's), and we don't move them to the new
                 * metaslab_t.
                 */
                for (uint64_t m = oldc; m < newc; m++) {
                        uint64_t alloc =
                            space_map_allocated(vd->vdev_ms[m]->ms_sm);
                        if (alloc < smallest) {
                                slog_msid = m;
                                smallest = alloc;
                        }
                }
                metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
                /*
                 * The metaslab was marked as dirty at the end of
                 * metaslab_init(). Remove it from the dirty list so that we
                 * can uninitialize and reinitialize it to the new class.
                 */
                if (txg != 0) {
                        (void) txg_list_remove_this(&vd->vdev_ms_list,
                            slog_ms, txg);
                }
                /* Remember the space map object before tearing it down. */
                uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
                metaslab_fini(slog_ms);
                VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
                    &vd->vdev_ms[slog_msid]));
        }

        /* With txg == 0 the caller does not hold SCL_ALLOC; take it here. */
        if (txg == 0)
                spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

        /*
         * If the vdev is marked as non-allocating then don't
         * activate the metaslabs since we want to ensure that
         * no allocations are performed on this device.
         */
        if (vd->vdev_noalloc) {
                /* track non-allocating vdev space */
                spa->spa_nonallocating_dspace += spa_deflate(spa) ?
                    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
        } else if (!expanding) {
                /* Groups are activated only on initial creation. */
                metaslab_group_activate(vd->vdev_mg);
                if (vd->vdev_log_mg != NULL)
                        metaslab_group_activate(vd->vdev_log_mg);
        }

        if (txg == 0)
                spa_config_exit(spa, SCL_ALLOC, FTAG);

        /*
         * Regardless whether this vdev was just added or it is being
         * expanded, the metaslab count has changed. Recalculate the
         * block limit.
         */
        spa_log_sm_set_blocklimit(spa);

        return (0);
}
1535
1536 void
1537 vdev_metaslab_fini(vdev_t *vd)
1538 {
1539         if (vd->vdev_checkpoint_sm != NULL) {
1540                 ASSERT(spa_feature_is_active(vd->vdev_spa,
1541                     SPA_FEATURE_POOL_CHECKPOINT));
1542                 space_map_close(vd->vdev_checkpoint_sm);
1543                 /*
1544                  * Even though we close the space map, we need to set its
1545                  * pointer to NULL. The reason is that vdev_metaslab_fini()
1546                  * may be called multiple times for certain operations
1547                  * (i.e. when destroying a pool) so we need to ensure that
1548                  * this clause never executes twice. This logic is similar
1549                  * to the one used for the vdev_ms clause below.
1550                  */
1551                 vd->vdev_checkpoint_sm = NULL;
1552         }
1553
1554         if (vd->vdev_ms != NULL) {
1555                 metaslab_group_t *mg = vd->vdev_mg;
1556
1557                 metaslab_group_passivate(mg);
1558                 if (vd->vdev_log_mg != NULL) {
1559                         ASSERT(!vd->vdev_islog);
1560                         metaslab_group_passivate(vd->vdev_log_mg);
1561                 }
1562
1563                 uint64_t count = vd->vdev_ms_count;
1564                 for (uint64_t m = 0; m < count; m++) {
1565                         metaslab_t *msp = vd->vdev_ms[m];
1566                         if (msp != NULL)
1567                                 metaslab_fini(msp);
1568                 }
1569                 vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1570                 vd->vdev_ms = NULL;
1571                 vd->vdev_ms_count = 0;
1572
1573                 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1574                         ASSERT0(mg->mg_histogram[i]);
1575                         if (vd->vdev_log_mg != NULL)
1576                                 ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
1577                 }
1578         }
1579         ASSERT0(vd->vdev_ms_count);
1580         ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
1581 }
1582
/*
 * Per-probe bookkeeping, passed as the private argument of the probe zios
 * and freed by vdev_probe_done() when the ZIO_TYPE_NULL probe zio completes.
 */
typedef struct vdev_probe_stats {
        boolean_t       vps_readable;   /* a probe read finished w/o error */
        boolean_t       vps_writeable;  /* a probe write finished w/o error */
        int             vps_flags;      /* zio flags used for probe I/Os */
} vdev_probe_stats_t;
1588
1589 static void
1590 vdev_probe_done(zio_t *zio)
1591 {
1592         spa_t *spa = zio->io_spa;
1593         vdev_t *vd = zio->io_vd;
1594         vdev_probe_stats_t *vps = zio->io_private;
1595
1596         ASSERT(vd->vdev_probe_zio != NULL);
1597
1598         if (zio->io_type == ZIO_TYPE_READ) {
1599                 if (zio->io_error == 0)
1600                         vps->vps_readable = 1;
1601                 if (zio->io_error == 0 && spa_writeable(spa)) {
1602                         zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
1603                             zio->io_offset, zio->io_size, zio->io_abd,
1604                             ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1605                             ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
1606                 } else {
1607                         abd_free(zio->io_abd);
1608                 }
1609         } else if (zio->io_type == ZIO_TYPE_WRITE) {
1610                 if (zio->io_error == 0)
1611                         vps->vps_writeable = 1;
1612                 abd_free(zio->io_abd);
1613         } else if (zio->io_type == ZIO_TYPE_NULL) {
1614                 zio_t *pio;
1615                 zio_link_t *zl;
1616
1617                 vd->vdev_cant_read |= !vps->vps_readable;
1618                 vd->vdev_cant_write |= !vps->vps_writeable;
1619
1620                 if (vdev_readable(vd) &&
1621                     (vdev_writeable(vd) || !spa_writeable(spa))) {
1622                         zio->io_error = 0;
1623                 } else {
1624                         ASSERT(zio->io_error != 0);
1625                         vdev_dbgmsg(vd, "failed probe");
1626                         (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
1627                             spa, vd, NULL, NULL, 0);
1628                         zio->io_error = SET_ERROR(ENXIO);
1629                 }
1630
1631                 mutex_enter(&vd->vdev_probe_lock);
1632                 ASSERT(vd->vdev_probe_zio == zio);
1633                 vd->vdev_probe_zio = NULL;
1634                 mutex_exit(&vd->vdev_probe_lock);
1635
1636                 zl = NULL;
1637                 while ((pio = zio_walk_parents(zio, &zl)) != NULL)
1638                         if (!vdev_accessible(vd, pio))
1639                                 pio->io_error = SET_ERROR(ENXIO);
1640
1641                 kmem_free(vps, sizeof (*vps));
1642         }
1643 }
1644
1645 /*
1646  * Determine whether this device is accessible.
1647  *
1648  * Read and write to several known locations: the pad regions of each
1649  * vdev label but the first, which we leave alone in case it contains
1650  * a VTOC.
1651  */
1652 zio_t *
1653 vdev_probe(vdev_t *vd, zio_t *zio)
1654 {
1655         spa_t *spa = vd->vdev_spa;
1656         vdev_probe_stats_t *vps = NULL;
1657         zio_t *pio;
1658
1659         ASSERT(vd->vdev_ops->vdev_op_leaf);
1660
1661         /*
1662          * Don't probe the probe.
1663          */
1664         if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
1665                 return (NULL);
1666
1667         /*
1668          * To prevent 'probe storms' when a device fails, we create
1669          * just one probe i/o at a time.  All zios that want to probe
1670          * this vdev will become parents of the probe io.
1671          */
1672         mutex_enter(&vd->vdev_probe_lock);
1673
1674         if ((pio = vd->vdev_probe_zio) == NULL) {
1675                 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
1676
1677                 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
1678                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
1679                     ZIO_FLAG_TRYHARD;
1680
1681                 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
1682                         /*
1683                          * vdev_cant_read and vdev_cant_write can only
1684                          * transition from TRUE to FALSE when we have the
1685                          * SCL_ZIO lock as writer; otherwise they can only
1686                          * transition from FALSE to TRUE.  This ensures that
1687                          * any zio looking at these values can assume that
1688                          * failures persist for the life of the I/O.  That's
1689                          * important because when a device has intermittent
1690                          * connectivity problems, we want to ensure that
1691                          * they're ascribed to the device (ENXIO) and not
1692                          * the zio (EIO).
1693                          *
1694                          * Since we hold SCL_ZIO as writer here, clear both
1695                          * values so the probe can reevaluate from first
1696                          * principles.
1697                          */
1698                         vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
1699                         vd->vdev_cant_read = B_FALSE;
1700                         vd->vdev_cant_write = B_FALSE;
1701                 }
1702
1703                 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
1704                     vdev_probe_done, vps,
1705                     vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
1706
1707                 /*
1708                  * We can't change the vdev state in this context, so we
1709                  * kick off an async task to do it on our behalf.
1710                  */
1711                 if (zio != NULL) {
1712                         vd->vdev_probe_wanted = B_TRUE;
1713                         spa_async_request(spa, SPA_ASYNC_PROBE);
1714                 }
1715         }
1716
1717         if (zio != NULL)
1718                 zio_add_child(zio, pio);
1719
1720         mutex_exit(&vd->vdev_probe_lock);
1721
1722         if (vps == NULL) {
1723                 ASSERT(zio != NULL);
1724                 return (NULL);
1725         }
1726
1727         for (int l = 1; l < VDEV_LABELS; l++) {
1728                 zio_nowait(zio_read_phys(pio, vd,
1729                     vdev_label_offset(vd->vdev_psize, l,
1730                     offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
1731                     abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
1732                     ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1733                     ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
1734         }
1735
1736         if (zio == NULL)
1737                 return (pio);
1738
1739         zio_nowait(pio);
1740         return (NULL);
1741 }
1742
1743 static void
1744 vdev_load_child(void *arg)
1745 {
1746         vdev_t *vd = arg;
1747
1748         vd->vdev_load_error = vdev_load(vd);
1749 }
1750
1751 static void
1752 vdev_open_child(void *arg)
1753 {
1754         vdev_t *vd = arg;
1755
1756         vd->vdev_open_thread = curthread;
1757         vd->vdev_open_error = vdev_open(vd);
1758         vd->vdev_open_thread = NULL;
1759 }
1760
1761 static boolean_t
1762 vdev_uses_zvols(vdev_t *vd)
1763 {
1764 #ifdef _KERNEL
1765         if (zvol_is_zvol(vd->vdev_path))
1766                 return (B_TRUE);
1767 #endif
1768
1769         for (int c = 0; c < vd->vdev_children; c++)
1770                 if (vdev_uses_zvols(vd->vdev_child[c]))
1771                         return (B_TRUE);
1772
1773         return (B_FALSE);
1774 }
1775
1776 /*
1777  * Returns B_TRUE if the passed child should be opened.
1778  */
1779 static boolean_t
1780 vdev_default_open_children_func(vdev_t *vd)
1781 {
1782         (void) vd;
1783         return (B_TRUE);
1784 }
1785
1786 /*
1787  * Open the requested child vdevs.  If any of the leaf vdevs are using
1788  * a ZFS volume then do the opens in a single thread.  This avoids a
1789  * deadlock when the current thread is holding the spa_namespace_lock.
1790  */
1791 static void
1792 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
1793 {
1794         int children = vd->vdev_children;
1795
1796         taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
1797             children, children, TASKQ_PREPOPULATE);
1798         vd->vdev_nonrot = B_TRUE;
1799
1800         for (int c = 0; c < children; c++) {
1801                 vdev_t *cvd = vd->vdev_child[c];
1802
1803                 if (open_func(cvd) == B_FALSE)
1804                         continue;
1805
1806                 if (tq == NULL || vdev_uses_zvols(vd)) {
1807                         cvd->vdev_open_error = vdev_open(cvd);
1808                 } else {
1809                         VERIFY(taskq_dispatch(tq, vdev_open_child,
1810                             cvd, TQ_SLEEP) != TASKQID_INVALID);
1811                 }
1812
1813                 vd->vdev_nonrot &= cvd->vdev_nonrot;
1814         }
1815
1816         if (tq != NULL) {
1817                 taskq_wait(tq);
1818                 taskq_destroy(tq);
1819         }
1820 }
1821
1822 /*
1823  * Open all child vdevs.
1824  */
1825 void
1826 vdev_open_children(vdev_t *vd)
1827 {
1828         vdev_open_children_impl(vd, vdev_default_open_children_func);
1829 }
1830
1831 /*
1832  * Conditionally open a subset of child vdevs.
1833  */
1834 void
1835 vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
1836 {
1837         vdev_open_children_impl(vd, open_func);
1838 }
1839
1840 /*
1841  * Compute the raidz-deflation ratio.  Note, we hard-code
1842  * in 128k (1 << 17) because it is the "typical" blocksize.
1843  * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
1844  * otherwise it would inconsistently account for existing bp's.
1845  */
1846 static void
1847 vdev_set_deflate_ratio(vdev_t *vd)
1848 {
1849         if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
1850                 vd->vdev_deflate_ratio = (1 << 17) /
1851                     (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
1852         }
1853 }
1854
1855 /*
1856  * Maximize performance by inflating the configured ashift for top level
1857  * vdevs to be as close to the physical ashift as possible while maintaining
1858  * administrator defined limits and ensuring it doesn't go below the
1859  * logical ashift.
1860  */
1861 static void
1862 vdev_ashift_optimize(vdev_t *vd)
1863 {
1864         ASSERT(vd == vd->vdev_top);
1865
1866         if (vd->vdev_ashift < vd->vdev_physical_ashift) {
1867                 vd->vdev_ashift = MIN(
1868                     MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
1869                     MAX(zfs_vdev_min_auto_ashift,
1870                     vd->vdev_physical_ashift));
1871         } else {
1872                 /*
1873                  * If the logical and physical ashifts are the same, then
1874                  * we ensure that the top-level vdev's ashift is not smaller
1875                  * than our minimum ashift value. For the unusual case
1876                  * where logical ashift > physical ashift, we can't cap
1877                  * the calculated ashift based on max ashift as that
1878                  * would cause failures.
1879                  * We still check if we need to increase it to match
1880                  * the min ashift.
1881                  */
1882                 vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
1883                     vd->vdev_ashift);
1884         }
1885 }
1886
1887 /*
1888  * Prepare a virtual device for access.
1889  */
1890 int
1891 vdev_open(vdev_t *vd)
1892 {
1893         spa_t *spa = vd->vdev_spa;
1894         int error;
1895         uint64_t osize = 0;
1896         uint64_t max_osize = 0;
1897         uint64_t asize, max_asize, psize;
1898         uint64_t logical_ashift = 0;
1899         uint64_t physical_ashift = 0;
1900
1901         ASSERT(vd->vdev_open_thread == curthread ||
1902             spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1903         ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1904             vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1905             vd->vdev_state == VDEV_STATE_OFFLINE);
1906
1907         vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1908         vd->vdev_cant_read = B_FALSE;
1909         vd->vdev_cant_write = B_FALSE;
1910         vd->vdev_min_asize = vdev_get_min_asize(vd);
1911
1912         /*
1913          * If this vdev is not removed, check its fault status.  If it's
1914          * faulted, bail out of the open.
1915          */
1916         if (!vd->vdev_removed && vd->vdev_faulted) {
1917                 ASSERT(vd->vdev_children == 0);
1918                 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1919                     vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1920                 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1921                     vd->vdev_label_aux);
1922                 return (SET_ERROR(ENXIO));
1923         } else if (vd->vdev_offline) {
1924                 ASSERT(vd->vdev_children == 0);
1925                 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1926                 return (SET_ERROR(ENXIO));
1927         }
1928
1929         error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
1930             &logical_ashift, &physical_ashift);
1931         /*
1932          * Physical volume size should never be larger than its max size, unless
1933          * the disk has shrunk while we were reading it or the device is buggy
1934          * or damaged: either way it's not safe for use, bail out of the open.
1935          */
1936         if (osize > max_osize) {
1937                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1938                     VDEV_AUX_OPEN_FAILED);
1939                 return (SET_ERROR(ENXIO));
1940         }
1941
1942         /*
1943          * Reset the vdev_reopening flag so that we actually close
1944          * the vdev on error.
1945          */
1946         vd->vdev_reopening = B_FALSE;
1947         if (zio_injection_enabled && error == 0)
1948                 error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
1949
1950         if (error) {
1951                 if (vd->vdev_removed &&
1952                     vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1953                         vd->vdev_removed = B_FALSE;
1954
1955                 if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
1956                         vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
1957                             vd->vdev_stat.vs_aux);
1958                 } else {
1959                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1960                             vd->vdev_stat.vs_aux);
1961                 }
1962                 return (error);
1963         }
1964
1965         vd->vdev_removed = B_FALSE;
1966
1967         /*
1968          * Recheck the faulted flag now that we have confirmed that
1969          * the vdev is accessible.  If we're faulted, bail.
1970          */
1971         if (vd->vdev_faulted) {
1972                 ASSERT(vd->vdev_children == 0);
1973                 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1974                     vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1975                 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1976                     vd->vdev_label_aux);
1977                 return (SET_ERROR(ENXIO));
1978         }
1979
1980         if (vd->vdev_degraded) {
1981                 ASSERT(vd->vdev_children == 0);
1982                 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1983                     VDEV_AUX_ERR_EXCEEDED);
1984         } else {
1985                 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
1986         }
1987
1988         /*
1989          * For hole or missing vdevs we just return success.
1990          */
1991         if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
1992                 return (0);
1993
1994         for (int c = 0; c < vd->vdev_children; c++) {
1995                 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
1996                         vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1997                             VDEV_AUX_NONE);
1998                         break;
1999                 }
2000         }
2001
2002         osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
2003         max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
2004
2005         if (vd->vdev_children == 0) {
2006                 if (osize < SPA_MINDEVSIZE) {
2007                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2008                             VDEV_AUX_TOO_SMALL);
2009                         return (SET_ERROR(EOVERFLOW));
2010                 }
2011                 psize = osize;
2012                 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
2013                 max_asize = max_osize - (VDEV_LABEL_START_SIZE +
2014                     VDEV_LABEL_END_SIZE);
2015         } else {
2016                 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
2017                     (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
2018                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2019                             VDEV_AUX_TOO_SMALL);
2020                         return (SET_ERROR(EOVERFLOW));
2021                 }
2022                 psize = 0;
2023                 asize = osize;
2024                 max_asize = max_osize;
2025         }
2026
2027         /*
2028          * If the vdev was expanded, record this so that we can re-create the
2029          * uberblock rings in labels {2,3}, during the next sync.
2030          */
2031         if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
2032                 vd->vdev_copy_uberblocks = B_TRUE;
2033
2034         vd->vdev_psize = psize;
2035
2036         /*
2037          * Make sure the allocatable size hasn't shrunk too much.
2038          */
2039         if (asize < vd->vdev_min_asize) {
2040                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2041                     VDEV_AUX_BAD_LABEL);
2042                 return (SET_ERROR(EINVAL));
2043         }
2044
2045         /*
2046          * We can always set the logical/physical ashift members since
2047          * their values are only used to calculate the vdev_ashift when
2048          * the device is first added to the config. These values should
2049          * not be used for anything else since they may change whenever
2050          * the device is reopened and we don't store them in the label.
2051          */
2052         vd->vdev_physical_ashift =
2053             MAX(physical_ashift, vd->vdev_physical_ashift);
2054         vd->vdev_logical_ashift = MAX(logical_ashift,
2055             vd->vdev_logical_ashift);
2056
2057         if (vd->vdev_asize == 0) {
2058                 /*
2059                  * This is the first-ever open, so use the computed values.
2060                  * For compatibility, a different ashift can be requested.
2061                  */
2062                 vd->vdev_asize = asize;
2063                 vd->vdev_max_asize = max_asize;
2064
2065                 /*
2066                  * If the vdev_ashift was not overridden at creation time,
2067                  * then set it the logical ashift and optimize the ashift.
2068                  */
2069                 if (vd->vdev_ashift == 0) {
2070                         vd->vdev_ashift = vd->vdev_logical_ashift;
2071
2072                         if (vd->vdev_logical_ashift > ASHIFT_MAX) {
2073                                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2074                                     VDEV_AUX_ASHIFT_TOO_BIG);
2075                                 return (SET_ERROR(EDOM));
2076                         }
2077
2078                         if (vd->vdev_top == vd) {
2079                                 vdev_ashift_optimize(vd);
2080                         }
2081                 }
2082                 if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
2083                     vd->vdev_ashift > ASHIFT_MAX)) {
2084                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2085                             VDEV_AUX_BAD_ASHIFT);
2086                         return (SET_ERROR(EDOM));
2087                 }
2088         } else {
2089                 /*
2090                  * Make sure the alignment required hasn't increased.
2091                  */
2092                 if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
2093                     vd->vdev_ops->vdev_op_leaf) {
2094                         (void) zfs_ereport_post(
2095                             FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
2096                             spa, vd, NULL, NULL, 0);
2097                         vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2098                             VDEV_AUX_BAD_LABEL);
2099                         return (SET_ERROR(EDOM));
2100                 }
2101                 vd->vdev_max_asize = max_asize;
2102         }
2103
2104         /*
2105          * If all children are healthy we update asize if either:
2106          * The asize has increased, due to a device expansion caused by dynamic
2107          * LUN growth or vdev replacement, and automatic expansion is enabled;
2108          * making the additional space available.
2109          *
2110          * The asize has decreased, due to a device shrink usually caused by a
2111          * vdev replace with a smaller device. This ensures that calculations
2112          * based of max_asize and asize e.g. esize are always valid. It's safe
2113          * to do this as we've already validated that asize is greater than
2114          * vdev_min_asize.
2115          */
2116         if (vd->vdev_state == VDEV_STATE_HEALTHY &&
2117             ((asize > vd->vdev_asize &&
2118             (vd->vdev_expanding || spa->spa_autoexpand)) ||
2119             (asize < vd->vdev_asize)))
2120                 vd->vdev_asize = asize;
2121
2122         vdev_set_min_asize(vd);
2123
2124         /*
2125          * Ensure we can issue some IO before declaring the
2126          * vdev open for business.
2127          */
2128         if (vd->vdev_ops->vdev_op_leaf &&
2129             (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
2130                 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2131                     VDEV_AUX_ERR_EXCEEDED);
2132                 return (error);
2133         }
2134
2135         /*
2136          * Track the minimum allocation size.
2137          */
2138         if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
2139             vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
2140                 uint64_t min_alloc = vdev_get_min_alloc(vd);
2141                 if (min_alloc < spa->spa_min_alloc)
2142                         spa->spa_min_alloc = min_alloc;
2143         }
2144
2145         /*
2146          * If this is a leaf vdev, assess whether a resilver is needed.
2147          * But don't do this if we are doing a reopen for a scrub, since
2148          * this would just restart the scrub we are already doing.
2149          */
2150         if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
2151                 dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
2152
2153         return (0);
2154 }
2155
2156 static void
2157 vdev_validate_child(void *arg)
2158 {
2159         vdev_t *vd = arg;
2160
2161         vd->vdev_validate_thread = curthread;
2162         vd->vdev_validate_error = vdev_validate(vd);
2163         vd->vdev_validate_thread = NULL;
2164 }
2165
2166 /*
2167  * Called once the vdevs are all opened, this routine validates the label
2168  * contents. This needs to be done before vdev_load() so that we don't
2169  * inadvertently do repair I/Os to the wrong device.
2170  *
2171  * This function will only return failure if one of the vdevs indicates that it
2172  * has since been destroyed or exported.  This is only possible if
2173  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
2174  * will be updated but the function will return 0.
2175  */
2176 int
2177 vdev_validate(vdev_t *vd)
2178 {
2179         spa_t *spa = vd->vdev_spa;
2180         taskq_t *tq = NULL;
2181         nvlist_t *label;
2182         uint64_t guid = 0, aux_guid = 0, top_guid;
2183         uint64_t state;
2184         nvlist_t *nvl;
2185         uint64_t txg;
2186         int children = vd->vdev_children;
2187
2188         if (vdev_validate_skip)
2189                 return (0);
2190
2191         if (children > 0) {
2192                 tq = taskq_create("vdev_validate", children, minclsyspri,
2193                     children, children, TASKQ_PREPOPULATE);
2194         }
2195
2196         for (uint64_t c = 0; c < children; c++) {
2197                 vdev_t *cvd = vd->vdev_child[c];
2198
2199                 if (tq == NULL || vdev_uses_zvols(cvd)) {
2200                         vdev_validate_child(cvd);
2201                 } else {
2202                         VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
2203                             TQ_SLEEP) != TASKQID_INVALID);
2204                 }
2205         }
2206         if (tq != NULL) {
2207                 taskq_wait(tq);
2208                 taskq_destroy(tq);
2209         }
2210         for (int c = 0; c < children; c++) {
2211                 int error = vd->vdev_child[c]->vdev_validate_error;
2212
2213                 if (error != 0)
2214                         return (SET_ERROR(EBADF));
2215         }
2216
2217
2218         /*
2219          * If the device has already failed, or was marked offline, don't do
2220          * any further validation.  Otherwise, label I/O will fail and we will
2221          * overwrite the previous state.
2222          */
2223         if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
2224                 return (0);
2225
2226         /*
2227          * If we are performing an extreme rewind, we allow for a label that
2228          * was modified at a point after the current txg.
2229          * If config lock is not held do not check for the txg. spa_sync could
2230          * be updating the vdev's label before updating spa_last_synced_txg.
2231          */
2232         if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
2233             spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
2234                 txg = UINT64_MAX;
2235         else
2236                 txg = spa_last_synced_txg(spa);
2237
2238         if ((label = vdev_label_read_config(vd, txg)) == NULL) {
2239                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2240                     VDEV_AUX_BAD_LABEL);
2241                 vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
2242                     "txg %llu", (u_longlong_t)txg);
2243                 return (0);
2244         }
2245
2246         /*
2247          * Determine if this vdev has been split off into another
2248          * pool.  If so, then refuse to open it.
2249          */
2250         if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
2251             &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
2252                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2253                     VDEV_AUX_SPLIT_POOL);
2254                 nvlist_free(label);
2255                 vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
2256                 return (0);
2257         }
2258
2259         if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
2260                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2261                     VDEV_AUX_CORRUPT_DATA);
2262                 nvlist_free(label);
2263                 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2264                     ZPOOL_CONFIG_POOL_GUID);
2265                 return (0);
2266         }
2267
2268         /*
2269          * If config is not trusted then ignore the spa guid check. This is
2270          * necessary because if the machine crashed during a re-guid the new
2271          * guid might have been written to all of the vdev labels, but not the
2272          * cached config. The check will be performed again once we have the
2273          * trusted config from the MOS.
2274          */
2275         if (spa->spa_trust_config && guid != spa_guid(spa)) {
2276                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2277                     VDEV_AUX_CORRUPT_DATA);
2278                 nvlist_free(label);
2279                 vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
2280                     "match config (%llu != %llu)", (u_longlong_t)guid,
2281                     (u_longlong_t)spa_guid(spa));
2282                 return (0);
2283         }
2284
2285         if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
2286             != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
2287             &aux_guid) != 0)
2288                 aux_guid = 0;
2289
2290         if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
2291                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2292                     VDEV_AUX_CORRUPT_DATA);
2293                 nvlist_free(label);
2294                 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2295                     ZPOOL_CONFIG_GUID);
2296                 return (0);
2297         }
2298
2299         if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
2300             != 0) {
2301                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2302                     VDEV_AUX_CORRUPT_DATA);
2303                 nvlist_free(label);
2304                 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2305                     ZPOOL_CONFIG_TOP_GUID);
2306                 return (0);
2307         }
2308
2309         /*
2310          * If this vdev just became a top-level vdev because its sibling was
2311          * detached, it will have adopted the parent's vdev guid -- but the
2312          * label may or may not be on disk yet. Fortunately, either version
2313          * of the label will have the same top guid, so if we're a top-level
2314          * vdev, we can safely compare to that instead.
2315          * However, if the config comes from a cachefile that failed to update
2316          * after the detach, a top-level vdev will appear as a non top-level
2317          * vdev in the config. Also relax the constraints if we perform an
2318          * extreme rewind.
2319          *
2320          * If we split this vdev off instead, then we also check the
2321          * original pool's guid. We don't want to consider the vdev
2322          * corrupt if it is partway through a split operation.
2323          */
2324         if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
2325                 boolean_t mismatch = B_FALSE;
2326                 if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
2327                         if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
2328                                 mismatch = B_TRUE;
2329                 } else {
2330                         if (vd->vdev_guid != top_guid &&
2331                             vd->vdev_top->vdev_guid != guid)
2332                                 mismatch = B_TRUE;
2333                 }
2334
2335                 if (mismatch) {
2336                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2337                             VDEV_AUX_CORRUPT_DATA);
2338                         nvlist_free(label);
2339                         vdev_dbgmsg(vd, "vdev_validate: config guid "
2340                             "doesn't match label guid");
2341                         vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
2342                             (u_longlong_t)vd->vdev_guid,
2343                             (u_longlong_t)vd->vdev_top->vdev_guid);
2344                         vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
2345                             "aux_guid %llu", (u_longlong_t)guid,
2346                             (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
2347                         return (0);
2348                 }
2349         }
2350
2351         if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
2352             &state) != 0) {
2353                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2354                     VDEV_AUX_CORRUPT_DATA);
2355                 nvlist_free(label);
2356                 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2357                     ZPOOL_CONFIG_POOL_STATE);
2358                 return (0);
2359         }
2360
2361         nvlist_free(label);
2362
2363         /*
2364          * If this is a verbatim import, no need to check the
2365          * state of the pool.
2366          */
2367         if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
2368             spa_load_state(spa) == SPA_LOAD_OPEN &&
2369             state != POOL_STATE_ACTIVE) {
2370                 vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
2371                     "for spa %s", (u_longlong_t)state, spa->spa_name);
2372                 return (SET_ERROR(EBADF));
2373         }
2374
2375         /*
2376          * If we were able to open and validate a vdev that was
2377          * previously marked permanently unavailable, clear that state
2378          * now.
2379          */
2380         if (vd->vdev_not_present)
2381                 vd->vdev_not_present = 0;
2382
2383         return (0);
2384 }
2385
2386 static void
2387 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
2388 {
2389         char *old, *new;
2390         if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
2391                 if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
2392                         zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
2393                             "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
2394                             dvd->vdev_path, svd->vdev_path);
2395                         spa_strfree(dvd->vdev_path);
2396                         dvd->vdev_path = spa_strdup(svd->vdev_path);
2397                 }
2398         } else if (svd->vdev_path != NULL) {
2399                 dvd->vdev_path = spa_strdup(svd->vdev_path);
2400                 zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
2401                     (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
2402         }
2403
2404         /*
2405          * Our enclosure sysfs path may have changed between imports
2406          */
2407         old = dvd->vdev_enc_sysfs_path;
2408         new = svd->vdev_enc_sysfs_path;
2409         if ((old != NULL && new == NULL) ||
2410             (old == NULL && new != NULL) ||
2411             ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
2412                 zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
2413                     "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
2414                     old, new);
2415
2416                 if (dvd->vdev_enc_sysfs_path)
2417                         spa_strfree(dvd->vdev_enc_sysfs_path);
2418
2419                 if (svd->vdev_enc_sysfs_path) {
2420                         dvd->vdev_enc_sysfs_path = spa_strdup(
2421                             svd->vdev_enc_sysfs_path);
2422                 } else {
2423                         dvd->vdev_enc_sysfs_path = NULL;
2424                 }
2425         }
2426 }
2427
2428 /*
2429  * Recursively copy vdev paths from one vdev to another. Source and destination
2430  * vdev trees must have same geometry otherwise return error. Intended to copy
2431  * paths from userland config into MOS config.
2432  */
2433 int
2434 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
2435 {
2436         if ((svd->vdev_ops == &vdev_missing_ops) ||
2437             (svd->vdev_ishole && dvd->vdev_ishole) ||
2438             (dvd->vdev_ops == &vdev_indirect_ops))
2439                 return (0);
2440
2441         if (svd->vdev_ops != dvd->vdev_ops) {
2442                 vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
2443                     svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
2444                 return (SET_ERROR(EINVAL));
2445         }
2446
2447         if (svd->vdev_guid != dvd->vdev_guid) {
2448                 vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
2449                     "%llu)", (u_longlong_t)svd->vdev_guid,
2450                     (u_longlong_t)dvd->vdev_guid);
2451                 return (SET_ERROR(EINVAL));
2452         }
2453
2454         if (svd->vdev_children != dvd->vdev_children) {
2455                 vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
2456                     "%llu != %llu", (u_longlong_t)svd->vdev_children,
2457                     (u_longlong_t)dvd->vdev_children);
2458                 return (SET_ERROR(EINVAL));
2459         }
2460
2461         for (uint64_t i = 0; i < svd->vdev_children; i++) {
2462                 int error = vdev_copy_path_strict(svd->vdev_child[i],
2463                     dvd->vdev_child[i]);
2464                 if (error != 0)
2465                         return (error);
2466         }
2467
2468         if (svd->vdev_ops->vdev_op_leaf)
2469                 vdev_copy_path_impl(svd, dvd);
2470
2471         return (0);
2472 }
2473
2474 static void
2475 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
2476 {
2477         ASSERT(stvd->vdev_top == stvd);
2478         ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
2479
2480         for (uint64_t i = 0; i < dvd->vdev_children; i++) {
2481                 vdev_copy_path_search(stvd, dvd->vdev_child[i]);
2482         }
2483
2484         if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
2485                 return;
2486
2487         /*
2488          * The idea here is that while a vdev can shift positions within
2489          * a top vdev (when replacing, attaching mirror, etc.) it cannot
2490          * step outside of it.
2491          */
2492         vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
2493
2494         if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
2495                 return;
2496
2497         ASSERT(vd->vdev_ops->vdev_op_leaf);
2498
2499         vdev_copy_path_impl(vd, dvd);
2500 }
2501
2502 /*
2503  * Recursively copy vdev paths from one root vdev to another. Source and
2504  * destination vdev trees may differ in geometry. For each destination leaf
2505  * vdev, search a vdev with the same guid and top vdev id in the source.
2506  * Intended to copy paths from userland config into MOS config.
2507  */
2508 void
2509 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
2510 {
2511         uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
2512         ASSERT(srvd->vdev_ops == &vdev_root_ops);
2513         ASSERT(drvd->vdev_ops == &vdev_root_ops);
2514
2515         for (uint64_t i = 0; i < children; i++) {
2516                 vdev_copy_path_search(srvd->vdev_child[i],
2517                     drvd->vdev_child[i]);
2518         }
2519 }
2520
2521 /*
2522  * Close a virtual device.
2523  */
2524 void
2525 vdev_close(vdev_t *vd)
2526 {
2527         vdev_t *pvd = vd->vdev_parent;
2528         spa_t *spa __maybe_unused = vd->vdev_spa;
2529
2530         ASSERT(vd != NULL);
2531         ASSERT(vd->vdev_open_thread == curthread ||
2532             spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2533
2534         /*
2535          * If our parent is reopening, then we are as well, unless we are
2536          * going offline.
2537          */
2538         if (pvd != NULL && pvd->vdev_reopening)
2539                 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
2540
2541         vd->vdev_ops->vdev_op_close(vd);
2542
2543         vdev_cache_purge(vd);
2544
2545         /*
2546          * We record the previous state before we close it, so that if we are
2547          * doing a reopen(), we don't generate FMA ereports if we notice that
2548          * it's still faulted.
2549          */
2550         vd->vdev_prevstate = vd->vdev_state;
2551
2552         if (vd->vdev_offline)
2553                 vd->vdev_state = VDEV_STATE_OFFLINE;
2554         else
2555                 vd->vdev_state = VDEV_STATE_CLOSED;
2556         vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2557 }
2558
2559 void
2560 vdev_hold(vdev_t *vd)
2561 {
2562         spa_t *spa = vd->vdev_spa;
2563
2564         ASSERT(spa_is_root(spa));
2565         if (spa->spa_state == POOL_STATE_UNINITIALIZED)
2566                 return;
2567
2568         for (int c = 0; c < vd->vdev_children; c++)
2569                 vdev_hold(vd->vdev_child[c]);
2570
2571         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL)
2572                 vd->vdev_ops->vdev_op_hold(vd);
2573 }
2574
2575 void
2576 vdev_rele(vdev_t *vd)
2577 {
2578         ASSERT(spa_is_root(vd->vdev_spa));
2579         for (int c = 0; c < vd->vdev_children; c++)
2580                 vdev_rele(vd->vdev_child[c]);
2581
2582         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL)
2583                 vd->vdev_ops->vdev_op_rele(vd);
2584 }
2585
2586 /*
2587  * Reopen all interior vdevs and any unopened leaves.  We don't actually
2588  * reopen leaf vdevs which had previously been opened as they might deadlock
2589  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
2590  * If the leaf has never been opened then open it, as usual.
2591  */
2592 void
2593 vdev_reopen(vdev_t *vd)
2594 {
2595         spa_t *spa = vd->vdev_spa;
2596
2597         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2598
2599         /* set the reopening flag unless we're taking the vdev offline */
2600         vd->vdev_reopening = !vd->vdev_offline;
2601         vdev_close(vd);
2602         (void) vdev_open(vd);
2603
2604         /*
2605          * Call vdev_validate() here to make sure we have the same device.
2606          * Otherwise, a device with an invalid label could be successfully
2607          * opened in response to vdev_reopen().
2608          */
2609         if (vd->vdev_aux) {
2610                 (void) vdev_validate_aux(vd);
2611                 if (vdev_readable(vd) && vdev_writeable(vd) &&
2612                     vd->vdev_aux == &spa->spa_l2cache) {
2613                         /*
2614                          * In case the vdev is present we should evict all ARC
2615                          * buffers and pointers to log blocks and reclaim their
2616                          * space before restoring its contents to L2ARC.
2617                          */
2618                         if (l2arc_vdev_present(vd)) {
2619                                 l2arc_rebuild_vdev(vd, B_TRUE);
2620                         } else {
2621                                 l2arc_add_vdev(spa, vd);
2622                         }
2623                         spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
2624                         spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
2625                 }
2626         } else {
2627                 (void) vdev_validate(vd);
2628         }
2629
2630         /*
2631          * Reassess parent vdev's health.
2632          */
2633         vdev_propagate_state(vd);
2634 }
2635
2636 int
2637 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
2638 {
2639         int error;
2640
2641         /*
2642          * Normally, partial opens (e.g. of a mirror) are allowed.
2643          * For a create, however, we want to fail the request if
2644          * there are any components we can't open.
2645          */
2646         error = vdev_open(vd);
2647
2648         if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
2649                 vdev_close(vd);
2650                 return (error ? error : SET_ERROR(ENXIO));
2651         }
2652
2653         /*
2654          * Recursively load DTLs and initialize all labels.
2655          */
2656         if ((error = vdev_dtl_load(vd)) != 0 ||
2657             (error = vdev_label_init(vd, txg, isreplacing ?
2658             VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
2659                 vdev_close(vd);
2660                 return (error);
2661         }
2662
2663         return (0);
2664 }
2665
2666 void
2667 vdev_metaslab_set_size(vdev_t *vd)
2668 {
2669         uint64_t asize = vd->vdev_asize;
2670         uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
2671         uint64_t ms_shift;
2672
2673         /*
2674          * There are two dimensions to the metaslab sizing calculation:
2675          * the size of the metaslab and the count of metaslabs per vdev.
2676          *
2677          * The default values used below are a good balance between memory
2678          * usage (larger metaslab size means more memory needed for loaded
2679          * metaslabs; more metaslabs means more memory needed for the
2680          * metaslab_t structs), metaslab load time (larger metaslabs take
2681          * longer to load), and metaslab sync time (more metaslabs means
2682          * more time spent syncing all of them).
2683          *
2684          * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
2685          * The range of the dimensions are as follows:
2686          *
2687          *      2^29 <= ms_size  <= 2^34
2688          *        16 <= ms_count <= 131,072
2689          *
2690          * On the lower end of vdev sizes, we aim for metaslabs sizes of
2691          * at least 512MB (2^29) to minimize fragmentation effects when
2692          * testing with smaller devices.  However, the count constraint
2693          * of at least 16 metaslabs will override this minimum size goal.
2694          *
2695          * On the upper end of vdev sizes, we aim for a maximum metaslab
2696          * size of 16GB.  However, we will cap the total count to 2^17
2697          * metaslabs to keep our memory footprint in check and let the
2698          * metaslab size grow from there if that limit is hit.
2699          *
2700          * The net effect of applying above constrains is summarized below.
2701          *
2702          *   vdev size       metaslab count
2703          *  --------------|-----------------
2704          *      < 8GB        ~16
2705          *  8GB   - 100GB   one per 512MB
2706          *  100GB - 3TB     ~200
2707          *  3TB   - 2PB     one per 16GB
2708          *      > 2PB       ~131,072
2709          *  --------------------------------
2710          *
2711          *  Finally, note that all of the above calculate the initial
2712          *  number of metaslabs. Expanding a top-level vdev will result
2713          *  in additional metaslabs being allocated making it possible
2714          *  to exceed the zfs_vdev_ms_count_limit.
2715          */
2716
2717         if (ms_count < zfs_vdev_min_ms_count)
2718                 ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
2719         else if (ms_count > zfs_vdev_default_ms_count)
2720                 ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
2721         else
2722                 ms_shift = zfs_vdev_default_ms_shift;
2723
2724         if (ms_shift < SPA_MAXBLOCKSHIFT) {
2725                 ms_shift = SPA_MAXBLOCKSHIFT;
2726         } else if (ms_shift > zfs_vdev_max_ms_shift) {
2727                 ms_shift = zfs_vdev_max_ms_shift;
2728                 /* cap the total count to constrain memory footprint */
2729                 if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
2730                         ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
2731         }
2732
2733         vd->vdev_ms_shift = ms_shift;
2734         ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
2735 }
2736
2737 void
2738 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
2739 {
2740         ASSERT(vd == vd->vdev_top);
2741         /* indirect vdevs don't have metaslabs or dtls */
2742         ASSERT(vdev_is_concrete(vd) || flags == 0);
2743         ASSERT(ISP2(flags));
2744         ASSERT(spa_writeable(vd->vdev_spa));
2745
2746         if (flags & VDD_METASLAB)
2747                 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
2748
2749         if (flags & VDD_DTL)
2750                 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
2751
2752         (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
2753 }
2754
2755 void
2756 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
2757 {
2758         for (int c = 0; c < vd->vdev_children; c++)
2759                 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
2760
2761         if (vd->vdev_ops->vdev_op_leaf)
2762                 vdev_dirty(vd->vdev_top, flags, vd, txg);
2763 }
2764
2765 /*
2766  * DTLs.
2767  *
2768  * A vdev's DTL (dirty time log) is the set of transaction groups for which
2769  * the vdev has less than perfect replication.  There are four kinds of DTL:
2770  *
2771  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
2772  *
2773  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
2774  *
2775  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
2776  *      scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
2777  *      txgs that was scrubbed.
2778  *
2779  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
2780  *      persistent errors or just some device being offline.
2781  *      Unlike the other three, the DTL_OUTAGE map is not generally
2782  *      maintained; it's only computed when needed, typically to
2783  *      determine whether a device can be detached.
2784  *
2785  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
2786  * either has the data or it doesn't.
2787  *
2788  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
2789  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
2790  * if any child is less than fully replicated, then so is its parent.
2791  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in more than 'maxfaults' children;
2793  * those are the txgs we don't have enough replication to read.  For example,
2794  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
2795  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
2796  * two child DTL_MISSING maps.
2797  *
2798  * It should be clear from the above that to compute the DTLs and outage maps
2799  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
2800  * Therefore, that is all we keep on disk.  When loading the pool, or after
2801  * a configuration change, we generate all other DTLs from first principles.
2802  */
2803 void
2804 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2805 {
2806         range_tree_t *rt = vd->vdev_dtl[t];
2807
2808         ASSERT(t < DTL_TYPES);
2809         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2810         ASSERT(spa_writeable(vd->vdev_spa));
2811
2812         mutex_enter(&vd->vdev_dtl_lock);
2813         if (!range_tree_contains(rt, txg, size))
2814                 range_tree_add(rt, txg, size);
2815         mutex_exit(&vd->vdev_dtl_lock);
2816 }
2817
2818 boolean_t
2819 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2820 {
2821         range_tree_t *rt = vd->vdev_dtl[t];
2822         boolean_t dirty = B_FALSE;
2823
2824         ASSERT(t < DTL_TYPES);
2825         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2826
2827         /*
2828          * While we are loading the pool, the DTLs have not been loaded yet.
2829          * This isn't a problem but it can result in devices being tried
2830          * which are known to not have the data.  In which case, the import
2831          * is relying on the checksum to ensure that we get the right data.
2832          * Note that while importing we are only reading the MOS, which is
2833          * always checksummed.
2834          */
2835         mutex_enter(&vd->vdev_dtl_lock);
2836         if (!range_tree_is_empty(rt))
2837                 dirty = range_tree_contains(rt, txg, size);
2838         mutex_exit(&vd->vdev_dtl_lock);
2839
2840         return (dirty);
2841 }
2842
2843 boolean_t
2844 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
2845 {
2846         range_tree_t *rt = vd->vdev_dtl[t];
2847         boolean_t empty;
2848
2849         mutex_enter(&vd->vdev_dtl_lock);
2850         empty = range_tree_is_empty(rt);
2851         mutex_exit(&vd->vdev_dtl_lock);
2852
2853         return (empty);
2854 }
2855
2856 /*
2857  * Check if the txg falls within the range which must be
2858  * resilvered.  DVAs outside this range can always be skipped.
2859  */
2860 boolean_t
2861 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
2862     uint64_t phys_birth)
2863 {
2864         (void) dva, (void) psize;
2865
2866         /* Set by sequential resilver. */
2867         if (phys_birth == TXG_UNKNOWN)
2868                 return (B_TRUE);
2869
2870         return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
2871 }
2872
2873 /*
2874  * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
2875  */
2876 boolean_t
2877 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
2878     uint64_t phys_birth)
2879 {
2880         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2881
2882         if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
2883             vd->vdev_ops->vdev_op_leaf)
2884                 return (B_TRUE);
2885
2886         return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
2887             phys_birth));
2888 }
2889
2890 /*
2891  * Returns the lowest txg in the DTL range.
2892  */
2893 static uint64_t
2894 vdev_dtl_min(vdev_t *vd)
2895 {
2896         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
2897         ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
2898         ASSERT0(vd->vdev_children);
2899
2900         return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
2901 }
2902
2903 /*
2904  * Returns the highest txg in the DTL.
2905  */
2906 static uint64_t
2907 vdev_dtl_max(vdev_t *vd)
2908 {
2909         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
2910         ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
2911         ASSERT0(vd->vdev_children);
2912
2913         return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
2914 }
2915
2916 /*
2917  * Determine if a resilvering vdev should remove any DTL entries from
2918  * its range. If the vdev was resilvering for the entire duration of the
2919  * scan then it should excise that range from its DTLs. Otherwise, this
2920  * vdev is considered partially resilvered and should leave its DTL
2921  * entries intact. The comment in vdev_dtl_reassess() describes how we
2922  * excise the DTLs.
2923  */
2924 static boolean_t
2925 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
2926 {
2927         ASSERT0(vd->vdev_children);
2928
2929         if (vd->vdev_state < VDEV_STATE_DEGRADED)
2930                 return (B_FALSE);
2931
2932         if (vd->vdev_resilver_deferred)
2933                 return (B_FALSE);
2934
2935         if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
2936                 return (B_TRUE);
2937
2938         if (rebuild_done) {
2939                 vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
2940                 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
2941
2942                 /* Rebuild not initiated by attach */
2943                 if (vd->vdev_rebuild_txg == 0)
2944                         return (B_TRUE);
2945
2946                 /*
2947                  * When a rebuild completes without error then all missing data
2948                  * up to the rebuild max txg has been reconstructed and the DTL
2949                  * is eligible for excision.
2950                  */
2951                 if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
2952                     vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
2953                         ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
2954                         ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
2955                         ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
2956                         return (B_TRUE);
2957                 }
2958         } else {
2959                 dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
2960                 dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
2961
2962                 /* Resilver not initiated by attach */
2963                 if (vd->vdev_resilver_txg == 0)
2964                         return (B_TRUE);
2965
2966                 /*
2967                  * When a resilver is initiated the scan will assign the
2968                  * scn_max_txg value to the highest txg value that exists
2969                  * in all DTLs. If this device's max DTL is not part of this
2970                  * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
2971                  * then it is not eligible for excision.
2972                  */
2973                 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
2974                         ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
2975                         ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
2976                         ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
2977                         return (B_TRUE);
2978                 }
2979         }
2980
2981         return (B_FALSE);
2982 }
2983
/*
 * Reassess DTLs after a config change or scrub completion. If txg == 0 no
 * write operations will be issued to the pool.
 *
 * The vdev tree is walked depth-first, so every child is reassessed before
 * its parent.  The root vdev, non-concrete vdevs and aux vdevs are visited
 * only for the sake of their children and are otherwise left untouched.
 *
 *   vd           - vdev (sub)tree to reassess
 *   txg          - txg in which to dirty the vdev and its config; when 0,
 *                  nothing is dirtied
 *   scrub_txg    - when non-zero, the DTL_MISSING range [0, scrub_txg) is a
 *                  candidate for excision on each leaf
 *   scrub_done   - when set, each leaf's DTL_SCRUB is emptied
 *   rebuild_done - selects the rebuild error count (instead of the scan
 *                  error count) when deciding whether excision is allowed
 *
 * The caller must hold at least one SCL_ALL config lock as reader.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
    boolean_t scrub_done, boolean_t rebuild_done)
{
        spa_t *spa = vd->vdev_spa;
        avl_tree_t reftree;
        int minref;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

        /* Children first, so the interior pass below sees fresh child DTLs. */
        for (int c = 0; c < vd->vdev_children; c++)
                vdev_dtl_reassess(vd->vdev_child[c], txg,
                    scrub_txg, scrub_done, rebuild_done);

        if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
                return;

        /*
         * Leaf vdevs: possibly excise DTL_MISSING, then regenerate
         * DTL_PARTIAL and DTL_OUTAGE from it.
         */
        if (vd->vdev_ops->vdev_op_leaf) {
                dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
                /* vr is the address of a member of the top-level vdev. */
                vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
                boolean_t check_excise = B_FALSE;
                boolean_t wasempty = B_TRUE;

                mutex_enter(&vd->vdev_dtl_lock);

                /*
                 * If requested, pretend the scan or rebuild completed cleanly.
                 */
                if (zfs_scan_ignore_errors) {
                        if (scn != NULL)
                                scn->scn_phys.scn_errors = 0;
                        if (vr != NULL)
                                vr->vr_rebuild_phys.vrp_errors = 0;
                }

                /*
                 * Log the state before any excision; wasempty is remembered
                 * only to decide whether "now empty" is worth logging later.
                 */
                if (scrub_txg != 0 &&
                    !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
                        wasempty = B_FALSE;
                        zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
                            "dtl:%llu/%llu errors:%llu",
                            (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
                            (u_longlong_t)scrub_txg, spa->spa_scrub_started,
                            (u_longlong_t)vdev_dtl_min(vd),
                            (u_longlong_t)vdev_dtl_max(vd),
                            (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
                }

                /*
                 * If we've completed a scrub/resilver or a rebuild cleanly
                 * then determine if this vdev should remove any DTLs. We
                 * only want to excise regions on vdevs that were available
                 * during the entire duration of this scan.
                 */
                if (rebuild_done &&
                    vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
                        check_excise = B_TRUE;
                } else {
                        if (spa->spa_scrub_started ||
                            (scn != NULL && scn->scn_phys.scn_errors == 0)) {
                                check_excise = B_TRUE;
                        }
                }

                if (scrub_txg && check_excise &&
                    vdev_dtl_should_excise(vd, rebuild_done)) {
                        /*
                         * We completed a scrub, resilver or rebuild up to
                         * scrub_txg.  If we did it without rebooting, then
                         * the scrub dtl will be valid, so excise the old
                         * region and fold in the scrub dtl.  Otherwise,
                         * leave the dtl as-is if there was an error.
                         *
                         * There's a little trick here: to excise the
                         * beginning of the DTL_MISSING map, we put it into
                         * a reference tree and then add a segment with
                         * refcnt -1 that covers the range [0, scrub_txg).
                         * This means that each txg in that range has refcnt
                         * -1 or 0.  We then add DTL_SCRUB with a refcnt of
                         * 2, so that entries in the range [0, scrub_txg)
                         * will have a positive refcnt -- either 1 or 2.  We
                         * then convert the reference tree into the new
                         * DTL_MISSING map.
                         */
                        space_reftree_create(&reftree);
                        space_reftree_add_map(&reftree,
                            vd->vdev_dtl[DTL_MISSING], 1);
                        space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
                        space_reftree_add_map(&reftree,
                            vd->vdev_dtl[DTL_SCRUB], 2);
                        space_reftree_generate_map(&reftree,
                            vd->vdev_dtl[DTL_MISSING], 1);
                        space_reftree_destroy(&reftree);

                        if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
                                zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
                                    (u_longlong_t)vdev_dtl_min(vd),
                                    (u_longlong_t)vdev_dtl_max(vd));
                        } else if (!wasempty) {
                                zfs_dbgmsg("DTL_MISSING is now empty");
                        }
                }
                /* On a leaf, DTL_PARTIAL is rebuilt as a copy of DTL_MISSING. */
                range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
                range_tree_walk(vd->vdev_dtl[DTL_MISSING],
                    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
                if (scrub_done)
                        range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
                /*
                 * DTL_OUTAGE is recomputed from scratch: one segment added
                 * as (0, -1ULL) when the vdev is not readable, otherwise a
                 * copy of DTL_MISSING.
                 */
                range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
                if (!vdev_readable(vd))
                        range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
                else
                        range_tree_walk(vd->vdev_dtl[DTL_MISSING],
                            range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

                /*
                 * If the vdev was resilvering or rebuilding and no longer
                 * has any DTLs then reset the appropriate flag and dirty
                 * the top level so that we persist the change.
                 */
                if (txg != 0 &&
                    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
                    range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
                        if (vd->vdev_rebuild_txg != 0) {
                                vd->vdev_rebuild_txg = 0;
                                vdev_config_dirty(vd->vdev_top);
                        } else if (vd->vdev_resilver_txg != 0) {
                                vd->vdev_resilver_txg = 0;
                                vdev_config_dirty(vd->vdev_top);
                        }
                }

                mutex_exit(&vd->vdev_dtl_lock);

                /* Dirty the DTL only after vdev_dtl_lock has been dropped. */
                if (txg != 0)
                        vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
                return;
        }

        /*
         * Interior vdevs: derive each DTL (except DTL_SCRUB) from the
         * children's maps.  minref is the number of children that must list
         * a txg for it to appear in this vdev's map.  Each child's lock is
         * taken while this vdev's lock is held.
         */
        mutex_enter(&vd->vdev_dtl_lock);
        for (int t = 0; t < DTL_TYPES; t++) {
                /* account for child's outage in parent's missing map */
                int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
                if (t == DTL_SCRUB)
                        continue;                       /* leaf vdevs only */
                if (t == DTL_PARTIAL)
                        minref = 1;                     /* i.e. non-zero */
                else if (vdev_get_nparity(vd) != 0)
                        minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
                else
                        minref = vd->vdev_children;     /* any kind of mirror */
                space_reftree_create(&reftree);
                for (int c = 0; c < vd->vdev_children; c++) {
                        vdev_t *cvd = vd->vdev_child[c];
                        mutex_enter(&cvd->vdev_dtl_lock);
                        space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
                        mutex_exit(&cvd->vdev_dtl_lock);
                }
                space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
                space_reftree_destroy(&reftree);
        }
        mutex_exit(&vd->vdev_dtl_lock);
}
3148
3149 int
3150 vdev_dtl_load(vdev_t *vd)
3151 {
3152         spa_t *spa = vd->vdev_spa;
3153         objset_t *mos = spa->spa_meta_objset;
3154         range_tree_t *rt;
3155         int error = 0;
3156
3157         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
3158                 ASSERT(vdev_is_concrete(vd));
3159
3160                 /*
3161                  * If the dtl cannot be sync'd there is no need to open it.
3162                  */
3163                 if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
3164                         return (0);
3165
3166                 error = space_map_open(&vd->vdev_dtl_sm, mos,
3167                     vd->vdev_dtl_object, 0, -1ULL, 0);
3168                 if (error)
3169                         return (error);
3170                 ASSERT(vd->vdev_dtl_sm != NULL);
3171
3172                 rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
3173                 error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
3174                 if (error == 0) {
3175                         mutex_enter(&vd->vdev_dtl_lock);
3176                         range_tree_walk(rt, range_tree_add,
3177                             vd->vdev_dtl[DTL_MISSING]);
3178                         mutex_exit(&vd->vdev_dtl_lock);
3179                 }
3180
3181                 range_tree_vacate(rt, NULL, NULL);
3182                 range_tree_destroy(rt);
3183
3184                 return (error);
3185         }
3186
3187         for (int c = 0; c < vd->vdev_children; c++) {
3188                 error = vdev_dtl_load(vd->vdev_child[c]);
3189                 if (error != 0)
3190                         break;
3191         }
3192
3193         return (error);
3194 }
3195
3196 static void
3197 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
3198 {
3199         spa_t *spa = vd->vdev_spa;
3200         objset_t *mos = spa->spa_meta_objset;
3201         vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
3202         const char *string;
3203
3204         ASSERT(alloc_bias != VDEV_BIAS_NONE);
3205
3206         string =
3207             (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
3208             (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
3209             (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
3210
3211         ASSERT(string != NULL);
3212         VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
3213             1, strlen(string) + 1, string, tx));
3214
3215         if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
3216                 spa_activate_allocation_classes(spa, tx);
3217         }
3218 }
3219
3220 void
3221 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
3222 {
3223         spa_t *spa = vd->vdev_spa;
3224
3225         VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
3226         VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3227             zapobj, tx));
3228 }
3229
3230 uint64_t
3231 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
3232 {
3233         spa_t *spa = vd->vdev_spa;
3234         uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
3235             DMU_OT_NONE, 0, tx);
3236
3237         ASSERT(zap != 0);
3238         VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3239             zap, tx));
3240
3241         return (zap);
3242 }
3243
3244 void
3245 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
3246 {
3247         if (vd->vdev_ops != &vdev_hole_ops &&
3248             vd->vdev_ops != &vdev_missing_ops &&
3249             vd->vdev_ops != &vdev_root_ops &&
3250             !vd->vdev_top->vdev_removing) {
3251                 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
3252                         vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
3253                 }
3254                 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
3255                         vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
3256                         if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
3257                                 vdev_zap_allocation_data(vd, tx);
3258                 }
3259         }
3260
3261         for (uint64_t i = 0; i < vd->vdev_children; i++) {
3262                 vdev_construct_zaps(vd->vdev_child[i], tx);
3263         }
3264 }
3265
3266 static void
3267 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
3268 {
3269         spa_t *spa = vd->vdev_spa;
3270         range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
3271         objset_t *mos = spa->spa_meta_objset;
3272         range_tree_t *rtsync;
3273         dmu_tx_t *tx;
3274         uint64_t object = space_map_object(vd->vdev_dtl_sm);
3275
3276         ASSERT(vdev_is_concrete(vd));
3277         ASSERT(vd->vdev_ops->vdev_op_leaf);
3278
3279         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3280
3281         if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
3282                 mutex_enter(&vd->vdev_dtl_lock);
3283                 space_map_free(vd->vdev_dtl_sm, tx);
3284                 space_map_close(vd->vdev_dtl_sm);
3285                 vd->vdev_dtl_sm = NULL;
3286                 mutex_exit(&vd->vdev_dtl_lock);
3287
3288                 /*
3289                  * We only destroy the leaf ZAP for detached leaves or for
3290                  * removed log devices. Removed data devices handle leaf ZAP
3291                  * cleanup later, once cancellation is no longer possible.
3292                  */
3293                 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
3294                     vd->vdev_top->vdev_islog)) {
3295                         vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
3296                         vd->vdev_leaf_zap = 0;
3297                 }
3298
3299                 dmu_tx_commit(tx);
3300                 return;
3301         }
3302
3303         if (vd->vdev_dtl_sm == NULL) {
3304                 uint64_t new_object;
3305
3306                 new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
3307                 VERIFY3U(new_object, !=, 0);
3308
3309                 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
3310                     0, -1ULL, 0));
3311                 ASSERT(vd->vdev_dtl_sm != NULL);
3312         }
3313
3314         rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
3315
3316         mutex_enter(&vd->vdev_dtl_lock);
3317         range_tree_walk(rt, range_tree_add, rtsync);
3318         mutex_exit(&vd->vdev_dtl_lock);
3319
3320         space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
3321         space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
3322         range_tree_vacate(rtsync, NULL, NULL);
3323
3324         range_tree_destroy(rtsync);
3325
3326         /*
3327          * If the object for the space map has changed then dirty
3328          * the top level so that we update the config.
3329          */
3330         if (object != space_map_object(vd->vdev_dtl_sm)) {
3331                 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
3332                     "new object %llu", (u_longlong_t)txg, spa_name(spa),
3333                     (u_longlong_t)object,
3334                     (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
3335                 vdev_config_dirty(vd->vdev_top);
3336         }
3337
3338         dmu_tx_commit(tx);
3339 }
3340
3341 /*
3342  * Determine whether the specified vdev can be offlined/detached/removed
3343  * without losing data.
3344  */
3345 boolean_t
3346 vdev_dtl_required(vdev_t *vd)
3347 {
3348         spa_t *spa = vd->vdev_spa;
3349         vdev_t *tvd = vd->vdev_top;
3350         uint8_t cant_read = vd->vdev_cant_read;
3351         boolean_t required;
3352
3353         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3354
3355         if (vd == spa->spa_root_vdev || vd == tvd)
3356                 return (B_TRUE);
3357
3358         /*
3359          * Temporarily mark the device as unreadable, and then determine
3360          * whether this results in any DTL outages in the top-level vdev.
3361          * If not, we can safely offline/detach/remove the device.
3362          */
3363         vd->vdev_cant_read = B_TRUE;
3364         vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
3365         required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
3366         vd->vdev_cant_read = cant_read;
3367         vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
3368
3369         if (!required && zio_injection_enabled) {
3370                 required = !!zio_handle_device_injection(vd, NULL,
3371                     SET_ERROR(ECHILD));
3372         }
3373
3374         return (required);
3375 }
3376
3377 /*
3378  * Determine if resilver is needed, and if so the txg range.
3379  */
3380 boolean_t
3381 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
3382 {
3383         boolean_t needed = B_FALSE;
3384         uint64_t thismin = UINT64_MAX;
3385         uint64_t thismax = 0;
3386
3387         if (vd->vdev_children == 0) {
3388                 mutex_enter(&vd->vdev_dtl_lock);
3389                 if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
3390                     vdev_writeable(vd)) {
3391
3392                         thismin = vdev_dtl_min(vd);
3393                         thismax = vdev_dtl_max(vd);
3394                         needed = B_TRUE;
3395                 }
3396                 mutex_exit(&vd->vdev_dtl_lock);
3397         } else {
3398                 for (int c = 0; c < vd->vdev_children; c++) {
3399                         vdev_t *cvd = vd->vdev_child[c];
3400                         uint64_t cmin, cmax;
3401
3402                         if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
3403                                 thismin = MIN(thismin, cmin);
3404                                 thismax = MAX(thismax, cmax);
3405                                 needed = B_TRUE;
3406                         }
3407                 }
3408         }
3409
3410         if (needed && minp) {
3411                 *minp = thismin;
3412                 *maxp = thismax;
3413         }
3414         return (needed);
3415 }
3416
3417 /*
3418  * Gets the checkpoint space map object from the vdev's ZAP.  On success sm_obj
3419  * will contain either the checkpoint spacemap object or zero if none exists.
3420  * All other errors are returned to the caller.
3421  */
3422 int
3423 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
3424 {
3425         ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
3426
3427         if (vd->vdev_top_zap == 0) {
3428                 *sm_obj = 0;
3429                 return (0);
3430         }
3431
3432         int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
3433             VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
3434         if (error == ENOENT) {
3435                 *sm_obj = 0;
3436                 error = 0;
3437         }
3438
3439         return (error);
3440 }
3441
3442 int
3443 vdev_load(vdev_t *vd)
3444 {
3445         int children = vd->vdev_children;
3446         int error = 0;
3447         taskq_t *tq = NULL;
3448
3449         /*
3450          * It's only worthwhile to use the taskq for the root vdev, because the
3451          * slow part is metaslab_init, and that only happens for top-level
3452          * vdevs.
3453          */
3454         if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
3455                 tq = taskq_create("vdev_load", children, minclsyspri,
3456                     children, children, TASKQ_PREPOPULATE);
3457         }
3458
3459         /*
3460          * Recursively load all children.
3461          */
3462         for (int c = 0; c < vd->vdev_children; c++) {
3463                 vdev_t *cvd = vd->vdev_child[c];
3464
3465                 if (tq == NULL || vdev_uses_zvols(cvd)) {
3466                         cvd->vdev_load_error = vdev_load(cvd);
3467                 } else {
3468                         VERIFY(taskq_dispatch(tq, vdev_load_child,
3469                             cvd, TQ_SLEEP) != TASKQID_INVALID);
3470                 }
3471         }
3472
3473         if (tq != NULL) {
3474                 taskq_wait(tq);
3475                 taskq_destroy(tq);
3476         }
3477
3478         for (int c = 0; c < vd->vdev_children; c++) {
3479                 int error = vd->vdev_child[c]->vdev_load_error;
3480
3481                 if (error != 0)
3482                         return (error);
3483         }
3484
3485         vdev_set_deflate_ratio(vd);
3486
3487         /*
3488          * On spa_load path, grab the allocation bias from our zap
3489          */
3490         if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3491                 spa_t *spa = vd->vdev_spa;
3492                 char bias_str[64];
3493
3494                 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3495                     VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
3496                     bias_str);
3497                 if (error == 0) {
3498                         ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
3499                         vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
3500                 } else if (error != ENOENT) {
3501                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3502                             VDEV_AUX_CORRUPT_DATA);
3503                         vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
3504                             "failed [error=%d]",
3505                             (u_longlong_t)vd->vdev_top_zap, error);
3506                         return (error);
3507                 }
3508         }
3509
3510         /*
3511          * Load any rebuild state from the top-level vdev zap.
3512          */
3513         if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3514                 error = vdev_rebuild_load(vd);
3515                 if (error && error != ENOTSUP) {
3516                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3517                             VDEV_AUX_CORRUPT_DATA);
3518                         vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
3519                             "failed [error=%d]", error);
3520                         return (error);
3521                 }
3522         }
3523
3524         /*
3525          * If this is a top-level vdev, initialize its metaslabs.
3526          */
3527         if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
3528                 vdev_metaslab_group_create(vd);
3529
3530                 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
3531                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3532                             VDEV_AUX_CORRUPT_DATA);
3533                         vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
3534                             "asize=%llu", (u_longlong_t)vd->vdev_ashift,
3535                             (u_longlong_t)vd->vdev_asize);
3536                         return (SET_ERROR(ENXIO));
3537                 }
3538
3539                 error = vdev_metaslab_init(vd, 0);
3540                 if (error != 0) {
3541                         vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
3542                             "[error=%d]", error);
3543                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3544                             VDEV_AUX_CORRUPT_DATA);
3545                         return (error);
3546                 }
3547
3548                 uint64_t checkpoint_sm_obj;
3549                 error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
3550                 if (error == 0 && checkpoint_sm_obj != 0) {
3551                         objset_t *mos = spa_meta_objset(vd->vdev_spa);
3552                         ASSERT(vd->vdev_asize != 0);
3553                         ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
3554
3555                         error = space_map_open(&vd->vdev_checkpoint_sm,
3556                             mos, checkpoint_sm_obj, 0, vd->vdev_asize,
3557                             vd->vdev_ashift);
3558                         if (error != 0) {
3559                                 vdev_dbgmsg(vd, "vdev_load: space_map_open "
3560                                     "failed for checkpoint spacemap (obj %llu) "
3561                                     "[error=%d]",
3562                                     (u_longlong_t)checkpoint_sm_obj, error);
3563                                 return (error);
3564                         }
3565                         ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3566
3567                         /*
3568                          * Since the checkpoint_sm contains free entries
3569                          * exclusively we can use space_map_allocated() to
3570                          * indicate the cumulative checkpointed space that
3571                          * has been freed.
3572                          */
3573                         vd->vdev_stat.vs_checkpoint_space =
3574                             -space_map_allocated(vd->vdev_checkpoint_sm);
3575                         vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
3576                             vd->vdev_stat.vs_checkpoint_space;
3577                 } else if (error != 0) {
3578                         vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
3579                             "checkpoint space map object from vdev ZAP "
3580                             "[error=%d]", error);
3581                         return (error);
3582                 }
3583         }
3584
3585         /*
3586          * If this is a leaf vdev, load its DTL.
3587          */
3588         if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
3589                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3590                     VDEV_AUX_CORRUPT_DATA);
3591                 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
3592                     "[error=%d]", error);
3593                 return (error);
3594         }
3595
3596         uint64_t obsolete_sm_object;
3597         error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
3598         if (error == 0 && obsolete_sm_object != 0) {
3599                 objset_t *mos = vd->vdev_spa->spa_meta_objset;
3600                 ASSERT(vd->vdev_asize != 0);
3601                 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
3602
3603                 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
3604                     obsolete_sm_object, 0, vd->vdev_asize, 0))) {
3605                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3606                             VDEV_AUX_CORRUPT_DATA);
3607                         vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
3608                             "obsolete spacemap (obj %llu) [error=%d]",
3609                             (u_longlong_t)obsolete_sm_object, error);
3610                         return (error);
3611                 }
3612         } else if (error != 0) {
3613                 vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
3614                     "space map object from vdev ZAP [error=%d]", error);
3615                 return (error);
3616         }
3617
3618         return (0);
3619 }
3620
3621 /*
3622  * The special vdev case is used for hot spares and l2cache devices.  Its
3623  * sole purpose it to set the vdev state for the associated vdev.  To do this,
3624  * we make sure that we can open the underlying device, then try to read the
3625  * label, and make sure that the label is sane and that it hasn't been
3626  * repurposed to another pool.
3627  */
3628 int
3629 vdev_validate_aux(vdev_t *vd)
3630 {
3631         nvlist_t *label;
3632         uint64_t guid, version;
3633         uint64_t state;
3634
3635         if (!vdev_readable(vd))
3636                 return (0);
3637
3638         if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
3639                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
3640                     VDEV_AUX_CORRUPT_DATA);
3641                 return (-1);
3642         }
3643
3644         if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
3645             !SPA_VERSION_IS_SUPPORTED(version) ||
3646             nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
3647             guid != vd->vdev_guid ||
3648             nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
3649                 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
3650                     VDEV_AUX_CORRUPT_DATA);
3651                 nvlist_free(label);
3652                 return (-1);
3653         }
3654
3655         /*
3656          * We don't actually check the pool state here.  If it's in fact in
3657          * use by another pool, we update this fact on the fly when requested.
3658          */
3659         nvlist_free(label);
3660         return (0);
3661 }
3662
3663 static void
3664 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
3665 {
3666         objset_t *mos = spa_meta_objset(vd->vdev_spa);
3667
3668         if (vd->vdev_top_zap == 0)
3669                 return;
3670
3671         uint64_t object = 0;
3672         int err = zap_lookup(mos, vd->vdev_top_zap,
3673             VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
3674         if (err == ENOENT)
3675                 return;
3676         VERIFY0(err);
3677
3678         VERIFY0(dmu_object_free(mos, object, tx));
3679         VERIFY0(zap_remove(mos, vd->vdev_top_zap,
3680             VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
3681 }
3682
3683 /*
3684  * Free the objects used to store this vdev's spacemaps, and the array
3685  * that points to them.
3686  */
3687 void
3688 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
3689 {
3690         if (vd->vdev_ms_array == 0)
3691                 return;
3692
3693         objset_t *mos = vd->vdev_spa->spa_meta_objset;
3694         uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
3695         size_t array_bytes = array_count * sizeof (uint64_t);
3696         uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
3697         VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
3698             array_bytes, smobj_array, 0));
3699
3700         for (uint64_t i = 0; i < array_count; i++) {
3701                 uint64_t smobj = smobj_array[i];
3702                 if (smobj == 0)
3703                         continue;
3704
3705                 space_map_free_obj(mos, smobj, tx);
3706         }
3707
3708         kmem_free(smobj_array, array_bytes);
3709         VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
3710         vdev_destroy_ms_flush_data(vd, tx);
3711         vd->vdev_ms_array = 0;
3712 }
3713
3714 static void
3715 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
3716 {
3717         spa_t *spa = vd->vdev_spa;
3718
3719         ASSERT(vd->vdev_islog);
3720         ASSERT(vd == vd->vdev_top);
3721         ASSERT3U(txg, ==, spa_syncing_txg(spa));
3722
3723         dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3724
3725         vdev_destroy_spacemaps(vd, tx);
3726         if (vd->vdev_top_zap != 0) {
3727                 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
3728                 vd->vdev_top_zap = 0;
3729         }
3730
3731         dmu_tx_commit(tx);
3732 }
3733
3734 void
3735 vdev_sync_done(vdev_t *vd, uint64_t txg)
3736 {
3737         metaslab_t *msp;
3738         boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
3739
3740         ASSERT(vdev_is_concrete(vd));
3741
3742         while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
3743             != NULL)
3744                 metaslab_sync_done(msp, txg);
3745
3746         if (reassess) {
3747                 metaslab_sync_reassess(vd->vdev_mg);
3748                 if (vd->vdev_log_mg != NULL)
3749                         metaslab_sync_reassess(vd->vdev_log_mg);
3750         }
3751 }
3752
3753 void
3754 vdev_sync(vdev_t *vd, uint64_t txg)
3755 {
3756         spa_t *spa = vd->vdev_spa;
3757         vdev_t *lvd;
3758         metaslab_t *msp;
3759
3760         ASSERT3U(txg, ==, spa->spa_syncing_txg);
3761         dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3762         if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
3763                 ASSERT(vd->vdev_removing ||
3764                     vd->vdev_ops == &vdev_indirect_ops);
3765
3766                 vdev_indirect_sync_obsolete(vd, tx);
3767
3768                 /*
3769                  * If the vdev is indirect, it can't have dirty
3770                  * metaslabs or DTLs.
3771                  */
3772                 if (vd->vdev_ops == &vdev_indirect_ops) {
3773                         ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
3774                         ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
3775                         dmu_tx_commit(tx);
3776                         return;
3777                 }
3778         }
3779
3780         ASSERT(vdev_is_concrete(vd));
3781
3782         if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
3783             !vd->vdev_removing) {
3784                 ASSERT(vd == vd->vdev_top);
3785                 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3786                 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
3787                     DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
3788                 ASSERT(vd->vdev_ms_array != 0);
3789                 vdev_config_dirty(vd);
3790         }
3791
3792         while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
3793                 metaslab_sync(msp, txg);
3794                 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
3795         }
3796
3797         while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
3798                 vdev_dtl_sync(lvd, txg);
3799
3800         /*
3801          * If this is an empty log device being removed, destroy the
3802          * metadata associated with it.
3803          */
3804         if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
3805                 vdev_remove_empty_log(vd, txg);
3806
3807         (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
3808         dmu_tx_commit(tx);
3809 }
3810
3811 uint64_t
3812 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
3813 {
3814         return (vd->vdev_ops->vdev_op_asize(vd, psize));
3815 }
3816
3817 /*
3818  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
3819  * not be opened, and no I/O is attempted.
3820  */
3821 int
3822 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
3823 {
3824         vdev_t *vd, *tvd;
3825
3826         spa_vdev_state_enter(spa, SCL_NONE);
3827
3828         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
3829                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
3830
3831         if (!vd->vdev_ops->vdev_op_leaf)
3832                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
3833
3834         tvd = vd->vdev_top;
3835
3836         /*
3837          * If user did a 'zpool offline -f' then make the fault persist across
3838          * reboots.
3839          */
3840         if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
3841                 /*
3842                  * There are two kinds of forced faults: temporary and
3843                  * persistent.  Temporary faults go away at pool import, while
3844                  * persistent faults stay set.  Both types of faults can be
3845                  * cleared with a zpool clear.
3846                  *
3847                  * We tell if a vdev is persistently faulted by looking at the
3848                  * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
3849                  * import then it's a persistent fault.  Otherwise, it's
3850                  * temporary.  We get ZPOOL_CONFIG_AUX_STATE set to "external"
3851                  * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This
3852                  * tells vdev_config_generate() (which gets run later) to set
3853                  * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
3854                  */
3855                 vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
3856                 vd->vdev_tmpoffline = B_FALSE;
3857                 aux = VDEV_AUX_EXTERNAL;
3858         } else {
3859                 vd->vdev_tmpoffline = B_TRUE;
3860         }
3861
3862         /*
3863          * We don't directly use the aux state here, but if we do a
3864          * vdev_reopen(), we need this value to be present to remember why we
3865          * were faulted.
3866          */
3867         vd->vdev_label_aux = aux;
3868
3869         /*
3870          * Faulted state takes precedence over degraded.
3871          */
3872         vd->vdev_delayed_close = B_FALSE;
3873         vd->vdev_faulted = 1ULL;
3874         vd->vdev_degraded = 0ULL;
3875         vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
3876
3877         /*
3878          * If this device has the only valid copy of the data, then
3879          * back off and simply mark the vdev as degraded instead.
3880          */
3881         if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
3882                 vd->vdev_degraded = 1ULL;
3883                 vd->vdev_faulted = 0ULL;
3884
3885                 /*
3886                  * If we reopen the device and it's not dead, only then do we
3887                  * mark it degraded.
3888                  */
3889                 vdev_reopen(tvd);
3890
3891                 if (vdev_readable(vd))
3892                         vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
3893         }
3894
3895         return (spa_vdev_state_exit(spa, vd, 0));
3896 }
3897
3898 /*
3899  * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
3900  * user that something is wrong.  The vdev continues to operate as normal as far
3901  * as I/O is concerned.
3902  */
3903 int
3904 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
3905 {
3906         vdev_t *vd;
3907
3908         spa_vdev_state_enter(spa, SCL_NONE);
3909
3910         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
3911                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
3912
3913         if (!vd->vdev_ops->vdev_op_leaf)
3914                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
3915
3916         /*
3917          * If the vdev is already faulted, then don't do anything.
3918          */
3919         if (vd->vdev_faulted || vd->vdev_degraded)
3920                 return (spa_vdev_state_exit(spa, NULL, 0));
3921
3922         vd->vdev_degraded = 1ULL;
3923         if (!vdev_is_dead(vd))
3924                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
3925                     aux);
3926
3927         return (spa_vdev_state_exit(spa, vd, 0));
3928 }
3929
3930 /*
3931  * Online the given vdev.
3932  *
3933  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
3934  * spare device should be detached when the device finishes resilvering.
3935  * Second, the online should be treated like a 'test' online case, so no FMA
3936  * events are generated if the device fails to open.
3937  */
3938 int
3939 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
3940 {
3941         vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
3942         boolean_t wasoffline;
3943         vdev_state_t oldstate;
3944
3945         spa_vdev_state_enter(spa, SCL_NONE);
3946
3947         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
3948                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
3949
3950         if (!vd->vdev_ops->vdev_op_leaf)
3951                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
3952
3953         wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
3954         oldstate = vd->vdev_state;
3955
3956         tvd = vd->vdev_top;
3957         vd->vdev_offline = B_FALSE;
3958         vd->vdev_tmpoffline = B_FALSE;
3959         vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
3960         vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
3961
3962         /* XXX - L2ARC 1.0 does not support expansion */
3963         if (!vd->vdev_aux) {
3964                 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
3965                         pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
3966                             spa->spa_autoexpand);
3967                 vd->vdev_expansion_time = gethrestime_sec();
3968         }
3969
3970         vdev_reopen(tvd);
3971         vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
3972
3973         if (!vd->vdev_aux) {
3974                 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
3975                         pvd->vdev_expanding = B_FALSE;
3976         }
3977
3978         if (newstate)
3979                 *newstate = vd->vdev_state;
3980         if ((flags & ZFS_ONLINE_UNSPARE) &&
3981             !vdev_is_dead(vd) && vd->vdev_parent &&
3982             vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3983             vd->vdev_parent->vdev_child[0] == vd)
3984                 vd->vdev_unspare = B_TRUE;
3985
3986         if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
3987
3988                 /* XXX - L2ARC 1.0 does not support expansion */
3989                 if (vd->vdev_aux)
3990                         return (spa_vdev_state_exit(spa, vd, ENOTSUP));
3991                 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3992         }
3993
3994         /* Restart initializing if necessary */
3995         mutex_enter(&vd->vdev_initialize_lock);
3996         if (vdev_writeable(vd) &&
3997             vd->vdev_initialize_thread == NULL &&
3998             vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
3999                 (void) vdev_initialize(vd);
4000         }
4001         mutex_exit(&vd->vdev_initialize_lock);
4002
4003         /*
4004          * Restart trimming if necessary. We do not restart trimming for cache
4005          * devices here. This is triggered by l2arc_rebuild_vdev()
4006          * asynchronously for the whole device or in l2arc_evict() as it evicts
4007          * space for upcoming writes.
4008          */
4009         mutex_enter(&vd->vdev_trim_lock);
4010         if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
4011             vd->vdev_trim_thread == NULL &&
4012             vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
4013                 (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
4014                     vd->vdev_trim_secure);
4015         }
4016         mutex_exit(&vd->vdev_trim_lock);
4017
4018         if (wasoffline ||
4019             (oldstate < VDEV_STATE_DEGRADED &&
4020             vd->vdev_state >= VDEV_STATE_DEGRADED))
4021                 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
4022
4023         return (spa_vdev_state_exit(spa, vd, 0));
4024 }
4025
4026 static int
4027 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
4028 {
4029         vdev_t *vd, *tvd;
4030         int error = 0;
4031         uint64_t generation;
4032         metaslab_group_t *mg;
4033
4034 top:
4035         spa_vdev_state_enter(spa, SCL_ALLOC);
4036
4037         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4038                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4039
4040         if (!vd->vdev_ops->vdev_op_leaf)
4041                 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4042
4043         if (vd->vdev_ops == &vdev_draid_spare_ops)
4044                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4045
4046         tvd = vd->vdev_top;
4047         mg = tvd->vdev_mg;
4048         generation = spa->spa_config_generation + 1;
4049
4050         /*
4051          * If the device isn't already offline, try to offline it.
4052          */
4053         if (!vd->vdev_offline) {
4054                 /*
4055                  * If this device has the only valid copy of some data,
4056                  * don't allow it to be offlined. Log devices are always
4057                  * expendable.
4058                  */
4059                 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4060                     vdev_dtl_required(vd))
4061                         return (spa_vdev_state_exit(spa, NULL,
4062                             SET_ERROR(EBUSY)));
4063
4064                 /*
4065                  * If the top-level is a slog and it has had allocations
4066                  * then proceed.  We check that the vdev's metaslab group
4067                  * is not NULL since it's possible that we may have just
4068                  * added this vdev but not yet initialized its metaslabs.
4069                  */
4070                 if (tvd->vdev_islog && mg != NULL) {
4071                         /*
4072                          * Prevent any future allocations.
4073                          */
4074                         ASSERT3P(tvd->vdev_log_mg, ==, NULL);
4075                         metaslab_group_passivate(mg);
4076                         (void) spa_vdev_state_exit(spa, vd, 0);
4077
4078                         error = spa_reset_logs(spa);
4079
4080                         /*
4081                          * If the log device was successfully reset but has
4082                          * checkpointed data, do not offline it.
4083                          */
4084                         if (error == 0 &&
4085                             tvd->vdev_checkpoint_sm != NULL) {
4086                                 ASSERT3U(space_map_allocated(
4087                                     tvd->vdev_checkpoint_sm), !=, 0);
4088                                 error = ZFS_ERR_CHECKPOINT_EXISTS;
4089                         }
4090
4091                         spa_vdev_state_enter(spa, SCL_ALLOC);
4092
4093                         /*
4094                          * Check to see if the config has changed.
4095                          */
4096                         if (error || generation != spa->spa_config_generation) {
4097                                 metaslab_group_activate(mg);
4098                                 if (error)
4099                                         return (spa_vdev_state_exit(spa,
4100                                             vd, error));
4101                                 (void) spa_vdev_state_exit(spa, vd, 0);
4102                                 goto top;
4103                         }
4104                         ASSERT0(tvd->vdev_stat.vs_alloc);
4105                 }
4106
4107                 /*
4108                  * Offline this device and reopen its top-level vdev.
4109                  * If the top-level vdev is a log device then just offline
4110                  * it. Otherwise, if this action results in the top-level
4111                  * vdev becoming unusable, undo it and fail the request.
4112                  */
4113                 vd->vdev_offline = B_TRUE;
4114                 vdev_reopen(tvd);
4115
4116                 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4117                     vdev_is_dead(tvd)) {
4118                         vd->vdev_offline = B_FALSE;
4119                         vdev_reopen(tvd);
4120                         return (spa_vdev_state_exit(spa, NULL,
4121                             SET_ERROR(EBUSY)));
4122                 }
4123
4124                 /*
4125                  * Add the device back into the metaslab rotor so that
4126                  * once we online the device it's open for business.
4127                  */
4128                 if (tvd->vdev_islog && mg != NULL)
4129                         metaslab_group_activate(mg);
4130         }
4131
4132         vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
4133
4134         return (spa_vdev_state_exit(spa, vd, 0));
4135 }
4136
4137 int
4138 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
4139 {
4140         int error;
4141
4142         mutex_enter(&spa->spa_vdev_top_lock);
4143         error = vdev_offline_locked(spa, guid, flags);
4144         mutex_exit(&spa->spa_vdev_top_lock);
4145
4146         return (error);
4147 }
4148
4149 /*
4150  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
4151  * vdev_offline(), we assume the spa config is locked.  We also clear all
4152  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
4153  */
4154 void
4155 vdev_clear(spa_t *spa, vdev_t *vd)
4156 {
4157         vdev_t *rvd = spa->spa_root_vdev;
4158
4159         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
4160
4161         if (vd == NULL)
4162                 vd = rvd;
4163
4164         vd->vdev_stat.vs_read_errors = 0;
4165         vd->vdev_stat.vs_write_errors = 0;
4166         vd->vdev_stat.vs_checksum_errors = 0;
4167         vd->vdev_stat.vs_slow_ios = 0;
4168
4169         for (int c = 0; c < vd->vdev_children; c++)
4170                 vdev_clear(spa, vd->vdev_child[c]);
4171
4172         /*
4173          * It makes no sense to "clear" an indirect vdev.
4174          */
4175         if (!vdev_is_concrete(vd))
4176                 return;
4177
4178         /*
4179          * If we're in the FAULTED state or have experienced failed I/O, then
4180          * clear the persistent state and attempt to reopen the device.  We
4181          * also mark the vdev config dirty, so that the new faulted state is
4182          * written out to disk.
4183          */
4184         if (vd->vdev_faulted || vd->vdev_degraded ||
4185             !vdev_readable(vd) || !vdev_writeable(vd)) {
4186                 /*
4187                  * When reopening in response to a clear event, it may be due to
4188                  * a fmadm repair request.  In this case, if the device is
4189                  * still broken, we want to still post the ereport again.
4190                  */
4191                 vd->vdev_forcefault = B_TRUE;
4192
4193                 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
4194                 vd->vdev_cant_read = B_FALSE;
4195                 vd->vdev_cant_write = B_FALSE;
4196                 vd->vdev_stat.vs_aux = 0;
4197
4198                 vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
4199
4200                 vd->vdev_forcefault = B_FALSE;
4201
4202                 if (vd != rvd && vdev_writeable(vd->vdev_top))
4203                         vdev_state_dirty(vd->vdev_top);
4204
4205                 /* If a resilver isn't required, check if vdevs can be culled */
4206                 if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
4207                     !dsl_scan_resilvering(spa->spa_dsl_pool) &&
4208                     !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
4209                         spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
4210
4211                 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
4212         }
4213
4214         /*
4215          * When clearing a FMA-diagnosed fault, we always want to
4216          * unspare the device, as we assume that the original spare was
4217          * done in response to the FMA fault.
4218          */
4219         if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
4220             vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4221             vd->vdev_parent->vdev_child[0] == vd)
4222                 vd->vdev_unspare = B_TRUE;
4223
4224         /* Clear recent error events cache (i.e. duplicate events tracking) */
4225         zfs_ereport_clear(spa, vd);
4226 }
4227
4228 boolean_t
4229 vdev_is_dead(vdev_t *vd)
4230 {
4231         /*
4232          * Holes and missing devices are always considered "dead".
4233          * This simplifies the code since we don't have to check for
4234          * these types of devices in the various code paths.
4235          * Instead we rely on the fact that we skip over dead devices
4236          * before issuing I/O to them.
4237          */
4238         return (vd->vdev_state < VDEV_STATE_DEGRADED ||
4239             vd->vdev_ops == &vdev_hole_ops ||
4240             vd->vdev_ops == &vdev_missing_ops);
4241 }
4242
4243 boolean_t
4244 vdev_readable(vdev_t *vd)
4245 {
4246         return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
4247 }
4248
4249 boolean_t
4250 vdev_writeable(vdev_t *vd)
4251 {
4252         return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
4253             vdev_is_concrete(vd));
4254 }
4255
4256 boolean_t
4257 vdev_allocatable(vdev_t *vd)
4258 {
4259         uint64_t state = vd->vdev_state;
4260
4261         /*
4262          * We currently allow allocations from vdevs which may be in the
4263          * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
4264          * fails to reopen then we'll catch it later when we're holding
4265          * the proper locks.  Note that we have to get the vdev state
4266          * in a local variable because although it changes atomically,
4267          * we're asking two separate questions about it.
4268          */
4269         return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
4270             !vd->vdev_cant_write && vdev_is_concrete(vd) &&
4271             vd->vdev_mg->mg_initialized);
4272 }
4273
4274 boolean_t
4275 vdev_accessible(vdev_t *vd, zio_t *zio)
4276 {
4277         ASSERT(zio->io_vd == vd);
4278
4279         if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
4280                 return (B_FALSE);
4281
4282         if (zio->io_type == ZIO_TYPE_READ)
4283                 return (!vd->vdev_cant_read);
4284
4285         if (zio->io_type == ZIO_TYPE_WRITE)
4286                 return (!vd->vdev_cant_write);
4287
4288         return (B_TRUE);
4289 }
4290
4291 static void
4292 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
4293 {
4294         /*
4295          * Exclude the dRAID spare when aggregating to avoid double counting
4296          * the ops and bytes.  These IOs are counted by the physical leaves.
4297          */
4298         if (cvd->vdev_ops == &vdev_draid_spare_ops)
4299                 return;
4300
4301         for (int t = 0; t < VS_ZIO_TYPES; t++) {
4302                 vs->vs_ops[t] += cvs->vs_ops[t];
4303                 vs->vs_bytes[t] += cvs->vs_bytes[t];
4304         }
4305
4306         cvs->vs_scan_removing = cvd->vdev_removing;
4307 }
4308
4309 /*
4310  * Get extended stats
4311  */
4312 static void
4313 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
4314 {
4315         (void) cvd;
4316
4317         int t, b;
4318         for (t = 0; t < ZIO_TYPES; t++) {
4319                 for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
4320                         vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
4321
4322                 for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
4323                         vsx->vsx_total_histo[t][b] +=
4324                             cvsx->vsx_total_histo[t][b];
4325                 }
4326         }
4327
4328         for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4329                 for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
4330                         vsx->vsx_queue_histo[t][b] +=
4331                             cvsx->vsx_queue_histo[t][b];
4332                 }
4333                 vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
4334                 vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
4335
4336                 for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
4337                         vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];
4338
4339                 for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
4340                         vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
4341         }
4342
4343 }
4344
4345 boolean_t
4346 vdev_is_spacemap_addressable(vdev_t *vd)
4347 {
4348         if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
4349                 return (B_TRUE);
4350
4351         /*
4352          * If double-word space map entries are not enabled we assume
4353          * 47 bits of the space map entry are dedicated to the entry's
4354          * offset (see SM_OFFSET_BITS in space_map.h). We then use that
4355          * to calculate the maximum address that can be described by a
4356          * space map entry for the given device.
4357          */
4358         uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
4359
4360         if (shift >= 63) /* detect potential overflow */
4361                 return (B_TRUE);
4362
4363         return (vd->vdev_asize < (1ULL << shift));
4364 }
4365
4366 /*
4367  * Get statistics for the given vdev.
4368  */
4369 static void
4370 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4371 {
4372         int t;
4373         /*
4374          * If we're getting stats on the root vdev, aggregate the I/O counts
4375          * over all top-level vdevs (i.e. the direct children of the root).
4376          */
4377         if (!vd->vdev_ops->vdev_op_leaf) {
4378                 if (vs) {
4379                         memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
4380                         memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
4381                 }
4382                 if (vsx)
4383                         memset(vsx, 0, sizeof (*vsx));
4384
4385                 for (int c = 0; c < vd->vdev_children; c++) {
4386                         vdev_t *cvd = vd->vdev_child[c];
4387                         vdev_stat_t *cvs = &cvd->vdev_stat;
4388                         vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
4389
4390                         vdev_get_stats_ex_impl(cvd, cvs, cvsx);
4391                         if (vs)
4392                                 vdev_get_child_stat(cvd, vs, cvs);
4393                         if (vsx)
4394                                 vdev_get_child_stat_ex(cvd, vsx, cvsx);
4395                 }
4396         } else {
4397                 /*
4398                  * We're a leaf.  Just copy our ZIO active queue stats in.  The
4399                  * other leaf stats are updated in vdev_stat_update().
4400                  */
4401                 if (!vsx)
4402                         return;
4403
4404                 memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
4405
4406                 for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
4407                         vsx->vsx_active_queue[t] =
4408                             vd->vdev_queue.vq_class[t].vqc_active;
4409                         vsx->vsx_pend_queue[t] = avl_numnodes(
4410                             &vd->vdev_queue.vq_class[t].vqc_queued_tree);
4411                 }
4412         }
4413 }
4414
4415 void
4416 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4417 {
4418         vdev_t *tvd = vd->vdev_top;
4419         mutex_enter(&vd->vdev_stat_lock);
4420         if (vs) {
4421                 memcpy(vs, &vd->vdev_stat, sizeof (*vs));
4422                 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
4423                 vs->vs_state = vd->vdev_state;
4424                 vs->vs_rsize = vdev_get_min_asize(vd);
4425
4426                 if (vd->vdev_ops->vdev_op_leaf) {
4427                         vs->vs_pspace = vd->vdev_psize;
4428                         vs->vs_rsize += VDEV_LABEL_START_SIZE +
4429                             VDEV_LABEL_END_SIZE;
4430                         /*
4431                          * Report initializing progress. Since we don't
4432                          * have the initializing locks held, this is only
4433                          * an estimate (although a fairly accurate one).
4434                          */
4435                         vs->vs_initialize_bytes_done =
4436                             vd->vdev_initialize_bytes_done;
4437                         vs->vs_initialize_bytes_est =
4438                             vd->vdev_initialize_bytes_est;
4439                         vs->vs_initialize_state = vd->vdev_initialize_state;
4440                         vs->vs_initialize_action_time =
4441                             vd->vdev_initialize_action_time;
4442
4443                         /*
4444                          * Report manual TRIM progress. Since we don't have
4445                          * the manual TRIM locks held, this is only an
4446                          * estimate (although fairly accurate one).
4447                          */
4448                         vs->vs_trim_notsup = !vd->vdev_has_trim;
4449                         vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
4450                         vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
4451                         vs->vs_trim_state = vd->vdev_trim_state;
4452                         vs->vs_trim_action_time = vd->vdev_trim_action_time;
4453
4454                         /* Set when there is a deferred resilver. */
4455                         vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
4456                 }
4457
4458                 /*
4459                  * Report expandable space on top-level, non-auxiliary devices
4460                  * only. The expandable space is reported in terms of metaslab
4461                  * sized units since that determines how much space the pool
4462                  * can expand.
4463                  */
4464                 if (vd->vdev_aux == NULL && tvd != NULL) {
4465                         vs->vs_esize = P2ALIGN(
4466                             vd->vdev_max_asize - vd->vdev_asize,
4467                             1ULL << tvd->vdev_ms_shift);
4468                 }
4469
4470                 vs->vs_configured_ashift = vd->vdev_top != NULL
4471                     ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
4472                 vs->vs_logical_ashift = vd->vdev_logical_ashift;
4473                 vs->vs_physical_ashift = vd->vdev_physical_ashift;
4474
4475                 /*
4476                  * Report fragmentation and rebuild progress for top-level,
4477                  * non-auxiliary, concrete devices.
4478                  */
4479                 if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
4480                     vdev_is_concrete(vd)) {
4481                         /*
4482                          * The vdev fragmentation rating doesn't take into
4483                          * account the embedded slog metaslab (vdev_log_mg).
4484                          * Since it's only one metaslab, it would have a tiny
4485                          * impact on the overall fragmentation.
4486                          */
4487                         vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
4488                             vd->vdev_mg->mg_fragmentation : 0;
4489                 }
4490                 vs->vs_noalloc = MAX(vd->vdev_noalloc,
4491                     tvd ? tvd->vdev_noalloc : 0);
4492         }
4493
4494         vdev_get_stats_ex_impl(vd, vs, vsx);
4495         mutex_exit(&vd->vdev_stat_lock);
4496 }
4497
4498 void
4499 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
4500 {
4501         return (vdev_get_stats_ex(vd, vs, NULL));
4502 }
4503
4504 void
4505 vdev_clear_stats(vdev_t *vd)
4506 {
4507         mutex_enter(&vd->vdev_stat_lock);
4508         vd->vdev_stat.vs_space = 0;
4509         vd->vdev_stat.vs_dspace = 0;
4510         vd->vdev_stat.vs_alloc = 0;
4511         mutex_exit(&vd->vdev_stat_lock);
4512 }
4513
4514 void
4515 vdev_scan_stat_init(vdev_t *vd)
4516 {
4517         vdev_stat_t *vs = &vd->vdev_stat;
4518
4519         for (int c = 0; c < vd->vdev_children; c++)
4520                 vdev_scan_stat_init(vd->vdev_child[c]);
4521
4522         mutex_enter(&vd->vdev_stat_lock);
4523         vs->vs_scan_processed = 0;
4524         mutex_exit(&vd->vdev_stat_lock);
4525 }
4526
4527 void
4528 vdev_stat_update(zio_t *zio, uint64_t psize)
4529 {
4530         spa_t *spa = zio->io_spa;
4531         vdev_t *rvd = spa->spa_root_vdev;
4532         vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
4533         vdev_t *pvd;
4534         uint64_t txg = zio->io_txg;
4535         vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
4536         vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
4537         zio_type_t type = zio->io_type;
4538         int flags = zio->io_flags;
4539
4540         /*
4541          * If this i/o is a gang leader, it didn't do any actual work.
4542          */
4543         if (zio->io_gang_tree)
4544                 return;
4545
4546         if (zio->io_error == 0) {
4547                 /*
4548                  * If this is a root i/o, don't count it -- we've already
4549                  * counted the top-level vdevs, and vdev_get_stats() will
4550                  * aggregate them when asked.  This reduces contention on
4551                  * the root vdev_stat_lock and implicitly handles blocks
4552                  * that compress away to holes, for which there is no i/o.
4553                  * (Holes never create vdev children, so all the counters
4554                  * remain zero, which is what we want.)
4555                  *
4556                  * Note: this only applies to successful i/o (io_error == 0)
4557                  * because unlike i/o counts, errors are not additive.
4558                  * When reading a ditto block, for example, failure of
4559                  * one top-level vdev does not imply a root-level error.
4560                  */
4561                 if (vd == rvd)
4562                         return;
4563
4564                 ASSERT(vd == zio->io_vd);
4565
4566                 if (flags & ZIO_FLAG_IO_BYPASS)
4567                         return;
4568
4569                 mutex_enter(&vd->vdev_stat_lock);
4570
4571                 if (flags & ZIO_FLAG_IO_REPAIR) {
4572                         /*
4573                          * Repair is the result of a resilver issued by the
4574                          * scan thread (spa_sync).
4575                          */
4576                         if (flags & ZIO_FLAG_SCAN_THREAD) {
4577                                 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
4578                                 dsl_scan_phys_t *scn_phys = &scn->scn_phys;
4579                                 uint64_t *processed = &scn_phys->scn_processed;
4580
4581                                 if (vd->vdev_ops->vdev_op_leaf)
4582                                         atomic_add_64(processed, psize);
4583                                 vs->vs_scan_processed += psize;
4584                         }
4585
4586                         /*
4587                          * Repair is the result of a rebuild issued by the
4588                          * rebuild thread (vdev_rebuild_thread).  To avoid
4589                          * double counting repaired bytes the virtual dRAID
4590                          * spare vdev is excluded from the processed bytes.
4591                          */
4592                         if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
4593                                 vdev_t *tvd = vd->vdev_top;
4594                                 vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
4595                                 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
4596                                 uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
4597
4598                                 if (vd->vdev_ops->vdev_op_leaf &&
4599                                     vd->vdev_ops != &vdev_draid_spare_ops) {
4600                                         atomic_add_64(rebuilt, psize);
4601                                 }
4602                                 vs->vs_rebuild_processed += psize;
4603                         }
4604
4605                         if (flags & ZIO_FLAG_SELF_HEAL)
4606                                 vs->vs_self_healed += psize;
4607                 }
4608
4609                 /*
4610                  * The bytes/ops/histograms are recorded at the leaf level and
4611                  * aggregated into the higher level vdevs in vdev_get_stats().
4612                  */
4613                 if (vd->vdev_ops->vdev_op_leaf &&
4614                     (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
4615                         zio_type_t vs_type = type;
4616                         zio_priority_t priority = zio->io_priority;
4617
4618                         /*
4619                          * TRIM ops and bytes are reported to user space as
4620                          * ZIO_TYPE_IOCTL.  This is done to preserve the
4621                          * vdev_stat_t structure layout for user space.
4622                          */
4623                         if (type == ZIO_TYPE_TRIM)
4624                                 vs_type = ZIO_TYPE_IOCTL;
4625
4626                         /*
4627                          * Solely for the purposes of 'zpool iostat -lqrw'
4628                          * reporting use the priority to categorize the IO.
4629                          * Only the following are reported to user space:
4630                          *
4631                          *   ZIO_PRIORITY_SYNC_READ,
4632                          *   ZIO_PRIORITY_SYNC_WRITE,
4633                          *   ZIO_PRIORITY_ASYNC_READ,
4634                          *   ZIO_PRIORITY_ASYNC_WRITE,
4635                          *   ZIO_PRIORITY_SCRUB,
4636                          *   ZIO_PRIORITY_TRIM,
4637                          *   ZIO_PRIORITY_REBUILD.
4638                          */
4639                         if (priority == ZIO_PRIORITY_INITIALIZING) {
4640                                 ASSERT3U(type, ==, ZIO_TYPE_WRITE);
4641                                 priority = ZIO_PRIORITY_ASYNC_WRITE;
4642                         } else if (priority == ZIO_PRIORITY_REMOVAL) {
4643                                 priority = ((type == ZIO_TYPE_WRITE) ?
4644                                     ZIO_PRIORITY_ASYNC_WRITE :
4645                                     ZIO_PRIORITY_ASYNC_READ);
4646                         }
4647
4648                         vs->vs_ops[vs_type]++;
4649                         vs->vs_bytes[vs_type] += psize;
4650
4651                         if (flags & ZIO_FLAG_DELEGATED) {
4652                                 vsx->vsx_agg_histo[priority]
4653                                     [RQ_HISTO(zio->io_size)]++;
4654                         } else {
4655                                 vsx->vsx_ind_histo[priority]
4656                                     [RQ_HISTO(zio->io_size)]++;
4657                         }
4658
4659                         if (zio->io_delta && zio->io_delay) {
4660                                 vsx->vsx_queue_histo[priority]
4661                                     [L_HISTO(zio->io_delta - zio->io_delay)]++;
4662                                 vsx->vsx_disk_histo[type]
4663                                     [L_HISTO(zio->io_delay)]++;
4664                                 vsx->vsx_total_histo[type]
4665                                     [L_HISTO(zio->io_delta)]++;
4666                         }
4667                 }
4668
4669                 mutex_exit(&vd->vdev_stat_lock);
4670                 return;
4671         }
4672
4673         if (flags & ZIO_FLAG_SPECULATIVE)
4674                 return;
4675
4676         /*
4677          * If this is an I/O error that is going to be retried, then ignore the
4678          * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
4679          * hard errors, when in reality they can happen for any number of
4680          * innocuous reasons (bus resets, MPxIO link failure, etc).
4681          */
4682         if (zio->io_error == EIO &&
4683             !(zio->io_flags & ZIO_FLAG_IO_RETRY))
4684                 return;
4685
4686         /*
4687          * Intent logs writes won't propagate their error to the root
4688          * I/O so don't mark these types of failures as pool-level
4689          * errors.
4690          */
4691         if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
4692                 return;
4693
4694         if (type == ZIO_TYPE_WRITE && txg != 0 &&
4695             (!(flags & ZIO_FLAG_IO_REPAIR) ||
4696             (flags & ZIO_FLAG_SCAN_THREAD) ||
4697             spa->spa_claiming)) {
4698                 /*
4699                  * This is either a normal write (not a repair), or it's
4700                  * a repair induced by the scrub thread, or it's a repair
4701                  * made by zil_claim() during spa_load() in the first txg.
4702                  * In the normal case, we commit the DTL change in the same
4703                  * txg as the block was born.  In the scrub-induced repair
4704                  * case, we know that scrubs run in first-pass syncing context,
4705                  * so we commit the DTL change in spa_syncing_txg(spa).
4706                  * In the zil_claim() case, we commit in spa_first_txg(spa).
4707                  *
4708                  * We currently do not make DTL entries for failed spontaneous
4709                  * self-healing writes triggered by normal (non-scrubbing)
4710                  * reads, because we have no transactional context in which to
4711                  * do so -- and it's not clear that it'd be desirable anyway.
4712                  */
4713                 if (vd->vdev_ops->vdev_op_leaf) {
4714                         uint64_t commit_txg = txg;
4715                         if (flags & ZIO_FLAG_SCAN_THREAD) {
4716                                 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
4717                                 ASSERT(spa_sync_pass(spa) == 1);
4718                                 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
4719                                 commit_txg = spa_syncing_txg(spa);
4720                         } else if (spa->spa_claiming) {
4721                                 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
4722                                 commit_txg = spa_first_txg(spa);
4723                         }
4724                         ASSERT(commit_txg >= spa_syncing_txg(spa));
4725                         if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
4726                                 return;
4727                         for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
4728                                 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
4729                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
4730                 }
4731                 if (vd != rvd)
4732                         vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
4733         }
4734 }
4735
4736 int64_t
4737 vdev_deflated_space(vdev_t *vd, int64_t space)
4738 {
4739         ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
4740         ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
4741
4742         return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
4743 }
4744
4745 /*
4746  * Update the in-core space usage stats for this vdev, its metaslab class,
4747  * and the root vdev.
4748  */
4749 void
4750 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
4751     int64_t space_delta)
4752 {
4753         (void) defer_delta;
4754         int64_t dspace_delta;
4755         spa_t *spa = vd->vdev_spa;
4756         vdev_t *rvd = spa->spa_root_vdev;
4757
4758         ASSERT(vd == vd->vdev_top);
4759
4760         /*
4761          * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
4762          * factor.  We must calculate this here and not at the root vdev
4763          * because the root vdev's psize-to-asize is simply the max of its
4764          * children's, thus not accurate enough for us.
4765          */
4766         dspace_delta = vdev_deflated_space(vd, space_delta);
4767
4768         mutex_enter(&vd->vdev_stat_lock);
4769         /* ensure we won't underflow */
4770         if (alloc_delta < 0) {
4771                 ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
4772         }
4773
4774         vd->vdev_stat.vs_alloc += alloc_delta;
4775         vd->vdev_stat.vs_space += space_delta;
4776         vd->vdev_stat.vs_dspace += dspace_delta;
4777         mutex_exit(&vd->vdev_stat_lock);
4778
4779         /* every class but log contributes to root space stats */
4780         if (vd->vdev_mg != NULL && !vd->vdev_islog) {
4781                 ASSERT(!vd->vdev_isl2cache);
4782                 mutex_enter(&rvd->vdev_stat_lock);
4783                 rvd->vdev_stat.vs_alloc += alloc_delta;
4784                 rvd->vdev_stat.vs_space += space_delta;
4785                 rvd->vdev_stat.vs_dspace += dspace_delta;
4786                 mutex_exit(&rvd->vdev_stat_lock);
4787         }
4788         /* Note: metaslab_class_space_update moved to metaslab_space_update */
4789 }
4790
4791 /*
4792  * Mark a top-level vdev's config as dirty, placing it on the dirty list
4793  * so that it will be written out next time the vdev configuration is synced.
4794  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
4795  */
4796 void
4797 vdev_config_dirty(vdev_t *vd)
4798 {
4799         spa_t *spa = vd->vdev_spa;
4800         vdev_t *rvd = spa->spa_root_vdev;
4801         int c;
4802
4803         ASSERT(spa_writeable(spa));
4804
4805         /*
4806          * If this is an aux vdev (as with l2cache and spare devices), then we
4807          * update the vdev config manually and set the sync flag.
4808          */
4809         if (vd->vdev_aux != NULL) {
4810                 spa_aux_vdev_t *sav = vd->vdev_aux;
4811                 nvlist_t **aux;
4812                 uint_t naux;
4813
4814                 for (c = 0; c < sav->sav_count; c++) {
4815                         if (sav->sav_vdevs[c] == vd)
4816                                 break;
4817                 }
4818
4819                 if (c == sav->sav_count) {
4820                         /*
4821                          * We're being removed.  There's nothing more to do.
4822                          */
4823                         ASSERT(sav->sav_sync == B_TRUE);
4824                         return;
4825                 }
4826
4827                 sav->sav_sync = B_TRUE;
4828
4829                 if (nvlist_lookup_nvlist_array(sav->sav_config,
4830                     ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
4831                         VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
4832                             ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
4833                 }
4834
4835                 ASSERT(c < naux);
4836
4837                 /*
4838                  * Setting the nvlist in the middle if the array is a little
4839                  * sketchy, but it will work.
4840                  */
4841                 nvlist_free(aux[c]);
4842                 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
4843
4844                 return;
4845         }
4846
4847         /*
4848          * The dirty list is protected by the SCL_CONFIG lock.  The caller
4849          * must either hold SCL_CONFIG as writer, or must be the sync thread
4850          * (which holds SCL_CONFIG as reader).  There's only one sync thread,
4851          * so this is sufficient to ensure mutual exclusion.
4852          */
4853         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
4854             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
4855             spa_config_held(spa, SCL_CONFIG, RW_READER)));
4856
4857         if (vd == rvd) {
4858                 for (c = 0; c < rvd->vdev_children; c++)
4859                         vdev_config_dirty(rvd->vdev_child[c]);
4860         } else {
4861                 ASSERT(vd == vd->vdev_top);
4862
4863                 if (!list_link_active(&vd->vdev_config_dirty_node) &&
4864                     vdev_is_concrete(vd)) {
4865                         list_insert_head(&spa->spa_config_dirty_list, vd);
4866                 }
4867         }
4868 }
4869
4870 void
4871 vdev_config_clean(vdev_t *vd)
4872 {
4873         spa_t *spa = vd->vdev_spa;
4874
4875         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
4876             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
4877             spa_config_held(spa, SCL_CONFIG, RW_READER)));
4878
4879         ASSERT(list_link_active(&vd->vdev_config_dirty_node));
4880         list_remove(&spa->spa_config_dirty_list, vd);
4881 }
4882
4883 /*
4884  * Mark a top-level vdev's state as dirty, so that the next pass of
4885  * spa_sync() can convert this into vdev_config_dirty().  We distinguish
4886  * the state changes from larger config changes because they require
4887  * much less locking, and are often needed for administrative actions.
4888  */
4889 void
4890 vdev_state_dirty(vdev_t *vd)
4891 {
4892         spa_t *spa = vd->vdev_spa;
4893
4894         ASSERT(spa_writeable(spa));
4895         ASSERT(vd == vd->vdev_top);
4896
4897         /*
4898          * The state list is protected by the SCL_STATE lock.  The caller
4899          * must either hold SCL_STATE as writer, or must be the sync thread
4900          * (which holds SCL_STATE as reader).  There's only one sync thread,
4901          * so this is sufficient to ensure mutual exclusion.
4902          */
4903         ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
4904             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
4905             spa_config_held(spa, SCL_STATE, RW_READER)));
4906
4907         if (!list_link_active(&vd->vdev_state_dirty_node) &&
4908             vdev_is_concrete(vd))
4909                 list_insert_head(&spa->spa_state_dirty_list, vd);
4910 }
4911
4912 void
4913 vdev_state_clean(vdev_t *vd)
4914 {
4915         spa_t *spa = vd->vdev_spa;
4916
4917         ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
4918             (dsl_pool_sync_context(spa_get_dsl(spa)) &&
4919             spa_config_held(spa, SCL_STATE, RW_READER)));
4920
4921         ASSERT(list_link_active(&vd->vdev_state_dirty_node));
4922         list_remove(&spa->spa_state_dirty_list, vd);
4923 }
4924
4925 /*
4926  * Propagate vdev state up from children to parent.
4927  */
4928 void
4929 vdev_propagate_state(vdev_t *vd)
4930 {
4931         spa_t *spa = vd->vdev_spa;
4932         vdev_t *rvd = spa->spa_root_vdev;
4933         int degraded = 0, faulted = 0;
4934         int corrupted = 0;
4935         vdev_t *child;
4936
4937         if (vd->vdev_children > 0) {
4938                 for (int c = 0; c < vd->vdev_children; c++) {
4939                         child = vd->vdev_child[c];
4940
4941                         /*
4942                          * Don't factor holes or indirect vdevs into the
4943                          * decision.
4944                          */
4945                         if (!vdev_is_concrete(child))
4946                                 continue;
4947
4948                         if (!vdev_readable(child) ||
4949                             (!vdev_writeable(child) && spa_writeable(spa))) {
4950                                 /*
4951                                  * Root special: if there is a top-level log
4952                                  * device, treat the root vdev as if it were
4953                                  * degraded.
4954                                  */
4955                                 if (child->vdev_islog && vd == rvd)
4956                                         degraded++;
4957                                 else
4958                                         faulted++;
4959                         } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
4960                                 degraded++;
4961                         }
4962
4963                         if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
4964                                 corrupted++;
4965                 }
4966
4967                 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
4968
4969                 /*
4970                  * Root special: if there is a top-level vdev that cannot be
4971                  * opened due to corrupted metadata, then propagate the root
4972                  * vdev's aux state as 'corrupt' rather than 'insufficient
4973                  * replicas'.
4974                  */
4975                 if (corrupted && vd == rvd &&
4976                     rvd->vdev_state == VDEV_STATE_CANT_OPEN)
4977                         vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
4978                             VDEV_AUX_CORRUPT_DATA);
4979         }
4980
4981         if (vd->vdev_parent)
4982                 vdev_propagate_state(vd->vdev_parent);
4983 }
4984
4985 /*
4986  * Set a vdev's state.  If this is during an open, we don't update the parent
4987  * state, because we're in the process of opening children depth-first.
4988  * Otherwise, we propagate the change to the parent.
4989  *
4990  * If this routine places a device in a faulted state, an appropriate ereport is
4991  * generated.
4992  */
4993 void
4994 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
4995 {
4996         uint64_t save_state;
4997         spa_t *spa = vd->vdev_spa;
4998
4999         if (state == vd->vdev_state) {
5000                 /*
5001                  * Since vdev_offline() code path is already in an offline
5002                  * state we can miss a statechange event to OFFLINE. Check
5003                  * the previous state to catch this condition.
5004                  */
5005                 if (vd->vdev_ops->vdev_op_leaf &&
5006                     (state == VDEV_STATE_OFFLINE) &&
5007                     (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
5008                         /* post an offline state change */
5009                         zfs_post_state_change(spa, vd, vd->vdev_prevstate);
5010                 }
5011                 vd->vdev_stat.vs_aux = aux;
5012                 return;
5013         }
5014
5015         save_state = vd->vdev_state;
5016
5017         vd->vdev_state = state;
5018         vd->vdev_stat.vs_aux = aux;
5019
5020         /*
5021          * If we are setting the vdev state to anything but an open state, then
5022          * always close the underlying device unless the device has requested
5023          * a delayed close (i.e. we're about to remove or fault the device).
5024          * Otherwise, we keep accessible but invalid devices open forever.
5025          * We don't call vdev_close() itself, because that implies some extra
5026          * checks (offline, etc) that we don't want here.  This is limited to
5027          * leaf devices, because otherwise closing the device will affect other
5028          * children.
5029          */
5030         if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
5031             vd->vdev_ops->vdev_op_leaf)
5032                 vd->vdev_ops->vdev_op_close(vd);
5033
5034         if (vd->vdev_removed &&
5035             state == VDEV_STATE_CANT_OPEN &&
5036             (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
5037                 /*
5038                  * If the previous state is set to VDEV_STATE_REMOVED, then this
5039                  * device was previously marked removed and someone attempted to
5040                  * reopen it.  If this failed due to a nonexistent device, then
5041                  * keep the device in the REMOVED state.  We also let this be if
5042                  * it is one of our special test online cases, which is only
5043                  * attempting to online the device and shouldn't generate an FMA
5044                  * fault.
5045                  */
5046                 vd->vdev_state = VDEV_STATE_REMOVED;
5047                 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
5048         } else if (state == VDEV_STATE_REMOVED) {
5049                 vd->vdev_removed = B_TRUE;
5050         } else if (state == VDEV_STATE_CANT_OPEN) {
5051                 /*
5052                  * If we fail to open a vdev during an import or recovery, we
5053                  * mark it as "not available", which signifies that it was
5054                  * never there to begin with.  Failure to open such a device
5055                  * is not considered an error.
5056                  */
5057                 if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
5058                     spa_load_state(spa) == SPA_LOAD_RECOVER) &&
5059                     vd->vdev_ops->vdev_op_leaf)
5060                         vd->vdev_not_present = 1;
5061
5062                 /*
5063                  * Post the appropriate ereport.  If the 'prevstate' field is
5064                  * set to something other than VDEV_STATE_UNKNOWN, it indicates
5065                  * that this is part of a vdev_reopen().  In this case, we don't
5066                  * want to post the ereport if the device was already in the
5067                  * CANT_OPEN state beforehand.
5068                  *
5069                  * If the 'checkremove' flag is set, then this is an attempt to
5070                  * online the device in response to an insertion event.  If we
5071                  * hit this case, then we have detected an insertion event for a
5072                  * faulted or offline device that wasn't in the removed state.
5073                  * In this scenario, we don't post an ereport because we are
5074                  * about to replace the device, or attempt an online with
5075                  * vdev_forcefault, which will generate the fault for us.
5076                  */
5077                 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
5078                     !vd->vdev_not_present && !vd->vdev_checkremove &&
5079                     vd != spa->spa_root_vdev) {
5080                         const char *class;
5081
5082                         switch (aux) {
5083                         case VDEV_AUX_OPEN_FAILED:
5084                                 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
5085                                 break;
5086                         case VDEV_AUX_CORRUPT_DATA:
5087                                 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
5088                                 break;
5089                         case VDEV_AUX_NO_REPLICAS:
5090                                 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
5091                                 break;
5092                         case VDEV_AUX_BAD_GUID_SUM:
5093                                 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
5094                                 break;
5095                         case VDEV_AUX_TOO_SMALL:
5096                                 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
5097                                 break;
5098                         case VDEV_AUX_BAD_LABEL:
5099                                 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
5100                                 break;
5101                         case VDEV_AUX_BAD_ASHIFT:
5102                                 class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
5103                                 break;
5104                         default:
5105                                 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
5106                         }
5107
5108                         (void) zfs_ereport_post(class, spa, vd, NULL, NULL,
5109                             save_state);
5110                 }
5111
5112                 /* Erase any notion of persistent removed state */
5113                 vd->vdev_removed = B_FALSE;
5114         } else {
5115                 vd->vdev_removed = B_FALSE;
5116         }
5117
5118         /*
5119          * Notify ZED of any significant state-change on a leaf vdev.
5120          *
5121          */
5122         if (vd->vdev_ops->vdev_op_leaf) {
5123                 /* preserve original state from a vdev_reopen() */
5124                 if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
5125                     (vd->vdev_prevstate != vd->vdev_state) &&
5126                     (save_state <= VDEV_STATE_CLOSED))
5127                         save_state = vd->vdev_prevstate;
5128
5129                 /* filter out state change due to initial vdev_open */
5130                 if (save_state > VDEV_STATE_CLOSED)
5131                         zfs_post_state_change(spa, vd, save_state);
5132         }
5133
5134         if (!isopen && vd->vdev_parent)
5135                 vdev_propagate_state(vd->vdev_parent);
5136 }
5137
5138 boolean_t
5139 vdev_children_are_offline(vdev_t *vd)
5140 {
5141         ASSERT(!vd->vdev_ops->vdev_op_leaf);
5142
5143         for (uint64_t i = 0; i < vd->vdev_children; i++) {
5144                 if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
5145                         return (B_FALSE);
5146         }
5147
5148         return (B_TRUE);
5149 }
5150
5151 /*
5152  * Check the vdev configuration to ensure that it's capable of supporting
5153  * a root pool. We do not support partial configuration.
5154  */
5155 boolean_t
5156 vdev_is_bootable(vdev_t *vd)
5157 {
5158         if (!vd->vdev_ops->vdev_op_leaf) {
5159                 const char *vdev_type = vd->vdev_ops->vdev_op_type;
5160
5161                 if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0)
5162                         return (B_FALSE);
5163         }
5164
5165         for (int c = 0; c < vd->vdev_children; c++) {
5166                 if (!vdev_is_bootable(vd->vdev_child[c]))
5167                         return (B_FALSE);
5168         }
5169         return (B_TRUE);
5170 }
5171
5172 boolean_t
5173 vdev_is_concrete(vdev_t *vd)
5174 {
5175         vdev_ops_t *ops = vd->vdev_ops;
5176         if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
5177             ops == &vdev_missing_ops || ops == &vdev_root_ops) {
5178                 return (B_FALSE);
5179         } else {
5180                 return (B_TRUE);
5181         }
5182 }
5183
5184 /*
5185  * Determine if a log device has valid content.  If the vdev was
5186  * removed or faulted in the MOS config then we know that
5187  * the content on the log device has already been written to the pool.
5188  */
5189 boolean_t
5190 vdev_log_state_valid(vdev_t *vd)
5191 {
5192         if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
5193             !vd->vdev_removed)
5194                 return (B_TRUE);
5195
5196         for (int c = 0; c < vd->vdev_children; c++)
5197                 if (vdev_log_state_valid(vd->vdev_child[c]))
5198                         return (B_TRUE);
5199
5200         return (B_FALSE);
5201 }
5202
5203 /*
5204  * Expand a vdev if possible.
5205  */
5206 void
5207 vdev_expand(vdev_t *vd, uint64_t txg)
5208 {
5209         ASSERT(vd->vdev_top == vd);
5210         ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5211         ASSERT(vdev_is_concrete(vd));
5212
5213         vdev_set_deflate_ratio(vd);
5214
5215         if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
5216             vdev_is_concrete(vd)) {
5217                 vdev_metaslab_group_create(vd);
5218                 VERIFY(vdev_metaslab_init(vd, txg) == 0);
5219                 vdev_config_dirty(vd);
5220         }
5221 }
5222
5223 /*
5224  * Split a vdev.
5225  */
5226 void
5227 vdev_split(vdev_t *vd)
5228 {
5229         vdev_t *cvd, *pvd = vd->vdev_parent;
5230
5231         vdev_remove_child(pvd, vd);
5232         vdev_compact_children(pvd);
5233
5234         cvd = pvd->vdev_child[0];
5235         if (pvd->vdev_children == 1) {
5236                 vdev_remove_parent(cvd);
5237                 cvd->vdev_splitting = B_TRUE;
5238         }
5239         vdev_propagate_state(cvd);
5240 }
5241
5242 void
5243 vdev_deadman(vdev_t *vd, char *tag)
5244 {
5245         for (int c = 0; c < vd->vdev_children; c++) {
5246                 vdev_t *cvd = vd->vdev_child[c];
5247
5248                 vdev_deadman(cvd, tag);
5249         }
5250
5251         if (vd->vdev_ops->vdev_op_leaf) {
5252                 vdev_queue_t *vq = &vd->vdev_queue;
5253
5254                 mutex_enter(&vq->vq_lock);
5255                 if (avl_numnodes(&vq->vq_active_tree) > 0) {
5256                         spa_t *spa = vd->vdev_spa;
5257                         zio_t *fio;
5258                         uint64_t delta;
5259
5260                         zfs_dbgmsg("slow vdev: %s has %lu active IOs",
5261                             vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
5262
5263                         /*
5264                          * Look at the head of all the pending queues,
5265                          * if any I/O has been outstanding for longer than
5266                          * the spa_deadman_synctime invoke the deadman logic.
5267                          */
5268                         fio = avl_first(&vq->vq_active_tree);
5269                         delta = gethrtime() - fio->io_timestamp;
5270                         if (delta > spa_deadman_synctime(spa))
5271                                 zio_deadman(fio, tag);
5272                 }
5273                 mutex_exit(&vq->vq_lock);
5274         }
5275 }
5276
5277 void
5278 vdev_defer_resilver(vdev_t *vd)
5279 {
5280         ASSERT(vd->vdev_ops->vdev_op_leaf);
5281
5282         vd->vdev_resilver_deferred = B_TRUE;
5283         vd->vdev_spa->spa_resilver_deferred = B_TRUE;
5284 }
5285
5286 /*
5287  * Clears the resilver deferred flag on all leaf devs under vd. Returns
5288  * B_TRUE if we have devices that need to be resilvered and are available to
5289  * accept resilver I/Os.
5290  */
5291 boolean_t
5292 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
5293 {
5294         boolean_t resilver_needed = B_FALSE;
5295         spa_t *spa = vd->vdev_spa;
5296
5297         for (int c = 0; c < vd->vdev_children; c++) {
5298                 vdev_t *cvd = vd->vdev_child[c];
5299                 resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
5300         }
5301
5302         if (vd == spa->spa_root_vdev &&
5303             spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
5304                 spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
5305                 vdev_config_dirty(vd);
5306                 spa->spa_resilver_deferred = B_FALSE;
5307                 return (resilver_needed);
5308         }
5309
5310         if (!vdev_is_concrete(vd) || vd->vdev_aux ||
5311             !vd->vdev_ops->vdev_op_leaf)
5312                 return (resilver_needed);
5313
5314         vd->vdev_resilver_deferred = B_FALSE;
5315
5316         return (!vdev_is_dead(vd) && !vd->vdev_offline &&
5317             vdev_resilver_needed(vd, NULL, NULL));
5318 }
5319
5320 boolean_t
5321 vdev_xlate_is_empty(range_seg64_t *rs)
5322 {
5323         return (rs->rs_start == rs->rs_end);
5324 }
5325
5326 /*
5327  * Translate a logical range to the first contiguous physical range for the
5328  * specified vdev_t.  This function is initially called with a leaf vdev and
5329  * will walk each parent vdev until it reaches a top-level vdev. Once the
5330  * top-level is reached the physical range is initialized and the recursive
5331  * function begins to unwind. As it unwinds it calls the parent's vdev
5332  * specific translation function to do the real conversion.
5333  */
5334 void
5335 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
5336     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
5337 {
5338         /*
5339          * Walk up the vdev tree
5340          */
5341         if (vd != vd->vdev_top) {
5342                 vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
5343                     remain_rs);
5344         } else {
5345                 /*
5346                  * We've reached the top-level vdev, initialize the physical
5347                  * range to the logical range and set an empty remaining
5348                  * range then start to unwind.
5349                  */
5350                 physical_rs->rs_start = logical_rs->rs_start;
5351                 physical_rs->rs_end = logical_rs->rs_end;
5352
5353                 remain_rs->rs_start = logical_rs->rs_start;
5354                 remain_rs->rs_end = logical_rs->rs_start;
5355
5356                 return;
5357         }
5358
5359         vdev_t *pvd = vd->vdev_parent;
5360         ASSERT3P(pvd, !=, NULL);
5361         ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
5362
5363         /*
5364          * As this recursive function unwinds, translate the logical
5365          * range into its physical and any remaining components by calling
5366          * the vdev specific translate function.
5367          */
5368         range_seg64_t intermediate = { 0 };
5369         pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
5370
5371         physical_rs->rs_start = intermediate.rs_start;
5372         physical_rs->rs_end = intermediate.rs_end;
5373 }
5374
5375 void
5376 vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
5377     vdev_xlate_func_t *func, void *arg)
5378 {
5379         range_seg64_t iter_rs = *logical_rs;
5380         range_seg64_t physical_rs;
5381         range_seg64_t remain_rs;
5382
5383         while (!vdev_xlate_is_empty(&iter_rs)) {
5384
5385                 vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
5386
5387                 /*
5388                  * With raidz and dRAID, it's possible that the logical range
5389                  * does not live on this leaf vdev. Only when there is a non-
5390                  * zero physical size call the provided function.
5391                  */
5392                 if (!vdev_xlate_is_empty(&physical_rs))
5393                         func(arg, &physical_rs);
5394
5395                 iter_rs = remain_rs;
5396         }
5397 }
5398
5399 static char *
5400 vdev_name(vdev_t *vd, char *buf, int buflen)
5401 {
5402         if (vd->vdev_path == NULL) {
5403                 if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
5404                         strlcpy(buf, vd->vdev_spa->spa_name, buflen);
5405                 } else if (!vd->vdev_ops->vdev_op_leaf) {
5406                         snprintf(buf, buflen, "%s-%llu",
5407                             vd->vdev_ops->vdev_op_type,
5408                             (u_longlong_t)vd->vdev_id);
5409                 }
5410         } else {
5411                 strlcpy(buf, vd->vdev_path, buflen);
5412         }
5413         return (buf);
5414 }
5415
5416 /*
5417  * Look at the vdev tree and determine whether any devices are currently being
5418  * replaced.
5419  */
5420 boolean_t
5421 vdev_replace_in_progress(vdev_t *vdev)
5422 {
5423         ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
5424
5425         if (vdev->vdev_ops == &vdev_replacing_ops)
5426                 return (B_TRUE);
5427
5428         /*
5429          * A 'spare' vdev indicates that we have a replace in progress, unless
5430          * it has exactly two children, and the second, the hot spare, has
5431          * finished being resilvered.
5432          */
5433         if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 ||
5434             !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING)))
5435                 return (B_TRUE);
5436
5437         for (int i = 0; i < vdev->vdev_children; i++) {
5438                 if (vdev_replace_in_progress(vdev->vdev_child[i]))
5439                         return (B_TRUE);
5440         }
5441
5442         return (B_FALSE);
5443 }
5444
5445 /*
5446  * Add a (source=src, propname=propval) list to an nvlist.
5447  */
5448 static void
5449 vdev_prop_add_list(nvlist_t *nvl, const char *propname, char *strval,
5450     uint64_t intval, zprop_source_t src)
5451 {
5452         nvlist_t *propval;
5453
5454         propval = fnvlist_alloc();
5455         fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
5456
5457         if (strval != NULL)
5458                 fnvlist_add_string(propval, ZPROP_VALUE, strval);
5459         else
5460                 fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
5461
5462         fnvlist_add_nvlist(nvl, propname, propval);
5463         nvlist_free(propval);
5464 }
5465
5466 static void
5467 vdev_props_set_sync(void *arg, dmu_tx_t *tx)
5468 {
5469         vdev_t *vd;
5470         nvlist_t *nvp = arg;
5471         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5472         objset_t *mos = spa->spa_meta_objset;
5473         nvpair_t *elem = NULL;
5474         uint64_t vdev_guid;
5475         nvlist_t *nvprops;
5476
5477         vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
5478         nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
5479         vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
5480
5481         /* this vdev could get removed while waiting for this sync task */
5482         if (vd == NULL)
5483                 return;
5484
5485         mutex_enter(&spa->spa_props_lock);
5486
5487         while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
5488                 uint64_t intval, objid = 0;
5489                 char *strval;
5490                 vdev_prop_t prop;
5491                 const char *propname = nvpair_name(elem);
5492                 zprop_type_t proptype;
5493
5494                 /*
5495                  * Set vdev property values in the vdev props mos object.
5496                  */
5497                 if (vd->vdev_top_zap != 0) {
5498                         objid = vd->vdev_top_zap;
5499                 } else if (vd->vdev_leaf_zap != 0) {
5500                         objid = vd->vdev_leaf_zap;
5501                 } else {
5502                         panic("vdev not top or leaf");
5503                 }
5504
5505                 switch (prop = vdev_name_to_prop(propname)) {
5506                 case VDEV_PROP_USER:
5507                         if (vdev_prop_user(propname)) {
5508                                 strval = fnvpair_value_string(elem);
5509                                 if (strlen(strval) == 0) {
5510                                         /* remove the property if value == "" */
5511                                         (void) zap_remove(mos, objid, propname,
5512                                             tx);
5513                                 } else {
5514                                         VERIFY0(zap_update(mos, objid, propname,
5515                                             1, strlen(strval) + 1, strval, tx));
5516                                 }
5517                                 spa_history_log_internal(spa, "vdev set", tx,
5518                                     "vdev_guid=%llu: %s=%s",
5519                                     (u_longlong_t)vdev_guid, nvpair_name(elem),
5520                                     strval);
5521                         }
5522                         break;
5523                 default:
5524                         /* normalize the property name */
5525                         propname = vdev_prop_to_name(prop);
5526                         proptype = vdev_prop_get_type(prop);
5527
5528                         if (nvpair_type(elem) == DATA_TYPE_STRING) {
5529                                 ASSERT(proptype == PROP_TYPE_STRING);
5530                                 strval = fnvpair_value_string(elem);
5531                                 VERIFY0(zap_update(mos, objid, propname,
5532                                     1, strlen(strval) + 1, strval, tx));
5533                                 spa_history_log_internal(spa, "vdev set", tx,
5534                                     "vdev_guid=%llu: %s=%s",
5535                                     (u_longlong_t)vdev_guid, nvpair_name(elem),
5536                                     strval);
5537                         } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5538                                 intval = fnvpair_value_uint64(elem);
5539
5540                                 if (proptype == PROP_TYPE_INDEX) {
5541                                         const char *unused;
5542                                         VERIFY0(vdev_prop_index_to_string(
5543                                             prop, intval, &unused));
5544                                 }
5545                                 VERIFY0(zap_update(mos, objid, propname,
5546                                     sizeof (uint64_t), 1, &intval, tx));
5547                                 spa_history_log_internal(spa, "vdev set", tx,
5548                                     "vdev_guid=%llu: %s=%lld",
5549                                     (u_longlong_t)vdev_guid,
5550                                     nvpair_name(elem), (longlong_t)intval);
5551                         } else {
5552                                 panic("invalid vdev property type %u",
5553                                     nvpair_type(elem));
5554                         }
5555                 }
5556
5557         }
5558
5559         mutex_exit(&spa->spa_props_lock);
5560 }
5561
5562 int
5563 vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
5564 {
5565         spa_t *spa = vd->vdev_spa;
5566         nvpair_t *elem = NULL;
5567         uint64_t vdev_guid;
5568         nvlist_t *nvprops;
5569         int error;
5570
5571         ASSERT(vd != NULL);
5572
5573         if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
5574             &vdev_guid) != 0)
5575                 return (SET_ERROR(EINVAL));
5576
5577         if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
5578             &nvprops) != 0)
5579                 return (SET_ERROR(EINVAL));
5580
5581         if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
5582                 return (SET_ERROR(EINVAL));
5583
5584         while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
5585                 char *propname = nvpair_name(elem);
5586                 vdev_prop_t prop = vdev_name_to_prop(propname);
5587                 uint64_t intval = 0;
5588                 char *strval = NULL;
5589
5590                 if (prop == VDEV_PROP_USER && !vdev_prop_user(propname)) {
5591                         error = EINVAL;
5592                         goto end;
5593                 }
5594
5595                 if (vdev_prop_readonly(prop)) {
5596                         error = EROFS;
5597                         goto end;
5598                 }
5599
5600                 /* Special Processing */
5601                 switch (prop) {
5602                 case VDEV_PROP_PATH:
5603                         if (vd->vdev_path == NULL) {
5604                                 error = EROFS;
5605                                 break;
5606                         }
5607                         if (nvpair_value_string(elem, &strval) != 0) {
5608                                 error = EINVAL;
5609                                 break;
5610                         }
5611                         /* New path must start with /dev/ */
5612                         if (strncmp(strval, "/dev/", 5)) {
5613                                 error = EINVAL;
5614                                 break;
5615                         }
5616                         error = spa_vdev_setpath(spa, vdev_guid, strval);
5617                         break;
5618                 case VDEV_PROP_ALLOCATING:
5619                         if (nvpair_value_uint64(elem, &intval) != 0) {
5620                                 error = EINVAL;
5621                                 break;
5622                         }
5623                         if (intval != vd->vdev_noalloc)
5624                                 break;
5625                         if (intval == 0)
5626                                 error = spa_vdev_noalloc(spa, vdev_guid);
5627                         else
5628                                 error = spa_vdev_alloc(spa, vdev_guid);
5629                         break;
5630                 default:
5631                         /* Most processing is done in vdev_props_set_sync */
5632                         break;
5633                 }
5634 end:
5635                 if (error != 0) {
5636                         intval = error;
5637                         vdev_prop_add_list(outnvl, propname, strval, intval, 0);
5638                         return (error);
5639                 }
5640         }
5641
5642         return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
5643             innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
5644 }
5645
5646 int
5647 vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
5648 {
5649         spa_t *spa = vd->vdev_spa;
5650         objset_t *mos = spa->spa_meta_objset;
5651         int err = 0;
5652         uint64_t objid;
5653         uint64_t vdev_guid;
5654         nvpair_t *elem = NULL;
5655         nvlist_t *nvprops = NULL;
5656         uint64_t intval = 0;
5657         char *strval = NULL;
5658         const char *propname = NULL;
5659         vdev_prop_t prop;
5660
5661         ASSERT(vd != NULL);
5662         ASSERT(mos != NULL);
5663
5664         if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
5665             &vdev_guid) != 0)
5666                 return (SET_ERROR(EINVAL));
5667
5668         nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
5669
5670         if (vd->vdev_top_zap != 0) {
5671                 objid = vd->vdev_top_zap;
5672         } else if (vd->vdev_leaf_zap != 0) {
5673                 objid = vd->vdev_leaf_zap;
5674         } else {
5675                 return (SET_ERROR(EINVAL));
5676         }
5677         ASSERT(objid != 0);
5678
5679         mutex_enter(&spa->spa_props_lock);
5680
5681         if (nvprops != NULL) {
5682                 char namebuf[64] = { 0 };
5683
5684                 while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
5685                         intval = 0;
5686                         strval = NULL;
5687                         propname = nvpair_name(elem);
5688                         prop = vdev_name_to_prop(propname);
5689                         zprop_source_t src = ZPROP_SRC_DEFAULT;
5690                         uint64_t integer_size, num_integers;
5691
5692                         switch (prop) {
5693                         /* Special Read-only Properties */
5694                         case VDEV_PROP_NAME:
5695                                 strval = vdev_name(vd, namebuf,
5696                                     sizeof (namebuf));
5697                                 if (strval == NULL)
5698                                         continue;
5699                                 vdev_prop_add_list(outnvl, propname, strval, 0,
5700                                     ZPROP_SRC_NONE);
5701                                 continue;
5702                         case VDEV_PROP_CAPACITY:
5703                                 /* percent used */
5704                                 intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
5705                                     (vd->vdev_stat.vs_alloc * 100 /
5706                                     vd->vdev_stat.vs_dspace);
5707                                 vdev_prop_add_list(outnvl, propname, NULL,
5708                                     intval, ZPROP_SRC_NONE);
5709                                 continue;
5710                         case VDEV_PROP_STATE:
5711                                 vdev_prop_add_list(outnvl, propname, NULL,
5712                                     vd->vdev_state, ZPROP_SRC_NONE);
5713                                 continue;
5714                         case VDEV_PROP_GUID:
5715                                 vdev_prop_add_list(outnvl, propname, NULL,
5716                                     vd->vdev_guid, ZPROP_SRC_NONE);
5717                                 continue;
5718                         case VDEV_PROP_ASIZE:
5719                                 vdev_prop_add_list(outnvl, propname, NULL,
5720                                     vd->vdev_asize, ZPROP_SRC_NONE);
5721                                 continue;
5722                         case VDEV_PROP_PSIZE:
5723                                 vdev_prop_add_list(outnvl, propname, NULL,
5724                                     vd->vdev_psize, ZPROP_SRC_NONE);
5725                                 continue;
5726                         case VDEV_PROP_ASHIFT:
5727                                 vdev_prop_add_list(outnvl, propname, NULL,
5728                                     vd->vdev_ashift, ZPROP_SRC_NONE);
5729                                 continue;
5730                         case VDEV_PROP_SIZE:
5731                                 vdev_prop_add_list(outnvl, propname, NULL,
5732                                     vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
5733                                 continue;
5734                         case VDEV_PROP_FREE:
5735                                 vdev_prop_add_list(outnvl, propname, NULL,
5736                                     vd->vdev_stat.vs_dspace -
5737                                     vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
5738                                 continue;
5739                         case VDEV_PROP_ALLOCATED:
5740                                 vdev_prop_add_list(outnvl, propname, NULL,
5741                                     vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
5742                                 continue;
5743                         case VDEV_PROP_EXPANDSZ:
5744                                 vdev_prop_add_list(outnvl, propname, NULL,
5745                                     vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
5746                                 continue;
5747                         case VDEV_PROP_FRAGMENTATION:
5748                                 vdev_prop_add_list(outnvl, propname, NULL,
5749                                     vd->vdev_stat.vs_fragmentation,
5750                                     ZPROP_SRC_NONE);
5751                                 continue;
5752                         case VDEV_PROP_PARITY:
5753                                 vdev_prop_add_list(outnvl, propname, NULL,
5754                                     vdev_get_nparity(vd), ZPROP_SRC_NONE);
5755                                 continue;
5756                         case VDEV_PROP_PATH:
5757                                 if (vd->vdev_path == NULL)
5758                                         continue;
5759                                 vdev_prop_add_list(outnvl, propname,
5760                                     vd->vdev_path, 0, ZPROP_SRC_NONE);
5761                                 continue;
5762                         case VDEV_PROP_DEVID:
5763                                 if (vd->vdev_devid == NULL)
5764                                         continue;
5765                                 vdev_prop_add_list(outnvl, propname,
5766                                     vd->vdev_devid, 0, ZPROP_SRC_NONE);
5767                                 continue;
5768                         case VDEV_PROP_PHYS_PATH:
5769                                 if (vd->vdev_physpath == NULL)
5770                                         continue;
5771                                 vdev_prop_add_list(outnvl, propname,
5772                                     vd->vdev_physpath, 0, ZPROP_SRC_NONE);
5773                                 continue;
5774                         case VDEV_PROP_ENC_PATH:
5775                                 if (vd->vdev_enc_sysfs_path == NULL)
5776                                         continue;
5777                                 vdev_prop_add_list(outnvl, propname,
5778                                     vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
5779                                 continue;
5780                         case VDEV_PROP_FRU:
5781                                 if (vd->vdev_fru == NULL)
5782                                         continue;
5783                                 vdev_prop_add_list(outnvl, propname,
5784                                     vd->vdev_fru, 0, ZPROP_SRC_NONE);
5785                                 continue;
5786                         case VDEV_PROP_PARENT:
5787                                 if (vd->vdev_parent != NULL) {
5788                                         strval = vdev_name(vd->vdev_parent,
5789                                             namebuf, sizeof (namebuf));
5790                                         vdev_prop_add_list(outnvl, propname,
5791                                             strval, 0, ZPROP_SRC_NONE);
5792                                 }
5793                                 continue;
5794                         case VDEV_PROP_CHILDREN:
5795                                 if (vd->vdev_children > 0)
5796                                         strval = kmem_zalloc(ZAP_MAXVALUELEN,
5797                                             KM_SLEEP);
5798                                 for (uint64_t i = 0; i < vd->vdev_children;
5799                                     i++) {
5800                                         char *vname;
5801
5802                                         vname = vdev_name(vd->vdev_child[i],
5803                                             namebuf, sizeof (namebuf));
5804                                         if (vname == NULL)
5805                                                 vname = "(unknown)";
5806                                         if (strlen(strval) > 0)
5807                                                 strlcat(strval, ",",
5808                                                     ZAP_MAXVALUELEN);
5809                                         strlcat(strval, vname, ZAP_MAXVALUELEN);
5810                                 }
5811                                 if (strval != NULL) {
5812                                         vdev_prop_add_list(outnvl, propname,
5813                                             strval, 0, ZPROP_SRC_NONE);
5814                                         kmem_free(strval, ZAP_MAXVALUELEN);
5815                                 }
5816                                 continue;
5817                         case VDEV_PROP_NUMCHILDREN:
5818                                 vdev_prop_add_list(outnvl, propname, NULL,
5819                                     vd->vdev_children, ZPROP_SRC_NONE);
5820                                 continue;
5821                         case VDEV_PROP_READ_ERRORS:
5822                                 vdev_prop_add_list(outnvl, propname, NULL,
5823                                     vd->vdev_stat.vs_read_errors,
5824                                     ZPROP_SRC_NONE);
5825                                 continue;
5826                         case VDEV_PROP_WRITE_ERRORS:
5827                                 vdev_prop_add_list(outnvl, propname, NULL,
5828                                     vd->vdev_stat.vs_write_errors,
5829                                     ZPROP_SRC_NONE);
5830                                 continue;
5831                         case VDEV_PROP_CHECKSUM_ERRORS:
5832                                 vdev_prop_add_list(outnvl, propname, NULL,
5833                                     vd->vdev_stat.vs_checksum_errors,
5834                                     ZPROP_SRC_NONE);
5835                                 continue;
5836                         case VDEV_PROP_INITIALIZE_ERRORS:
5837                                 vdev_prop_add_list(outnvl, propname, NULL,
5838                                     vd->vdev_stat.vs_initialize_errors,
5839                                     ZPROP_SRC_NONE);
5840                                 continue;
5841                         case VDEV_PROP_OPS_NULL:
5842                                 vdev_prop_add_list(outnvl, propname, NULL,
5843                                     vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
5844                                     ZPROP_SRC_NONE);
5845                                 continue;
5846                         case VDEV_PROP_OPS_READ:
5847                                 vdev_prop_add_list(outnvl, propname, NULL,
5848                                     vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
5849                                     ZPROP_SRC_NONE);
5850                                 continue;
5851                         case VDEV_PROP_OPS_WRITE:
5852                                 vdev_prop_add_list(outnvl, propname, NULL,
5853                                     vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
5854                                     ZPROP_SRC_NONE);
5855                                 continue;
5856                         case VDEV_PROP_OPS_FREE:
5857                                 vdev_prop_add_list(outnvl, propname, NULL,
5858                                     vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
5859                                     ZPROP_SRC_NONE);
5860                                 continue;
5861                         case VDEV_PROP_OPS_CLAIM:
5862                                 vdev_prop_add_list(outnvl, propname, NULL,
5863                                     vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
5864                                     ZPROP_SRC_NONE);
5865                                 continue;
5866                         case VDEV_PROP_OPS_TRIM:
5867                                 /*
5868                                  * TRIM ops and bytes are reported to user
5869                                  * space as ZIO_TYPE_IOCTL.  This is done to
5870                                  * preserve the vdev_stat_t structure layout
5871                                  * for user space.
5872                                  */
5873                                 vdev_prop_add_list(outnvl, propname, NULL,
5874                                     vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL],
5875                                     ZPROP_SRC_NONE);
5876                                 continue;
5877                         case VDEV_PROP_BYTES_NULL:
5878                                 vdev_prop_add_list(outnvl, propname, NULL,
5879                                     vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
5880                                     ZPROP_SRC_NONE);
5881                                 continue;
5882                         case VDEV_PROP_BYTES_READ:
5883                                 vdev_prop_add_list(outnvl, propname, NULL,
5884                                     vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
5885                                     ZPROP_SRC_NONE);
5886                                 continue;
5887                         case VDEV_PROP_BYTES_WRITE:
5888                                 vdev_prop_add_list(outnvl, propname, NULL,
5889                                     vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
5890                                     ZPROP_SRC_NONE);
5891                                 continue;
5892                         case VDEV_PROP_BYTES_FREE:
5893                                 vdev_prop_add_list(outnvl, propname, NULL,
5894                                     vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
5895                                     ZPROP_SRC_NONE);
5896                                 continue;
5897                         case VDEV_PROP_BYTES_CLAIM:
5898                                 vdev_prop_add_list(outnvl, propname, NULL,
5899                                     vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
5900                                     ZPROP_SRC_NONE);
5901                                 continue;
5902                         case VDEV_PROP_BYTES_TRIM:
5903                                 /*
5904                                  * TRIM ops and bytes are reported to user
5905                                  * space as ZIO_TYPE_IOCTL.  This is done to
5906                                  * preserve the vdev_stat_t structure layout
5907                                  * for user space.
5908                                  */
5909                                 vdev_prop_add_list(outnvl, propname, NULL,
5910                                     vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL],
5911                                     ZPROP_SRC_NONE);
5912                                 continue;
5913                         case VDEV_PROP_REMOVING:
5914                                 vdev_prop_add_list(outnvl, propname, NULL,
5915                                     vd->vdev_removing, ZPROP_SRC_NONE);
5916                                 continue;
5917                         /* Numeric Properites */
5918                         case VDEV_PROP_ALLOCATING:
5919                                 src = ZPROP_SRC_LOCAL;
5920                                 strval = NULL;
5921
5922                                 err = zap_lookup(mos, objid, nvpair_name(elem),
5923                                     sizeof (uint64_t), 1, &intval);
5924                                 if (err == ENOENT) {
5925                                         intval =
5926                                             vdev_prop_default_numeric(prop);
5927                                         err = 0;
5928                                 } else if (err)
5929                                         break;
5930                                 if (intval == vdev_prop_default_numeric(prop))
5931                                         src = ZPROP_SRC_DEFAULT;
5932
5933                                 /* Leaf vdevs cannot have this property */
5934                                 if (vd->vdev_mg == NULL &&
5935                                     vd->vdev_top != NULL) {
5936                                         src = ZPROP_SRC_NONE;
5937                                         intval = ZPROP_BOOLEAN_NA;
5938                                 }
5939
5940                                 vdev_prop_add_list(outnvl, propname, strval,
5941                                     intval, src);
5942                                 break;
5943                         /* Text Properties */
5944                         case VDEV_PROP_COMMENT:
5945                                 /* Exists in the ZAP below */
5946                                 /* FALLTHRU */
5947                         case VDEV_PROP_USER:
5948                                 /* User Properites */
5949                                 src = ZPROP_SRC_LOCAL;
5950
5951                                 err = zap_length(mos, objid, nvpair_name(elem),
5952                                     &integer_size, &num_integers);
5953                                 if (err)
5954                                         break;
5955
5956                                 switch (integer_size) {
5957                                 case 8:
5958                                         /* User properties cannot be integers */
5959                                         err = EINVAL;
5960                                         break;
5961                                 case 1:
5962                                         /* string property */
5963                                         strval = kmem_alloc(num_integers,
5964                                             KM_SLEEP);
5965                                         err = zap_lookup(mos, objid,
5966                                             nvpair_name(elem), 1,
5967                                             num_integers, strval);
5968                                         if (err) {
5969                                                 kmem_free(strval,
5970                                                     num_integers);
5971                                                 break;
5972                                         }
5973                                         vdev_prop_add_list(outnvl, propname,
5974                                             strval, 0, src);
5975                                         kmem_free(strval, num_integers);
5976                                         break;
5977                                 }
5978                                 break;
5979                         default:
5980                                 err = ENOENT;
5981                                 break;
5982                         }
5983                         if (err)
5984                                 break;
5985                 }
5986         } else {
5987                 /*
5988                  * Get all properties from the MOS vdev property object.
5989                  */
5990                 zap_cursor_t zc;
5991                 zap_attribute_t za;
5992                 for (zap_cursor_init(&zc, mos, objid);
5993                     (err = zap_cursor_retrieve(&zc, &za)) == 0;
5994                     zap_cursor_advance(&zc)) {
5995                         intval = 0;
5996                         strval = NULL;
5997                         zprop_source_t src = ZPROP_SRC_DEFAULT;
5998                         propname = za.za_name;
5999                         prop = vdev_name_to_prop(propname);
6000
6001                         switch (za.za_integer_length) {
6002                         case 8:
6003                                 /* We do not allow integer user properties */
6004                                 /* This is likely an internal value */
6005                                 break;
6006                         case 1:
6007                                 /* string property */
6008                                 strval = kmem_alloc(za.za_num_integers,
6009                                     KM_SLEEP);
6010                                 err = zap_lookup(mos, objid, za.za_name, 1,
6011                                     za.za_num_integers, strval);
6012                                 if (err) {
6013                                         kmem_free(strval, za.za_num_integers);
6014                                         break;
6015                                 }
6016                                 vdev_prop_add_list(outnvl, propname, strval, 0,
6017                                     src);
6018                                 kmem_free(strval, za.za_num_integers);
6019                                 break;
6020
6021                         default:
6022                                 break;
6023                         }
6024                 }
6025                 zap_cursor_fini(&zc);
6026         }
6027
6028         mutex_exit(&spa->spa_props_lock);
6029         if (err && err != ENOENT) {
6030                 return (err);
6031         }
6032
6033         return (0);
6034 }
6035
/*
 * Symbols exported from this file: the vdev fault, degrade, online,
 * offline and clear entry points.
 * NOTE(review): presumably for consumers outside this module; confirm
 * against the platform's EXPORT_SYMBOL definition.
 */
6036 EXPORT_SYMBOL(vdev_fault);
6037 EXPORT_SYMBOL(vdev_degrade);
6038 EXPORT_SYMBOL(vdev_online);
6039 EXPORT_SYMBOL(vdev_offline);
6040 EXPORT_SYMBOL(vdev_clear);
6041
/*
 * Tunables registered by this file.  Each ZFS_MODULE_PARAM entry lists, in
 * order: a scope name, a name prefix, the tunable name, a type tag, an
 * access mode (ZMOD_RW for all of them) and a description string.
 * NOTE(review): the prefix and the name appear to combine into the backing
 * variable (e.g. zfs_vdev_ + default_ms_count); confirm against the macro.
 */
6042 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
6043         "Target number of metaslabs per top-level vdev");
6044
6045 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
6046         "Default limit for metaslab size");
6047
6048 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
6049         "Minimum number of metaslabs per top-level vdev");
6050
6051 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
6052         "Practical upper limit of total metaslabs per top-level vdev");
6053
6054 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
6055         "Rate limit slow IO (delay) events to this many per second");
6056
6057 /* BEGIN CSTYLED */
6058 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
6059         "Rate limit checksum events to this many checksum errors per second "
6060         "(do not set below ZED threshold).");
6061 /* END CSTYLED */
6062
6063 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
6064         "Ignore errors during resilver/scrub");
6065
6066 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
6067         "Bypass vdev_validate()");
6068
6069 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
6070         "Disable cache flushes");
6071
6072 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
6073         "Minimum number of metaslabs required to dedicate one for log blocks");
6074
/*
 * The two auto-ashift bounds use ZFS_MODULE_PARAM_CALL instead: in place
 * of a type tag they name a setter (param_set_min_auto_ashift /
 * param_set_max_auto_ashift) and a getter (param_get_ulong).
 */
6075 /* BEGIN CSTYLED */
6076 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
6077         param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
6078         "Minimum ashift used when creating new top-level vdevs");
6079
6080 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
6081         param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
6082         "Maximum ashift used when optimizing for logical -> physical sector "
6083         "size on new top-level vdevs");
6084 /* END CSTYLED */