module/zfs/spa.c
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
1d3ba0bf 9 * or https://opensource.org/licenses/CDDL-1.0.
34dc7c2f
BB
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
428870ff 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
4f072827 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
733b5722 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
0c66c32d 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
3c67d83a 27 * Copyright 2013 Saso Kiselkov. All rights reserved.
e550644f
BB
28 * Copyright (c) 2014 Integros [integros.com]
29 * Copyright 2016 Toomas Soome <tsoome@me.com>
a0bd735a 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
f65fbee1 31 * Copyright 2018 Joyent, Inc.
3c819a2c 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
12fa0466 33 * Copyright 2017 Joyent, Inc.
cc99f275 34 * Copyright (c) 2017, Intel Corporation.
658fb802 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
a38718a6 36 */
34dc7c2f 37
34dc7c2f 38/*
e49f1e20
WA
39 * SPA: Storage Pool Allocator
40 *
34dc7c2f
BB
41 * This file contains all the routines used when modifying on-disk SPA state.
42 * This includes opening, importing, destroying, exporting a pool, and syncing a
43 * pool.
44 */
45
46#include <sys/zfs_context.h>
47#include <sys/fm/fs/zfs.h>
48#include <sys/spa_impl.h>
49#include <sys/zio.h>
50#include <sys/zio_checksum.h>
34dc7c2f
BB
51#include <sys/dmu.h>
52#include <sys/dmu_tx.h>
53#include <sys/zap.h>
54#include <sys/zil.h>
428870ff 55#include <sys/ddt.h>
34dc7c2f 56#include <sys/vdev_impl.h>
a1d477c2
MA
57#include <sys/vdev_removal.h>
58#include <sys/vdev_indirect_mapping.h>
59#include <sys/vdev_indirect_births.h>
619f0976 60#include <sys/vdev_initialize.h>
9a49d3f3 61#include <sys/vdev_rebuild.h>
1b939560 62#include <sys/vdev_trim.h>
c28b2279 63#include <sys/vdev_disk.h>
b2255edc 64#include <sys/vdev_draid.h>
34dc7c2f 65#include <sys/metaslab.h>
428870ff 66#include <sys/metaslab_impl.h>
379ca9cf 67#include <sys/mmp.h>
34dc7c2f
BB
68#include <sys/uberblock_impl.h>
69#include <sys/txg.h>
70#include <sys/avl.h>
a1d477c2 71#include <sys/bpobj.h>
34dc7c2f
BB
72#include <sys/dmu_traverse.h>
73#include <sys/dmu_objset.h>
74#include <sys/unique.h>
75#include <sys/dsl_pool.h>
76#include <sys/dsl_dataset.h>
77#include <sys/dsl_dir.h>
78#include <sys/dsl_prop.h>
79#include <sys/dsl_synctask.h>
80#include <sys/fs/zfs.h>
81#include <sys/arc.h>
82#include <sys/callb.h>
83#include <sys/systeminfo.h>
34dc7c2f 84#include <sys/spa_boot.h>
9babb374 85#include <sys/zfs_ioctl.h>
428870ff 86#include <sys/dsl_scan.h>
9ae529ec 87#include <sys/zfeature.h>
13fe0198 88#include <sys/dsl_destroy.h>
526af785 89#include <sys/zvol.h>
34dc7c2f 90
d164b209 91#ifdef _KERNEL
12fa0466
DE
92#include <sys/fm/protocol.h>
93#include <sys/fm/util.h>
428870ff 94#include <sys/callb.h>
d164b209 95#include <sys/zone.h>
c8242a96 96#include <sys/vmsystm.h>
d164b209
BB
97#endif /* _KERNEL */
98
34dc7c2f
BB
99#include "zfs_prop.h"
100#include "zfs_comutil.h"
101
e6cfd633
WA
102/*
103 * The interval, in seconds, at which failed configuration cache file writes
104 * should be retried.
105 */
a1d477c2 106int zfs_ccw_retry_interval = 300;
e6cfd633 107
428870ff 108typedef enum zti_modes {
7ef5e54e 109 ZTI_MODE_FIXED, /* value is # of threads (min 1) */
7ef5e54e 110 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
7457b024 111 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */
7ef5e54e
AL
112 ZTI_MODE_NULL, /* don't create a taskq */
113 ZTI_NMODES
428870ff 114} zti_modes_t;
34dc7c2f 115
7ef5e54e
AL
116#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
117#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
118#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
7457b024 119#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 }
7ef5e54e 120#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
9babb374 121
7ef5e54e
AL
122#define ZTI_N(n) ZTI_P(n, 1)
123#define ZTI_ONE ZTI_N(1)
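/*
 * For reference, the initializers these macros produce: ZTI_P(8, 2)
 * expands to { ZTI_MODE_FIXED, 8, 2 }, i.e. two taskqs of eight threads
 * each; ZTI_ONE is { ZTI_MODE_FIXED, 1, 1 }; and ZTI_NULL is
 * { ZTI_MODE_NULL, 0, 0 }, which suppresses taskq creation entirely.
 */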
9babb374
BB
124
125typedef struct zio_taskq_info {
7ef5e54e 126 zti_modes_t zti_mode;
428870ff 127 uint_t zti_value;
7ef5e54e 128 uint_t zti_count;
9babb374
BB
129} zio_taskq_info_t;
130
131static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
451041db 132 "iss", "iss_h", "int", "int_h"
9babb374
BB
133};
134
428870ff 135/*
7ef5e54e
AL
136 * This table defines the taskq settings for each ZFS I/O type. When
137 * initializing a pool, we use this table to create an appropriately sized
138 * taskq. Some operations are low volume and therefore have a small, static
139 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
140 * macros. Other operations process a large amount of data; the ZTI_BATCH
141 * macro causes us to create a taskq oriented for throughput. Some operations
1b939560 142 * are so high frequency and short-lived that the taskq itself can become a
7ef5e54e
AL
143 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
144 * additional degree of parallelism specified by the number of threads per-
145 * taskq and the number of taskqs; when dispatching an event in this case, the
7457b024
AM
146 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
147 * but with number of taskqs also scaling with number of CPUs.
7ef5e54e
AL
148 *
149 * The different taskq priorities are to handle the different contexts (issue
150 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
151 * need to be handled with minimum delay.
428870ff 152 */
18168da7 153static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
428870ff 154 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
7ef5e54e 155 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
7457b024
AM
156 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
157 { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
158 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
7ef5e54e
AL
159 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
160 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
1b939560 161 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
9babb374
BB
162};
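/*
 * As an example of reading the table above, the READ row
 * { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL } gives reads a single issue
 * taskq with eight threads, no issue_high taskq, a CPU-scaled set of
 * interrupt taskqs (ZTI_MODE_SCALE), and no intr_high taskq.
 */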
163
13fe0198
MA
164static void spa_sync_version(void *arg, dmu_tx_t *tx);
165static void spa_sync_props(void *arg, dmu_tx_t *tx);
b128c09f 166static boolean_t spa_has_active_shared_spare(spa_t *spa);
a926aab9
AZ
167static int spa_load_impl(spa_t *spa, spa_import_type_t type,
168 const char **ereport);
572e2857 169static void spa_vdev_resilver_done(spa_t *spa);
428870ff 170
18168da7
AZ
171static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
172static uint_t zio_taskq_batch_tpq; /* threads per taskq */
173static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
174static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
428870ff 175
18168da7 176static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
428870ff 177
afd2f7b7
PZ
178/*
179 * Report any spa_load_verify errors found, but do not fail spa_load.
180 * This is used by zdb to analyze non-idle pools.
181 */
182boolean_t spa_load_verify_dryrun = B_FALSE;
183
e39fe05b
FU
184/*
185 * Allow reading spacemaps in case of readonly import (spa_mode == SPA_MODE_READ).
186 * This is used by zdb for spacemaps verification.
187 */
188boolean_t spa_mode_readable_spacemaps = B_FALSE;
189
428870ff
BB
190/*
191 * This (illegal) pool name is used when temporarily importing a spa_t in order
192 * to get the vdev stats associated with the imported devices.
193 */
194#define TRYIMPORT_NAME "$import"
34dc7c2f 195
6cb8e530
PZ
196/*
197 * For debugging purposes: print out vdev tree during pool import.
198 */
18168da7 199static int spa_load_print_vdev_tree = B_FALSE;
6cb8e530
PZ
200
201/*
202 * A non-zero value for zfs_max_missing_tvds means that we allow importing
203 * pools with missing top-level vdevs. This is strictly intended for advanced
204 * pool recovery cases since missing data is almost inevitable. Pools with
205 * missing devices can only be imported read-only for safety reasons, and their
206 * fail-mode will be automatically set to "continue".
207 *
208 * With 1 missing vdev we should be able to import the pool and mount all
209 * datasets. User data that was not modified after the missing device has been
210 * added should be recoverable. This means that snapshots created prior to the
211 * addition of that device should be completely intact.
212 *
213 * With 2 missing vdevs, some datasets may fail to mount since there are
214 * dataset statistics that are stored as regular metadata. Some data might be
215 * recoverable if those vdevs were added recently.
216 *
217 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
218 * may be missing entirely. Chances of data recovery are very low. Note that
219 * there are also risks of performing an inadvertent rewind as we might be
220 * missing all the vdevs with the latest uberblocks.
221 */
222unsigned long zfs_max_missing_tvds = 0;
223
224/*
225 * The parameters below are similar to zfs_max_missing_tvds but are only
226 * intended for a preliminary open of the pool with an untrusted config which
227 * might be incomplete or out-dated.
228 *
229 * We are more tolerant for pools opened from a cachefile since we could have
230 * an out-dated cachefile where a device removal was not registered.
231 * We could have set the limit arbitrarily high but in the case where devices
232 * are really missing we would want to return the proper error codes; we chose
233 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
234 * and we get a chance to retrieve the trusted config.
235 */
236uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
d2734cce 237
6cb8e530
PZ
238/*
239 * In the case where config was assembled by scanning device paths (/dev/dsks
240 * by default) we are less tolerant since all the existing devices should have
241 * been detected and we want spa_load to return the right error codes.
242 */
243uint64_t zfs_max_missing_tvds_scan = 0;
244
d2734cce
SD
245/*
246 * Debugging aid that pauses spa_sync() towards the end.
247 */
18168da7 248static const boolean_t zfs_pause_spa_sync = B_FALSE;
d2734cce 249
37f03da8
SH
250/*
251 * Variables to indicate the livelist condense zthr func should wait at certain
252 * points for the livelist to be removed - used to test condense/destroy races
253 */
18168da7
AZ
254static int zfs_livelist_condense_zthr_pause = 0;
255static int zfs_livelist_condense_sync_pause = 0;
37f03da8
SH
256
257/*
258 * Variables to track whether or not condense cancellation has been
259 * triggered in testing.
260 */
18168da7
AZ
261static int zfs_livelist_condense_sync_cancel = 0;
262static int zfs_livelist_condense_zthr_cancel = 0;
37f03da8
SH
263
264/*
265 * Variable to track whether or not extra ALLOC blkptrs were added to a
266 * livelist entry while it was being condensed (caused by the way we track
267 * remapped blkptrs in dbuf_remap_impl)
268 */
18168da7 269static int zfs_livelist_condense_new_alloc = 0;
37f03da8 270
34dc7c2f
BB
271/*
272 * ==========================================================================
273 * SPA properties routines
274 * ==========================================================================
275 */
276
277/*
278 * Add a (source=src, propname=propval) list to an nvlist.
279 */
280static void
a926aab9 281spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
34dc7c2f
BB
282 uint64_t intval, zprop_source_t src)
283{
284 const char *propname = zpool_prop_to_name(prop);
285 nvlist_t *propval;
286
65ad5d11
AJ
287 propval = fnvlist_alloc();
288 fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
34dc7c2f
BB
289
290 if (strval != NULL)
65ad5d11 291 fnvlist_add_string(propval, ZPROP_VALUE, strval);
34dc7c2f 292 else
65ad5d11 293 fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
34dc7c2f 294
65ad5d11 295 fnvlist_add_nvlist(nvl, propname, propval);
34dc7c2f
BB
296 nvlist_free(propval);
297}
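/*
 * A minimal usage sketch, mirroring the calls made in
 * spa_prop_get_config() below: string-valued properties pass strval and
 * ignore intval, while numeric properties pass NULL for strval:
 *
 *	spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 *	spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 */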
298
299/*
300 * Get property values from the spa configuration.
301 */
302static void
303spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
304{
1bd201e7 305 vdev_t *rvd = spa->spa_root_vdev;
9ae529ec 306 dsl_pool_t *pool = spa->spa_dsl_pool;
f3a7f661 307 uint64_t size, alloc, cap, version;
82ab6848 308 const zprop_source_t src = ZPROP_SRC_NONE;
b128c09f 309 spa_config_dirent_t *dp;
f3a7f661 310 metaslab_class_t *mc = spa_normal_class(spa);
b128c09f
BB
311
312 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
34dc7c2f 313
1bd201e7 314 if (rvd != NULL) {
cc99f275
DB
315 alloc = metaslab_class_get_alloc(mc);
316 alloc += metaslab_class_get_alloc(spa_special_class(spa));
317 alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
aa755b35 318 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
cc99f275
DB
319
320 size = metaslab_class_get_space(mc);
321 size += metaslab_class_get_space(spa_special_class(spa));
322 size += metaslab_class_get_space(spa_dedup_class(spa));
aa755b35 323 size += metaslab_class_get_space(spa_embedded_log_class(spa));
cc99f275 324
d164b209
BB
325 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
326 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
428870ff
BB
327 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
328 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
329 size - alloc, src);
d2734cce
SD
330 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
331 spa->spa_checkpoint_info.sci_dspace, src);
1bd201e7 332
f3a7f661
GW
333 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
334 metaslab_class_fragmentation(mc), src);
335 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
336 metaslab_class_expandable_space(mc), src);
572e2857 337 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
da92d5cb 338 (spa_mode(spa) == SPA_MODE_READ), src);
d164b209 339
428870ff 340 cap = (size == 0) ? 0 : (alloc * 100 / size);
d164b209
BB
341 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
342
428870ff
BB
343 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
344 ddt_get_pool_dedup_ratio(spa), src);
345
d164b209 346 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
1bd201e7 347 rvd->vdev_state, src);
d164b209
BB
348
349 version = spa_version(spa);
82ab6848
HM
350 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
351 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
352 version, ZPROP_SRC_DEFAULT);
353 } else {
354 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
355 version, ZPROP_SRC_LOCAL);
356 }
a448a255
SD
357 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
358 NULL, spa_load_guid(spa), src);
d164b209 359 }
34dc7c2f 360
9ae529ec 361 if (pool != NULL) {
9ae529ec
CS
362 /*
363 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
364 * when opening pools before this version, freedir will be NULL.
365 */
fbeddd60 366 if (pool->dp_free_dir != NULL) {
9ae529ec 367 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
d683ddbb
JG
368 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
369 src);
9ae529ec
CS
370 } else {
371 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
372 NULL, 0, src);
373 }
fbeddd60
MA
374
375 if (pool->dp_leak_dir != NULL) {
376 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
d683ddbb
JG
377 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
378 src);
fbeddd60
MA
379 } else {
380 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
381 NULL, 0, src);
382 }
9ae529ec
CS
383 }
384
34dc7c2f 385 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
34dc7c2f 386
d96eb2b1
DM
387 if (spa->spa_comment != NULL) {
388 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
389 0, ZPROP_SRC_LOCAL);
390 }
391
658fb802
CB
392 if (spa->spa_compatibility != NULL) {
393 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
394 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
395 }
396
34dc7c2f
BB
397 if (spa->spa_root != NULL)
398 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
399 0, ZPROP_SRC_LOCAL);
400
f1512ee6
MA
401 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
402 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
403 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
404 } else {
405 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
406 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
407 }
408
50c957f7
NB
409 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
410 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
411 DNODE_MAX_SIZE, ZPROP_SRC_NONE);
412 } else {
413 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
414 DNODE_MIN_SIZE, ZPROP_SRC_NONE);
415 }
416
b128c09f
BB
417 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
418 if (dp->scd_path == NULL) {
34dc7c2f 419 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
b128c09f
BB
420 "none", 0, ZPROP_SRC_LOCAL);
421 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
34dc7c2f 422 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
b128c09f 423 dp->scd_path, 0, ZPROP_SRC_LOCAL);
34dc7c2f
BB
424 }
425 }
426}
427
428/*
429 * Get zpool property values.
430 */
431int
432spa_prop_get(spa_t *spa, nvlist_t **nvp)
433{
428870ff 434 objset_t *mos = spa->spa_meta_objset;
34dc7c2f
BB
435 zap_cursor_t zc;
436 zap_attribute_t za;
1743c737 437 dsl_pool_t *dp;
34dc7c2f
BB
438 int err;
439
79c76d5b 440 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
c28b2279 441 if (err)
d1d7e268 442 return (err);
34dc7c2f 443
1743c737
AM
444 dp = spa_get_dsl(spa);
445 dsl_pool_config_enter(dp, FTAG);
b128c09f
BB
446 mutex_enter(&spa->spa_props_lock);
447
34dc7c2f
BB
448 /*
449 * Get properties from the spa config.
450 */
451 spa_prop_get_config(spa, nvp);
452
34dc7c2f 453 /* If no pool property object, no more prop to get. */
1743c737 454 if (mos == NULL || spa->spa_pool_props_object == 0)
c28b2279 455 goto out;
34dc7c2f
BB
456
457 /*
458 * Get properties from the MOS pool property object.
459 */
460 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
461 (err = zap_cursor_retrieve(&zc, &za)) == 0;
462 zap_cursor_advance(&zc)) {
463 uint64_t intval = 0;
464 char *strval = NULL;
465 zprop_source_t src = ZPROP_SRC_DEFAULT;
466 zpool_prop_t prop;
467
31864e3d 468 if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
34dc7c2f
BB
469 continue;
470
471 switch (za.za_integer_length) {
472 case 8:
473 /* integer property */
474 if (za.za_first_integer !=
475 zpool_prop_default_numeric(prop))
476 src = ZPROP_SRC_LOCAL;
477
478 if (prop == ZPOOL_PROP_BOOTFS) {
34dc7c2f
BB
479 dsl_dataset_t *ds = NULL;
480
619f0976
GW
481 err = dsl_dataset_hold_obj(dp,
482 za.za_first_integer, FTAG, &ds);
1743c737 483 if (err != 0)
34dc7c2f 484 break;
34dc7c2f 485
eca7b760 486 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
79c76d5b 487 KM_SLEEP);
34dc7c2f 488 dsl_dataset_name(ds, strval);
b128c09f 489 dsl_dataset_rele(ds, FTAG);
34dc7c2f
BB
490 } else {
491 strval = NULL;
492 intval = za.za_first_integer;
493 }
494
495 spa_prop_add_list(*nvp, prop, strval, intval, src);
496
497 if (strval != NULL)
eca7b760 498 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
34dc7c2f
BB
499
500 break;
501
502 case 1:
503 /* string property */
79c76d5b 504 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
34dc7c2f
BB
505 err = zap_lookup(mos, spa->spa_pool_props_object,
506 za.za_name, 1, za.za_num_integers, strval);
507 if (err) {
508 kmem_free(strval, za.za_num_integers);
509 break;
510 }
511 spa_prop_add_list(*nvp, prop, strval, 0, src);
512 kmem_free(strval, za.za_num_integers);
513 break;
514
515 default:
516 break;
517 }
518 }
519 zap_cursor_fini(&zc);
34dc7c2f 520out:
1743c737
AM
521 mutex_exit(&spa->spa_props_lock);
522 dsl_pool_config_exit(dp, FTAG);
34dc7c2f
BB
523 if (err && err != ENOENT) {
524 nvlist_free(*nvp);
525 *nvp = NULL;
526 return (err);
527 }
528
529 return (0);
530}
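/*
 * Sketch of the calling convention (the caller shown here is
 * hypothetical): spa_prop_get() allocates the nvlist itself, so on
 * success the caller owns it and must free it:
 *
 *	nvlist_t *props = NULL;
 *	if (spa_prop_get(spa, &props) == 0) {
 *		... consume props ...
 *		nvlist_free(props);
 *	}
 */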
531
532/*
533 * Validate the given pool properties nvlist and modify the list
534 * for the property values to be set.
535 */
536static int
537spa_prop_validate(spa_t *spa, nvlist_t *props)
538{
539 nvpair_t *elem;
540 int error = 0, reset_bootfs = 0;
d4ed6673 541 uint64_t objnum = 0;
9ae529ec 542 boolean_t has_feature = B_FALSE;
34dc7c2f
BB
543
544 elem = NULL;
545 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
34dc7c2f 546 uint64_t intval;
9ae529ec
CS
547 char *strval, *slash, *check, *fname;
548 const char *propname = nvpair_name(elem);
549 zpool_prop_t prop = zpool_name_to_prop(propname);
550
31864e3d
BB
551 switch (prop) {
552 case ZPOOL_PROP_INVAL:
9ae529ec 553 if (!zpool_prop_feature(propname)) {
2e528b49 554 error = SET_ERROR(EINVAL);
9ae529ec
CS
555 break;
556 }
557
558 /*
559 * Sanitize the input.
560 */
561 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
2e528b49 562 error = SET_ERROR(EINVAL);
9ae529ec
CS
563 break;
564 }
565
566 if (nvpair_value_uint64(elem, &intval) != 0) {
2e528b49 567 error = SET_ERROR(EINVAL);
9ae529ec
CS
568 break;
569 }
34dc7c2f 570
9ae529ec 571 if (intval != 0) {
2e528b49 572 error = SET_ERROR(EINVAL);
9ae529ec
CS
573 break;
574 }
34dc7c2f 575
9ae529ec
CS
576 fname = strchr(propname, '@') + 1;
577 if (zfeature_lookup_name(fname, NULL) != 0) {
2e528b49 578 error = SET_ERROR(EINVAL);
9ae529ec
CS
579 break;
580 }
581
582 has_feature = B_TRUE;
583 break;
34dc7c2f 584
34dc7c2f
BB
585 case ZPOOL_PROP_VERSION:
586 error = nvpair_value_uint64(elem, &intval);
587 if (!error &&
9ae529ec
CS
588 (intval < spa_version(spa) ||
589 intval > SPA_VERSION_BEFORE_FEATURES ||
590 has_feature))
2e528b49 591 error = SET_ERROR(EINVAL);
34dc7c2f
BB
592 break;
593
594 case ZPOOL_PROP_DELEGATION:
595 case ZPOOL_PROP_AUTOREPLACE:
b128c09f 596 case ZPOOL_PROP_LISTSNAPS:
9babb374 597 case ZPOOL_PROP_AUTOEXPAND:
1b939560 598 case ZPOOL_PROP_AUTOTRIM:
34dc7c2f
BB
599 error = nvpair_value_uint64(elem, &intval);
600 if (!error && intval > 1)
2e528b49 601 error = SET_ERROR(EINVAL);
34dc7c2f
BB
602 break;
603
379ca9cf
OF
604 case ZPOOL_PROP_MULTIHOST:
605 error = nvpair_value_uint64(elem, &intval);
606 if (!error && intval > 1)
607 error = SET_ERROR(EINVAL);
608
25f06d67
BB
609 if (!error) {
610 uint32_t hostid = zone_get_hostid(NULL);
611 if (hostid)
612 spa->spa_hostid = hostid;
613 else
614 error = SET_ERROR(ENOTSUP);
615 }
379ca9cf
OF
616
617 break;
618
34dc7c2f 619 case ZPOOL_PROP_BOOTFS:
9babb374
BB
620 /*
621 * If the pool version is less than SPA_VERSION_BOOTFS,
622 * or the pool is still being created (version == 0),
623 * the bootfs property cannot be set.
624 */
34dc7c2f 625 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
2e528b49 626 error = SET_ERROR(ENOTSUP);
34dc7c2f
BB
627 break;
628 }
629
630 /*
b128c09f 631 * Make sure the vdev config is bootable
34dc7c2f 632 */
b128c09f 633 if (!vdev_is_bootable(spa->spa_root_vdev)) {
2e528b49 634 error = SET_ERROR(ENOTSUP);
34dc7c2f
BB
635 break;
636 }
637
638 reset_bootfs = 1;
639
640 error = nvpair_value_string(elem, &strval);
641
642 if (!error) {
9ae529ec 643 objset_t *os;
b128c09f 644
34dc7c2f
BB
645 if (strval == NULL || strval[0] == '\0') {
646 objnum = zpool_prop_default_numeric(
647 ZPOOL_PROP_BOOTFS);
648 break;
649 }
650
d1d7e268 651 error = dmu_objset_hold(strval, FTAG, &os);
619f0976 652 if (error != 0)
34dc7c2f 653 break;
b128c09f 654
eaa25f1a 655 /* Must be ZPL. */
428870ff 656 if (dmu_objset_type(os) != DMU_OST_ZFS) {
2e528b49 657 error = SET_ERROR(ENOTSUP);
b128c09f
BB
658 } else {
659 objnum = dmu_objset_id(os);
660 }
428870ff 661 dmu_objset_rele(os, FTAG);
34dc7c2f
BB
662 }
663 break;
b128c09f 664
34dc7c2f
BB
665 case ZPOOL_PROP_FAILUREMODE:
666 error = nvpair_value_uint64(elem, &intval);
3bfd95d5 667 if (!error && intval > ZIO_FAILURE_MODE_PANIC)
2e528b49 668 error = SET_ERROR(EINVAL);
34dc7c2f
BB
669
670 /*
671 * This is a special case which only occurs when
672 * the pool has completely failed. This allows
673 * the user to change the in-core failmode property
674 * without syncing it out to disk (I/Os might
675 * currently be blocked). We do this by returning
676 * EIO to the caller (spa_prop_set) to trick it
677 * into thinking we encountered a property validation
678 * error.
679 */
b128c09f 680 if (!error && spa_suspended(spa)) {
34dc7c2f 681 spa->spa_failmode = intval;
2e528b49 682 error = SET_ERROR(EIO);
34dc7c2f
BB
683 }
684 break;
685
686 case ZPOOL_PROP_CACHEFILE:
687 if ((error = nvpair_value_string(elem, &strval)) != 0)
688 break;
689
690 if (strval[0] == '\0')
691 break;
692
693 if (strcmp(strval, "none") == 0)
694 break;
695
696 if (strval[0] != '/') {
2e528b49 697 error = SET_ERROR(EINVAL);
34dc7c2f
BB
698 break;
699 }
700
701 slash = strrchr(strval, '/');
702 ASSERT(slash != NULL);
703
704 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
705 strcmp(slash, "/..") == 0)
2e528b49 706 error = SET_ERROR(EINVAL);
34dc7c2f 707 break;
428870ff 708
d96eb2b1
DM
709 case ZPOOL_PROP_COMMENT:
710 if ((error = nvpair_value_string(elem, &strval)) != 0)
711 break;
712 for (check = strval; *check != '\0'; check++) {
713 if (!isprint(*check)) {
2e528b49 714 error = SET_ERROR(EINVAL);
d96eb2b1
DM
715 break;
716 }
d96eb2b1
DM
717 }
718 if (strlen(strval) > ZPROP_MAX_COMMENT)
2e528b49 719 error = SET_ERROR(E2BIG);
d96eb2b1
DM
720 break;
721
e75c13c3
BB
722 default:
723 break;
34dc7c2f
BB
724 }
725
726 if (error)
727 break;
728 }
729
050d720c
MA
730 (void) nvlist_remove_all(props,
731 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
732
34dc7c2f
BB
733 if (!error && reset_bootfs) {
734 error = nvlist_remove(props,
735 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
736
737 if (!error) {
738 error = nvlist_add_uint64(props,
739 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
740 }
741 }
742
743 return (error);
744}
745
d164b209
BB
746void
747spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
748{
749 char *cachefile;
750 spa_config_dirent_t *dp;
751
752 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
753 &cachefile) != 0)
754 return;
755
756 dp = kmem_alloc(sizeof (spa_config_dirent_t),
79c76d5b 757 KM_SLEEP);
d164b209
BB
758
759 if (cachefile[0] == '\0')
760 dp->scd_path = spa_strdup(spa_config_path);
761 else if (strcmp(cachefile, "none") == 0)
762 dp->scd_path = NULL;
763 else
764 dp->scd_path = spa_strdup(cachefile);
765
766 list_insert_head(&spa->spa_config_list, dp);
767 if (need_sync)
768 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
769}
770
34dc7c2f
BB
771int
772spa_prop_set(spa_t *spa, nvlist_t *nvp)
773{
774 int error;
9ae529ec 775 nvpair_t *elem = NULL;
d164b209 776 boolean_t need_sync = B_FALSE;
34dc7c2f
BB
777
778 if ((error = spa_prop_validate(spa, nvp)) != 0)
779 return (error);
780
d164b209 781 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
9ae529ec 782 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
d164b209 783
572e2857
BB
784 if (prop == ZPOOL_PROP_CACHEFILE ||
785 prop == ZPOOL_PROP_ALTROOT ||
786 prop == ZPOOL_PROP_READONLY)
d164b209
BB
787 continue;
788
31864e3d 789 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
2a673e76 790 uint64_t ver = 0;
9ae529ec
CS
791
792 if (prop == ZPOOL_PROP_VERSION) {
793 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
794 } else {
795 ASSERT(zpool_prop_feature(nvpair_name(elem)));
796 ver = SPA_VERSION_FEATURES;
797 need_sync = B_TRUE;
798 }
799
800 /* Save time if the version is already set. */
801 if (ver == spa_version(spa))
802 continue;
803
804 /*
805 * In addition to the pool directory object, we might
806 * create the pool properties object, the features for
807 * read object, the features for write object, or the
808 * feature descriptions object.
809 */
13fe0198 810 error = dsl_sync_task(spa->spa_name, NULL,
3d45fdd6
MA
811 spa_sync_version, &ver,
812 6, ZFS_SPACE_CHECK_RESERVED);
9ae529ec
CS
813 if (error)
814 return (error);
815 continue;
816 }
817
d164b209
BB
818 need_sync = B_TRUE;
819 break;
820 }
821
9ae529ec 822 if (need_sync) {
13fe0198 823 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
3d45fdd6 824 nvp, 6, ZFS_SPACE_CHECK_RESERVED));
9ae529ec
CS
825 }
826
827 return (0);
34dc7c2f
BB
828}
829
830/*
831 * If the bootfs property value is dsobj, clear it.
832 */
833void
834spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
835{
836 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
837 VERIFY(zap_remove(spa->spa_meta_objset,
838 spa->spa_pool_props_object,
839 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
840 spa->spa_bootfs = 0;
841 }
842}
843
3bc7e0fb 844static int
13fe0198 845spa_change_guid_check(void *arg, dmu_tx_t *tx)
3bc7e0fb 846{
2a8ba608 847 uint64_t *newguid __maybe_unused = arg;
13fe0198 848 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3bc7e0fb
GW
849 vdev_t *rvd = spa->spa_root_vdev;
850 uint64_t vdev_state;
3bc7e0fb 851
d2734cce
SD
852 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
853 int error = (spa_has_checkpoint(spa)) ?
854 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
855 return (SET_ERROR(error));
856 }
857
3bc7e0fb
GW
858 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
859 vdev_state = rvd->vdev_state;
860 spa_config_exit(spa, SCL_STATE, FTAG);
861
862 if (vdev_state != VDEV_STATE_HEALTHY)
2e528b49 863 return (SET_ERROR(ENXIO));
3bc7e0fb
GW
864
865 ASSERT3U(spa_guid(spa), !=, *newguid);
866
867 return (0);
868}
869
870static void
13fe0198 871spa_change_guid_sync(void *arg, dmu_tx_t *tx)
3bc7e0fb 872{
13fe0198
MA
873 uint64_t *newguid = arg;
874 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3bc7e0fb
GW
875 uint64_t oldguid;
876 vdev_t *rvd = spa->spa_root_vdev;
877
878 oldguid = spa_guid(spa);
879
880 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
881 rvd->vdev_guid = *newguid;
882 rvd->vdev_guid_sum += (*newguid - oldguid);
883 vdev_config_dirty(rvd);
884 spa_config_exit(spa, SCL_STATE, FTAG);
885
6f1ffb06 886 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
74756182 887 (u_longlong_t)oldguid, (u_longlong_t)*newguid);
3bc7e0fb
GW
888}
889
3541dc6d
GA
890/*
891 * Change the GUID for the pool. This is done so that we can later
892 * re-import a pool built from a clone of our own vdevs. We will modify
893 * the root vdev's guid, our own pool guid, and then mark all of our
894 * vdevs dirty. Note that we must make sure that all our vdevs are
895 * online when we do this, or else any vdevs that weren't present
896 * would be orphaned from our pool. We are also going to issue a
897 * sysevent to update any watchers.
898 */
899int
900spa_change_guid(spa_t *spa)
901{
3bc7e0fb
GW
902 int error;
903 uint64_t guid;
3541dc6d 904
621dd7bb 905 mutex_enter(&spa->spa_vdev_top_lock);
3bc7e0fb
GW
906 mutex_enter(&spa_namespace_lock);
907 guid = spa_generate_guid(NULL);
3541dc6d 908
13fe0198 909 error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
3d45fdd6 910 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
3541dc6d 911
3bc7e0fb 912 if (error == 0) {
a1d477c2 913 spa_write_cachefile(spa, B_FALSE, B_TRUE);
12fa0466 914 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
3bc7e0fb 915 }
3541dc6d 916
3bc7e0fb 917 mutex_exit(&spa_namespace_lock);
621dd7bb 918 mutex_exit(&spa->spa_vdev_top_lock);
3541dc6d 919
3bc7e0fb 920 return (error);
3541dc6d
GA
921}
922
34dc7c2f
BB
923/*
924 * ==========================================================================
925 * SPA state manipulation (open/create/destroy/import/export)
926 * ==========================================================================
927 */
928
929static int
930spa_error_entry_compare(const void *a, const void *b)
931{
ee36c709
GN
932 const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
933 const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
34dc7c2f
BB
934 int ret;
935
ee36c709 936 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
5dbd68a3 937 sizeof (zbookmark_phys_t));
34dc7c2f 938
ca577779 939 return (TREE_ISIGN(ret));
34dc7c2f
BB
940}
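/*
 * TREE_ISIGN() collapses the raw memcmp() result into the -1/0/+1 range
 * that the AVL code expects from a comparator; for example a positive
 * memcmp() value becomes 1 and a negative one becomes -1.
 */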
941
942/*
943 * Utility function which retrieves copies of the current logs and
944 * re-initializes them in the process.
945 */
946void
947spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
948{
949 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
950
861166b0
AZ
951 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
952 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));
34dc7c2f
BB
953
954 avl_create(&spa->spa_errlist_scrub,
955 spa_error_entry_compare, sizeof (spa_error_entry_t),
956 offsetof(spa_error_entry_t, se_avl));
957 avl_create(&spa->spa_errlist_last,
958 spa_error_entry_compare, sizeof (spa_error_entry_t),
959 offsetof(spa_error_entry_t, se_avl));
960}
961
7ef5e54e
AL
962static void
963spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
34dc7c2f 964{
7ef5e54e
AL
965 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
966 enum zti_modes mode = ztip->zti_mode;
967 uint_t value = ztip->zti_value;
968 uint_t count = ztip->zti_count;
969 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
7457b024 970 uint_t cpus, flags = TASKQ_DYNAMIC;
428870ff 971 boolean_t batch = B_FALSE;
34dc7c2f 972
e8b96c60
MA
973 switch (mode) {
974 case ZTI_MODE_FIXED:
7457b024 975 ASSERT3U(value, >, 0);
e8b96c60 976 break;
7ef5e54e 977
e8b96c60
MA
978 case ZTI_MODE_BATCH:
979 batch = B_TRUE;
980 flags |= TASKQ_THREADS_CPU_PCT;
dcb6bed1 981 value = MIN(zio_taskq_batch_pct, 100);
e8b96c60 982 break;
7ef5e54e 983
7457b024
AM
984 case ZTI_MODE_SCALE:
985 flags |= TASKQ_THREADS_CPU_PCT;
986 /*
987 * We want more taskqs to reduce lock contention, but we want
988 * fewer for better request ordering and CPU utilization.
989 */
990 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
991 if (zio_taskq_batch_tpq > 0) {
992 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
993 zio_taskq_batch_tpq);
994 } else {
995 /*
996 * Prefer 6 threads per taskq, but no more taskqs
997 * than threads in them on large systems. For 80%:
998 *
999 * taskq taskq total
1000 * cpus taskqs percent threads threads
1001 * ------- ------- ------- ------- -------
1002 * 1 1 80% 1 1
1003 * 2 1 80% 1 1
1004 * 4 1 80% 3 3
1005 * 8 2 40% 3 6
1006 * 16 3 27% 4 12
1007 * 32 5 16% 5 25
1008 * 64 7 11% 7 49
1009 * 128 10 8% 10 100
1010 * 256 14 6% 15 210
1011 */
1012 count = 1 + cpus / 6;
1013 while (count * count > cpus)
1014 count--;
1015 }
1016 /* Limit each taskq within 100% to not trigger assertion. */
1017 count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
1018 value = (zio_taskq_batch_pct + count / 2) / count;
1019 break;
1020
1021 case ZTI_MODE_NULL:
1022 tqs->stqs_count = 0;
1023 tqs->stqs_taskq = NULL;
1024 return;
1025
e8b96c60
MA
1026 default:
1027 panic("unrecognized mode for %s_%s taskq (%u:%u) in "
1028 "spa_activate()",
1029 zio_type_name[t], zio_taskq_types[q], mode, value);
1030 break;
1031 }
7ef5e54e 1032
7457b024
AM
1033 ASSERT3U(count, >, 0);
1034 tqs->stqs_count = count;
1035 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
1036
1c27024e 1037 for (uint_t i = 0; i < count; i++) {
e8b96c60 1038 taskq_t *tq;
af430294 1039 char name[32];
7ef5e54e 1040
7457b024
AM
1041 if (count > 1)
1042 (void) snprintf(name, sizeof (name), "%s_%s_%u",
1043 zio_type_name[t], zio_taskq_types[q], i);
1044 else
1045 (void) snprintf(name, sizeof (name), "%s_%s",
1046 zio_type_name[t], zio_taskq_types[q]);
7ef5e54e
AL
1047
1048 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
1049 if (batch)
1050 flags |= TASKQ_DC_BATCH;
1051
18168da7 1052 (void) zio_taskq_basedc;
7ef5e54e
AL
1053 tq = taskq_create_sysdc(name, value, 50, INT_MAX,
1054 spa->spa_proc, zio_taskq_basedc, flags);
1055 } else {
e8b96c60
MA
1056 pri_t pri = maxclsyspri;
1057 /*
1058 * The write issue taskq can be extremely CPU
1229323d 1059 * intensive. Run it at slightly less important
7432d297
MM
1060 * priority than the other taskqs.
1061 *
1062 * Under Linux and FreeBSD this means incrementing
1063 * the priority value as opposed to platforms like
1064 * illumos where it should be decremented.
1065 *
1066 * On FreeBSD, if priorities divided by four (RQ_PPQ)
1067 * are equal then a difference between them is
1068 * insignificant.
e8b96c60 1069 */
7432d297
MM
1070 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
1071#if defined(__linux__)
1229323d 1072 pri++;
7432d297
MM
1073#elif defined(__FreeBSD__)
1074 pri += 4;
1075#else
1076#error "unknown OS"
1077#endif
1078 }
e8b96c60 1079 tq = taskq_create_proc(name, value, pri, 50,
7ef5e54e
AL
1080 INT_MAX, spa->spa_proc, flags);
1081 }
1082
1083 tqs->stqs_taskq[i] = tq;
1084 }
1085}
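/*
 * Worked example of the ZTI_MODE_SCALE sizing above, assuming 16 CPUs,
 * zio_taskq_batch_pct = 80 and zio_taskq_batch_tpq left unset: cpus is
 * MAX(1, 16 * 80 / 100) = 12, count starts at 1 + 12 / 6 = 3 (and since
 * 3 * 3 <= 12 it is not reduced), and value = (80 + 1) / 3 = 27, i.e.
 * three taskqs each capped at 27% of the CPUs, which
 * TASKQ_THREADS_CPU_PCT turns into about four threads per taskq on that
 * machine -- the 16-CPU row of the table in the comment.
 */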
1086
1087static void
1088spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
1089{
1090 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
7ef5e54e
AL
1091
1092 if (tqs->stqs_taskq == NULL) {
1093 ASSERT3U(tqs->stqs_count, ==, 0);
1094 return;
1095 }
1096
1c27024e 1097 for (uint_t i = 0; i < tqs->stqs_count; i++) {
7ef5e54e
AL
1098 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1099 taskq_destroy(tqs->stqs_taskq[i]);
428870ff 1100 }
34dc7c2f 1101
7ef5e54e
AL
1102 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1103 tqs->stqs_taskq = NULL;
1104}
34dc7c2f 1105
7ef5e54e
AL
1106/*
1107 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1108 * Note that a type may have multiple discrete taskqs to avoid lock contention
1109 * on the taskq itself. In that case we choose which taskq at random by using
1110 * the low bits of gethrtime().
1111 */
1112void
1113spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1114 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
1115{
1116 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1117 taskq_t *tq;
1118
1119 ASSERT3P(tqs->stqs_taskq, !=, NULL);
1120 ASSERT3U(tqs->stqs_count, !=, 0);
1121
1122 if (tqs->stqs_count == 1) {
1123 tq = tqs->stqs_taskq[0];
1124 } else {
c12936b1 1125 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
428870ff 1126 }
7ef5e54e
AL
1127
1128 taskq_dispatch_ent(tq, func, arg, flags, ent);
428870ff
BB
1129}
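/*
 * Dispatch sketch (argument values are illustrative; the real callers
 * live in the zio pipeline). The caller provides a pre-allocated
 * taskq_ent_t, typically embedded in the object being dispatched, so the
 * dispatch path never has to allocate:
 *
 *	spa_taskq_dispatch_ent(spa, ZIO_TYPE_WRITE, ZIO_TASKQ_ISSUE,
 *	    zio_execute, zio, 0, &zio->io_tqent);
 */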
1130
044baf00
BB
1131/*
1132 * Same as spa_taskq_dispatch_ent() but block on the task until completion.
1133 */
1134void
1135spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1136 task_func_t *func, void *arg, uint_t flags)
1137{
1138 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1139 taskq_t *tq;
1140 taskqid_t id;
1141
1142 ASSERT3P(tqs->stqs_taskq, !=, NULL);
1143 ASSERT3U(tqs->stqs_count, !=, 0);
1144
1145 if (tqs->stqs_count == 1) {
1146 tq = tqs->stqs_taskq[0];
1147 } else {
c12936b1 1148 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
044baf00
BB
1149 }
1150
1151 id = taskq_dispatch(tq, func, arg, flags);
1152 if (id)
1153 taskq_wait_id(tq, id);
1154}
1155
428870ff
BB
1156static void
1157spa_create_zio_taskqs(spa_t *spa)
1158{
1c27024e
DB
1159 for (int t = 0; t < ZIO_TYPES; t++) {
1160 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
7ef5e54e 1161 spa_taskqs_init(spa, t, q);
428870ff
BB
1162 }
1163 }
1164}
9babb374 1165
c25b8f99
BB
1166/*
1167 * Disabled until spa_thread() can be adapted for Linux.
1168 */
1169#undef HAVE_SPA_THREAD
1170
7b89a549 1171#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
428870ff
BB
1172static void
1173spa_thread(void *arg)
1174{
93ce2b4c 1175 psetid_t zio_taskq_psrset_bind = PS_NONE;
428870ff 1176 callb_cpr_t cprinfo;
9babb374 1177
428870ff
BB
1178 spa_t *spa = arg;
1179 user_t *pu = PTOU(curproc);
9babb374 1180
428870ff
BB
1181 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1182 spa->spa_name);
9babb374 1183
428870ff
BB
1184 ASSERT(curproc != &p0);
1185 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1186 "zpool-%s", spa->spa_name);
1187 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1188
1189 /* bind this thread to the requested psrset */
1190 if (zio_taskq_psrset_bind != PS_NONE) {
1191 pool_lock();
1192 mutex_enter(&cpu_lock);
1193 mutex_enter(&pidlock);
1194 mutex_enter(&curproc->p_lock);
1195
1196 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1197 0, NULL, NULL) == 0) {
1198 curthread->t_bind_pset = zio_taskq_psrset_bind;
1199 } else {
1200 cmn_err(CE_WARN,
1201 "Couldn't bind process for zfs pool \"%s\" to "
1202 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1203 }
1204
1205 mutex_exit(&curproc->p_lock);
1206 mutex_exit(&pidlock);
1207 mutex_exit(&cpu_lock);
1208 pool_unlock();
1209 }
1210
1211 if (zio_taskq_sysdc) {
1212 sysdc_thread_enter(curthread, 100, 0);
1213 }
1214
1215 spa->spa_proc = curproc;
1216 spa->spa_did = curthread->t_did;
1217
1218 spa_create_zio_taskqs(spa);
1219
1220 mutex_enter(&spa->spa_proc_lock);
1221 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1222
1223 spa->spa_proc_state = SPA_PROC_ACTIVE;
1224 cv_broadcast(&spa->spa_proc_cv);
1225
1226 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1227 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1228 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1229 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1230
1231 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1232 spa->spa_proc_state = SPA_PROC_GONE;
1233 spa->spa_proc = &p0;
1234 cv_broadcast(&spa->spa_proc_cv);
1235 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
1236
1237 mutex_enter(&curproc->p_lock);
1238 lwp_exit();
1239}
1240#endif
1241
1242/*
1243 * Activate an uninitialized pool.
1244 */
1245static void
da92d5cb 1246spa_activate(spa_t *spa, spa_mode_t mode)
428870ff
BB
1247{
1248 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1249
1250 spa->spa_state = POOL_STATE_ACTIVE;
1251 spa->spa_mode = mode;
e39fe05b 1252 spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
428870ff 1253
18168da7
AZ
1254 spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops);
1255 spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops);
aa755b35 1256 spa->spa_embedded_log_class =
18168da7
AZ
1257 metaslab_class_create(spa, &zfs_metaslab_ops);
1258 spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops);
1259 spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops);
428870ff
BB
1260
1261 /* Try to create a covering process */
1262 mutex_enter(&spa->spa_proc_lock);
1263 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1264 ASSERT(spa->spa_proc == &p0);
1265 spa->spa_did = 0;
1266
18168da7 1267 (void) spa_create_process;
7b89a549 1268#ifdef HAVE_SPA_THREAD
428870ff
BB
1269 /* Only create a process if we're going to be around a while. */
1270 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1271 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1272 NULL, 0) == 0) {
1273 spa->spa_proc_state = SPA_PROC_CREATED;
1274 while (spa->spa_proc_state == SPA_PROC_CREATED) {
1275 cv_wait(&spa->spa_proc_cv,
1276 &spa->spa_proc_lock);
9babb374 1277 }
428870ff
BB
1278 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1279 ASSERT(spa->spa_proc != &p0);
1280 ASSERT(spa->spa_did != 0);
1281 } else {
1282#ifdef _KERNEL
1283 cmn_err(CE_WARN,
1284 "Couldn't create process for zfs pool \"%s\"\n",
1285 spa->spa_name);
1286#endif
b128c09f 1287 }
34dc7c2f 1288 }
7b89a549 1289#endif /* HAVE_SPA_THREAD */
428870ff
BB
1290 mutex_exit(&spa->spa_proc_lock);
1291
1292 /* If we didn't create a process, we need to create our taskqs. */
1293 if (spa->spa_proc == &p0) {
1294 spa_create_zio_taskqs(spa);
1295 }
34dc7c2f 1296
619f0976
GW
1297 for (size_t i = 0; i < TXG_SIZE; i++) {
1298 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
1299 ZIO_FLAG_CANFAIL);
1300 }
a1d477c2 1301
b128c09f
BB
1302 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1303 offsetof(vdev_t, vdev_config_dirty_node));
0c66c32d
JG
1304 list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1305 offsetof(objset_t, os_evicting_node));
b128c09f
BB
1306 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1307 offsetof(vdev_t, vdev_state_dirty_node));
34dc7c2f 1308
4747a7d3 1309 txg_list_create(&spa->spa_vdev_txg_list, spa,
34dc7c2f
BB
1310 offsetof(struct vdev, vdev_txg_node));
1311
1312 avl_create(&spa->spa_errlist_scrub,
1313 spa_error_entry_compare, sizeof (spa_error_entry_t),
1314 offsetof(spa_error_entry_t, se_avl));
1315 avl_create(&spa->spa_errlist_last,
1316 spa_error_entry_compare, sizeof (spa_error_entry_t),
1317 offsetof(spa_error_entry_t, se_avl));
a0bd735a 1318
4759342a
JL
1319 spa_activate_os(spa);
1320
b5256303
TC
1321 spa_keystore_init(&spa->spa_keystore);
1322
a0bd735a
BP
1323 /*
1324 * This taskq is used to perform zvol-minor-related tasks
1325 * asynchronously. This has several advantages, including easy
d0249a4b 1326 * resolution of various deadlocks.
a0bd735a
BP
1327 *
1328 * The taskq must be single threaded to ensure tasks are always
1329 * processed in the order in which they were dispatched.
1330 *
1331 * A taskq per pool allows one to keep the pools independent.
1332 * This way if one pool is suspended, it will not impact another.
1333 *
1334 * The preferred location to dispatch a zvol minor task is a sync
1335 * task. In this context, there is easy access to the spa_t and minimal
1336 * error handling is required because the sync task must succeed.
1337 */
1338 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
1339 1, INT_MAX, 0);
1de321e6 1340
77d8a0f1 1341 /*
1342 * Taskq dedicated to prefetcher threads: this is used to prevent the
1343 * pool traverse code from monopolizing the global (and limited)
1344 * system_taskq by inappropriately scheduling long running tasks on it.
1345 */
60a4c7d2
PD
1346 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
1347 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
77d8a0f1 1348
1de321e6
JX
1349 /*
1350 * The taskq to upgrade datasets in this pool. Currently used by
9c5167d1 1351 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
1de321e6 1352 */
60a4c7d2
PD
1353 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
1354 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
34dc7c2f
BB
1355}
1356
1357/*
1358 * Opposite of spa_activate().
1359 */
1360static void
1361spa_deactivate(spa_t *spa)
1362{
34dc7c2f
BB
1363 ASSERT(spa->spa_sync_on == B_FALSE);
1364 ASSERT(spa->spa_dsl_pool == NULL);
1365 ASSERT(spa->spa_root_vdev == NULL);
9babb374 1366 ASSERT(spa->spa_async_zio_root == NULL);
34dc7c2f
BB
1367 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1368
0c66c32d
JG
1369 spa_evicting_os_wait(spa);
1370
a0bd735a
BP
1371 if (spa->spa_zvol_taskq) {
1372 taskq_destroy(spa->spa_zvol_taskq);
1373 spa->spa_zvol_taskq = NULL;
1374 }
1375
77d8a0f1 1376 if (spa->spa_prefetch_taskq) {
1377 taskq_destroy(spa->spa_prefetch_taskq);
1378 spa->spa_prefetch_taskq = NULL;
1379 }
1380
1de321e6
JX
1381 if (spa->spa_upgrade_taskq) {
1382 taskq_destroy(spa->spa_upgrade_taskq);
1383 spa->spa_upgrade_taskq = NULL;
1384 }
1385
34dc7c2f
BB
1386 txg_list_destroy(&spa->spa_vdev_txg_list);
1387
b128c09f 1388 list_destroy(&spa->spa_config_dirty_list);
0c66c32d 1389 list_destroy(&spa->spa_evicting_os_list);
b128c09f 1390 list_destroy(&spa->spa_state_dirty_list);
34dc7c2f 1391
57ddcda1 1392 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
cc92e9d0 1393
1c27024e
DB
1394 for (int t = 0; t < ZIO_TYPES; t++) {
1395 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
7ef5e54e 1396 spa_taskqs_fini(spa, t, q);
b128c09f 1397 }
34dc7c2f
BB
1398 }
1399
a1d477c2
MA
1400 for (size_t i = 0; i < TXG_SIZE; i++) {
1401 ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1402 VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1403 spa->spa_txg_zio[i] = NULL;
1404 }
1405
34dc7c2f
BB
1406 metaslab_class_destroy(spa->spa_normal_class);
1407 spa->spa_normal_class = NULL;
1408
1409 metaslab_class_destroy(spa->spa_log_class);
1410 spa->spa_log_class = NULL;
1411
aa755b35
MA
1412 metaslab_class_destroy(spa->spa_embedded_log_class);
1413 spa->spa_embedded_log_class = NULL;
1414
cc99f275
DB
1415 metaslab_class_destroy(spa->spa_special_class);
1416 spa->spa_special_class = NULL;
1417
1418 metaslab_class_destroy(spa->spa_dedup_class);
1419 spa->spa_dedup_class = NULL;
1420
34dc7c2f
BB
1421 /*
1422 * If this was part of an import or the open otherwise failed, we may
1423 * still have errors left in the queues. Empty them just in case.
1424 */
1425 spa_errlog_drain(spa);
34dc7c2f
BB
1426 avl_destroy(&spa->spa_errlist_scrub);
1427 avl_destroy(&spa->spa_errlist_last);
1428
b5256303
TC
1429 spa_keystore_fini(&spa->spa_keystore);
1430
34dc7c2f 1431 spa->spa_state = POOL_STATE_UNINITIALIZED;
428870ff
BB
1432
1433 mutex_enter(&spa->spa_proc_lock);
1434 if (spa->spa_proc_state != SPA_PROC_NONE) {
1435 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1436 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1437 cv_broadcast(&spa->spa_proc_cv);
1438 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1439 ASSERT(spa->spa_proc != &p0);
1440 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1441 }
1442 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1443 spa->spa_proc_state = SPA_PROC_NONE;
1444 }
1445 ASSERT(spa->spa_proc == &p0);
1446 mutex_exit(&spa->spa_proc_lock);
1447
1448 /*
1449 * We want to make sure spa_thread() has actually exited the ZFS
1450 * module, so that the module can't be unloaded out from underneath
1451 * it.
1452 */
1453 if (spa->spa_did != 0) {
1454 thread_join(spa->spa_did);
1455 spa->spa_did = 0;
1456 }
4759342a
JL
1457
1458 spa_deactivate_os(spa);
1459
34dc7c2f
BB
1460}
1461
1462/*
1463 * Verify a pool configuration, and construct the vdev tree appropriately. This
1464 * will create all the necessary vdevs in the appropriate layout, with each vdev
1465 * in the CLOSED state. This will prep the pool before open/creation/import.
1466 * All vdev validation is done by the vdev_alloc() routine.
1467 */
4a22ba5b 1468int
34dc7c2f
BB
1469spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1470 uint_t id, int atype)
1471{
1472 nvlist_t **child;
9babb374 1473 uint_t children;
34dc7c2f
BB
1474 int error;
1475
1476 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1477 return (error);
1478
1479 if ((*vdp)->vdev_ops->vdev_op_leaf)
1480 return (0);
1481
b128c09f
BB
1482 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1483 &child, &children);
1484
1485 if (error == ENOENT)
1486 return (0);
1487
1488 if (error) {
34dc7c2f
BB
1489 vdev_free(*vdp);
1490 *vdp = NULL;
2e528b49 1491 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1492 }
1493
1c27024e 1494 for (int c = 0; c < children; c++) {
34dc7c2f
BB
1495 vdev_t *vd;
1496 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1497 atype)) != 0) {
1498 vdev_free(*vdp);
1499 *vdp = NULL;
1500 return (error);
1501 }
1502 }
1503
1504 ASSERT(*vdp != NULL);
1505
1506 return (0);
1507}
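/*
 * Typical call sketch (the locals shown are illustrative): the
 * import/create paths pull the ZPOOL_CONFIG_VDEV_TREE nvlist out of the
 * pool config and hand it to this routine with a NULL parent and id 0 to
 * build the root vdev:
 *
 *	nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
 *	    ZPOOL_CONFIG_VDEV_TREE);
 *	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0,
 *	    VDEV_ALLOC_LOAD);
 */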
1508
93e28d66
SD
1509static boolean_t
1510spa_should_flush_logs_on_unload(spa_t *spa)
1511{
1512 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
1513 return (B_FALSE);
1514
1515 if (!spa_writeable(spa))
1516 return (B_FALSE);
1517
1518 if (!spa->spa_sync_on)
1519 return (B_FALSE);
1520
1521 if (spa_state(spa) != POOL_STATE_EXPORTED)
1522 return (B_FALSE);
1523
1524 if (zfs_keep_log_spacemaps_at_export)
1525 return (B_FALSE);
1526
1527 return (B_TRUE);
1528}
1529
1530/*
1531 * Opens a transaction that will set the flag that will instruct
1532 * spa_sync to attempt to flush all the metaslabs for that txg.
1533 */
1534static void
1535spa_unload_log_sm_flush_all(spa_t *spa)
1536{
1537 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
1538 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
1539
1540 ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
1541 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
1542
1543 dmu_tx_commit(tx);
1544 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
1545}
1546
1547static void
1548spa_unload_log_sm_metadata(spa_t *spa)
1549{
1550 void *cookie = NULL;
1551 spa_log_sm_t *sls;
1552 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
1553 &cookie)) != NULL) {
1554 VERIFY0(sls->sls_mscount);
1555 kmem_free(sls, sizeof (spa_log_sm_t));
1556 }
1557
1558 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
1559 e != NULL; e = list_head(&spa->spa_log_summary)) {
1560 VERIFY0(e->lse_mscount);
1561 list_remove(&spa->spa_log_summary, e);
1562 kmem_free(e, sizeof (log_summary_entry_t));
1563 }
1564
1565 spa->spa_unflushed_stats.sus_nblocks = 0;
1566 spa->spa_unflushed_stats.sus_memused = 0;
1567 spa->spa_unflushed_stats.sus_blocklimit = 0;
1568}
1569
37f03da8
SH
1570static void
1571spa_destroy_aux_threads(spa_t *spa)
1572{
1573 if (spa->spa_condense_zthr != NULL) {
1574 zthr_destroy(spa->spa_condense_zthr);
1575 spa->spa_condense_zthr = NULL;
1576 }
1577 if (spa->spa_checkpoint_discard_zthr != NULL) {
1578 zthr_destroy(spa->spa_checkpoint_discard_zthr);
1579 spa->spa_checkpoint_discard_zthr = NULL;
1580 }
1581 if (spa->spa_livelist_delete_zthr != NULL) {
1582 zthr_destroy(spa->spa_livelist_delete_zthr);
1583 spa->spa_livelist_delete_zthr = NULL;
1584 }
1585 if (spa->spa_livelist_condense_zthr != NULL) {
1586 zthr_destroy(spa->spa_livelist_condense_zthr);
1587 spa->spa_livelist_condense_zthr = NULL;
1588 }
1589}
1590
34dc7c2f
BB
1591/*
1592 * Opposite of spa_load().
1593 */
1594static void
1595spa_unload(spa_t *spa)
1596{
b128c09f 1597 ASSERT(MUTEX_HELD(&spa_namespace_lock));
93e28d66 1598 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
b128c09f 1599
ca95f70d 1600 spa_import_progress_remove(spa_guid(spa));
4a0ee12a
PZ
1601 spa_load_note(spa, "UNLOADING");
1602
e60e158e
JG
1603 spa_wake_waiters(spa);
1604
93e28d66 1605 /*
2fb52853
GA
1606 * If we have set the spa_final_txg, we have already performed the
1607 * tasks below in spa_export_common(). We should not redo it here since
1608 * we delay the final TXGs beyond what spa_final_txg is set at.
93e28d66 1609 */
2fb52853
GA
1610 if (spa->spa_final_txg == UINT64_MAX) {
1611 /*
1612 * If the log space map feature is enabled and the pool is
1613 * getting exported (but not destroyed), we want to spend some
1614 * time flushing as many metaslabs as we can in an attempt to
1615 * destroy log space maps and save import time.
1616 */
1617 if (spa_should_flush_logs_on_unload(spa))
1618 spa_unload_log_sm_flush_all(spa);
93e28d66 1619
2fb52853
GA
1620 /*
1621 * Stop async tasks.
1622 */
1623 spa_async_suspend(spa);
34dc7c2f 1624
2fb52853
GA
1625 if (spa->spa_root_vdev) {
1626 vdev_t *root_vdev = spa->spa_root_vdev;
1627 vdev_initialize_stop_all(root_vdev,
1628 VDEV_INITIALIZE_ACTIVE);
1629 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
1630 vdev_autotrim_stop_all(spa);
1631 vdev_rebuild_stop_all(spa);
1632 }
619f0976
GW
1633 }
1634
34dc7c2f
BB
1635 /*
1636 * Stop syncing.
1637 */
1638 if (spa->spa_sync_on) {
1639 txg_sync_stop(spa->spa_dsl_pool);
1640 spa->spa_sync_on = B_FALSE;
1641 }
1642
4e21fd06 1643 /*
93e28d66
SD
1644 * This ensures that there is no async metaslab prefetching
1645 * while we attempt to unload the spa.
4e21fd06
DB
1646 */
1647 if (spa->spa_root_vdev != NULL) {
93e28d66
SD
1648 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1649 vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
1650 if (vc->vdev_mg != NULL)
1651 taskq_wait(vc->vdev_mg->mg_taskq);
1652 }
4e21fd06
DB
1653 }
1654
379ca9cf
OF
1655 if (spa->spa_mmp.mmp_thread)
1656 mmp_thread_stop(spa);
1657
34dc7c2f 1658 /*
b128c09f 1659 * Wait for any outstanding async I/O to complete.
34dc7c2f 1660 */
9babb374 1661 if (spa->spa_async_zio_root != NULL) {
1c27024e 1662 for (int i = 0; i < max_ncpus; i++)
e022864d
MA
1663 (void) zio_wait(spa->spa_async_zio_root[i]);
1664 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
9babb374
BB
1665 spa->spa_async_zio_root = NULL;
1666 }
34dc7c2f 1667
a1d477c2
MA
1668 if (spa->spa_vdev_removal != NULL) {
1669 spa_vdev_removal_destroy(spa->spa_vdev_removal);
1670 spa->spa_vdev_removal = NULL;
1671 }
1672
37f03da8 1673 spa_destroy_aux_threads(spa);
d2734cce 1674
a1d477c2
MA
1675 spa_condense_fini(spa);
1676
428870ff
BB
1677 bpobj_close(&spa->spa_deferred_bpobj);
1678
619f0976 1679 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
93cf2076
GW
1680
1681 /*
1682 * Close all vdevs.
1683 */
1684 if (spa->spa_root_vdev)
1685 vdev_free(spa->spa_root_vdev);
1686 ASSERT(spa->spa_root_vdev == NULL);
1687
34dc7c2f
BB
1688 /*
1689 * Close the dsl pool.
1690 */
1691 if (spa->spa_dsl_pool) {
1692 dsl_pool_close(spa->spa_dsl_pool);
1693 spa->spa_dsl_pool = NULL;
428870ff 1694 spa->spa_meta_objset = NULL;
34dc7c2f
BB
1695 }
1696
428870ff 1697 ddt_unload(spa);
93e28d66 1698 spa_unload_log_sm_metadata(spa);
428870ff 1699
fb5f0bc8
BB
1700 /*
1701 * Drop and purge level 2 cache
1702 */
1703 spa_l2cache_drop(spa);
1704
93e28d66 1705 for (int i = 0; i < spa->spa_spares.sav_count; i++)
34dc7c2f
BB
1706 vdev_free(spa->spa_spares.sav_vdevs[i]);
1707 if (spa->spa_spares.sav_vdevs) {
1708 kmem_free(spa->spa_spares.sav_vdevs,
1709 spa->spa_spares.sav_count * sizeof (void *));
1710 spa->spa_spares.sav_vdevs = NULL;
1711 }
1712 if (spa->spa_spares.sav_config) {
1713 nvlist_free(spa->spa_spares.sav_config);
1714 spa->spa_spares.sav_config = NULL;
1715 }
b128c09f 1716 spa->spa_spares.sav_count = 0;
34dc7c2f 1717
93e28d66 1718 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
5ffb9d1d 1719 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
34dc7c2f 1720 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
5ffb9d1d 1721 }
34dc7c2f
BB
1722 if (spa->spa_l2cache.sav_vdevs) {
1723 kmem_free(spa->spa_l2cache.sav_vdevs,
1724 spa->spa_l2cache.sav_count * sizeof (void *));
1725 spa->spa_l2cache.sav_vdevs = NULL;
1726 }
1727 if (spa->spa_l2cache.sav_config) {
1728 nvlist_free(spa->spa_l2cache.sav_config);
1729 spa->spa_l2cache.sav_config = NULL;
1730 }
b128c09f 1731 spa->spa_l2cache.sav_count = 0;
34dc7c2f
BB
1732
1733 spa->spa_async_suspended = 0;
fb5f0bc8 1734
a1d477c2
MA
1735 spa->spa_indirect_vdevs_loaded = B_FALSE;
1736
d96eb2b1
DM
1737 if (spa->spa_comment != NULL) {
1738 spa_strfree(spa->spa_comment);
1739 spa->spa_comment = NULL;
1740 }
658fb802
CB
1741 if (spa->spa_compatibility != NULL) {
1742 spa_strfree(spa->spa_compatibility);
1743 spa->spa_compatibility = NULL;
1744 }
d96eb2b1 1745
619f0976 1746 spa_config_exit(spa, SCL_ALL, spa);
34dc7c2f
BB
1747}
1748
1749/*
1750 * Load (or re-load) the current list of vdevs describing the active spares for
1751 * this pool. When this is called, we have some form of basic information in
1752 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1753 * then re-generate a more complete list including status information.
1754 */
a1d477c2 1755void
34dc7c2f
BB
1756spa_load_spares(spa_t *spa)
1757{
1758 nvlist_t **spares;
1759 uint_t nspares;
1760 int i;
1761 vdev_t *vd, *tvd;
1762
d2734cce
SD
1763#ifndef _KERNEL
1764 /*
1765 * zdb opens both the current state of the pool and the
1766 * checkpointed state (if present), with a different spa_t.
1767 *
1768 * As spare vdevs are shared among open pools, we skip loading
1769 * them when we load the checkpointed state of the pool.
1770 */
1771 if (!spa_writeable(spa))
1772 return;
1773#endif
1774
b128c09f
BB
1775 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1776
34dc7c2f
BB
1777 /*
1778 * First, close and free any existing spare vdevs.
1779 */
1780 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1781 vd = spa->spa_spares.sav_vdevs[i];
1782
1783 /* Undo the call to spa_activate() below */
b128c09f
BB
1784 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1785 B_FALSE)) != NULL && tvd->vdev_isspare)
34dc7c2f
BB
1786 spa_spare_remove(tvd);
1787 vdev_close(vd);
1788 vdev_free(vd);
1789 }
1790
1791 if (spa->spa_spares.sav_vdevs)
1792 kmem_free(spa->spa_spares.sav_vdevs,
1793 spa->spa_spares.sav_count * sizeof (void *));
1794
1795 if (spa->spa_spares.sav_config == NULL)
1796 nspares = 0;
1797 else
65ad5d11
AJ
1798 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1799 ZPOOL_CONFIG_SPARES, &spares, &nspares));
34dc7c2f
BB
1800
1801 spa->spa_spares.sav_count = (int)nspares;
1802 spa->spa_spares.sav_vdevs = NULL;
1803
1804 if (nspares == 0)
1805 return;
1806
1807 /*
1808 * Construct the array of vdevs, opening them to get status in the
1809 * process. For each spare, there are potentially two different vdev_t
1810 * structures associated with it: one in the list of spares (used only
1811 * for basic validation purposes) and one in the active vdev
1812 * configuration (if it's spared in). During this phase we open and
1813 * validate each vdev on the spare list. If the vdev also exists in the
1814 * active configuration, then we also mark this vdev as an active spare.
1815 */
904ea276 1816 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
79c76d5b 1817 KM_SLEEP);
34dc7c2f
BB
1818 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1819 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1820 VDEV_ALLOC_SPARE) == 0);
1821 ASSERT(vd != NULL);
1822
1823 spa->spa_spares.sav_vdevs[i] = vd;
1824
b128c09f
BB
1825 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1826 B_FALSE)) != NULL) {
34dc7c2f
BB
1827 if (!tvd->vdev_isspare)
1828 spa_spare_add(tvd);
1829
1830 /*
1831 * We only mark the spare active if we were successfully
1832 * able to load the vdev. Otherwise, importing a pool
1833 * with a bad active spare would result in strange
1834 * behavior, because multiple pool would think the spare
1835 * is actively in use.
1836 *
1837 * There is a vulnerability here to an equally bizarre
1838 * circumstance, where a dead active spare is later
1839 * brought back to life (onlined or otherwise). Given
1840 * the rarity of this scenario, and the extra complexity
1841 * it adds, we ignore the possibility.
1842 */
1843 if (!vdev_is_dead(tvd))
1844 spa_spare_activate(tvd);
1845 }
1846
b128c09f 1847 vd->vdev_top = vd;
9babb374 1848 vd->vdev_aux = &spa->spa_spares;
b128c09f 1849
34dc7c2f
BB
1850 if (vdev_open(vd) != 0)
1851 continue;
1852
34dc7c2f
BB
1853 if (vdev_validate_aux(vd) == 0)
1854 spa_spare_add(vd);
1855 }
1856
1857 /*
1858 * Recompute the stashed list of spares, with status information
1859 * this time.
1860 */
65ad5d11 1861 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);
34dc7c2f
BB
1862
1863 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
79c76d5b 1864 KM_SLEEP);
34dc7c2f
BB
1865 for (i = 0; i < spa->spa_spares.sav_count; i++)
1866 spares[i] = vdev_config_generate(spa,
428870ff 1867 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
65ad5d11 1868 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
795075e6
PD
1869 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
1870 spa->spa_spares.sav_count);
34dc7c2f
BB
1871 for (i = 0; i < spa->spa_spares.sav_count; i++)
1872 nvlist_free(spares[i]);
1873 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1874}
1875
1876/*
1877 * Load (or re-load) the current list of vdevs describing the active l2cache for
1878 * this pool. When this is called, we have some form of basic information in
1879 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1880 * then re-generate a more complete list including status information.
1881 * Devices which are already active have their details maintained, and are
1882 * not re-opened.
1883 */
a1d477c2 1884void
34dc7c2f
BB
1885spa_load_l2cache(spa_t *spa)
1886{
460f239e 1887 nvlist_t **l2cache = NULL;
34dc7c2f
BB
1888 uint_t nl2cache;
1889 int i, j, oldnvdevs;
9babb374 1890 uint64_t guid;
a117a6d6 1891 vdev_t *vd, **oldvdevs, **newvdevs;
34dc7c2f
BB
1892 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1893
d2734cce
SD
1894#ifndef _KERNEL
1895 /*
1896 * zdb opens both the current state of the pool and the
1897 * checkpointed state (if present), with a different spa_t.
1898 *
1899 * As L2 caches are part of the ARC which is shared among open
1900 * pools, we skip loading them when we load the checkpointed
1901 * state of the pool.
1902 */
1903 if (!spa_writeable(spa))
1904 return;
1905#endif
1906
b128c09f
BB
1907 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1908
34dc7c2f
BB
1909 oldvdevs = sav->sav_vdevs;
1910 oldnvdevs = sav->sav_count;
1911 sav->sav_vdevs = NULL;
1912 sav->sav_count = 0;
1913
67d60824
NB
1914 if (sav->sav_config == NULL) {
1915 nl2cache = 0;
1916 newvdevs = NULL;
1917 goto out;
1918 }
1919
65ad5d11
AJ
1920 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
1921 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
67d60824
NB
1922 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1923
34dc7c2f
BB
1924 /*
1925 * Process new nvlist of vdevs.
1926 */
1927 for (i = 0; i < nl2cache; i++) {
65ad5d11 1928 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);
34dc7c2f
BB
1929
1930 newvdevs[i] = NULL;
1931 for (j = 0; j < oldnvdevs; j++) {
1932 vd = oldvdevs[j];
1933 if (vd != NULL && guid == vd->vdev_guid) {
1934 /*
1935 * Retain previous vdev for add/remove ops.
1936 */
1937 newvdevs[i] = vd;
1938 oldvdevs[j] = NULL;
1939 break;
1940 }
1941 }
1942
1943 if (newvdevs[i] == NULL) {
1944 /*
1945 * Create new vdev
1946 */
1947 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1948 VDEV_ALLOC_L2CACHE) == 0);
1949 ASSERT(vd != NULL);
1950 newvdevs[i] = vd;
1951
1952 /*
1953 * Commit this vdev as an l2cache device,
1954 * even if it fails to open.
1955 */
1956 spa_l2cache_add(vd);
1957
b128c09f
BB
1958 vd->vdev_top = vd;
1959 vd->vdev_aux = sav;
1960
1961 spa_l2cache_activate(vd);
1962
34dc7c2f
BB
1963 if (vdev_open(vd) != 0)
1964 continue;
1965
34dc7c2f
BB
1966 (void) vdev_validate_aux(vd);
1967
9babb374
BB
1968 if (!vdev_is_dead(vd))
1969 l2arc_add_vdev(spa, vd);
b7654bd7
GA
1970
1971 /*
1972			 * Upon cache device addition to a pool, pool
1973			 * creation with a cache device, or when the
1974			 * header of the device is invalid, we issue an
1975			 * async TRIM command for the whole device, which
1976			 * will execute if l2arc_trim_ahead > 0.
1977 */
1978 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
34dc7c2f
BB
1979 }
1980 }
1981
67d60824
NB
1982 sav->sav_vdevs = newvdevs;
1983 sav->sav_count = (int)nl2cache;
1984
1985 /*
1986 * Recompute the stashed list of l2cache devices, with status
1987 * information this time.
1988 */
65ad5d11 1989 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);
67d60824 1990
460f239e
D
1991 if (sav->sav_count > 0)
1992 l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
1993 KM_SLEEP);
67d60824
NB
1994 for (i = 0; i < sav->sav_count; i++)
1995 l2cache[i] = vdev_config_generate(spa,
1996 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
795075e6
PD
1997 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1998 (const nvlist_t * const *)l2cache, sav->sav_count);
67d60824
NB
1999
2000out:
34dc7c2f
BB
2001 /*
2002 * Purge vdevs that were dropped
2003 */
2004 for (i = 0; i < oldnvdevs; i++) {
2005 uint64_t pool;
2006
2007 vd = oldvdevs[i];
2008 if (vd != NULL) {
5ffb9d1d
GW
2009 ASSERT(vd->vdev_isl2cache);
2010
fb5f0bc8
BB
2011 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2012 pool != 0ULL && l2arc_vdev_present(vd))
34dc7c2f 2013 l2arc_remove_vdev(vd);
5ffb9d1d
GW
2014 vdev_clear_stats(vd);
2015 vdev_free(vd);
34dc7c2f
BB
2016 }
2017 }
2018
2019 if (oldvdevs)
2020 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
2021
34dc7c2f
BB
2022 for (i = 0; i < sav->sav_count; i++)
2023 nvlist_free(l2cache[i]);
2024 if (sav->sav_count)
2025 kmem_free(l2cache, sav->sav_count * sizeof (void *));
2026}
2027
2028static int
2029load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
2030{
2031 dmu_buf_t *db;
2032 char *packed = NULL;
2033 size_t nvsize = 0;
2034 int error;
2035 *value = NULL;
2036
c3275b56
BB
2037 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
2038 if (error)
2039 return (error);
2040
34dc7c2f
BB
2041 nvsize = *(uint64_t *)db->db_data;
2042 dmu_buf_rele(db, FTAG);
2043
77aef6f6 2044 packed = vmem_alloc(nvsize, KM_SLEEP);
9babb374
BB
2045 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
2046 DMU_READ_PREFETCH);
34dc7c2f
BB
2047 if (error == 0)
2048 error = nvlist_unpack(packed, nvsize, value, 0);
77aef6f6 2049 vmem_free(packed, nvsize);
34dc7c2f
BB
2050
2051 return (error);
2052}
2053
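/*
 * For orientation, the write side of this round trip packs the nvlist and
 * stores the packed bytes where load_nvlist() expects to find them. A
 * minimal sketch (the helper name is illustrative, not part of this file,
 * and the bonus-buffer size bookkeeping is omitted):
 *
 *	static void
 *	store_nvlist_sketch(objset_t *mos, uint64_t obj, nvlist_t *nv,
 *	    dmu_tx_t *tx)
 *	{
 *		size_t nvsize = 0;
 *		char *packed;
 *
 *		VERIFY0(nvlist_size(nv, &nvsize, NV_ENCODE_XDR));
 *		packed = vmem_alloc(nvsize, KM_SLEEP);
 *		VERIFY0(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 *		    KM_SLEEP));
 *		dmu_write(mos, obj, 0, nvsize, packed, tx);
 *		vmem_free(packed, nvsize);
 *	}
 */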
6cb8e530
PZ
2054/*
2055 * Concrete top-level vdevs that are not missing and are not logs. At every
2056 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
2057 */
2058static uint64_t
2059spa_healthy_core_tvds(spa_t *spa)
2060{
2061 vdev_t *rvd = spa->spa_root_vdev;
2062 uint64_t tvds = 0;
2063
2064 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
2065 vdev_t *vd = rvd->vdev_child[i];
2066 if (vd->vdev_islog)
2067 continue;
2068 if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
2069 tvds++;
2070 }
2071
2072 return (tvds);
2073}
2074
34dc7c2f
BB
2075/*
2076 * Checks to see if the given vdev could not be opened, in which case we post a
2077 * sysevent to notify the autoreplace code that the device has been removed.
2078 */
2079static void
2080spa_check_removed(vdev_t *vd)
2081{
6cb8e530 2082 for (uint64_t c = 0; c < vd->vdev_children; c++)
34dc7c2f
BB
2083 spa_check_removed(vd->vdev_child[c]);
2084
7011fb60 2085 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
a1d477c2 2086 vdev_is_concrete(vd)) {
fb390aaf 2087 zfs_post_autoreplace(vd->vdev_spa, vd);
12fa0466 2088 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
34dc7c2f
BB
2089 }
2090}
2091
6cb8e530
PZ
2092static int
2093spa_check_for_missing_logs(spa_t *spa)
9babb374 2094{
6cb8e530 2095 vdev_t *rvd = spa->spa_root_vdev;
9babb374 2096
428870ff 2097 /*
572e2857 2098 * If we're doing a normal import, then build up any additional
6cb8e530 2099 * diagnostic information about missing log devices.
572e2857 2100 * We'll pass this up to the user for further processing.
428870ff 2101 */
572e2857
BB
2102 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
2103 nvlist_t **child, *nv;
2104 uint64_t idx = 0;
2105
160987b5 2106 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
79c76d5b 2107 KM_SLEEP);
65ad5d11 2108 nv = fnvlist_alloc();
572e2857 2109
6cb8e530 2110 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
572e2857 2111 vdev_t *tvd = rvd->vdev_child[c];
572e2857 2112
6cb8e530
PZ
2113 /*
2114			 * We consider a device missing only if it failed
2115			 * to open (i.e. an offline or faulted device is
2116			 * not considered missing).
2117 */
2118 if (tvd->vdev_islog &&
2119 tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
2120 child[idx++] = vdev_config_generate(spa, tvd,
2121 B_FALSE, VDEV_CONFIG_MISSING);
2122 }
572e2857 2123 }
9babb374 2124
6cb8e530 2125 if (idx > 0) {
795075e6
PD
2126 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2127 (const nvlist_t * const *)child, idx);
6cb8e530
PZ
2128 fnvlist_add_nvlist(spa->spa_load_info,
2129 ZPOOL_CONFIG_MISSING_DEVICES, nv);
572e2857 2130
6cb8e530 2131 for (uint64_t i = 0; i < idx; i++)
572e2857
BB
2132 nvlist_free(child[i]);
2133 }
2134 nvlist_free(nv);
2135 kmem_free(child, rvd->vdev_children * sizeof (char **));
572e2857 2136
6cb8e530
PZ
2137 if (idx > 0) {
2138 spa_load_failed(spa, "some log devices are missing");
db7d07e1 2139 vdev_dbgmsg_print_tree(rvd, 2);
6cb8e530
PZ
2140 return (SET_ERROR(ENXIO));
2141 }
2142 } else {
2143 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2144 vdev_t *tvd = rvd->vdev_child[c];
a1d477c2 2145
6cb8e530
PZ
2146 if (tvd->vdev_islog &&
2147 tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
572e2857 2148 spa_set_log_state(spa, SPA_LOG_CLEAR);
6cb8e530
PZ
2149 spa_load_note(spa, "some log devices are "
2150 "missing, ZIL is dropped.");
db7d07e1 2151 vdev_dbgmsg_print_tree(rvd, 2);
6cb8e530 2152 break;
e0ab3ab5 2153 }
572e2857 2154 }
9babb374 2155 }
e0ab3ab5 2156
6cb8e530 2157 return (0);
9babb374
BB
2158}
2159
b128c09f
BB
2160/*
2161 * Check for missing log devices
2162 */
13fe0198 2163static boolean_t
b128c09f
BB
2164spa_check_logs(spa_t *spa)
2165{
13fe0198 2166 boolean_t rv = B_FALSE;
9c43027b 2167 dsl_pool_t *dp = spa_get_dsl(spa);
13fe0198 2168
b128c09f 2169 switch (spa->spa_log_state) {
e75c13c3
BB
2170 default:
2171 break;
b128c09f
BB
2172 case SPA_LOG_MISSING:
2173 /* need to recheck in case slog has been restored */
2174 case SPA_LOG_UNKNOWN:
9c43027b
AJ
2175 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2176 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
13fe0198 2177 if (rv)
428870ff 2178 spa_set_log_state(spa, SPA_LOG_MISSING);
b128c09f 2179 break;
b128c09f 2180 }
13fe0198 2181 return (rv);
b128c09f
BB
2182}
2183
aa755b35
MA
2184/*
2185 * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
2186 */
428870ff
BB
2187static boolean_t
2188spa_passivate_log(spa_t *spa)
34dc7c2f 2189{
428870ff
BB
2190 vdev_t *rvd = spa->spa_root_vdev;
2191 boolean_t slog_found = B_FALSE;
b128c09f 2192
428870ff 2193 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
fb5f0bc8 2194
1c27024e 2195 for (int c = 0; c < rvd->vdev_children; c++) {
428870ff 2196 vdev_t *tvd = rvd->vdev_child[c];
34dc7c2f 2197
428870ff 2198 if (tvd->vdev_islog) {
aa755b35
MA
2199 ASSERT3P(tvd->vdev_log_mg, ==, NULL);
2200 metaslab_group_passivate(tvd->vdev_mg);
428870ff
BB
2201 slog_found = B_TRUE;
2202 }
34dc7c2f
BB
2203 }
2204
428870ff
BB
2205 return (slog_found);
2206}
34dc7c2f 2207
aa755b35
MA
2208/*
2209 * Activate any log vdevs (note, does not apply to embedded log metaslabs).
2210 */
428870ff
BB
2211static void
2212spa_activate_log(spa_t *spa)
2213{
2214 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f 2215
428870ff
BB
2216 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2217
1c27024e 2218 for (int c = 0; c < rvd->vdev_children; c++) {
428870ff 2219 vdev_t *tvd = rvd->vdev_child[c];
428870ff 2220
aa755b35
MA
2221 if (tvd->vdev_islog) {
2222 ASSERT3P(tvd->vdev_log_mg, ==, NULL);
2223 metaslab_group_activate(tvd->vdev_mg);
2224 }
34dc7c2f 2225 }
428870ff 2226}
34dc7c2f 2227
428870ff 2228int
a1d477c2 2229spa_reset_logs(spa_t *spa)
428870ff 2230{
13fe0198 2231 int error;
9babb374 2232
a1d477c2 2233 error = dmu_objset_find(spa_name(spa), zil_reset,
13fe0198
MA
2234 NULL, DS_FIND_CHILDREN);
2235 if (error == 0) {
428870ff
BB
2236 /*
2237		 * We successfully offlined the log device; sync out the
2238 * current txg so that the "stubby" block can be removed
2239 * by zil_sync().
2240 */
2241 txg_wait_synced(spa->spa_dsl_pool, 0);
2242 }
2243 return (error);
2244}
34dc7c2f 2245
428870ff
BB
2246static void
2247spa_aux_check_removed(spa_aux_vdev_t *sav)
2248{
1c27024e 2249 for (int i = 0; i < sav->sav_count; i++)
428870ff
BB
2250 spa_check_removed(sav->sav_vdevs[i]);
2251}
34dc7c2f 2252
428870ff
BB
2253void
2254spa_claim_notify(zio_t *zio)
2255{
2256 spa_t *spa = zio->io_spa;
34dc7c2f 2257
428870ff
BB
2258 if (zio->io_error)
2259 return;
34dc7c2f 2260
428870ff
BB
2261 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
2262 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
2263 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
2264 mutex_exit(&spa->spa_props_lock);
2265}
34dc7c2f 2266
428870ff 2267typedef struct spa_load_error {
f2c5bc15 2268 boolean_t sle_verify_data;
428870ff
BB
2269 uint64_t sle_meta_count;
2270 uint64_t sle_data_count;
2271} spa_load_error_t;
34dc7c2f 2272
428870ff
BB
2273static void
2274spa_load_verify_done(zio_t *zio)
2275{
2276 blkptr_t *bp = zio->io_bp;
2277 spa_load_error_t *sle = zio->io_private;
2278 dmu_object_type_t type = BP_GET_TYPE(bp);
2279 int error = zio->io_error;
dea377c0 2280 spa_t *spa = zio->io_spa;
34dc7c2f 2281
a6255b7f 2282 abd_free(zio->io_abd);
428870ff 2283 if (error) {
9ae529ec 2284 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
428870ff 2285 type != DMU_OT_INTENT_LOG)
bc89ac84 2286 atomic_inc_64(&sle->sle_meta_count);
428870ff 2287 else
bc89ac84 2288 atomic_inc_64(&sle->sle_data_count);
34dc7c2f 2289 }
dea377c0
MA
2290
2291 mutex_enter(&spa->spa_scrub_lock);
c8242a96 2292 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
dea377c0
MA
2293 cv_broadcast(&spa->spa_scrub_io_cv);
2294 mutex_exit(&spa->spa_scrub_lock);
428870ff 2295}
34dc7c2f 2296
dea377c0 2297/*
e1cfd73f 2298 * The maximum number of inflight bytes is a log2 fraction of the ARC
c8242a96 2299 * size, controlled by spa_load_verify_shift; the default is 1/16th.
dea377c0 2300 */
18168da7
AZ
2301static int spa_load_verify_shift = 4;
2302static int spa_load_verify_metadata = B_TRUE;
2303static int spa_load_verify_data = B_TRUE;
dea377c0 2304
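/*
 * Worked example (illustrative numbers): with an assumed 64 GiB ARC target
 * and the default spa_load_verify_shift of 4, the traversal below keeps at
 * most 64 GiB >> 4 = 4 GiB of verify reads in flight; raising the shift to
 * 5 halves that ceiling to 2 GiB.
 */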
428870ff
BB
2305static int
2306spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
5dbd68a3 2307 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
428870ff 2308{
f2c5bc15
AM
2309 zio_t *rio = arg;
2310 spa_load_error_t *sle = rio->io_private;
2311
14e4e3cb
AZ
2312 (void) zilog, (void) dnp;
2313
dea377c0
MA
2314 /*
2315 * Note: normally this routine will not be called if
2316 * spa_load_verify_metadata is not set. However, it may be useful
2317 * to manually set the flag after the traversal has begun.
2318 */
2319 if (!spa_load_verify_metadata)
2320 return (0);
2cd0f98f
BB
2321
2322 /*
2323 * Sanity check the block pointer in order to detect obvious damage
2324 * before using the contents in subsequent checks or in zio_read().
2325	 * When damaged, consider it to be a metadata error since we cannot
2326 * trust the BP_GET_TYPE and BP_GET_LEVEL values.
2327 */
2328 if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
2329 atomic_inc_64(&sle->sle_meta_count);
2330 return (0);
2331 }
2332
2333 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
2334 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
2335 return (0);
2336
f2c5bc15
AM
2337 if (!BP_IS_METADATA(bp) &&
2338 (!spa_load_verify_data || !sle->sle_verify_data))
dea377c0
MA
2339 return (0);
2340
1e527162
GW
2341 uint64_t maxinflight_bytes =
2342 arc_target_bytes() >> spa_load_verify_shift;
1c27024e 2343 size_t size = BP_GET_PSIZE(bp);
dea377c0
MA
2344
2345 mutex_enter(&spa->spa_scrub_lock);
c8242a96 2346 while (spa->spa_load_verify_bytes >= maxinflight_bytes)
dea377c0 2347 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
c8242a96 2348 spa->spa_load_verify_bytes += size;
dea377c0
MA
2349 mutex_exit(&spa->spa_scrub_lock);
2350
a6255b7f 2351 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
dea377c0
MA
2352 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2353 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2354 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
428870ff
BB
2355 return (0);
2356}
34dc7c2f 2357
65c7cc49 2358static int
d1d19c78
PD
2359verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2360{
14e4e3cb
AZ
2361 (void) dp, (void) arg;
2362
d1d19c78
PD
2363 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2364 return (SET_ERROR(ENAMETOOLONG));
2365
2366 return (0);
2367}
2368
428870ff
BB
2369static int
2370spa_load_verify(spa_t *spa)
2371{
2372 zio_t *rio;
2373 spa_load_error_t sle = { 0 };
8a393be3 2374 zpool_load_policy_t policy;
428870ff 2375 boolean_t verify_ok = B_FALSE;
dea377c0 2376 int error = 0;
34dc7c2f 2377
8a393be3 2378 zpool_get_load_policy(spa->spa_config, &policy);
34dc7c2f 2379
f2c5bc15
AM
2380 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
2381 policy.zlp_maxmeta == UINT64_MAX)
428870ff 2382 return (0);
34dc7c2f 2383
d1d19c78
PD
2384 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2385 error = dmu_objset_find_dp(spa->spa_dsl_pool,
2386 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2387 DS_FIND_CHILDREN);
2388 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2389 if (error != 0)
2390 return (error);
2391
f2c5bc15
AM
2392 /*
2393 * Verify data only if we are rewinding or error limit was set.
2394	 * Otherwise nothing but dbgmsg cares about it, so do not waste the time.
2395 */
2396 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
2397 (policy.zlp_maxdata < UINT64_MAX);
2398
428870ff
BB
2399 rio = zio_root(spa, NULL, &sle,
2400 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
34dc7c2f 2401
dea377c0 2402 if (spa_load_verify_metadata) {
4a0ee12a
PZ
2403 if (spa->spa_extreme_rewind) {
2404 spa_load_note(spa, "performing a complete scan of the "
2405 "pool since extreme rewind is on. This may take "
2406 "a very long time.\n (spa_load_verify_data=%u, "
2407 "spa_load_verify_metadata=%u)",
2408 spa_load_verify_data, spa_load_verify_metadata);
2409 }
c8242a96 2410
dea377c0 2411 error = traverse_pool(spa, spa->spa_verify_min_txg,
b5256303
TC
2412 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
2413 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
dea377c0 2414 }
428870ff
BB
2415
2416 (void) zio_wait(rio);
c8242a96 2417 ASSERT0(spa->spa_load_verify_bytes);
428870ff
BB
2418
2419 spa->spa_load_meta_errors = sle.sle_meta_count;
2420 spa->spa_load_data_errors = sle.sle_data_count;
2421
afd2f7b7
PZ
2422 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2423 spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2424 "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2425 (u_longlong_t)sle.sle_data_count);
2426 }
2427
2428 if (spa_load_verify_dryrun ||
8a393be3
PZ
2429 (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2430 sle.sle_data_count <= policy.zlp_maxdata)) {
572e2857
BB
2431 int64_t loss = 0;
2432
428870ff
BB
2433 verify_ok = B_TRUE;
2434 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2435 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
572e2857
BB
2436
2437 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
65ad5d11
AJ
2438 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
2439 spa->spa_load_txg_ts);
2440 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
2441 loss);
f2c5bc15
AM
2442 fnvlist_add_uint64(spa->spa_load_info,
2443 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
65ad5d11
AJ
2444 fnvlist_add_uint64(spa->spa_load_info,
2445 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
428870ff
BB
2446 } else {
2447 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2448 }
2449
afd2f7b7
PZ
2450 if (spa_load_verify_dryrun)
2451 return (0);
2452
428870ff
BB
2453 if (error) {
2454 if (error != ENXIO && error != EIO)
2e528b49 2455 error = SET_ERROR(EIO);
428870ff
BB
2456 return (error);
2457 }
2458
2459 return (verify_ok ? 0 : EIO);
2460}
2461
2462/*
2463 * Find a value in the pool props object.
2464 */
2465static void
2466spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2467{
2468 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2469 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2470}
2471
2472/*
2473 * Find a value in the pool directory object.
2474 */
2475static int
4a0ee12a 2476spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
428870ff 2477{
4a0ee12a
PZ
2478 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2479 name, sizeof (uint64_t), 1, val);
2480
2481 if (error != 0 && (error != ENOENT || log_enoent)) {
2482 spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2483 "[error=%d]", name, error);
2484 }
2485
2486 return (error);
428870ff
BB
2487}
2488
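/*
 * A typical use of the helper above is looking up a well-known MOS
 * directory key while tolerating its absence on older pools, e.g. this
 * sketch:
 *
 *	uint64_t obj = 0;
 *	if (spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &obj, B_FALSE) == 0) {
 *		... obj now names the object behind that key ...
 *	}
 *
 * Passing B_FALSE keeps an ENOENT quiet; any other error is still logged
 * through spa_load_failed().
 */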
2489static int
2490spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2491{
2492 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
a1d477c2 2493 return (SET_ERROR(err));
428870ff
BB
2494}
2495
37f03da8
SH
2496boolean_t
2497spa_livelist_delete_check(spa_t *spa)
2498{
2499 return (spa->spa_livelists_to_delete != 0);
2500}
2501
37f03da8
SH
2502static boolean_t
2503spa_livelist_delete_cb_check(void *arg, zthr_t *z)
2504{
14e4e3cb 2505 (void) z;
37f03da8
SH
2506 spa_t *spa = arg;
2507 return (spa_livelist_delete_check(spa));
2508}
2509
2510static int
2511delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2512{
2513 spa_t *spa = arg;
2514 zio_free(spa, tx->tx_txg, bp);
2515 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
2516 -bp_get_dsize_sync(spa, bp),
2517 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
2518 return (0);
2519}
2520
2521static int
2522dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
2523{
2524 int err;
2525 zap_cursor_t zc;
2526 zap_attribute_t za;
2527 zap_cursor_init(&zc, os, zap_obj);
2528 err = zap_cursor_retrieve(&zc, &za);
2529 zap_cursor_fini(&zc);
2530 if (err == 0)
2531 *llp = za.za_first_integer;
2532 return (err);
2533}
2534
2535/*
2536 * Components of livelist deletion that must be performed in syncing
2537 * context: freeing block pointers and updating the pool-wide data
2538 * structures to indicate how much work is left to do
2539 */
2540typedef struct sublist_delete_arg {
2541 spa_t *spa;
2542 dsl_deadlist_t *ll;
2543 uint64_t key;
2544 bplist_t *to_free;
2545} sublist_delete_arg_t;
2546
2547static void
2548sublist_delete_sync(void *arg, dmu_tx_t *tx)
2549{
2550 sublist_delete_arg_t *sda = arg;
2551 spa_t *spa = sda->spa;
2552 dsl_deadlist_t *ll = sda->ll;
2553 uint64_t key = sda->key;
2554 bplist_t *to_free = sda->to_free;
2555
2556 bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
2557 dsl_deadlist_remove_entry(ll, key, tx);
2558}
2559
2560typedef struct livelist_delete_arg {
2561 spa_t *spa;
2562 uint64_t ll_obj;
2563 uint64_t zap_obj;
2564} livelist_delete_arg_t;
2565
2566static void
2567livelist_delete_sync(void *arg, dmu_tx_t *tx)
2568{
2569 livelist_delete_arg_t *lda = arg;
2570 spa_t *spa = lda->spa;
2571 uint64_t ll_obj = lda->ll_obj;
2572 uint64_t zap_obj = lda->zap_obj;
2573 objset_t *mos = spa->spa_meta_objset;
2574 uint64_t count;
2575
2576 /* free the livelist and decrement the feature count */
2577 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
2578 dsl_deadlist_free(mos, ll_obj, tx);
2579 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
2580 VERIFY0(zap_count(mos, zap_obj, &count));
2581 if (count == 0) {
2582 /* no more livelists to delete */
2583 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
2584 DMU_POOL_DELETED_CLONES, tx));
2585 VERIFY0(zap_destroy(mos, zap_obj, tx));
2586 spa->spa_livelists_to_delete = 0;
e60e158e 2587 spa_notify_waiters(spa);
37f03da8
SH
2588 }
2589}
2590
2591/*
2592 * Load in the value for the livelist to be removed and open it. Then,
2593 * load its first sublist and determine which block pointers should actually
2594 * be freed. Finally, a synctask performs the actual frees and updates
2595 * the pool-wide livelist data.
2596 */
65c7cc49 2597static void
37f03da8
SH
2598spa_livelist_delete_cb(void *arg, zthr_t *z)
2599{
2600 spa_t *spa = arg;
2601 uint64_t ll_obj = 0, count;
2602 objset_t *mos = spa->spa_meta_objset;
2603 uint64_t zap_obj = spa->spa_livelists_to_delete;
2604 /*
2605 * Determine the next livelist to delete. This function should only
2606 * be called if there is at least one deleted clone.
2607 */
2608 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
2609 VERIFY0(zap_count(mos, ll_obj, &count));
2610 if (count > 0) {
c9562576 2611 dsl_deadlist_t *ll;
37f03da8
SH
2612 dsl_deadlist_entry_t *dle;
2613 bplist_t to_free;
c9562576
PS
2614 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
2615 dsl_deadlist_open(ll, mos, ll_obj);
2616 dle = dsl_deadlist_first(ll);
37f03da8
SH
2617 ASSERT3P(dle, !=, NULL);
2618 bplist_create(&to_free);
2619 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
2620 z, NULL);
2621 if (err == 0) {
2622 sublist_delete_arg_t sync_arg = {
2623 .spa = spa,
c9562576 2624 .ll = ll,
37f03da8
SH
2625 .key = dle->dle_mintxg,
2626 .to_free = &to_free
2627 };
2628 zfs_dbgmsg("deleting sublist (id %llu) from"
8e739b2c
RE
2629 " livelist %llu, %lld remaining",
2630 (u_longlong_t)dle->dle_bpobj.bpo_object,
2631 (u_longlong_t)ll_obj, (longlong_t)count - 1);
37f03da8
SH
2632 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
2633 sublist_delete_sync, &sync_arg, 0,
2634 ZFS_SPACE_CHECK_DESTROY));
2635 } else {
d87676a9 2636 VERIFY3U(err, ==, EINTR);
37f03da8
SH
2637 }
2638 bplist_clear(&to_free);
2639 bplist_destroy(&to_free);
c9562576
PS
2640 dsl_deadlist_close(ll);
2641 kmem_free(ll, sizeof (dsl_deadlist_t));
37f03da8
SH
2642 } else {
2643 livelist_delete_arg_t sync_arg = {
2644 .spa = spa,
2645 .ll_obj = ll_obj,
2646 .zap_obj = zap_obj
2647 };
8e739b2c
RE
2648 zfs_dbgmsg("deletion of livelist %llu completed",
2649 (u_longlong_t)ll_obj);
37f03da8
SH
2650 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
2651 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
2652 }
2653}
2654
65c7cc49 2655static void
37f03da8
SH
2656spa_start_livelist_destroy_thread(spa_t *spa)
2657{
2658 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
843e9ca2
SD
2659 spa->spa_livelist_delete_zthr =
2660 zthr_create("z_livelist_destroy",
6bc61d22
TN
2661 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
2662 minclsyspri);
37f03da8
SH
2663}
2664
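/*
 * Each zthr in this file pairs a cheap "is there work?" check with a
 * callback that performs one pass of that work. A minimal sketch with
 * hypothetical names (my_state_t, do_one_unit_of_work):
 *
 *	static boolean_t
 *	my_work_check(void *arg, zthr_t *z)
 *	{
 *		return (((my_state_t *)arg)->work_pending);
 *	}
 *
 *	static void
 *	my_work_cb(void *arg, zthr_t *z)
 *	{
 *		do_one_unit_of_work(arg, z);
 *	}
 *
 *	zthr_t *t = zthr_create("z_my_work", my_work_check, my_work_cb,
 *	    arg, minclsyspri);
 *
 * Long-running callbacks are expected to check zthr_iscancelled(),
 * directly or through helpers, so they can stop promptly.
 */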
2665typedef struct livelist_new_arg {
2666 bplist_t *allocs;
2667 bplist_t *frees;
2668} livelist_new_arg_t;
2669
2670static int
2671livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
2672 dmu_tx_t *tx)
2673{
2674 ASSERT(tx == NULL);
2675 livelist_new_arg_t *lna = arg;
2676 if (bp_freed) {
2677 bplist_append(lna->frees, bp);
2678 } else {
2679 bplist_append(lna->allocs, bp);
2680 zfs_livelist_condense_new_alloc++;
2681 }
2682 return (0);
2683}
2684
2685typedef struct livelist_condense_arg {
2686 spa_t *spa;
2687 bplist_t to_keep;
2688 uint64_t first_size;
2689 uint64_t next_size;
2690} livelist_condense_arg_t;
2691
2692static void
2693spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
2694{
2695 livelist_condense_arg_t *lca = arg;
2696 spa_t *spa = lca->spa;
2697 bplist_t new_frees;
2698 dsl_dataset_t *ds = spa->spa_to_condense.ds;
2699
2700 /* Have we been cancelled? */
2701 if (spa->spa_to_condense.cancelled) {
2702 zfs_livelist_condense_sync_cancel++;
2703 goto out;
2704 }
2705
2706 dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
2707 dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
2708 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
2709
2710 /*
2711 * It's possible that the livelist was changed while the zthr was
2712 * running. Therefore, we need to check for new blkptrs in the two
2713 * entries being condensed and continue to track them in the livelist.
2714 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
2715 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
2716 * we need to sort them into two different bplists.
2717 */
2718 uint64_t first_obj = first->dle_bpobj.bpo_object;
2719 uint64_t next_obj = next->dle_bpobj.bpo_object;
2720 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
2721 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
2722
2723 bplist_create(&new_frees);
2724 livelist_new_arg_t new_bps = {
2725 .allocs = &lca->to_keep,
2726 .frees = &new_frees,
2727 };
2728
2729 if (cur_first_size > lca->first_size) {
2730 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
2731 livelist_track_new_cb, &new_bps, lca->first_size));
2732 }
2733 if (cur_next_size > lca->next_size) {
2734 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
2735 livelist_track_new_cb, &new_bps, lca->next_size));
2736 }
2737
2738 dsl_deadlist_clear_entry(first, ll, tx);
2739 ASSERT(bpobj_is_empty(&first->dle_bpobj));
2740 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
2741
2742 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
2743 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
2744 bplist_destroy(&new_frees);
2745
2746 char dsname[ZFS_MAX_DATASET_NAME_LEN];
2747 dsl_dataset_name(ds, dsname);
2748 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
2749 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
8e739b2c
RE
2750 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
2751 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
2752 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
2753 (u_longlong_t)cur_next_size,
2754 (u_longlong_t)first->dle_bpobj.bpo_object,
2755 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
37f03da8
SH
2756out:
2757 dmu_buf_rele(ds->ds_dbuf, spa);
2758 spa->spa_to_condense.ds = NULL;
2759 bplist_clear(&lca->to_keep);
2760 bplist_destroy(&lca->to_keep);
2761 kmem_free(lca, sizeof (livelist_condense_arg_t));
2762 spa->spa_to_condense.syncing = B_FALSE;
2763}
2764
65c7cc49 2765static void
37f03da8
SH
2766spa_livelist_condense_cb(void *arg, zthr_t *t)
2767{
2768 while (zfs_livelist_condense_zthr_pause &&
2769 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
2770 delay(1);
2771
2772 spa_t *spa = arg;
2773 dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
2774 dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
2775 uint64_t first_size, next_size;
2776
2777 livelist_condense_arg_t *lca =
2778 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
2779 bplist_create(&lca->to_keep);
2780
2781 /*
2782 * Process the livelists (matching FREEs and ALLOCs) in open context
2783 * so we have minimal work in syncing context to condense.
2784 *
2785 * We save bpobj sizes (first_size and next_size) to use later in
2786 * syncing context to determine if entries were added to these sublists
2787 * while in open context. This is possible because the clone is still
2788 * active and open for normal writes and we want to make sure the new,
2789 * unprocessed blockpointers are inserted into the livelist normally.
2790 *
2791	 * Note that dsl_process_sub_livelist() both records the size (number of
2792	 * blockpointers) and iterates over them while the bpobj's lock is held,
2793	 * so the sizes returned to us are consistent with what was actually
2794 * processed.
2795 */
2796 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
2797 &first_size);
2798 if (err == 0)
2799 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
2800 t, &next_size);
2801
2802 if (err == 0) {
2803 while (zfs_livelist_condense_sync_pause &&
2804 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
2805 delay(1);
2806
2807 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
2808 dmu_tx_mark_netfree(tx);
2809 dmu_tx_hold_space(tx, 1);
2810 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
2811 if (err == 0) {
2812 /*
2813 * Prevent the condense zthr restarting before
2814 * the synctask completes.
2815 */
2816 spa->spa_to_condense.syncing = B_TRUE;
2817 lca->spa = spa;
2818 lca->first_size = first_size;
2819 lca->next_size = next_size;
2820 dsl_sync_task_nowait(spa_get_dsl(spa),
38080324 2821 spa_livelist_condense_sync, lca, tx);
37f03da8
SH
2822 dmu_tx_commit(tx);
2823 return;
2824 }
2825 }
2826 /*
2827	 * Condensing cannot continue: either it was externally stopped or
2828	 * we were unable to assign the tx because the pool has run out of
2829 * space. In the second case, we'll just end up trying to condense
2830 * again in a later txg.
2831 */
2832 ASSERT(err != 0);
2833 bplist_clear(&lca->to_keep);
2834 bplist_destroy(&lca->to_keep);
2835 kmem_free(lca, sizeof (livelist_condense_arg_t));
2836 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
2837 spa->spa_to_condense.ds = NULL;
2838 if (err == EINTR)
2839 zfs_livelist_condense_zthr_cancel++;
2840}
2841
37f03da8
SH
2842/*
2843 * Check that there is something to condense but that a condense is not
2844 * already in progress and that condensing has not been cancelled.
2845 */
2846static boolean_t
2847spa_livelist_condense_cb_check(void *arg, zthr_t *z)
2848{
14e4e3cb 2849 (void) z;
37f03da8
SH
2850 spa_t *spa = arg;
2851 if ((spa->spa_to_condense.ds != NULL) &&
2852 (spa->spa_to_condense.syncing == B_FALSE) &&
2853 (spa->spa_to_condense.cancelled == B_FALSE)) {
2854 return (B_TRUE);
2855 }
2856 return (B_FALSE);
2857}
2858
65c7cc49 2859static void
37f03da8
SH
2860spa_start_livelist_condensing_thread(spa_t *spa)
2861{
2862 spa->spa_to_condense.ds = NULL;
2863 spa->spa_to_condense.first = NULL;
2864 spa->spa_to_condense.next = NULL;
2865 spa->spa_to_condense.syncing = B_FALSE;
2866 spa->spa_to_condense.cancelled = B_FALSE;
2867
2868 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
843e9ca2
SD
2869 spa->spa_livelist_condense_zthr =
2870 zthr_create("z_livelist_condense",
2871 spa_livelist_condense_cb_check,
6bc61d22 2872 spa_livelist_condense_cb, spa, minclsyspri);
37f03da8
SH
2873}
2874
9d5b5245
SD
2875static void
2876spa_spawn_aux_threads(spa_t *spa)
2877{
2878 ASSERT(spa_writeable(spa));
2879
2880 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2881
2882 spa_start_indirect_condensing_thread(spa);
37f03da8
SH
2883 spa_start_livelist_destroy_thread(spa);
2884 spa_start_livelist_condensing_thread(spa);
d2734cce
SD
2885
2886 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
2887 spa->spa_checkpoint_discard_zthr =
843e9ca2
SD
2888 zthr_create("z_checkpoint_discard",
2889 spa_checkpoint_discard_thread_check,
6bc61d22 2890 spa_checkpoint_discard_thread, spa, minclsyspri);
9d5b5245
SD
2891}
2892
428870ff
BB
2893/*
2894 * Fix up config after a partly-completed split. This is done with the
2895 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
2896 * pool have that entry in their config, but only the splitting one contains
2897 * a list of all the guids of the vdevs that are being split off.
2898 *
2899 * This function determines what to do with that list: either rejoin
2900 * all the disks to the pool, or complete the splitting process. To attempt
2901 * the rejoin, each disk that is offlined is marked online again, and
2902 * we do a reopen() call. If the vdev label for every disk that was
2903 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2904 * then we call vdev_split() on each disk, and complete the split.
2905 *
2906 * Otherwise we leave the config alone, with all the vdevs in place in
2907 * the original pool.
2908 */
2909static void
2910spa_try_repair(spa_t *spa, nvlist_t *config)
2911{
2912 uint_t extracted;
2913 uint64_t *glist;
2914 uint_t i, gcount;
2915 nvlist_t *nvl;
2916 vdev_t **vd;
2917 boolean_t attempt_reopen;
2918
2919 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2920 return;
2921
2922 /* check that the config is complete */
2923 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2924 &glist, &gcount) != 0)
2925 return;
2926
79c76d5b 2927 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
428870ff
BB
2928
2929 /* attempt to online all the vdevs & validate */
2930 attempt_reopen = B_TRUE;
2931 for (i = 0; i < gcount; i++) {
2932 if (glist[i] == 0) /* vdev is hole */
2933 continue;
2934
2935 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2936 if (vd[i] == NULL) {
2937 /*
2938 * Don't bother attempting to reopen the disks;
2939 * just do the split.
2940 */
2941 attempt_reopen = B_FALSE;
2942 } else {
2943 /* attempt to re-online it */
2944 vd[i]->vdev_offline = B_FALSE;
2945 }
2946 }
2947
2948 if (attempt_reopen) {
2949 vdev_reopen(spa->spa_root_vdev);
2950
2951 /* check each device to see what state it's in */
2952 for (extracted = 0, i = 0; i < gcount; i++) {
2953 if (vd[i] != NULL &&
2954 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2955 break;
2956 ++extracted;
2957 }
2958 }
2959
2960 /*
2961 * If every disk has been moved to the new pool, or if we never
2962 * even attempted to look at them, then we split them off for
2963 * good.
2964 */
2965 if (!attempt_reopen || gcount == extracted) {
2966 for (i = 0; i < gcount; i++)
2967 if (vd[i] != NULL)
2968 vdev_split(vd[i]);
2969 vdev_reopen(spa->spa_root_vdev);
2970 }
2971
2972 kmem_free(vd, gcount * sizeof (vdev_t *));
2973}
2974
2975static int
6cb8e530 2976spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
428870ff 2977{
a926aab9 2978 const char *ereport = FM_EREPORT_ZFS_POOL;
428870ff 2979 int error;
428870ff 2980
6cb8e530 2981 spa->spa_load_state = state;
ca95f70d
OF
2982 (void) spa_import_progress_set_state(spa_guid(spa),
2983 spa_load_state(spa));
9ae529ec 2984
6cb8e530 2985 gethrestime(&spa->spa_loaded_ts);
d2734cce 2986 error = spa_load_impl(spa, type, &ereport);
428870ff 2987
0c66c32d
JG
2988 /*
2989 * Don't count references from objsets that are already closed
2990 * and are making their way through the eviction process.
2991 */
2992 spa_evicting_os_wait(spa);
424fd7c3 2993 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
572e2857
BB
2994 if (error) {
2995 if (error != EEXIST) {
2996 spa->spa_loaded_ts.tv_sec = 0;
2997 spa->spa_loaded_ts.tv_nsec = 0;
2998 }
2999 if (error != EBADF) {
1144586b 3000 (void) zfs_ereport_post(ereport, spa,
4f072827 3001 NULL, NULL, NULL, 0);
572e2857
BB
3002 }
3003 }
428870ff
BB
3004 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
3005 spa->spa_ena = 0;
3006
ca95f70d
OF
3007 (void) spa_import_progress_set_state(spa_guid(spa),
3008 spa_load_state(spa));
3009
428870ff
BB
3010 return (error);
3011}
3012
33cf67cd 3013#ifdef ZFS_DEBUG
e0ab3ab5
JS
3014/*
3015 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
3016 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
3017 * spa's per-vdev ZAP list.
3018 */
3019static uint64_t
3020vdev_count_verify_zaps(vdev_t *vd)
3021{
3022 spa_t *spa = vd->vdev_spa;
3023 uint64_t total = 0;
e0ab3ab5
JS
3024
3025 if (vd->vdev_top_zap != 0) {
3026 total++;
3027 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3028 spa->spa_all_vdev_zaps, vd->vdev_top_zap));
3029 }
3030 if (vd->vdev_leaf_zap != 0) {
3031 total++;
3032 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3033 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
3034 }
3035
1c27024e 3036 for (uint64_t i = 0; i < vd->vdev_children; i++) {
e0ab3ab5
JS
3037 total += vdev_count_verify_zaps(vd->vdev_child[i]);
3038 }
3039
3040 return (total);
3041}
36542b06
AZ
3042#else
3043#define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
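/*
 * The non-debug stub references vd via sizeof so callers avoid
 * unused-variable warnings, and evaluates to zero ZAPs counted.
 */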
33cf67cd 3044#endif
e0ab3ab5 3045
379ca9cf
OF
3046/*
3047 * Determine whether the activity check is required.
3048 */
3049static boolean_t
bbffb59e
BB
3050spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
3051 nvlist_t *config)
379ca9cf
OF
3052{
3053 uint64_t state = 0;
3054 uint64_t hostid = 0;
3055 uint64_t tryconfig_txg = 0;
3056 uint64_t tryconfig_timestamp = 0;
060f0226 3057 uint16_t tryconfig_mmp_seq = 0;
379ca9cf
OF
3058 nvlist_t *nvinfo;
3059
3060 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
3061 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
3062 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
3063 &tryconfig_txg);
3064 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3065 &tryconfig_timestamp);
060f0226
OF
3066 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
3067 &tryconfig_mmp_seq);
379ca9cf
OF
3068 }
3069
3070 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
379ca9cf
OF
3071
3072 /*
3073	 * Disable the MMP activity check; this is used by zdb, which
3074	 * is intended to be run on potentially active pools.
3075 */
3076 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
3077 return (B_FALSE);
3078
3079 /*
3080 * Skip the activity check when the MMP feature is disabled.
3081 */
3082 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
3083 return (B_FALSE);
ca95f70d 3084
379ca9cf 3085 /*
060f0226
OF
3086 * If the tryconfig_ values are nonzero, they are the results of an
3087 * earlier tryimport. If they all match the uberblock we just found,
3088 * then the pool has not changed and we return false so we do not test
3089 * a second time.
379ca9cf
OF
3090 */
3091 if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
060f0226
OF
3092 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
3093 tryconfig_mmp_seq && tryconfig_mmp_seq ==
3094 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
379ca9cf
OF
3095 return (B_FALSE);
3096
3097 /*
3098 * Allow the activity check to be skipped when importing the pool
bbffb59e
BB
3099 * on the same host which last imported it. Since the hostid from
3100	 * configuration may be stale, use the one read from the label.
379ca9cf 3101 */
bbffb59e
BB
3102 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
3103 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
3104
25f06d67 3105 if (hostid == spa_get_hostid(spa))
379ca9cf
OF
3106 return (B_FALSE);
3107
3108 /*
3109 * Skip the activity test when the pool was cleanly exported.
3110 */
3111 if (state != POOL_STATE_ACTIVE)
3112 return (B_FALSE);
3113
3114 return (B_TRUE);
3115}
3116
060f0226
OF
3117/*
3118 * Nanoseconds the activity check must watch for changes on-disk.
3119 */
3120static uint64_t
3121spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
3122{
3123 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
3124 uint64_t multihost_interval = MSEC2NSEC(
3125 MMP_INTERVAL_OK(zfs_multihost_interval));
3126 uint64_t import_delay = MAX(NANOSEC, import_intervals *
3127 multihost_interval);
3128
3129 /*
3130 * Local tunables determine a minimum duration except for the case
3131 * where we know when the remote host will suspend the pool if MMP
3132 * writes do not land.
3133 *
3134 * See Big Theory comment at the top of mmp.c for the reasoning behind
3135 * these cases and times.
3136 */
3137
3138 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
3139
3140 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
3141 MMP_FAIL_INT(ub) > 0) {
3142
3143 /* MMP on remote host will suspend pool after failed writes */
3144 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
3145 MMP_IMPORT_SAFETY_FACTOR / 100;
3146
3147 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
3148 "mmp_fails=%llu ub_mmp mmp_interval=%llu "
8e739b2c
RE
3149 "import_intervals=%llu", (u_longlong_t)import_delay,
3150 (u_longlong_t)MMP_FAIL_INT(ub),
3151 (u_longlong_t)MMP_INTERVAL(ub),
3152 (u_longlong_t)import_intervals);
060f0226
OF
3153
3154 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
3155 MMP_FAIL_INT(ub) == 0) {
3156
3157 /* MMP on remote host will never suspend pool */
3158 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
3159 ub->ub_mmp_delay) * import_intervals);
3160
3161 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
3162 "mmp_interval=%llu ub_mmp_delay=%llu "
8e739b2c
RE
3163 "import_intervals=%llu", (u_longlong_t)import_delay,
3164 (u_longlong_t)MMP_INTERVAL(ub),
3165 (u_longlong_t)ub->ub_mmp_delay,
3166 (u_longlong_t)import_intervals);
060f0226
OF
3167
3168 } else if (MMP_VALID(ub)) {
3169 /*
e1cfd73f 3170 * zfs-0.7 compatibility case
060f0226
OF
3171 */
3172
3173 import_delay = MAX(import_delay, (multihost_interval +
3174 ub->ub_mmp_delay) * import_intervals);
3175
3176 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
8e739b2c
RE
3177 "import_intervals=%llu leaves=%u",
3178 (u_longlong_t)import_delay,
3179 (u_longlong_t)ub->ub_mmp_delay,
3180 (u_longlong_t)import_intervals,
060f0226
OF
3181 vdev_count_leaves(spa));
3182 } else {
3183 /* Using local tunings is the only reasonable option */
3184 zfs_dbgmsg("pool last imported on non-MMP aware "
3185 "host using import_delay=%llu multihost_interval=%llu "
8e739b2c
RE
3186 "import_intervals=%llu", (u_longlong_t)import_delay,
3187 (u_longlong_t)multihost_interval,
3188 (u_longlong_t)import_intervals);
060f0226
OF
3189 }
3190
3191 return (import_delay);
3192}
3193
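/*
 * Worked example, assuming the stock tunables (zfs_multihost_interval of
 * 1000 ms, zfs_multihost_import_intervals of 20) and a pool last written
 * by a non-MMP-aware host: import_delay = MAX(NANOSEC, 20 * 1 s) = 20 s,
 * before the random 0-25% padding added by the caller below.
 */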
379ca9cf
OF
3194/*
3195 * Perform the import activity check. If the user canceled the import or
3196 * we detected activity then fail.
3197 */
3198static int
3199spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
3200{
379ca9cf
OF
3201 uint64_t txg = ub->ub_txg;
3202 uint64_t timestamp = ub->ub_timestamp;
060f0226
OF
3203 uint64_t mmp_config = ub->ub_mmp_config;
3204 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
3205 uint64_t import_delay;
379ca9cf
OF
3206 hrtime_t import_expire;
3207 nvlist_t *mmp_label = NULL;
3208 vdev_t *rvd = spa->spa_root_vdev;
3209 kcondvar_t cv;
3210 kmutex_t mtx;
3211 int error = 0;
3212
3213 cv_init(&cv, NULL, CV_DEFAULT, NULL);
3214 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
3215 mutex_enter(&mtx);
3216
3217 /*
3218 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
3219 * during the earlier tryimport. If the txg recorded there is 0 then
3220 * the pool is known to be active on another host.
3221 *
060f0226 3222 * Otherwise, the pool might be in use on another host. Check for
379ca9cf
OF
3223 * changes in the uberblocks on disk if necessary.
3224 */
3225 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
3226 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
3227 ZPOOL_CONFIG_LOAD_INFO);
3228
3229 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
3230 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
3231 vdev_uberblock_load(rvd, ub, &mmp_label);
3232 error = SET_ERROR(EREMOTEIO);
3233 goto out;
3234 }
3235 }
3236
060f0226 3237 import_delay = spa_activity_check_duration(spa, ub);
533ea041 3238
379ca9cf 3239 /* Add a small random factor in case of simultaneous imports (0-25%) */
29274c9f 3240 import_delay += import_delay * random_in_range(250) / 1000;
ca95f70d
OF
3241
3242 import_expire = gethrtime() + import_delay;
379ca9cf
OF
3243
3244 while (gethrtime() < import_expire) {
ca95f70d
OF
3245 (void) spa_import_progress_set_mmp_check(spa_guid(spa),
3246 NSEC2SEC(import_expire - gethrtime()));
3247
379ca9cf
OF
3248 vdev_uberblock_load(rvd, ub, &mmp_label);
3249
060f0226
OF
3250 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
3251 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
3252 zfs_dbgmsg("multihost activity detected "
3253 "txg %llu ub_txg %llu "
3254 "timestamp %llu ub_timestamp %llu "
3255 "mmp_config %#llx ub_mmp_config %#llx",
8e739b2c
RE
3256 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
3257 (u_longlong_t)timestamp,
3258 (u_longlong_t)ub->ub_timestamp,
3259 (u_longlong_t)mmp_config,
3260 (u_longlong_t)ub->ub_mmp_config);
060f0226 3261
379ca9cf
OF
3262 error = SET_ERROR(EREMOTEIO);
3263 break;
3264 }
3265
3266 if (mmp_label) {
3267 nvlist_free(mmp_label);
3268 mmp_label = NULL;
3269 }
3270
3271 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
3272 if (error != -1) {
3273 error = SET_ERROR(EINTR);
3274 break;
3275 }
3276 error = 0;
3277 }
3278
3279out:
3280 mutex_exit(&mtx);
3281 mutex_destroy(&mtx);
3282 cv_destroy(&cv);
3283
3284 /*
3285	 * If the pool is determined to be active, store the status in the
3286	 * spa->spa_load_info nvlist. If the remote hostname or hostid are
3287	 * available from the configuration read from disk, store them as well.
3288 * This allows 'zpool import' to generate a more useful message.
3289 *
3290 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory)
3291 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
3292 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
3293 */
3294 if (error == EREMOTEIO) {
a926aab9 3295 const char *hostname = "<unknown>";
379ca9cf
OF
3296 uint64_t hostid = 0;
3297
3298 if (mmp_label) {
3299 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
3300 hostname = fnvlist_lookup_string(mmp_label,
3301 ZPOOL_CONFIG_HOSTNAME);
3302 fnvlist_add_string(spa->spa_load_info,
3303 ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
3304 }
3305
3306 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
3307 hostid = fnvlist_lookup_uint64(mmp_label,
3308 ZPOOL_CONFIG_HOSTID);
3309 fnvlist_add_uint64(spa->spa_load_info,
3310 ZPOOL_CONFIG_MMP_HOSTID, hostid);
3311 }
3312 }
3313
3314 fnvlist_add_uint64(spa->spa_load_info,
3315 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
3316 fnvlist_add_uint64(spa->spa_load_info,
3317 ZPOOL_CONFIG_MMP_TXG, 0);
3318
3319 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
3320 }
3321
3322 if (mmp_label)
3323 nvlist_free(mmp_label);
3324
3325 return (error);
3326}
3327
9eb7b46e 3328static int
6cb8e530
PZ
3329spa_verify_host(spa_t *spa, nvlist_t *mos_config)
3330{
3331 uint64_t hostid;
3332 char *hostname;
3333 uint64_t myhostid = 0;
3334
3335 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
3336 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
3337 hostname = fnvlist_lookup_string(mos_config,
3338 ZPOOL_CONFIG_HOSTNAME);
3339
3340 myhostid = zone_get_hostid(NULL);
3341
3342 if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
3343 cmn_err(CE_WARN, "pool '%s' could not be "
3344 "loaded as it was last accessed by "
3345 "another system (host: %s hostid: 0x%llx). "
a2f944a1
RM
3346 "See: https://openzfs.github.io/openzfs-docs/msg/"
3347 "ZFS-8000-EY",
6cb8e530
PZ
3348 spa_name(spa), hostname, (u_longlong_t)hostid);
3349 spa_load_failed(spa, "hostid verification failed: pool "
3350 "last accessed by host: %s (hostid: 0x%llx)",
3351 hostname, (u_longlong_t)hostid);
3352 return (SET_ERROR(EBADF));
3353 }
3354 }
3355
3356 return (0);
3357}
3358
3359static int
3360spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
428870ff
BB
3361{
3362 int error = 0;
6cb8e530 3363 nvlist_t *nvtree, *nvl, *config = spa->spa_config;
1c27024e 3364 int parse;
9eb7b46e 3365 vdev_t *rvd;
6cb8e530
PZ
3366 uint64_t pool_guid;
3367 char *comment;
658fb802 3368 char *compatibility;
6cb8e530
PZ
3369
3370 /*
3371 * Versioning wasn't explicitly added to the label until later, so if
3372	 * it's not present, treat it as the initial version.
3373 */
3374 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
3375 &spa->spa_ubsync.ub_version) != 0)
3376 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
3377
3378 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
3379 spa_load_failed(spa, "invalid config provided: '%s' missing",
3380 ZPOOL_CONFIG_POOL_GUID);
3381 return (SET_ERROR(EINVAL));
3382 }
3383
d2734cce
SD
3384 /*
3385 * If we are doing an import, ensure that the pool is not already
3386 * imported by checking if its pool guid already exists in the
3387 * spa namespace.
3388 *
3389 * The only case in which we allow an already imported pool to be
3390 * imported again is when the pool is checkpointed and we want to
3391 * look at its checkpointed state from userland tools like zdb.
3392 */
3393#ifdef _KERNEL
3394 if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
3395 spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
3396 spa_guid_exists(pool_guid, 0)) {
3397#else
3398 if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
3399 spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
3400 spa_guid_exists(pool_guid, 0) &&
3401 !spa_importing_readonly_checkpoint(spa)) {
3402#endif
6cb8e530
PZ
3403 spa_load_failed(spa, "a pool with guid %llu is already open",
3404 (u_longlong_t)pool_guid);
3405 return (SET_ERROR(EEXIST));
3406 }
3407
3408 spa->spa_config_guid = pool_guid;
3409
3410 nvlist_free(spa->spa_load_info);
3411 spa->spa_load_info = fnvlist_alloc();
3412
3413 ASSERT(spa->spa_comment == NULL);
3414 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
3415 spa->spa_comment = spa_strdup(comment);
3416
658fb802
CB
3417 ASSERT(spa->spa_compatibility == NULL);
3418 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
3419 &compatibility) == 0)
3420 spa->spa_compatibility = spa_strdup(compatibility);
3421
6cb8e530
PZ
3422 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
3423 &spa->spa_config_txg);
3424
3425 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
3426 spa->spa_config_splitting = fnvlist_dup(nvl);
428870ff 3427
4a0ee12a
PZ
3428 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
3429 spa_load_failed(spa, "invalid config provided: '%s' missing",
3430 ZPOOL_CONFIG_VDEV_TREE);
2e528b49 3431 return (SET_ERROR(EINVAL));
4a0ee12a 3432 }
428870ff 3433
428870ff
BB
3434 /*
3435 * Create "The Godfather" zio to hold all async IOs
3436 */
e022864d
MA
3437 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
3438 KM_SLEEP);
1c27024e 3439 for (int i = 0; i < max_ncpus; i++) {
e022864d
MA
3440 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
3441 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
3442 ZIO_FLAG_GODFATHER);
3443 }
428870ff
BB
3444
3445 /*
3446 * Parse the configuration into a vdev tree. We explicitly set the
3447 * value that will be returned by spa_version() since parsing the
3448 * configuration requires knowing the version number.
3449 */
3450 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6cb8e530
PZ
3451 parse = (type == SPA_IMPORT_EXISTING ?
3452 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
9eb7b46e 3453 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
428870ff
BB
3454 spa_config_exit(spa, SCL_ALL, FTAG);
3455
4a0ee12a
PZ
3456 if (error != 0) {
3457 spa_load_failed(spa, "unable to parse config [error=%d]",
3458 error);
428870ff 3459 return (error);
4a0ee12a 3460 }
428870ff
BB
3461
3462 ASSERT(spa->spa_root_vdev == rvd);
c3520e7f
MA
3463 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
3464 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
428870ff
BB
3465
3466 if (type != SPA_IMPORT_ASSEMBLE) {
3467 ASSERT(spa_guid(spa) == pool_guid);
3468 }
3469
9eb7b46e
PZ
3470 return (0);
3471}
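/*
 * The config accepted by spa_ld_parse_config() must carry at least the
 * pool GUID and a vdev tree; version, comment, compatibility and split
 * information are all optional. A rough sketch of assembling such a
 * minimal config, e.g. from a test harness (build_min_config() and its
 * nvroot argument are illustrative only):
 *
 *	static nvlist_t *
 *	build_min_config(uint64_t guid, nvlist_t *nvroot)
 *	{
 *		nvlist_t *config = fnvlist_alloc();
 *		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, guid);
 *		fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
 *		return (config);
 *	}
 */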
3472
6cb8e530
PZ
3473/*
3474 * Recursively open all vdevs in the vdev tree. This function is called twice:
3475 * first with the untrusted config, then with the trusted config.
3476 */
9eb7b46e
PZ
3477static int
3478spa_ld_open_vdevs(spa_t *spa)
3479{
3480 int error = 0;
3481
6cb8e530
PZ
3482 /*
3483 * spa_missing_tvds_allowed defines how many top-level vdevs can be
3484 * missing/unopenable for the root vdev to still be considered openable.
3485 */
3486 if (spa->spa_trust_config) {
3487 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
3488 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
3489 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
3490 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
3491 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
3492 } else {
3493 spa->spa_missing_tvds_allowed = 0;
3494 }
3495
3496 spa->spa_missing_tvds_allowed =
3497 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
3498
428870ff 3499 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
9eb7b46e 3500 error = vdev_open(spa->spa_root_vdev);
428870ff 3501 spa_config_exit(spa, SCL_ALL, FTAG);
6cb8e530
PZ
3502
3503 if (spa->spa_missing_tvds != 0) {
3504 spa_load_note(spa, "vdev tree has %lld missing top-level "
3505 "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
da92d5cb 3506 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
6cb8e530
PZ
3507 /*
3508 * Although theoretically we could allow users to open
3509 * incomplete pools in RW mode, we'd need to add a lot
3510 * of extra logic (e.g. adjust pool space to account
3511 * for missing vdevs).
3512 * This limitation also prevents users from accidentally
3513 * opening the pool in RW mode during data recovery and
3514 * damaging it further.
3515 */
3516 spa_load_note(spa, "pools with missing top-level "
3517 "vdevs can only be opened in read-only mode.");
3518 error = SET_ERROR(ENXIO);
3519 } else {
3520 spa_load_note(spa, "current settings allow for maximum "
3521 "%lld missing top-level vdevs at this stage.",
3522 (u_longlong_t)spa->spa_missing_tvds_allowed);
3523 }
3524 }
4a0ee12a
PZ
3525 if (error != 0) {
3526 spa_load_failed(spa, "unable to open vdev tree [error=%d]",
3527 error);
3528 }
6cb8e530
PZ
3529 if (spa->spa_missing_tvds != 0 || error != 0)
3530 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
9eb7b46e
PZ
3531
3532 return (error);
3533}
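/*
 * In short, the allowance starts at zfs_max_missing_tvds for a trusted
 * config, at the cachefile/scan tunables for untrusted configs, and at
 * zero otherwise, and is then raised to at least zfs_max_missing_tvds.
 * A compressed restatement of that selection (pick_allowed() is purely
 * illustrative):
 *
 *	static uint64_t
 *	pick_allowed(spa_t *spa)
 *	{
 *		uint64_t n = 0;
 *		if (spa->spa_trust_config)
 *			n = zfs_max_missing_tvds;
 *		else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE)
 *			n = zfs_max_missing_tvds_cachefile;
 *		else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN)
 *			n = zfs_max_missing_tvds_scan;
 *		return (MAX(zfs_max_missing_tvds, n));
 *	}
 */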
3534
6cb8e530
PZ
3535/*
3536 * We need to validate the vdev labels against the configuration that
3537 * we have in hand. This function is called twice: first with an untrusted
3538 * config, then with a trusted config. The validation is more strict when the
3539 * config is trusted.
3540 */
9eb7b46e 3541static int
6cb8e530 3542spa_ld_validate_vdevs(spa_t *spa)
9eb7b46e
PZ
3543{
3544 int error = 0;
3545 vdev_t *rvd = spa->spa_root_vdev;
428870ff 3546
6cb8e530
PZ
3547 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3548 error = vdev_validate(rvd);
3549 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff 3550
6cb8e530
PZ
3551 if (error != 0) {
3552 spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
3553 return (error);
3554 }
428870ff 3555
6cb8e530
PZ
3556 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
3557 spa_load_failed(spa, "cannot open vdev tree after invalidating "
3558 "some vdevs");
3559 vdev_dbgmsg_print_tree(rvd, 2);
3560 return (SET_ERROR(ENXIO));
428870ff
BB
3561 }
3562
9eb7b46e
PZ
3563 return (0);
3564}
3565
d2734cce
SD
3566static void
3567spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
3568{
3569 spa->spa_state = POOL_STATE_ACTIVE;
3570 spa->spa_ubsync = spa->spa_uberblock;
3571 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
3572 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
3573 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
3574 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
3575 spa->spa_claim_max_txg = spa->spa_first_txg;
3576 spa->spa_prev_software_version = ub->ub_software_version;
3577}
3578
9eb7b46e 3579static int
6cb8e530 3580spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
9eb7b46e
PZ
3581{
3582 vdev_t *rvd = spa->spa_root_vdev;
3583 nvlist_t *label;
3584 uberblock_t *ub = &spa->spa_uberblock;
9eb7b46e
PZ
3585 boolean_t activity_check = B_FALSE;
3586
d2734cce
SD
3587 /*
3588 * If we are opening the checkpointed state of the pool by
3589 * rewinding to it, at this point we will have written the
3590 * checkpointed uberblock to the vdev labels, so searching
3591 * the labels will find the right uberblock. However, if
3592 * we are opening the checkpointed state read-only, we have
3593 * not modified the labels. Therefore, we must ignore the
3594 * labels and continue using the spa_uberblock that was set
3595 * by spa_ld_checkpoint_rewind.
3596 *
3597 * Note that it would be fine to ignore the labels when
3598 * rewinding (opening writeable) as well. However, if we
3599 * crash just after writing the labels, we will end up
3600 * searching the labels. Doing so in the common case means
3601 * that this code path gets exercised normally, rather than
3602 * just in the edge case.
3603 */
3604 if (ub->ub_checkpoint_txg != 0 &&
3605 spa_importing_readonly_checkpoint(spa)) {
3606 spa_ld_select_uberblock_done(spa, ub);
3607 return (0);
3608 }
3609
428870ff
BB
3610 /*
3611 * Find the best uberblock.
3612 */
9ae529ec 3613 vdev_uberblock_load(rvd, ub, &label);
428870ff
BB
3614
3615 /*
3616 * If we weren't able to find a single valid uberblock, return failure.
3617 */
9ae529ec
CS
3618 if (ub->ub_txg == 0) {
3619 nvlist_free(label);
4a0ee12a 3620 spa_load_failed(spa, "no valid uberblock found");
428870ff 3621 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
9ae529ec 3622 }
428870ff 3623
ca95f70d
OF
3624 if (spa->spa_load_max_txg != UINT64_MAX) {
3625 (void) spa_import_progress_set_max_txg(spa_guid(spa),
3626 (u_longlong_t)spa->spa_load_max_txg);
3627 }
4a0ee12a
PZ
3628 spa_load_note(spa, "using uberblock with txg=%llu",
3629 (u_longlong_t)ub->ub_txg);
3630
3631
379ca9cf
OF
3632 /*
3633 * For pools which have the multihost property on, determine if the
3634 * pool is truly inactive and can be safely imported. Prevent
3635 * hosts which don't have a hostid set from importing the pool.
3636 */
6cb8e530
PZ
3637 activity_check = spa_activity_check_required(spa, ub, label,
3638 spa->spa_config);
379ca9cf 3639 if (activity_check) {
379ca9cf 3640 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
25f06d67 3641 spa_get_hostid(spa) == 0) {
379ca9cf
OF
3642 nvlist_free(label);
3643 fnvlist_add_uint64(spa->spa_load_info,
3644 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
3645 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
3646 }
3647
6cb8e530 3648 int error = spa_activity_check(spa, ub, spa->spa_config);
e889f0f5
OF
3649 if (error) {
3650 nvlist_free(label);
3651 return (error);
3652 }
3653
379ca9cf
OF
3654 fnvlist_add_uint64(spa->spa_load_info,
3655 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
3656 fnvlist_add_uint64(spa->spa_load_info,
3657 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
060f0226
OF
3658 fnvlist_add_uint16(spa->spa_load_info,
3659 ZPOOL_CONFIG_MMP_SEQ,
3660 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
379ca9cf
OF
3661 }
3662
428870ff 3663 /*
9ae529ec 3664 * If the pool has an unsupported version we can't open it.
428870ff 3665 */
9ae529ec
CS
3666 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
3667 nvlist_free(label);
4a0ee12a
PZ
3668 spa_load_failed(spa, "version %llu is not supported",
3669 (u_longlong_t)ub->ub_version);
428870ff 3670 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
9ae529ec
CS
3671 }
3672
3673 if (ub->ub_version >= SPA_VERSION_FEATURES) {
3674 nvlist_t *features;
3675
3676 /*
3677 * If we weren't able to find what's necessary for reading the
3678 * MOS in the label, return failure.
3679 */
4a0ee12a
PZ
3680 if (label == NULL) {
3681 spa_load_failed(spa, "label config unavailable");
3682 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3683 ENXIO));
3684 }
3685
3686 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
3687 &features) != 0) {
9ae529ec 3688 nvlist_free(label);
4a0ee12a
PZ
3689 spa_load_failed(spa, "invalid label: '%s' missing",
3690 ZPOOL_CONFIG_FEATURES_FOR_READ);
9ae529ec
CS
3691 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3692 ENXIO));
3693 }
3694
3695 /*
3696 * Update our in-core representation with the definitive values
3697 * from the label.
3698 */
3699 nvlist_free(spa->spa_label_features);
65ad5d11 3700 spa->spa_label_features = fnvlist_dup(features);
9ae529ec
CS
3701 }
3702
3703 nvlist_free(label);
3704
3705 /*
3706 * Look through entries in the label nvlist's features_for_read. If
3707 * there is a feature listed there which we don't understand then we
3708 * cannot open a pool.
3709 */
3710 if (ub->ub_version >= SPA_VERSION_FEATURES) {
3711 nvlist_t *unsup_feat;
9ae529ec 3712
65ad5d11 3713 unsup_feat = fnvlist_alloc();
9ae529ec 3714
1c27024e
DB
3715 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
3716 NULL); nvp != NULL;
9ae529ec
CS
3717 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
3718 if (!zfeature_is_supported(nvpair_name(nvp))) {
65ad5d11
AJ
3719 fnvlist_add_string(unsup_feat,
3720 nvpair_name(nvp), "");
9ae529ec
CS
3721 }
3722 }
3723
3724 if (!nvlist_empty(unsup_feat)) {
65ad5d11
AJ
3725 fnvlist_add_nvlist(spa->spa_load_info,
3726 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
9ae529ec 3727 nvlist_free(unsup_feat);
4a0ee12a 3728 spa_load_failed(spa, "some features are unsupported");
9ae529ec
CS
3729 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
3730 ENOTSUP));
3731 }
3732
3733 nvlist_free(unsup_feat);
3734 }
428870ff 3735
428870ff
BB
3736 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
3737 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6cb8e530 3738 spa_try_repair(spa, spa->spa_config);
428870ff
BB
3739 spa_config_exit(spa, SCL_ALL, FTAG);
3740 nvlist_free(spa->spa_config_splitting);
3741 spa->spa_config_splitting = NULL;
3742 }
3743
3744 /*
3745 * Initialize internal SPA structures.
3746 */
d2734cce 3747 spa_ld_select_uberblock_done(spa, ub);
428870ff 3748
9eb7b46e
PZ
3749 return (0);
3750}
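/*
 * When the activity check passes, spa_load_info records MMP_STATE_INACTIVE
 * together with the txg and (when valid) the MMP sequence number of the
 * uberblock that was examined. A sketch of pulling those back out on the
 * import side (the local variable names are illustrative):
 *
 *	uint64_t mmp_state = fnvlist_lookup_uint64(load_info,
 *	    ZPOOL_CONFIG_MMP_STATE);
 *	uint64_t mmp_txg = fnvlist_lookup_uint64(load_info,
 *	    ZPOOL_CONFIG_MMP_TXG);
 *	uint16_t mmp_seq = fnvlist_lookup_uint16(load_info,
 *	    ZPOOL_CONFIG_MMP_SEQ);
 */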
3751
3752static int
3753spa_ld_open_rootbp(spa_t *spa)
3754{
3755 int error = 0;
3756 vdev_t *rvd = spa->spa_root_vdev;
a1d477c2 3757
9ae529ec 3758 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
4a0ee12a
PZ
3759 if (error != 0) {
3760 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
3761 "[error=%d]", error);
428870ff 3762 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 3763 }
428870ff
BB
3764 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
3765
9eb7b46e
PZ
3766 return (0);
3767}
3768
3769static int
d2734cce 3770spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
6cb8e530 3771 boolean_t reloading)
9eb7b46e 3772{
6cb8e530
PZ
3773 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
3774 nvlist_t *nv, *mos_config, *policy;
3775 int error = 0, copy_error;
3776 uint64_t healthy_tvds, healthy_tvds_mos;
3777 uint64_t mos_config_txg;
9eb7b46e 3778
4a0ee12a
PZ
3779 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
3780 != 0)
428870ff
BB
3781 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3782
a1d477c2 3783 /*
6cb8e530
PZ
3784 * If we're assembling a pool from a split, the config provided is
3785 * already trusted so there is nothing to do.
a1d477c2 3786 */
6cb8e530
PZ
3787 if (type == SPA_IMPORT_ASSEMBLE)
3788 return (0);
3789
3790 healthy_tvds = spa_healthy_core_tvds(spa);
a1d477c2 3791
6cb8e530
PZ
3792 if (load_nvlist(spa, spa->spa_config_object, &mos_config)
3793 != 0) {
3794 spa_load_failed(spa, "unable to retrieve MOS config");
3795 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3796 }
3797
3798 /*
3799 * If we are doing an open, pool owner wasn't verified yet, thus do
3800 * the verification here.
3801 */
3802 if (spa->spa_load_state == SPA_LOAD_OPEN) {
3803 error = spa_verify_host(spa, mos_config);
3804 if (error != 0) {
a1d477c2 3805 nvlist_free(mos_config);
6cb8e530 3806 return (error);
a1d477c2 3807 }
6cb8e530
PZ
3808 }
3809
3810 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
a1d477c2 3811
6cb8e530
PZ
3812 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3813
3814 /*
3815 * Build a new vdev tree from the trusted config
3816 */
b2255edc
BB
3817 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
3818 if (error != 0) {
3819 nvlist_free(mos_config);
3820 spa_config_exit(spa, SCL_ALL, FTAG);
3821 spa_load_failed(spa, "spa_config_parse failed [error=%d]",
3822 error);
3823 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3824 }
6cb8e530
PZ
3825
3826 /*
3827 * Vdev paths in the MOS may be obsolete. If the untrusted config was
3828 * obtained by scanning /dev/dsk, then it will have the right vdev
3829 * paths. We update the trusted MOS config with this information.
3830 * We first try to copy the paths with vdev_copy_path_strict, which
3831 * succeeds only when both configs have exactly the same vdev tree.
3832 * If that fails, we fall back to a more flexible method that uses a
3833 * best-effort policy.
3834 */
3835 copy_error = vdev_copy_path_strict(rvd, mrvd);
3836 if (copy_error != 0 || spa_load_print_vdev_tree) {
3837 spa_load_note(spa, "provided vdev tree:");
3838 vdev_dbgmsg_print_tree(rvd, 2);
3839 spa_load_note(spa, "MOS vdev tree:");
3840 vdev_dbgmsg_print_tree(mrvd, 2);
3841 }
3842 if (copy_error != 0) {
3843 spa_load_note(spa, "vdev_copy_path_strict failed, falling "
3844 "back to vdev_copy_path_relaxed");
3845 vdev_copy_path_relaxed(rvd, mrvd);
3846 }
3847
3848 vdev_close(rvd);
3849 vdev_free(rvd);
3850 spa->spa_root_vdev = mrvd;
3851 rvd = mrvd;
3852 spa_config_exit(spa, SCL_ALL, FTAG);
3853
3854 /*
3855 * We will use spa_config if we decide to reload the spa or if spa_load
3856 * fails and we rewind. We must thus regenerate the config using the
8a393be3
PZ
3857 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
3858 * pass settings on how to load the pool and is not stored in the MOS.
3859 * We copy it over to our new, trusted config.
6cb8e530
PZ
3860 */
3861 mos_config_txg = fnvlist_lookup_uint64(mos_config,
3862 ZPOOL_CONFIG_POOL_TXG);
3863 nvlist_free(mos_config);
3864 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
8a393be3 3865 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
6cb8e530 3866 &policy) == 0)
8a393be3 3867 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
6cb8e530
PZ
3868 spa_config_set(spa, mos_config);
3869 spa->spa_config_source = SPA_CONFIG_SRC_MOS;
3870
3871 /*
3872 * Now that we have the config from the MOS, we should be more strict
3873 * in checking blkptrs and can make assumptions about the consistency
3874 * of the vdev tree. spa_trust_config must be set to true before opening
3875 * vdevs in order for them to be writeable.
3876 */
3877 spa->spa_trust_config = B_TRUE;
3878
3879 /*
3880 * Open and validate the new vdev tree
3881 */
3882 error = spa_ld_open_vdevs(spa);
3883 if (error != 0)
3884 return (error);
3885
3886 error = spa_ld_validate_vdevs(spa);
3887 if (error != 0)
3888 return (error);
3889
3890 if (copy_error != 0 || spa_load_print_vdev_tree) {
3891 spa_load_note(spa, "final vdev tree:");
3892 vdev_dbgmsg_print_tree(rvd, 2);
3893 }
3894
3895 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
3896 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
a1d477c2 3897 /*
6cb8e530
PZ
3898 * Sanity check to make sure that we are indeed loading the
3899 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
3900 * in the config provided and they happened to be the only ones
3901 * to have the latest uberblock, we could involuntarily perform
3902 * an extreme rewind.
a1d477c2 3903 */
6cb8e530
PZ
3904 healthy_tvds_mos = spa_healthy_core_tvds(spa);
3905 if (healthy_tvds_mos - healthy_tvds >=
3906 SPA_SYNC_MIN_VDEVS) {
3907 spa_load_note(spa, "config provided misses too many "
3908 "top-level vdevs compared to MOS (%lld vs %lld). ",
3909 (u_longlong_t)healthy_tvds,
3910 (u_longlong_t)healthy_tvds_mos);
3911 spa_load_note(spa, "vdev tree:");
3912 vdev_dbgmsg_print_tree(rvd, 2);
3913 if (reloading) {
3914 spa_load_failed(spa, "config was already "
3915 "provided from MOS. Aborting.");
3916 return (spa_vdev_err(rvd,
3917 VDEV_AUX_CORRUPT_DATA, EIO));
3918 }
3919 spa_load_note(spa, "spa must be reloaded using MOS "
3920 "config");
3921 return (SET_ERROR(EAGAIN));
4a0ee12a 3922 }
a1d477c2
MA
3923 }
3924
6cb8e530
PZ
3925 error = spa_check_for_missing_logs(spa);
3926 if (error != 0)
3927 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
3928
3929 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
3930 spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
3931 "guid sum (%llu != %llu)",
3932 (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
3933 (u_longlong_t)rvd->vdev_guid_sum);
3934 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
3935 ENXIO));
3936 }
3937
9eb7b46e
PZ
3938 return (0);
3939}
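/*
 * The path-repair step above is deliberately two-tiered: the strict copy
 * is tried first and the relaxed, best-effort copy only runs when the two
 * trees do not match exactly. A condensed restatement of that fallback
 * (repair_paths() is a hypothetical wrapper shown for illustration):
 *
 *	static void
 *	repair_paths(vdev_t *untrusted_rvd, vdev_t *trusted_rvd)
 *	{
 *		if (vdev_copy_path_strict(untrusted_rvd, trusted_rvd) != 0)
 *			vdev_copy_path_relaxed(untrusted_rvd, trusted_rvd);
 *	}
 */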
3940
3941static int
3942spa_ld_open_indirect_vdev_metadata(spa_t *spa)
3943{
3944 int error = 0;
3945 vdev_t *rvd = spa->spa_root_vdev;
3946
a1d477c2
MA
3947 /*
3948 * Everything that we read before spa_remove_init() must be stored
3948 * on concrete vdevs. Therefore we do this as early as possible.
3950 */
4a0ee12a
PZ
3951 error = spa_remove_init(spa);
3952 if (error != 0) {
3953 spa_load_failed(spa, "spa_remove_init failed [error=%d]",
3954 error);
a1d477c2 3955 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 3956 }
a1d477c2 3957
9eb7b46e
PZ
3958 /*
3959 * Retrieve information needed to condense indirect vdev mappings.
3960 */
3961 error = spa_condense_init(spa);
3962 if (error != 0) {
4a0ee12a
PZ
3963 spa_load_failed(spa, "spa_condense_init failed [error=%d]",
3964 error);
9eb7b46e
PZ
3965 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3966 }
3967
3968 return (0);
3969}
3970
3971static int
4a0ee12a 3972spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
9eb7b46e
PZ
3973{
3974 int error = 0;
3975 vdev_t *rvd = spa->spa_root_vdev;
3976
9ae529ec
CS
3977 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
3978 boolean_t missing_feat_read = B_FALSE;
b9b24bb4 3979 nvlist_t *unsup_feat, *enabled_feat;
9ae529ec
CS
3980
3981 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
4a0ee12a 3982 &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
9ae529ec
CS
3983 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3984 }
3985
3986 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
4a0ee12a 3987 &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
9ae529ec
CS
3988 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3989 }
3990
3991 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
4a0ee12a 3992 &spa->spa_feat_desc_obj, B_TRUE) != 0) {
9ae529ec
CS
3993 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3994 }
3995
b9b24bb4
CS
3996 enabled_feat = fnvlist_alloc();
3997 unsup_feat = fnvlist_alloc();
9ae529ec 3998
fa86b5db 3999 if (!spa_features_check(spa, B_FALSE,
b9b24bb4 4000 unsup_feat, enabled_feat))
9ae529ec
CS
4001 missing_feat_read = B_TRUE;
4002
4a0ee12a
PZ
4003 if (spa_writeable(spa) ||
4004 spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
fa86b5db 4005 if (!spa_features_check(spa, B_TRUE,
b9b24bb4 4006 unsup_feat, enabled_feat)) {
9eb7b46e 4007 *missing_feat_writep = B_TRUE;
b9b24bb4 4008 }
9ae529ec
CS
4009 }
4010
b9b24bb4
CS
4011 fnvlist_add_nvlist(spa->spa_load_info,
4012 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
4013
9ae529ec 4014 if (!nvlist_empty(unsup_feat)) {
b9b24bb4
CS
4015 fnvlist_add_nvlist(spa->spa_load_info,
4016 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
9ae529ec
CS
4017 }
4018
b9b24bb4
CS
4019 fnvlist_free(enabled_feat);
4020 fnvlist_free(unsup_feat);
9ae529ec
CS
4021
4022 if (!missing_feat_read) {
4023 fnvlist_add_boolean(spa->spa_load_info,
4024 ZPOOL_CONFIG_CAN_RDONLY);
4025 }
4026
4027 /*
4028 * If the state is SPA_LOAD_TRYIMPORT, our objective is
4029 * twofold: to determine whether the pool is available for
4030 * import in read-write mode and (if it is not) whether the
4031 * pool is available for import in read-only mode. If the pool
4032 * is available for import in read-write mode, it is displayed
4033 * as available in userland; if it is not available for import
4034 * in read-only mode, it is displayed as unavailable in
4035 * userland. If the pool is available for import in read-only
4036 * mode but not read-write mode, it is displayed as unavailable
4037 * in userland with a special note that the pool is actually
4038 * available for open in read-only mode.
4039 *
4040 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
4041 * missing a feature for write, we must first determine whether
4042 * the pool can be opened read-only before returning to
4043 * userland in order to know whether to display the
4044 * abovementioned note.
4045 */
9eb7b46e 4046 if (missing_feat_read || (*missing_feat_writep &&
9ae529ec 4047 spa_writeable(spa))) {
4a0ee12a 4048 spa_load_failed(spa, "pool uses unsupported features");
9ae529ec
CS
4049 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
4050 ENOTSUP));
4051 }
b0bc7a84
MG
4052
4053 /*
4054 * Load refcounts for ZFS features from disk into an in-memory
4055 * cache during SPA initialization.
4056 */
1c27024e 4057 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
b0bc7a84
MG
4058 uint64_t refcount;
4059
4060 error = feature_get_refcount_from_disk(spa,
4061 &spa_feature_table[i], &refcount);
4062 if (error == 0) {
4063 spa->spa_feat_refcount_cache[i] = refcount;
4064 } else if (error == ENOTSUP) {
4065 spa->spa_feat_refcount_cache[i] =
4066 SPA_FEATURE_DISABLED;
4067 } else {
4a0ee12a
PZ
4068 spa_load_failed(spa, "error getting refcount "
4069 "for feature %s [error=%d]",
4070 spa_feature_table[i].fi_guid, error);
b0bc7a84
MG
4071 return (spa_vdev_err(rvd,
4072 VDEV_AUX_CORRUPT_DATA, EIO));
4073 }
4074 }
4075 }
4076
4077 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
4078 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
4a0ee12a 4079 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
b0bc7a84 4080 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
9ae529ec
CS
4081 }
4082
f00ab3f2
TC
4083 /*
4084 * Encryption was added before bookmark_v2, even though bookmark_v2
4085 * is now a dependency. If this pool has encryption enabled without
4086 * bookmark_v2, trigger an errata message.
4087 */
4088 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
4089 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
4090 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
4091 }
4092
9eb7b46e
PZ
4093 return (0);
4094}
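/*
 * The net effect for userland: spa_load_info may now contain
 * ZPOOL_CONFIG_ENABLED_FEAT, ZPOOL_CONFIG_UNSUP_FEAT and, when the pool is
 * at least readable, the boolean ZPOOL_CONFIG_CAN_RDONLY. A sketch of how
 * a consumer might distinguish "unusable" from "importable read-only"
 * (variable names are illustrative):
 *
 *	boolean_t can_rdonly = nvlist_exists(load_info,
 *	    ZPOOL_CONFIG_CAN_RDONLY);
 *	boolean_t has_unsup = nvlist_exists(load_info,
 *	    ZPOOL_CONFIG_UNSUP_FEAT);
 *	boolean_t rdonly_only = (has_unsup && can_rdonly);
 */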
4095
4096static int
4097spa_ld_load_special_directories(spa_t *spa)
4098{
4099 int error = 0;
4100 vdev_t *rvd = spa->spa_root_vdev;
4101
9ae529ec
CS
4102 spa->spa_is_initializing = B_TRUE;
4103 error = dsl_pool_open(spa->spa_dsl_pool);
4104 spa->spa_is_initializing = B_FALSE;
4a0ee12a
PZ
4105 if (error != 0) {
4106 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
9ae529ec 4107 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4108 }
9ae529ec 4109
9eb7b46e
PZ
4110 return (0);
4111}
428870ff 4112
9eb7b46e
PZ
4113static int
4114spa_ld_get_props(spa_t *spa)
4115{
4116 int error = 0;
4117 uint64_t obj;
4118 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f 4119
3c67d83a
TH
4120 /* Grab the checksum salt from the MOS. */
4121 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4122 DMU_POOL_CHECKSUM_SALT, 1,
4123 sizeof (spa->spa_cksum_salt.zcs_bytes),
4124 spa->spa_cksum_salt.zcs_bytes);
4125 if (error == ENOENT) {
4126 /* Generate a new salt for subsequent use */
4127 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
4128 sizeof (spa->spa_cksum_salt.zcs_bytes));
4129 } else if (error != 0) {
4a0ee12a
PZ
4130 spa_load_failed(spa, "unable to retrieve checksum salt from "
4131 "MOS [error=%d]", error);
3c67d83a
TH
4132 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4133 }
4134
4a0ee12a 4135 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
428870ff
BB
4136 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4137 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
4a0ee12a
PZ
4138 if (error != 0) {
4139 spa_load_failed(spa, "error opening deferred-frees bpobj "
4140 "[error=%d]", error);
428870ff 4141 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4142 }
34dc7c2f
BB
4143
4144 /*
4145 * Load the bit that tells us to use the new accounting function
4146 * (raid-z deflation). If we have an older pool, this will not
4147 * be present.
4148 */
4a0ee12a 4149 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
428870ff
BB
4150 if (error != 0 && error != ENOENT)
4151 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4152
4153 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
4a0ee12a 4154 &spa->spa_creation_version, B_FALSE);
428870ff
BB
4155 if (error != 0 && error != ENOENT)
4156 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f
BB
4157
4158 /*
4159 * Load the persistent error log. If we have an older pool, this will
4160 * not be present.
4161 */
4a0ee12a
PZ
4162 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
4163 B_FALSE);
428870ff
BB
4164 if (error != 0 && error != ENOENT)
4165 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f 4166
428870ff 4167 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
4a0ee12a 4168 &spa->spa_errlog_scrub, B_FALSE);
428870ff
BB
4169 if (error != 0 && error != ENOENT)
4170 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f 4171
37f03da8
SH
4172 /*
4173 * Load the livelist deletion field. If a livelist is queued for
4174 * deletion, indicate that in the spa
4175 */
4176 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
4177 &spa->spa_livelists_to_delete, B_FALSE);
4178 if (error != 0 && error != ENOENT)
4179 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4180
34dc7c2f
BB
4181 /*
4182 * Load the history object. If we have an older pool, this
4183 * will not be present.
4184 */
4a0ee12a 4185 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
428870ff
BB
4186 if (error != 0 && error != ENOENT)
4187 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4188
e0ab3ab5
JS
4189 /*
4190 * Load the per-vdev ZAP map. If we have an older pool, this will not
4191 * be present; in this case, defer its creation to a later time to
4192 * avoid dirtying the MOS this early / out of sync context. See
4193 * spa_sync_config_object.
4194 */
4195
4196 /* The sentinel is only available in the MOS config. */
1c27024e 4197 nvlist_t *mos_config;
4a0ee12a
PZ
4198 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
4199 spa_load_failed(spa, "unable to retrieve MOS config");
e0ab3ab5 4200 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4201 }
e0ab3ab5
JS
4202
4203 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
4a0ee12a 4204 &spa->spa_all_vdev_zaps, B_FALSE);
e0ab3ab5 4205
38640550
DB
4206 if (error == ENOENT) {
4207 VERIFY(!nvlist_exists(mos_config,
4208 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
4209 spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
4210 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
4211 } else if (error != 0) {
cb01da68 4212 nvlist_free(mos_config);
e0ab3ab5 4213 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
38640550 4214 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
e0ab3ab5
JS
4215 /*
4216 * An older version of ZFS overwrote the sentinel value, so
4217 * we have orphaned per-vdev ZAPs in the MOS. Defer their
4218 * destruction to later; see spa_sync_config_object.
4219 */
4220 spa->spa_avz_action = AVZ_ACTION_DESTROY;
4221 /*
4222 * We're assuming that no vdevs have had their ZAPs created
4223 * before this. Better be sure of it.
4224 */
4225 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
4226 }
4227 nvlist_free(mos_config);
4228
9eb7b46e
PZ
4229 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
4230
4a0ee12a
PZ
4231 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
4232 B_FALSE);
9eb7b46e
PZ
4233 if (error && error != ENOENT)
4234 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4235
4236 if (error == 0) {
da27b8bc 4237 uint64_t autoreplace = 0;
9eb7b46e
PZ
4238
4239 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
4240 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
4241 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
4242 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
4243 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
c02c1bec 4244 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
1b939560 4245 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
9eb7b46e
PZ
4246 spa->spa_autoreplace = (autoreplace != 0);
4247 }
4248
6cb8e530
PZ
4249 /*
4250 * If we are importing a pool with missing top-level vdevs,
4251 * we enforce that the pool doesn't panic or get suspended on
4252 * error since the likelihood of missing data is extremely high.
4253 */
4254 if (spa->spa_missing_tvds > 0 &&
4255 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
4256 spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
4257 spa_load_note(spa, "forcing failmode to 'continue' "
4258 "as some top level vdevs are missing");
4259 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
4260 }
4261
9eb7b46e
PZ
4262 return (0);
4263}
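/*
 * Nearly every lookup above follows the same "optional MOS entry" pattern:
 * ENOENT simply means the pool predates that piece of metadata and is not
 * an error, while any other failure is treated as corruption. Stated as a
 * sketch (load_optional_prop() is a hypothetical helper, not a real spa.c
 * function):
 *
 *	static int
 *	load_optional_prop(spa_t *spa, const char *name, uint64_t *valp)
 *	{
 *		int err = spa_dir_prop(spa, name, valp, B_FALSE);
 *		if (err != 0 && err != ENOENT)
 *			return (spa_vdev_err(spa->spa_root_vdev,
 *			    VDEV_AUX_CORRUPT_DATA, EIO));
 *		return (0);
 *	}
 */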
4264
4265static int
4266spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
4267{
4268 int error = 0;
4269 vdev_t *rvd = spa->spa_root_vdev;
4270
428870ff
BB
4271 /*
4272 * If we're assembling the pool from the split-off vdevs of
4273 * an existing pool, we don't want to attach the spares & cache
4274 * devices.
4275 */
34dc7c2f
BB
4276
4277 /*
4278 * Load any hot spares for this pool.
4279 */
4a0ee12a
PZ
4280 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
4281 B_FALSE);
428870ff
BB
4282 if (error != 0 && error != ENOENT)
4283 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4284 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
34dc7c2f
BB
4285 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
4286 if (load_nvlist(spa, spa->spa_spares.sav_object,
4a0ee12a
PZ
4287 &spa->spa_spares.sav_config) != 0) {
4288 spa_load_failed(spa, "error loading spares nvlist");
428870ff 4289 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4290 }
34dc7c2f 4291
b128c09f 4292 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 4293 spa_load_spares(spa);
b128c09f 4294 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff
BB
4295 } else if (error == 0) {
4296 spa->spa_spares.sav_sync = B_TRUE;
34dc7c2f
BB
4297 }
4298
4299 /*
4300 * Load any level 2 ARC devices for this pool.
4301 */
428870ff 4302 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
4a0ee12a 4303 &spa->spa_l2cache.sav_object, B_FALSE);
428870ff
BB
4304 if (error != 0 && error != ENOENT)
4305 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4306 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
34dc7c2f
BB
4307 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
4308 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
4a0ee12a
PZ
4309 &spa->spa_l2cache.sav_config) != 0) {
4310 spa_load_failed(spa, "error loading l2cache nvlist");
428870ff 4311 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4312 }
34dc7c2f 4313
b128c09f 4314 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 4315 spa_load_l2cache(spa);
b128c09f 4316 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff
BB
4317 } else if (error == 0) {
4318 spa->spa_l2cache.sav_sync = B_TRUE;
b128c09f
BB
4319 }
4320
9eb7b46e
PZ
4321 return (0);
4322}
428870ff 4323
9eb7b46e 4324static int
4a0ee12a 4325spa_ld_load_vdev_metadata(spa_t *spa)
9eb7b46e
PZ
4326{
4327 int error = 0;
4328 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f 4329
379ca9cf
OF
4330 /*
4331 * If the 'multihost' property is set, then never allow a pool to
4332 * be imported when the system hostid is zero. The exception to
4333 * this rule is zdb which is always allowed to access pools.
4334 */
25f06d67 4335 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
379ca9cf
OF
4336 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
4337 fnvlist_add_uint64(spa->spa_load_info,
4338 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
4339 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
4340 }
4341
34dc7c2f
BB
4342 /*
4343 * If the 'autoreplace' property is set, then post a resource notifying
4344 * the ZFS DE that it should not issue any faults for unopenable
4345 * devices. We also iterate over the vdevs, and post a sysevent for any
4346 * unopenable vdevs so that the normal autoreplace handler can take
4347 * over.
4348 */
4a0ee12a 4349 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
34dc7c2f 4350 spa_check_removed(spa->spa_root_vdev);
428870ff
BB
4351 /*
4352 * For the import case, this is done in spa_import(), because
4353 * at this point we're using the spare definitions from
4354 * the MOS config, not necessarily from the userland config.
4355 */
4a0ee12a 4356 if (spa->spa_load_state != SPA_LOAD_IMPORT) {
428870ff
BB
4357 spa_aux_check_removed(&spa->spa_spares);
4358 spa_aux_check_removed(&spa->spa_l2cache);
4359 }
4360 }
34dc7c2f
BB
4361
4362 /*
9eb7b46e 4363 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
34dc7c2f 4364 */
a1d477c2
MA
4365 error = vdev_load(rvd);
4366 if (error != 0) {
4a0ee12a 4367 spa_load_failed(spa, "vdev_load failed [error=%d]", error);
a1d477c2
MA
4368 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4369 }
4370
93e28d66
SD
4371 error = spa_ld_log_spacemaps(spa);
4372 if (error != 0) {
600a02b8 4373 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
93e28d66
SD
4374 error);
4375 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4376 }
4377
34dc7c2f 4378 /*
9eb7b46e 4379 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
34dc7c2f 4380 */
b128c09f 4381 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
9a49d3f3 4382 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
b128c09f 4383 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f 4384
9eb7b46e
PZ
4385 return (0);
4386}
4387
4388static int
4389spa_ld_load_dedup_tables(spa_t *spa)
4390{
4391 int error = 0;
4392 vdev_t *rvd = spa->spa_root_vdev;
4393
428870ff 4394 error = ddt_load(spa);
4a0ee12a
PZ
4395 if (error != 0) {
4396 spa_load_failed(spa, "ddt_load failed [error=%d]", error);
428870ff 4397 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4398 }
428870ff 4399
9eb7b46e
PZ
4400 return (0);
4401}
4402
4403static int
a926aab9 4404spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
9eb7b46e
PZ
4405{
4406 vdev_t *rvd = spa->spa_root_vdev;
428870ff 4407
4a0ee12a
PZ
4408 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
4409 boolean_t missing = spa_check_logs(spa);
4410 if (missing) {
6cb8e530
PZ
4411 if (spa->spa_missing_tvds != 0) {
4412 spa_load_note(spa, "spa_check_logs failed "
4413 "so dropping the logs");
4414 } else {
4415 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
4416 spa_load_failed(spa, "spa_check_logs failed");
4417 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
4418 ENXIO));
4419 }
4a0ee12a 4420 }
428870ff
BB
4421 }
4422
9eb7b46e
PZ
4423 return (0);
4424}
4425
4426static int
4a0ee12a 4427spa_ld_verify_pool_data(spa_t *spa)
9eb7b46e
PZ
4428{
4429 int error = 0;
4430 vdev_t *rvd = spa->spa_root_vdev;
4431
4432 /*
4433 * We've successfully opened the pool; verify that we're ready
4434 * to start pushing transactions.
4435 */
4a0ee12a 4436 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
9eb7b46e
PZ
4437 error = spa_load_verify(spa);
4438 if (error != 0) {
4a0ee12a
PZ
4439 spa_load_failed(spa, "spa_load_verify failed "
4440 "[error=%d]", error);
9eb7b46e
PZ
4441 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
4442 error));
4443 }
4444 }
4445
4446 return (0);
4447}
4448
4449static void
4450spa_ld_claim_log_blocks(spa_t *spa)
4451{
4452 dmu_tx_t *tx;
4453 dsl_pool_t *dp = spa_get_dsl(spa);
4454
4455 /*
4456 * Claim log blocks that haven't been committed yet.
4457 * This must all happen in a single txg.
4458 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
4459 * invoked from zil_claim_log_block()'s i/o done callback.
4460 * Price of rollback is that we abandon the log.
4461 */
4462 spa->spa_claiming = B_TRUE;
4463
4464 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
4465 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
4466 zil_claim, tx, DS_FIND_CHILDREN);
4467 dmu_tx_commit(tx);
4468
4469 spa->spa_claiming = B_FALSE;
4470
4471 spa_set_log_state(spa, SPA_LOG_GOOD);
4472}
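/*
 * The claim above uses the assigned-txg transaction pattern: the tx is
 * bound to spa_first_txg(spa) up front instead of going through normal
 * assignment, so every zil_claim() callback lands in one txg. A minimal
 * sketch of the same pattern for some other whole-pool dataset walk
 * (walk_cb is a hypothetical dmu_objset_find_dp() callback):
 *
 *	dmu_tx_t *tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
 *	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 *	    walk_cb, tx, DS_FIND_CHILDREN);
 *	dmu_tx_commit(tx);
 */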
4473
4474static void
6cb8e530 4475spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
d2734cce 4476 boolean_t update_config_cache)
9eb7b46e
PZ
4477{
4478 vdev_t *rvd = spa->spa_root_vdev;
4479 int need_update = B_FALSE;
4480
4481 /*
4482 * If the config cache is stale, or we have uninitialized
4483 * metaslabs (see spa_vdev_add()), then update the config.
4484 *
4485 * If this is a verbatim import, trust the current
4486 * in-core spa_config and update the disk labels.
4487 */
d2734cce 4488 if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
4a0ee12a
PZ
4489 spa->spa_load_state == SPA_LOAD_IMPORT ||
4490 spa->spa_load_state == SPA_LOAD_RECOVER ||
9eb7b46e
PZ
4491 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
4492 need_update = B_TRUE;
4493
4494 for (int c = 0; c < rvd->vdev_children; c++)
4495 if (rvd->vdev_child[c]->vdev_ms_array == 0)
4496 need_update = B_TRUE;
4497
4498 /*
e1cfd73f 4499 * Update the config cache asynchronously in case we're the
9eb7b46e
PZ
4500 * root pool, in which case the config cache isn't writable yet.
4501 */
4502 if (need_update)
4503 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
4504}
4505
6cb8e530
PZ
4506static void
4507spa_ld_prepare_for_reload(spa_t *spa)
4508{
da92d5cb 4509 spa_mode_t mode = spa->spa_mode;
6cb8e530
PZ
4510 int async_suspended = spa->spa_async_suspended;
4511
4512 spa_unload(spa);
4513 spa_deactivate(spa);
4514 spa_activate(spa, mode);
4515
4516 /*
4517 * We save the value of spa_async_suspended as it gets reset to 0 by
4518 * spa_unload(). We want to restore it to the original value before
4519 * returning as we might be calling spa_async_resume() later.
4520 */
4521 spa->spa_async_suspended = async_suspended;
4522}
4523
9eb7b46e 4524static int
d2734cce
SD
4525spa_ld_read_checkpoint_txg(spa_t *spa)
4526{
4527 uberblock_t checkpoint;
4528 int error = 0;
4529
4530 ASSERT0(spa->spa_checkpoint_txg);
4531 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4532
4533 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4534 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4535 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4536
4537 if (error == ENOENT)
4538 return (0);
4539
4540 if (error != 0)
4541 return (error);
4542
4543 ASSERT3U(checkpoint.ub_txg, !=, 0);
4544 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
4545 ASSERT3U(checkpoint.ub_timestamp, !=, 0);
4546 spa->spa_checkpoint_txg = checkpoint.ub_txg;
4547 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
4548
4549 return (0);
4550}
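/*
 * The checkpointed uberblock lives in the MOS directory ZAP as a plain
 * array of uint64_t words, which is why the lookup above passes an
 * integer size of sizeof (uint64_t) and a count of sizeof (uberblock_t)
 * divided by that size. Any other reader of DMU_POOL_ZPOOL_CHECKPOINT
 * must use the same shape (sketch only):
 *
 *	uberblock_t ub;
 *	int err = zap_lookup(spa->spa_meta_objset,
 *	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
 *	    sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
 *	    &ub);
 */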
4551
4552static int
4553spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
9eb7b46e
PZ
4554{
4555 int error = 0;
9eb7b46e 4556
4a0ee12a 4557 ASSERT(MUTEX_HELD(&spa_namespace_lock));
6cb8e530 4558 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
4a0ee12a 4559
9eb7b46e 4560 /*
6cb8e530
PZ
4561 * Never trust the config that is provided unless we are assembling
4562 * a pool following a split.
4563 * This means don't trust blkptrs and the vdev tree in general. This
4564 * also effectively puts the spa in read-only mode since
4565 * spa_writeable() checks for spa_trust_config to be true.
4566 * We will later load a trusted config from the MOS.
9eb7b46e 4567 */
6cb8e530
PZ
4568 if (type != SPA_IMPORT_ASSEMBLE)
4569 spa->spa_trust_config = B_FALSE;
4570
9eb7b46e
PZ
4571 /*
4572 * Parse the config provided to create a vdev tree.
4573 */
6cb8e530 4574 error = spa_ld_parse_config(spa, type);
9eb7b46e
PZ
4575 if (error != 0)
4576 return (error);
4577
ca95f70d
OF
4578 spa_import_progress_add(spa);
4579
9eb7b46e
PZ
4580 /*
4581 * Now that we have the vdev tree, try to open each vdev. This involves
4582 * opening the underlying physical device, retrieving its geometry and
4583 * probing the vdev with a dummy I/O. The state of each vdev will be set
4584 * based on the success of those operations. After this we'll be ready
4585 * to read from the vdevs.
4586 */
4587 error = spa_ld_open_vdevs(spa);
4588 if (error != 0)
4589 return (error);
4590
4591 /*
4592 * Read the label of each vdev and make sure that the GUIDs stored
4593 * there match the GUIDs in the config provided.
6cb8e530
PZ
4594 * If we're assembling a new pool that's been split off from an
4595 * existing pool, the labels haven't yet been updated so we skip
4596 * validation for now.
9eb7b46e 4597 */
6cb8e530
PZ
4598 if (type != SPA_IMPORT_ASSEMBLE) {
4599 error = spa_ld_validate_vdevs(spa);
4600 if (error != 0)
4601 return (error);
4602 }
9eb7b46e
PZ
4603
4604 /*
d2734cce
SD
4605 * Read all vdev labels to find the best uberblock (i.e. latest,
4606 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
4607 * get the list of features required to read blkptrs in the MOS from
4608 * the vdev label with the best uberblock and verify that our version
4609 * of zfs supports them all.
9eb7b46e 4610 */
6cb8e530 4611 error = spa_ld_select_uberblock(spa, type);
9eb7b46e
PZ
4612 if (error != 0)
4613 return (error);
4614
4615 /*
4616 * Pass that uberblock to the dsl_pool layer which will open the root
4617 * blkptr. This blkptr points to the latest version of the MOS and will
4618 * allow us to read its contents.
4619 */
4620 error = spa_ld_open_rootbp(spa);
4621 if (error != 0)
4622 return (error);
4623
d2734cce
SD
4624 return (0);
4625}
4626
4627static int
4628spa_ld_checkpoint_rewind(spa_t *spa)
4629{
4630 uberblock_t checkpoint;
4631 int error = 0;
4632
4633 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4634 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4635
4636 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4637 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4638 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4639
4640 if (error != 0) {
4641 spa_load_failed(spa, "unable to retrieve checkpointed "
4642 "uberblock from the MOS config [error=%d]", error);
4643
4644 if (error == ENOENT)
4645 error = ZFS_ERR_NO_CHECKPOINT;
4646
4647 return (error);
4648 }
4649
4650 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
4651 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
4652
4653 /*
4654 * We need to update the txg and timestamp of the checkpointed
4655 * uberblock to be higher than the latest one. This ensures that
4656 * the checkpointed uberblock is selected if we were to close and
4657 * reopen the pool right after we've written it in the vdev labels.
4658 * (also see block comment in vdev_uberblock_compare)
4659 */
4660 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
4661 checkpoint.ub_timestamp = gethrestime_sec();
4662
4663 /*
4664 * Set current uberblock to be the checkpointed uberblock.
4665 */
4666 spa->spa_uberblock = checkpoint;
4667
4668 /*
4669 * If we are doing a normal rewind, then the pool is open for
4670 * writing and we sync the "updated" checkpointed uberblock to
4671 * disk. Once this is done, we've basically rewound the whole
4672 * pool and there is no way back.
4673 *
4674 * There are cases when we don't want to attempt to sync the
4675 * checkpointed uberblock to disk because we are opening a
4676 * pool as read-only. Specifically, verifying the checkpointed
4677 * state with zdb, and importing the checkpointed state to get
4678 * a "preview" of its content.
4679 */
4680 if (spa_writeable(spa)) {
4681 vdev_t *rvd = spa->spa_root_vdev;
4682
4683 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4684 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
4685 int svdcount = 0;
4686 int children = rvd->vdev_children;
29274c9f 4687 int c0 = random_in_range(children);
d2734cce
SD
4688
4689 for (int c = 0; c < children; c++) {
4690 vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
4691
4692 /* Stop when revisiting the first vdev */
4693 if (c > 0 && svd[0] == vd)
4694 break;
4695
4696 if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
4697 !vdev_is_concrete(vd))
4698 continue;
4699
4700 svd[svdcount++] = vd;
4701 if (svdcount == SPA_SYNC_MIN_VDEVS)
4702 break;
4703 }
4704 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
4705 if (error == 0)
4706 spa->spa_last_synced_guid = rvd->vdev_guid;
4707 spa_config_exit(spa, SCL_ALL, FTAG);
4708
4709 if (error != 0) {
4710 spa_load_failed(spa, "failed to write checkpointed "
4711 "uberblock to the vdev labels [error=%d]", error);
4712 return (error);
4713 }
4714 }
4715
4716 return (0);
4717}
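/*
 * Note that the label sync above intentionally touches only a handful of
 * healthy top-level vdevs (at most SPA_SYNC_MIN_VDEVS), starting from a
 * random child so the same devices are not always chosen, and skipping
 * log and non-concrete vdevs. The eligibility test reduces to the
 * following (eligible() is an illustrative name):
 *
 *	static boolean_t
 *	eligible(vdev_t *vd)
 *	{
 *		return (vd->vdev_ms_array != 0 && !vd->vdev_islog &&
 *		    vdev_is_concrete(vd));
 *	}
 */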
4718
4719static int
4720spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
4721 boolean_t *update_config_cache)
4722{
4723 int error;
4724
4725 /*
4726 * Parse the config for pool, open and validate vdevs,
4727 * select an uberblock, and use that uberblock to open
4728 * the MOS.
4729 */
4730 error = spa_ld_mos_init(spa, type);
4731 if (error != 0)
4732 return (error);
4733
9eb7b46e 4734 /*
6cb8e530
PZ
4735 * Retrieve the trusted config stored in the MOS and use it to create
4736 * a new, exact version of the vdev tree, then reopen all vdevs.
9eb7b46e 4737 */
d2734cce 4738 error = spa_ld_trusted_config(spa, type, B_FALSE);
6cb8e530 4739 if (error == EAGAIN) {
d2734cce
SD
4740 if (update_config_cache != NULL)
4741 *update_config_cache = B_TRUE;
4742
6cb8e530
PZ
4743 /*
4744 * Redo the loading process with the trusted config if it is
4745 * too different from the untrusted config.
4746 */
4747 spa_ld_prepare_for_reload(spa);
d2734cce
SD
4748 spa_load_note(spa, "RELOADING");
4749 error = spa_ld_mos_init(spa, type);
4750 if (error != 0)
4751 return (error);
4752
4753 error = spa_ld_trusted_config(spa, type, B_TRUE);
4754 if (error != 0)
4755 return (error);
4756
6cb8e530 4757 } else if (error != 0) {
9eb7b46e 4758 return (error);
6cb8e530 4759 }
9eb7b46e 4760
d2734cce
SD
4761 return (0);
4762}
4763
4764/*
4765 * Load an existing storage pool, using the config provided. This config
4766 * describes which vdevs are part of the pool and is later validated against
4767 * partial configs present in each vdev's label and an entire copy of the
4768 * config stored in the MOS.
4769 */
4770static int
a926aab9 4771spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
d2734cce
SD
4772{
4773 int error = 0;
4774 boolean_t missing_feat_write = B_FALSE;
4775 boolean_t checkpoint_rewind =
4776 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4777 boolean_t update_config_cache = B_FALSE;
4778
4779 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4780 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
4781
4782 spa_load_note(spa, "LOADING");
4783
4784 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
4785 if (error != 0)
4786 return (error);
4787
4788 /*
4789 * If we are rewinding to the checkpoint then we need to repeat
4790 * everything we've done so far in this function but this time
4791 * selecting the checkpointed uberblock and using that to open
4792 * the MOS.
4793 */
4794 if (checkpoint_rewind) {
4795 /*
4796 * If we are rewinding to the checkpoint, update the config cache
4797 * anyway.
4798 */
4799 update_config_cache = B_TRUE;
4800
4801 /*
4802 * Extract the checkpointed uberblock from the current MOS
4803 * and use this as the pool's uberblock from now on. If the
4804 * pool is imported as writeable we also write the checkpoint
4805 * uberblock to the labels, making the rewind permanent.
4806 */
4807 error = spa_ld_checkpoint_rewind(spa);
4808 if (error != 0)
4809 return (error);
4810
4811 /*
e1cfd73f 4812 * Redo the loading process again with the
d2734cce
SD
4813 * checkpointed uberblock.
4814 */
4815 spa_ld_prepare_for_reload(spa);
4816 spa_load_note(spa, "LOADING checkpointed uberblock");
4817 error = spa_ld_mos_with_trusted_config(spa, type, NULL);
4818 if (error != 0)
4819 return (error);
4820 }
4821
4822 /*
4823 * Retrieve the checkpoint txg if the pool has a checkpoint.
4824 */
4825 error = spa_ld_read_checkpoint_txg(spa);
4826 if (error != 0)
4827 return (error);
4828
9eb7b46e
PZ
4829 /*
4830 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
4831 * from the pool and their contents were re-mapped to other vdevs. Note
4832 * that everything that we read before this step must have been
4833 * rewritten on concrete vdevs after the last device removal was
4834 * initiated. Otherwise we could be reading from indirect vdevs before
4835 * we have loaded their mappings.
4836 */
4837 error = spa_ld_open_indirect_vdev_metadata(spa);
4838 if (error != 0)
4839 return (error);
4840
4841 /*
4842 * Retrieve the full list of active features from the MOS and check if
4843 * they are all supported.
4844 */
4a0ee12a 4845 error = spa_ld_check_features(spa, &missing_feat_write);
9eb7b46e
PZ
4846 if (error != 0)
4847 return (error);
4848
4849 /*
4850 * Load several special directories from the MOS needed by the dsl_pool
4851 * layer.
4852 */
4853 error = spa_ld_load_special_directories(spa);
4854 if (error != 0)
4855 return (error);
4856
9eb7b46e
PZ
4857 /*
4858 * Retrieve pool properties from the MOS.
4859 */
4860 error = spa_ld_get_props(spa);
4861 if (error != 0)
4862 return (error);
4863
4864 /*
4865 * Retrieve the list of auxiliary devices - cache devices and spares -
4866 * and open them.
4867 */
4868 error = spa_ld_open_aux_vdevs(spa, type);
4869 if (error != 0)
4870 return (error);
4871
4872 /*
4873 * Load the metadata for all vdevs. Also check if unopenable devices
4874 * should be autoreplaced.
4875 */
4a0ee12a 4876 error = spa_ld_load_vdev_metadata(spa);
9eb7b46e
PZ
4877 if (error != 0)
4878 return (error);
4879
4880 error = spa_ld_load_dedup_tables(spa);
4881 if (error != 0)
4882 return (error);
4883
4884 /*
4885 * Verify the logs now to make sure we don't have any unexpected errors
4886 * when we claim log blocks later.
4887 */
4888 error = spa_ld_verify_logs(spa, type, ereport);
4889 if (error != 0)
4890 return (error);
4891
9ae529ec 4892 if (missing_feat_write) {
6cb8e530 4893 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
9ae529ec
CS
4894
4895 /*
4896 * At this point, we know that we can open the pool in
4897 * read-only mode but not read-write mode. We now have enough
4898 * information and can return to userland.
4899 */
9eb7b46e
PZ
4900 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
4901 ENOTSUP));
9ae529ec
CS
4902 }
4903
572e2857 4904 /*
9eb7b46e
PZ
4905 * Traverse the last txgs to make sure the pool was left off in a safe
4906 * state. When performing an extreme rewind, we verify the whole pool,
4907 * which can take a very long time.
572e2857 4908 */
4a0ee12a 4909 error = spa_ld_verify_pool_data(spa);
9eb7b46e
PZ
4910 if (error != 0)
4911 return (error);
572e2857 4912
9eb7b46e
PZ
4913 /*
4914 * Calculate the deflated space for the pool. This must be done before
4915 * we write anything to the pool because we'd need to update the space
4916 * accounting using the deflated sizes.
4917 */
4918 spa_update_dspace(spa);
4919
4920 /*
4921 * We have now retrieved all the information we needed to open the
4922 * pool. If we are importing the pool in read-write mode, a few
4923 * additional steps must be performed to finish the import.
4924 */
6cb8e530 4925 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
428870ff 4926 spa->spa_load_max_txg == UINT64_MAX)) {
6cb8e530
PZ
4927 uint64_t config_cache_txg = spa->spa_config_txg;
4928
4929 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
34dc7c2f 4930
d2734cce
SD
4931 /*
4932 * In case of a checkpoint rewind, log the original txg
4933 * of the checkpointed uberblock.
4934 */
4935 if (checkpoint_rewind) {
4936 spa_history_log_internal(spa, "checkpoint rewind",
4937 NULL, "rewound state to txg=%llu",
4938 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
4939 }
4940
34dc7c2f 4941 /*
9eb7b46e 4942 * Traverse the ZIL and claim all blocks.
34dc7c2f 4943 */
9eb7b46e 4944 spa_ld_claim_log_blocks(spa);
428870ff 4945
9eb7b46e
PZ
4946 /*
4947 * Kick-off the syncing thread.
4948 */
34dc7c2f
BB
4949 spa->spa_sync_on = B_TRUE;
4950 txg_sync_start(spa->spa_dsl_pool);
379ca9cf 4951 mmp_thread_start(spa);
34dc7c2f
BB
4952
4953 /*
428870ff
BB
4954 * Wait for all claims to sync. We sync up to the highest
4955 * claimed log block birth time so that claimed log blocks
4956 * don't appear to be from the future. spa_claim_max_txg
9eb7b46e
PZ
4957 * will have been set for us by ZIL traversal operations
4958 * performed above.
34dc7c2f 4959 */
428870ff 4960 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
34dc7c2f
BB
4961
4962 /*
9eb7b46e
PZ
4963 * Check if we need to request an update of the config. On the
4964 * next sync, we would update the config stored in vdev labels
4965 * and the cachefile (by default /etc/zfs/zpool.cache).
34dc7c2f 4966 */
6cb8e530 4967 spa_ld_check_for_config_update(spa, config_cache_txg,
d2734cce 4968 update_config_cache);
fb5f0bc8
BB
4969
4970 /*
9a49d3f3
BB
4971 * Check if a rebuild was in progress and if so resume it.
4972 * Then check all DTLs to see if anything needs resilvering.
4973 * The resilver will be deferred if a rebuild was started.
fb5f0bc8 4974 */
9a49d3f3
BB
4975 if (vdev_rebuild_active(spa->spa_root_vdev)) {
4976 vdev_rebuild_restart(spa);
4977 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
4978 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
fb5f0bc8 4979 spa_async_request(spa, SPA_ASYNC_RESILVER);
9a49d3f3 4980 }
428870ff 4981
6f1ffb06
MA
4982 /*
4983 * Log the fact that we booted up (so that we can detect if
4984 * we rebooted in the middle of an operation).
4985 */
d5e024cb 4986 spa_history_log_version(spa, "open", NULL);
6f1ffb06 4987
9b2266e3
SD
4988 spa_restart_removal(spa);
4989 spa_spawn_aux_threads(spa);
4990
428870ff
BB
4991 /*
4992 * Delete any inconsistent datasets.
9b2266e3
SD
4993 *
4994 * Note:
4995 * Since we may be issuing deletes for clones here,
4996 * we make sure to do so after we've spawned all the
4997 * auxiliary threads above (from which the livelist
4998 * deletion zthr is part of).
428870ff
BB
4999 */
5000 (void) dmu_objset_find(spa_name(spa),
5001 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
5002
5003 /*
5004 * Clean up any stale temporary dataset userrefs.
5005 */
5006 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
a1d477c2 5007
619f0976
GW
5008 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5009 vdev_initialize_restart(spa->spa_root_vdev);
1b939560
BB
5010 vdev_trim_restart(spa->spa_root_vdev);
5011 vdev_autotrim_restart(spa);
619f0976 5012 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f
BB
5013 }
5014
ca95f70d 5015 spa_import_progress_remove(spa_guid(spa));
77f6826b
GA
5016 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
5017
4a0ee12a
PZ
5018 spa_load_note(spa, "LOADED");
5019
428870ff
BB
5020 return (0);
5021}
34dc7c2f 5022
428870ff 5023static int
6cb8e530 5024spa_load_retry(spa_t *spa, spa_load_state_t state)
428870ff 5025{
da92d5cb 5026 spa_mode_t mode = spa->spa_mode;
572e2857 5027
428870ff
BB
5028 spa_unload(spa);
5029 spa_deactivate(spa);
5030
dea377c0 5031 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
428870ff 5032
572e2857 5033 spa_activate(spa, mode);
428870ff
BB
5034 spa_async_suspend(spa);
5035
4a0ee12a
PZ
5036 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
5037 (u_longlong_t)spa->spa_load_max_txg);
5038
6cb8e530 5039 return (spa_load(spa, state, SPA_IMPORT_EXISTING));
428870ff
BB
5040}
5041
9ae529ec
CS
5042/*
5043 * If spa_load() fails this function will try loading prior txg's. If
5044 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
5045 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
5046 * function will not rewind the pool and will return the same error as
5047 * spa_load().
5048 */
428870ff 5049static int
6cb8e530
PZ
5050spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
5051 int rewind_flags)
428870ff 5052{
9ae529ec 5053 nvlist_t *loadinfo = NULL;
428870ff
BB
5054 nvlist_t *config = NULL;
5055 int load_error, rewind_error;
5056 uint64_t safe_rewind_txg;
5057 uint64_t min_txg;
5058
5059 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
5060 spa->spa_load_max_txg = spa->spa_load_txg;
5061 spa_set_log_state(spa, SPA_LOG_CLEAR);
5062 } else {
5063 spa->spa_load_max_txg = max_request;
dea377c0
MA
5064 if (max_request != UINT64_MAX)
5065 spa->spa_extreme_rewind = B_TRUE;
428870ff
BB
5066 }
5067
6cb8e530 5068 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
428870ff
BB
5069 if (load_error == 0)
5070 return (0);
d2734cce
SD
5071 if (load_error == ZFS_ERR_NO_CHECKPOINT) {
5072 /*
5073 * When attempting checkpoint-rewind on a pool with no
5074 * checkpoint, we should not attempt to load uberblocks
5075 * from previous txgs when spa_load fails.
5076 */
5077 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
ca95f70d 5078 spa_import_progress_remove(spa_guid(spa));
d2734cce
SD
5079 return (load_error);
5080 }
428870ff
BB
5081
5082 if (spa->spa_root_vdev != NULL)
5083 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5084
5085 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
5086 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
5087
5088 if (rewind_flags & ZPOOL_NEVER_REWIND) {
5089 nvlist_free(config);
ca95f70d 5090 spa_import_progress_remove(spa_guid(spa));
428870ff
BB
5091 return (load_error);
5092 }
5093
9ae529ec
CS
5094 if (state == SPA_LOAD_RECOVER) {
5095 /* Price of rolling back is discarding txgs, including log */
428870ff 5096 spa_set_log_state(spa, SPA_LOG_CLEAR);
9ae529ec
CS
5097 } else {
5098 /*
5099 * If we aren't rolling back save the load info from our first
5100 * import attempt so that we can restore it after attempting
5101 * to rewind.
5102 */
5103 loadinfo = spa->spa_load_info;
5104 spa->spa_load_info = fnvlist_alloc();
5105 }
428870ff
BB
5106
5107 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
5108 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
5109 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
5110 TXG_INITIAL : safe_rewind_txg;
5111
5112 /*
5113 * Continue as long as we're finding errors, we're still within
5114 * the acceptable rewind range, and we're still finding uberblocks
5115 */
5116 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
5117 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
5118 if (spa->spa_load_max_txg < safe_rewind_txg)
5119 spa->spa_extreme_rewind = B_TRUE;
6cb8e530 5120 rewind_error = spa_load_retry(spa, state);
428870ff
BB
5121 }
5122
428870ff
BB
5123 spa->spa_extreme_rewind = B_FALSE;
5124 spa->spa_load_max_txg = UINT64_MAX;
5125
5126 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
5127 spa_config_set(spa, config);
ee6370a7 5128 else
5129 nvlist_free(config);
428870ff 5130
9ae529ec
CS
5131 if (state == SPA_LOAD_RECOVER) {
5132 ASSERT3P(loadinfo, ==, NULL);
ca95f70d 5133 spa_import_progress_remove(spa_guid(spa));
9ae529ec
CS
5134 return (rewind_error);
5135 } else {
5136 /* Store the rewind info as part of the initial load info */
5137 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
5138 spa->spa_load_info);
5139
5140 /* Restore the initial load info */
5141 fnvlist_free(spa->spa_load_info);
5142 spa->spa_load_info = loadinfo;
5143
ca95f70d 5144 spa_import_progress_remove(spa_guid(spa));
9ae529ec
CS
5145 return (load_error);
5146 }
34dc7c2f
BB
5147}
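/*
 * Illustrative example (editorial sketch, not part of the original source;
 * assumes the stock TXG_DEFER_SIZE of 2). With the last-synced uberblock at
 * txg 1000, a normal recovery import may only roll back within the deferred
 * window:
 *
 *	spa_load_max_txg = spa_last_ubsync_txg                  = 1000
 *	safe_rewind_txg  = spa_last_ubsync_txg - TXG_DEFER_SIZE =  998
 *	min_txg          = safe_rewind_txg                      =  998
 *
 * so only uberblocks with txgs 998..1000 are retried. With
 * ZPOOL_EXTREME_REWIND ('zpool import -X'), min_txg drops to TXG_INITIAL and
 * spa_extreme_rewind additionally forces the whole-pool verification noted
 * before spa_ld_verify_pool_data() above.
 */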
5148
5149/*
5150 * Pool Open/Import
5151 *
5152 * The import case is identical to an open except that the configuration is sent
5153 * down from userland, instead of grabbed from the configuration cache. For the
5154 * case of an open, the pool configuration will exist in the
5155 * POOL_STATE_UNINITIALIZED state.
5156 *
5157 * The stats information (gen/count/ustats) is used to gather vdev statistics at
5158 * the same time as opening the pool, without having to keep the spa_t around in some
5159 * ambiguous state.
5160 */
5161static int
a926aab9
AZ
5162spa_open_common(const char *pool, spa_t **spapp, const void *tag,
5163 nvlist_t *nvpolicy, nvlist_t **config)
34dc7c2f
BB
5164{
5165 spa_t *spa;
572e2857 5166 spa_load_state_t state = SPA_LOAD_OPEN;
34dc7c2f 5167 int error;
34dc7c2f 5168 int locked = B_FALSE;
526af785 5169 int firstopen = B_FALSE;
34dc7c2f
BB
5170
5171 *spapp = NULL;
5172
5173 /*
5174 * As disgusting as this is, we need to support recursive calls to this
5175 * function because dsl_dir_open() is called during spa_load(), and ends
5176 * up calling spa_open() again. The real fix is to figure out how to
5177 * avoid dsl_dir_open() calling this in the first place.
5178 */
c25b8f99 5179 if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
34dc7c2f
BB
5180 mutex_enter(&spa_namespace_lock);
5181 locked = B_TRUE;
5182 }
5183
5184 if ((spa = spa_lookup(pool)) == NULL) {
5185 if (locked)
5186 mutex_exit(&spa_namespace_lock);
2e528b49 5187 return (SET_ERROR(ENOENT));
34dc7c2f 5188 }
428870ff 5189
34dc7c2f 5190 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
8a393be3 5191 zpool_load_policy_t policy;
428870ff 5192
526af785
PJD
5193 firstopen = B_TRUE;
5194
8a393be3 5195 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
428870ff 5196 &policy);
8a393be3 5197 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
428870ff 5198 state = SPA_LOAD_RECOVER;
34dc7c2f 5199
fb5f0bc8 5200 spa_activate(spa, spa_mode_global);
34dc7c2f 5201
428870ff
BB
5202 if (state != SPA_LOAD_RECOVER)
5203 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
6cb8e530 5204 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
428870ff 5205
4a0ee12a 5206 zfs_dbgmsg("spa_open_common: opening %s", pool);
8a393be3
PZ
5207 error = spa_load_best(spa, state, policy.zlp_txg,
5208 policy.zlp_rewind);
34dc7c2f
BB
5209
5210 if (error == EBADF) {
5211 /*
5212 * If vdev_validate() returns failure (indicated by
5213 * EBADF), it means that one of the vdevs indicates
5214 * that the pool has been exported or destroyed. If
5215 * this is the case, the config cache is out of sync and
5216 * we should remove the pool from the namespace.
5217 */
34dc7c2f
BB
5218 spa_unload(spa);
5219 spa_deactivate(spa);
a1d477c2 5220 spa_write_cachefile(spa, B_TRUE, B_TRUE);
34dc7c2f 5221 spa_remove(spa);
34dc7c2f
BB
5222 if (locked)
5223 mutex_exit(&spa_namespace_lock);
2e528b49 5224 return (SET_ERROR(ENOENT));
34dc7c2f
BB
5225 }
5226
5227 if (error) {
5228 /*
5229 * We can't open the pool, but we still have useful
5230 * information: the state of each vdev after the
5231 * attempted vdev_open(). Return this to the user.
5232 */
572e2857 5233 if (config != NULL && spa->spa_config) {
65ad5d11
AJ
5234 *config = fnvlist_dup(spa->spa_config);
5235 fnvlist_add_nvlist(*config,
572e2857 5236 ZPOOL_CONFIG_LOAD_INFO,
65ad5d11 5237 spa->spa_load_info);
572e2857 5238 }
34dc7c2f
BB
5239 spa_unload(spa);
5240 spa_deactivate(spa);
428870ff 5241 spa->spa_last_open_failed = error;
34dc7c2f
BB
5242 if (locked)
5243 mutex_exit(&spa_namespace_lock);
5244 *spapp = NULL;
5245 return (error);
34dc7c2f 5246 }
34dc7c2f
BB
5247 }
5248
5249 spa_open_ref(spa, tag);
5250
b128c09f 5251 if (config != NULL)
34dc7c2f 5252 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
34dc7c2f 5253
572e2857
BB
5254 /*
5255 * If we've recovered the pool, pass back any information we
5256 * gathered while doing the load.
5257 */
5258 if (state == SPA_LOAD_RECOVER) {
65ad5d11
AJ
5259 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
5260 spa->spa_load_info);
572e2857
BB
5261 }
5262
428870ff
BB
5263 if (locked) {
5264 spa->spa_last_open_failed = 0;
5265 spa->spa_last_ubsync_txg = 0;
5266 spa->spa_load_txg = 0;
5267 mutex_exit(&spa_namespace_lock);
5268 }
5269
526af785 5270 if (firstopen)
ec213971 5271 zvol_create_minors_recursive(spa_name(spa));
526af785 5272
428870ff
BB
5273 *spapp = spa;
5274
34dc7c2f
BB
5275 return (0);
5276}
5277
428870ff 5278int
a926aab9
AZ
5279spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
5280 nvlist_t *policy, nvlist_t **config)
428870ff
BB
5281{
5282 return (spa_open_common(name, spapp, tag, policy, config));
5283}
5284
34dc7c2f 5285int
a926aab9 5286spa_open(const char *name, spa_t **spapp, const void *tag)
34dc7c2f 5287{
428870ff 5288 return (spa_open_common(name, spapp, tag, NULL, NULL));
34dc7c2f
BB
5289}
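/*
 * Usage sketch (editorial, not part of the original source; the pool name
 * "tank" is hypothetical). Callers hold a pool with spa_open() and release
 * it with spa_close() using the same tag, typically FTAG, e.g.:
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open("tank", &spa, FTAG)) != 0)
 *		return (error);
 *	... use the pool while the open reference is held ...
 *	spa_close(spa, FTAG);
 *
 * spa_open_rewind() is the same call with an explicit load policy nvlist,
 * which is how rewind requests reach spa_load_best() above.
 */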
5290
5291/*
5292 * Look up the given spa_t, incrementing the inject count in the process,
5293 * preventing it from being exported or destroyed.
5294 */
5295spa_t *
5296spa_inject_addref(char *name)
5297{
5298 spa_t *spa;
5299
5300 mutex_enter(&spa_namespace_lock);
5301 if ((spa = spa_lookup(name)) == NULL) {
5302 mutex_exit(&spa_namespace_lock);
5303 return (NULL);
5304 }
5305 spa->spa_inject_ref++;
5306 mutex_exit(&spa_namespace_lock);
5307
5308 return (spa);
5309}
5310
5311void
5312spa_inject_delref(spa_t *spa)
5313{
5314 mutex_enter(&spa_namespace_lock);
5315 spa->spa_inject_ref--;
5316 mutex_exit(&spa_namespace_lock);
5317}
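/*
 * Editorial note (not part of the original source): spa_inject_addref() and
 * spa_inject_delref() are intended to bracket the lifetime of a fault
 * injection handler, roughly:
 *
 *	spa_t *spa = spa_inject_addref(name);
 *	if (spa == NULL)
 *		return (SET_ERROR(ENOENT));
 *	... register the handler, keeping spa referenced ...
 *	spa_inject_delref(spa);		(when the handler is removed)
 *
 * While spa_inject_ref is nonzero, spa_export_common() below fails the
 * export/destroy with EBUSY.
 */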
5318
5319/*
5320 * Add spares device information to the nvlist.
5321 */
5322static void
5323spa_add_spares(spa_t *spa, nvlist_t *config)
5324{
5325 nvlist_t **spares;
5326 uint_t i, nspares;
5327 nvlist_t *nvroot;
5328 uint64_t guid;
5329 vdev_stat_t *vs;
5330 uint_t vsc;
5331 uint64_t pool;
5332
9babb374
BB
5333 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
5334
34dc7c2f
BB
5335 if (spa->spa_spares.sav_count == 0)
5336 return;
5337
65ad5d11
AJ
5338 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
5339 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5340 ZPOOL_CONFIG_SPARES, &spares, &nspares));
34dc7c2f 5341 if (nspares != 0) {
795075e6
PD
5342 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5343 (const nvlist_t * const *)spares, nspares);
65ad5d11
AJ
5344 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5345 &spares, &nspares));
34dc7c2f
BB
5346
5347 /*
5348 * Go through and find any spares which have since been
5349 * repurposed as an active spare. If this is the case, update
5350 * their status appropriately.
5351 */
5352 for (i = 0; i < nspares; i++) {
65ad5d11
AJ
5353 guid = fnvlist_lookup_uint64(spares[i],
5354 ZPOOL_CONFIG_GUID);
b128c09f
BB
5355 if (spa_spare_exists(guid, &pool, NULL) &&
5356 pool != 0ULL) {
65ad5d11
AJ
5357 VERIFY0(nvlist_lookup_uint64_array(spares[i],
5358 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs,
5359 &vsc));
34dc7c2f
BB
5360 vs->vs_state = VDEV_STATE_CANT_OPEN;
5361 vs->vs_aux = VDEV_AUX_SPARED;
5362 }
5363 }
5364 }
5365}
5366
5367/*
5368 * Add l2cache device information to the nvlist, including vdev stats.
5369 */
5370static void
5371spa_add_l2cache(spa_t *spa, nvlist_t *config)
5372{
5373 nvlist_t **l2cache;
5374 uint_t i, j, nl2cache;
5375 nvlist_t *nvroot;
5376 uint64_t guid;
5377 vdev_t *vd;
5378 vdev_stat_t *vs;
5379 uint_t vsc;
5380
9babb374
BB
5381 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
5382
34dc7c2f
BB
5383 if (spa->spa_l2cache.sav_count == 0)
5384 return;
5385
65ad5d11
AJ
5386 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
5387 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5388 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
34dc7c2f 5389 if (nl2cache != 0) {
795075e6
PD
5390 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5391 (const nvlist_t * const *)l2cache, nl2cache);
65ad5d11
AJ
5392 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5393 &l2cache, &nl2cache));
34dc7c2f
BB
5394
5395 /*
5396 * Update level 2 cache device stats.
5397 */
5398
5399 for (i = 0; i < nl2cache; i++) {
65ad5d11
AJ
5400 guid = fnvlist_lookup_uint64(l2cache[i],
5401 ZPOOL_CONFIG_GUID);
34dc7c2f
BB
5402
5403 vd = NULL;
5404 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
5405 if (guid ==
5406 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
5407 vd = spa->spa_l2cache.sav_vdevs[j];
5408 break;
5409 }
5410 }
5411 ASSERT(vd != NULL);
5412
65ad5d11
AJ
5413 VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
5414 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
34dc7c2f 5415 vdev_get_stats(vd, vs);
193a37cb
TH
5416 vdev_config_generate_stats(vd, l2cache[i]);
5417
34dc7c2f
BB
5418 }
5419 }
34dc7c2f
BB
5420}
5421
9ae529ec 5422static void
417104bd 5423spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
9ae529ec 5424{
9ae529ec
CS
5425 zap_cursor_t zc;
5426 zap_attribute_t za;
5427
9ae529ec
CS
5428 if (spa->spa_feat_for_read_obj != 0) {
5429 for (zap_cursor_init(&zc, spa->spa_meta_objset,
5430 spa->spa_feat_for_read_obj);
5431 zap_cursor_retrieve(&zc, &za) == 0;
5432 zap_cursor_advance(&zc)) {
5433 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
5434 za.za_num_integers == 1);
417104bd 5435 VERIFY0(nvlist_add_uint64(features, za.za_name,
9ae529ec
CS
5436 za.za_first_integer));
5437 }
5438 zap_cursor_fini(&zc);
5439 }
5440
5441 if (spa->spa_feat_for_write_obj != 0) {
5442 for (zap_cursor_init(&zc, spa->spa_meta_objset,
5443 spa->spa_feat_for_write_obj);
5444 zap_cursor_retrieve(&zc, &za) == 0;
5445 zap_cursor_advance(&zc)) {
5446 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
5447 za.za_num_integers == 1);
417104bd 5448 VERIFY0(nvlist_add_uint64(features, za.za_name,
9ae529ec
CS
5449 za.za_first_integer));
5450 }
5451 zap_cursor_fini(&zc);
5452 }
417104bd
NB
5453}
5454
5455static void
5456spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
5457{
5458 int i;
5459
5460 for (i = 0; i < SPA_FEATURES; i++) {
5461 zfeature_info_t feature = spa_feature_table[i];
5462 uint64_t refcount;
5463
5464 if (feature_get_refcount(spa, &feature, &refcount) != 0)
5465 continue;
5466
5467 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
5468 }
5469}
5470
5471/*
5472 * Store a list of pool features and their reference counts in the
5473 * config.
5474 *
5475 * The first time this is called on a spa, allocate a new nvlist, fetch
5476 * the pool features and reference counts from disk, then save the list
5477 * in the spa. In subsequent calls on the same spa use the saved nvlist
5478 * and refresh its values from the cached reference counts. This
5479 * ensures we don't block here on I/O on a suspended pool so 'zpool
5480 * clear' can resume the pool.
5481 */
5482static void
5483spa_add_feature_stats(spa_t *spa, nvlist_t *config)
5484{
4eb30c68 5485 nvlist_t *features;
417104bd
NB
5486
5487 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
5488
4eb30c68
NB
5489 mutex_enter(&spa->spa_feat_stats_lock);
5490 features = spa->spa_feat_stats;
5491
417104bd
NB
5492 if (features != NULL) {
5493 spa_feature_stats_from_cache(spa, features);
5494 } else {
5495 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
5496 spa->spa_feat_stats = features;
5497 spa_feature_stats_from_disk(spa, features);
5498 }
9ae529ec 5499
417104bd
NB
5500 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
5501 features));
4eb30c68
NB
5502
5503 mutex_exit(&spa->spa_feat_stats_lock);
9ae529ec
CS
5504}
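/*
 * Editorial note (not part of the original source): the nvlist stored under
 * ZPOOL_CONFIG_FEATURE_STATS maps feature guids to reference counts, e.g.
 * conceptually:
 *
 *	feature_stats = {
 *		"com.delphix:hole_birth"    = 1
 *		"com.delphix:embedded_data" = 1
 *		"org.open-zfs:large_blocks" = 0
 *	}
 *
 * where a refcount of 0 means the feature is enabled but not currently in
 * use; the exact keys depend on spa_feature_table and the pool.
 */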
5505
34dc7c2f 5506int
9ae529ec
CS
5507spa_get_stats(const char *name, nvlist_t **config,
5508 char *altroot, size_t buflen)
34dc7c2f
BB
5509{
5510 int error;
5511 spa_t *spa;
5512
5513 *config = NULL;
428870ff 5514 error = spa_open_common(name, &spa, FTAG, NULL, config);
34dc7c2f 5515
9babb374
BB
5516 if (spa != NULL) {
5517 /*
5518 * This still leaves a window of inconsistency where the spares
5519 * or l2cache devices could change and the config would be
5520 * self-inconsistent.
5521 */
5522 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
34dc7c2f 5523
9babb374 5524 if (*config != NULL) {
572e2857
BB
5525 uint64_t loadtimes[2];
5526
5527 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
5528 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
65ad5d11
AJ
5529 fnvlist_add_uint64_array(*config,
5530 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);
572e2857 5531
65ad5d11 5532 fnvlist_add_uint64(*config,
9babb374 5533 ZPOOL_CONFIG_ERRCOUNT,
65ad5d11 5534 spa_get_errlog_size(spa));
9babb374 5535
cec3a0a1 5536 if (spa_suspended(spa)) {
65ad5d11 5537 fnvlist_add_uint64(*config,
9babb374 5538 ZPOOL_CONFIG_SUSPENDED,
65ad5d11
AJ
5539 spa->spa_failmode);
5540 fnvlist_add_uint64(*config,
cec3a0a1 5541 ZPOOL_CONFIG_SUSPENDED_REASON,
65ad5d11 5542 spa->spa_suspended);
cec3a0a1 5543 }
b128c09f 5544
9babb374
BB
5545 spa_add_spares(spa, *config);
5546 spa_add_l2cache(spa, *config);
9ae529ec 5547 spa_add_feature_stats(spa, *config);
9babb374 5548 }
34dc7c2f
BB
5549 }
5550
5551 /*
5552 * We want to get the alternate root even for faulted pools, so we cheat
5553 * and call spa_lookup() directly.
5554 */
5555 if (altroot) {
5556 if (spa == NULL) {
5557 mutex_enter(&spa_namespace_lock);
5558 spa = spa_lookup(name);
5559 if (spa)
5560 spa_altroot(spa, altroot, buflen);
5561 else
5562 altroot[0] = '\0';
5563 spa = NULL;
5564 mutex_exit(&spa_namespace_lock);
5565 } else {
5566 spa_altroot(spa, altroot, buflen);
5567 }
5568 }
5569
9babb374
BB
5570 if (spa != NULL) {
5571 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f 5572 spa_close(spa, FTAG);
9babb374 5573 }
34dc7c2f
BB
5574
5575 return (error);
5576}
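/*
 * Usage sketch (editorial, not part of the original source; "tank" is a
 * hypothetical pool name). A caller might use spa_get_stats() roughly as:
 *
 *	nvlist_t *config = NULL;
 *	char altroot[MAXPATHLEN];
 *	uint64_t errcount;
 *	int error;
 *
 *	error = spa_get_stats("tank", &config, altroot, sizeof (altroot));
 *	if (config != NULL) {
 *		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
 *		    &errcount);
 *		nvlist_free(config);
 *	}
 *
 * Note that *config may be returned even on error (carrying the per-vdev
 * state gathered by the failed open), which is why callers check the config
 * pointer as well as the return value.
 */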
5577
5578/*
5579 * Validate that the auxiliary device array is well formed. We must have an
5580 * array of nvlists, each of which describes a valid leaf vdev. If this is an
5581 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
5582 * specified, as long as they are well-formed.
5583 */
5584static int
5585spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
5586 spa_aux_vdev_t *sav, const char *config, uint64_t version,
5587 vdev_labeltype_t label)
5588{
5589 nvlist_t **dev;
5590 uint_t i, ndev;
5591 vdev_t *vd;
5592 int error;
5593
b128c09f
BB
5594 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5595
34dc7c2f
BB
5596 /*
5597 * It's acceptable to have no devs specified.
5598 */
5599 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
5600 return (0);
5601
5602 if (ndev == 0)
2e528b49 5603 return (SET_ERROR(EINVAL));
34dc7c2f
BB
5604
5605 /*
5606 * Make sure the pool is formatted with a version that supports this
5607 * device type.
5608 */
5609 if (spa_version(spa) < version)
2e528b49 5610 return (SET_ERROR(ENOTSUP));
34dc7c2f
BB
5611
5612 /*
5613 * Set the pending device list so we correctly handle device in-use
5614 * checking.
5615 */
5616 sav->sav_pending = dev;
5617 sav->sav_npending = ndev;
5618
5619 for (i = 0; i < ndev; i++) {
5620 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
5621 mode)) != 0)
5622 goto out;
5623
5624 if (!vd->vdev_ops->vdev_op_leaf) {
5625 vdev_free(vd);
2e528b49 5626 error = SET_ERROR(EINVAL);
34dc7c2f
BB
5627 goto out;
5628 }
5629
34dc7c2f
BB
5630 vd->vdev_top = vd;
5631
5632 if ((error = vdev_open(vd)) == 0 &&
5633 (error = vdev_label_init(vd, crtxg, label)) == 0) {
65ad5d11
AJ
5634 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
5635 vd->vdev_guid);
34dc7c2f
BB
5636 }
5637
5638 vdev_free(vd);
5639
5640 if (error &&
5641 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
5642 goto out;
5643 else
5644 error = 0;
5645 }
5646
5647out:
5648 sav->sav_pending = NULL;
5649 sav->sav_npending = 0;
5650 return (error);
5651}
5652
5653static int
5654spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
5655{
5656 int error;
5657
b128c09f
BB
5658 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5659
34dc7c2f
BB
5660 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
5661 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
5662 VDEV_LABEL_SPARE)) != 0) {
5663 return (error);
5664 }
5665
5666 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
5667 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
5668 VDEV_LABEL_L2CACHE));
5669}
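/*
 * Editorial note (not part of the original source): the nvroot consumed
 * above is the same config tree userland builds for pool create/add, so the
 * aux-device arrays look conceptually like (paths hypothetical):
 *
 *	nvroot = {
 *		"spares"  = [ { "type" = "disk", "path" = "/dev/..." }, ... ]
 *		"l2cache" = [ { "type" = "disk", "path" = "/dev/..." }, ... ]
 *	}
 *
 * Each entry must parse as a leaf vdev; entries that open and label
 * successfully get a ZPOOL_CONFIG_GUID added and are labeled
 * VDEV_LABEL_SPARE or VDEV_LABEL_L2CACHE respectively.
 */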
5670
5671static void
5672spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
5673 const char *config)
5674{
5675 int i;
5676
5677 if (sav->sav_config != NULL) {
5678 nvlist_t **olddevs;
5679 uint_t oldndevs;
5680 nvlist_t **newdevs;
5681
5682 /*
4e33ba4c 5683 * Generate new dev list by concatenating with the
34dc7c2f
BB
5684 * current dev list.
5685 */
65ad5d11
AJ
5686 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
5687 &olddevs, &oldndevs));
34dc7c2f
BB
5688
5689 newdevs = kmem_alloc(sizeof (void *) *
79c76d5b 5690 (ndevs + oldndevs), KM_SLEEP);
34dc7c2f 5691 for (i = 0; i < oldndevs; i++)
65ad5d11 5692 newdevs[i] = fnvlist_dup(olddevs[i]);
34dc7c2f 5693 for (i = 0; i < ndevs; i++)
65ad5d11 5694 newdevs[i + oldndevs] = fnvlist_dup(devs[i]);
34dc7c2f 5695
65ad5d11 5696 fnvlist_remove(sav->sav_config, config);
34dc7c2f 5697
795075e6
PD
5698 fnvlist_add_nvlist_array(sav->sav_config, config,
5699 (const nvlist_t * const *)newdevs, ndevs + oldndevs);
34dc7c2f
BB
5700 for (i = 0; i < oldndevs + ndevs; i++)
5701 nvlist_free(newdevs[i]);
5702 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
5703 } else {
5704 /*
5705 * Generate a new dev list.
5706 */
65ad5d11 5707 sav->sav_config = fnvlist_alloc();
795075e6
PD
5708 fnvlist_add_nvlist_array(sav->sav_config, config,
5709 (const nvlist_t * const *)devs, ndevs);
34dc7c2f
BB
5710 }
5711}
5712
5713/*
5714 * Stop and drop level 2 ARC devices
5715 */
5716void
5717spa_l2cache_drop(spa_t *spa)
5718{
5719 vdev_t *vd;
5720 int i;
5721 spa_aux_vdev_t *sav = &spa->spa_l2cache;
5722
5723 for (i = 0; i < sav->sav_count; i++) {
5724 uint64_t pool;
5725
5726 vd = sav->sav_vdevs[i];
5727 ASSERT(vd != NULL);
5728
fb5f0bc8
BB
5729 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
5730 pool != 0ULL && l2arc_vdev_present(vd))
34dc7c2f 5731 l2arc_remove_vdev(vd);
34dc7c2f
BB
5732 }
5733}
5734
b5256303
TC
5735/*
5736 * Verify encryption parameters for spa creation. If we are encrypting, we must
5737 * have the encryption feature flag enabled.
5738 */
5739static int
5740spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
5741 boolean_t has_encryption)
5742{
5743 if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
5744 dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
5745 !has_encryption)
5746 return (SET_ERROR(ENOTSUP));
5747
1fff937a 5748 return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
b5256303
TC
5749}
5750
34dc7c2f
BB
5751/*
5752 * Pool Creation
5753 */
5754int
5755spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
b5256303 5756 nvlist_t *zplprops, dsl_crypto_params_t *dcp)
34dc7c2f
BB
5757{
5758 spa_t *spa;
5759 char *altroot = NULL;
5760 vdev_t *rvd;
5761 dsl_pool_t *dp;
5762 dmu_tx_t *tx;
9babb374 5763 int error = 0;
34dc7c2f
BB
5764 uint64_t txg = TXG_INITIAL;
5765 nvlist_t **spares, **l2cache;
5766 uint_t nspares, nl2cache;
b2255edc 5767 uint64_t version, obj, ndraid = 0;
9ae529ec 5768 boolean_t has_features;
b5256303 5769 boolean_t has_encryption;
715c996d 5770 boolean_t has_allocclass;
b5256303
TC
5771 spa_feature_t feat;
5772 char *feat_name;
83e9986f
RY
5773 char *poolname;
5774 nvlist_t *nvl;
5775
cc99f275
DB
5776 if (props == NULL ||
5777 nvlist_lookup_string(props, "tname", &poolname) != 0)
83e9986f 5778 poolname = (char *)pool;
34dc7c2f
BB
5779
5780 /*
5781 * If this pool already exists, return failure.
5782 */
5783 mutex_enter(&spa_namespace_lock);
83e9986f 5784 if (spa_lookup(poolname) != NULL) {
34dc7c2f 5785 mutex_exit(&spa_namespace_lock);
2e528b49 5786 return (SET_ERROR(EEXIST));
34dc7c2f
BB
5787 }
5788
5789 /*
5790 * Allocate a new spa_t structure.
5791 */
83e9986f
RY
5792 nvl = fnvlist_alloc();
5793 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
34dc7c2f
BB
5794 (void) nvlist_lookup_string(props,
5795 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
83e9986f
RY
5796 spa = spa_add(poolname, nvl, altroot);
5797 fnvlist_free(nvl);
fb5f0bc8 5798 spa_activate(spa, spa_mode_global);
34dc7c2f 5799
34dc7c2f 5800 if (props && (error = spa_prop_validate(spa, props))) {
34dc7c2f
BB
5801 spa_deactivate(spa);
5802 spa_remove(spa);
b128c09f 5803 mutex_exit(&spa_namespace_lock);
34dc7c2f
BB
5804 return (error);
5805 }
5806
83e9986f
RY
5807 /*
5808 * Temporary pool names should never be written to disk.
5809 */
5810 if (poolname != pool)
5811 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
5812
9ae529ec 5813 has_features = B_FALSE;
b5256303 5814 has_encryption = B_FALSE;
715c996d 5815 has_allocclass = B_FALSE;
1c27024e 5816 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
9ae529ec 5817 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
b5256303 5818 if (zpool_prop_feature(nvpair_name(elem))) {
9ae529ec 5819 has_features = B_TRUE;
b5256303
TC
5820
5821 feat_name = strchr(nvpair_name(elem), '@') + 1;
5822 VERIFY0(zfeature_lookup_name(feat_name, &feat));
5823 if (feat == SPA_FEATURE_ENCRYPTION)
5824 has_encryption = B_TRUE;
715c996d 5825 if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
5826 has_allocclass = B_TRUE;
b5256303
TC
5827 }
5828 }
5829
5830 /* verify encryption params, if they were provided */
5831 if (dcp != NULL) {
5832 error = spa_create_check_encryption_params(dcp, has_encryption);
5833 if (error != 0) {
5834 spa_deactivate(spa);
5835 spa_remove(spa);
5836 mutex_exit(&spa_namespace_lock);
5837 return (error);
5838 }
9ae529ec 5839 }
c24fa4b1 5840 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
715c996d 5841 spa_deactivate(spa);
5842 spa_remove(spa);
5843 mutex_exit(&spa_namespace_lock);
5844 return (ENOTSUP);
5845 }
9ae529ec
CS
5846
5847 if (has_features || nvlist_lookup_uint64(props,
5848 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
34dc7c2f 5849 version = SPA_VERSION;
9ae529ec
CS
5850 }
5851 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
428870ff
BB
5852
5853 spa->spa_first_txg = txg;
5854 spa->spa_uberblock.ub_txg = txg - 1;
34dc7c2f
BB
5855 spa->spa_uberblock.ub_version = version;
5856 spa->spa_ubsync = spa->spa_uberblock;
3dfb57a3 5857 spa->spa_load_state = SPA_LOAD_CREATE;
a1d477c2
MA
5858 spa->spa_removing_phys.sr_state = DSS_NONE;
5859 spa->spa_removing_phys.sr_removing_vdev = -1;
5860 spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
944a3724 5861 spa->spa_indirect_vdevs_loaded = B_TRUE;
34dc7c2f 5862
9babb374
BB
5863 /*
5864 * Create "The Godfather" zio to hold all async IOs
5865 */
e022864d
MA
5866 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
5867 KM_SLEEP);
1c27024e 5868 for (int i = 0; i < max_ncpus; i++) {
e022864d
MA
5869 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
5870 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
5871 ZIO_FLAG_GODFATHER);
5872 }
9babb374 5873
34dc7c2f
BB
5874 /*
5875 * Create the root vdev.
5876 */
b128c09f 5877 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
5878
5879 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
5880
5881 ASSERT(error != 0 || rvd != NULL);
5882 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
5883
5884 if (error == 0 && !zfs_allocatable_devs(nvroot))
2e528b49 5885 error = SET_ERROR(EINVAL);
34dc7c2f
BB
5886
5887 if (error == 0 &&
5888 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
b2255edc
BB
5889 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
5890 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
cc99f275
DB
5891 /*
5892 * Instantiate the metaslab groups (this will dirty the vdevs);
5893 * we can no longer error exit past this point.
5894 */
5895 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
5896 vdev_t *vd = rvd->vdev_child[c];
5897
5898 vdev_metaslab_set_size(vd);
5899 vdev_expand(vd, txg);
9babb374 5900 }
34dc7c2f
BB
5901 }
5902
b128c09f 5903 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
5904
5905 if (error != 0) {
5906 spa_unload(spa);
5907 spa_deactivate(spa);
5908 spa_remove(spa);
5909 mutex_exit(&spa_namespace_lock);
5910 return (error);
5911 }
5912
5913 /*
5914 * Get the list of spares, if specified.
5915 */
5916 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5917 &spares, &nspares) == 0) {
65ad5d11
AJ
5918 spa->spa_spares.sav_config = fnvlist_alloc();
5919 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
795075e6
PD
5920 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
5921 nspares);
b128c09f 5922 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 5923 spa_load_spares(spa);
b128c09f 5924 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
5925 spa->spa_spares.sav_sync = B_TRUE;
5926 }
5927
5928 /*
5929 * Get the list of level 2 cache devices, if specified.
5930 */
5931 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5932 &l2cache, &nl2cache) == 0) {
795075e6
PD
5933 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
5934 NV_UNIQUE_NAME, KM_SLEEP));
65ad5d11 5935 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
795075e6
PD
5936 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
5937 nl2cache);
b128c09f 5938 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 5939 spa_load_l2cache(spa);
b128c09f 5940 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
5941 spa->spa_l2cache.sav_sync = B_TRUE;
5942 }
5943
9ae529ec 5944 spa->spa_is_initializing = B_TRUE;
b5256303 5945 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
9ae529ec 5946 spa->spa_is_initializing = B_FALSE;
34dc7c2f 5947
428870ff
BB
5948 /*
5949 * Create DDTs (dedup tables).
5950 */
5951 ddt_create(spa);
5952
5953 spa_update_dspace(spa);
5954
34dc7c2f
BB
5955 tx = dmu_tx_create_assigned(dp, txg);
5956
d5e024cb
BB
5957 /*
5958 * Create the pool's history object.
5959 */
5960 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
5961 spa_history_create_obj(spa, tx);
5962
5963 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
5964 spa_history_log_version(spa, "create", tx);
5965
34dc7c2f
BB
5966 /*
5967 * Create the pool config object.
5968 */
5969 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
b128c09f 5970 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
34dc7c2f
BB
5971 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
5972
5973 if (zap_add(spa->spa_meta_objset,
5974 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
5975 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
5976 cmn_err(CE_PANIC, "failed to add pool config");
5977 }
5978
428870ff
BB
5979 if (zap_add(spa->spa_meta_objset,
5980 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
5981 sizeof (uint64_t), 1, &version, tx) != 0) {
5982 cmn_err(CE_PANIC, "failed to add pool version");
5983 }
5984
34dc7c2f
BB
5985 /* Newly created pools with the right version are always deflated. */
5986 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
5987 spa->spa_deflate = TRUE;
5988 if (zap_add(spa->spa_meta_objset,
5989 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
5990 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
5991 cmn_err(CE_PANIC, "failed to add deflate");
5992 }
5993 }
5994
5995 /*
428870ff 5996 * Create the deferred-free bpobj. Turn off compression
34dc7c2f
BB
5997 * because sync-to-convergence takes longer if the blocksize
5998 * keeps changing.
5999 */
428870ff
BB
6000 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
6001 dmu_object_set_compress(spa->spa_meta_objset, obj,
34dc7c2f 6002 ZIO_COMPRESS_OFF, tx);
34dc7c2f 6003 if (zap_add(spa->spa_meta_objset,
428870ff
BB
6004 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
6005 sizeof (uint64_t), 1, &obj, tx) != 0) {
6006 cmn_err(CE_PANIC, "failed to add bpobj");
34dc7c2f 6007 }
428870ff
BB
6008 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
6009 spa->spa_meta_objset, obj));
34dc7c2f 6010
3c67d83a
TH
6011 /*
6012 * Generate some random noise for salted checksums to operate on.
6013 */
6014 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
6015 sizeof (spa->spa_cksum_salt.zcs_bytes));
6016
34dc7c2f
BB
6017 /*
6018 * Set pool properties.
6019 */
6020 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
6021 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
6022 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
9babb374 6023 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
379ca9cf 6024 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
1b939560 6025 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
428870ff 6026
d164b209
BB
6027 if (props != NULL) {
6028 spa_configfile_set(spa, props, B_FALSE);
13fe0198 6029 spa_sync_props(props, tx);
d164b209 6030 }
34dc7c2f 6031
b2255edc
BB
6032 for (int i = 0; i < ndraid; i++)
6033 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
6034
34dc7c2f
BB
6035 dmu_tx_commit(tx);
6036
6037 spa->spa_sync_on = B_TRUE;
b5256303 6038 txg_sync_start(dp);
379ca9cf 6039 mmp_thread_start(spa);
b5256303 6040 txg_wait_synced(dp, txg);
34dc7c2f 6041
9d5b5245
SD
6042 spa_spawn_aux_threads(spa);
6043
a1d477c2 6044 spa_write_cachefile(spa, B_FALSE, B_TRUE);
34dc7c2f 6045
0c66c32d
JG
6046 /*
6047 * Don't count references from objsets that are already closed
6048 * and are making their way through the eviction process.
6049 */
6050 spa_evicting_os_wait(spa);
424fd7c3 6051 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
3dfb57a3 6052 spa->spa_load_state = SPA_LOAD_NONE;
b128c09f 6053
4759342a
JL
6054 spa_import_os(spa);
6055
d164b209
BB
6056 mutex_exit(&spa_namespace_lock);
6057
34dc7c2f
BB
6058 return (0);
6059}
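/*
 * Usage sketch (editorial, not part of the original source; the pool name
 * "tank" and device path "/dev/sda" are hypothetical). Pool creation is
 * normally driven from the ioctl layer, which hands spa_create() a vdev
 * tree and property list built in userland; a minimal in-kernel caller
 * would look roughly like:
 *
 *	nvlist_t *disk = fnvlist_alloc();
 *	fnvlist_add_string(disk, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
 *	fnvlist_add_string(disk, ZPOOL_CONFIG_PATH, "/dev/sda");
 *
 *	nvlist_t *nvroot = fnvlist_alloc();
 *	fnvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
 *	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 *	    (const nvlist_t * const *)&disk, 1);
 *
 *	int error = spa_create("tank", nvroot, NULL, NULL, NULL);
 *
 *	fnvlist_free(disk);
 *	fnvlist_free(nvroot);
 *
 * The optional "tname" property supplies a temporary pool name, as handled
 * at the top of spa_create() above.
 */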
6060
9babb374
BB
6061/*
6062 * Import a non-root pool into the system.
6063 */
6064int
13fe0198 6065spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
34dc7c2f
BB
6066{
6067 spa_t *spa;
6068 char *altroot = NULL;
428870ff 6069 spa_load_state_t state = SPA_LOAD_IMPORT;
8a393be3 6070 zpool_load_policy_t policy;
da92d5cb 6071 spa_mode_t mode = spa_mode_global;
572e2857 6072 uint64_t readonly = B_FALSE;
9babb374 6073 int error;
34dc7c2f
BB
6074 nvlist_t *nvroot;
6075 nvlist_t **spares, **l2cache;
6076 uint_t nspares, nl2cache;
34dc7c2f
BB
6077
6078 /*
6079 * If a pool with this name exists, return failure.
6080 */
6081 mutex_enter(&spa_namespace_lock);
428870ff 6082 if (spa_lookup(pool) != NULL) {
9babb374 6083 mutex_exit(&spa_namespace_lock);
2e528b49 6084 return (SET_ERROR(EEXIST));
34dc7c2f
BB
6085 }
6086
6087 /*
6088 * Create and initialize the spa structure.
6089 */
6090 (void) nvlist_lookup_string(props,
6091 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
572e2857
BB
6092 (void) nvlist_lookup_uint64(props,
6093 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
6094 if (readonly)
da92d5cb 6095 mode = SPA_MODE_READ;
428870ff 6096 spa = spa_add(pool, config, altroot);
572e2857
BB
6097 spa->spa_import_flags = flags;
6098
6099 /*
6100 * Verbatim import - Take a pool and insert it into the namespace
6101 * as if it had been loaded at boot.
6102 */
6103 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
6104 if (props != NULL)
6105 spa_configfile_set(spa, props, B_FALSE);
6106
a1d477c2 6107 spa_write_cachefile(spa, B_FALSE, B_TRUE);
12fa0466 6108 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
4a0ee12a 6109 zfs_dbgmsg("spa_import: verbatim import of %s", pool);
572e2857 6110 mutex_exit(&spa_namespace_lock);
572e2857
BB
6111 return (0);
6112 }
6113
6114 spa_activate(spa, mode);
34dc7c2f 6115
9babb374
BB
6116 /*
6117 * Don't start async tasks until we know everything is healthy.
6118 */
6119 spa_async_suspend(spa);
b128c09f 6120
8a393be3
PZ
6121 zpool_get_load_policy(config, &policy);
6122 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
572e2857
BB
6123 state = SPA_LOAD_RECOVER;
6124
6cb8e530 6125 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
572e2857 6126
6cb8e530
PZ
6127 if (state != SPA_LOAD_RECOVER) {
6128 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
6129 zfs_dbgmsg("spa_import: importing %s", pool);
6130 } else {
6131 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
8a393be3 6132 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
6cb8e530 6133 }
8a393be3 6134 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
428870ff
BB
6135
6136 /*
572e2857
BB
6137 * Propagate anything learned while loading the pool and pass it
6138 * back to caller (i.e. rewind info, missing devices, etc).
428870ff 6139 */
65ad5d11 6140 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
34dc7c2f 6141
b128c09f 6142 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6143 /*
9babb374
BB
6144 * Toss any existing sparelist, as it doesn't have any validity
6145 * anymore, and conflicts with spa_has_spare().
34dc7c2f 6146 */
9babb374 6147 if (spa->spa_spares.sav_config) {
34dc7c2f
BB
6148 nvlist_free(spa->spa_spares.sav_config);
6149 spa->spa_spares.sav_config = NULL;
6150 spa_load_spares(spa);
6151 }
9babb374 6152 if (spa->spa_l2cache.sav_config) {
34dc7c2f
BB
6153 nvlist_free(spa->spa_l2cache.sav_config);
6154 spa->spa_l2cache.sav_config = NULL;
6155 spa_load_l2cache(spa);
6156 }
6157
65ad5d11 6158 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
b128c09f 6159 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f 6160
d164b209
BB
6161 if (props != NULL)
6162 spa_configfile_set(spa, props, B_FALSE);
6163
fb5f0bc8
BB
6164 if (error != 0 || (props && spa_writeable(spa) &&
6165 (error = spa_prop_set(spa, props)))) {
9babb374
BB
6166 spa_unload(spa);
6167 spa_deactivate(spa);
6168 spa_remove(spa);
34dc7c2f
BB
6169 mutex_exit(&spa_namespace_lock);
6170 return (error);
6171 }
6172
572e2857
BB
6173 spa_async_resume(spa);
6174
34dc7c2f
BB
6175 /*
6176 * Override any spares and level 2 cache devices as specified by
6177 * the user, as these may have correct device names/devids, etc.
6178 */
6179 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
6180 &spares, &nspares) == 0) {
6181 if (spa->spa_spares.sav_config)
65ad5d11
AJ
6182 fnvlist_remove(spa->spa_spares.sav_config,
6183 ZPOOL_CONFIG_SPARES);
34dc7c2f 6184 else
65ad5d11
AJ
6185 spa->spa_spares.sav_config = fnvlist_alloc();
6186 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
795075e6
PD
6187 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
6188 nspares);
b128c09f 6189 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6190 spa_load_spares(spa);
b128c09f 6191 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6192 spa->spa_spares.sav_sync = B_TRUE;
6193 }
6194 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6195 &l2cache, &nl2cache) == 0) {
6196 if (spa->spa_l2cache.sav_config)
65ad5d11
AJ
6197 fnvlist_remove(spa->spa_l2cache.sav_config,
6198 ZPOOL_CONFIG_L2CACHE);
34dc7c2f 6199 else
65ad5d11
AJ
6200 spa->spa_l2cache.sav_config = fnvlist_alloc();
6201 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
795075e6
PD
6202 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
6203 nl2cache);
b128c09f 6204 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6205 spa_load_l2cache(spa);
b128c09f 6206 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6207 spa->spa_l2cache.sav_sync = B_TRUE;
6208 }
6209
428870ff
BB
6210 /*
6211 * Check for any removed devices.
6212 */
6213 if (spa->spa_autoreplace) {
6214 spa_aux_check_removed(&spa->spa_spares);
6215 spa_aux_check_removed(&spa->spa_l2cache);
6216 }
6217
fb5f0bc8 6218 if (spa_writeable(spa)) {
b128c09f
BB
6219 /*
6220 * Update the config cache to include the newly-imported pool.
6221 */
45d1cae3 6222 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
b128c09f 6223 }
34dc7c2f 6224
34dc7c2f 6225 /*
9babb374
BB
6226 * It's possible that the pool was expanded while it was exported.
6227 * We kick off an async task to handle this for us.
34dc7c2f 6228 */
9babb374 6229 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
b128c09f 6230
d5e024cb 6231 spa_history_log_version(spa, "import", NULL);
fb390aaf 6232
12fa0466 6233 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
fb390aaf 6234
fb390aaf
HR
6235 mutex_exit(&spa_namespace_lock);
6236
ec213971 6237 zvol_create_minors_recursive(pool);
4a22ba5b 6238
4759342a
JL
6239 spa_import_os(spa);
6240
b128c09f
BB
6241 return (0);
6242}
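/*
 * Editorial note (not part of the original source): the 'flags' argument
 * carries ZFS_IMPORT_* bits. ZFS_IMPORT_VERBATIM, handled above, inserts
 * the pool into the namespace as if it had been loaded at boot, without
 * going through spa_load_best() at all. For a normal import, rewind
 * behavior comes from the load policy embedded in 'config' and retrieved
 * with zpool_get_load_policy(); a ZPOOL_DO_REWIND request is routed
 * through spa_load_best(), the same path used by spa_open_rewind().
 */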
6243
34dc7c2f
BB
6244nvlist_t *
6245spa_tryimport(nvlist_t *tryconfig)
6246{
6247 nvlist_t *config = NULL;
6cb8e530 6248 char *poolname, *cachefile;
34dc7c2f
BB
6249 spa_t *spa;
6250 uint64_t state;
d164b209 6251 int error;
8a393be3 6252 zpool_load_policy_t policy;
34dc7c2f
BB
6253
6254 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
6255 return (NULL);
6256
6257 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
6258 return (NULL);
6259
6260 /*
6261 * Create and initialize the spa structure.
6262 */
6263 mutex_enter(&spa_namespace_lock);
428870ff 6264 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
da92d5cb 6265 spa_activate(spa, SPA_MODE_READ);
34dc7c2f
BB
6266
6267 /*
8a393be3 6268 * Rewind pool if a max txg was provided.
34dc7c2f 6269 */
8a393be3
PZ
6270 zpool_get_load_policy(spa->spa_config, &policy);
6271 if (policy.zlp_txg != UINT64_MAX) {
6272 spa->spa_load_max_txg = policy.zlp_txg;
6cb8e530
PZ
6273 spa->spa_extreme_rewind = B_TRUE;
6274 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
8a393be3 6275 poolname, (longlong_t)policy.zlp_txg);
6cb8e530
PZ
6276 } else {
6277 zfs_dbgmsg("spa_tryimport: importing %s", poolname);
6278 }
6279
6280 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
6281 == 0) {
6282 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
6283 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
6284 } else {
6285 spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
6286 }
6287
6288 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
34dc7c2f
BB
6289
6290 /*
6291 * If 'tryconfig' was at least parsable, return the current config.
6292 */
6293 if (spa->spa_root_vdev != NULL) {
34dc7c2f 6294 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
65ad5d11
AJ
6295 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
6296 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
6297 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
6298 spa->spa_uberblock.ub_timestamp);
6299 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
6300 spa->spa_load_info);
6301 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
6302 spa->spa_errata);
34dc7c2f
BB
6303
6304 /*
6305 * If the bootfs property exists on this pool then we
6306 * copy it out so that external consumers can tell which
6307 * pools are bootable.
6308 */
d164b209 6309 if ((!error || error == EEXIST) && spa->spa_bootfs) {
79c76d5b 6310 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
34dc7c2f
BB
6311
6312 /*
6313 * We have to play games with the name since the
6314 * pool was opened as TRYIMPORT_NAME.
6315 */
b128c09f 6316 if (dsl_dsobj_to_dsname(spa_name(spa),
34dc7c2f
BB
6317 spa->spa_bootfs, tmpname) == 0) {
6318 char *cp;
d1d7e268
MK
6319 char *dsname;
6320
79c76d5b 6321 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
34dc7c2f
BB
6322
6323 cp = strchr(tmpname, '/');
6324 if (cp == NULL) {
6325 (void) strlcpy(dsname, tmpname,
6326 MAXPATHLEN);
6327 } else {
6328 (void) snprintf(dsname, MAXPATHLEN,
6329 "%s/%s", poolname, ++cp);
6330 }
65ad5d11
AJ
6331 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
6332 dsname);
34dc7c2f
BB
6333 kmem_free(dsname, MAXPATHLEN);
6334 }
6335 kmem_free(tmpname, MAXPATHLEN);
6336 }
6337
6338 /*
6339 * Add the list of hot spares and level 2 cache devices.
6340 */
9babb374 6341 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
34dc7c2f
BB
6342 spa_add_spares(spa, config);
6343 spa_add_l2cache(spa, config);
9babb374 6344 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f
BB
6345 }
6346
6347 spa_unload(spa);
6348 spa_deactivate(spa);
6349 spa_remove(spa);
6350 mutex_exit(&spa_namespace_lock);
6351
6352 return (config);
6353}
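/*
 * Editorial note (not part of the original source): the nvlist returned by
 * spa_tryimport() is the would-be pool config decorated with the keys added
 * above (pool name and state, ZPOOL_CONFIG_TIMESTAMP,
 * ZPOOL_CONFIG_LOAD_INFO, ZPOOL_CONFIG_ERRATA, optionally
 * ZPOOL_CONFIG_BOOTFS, plus the spare and l2cache lists). Userland builds
 * its 'zpool import' listing from this nvlist; the caller owns it and
 * releases it with nvlist_free().
 */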
6354
6355/*
6356 * Pool export/destroy
6357 *
6358 * The act of destroying or exporting a pool is very simple. We make sure there
6359 * is no more pending I/O and any references to the pool are gone. Then, we
6360 * update the pool state and sync all the labels to disk, removing the
fb5f0bc8
BB
6361 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
6362 * we don't sync the labels or remove the configuration cache.
34dc7c2f
BB
6363 */
6364static int
4d55ea81 6365spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
fb5f0bc8 6366 boolean_t force, boolean_t hardforce)
34dc7c2f 6367{
f4f50a70 6368 int error;
34dc7c2f
BB
6369 spa_t *spa;
6370
6371 if (oldconfig)
6372 *oldconfig = NULL;
6373
da92d5cb 6374 if (!(spa_mode_global & SPA_MODE_WRITE))
2e528b49 6375 return (SET_ERROR(EROFS));
34dc7c2f
BB
6376
6377 mutex_enter(&spa_namespace_lock);
6378 if ((spa = spa_lookup(pool)) == NULL) {
6379 mutex_exit(&spa_namespace_lock);
2e528b49 6380 return (SET_ERROR(ENOENT));
34dc7c2f
BB
6381 }
6382
43a85362
SD
6383 if (spa->spa_is_exporting) {
6384 /* the pool is being exported by another thread */
6385 mutex_exit(&spa_namespace_lock);
6386 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
6387 }
6388 spa->spa_is_exporting = B_TRUE;
6389
34dc7c2f
BB
6390 /*
6391 * Put a hold on the pool, drop the namespace lock, stop async tasks,
6392 * reacquire the namespace lock, and see if we can export.
6393 */
6394 spa_open_ref(spa, FTAG);
6395 mutex_exit(&spa_namespace_lock);
6396 spa_async_suspend(spa);
a0bd735a
BP
6397 if (spa->spa_zvol_taskq) {
6398 zvol_remove_minors(spa, spa_name(spa), B_TRUE);
6399 taskq_wait(spa->spa_zvol_taskq);
6400 }
34dc7c2f
BB
6401 mutex_enter(&spa_namespace_lock);
6402 spa_close(spa, FTAG);
6403
d14cfd83
IH
6404 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
6405 goto export_spa;
34dc7c2f 6406 /*
d14cfd83
IH
6407 * The pool will be in core if it's openable, in which case we can
6408 * modify its state. Objsets may be open only because they're dirty,
6409 * so we have to force it to sync before checking spa_refcnt.
34dc7c2f 6410 */
0c66c32d 6411 if (spa->spa_sync_on) {
34dc7c2f 6412 txg_wait_synced(spa->spa_dsl_pool, 0);
0c66c32d
JG
6413 spa_evicting_os_wait(spa);
6414 }
34dc7c2f 6415
d14cfd83
IH
6416 /*
6417 * A pool cannot be exported or destroyed if there are active
6418 * references, including references held by fault injection
6419 * handlers.
6420 */
f4f50a70
WA
6421 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
6422 error = SET_ERROR(EBUSY);
6423 goto fail;
d14cfd83 6424 }
34dc7c2f 6425
d14cfd83 6426 if (spa->spa_sync_on) {
b128c09f
BB
6427 /*
6428 * A pool cannot be exported if it has an active shared spare.
6429 * This is to prevent other pools stealing the active spare
6430 * from an exported pool. At the user's discretion, such a pool
6431 * can be forcibly exported.
6432 */
6433 if (!force && new_state == POOL_STATE_EXPORTED &&
6434 spa_has_active_shared_spare(spa)) {
f4f50a70
WA
6435 error = SET_ERROR(EXDEV);
6436 goto fail;
b128c09f 6437 }
34dc7c2f 6438
619f0976
GW
6439 /*
6440 * We're about to export or destroy this pool. Make sure
1b939560
BB
6441 * we stop all initialization and trim activity here before
6442 * we set the spa_final_txg. This will ensure that all
619f0976
GW
6443 * dirty data resulting from the initialization is
6444 * committed to disk before we unload the pool.
6445 */
6446 if (spa->spa_root_vdev != NULL) {
1b939560
BB
6447 vdev_t *rvd = spa->spa_root_vdev;
6448 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
6449 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
6450 vdev_autotrim_stop_all(spa);
9a49d3f3 6451 vdev_rebuild_stop_all(spa);
619f0976
GW
6452 }
6453
34dc7c2f
BB
6454 /*
6455 * We want this to be reflected on every label,
6456 * so mark them all dirty. spa_unload() will do the
6457 * final sync that pushes these changes out.
6458 */
fb5f0bc8 6459 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
b128c09f 6460 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6461 spa->spa_state = new_state;
2fb52853
GA
6462 vdev_config_dirty(spa->spa_root_vdev);
6463 spa_config_exit(spa, SCL_ALL, FTAG);
6464 }
6465
6466 /*
6467 * If the log space map feature is enabled and the pool is
6468 * getting exported (but not destroyed), we want to spend some
6469 * time flushing as many metaslabs as we can in an attempt to
6470 * destroy log space maps and save import time. This has to be
6471 * done before we set the spa_final_txg, otherwise
6472 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
6473 * spa_should_flush_logs_on_unload() should be called after
6474 * spa_state has been set to the new_state.
6475 */
6476 if (spa_should_flush_logs_on_unload(spa))
6477 spa_unload_log_sm_flush_all(spa);
6478
6479 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
6480 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
428870ff
BB
6481 spa->spa_final_txg = spa_last_synced_txg(spa) +
6482 TXG_DEFER_SIZE + 1;
b128c09f 6483 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6484 }
6485 }
6486
d14cfd83 6487export_spa:
4759342a
JL
6488 spa_export_os(spa);
6489
d5e024cb
BB
6490 if (new_state == POOL_STATE_DESTROYED)
6491 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
6492 else if (new_state == POOL_STATE_EXPORTED)
6493 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
34dc7c2f
BB
6494
6495 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6496 spa_unload(spa);
6497 spa_deactivate(spa);
6498 }
6499
6500 if (oldconfig && spa->spa_config)
65ad5d11 6501 *oldconfig = fnvlist_dup(spa->spa_config);
34dc7c2f
BB
6502
6503 if (new_state != POOL_STATE_UNINITIALIZED) {
fb5f0bc8 6504 if (!hardforce)
a1d477c2 6505 spa_write_cachefile(spa, B_TRUE, B_TRUE);
34dc7c2f 6506 spa_remove(spa);
43a85362
SD
6507 } else {
6508 /*
6509 * If spa_remove() is not called for this spa_t and
6510 * there is any possibility that it can be reused,
6511 * we make sure to reset the exporting flag.
6512 */
6513 spa->spa_is_exporting = B_FALSE;
34dc7c2f 6514 }
34dc7c2f 6515
43a85362 6516 mutex_exit(&spa_namespace_lock);
34dc7c2f 6517 return (0);
f4f50a70
WA
6518
6519fail:
6520 spa->spa_is_exporting = B_FALSE;
6521 spa_async_resume(spa);
6522 mutex_exit(&spa_namespace_lock);
6523 return (error);
34dc7c2f
BB
6524}
6525
6526/*
6527 * Destroy a storage pool.
6528 */
6529int
4d55ea81 6530spa_destroy(const char *pool)
34dc7c2f 6531{
fb5f0bc8
BB
6532 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
6533 B_FALSE, B_FALSE));
34dc7c2f
BB
6534}
6535
6536/*
6537 * Export a storage pool.
6538 */
6539int
4d55ea81 6540spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
fb5f0bc8 6541 boolean_t hardforce)
34dc7c2f 6542{
fb5f0bc8
BB
6543 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
6544 force, hardforce));
34dc7c2f
BB
6545}
6546
6547/*
6548 * Similar to spa_export(), this unloads the spa_t without actually removing it
6549 * from the namespace in any way.
6550 */
6551int
4d55ea81 6552spa_reset(const char *pool)
34dc7c2f 6553{
b128c09f 6554 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
fb5f0bc8 6555 B_FALSE, B_FALSE));
34dc7c2f
BB
6556}
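/*
 * Editorial note (not part of the original source): the three wrappers
 * above differ only in the new_state they hand to spa_export_common():
 *
 *	spa_destroy()	POOL_STATE_DESTROYED	 force/hardforce = B_FALSE
 *	spa_export()	POOL_STATE_EXPORTED	 force/hardforce from caller
 *	spa_reset()	POOL_STATE_UNINITIALIZED unloads the pool but leaves
 *						 the spa_t in the namespace
 */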
6557
34dc7c2f
BB
6558/*
6559 * ==========================================================================
6560 * Device manipulation
6561 * ==========================================================================
6562 */
6563
b2255edc
BB
6564/*
6565 * This is called as a synctask to increment the draid feature flag
6566 */
6567static void
6568spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
6569{
6570 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6571 int draid = (int)(uintptr_t)arg;
6572
6573 for (int c = 0; c < draid; c++)
6574 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
6575}
6576
34dc7c2f
BB
6577/*
6578 * Add a device to a storage pool.
6579 */
6580int
6581spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
6582{
b2255edc 6583 uint64_t txg, ndraid = 0;
fb5f0bc8 6584 int error;
34dc7c2f
BB
6585 vdev_t *rvd = spa->spa_root_vdev;
6586 vdev_t *vd, *tvd;
6587 nvlist_t **spares, **l2cache;
6588 uint_t nspares, nl2cache;
6589
572e2857
BB
6590 ASSERT(spa_writeable(spa));
6591
34dc7c2f
BB
6592 txg = spa_vdev_enter(spa);
6593
6594 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
6595 VDEV_ALLOC_ADD)) != 0)
6596 return (spa_vdev_exit(spa, NULL, txg, error));
6597
b128c09f 6598 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
34dc7c2f
BB
6599
6600 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
6601 &nspares) != 0)
6602 nspares = 0;
6603
6604 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
6605 &nl2cache) != 0)
6606 nl2cache = 0;
6607
b128c09f 6608 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
34dc7c2f 6609 return (spa_vdev_exit(spa, vd, txg, EINVAL));
34dc7c2f 6610
b128c09f 6611 if (vd->vdev_children != 0 &&
b2255edc 6612 (error = vdev_create(vd, txg, B_FALSE)) != 0) {
b128c09f 6613 return (spa_vdev_exit(spa, vd, txg, error));
b2255edc
BB
6614 }
6615
6616 /*
6617 * The virtual dRAID spares must be added after the vdev tree is created
bf169e9f 6618 * and the vdev guids are generated. The guid of their associated
b2255edc
BB
6619 * dRAID is stored in the config and used when opening the spare.
6620 */
6621 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
6622 rvd->vdev_children)) == 0) {
6623 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
6624 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
6625 nspares = 0;
6626 } else {
6627 return (spa_vdev_exit(spa, vd, txg, error));
6628 }
34dc7c2f
BB
6629
6630 /*
6631 * We must validate the spares and l2cache devices after checking the
6632 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
6633 */
b128c09f 6634 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
34dc7c2f 6635 return (spa_vdev_exit(spa, vd, txg, error));
34dc7c2f
BB
6636
6637 /*
a1d477c2
MA
6638 * If we are in the middle of a device removal, we can only add
6639 * devices which match the existing devices in the pool.
6640 * If we are in the middle of a removal, or have some indirect
b2255edc 6641 * vdevs, we can not add raidz or dRAID top levels.
34dc7c2f 6642 */
a1d477c2
MA
6643 if (spa->spa_vdev_removal != NULL ||
6644 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
6645 for (int c = 0; c < vd->vdev_children; c++) {
6646 tvd = vd->vdev_child[c];
6647 if (spa->spa_vdev_removal != NULL &&
9e052db4 6648 tvd->vdev_ashift != spa->spa_max_ashift) {
a1d477c2
MA
6649 return (spa_vdev_exit(spa, vd, txg, EINVAL));
6650 }
b2255edc
BB
6651 /* Fail if top level vdev is raidz or a dRAID */
6652 if (vdev_get_nparity(tvd) != 0)
a1d477c2 6653 return (spa_vdev_exit(spa, vd, txg, EINVAL));
b2255edc 6654
a1d477c2
MA
6655 /*
6656 * Need the top level mirror to be
6657 * a mirror of leaf vdevs only
6658 */
6659 if (tvd->vdev_ops == &vdev_mirror_ops) {
6660 for (uint64_t cid = 0;
6661 cid < tvd->vdev_children; cid++) {
6662 vdev_t *cvd = tvd->vdev_child[cid];
6663 if (!cvd->vdev_ops->vdev_op_leaf) {
6664 return (spa_vdev_exit(spa, vd,
6665 txg, EINVAL));
6666 }
6667 }
6668 }
6669 }
6670 }
6671
1c27024e 6672 for (int c = 0; c < vd->vdev_children; c++) {
34dc7c2f
BB
6673 tvd = vd->vdev_child[c];
6674 vdev_remove_child(vd, tvd);
93e28d66 6675 tvd->vdev_id = rvd->vdev_children;
34dc7c2f
BB
6676 vdev_add_child(rvd, tvd);
6677 vdev_config_dirty(tvd);
6678 }
6679
6680 if (nspares != 0) {
6681 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
6682 ZPOOL_CONFIG_SPARES);
6683 spa_load_spares(spa);
6684 spa->spa_spares.sav_sync = B_TRUE;
6685 }
6686
6687 if (nl2cache != 0) {
6688 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
6689 ZPOOL_CONFIG_L2CACHE);
6690 spa_load_l2cache(spa);
6691 spa->spa_l2cache.sav_sync = B_TRUE;
6692 }
6693
b2255edc
BB
6694 /*
6695 * We can't increment a feature while holding spa_vdev so we
6696 * have to do it in a synctask.
6697 */
6698 if (ndraid != 0) {
6699 dmu_tx_t *tx;
6700
6701 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
6702 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
6703 (void *)(uintptr_t)ndraid, tx);
6704 dmu_tx_commit(tx);
6705 }
6706
34dc7c2f
BB
6707 /*
6708 * We have to be careful when adding new vdevs to an existing pool.
6709 * If other threads start allocating from these vdevs before we
6710 * sync the config cache, and we lose power, then upon reboot we may
6711 * fail to open the pool because there are DVAs that the config cache
6712 * can't translate. Therefore, we first add the vdevs without
6713 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
6714 * and then let spa_config_update() initialize the new metaslabs.
6715 *
6716 * spa_load() checks for added-but-not-initialized vdevs, so that
6717 * if we lose power at any point in this sequence, the remaining
6718 * steps will be completed the next time we load the pool.
6719 */
6720 (void) spa_vdev_exit(spa, vd, txg, 0);
6721
6722 mutex_enter(&spa_namespace_lock);
6723 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
12fa0466 6724 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
34dc7c2f
BB
6725 mutex_exit(&spa_namespace_lock);
6726
6727 return (0);
6728}
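/*
 * Illustrative sketch (not part of spa.c): one way a caller might build a
 * minimal nvroot and hand it to spa_vdev_add().  The nvlist layout mirrors
 * what spa_config_parse() expects above.  The device path and helper name
 * are hypothetical, and a real caller (the zpool/libzfs path) supplies a
 * much more complete vdev tree (ashift, whole_disk, etc.).
 */
static int
example_add_single_disk(spa_t *spa, const char *devpath)
{
	nvlist_t *disk = fnvlist_alloc();
	nvlist_t *nvroot = fnvlist_alloc();

	/* Describe one leaf vdev of type "disk". */
	fnvlist_add_string(disk, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	fnvlist_add_string(disk, ZPOOL_CONFIG_PATH, devpath);

	/* Wrap it in a root vdev with a single child. */
	fnvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &disk, 1);

	int error = spa_vdev_add(spa, nvroot);

	fnvlist_free(disk);
	fnvlist_free(nvroot);
	return (error);
}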
6729
6730/*
6731 * Attach a device to a mirror. The arguments are the path to any device
6732 * in the mirror, and the nvroot for the new device. If the path specifies
6733 * a device that is not mirrored, we automatically insert the mirror vdev.
6734 *
6735 * If 'replacing' is specified, the new device is intended to replace the
6736 * existing device; in this case the two devices are made into their own
6737 * mirror using the 'replacing' vdev, which is functionally identical to
6738 * the mirror vdev (it actually reuses all the same ops) but has a few
6739 * extra rules: you can't attach to it after it's been created, and upon
6740 * completion of resilvering, the first disk (the one being replaced)
6741 * is automatically detached.
9a49d3f3
BB
6742 *
6743 * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
6744 * should be performed instead of traditional healing reconstruction. From
6745 * an administrator's perspective these are both resilver operations.
34dc7c2f
BB
6746 */
6747int
9a49d3f3
BB
6748spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
6749 int rebuild)
34dc7c2f 6750{
428870ff 6751 uint64_t txg, dtl_max_txg;
9a49d3f3 6752 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f
BB
6753 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
6754 vdev_ops_t *pvops;
b128c09f
BB
6755 char *oldvdpath, *newvdpath;
6756 int newvd_isspare;
6757 int error;
34dc7c2f 6758
572e2857
BB
6759 ASSERT(spa_writeable(spa));
6760
34dc7c2f
BB
6761 txg = spa_vdev_enter(spa);
6762
b128c09f 6763 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
34dc7c2f 6764
d2734cce
SD
6765 ASSERT(MUTEX_HELD(&spa_namespace_lock));
6766 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
6767 error = (spa_has_checkpoint(spa)) ?
6768 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
6769 return (spa_vdev_exit(spa, NULL, txg, error));
6770 }
6771
9a49d3f3
BB
6772 if (rebuild) {
6773 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
6774 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6775
6776 if (dsl_scan_resilvering(spa_get_dsl(spa)))
6777 return (spa_vdev_exit(spa, NULL, txg,
6778 ZFS_ERR_RESILVER_IN_PROGRESS));
6779 } else {
6780 if (vdev_rebuild_active(rvd))
6781 return (spa_vdev_exit(spa, NULL, txg,
6782 ZFS_ERR_REBUILD_IN_PROGRESS));
6783 }
6784
9e052db4 6785 if (spa->spa_vdev_removal != NULL)
a1d477c2 6786 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
a1d477c2 6787
34dc7c2f
BB
6788 if (oldvd == NULL)
6789 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
6790
6791 if (!oldvd->vdev_ops->vdev_op_leaf)
6792 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6793
6794 pvd = oldvd->vdev_parent;
6795
6796 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
5ffb9d1d 6797 VDEV_ALLOC_ATTACH)) != 0)
34dc7c2f
BB
6798 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6799
6800 if (newrootvd->vdev_children != 1)
6801 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
6802
6803 newvd = newrootvd->vdev_child[0];
6804
6805 if (!newvd->vdev_ops->vdev_op_leaf)
6806 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
6807
6808 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
6809 return (spa_vdev_exit(spa, newrootvd, txg, error));
6810
6811 /*
6812 * Spares can't replace logs
6813 */
b128c09f 6814 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
34dc7c2f
BB
6815 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6816
b2255edc
BB
6817 /*
6818 * A dRAID spare can only replace a child of its parent dRAID vdev.
6819 */
6820 if (newvd->vdev_ops == &vdev_draid_spare_ops &&
6821 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
6822 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6823 }
6824
9a49d3f3
BB
6825 if (rebuild) {
6826 /*
b2255edc 6827 * For rebuilds, the top vdev must support reconstruction
9a49d3f3 6828 * using only space maps. This means the only allowable
b2255edc 6829 * vdev types are the root vdev, a mirror, or dRAID.
9a49d3f3 6830 */
b2255edc
BB
6831 tvd = pvd;
6832 if (pvd->vdev_top != NULL)
6833 tvd = pvd->vdev_top;
6834
6835 if (tvd->vdev_ops != &vdev_mirror_ops &&
6836 tvd->vdev_ops != &vdev_root_ops &&
6837 tvd->vdev_ops != &vdev_draid_ops) {
9a49d3f3
BB
6838 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6839 }
6840 }
6841
34dc7c2f
BB
6842 if (!replacing) {
6843 /*
6844 * For attach, the only allowable parent is a mirror or the root
6845 * vdev.
6846 */
6847 if (pvd->vdev_ops != &vdev_mirror_ops &&
6848 pvd->vdev_ops != &vdev_root_ops)
6849 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6850
6851 pvops = &vdev_mirror_ops;
6852 } else {
6853 /*
6854 * Active hot spares can only be replaced by inactive hot
6855 * spares.
6856 */
6857 if (pvd->vdev_ops == &vdev_spare_ops &&
572e2857 6858 oldvd->vdev_isspare &&
34dc7c2f
BB
6859 !spa_has_spare(spa, newvd->vdev_guid))
6860 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6861
6862 /*
6863 * If the source is a hot spare, and the parent isn't already a
6864 * spare, then we want to create a new hot spare. Otherwise, we
6865 * want to create a replacing vdev. The user is not allowed to
6866 * attach to a spared vdev child unless the 'isspare' state is
6867 * the same (spare replaces spare, non-spare replaces
6868 * non-spare).
6869 */
572e2857
BB
6870 if (pvd->vdev_ops == &vdev_replacing_ops &&
6871 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
34dc7c2f 6872 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
572e2857
BB
6873 } else if (pvd->vdev_ops == &vdev_spare_ops &&
6874 newvd->vdev_isspare != oldvd->vdev_isspare) {
34dc7c2f 6875 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
572e2857
BB
6876 }
6877
6878 if (newvd->vdev_isspare)
34dc7c2f
BB
6879 pvops = &vdev_spare_ops;
6880 else
6881 pvops = &vdev_replacing_ops;
6882 }
6883
6884 /*
9babb374 6885 * Make sure the new device is big enough.
34dc7c2f 6886 */
9babb374 6887 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
34dc7c2f
BB
6888 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
6889
6890 /*
6891 * The new device cannot have a higher alignment requirement
6892 * than the top-level vdev.
6893 */
6894 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
9a49d3f3 6895 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
34dc7c2f
BB
6896
6897 /*
6898 * If this is an in-place replacement, update oldvd's path and devid
6899 * to make it distinguishable from newvd, and unopenable from now on.
6900 */
6901 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
6902 spa_strfree(oldvd->vdev_path);
6903 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
79c76d5b 6904 KM_SLEEP);
c9e319fa
JL
6905 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
6906 "%s/%s", newvd->vdev_path, "old");
34dc7c2f
BB
6907 if (oldvd->vdev_devid != NULL) {
6908 spa_strfree(oldvd->vdev_devid);
6909 oldvd->vdev_devid = NULL;
6910 }
6911 }
6912
6913 /*
6914 * If the parent is not a mirror, or if we're replacing, insert the new
6915 * mirror/replacing/spare vdev above oldvd.
6916 */
6917 if (pvd->vdev_ops != pvops)
6918 pvd = vdev_add_parent(oldvd, pvops);
6919
6920 ASSERT(pvd->vdev_top->vdev_parent == rvd);
6921 ASSERT(pvd->vdev_ops == pvops);
6922 ASSERT(oldvd->vdev_parent == pvd);
6923
6924 /*
6925 * Extract the new device from its root and add it to pvd.
6926 */
6927 vdev_remove_child(newrootvd, newvd);
6928 newvd->vdev_id = pvd->vdev_children;
428870ff 6929 newvd->vdev_crtxg = oldvd->vdev_crtxg;
34dc7c2f
BB
6930 vdev_add_child(pvd, newvd);
6931
6d82f98c
IH
6932 /*
6933 * Reevaluate the parent vdev state.
6934 */
6935 vdev_propagate_state(pvd);
6936
34dc7c2f
BB
6937 tvd = newvd->vdev_top;
6938 ASSERT(pvd->vdev_top == tvd);
6939 ASSERT(tvd->vdev_parent == rvd);
6940
6941 vdev_config_dirty(tvd);
6942
6943 /*
428870ff
BB
6944 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
6945 * for any dmu_sync-ed blocks. It will propagate upward when
6946 * spa_vdev_exit() calls vdev_dtl_reassess().
34dc7c2f 6947 */
428870ff 6948 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
34dc7c2f 6949
9a49d3f3
BB
6950 vdev_dtl_dirty(newvd, DTL_MISSING,
6951 TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
34dc7c2f 6952
9babb374 6953 if (newvd->vdev_isspare) {
34dc7c2f 6954 spa_spare_activate(newvd);
12fa0466 6955 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
9babb374
BB
6956 }
6957
b128c09f
BB
6958 oldvdpath = spa_strdup(oldvd->vdev_path);
6959 newvdpath = spa_strdup(newvd->vdev_path);
6960 newvd_isspare = newvd->vdev_isspare;
34dc7c2f
BB
6961
6962 /*
6963 * Mark newvd's DTL dirty in this txg.
6964 */
6965 vdev_dirty(tvd, VDD_DTL, newvd, txg);
6966
428870ff 6967 /*
9a49d3f3
BB
6968 * Schedule the resilver or rebuild to restart in the future. We do
6969 * this to ensure that dmu_sync-ed blocks have been stitched into the
6970 * respective datasets.
428870ff 6971 */
9a49d3f3
BB
6972 if (rebuild) {
6973 newvd->vdev_rebuild_txg = txg;
6974
6975 vdev_rebuild(tvd);
6976 } else {
6977 newvd->vdev_resilver_txg = txg;
6978
6979 if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
6980 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
6981 vdev_defer_resilver(newvd);
6982 } else {
6983 dsl_scan_restart_resilver(spa->spa_dsl_pool,
6984 dtl_max_txg);
6985 }
6986 }
428870ff 6987
fb390aaf 6988 if (spa->spa_bootfs)
12fa0466 6989 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
fb390aaf 6990
12fa0466 6991 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
fb390aaf 6992
428870ff
BB
6993 /*
6994 * Commit the config
6995 */
6996 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
34dc7c2f 6997
6f1ffb06 6998 spa_history_log_internal(spa, "vdev attach", NULL,
428870ff 6999 "%s vdev=%s %s vdev=%s",
45d1cae3
BB
7000 replacing && newvd_isspare ? "spare in" :
7001 replacing ? "replace" : "attach", newvdpath,
7002 replacing ? "for" : "to", oldvdpath);
b128c09f
BB
7003
7004 spa_strfree(oldvdpath);
7005 spa_strfree(newvdpath);
7006
34dc7c2f
BB
7007 return (0);
7008}
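/*
 * Illustrative sketch (not part of spa.c): attaching a new leaf to an
 * existing device to form (or extend) a mirror.  'guid' identifies any
 * leaf already in the pool and 'devpath' is a hypothetical path for the
 * new disk.  replacing = B_FALSE requests a plain attach and
 * rebuild = B_FALSE requests a traditional healing resilver, matching
 * the semantics described in the comment above; the helper name is
 * hypothetical.
 */
static int
example_attach_mirror_leaf(spa_t *spa, uint64_t guid, const char *devpath)
{
	nvlist_t *disk = fnvlist_alloc();
	nvlist_t *nvroot = fnvlist_alloc();

	fnvlist_add_string(disk, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	fnvlist_add_string(disk, ZPOOL_CONFIG_PATH, devpath);

	/* spa_vdev_attach() expects a root vdev with exactly one child. */
	fnvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &disk, 1);

	int error = spa_vdev_attach(spa, guid, nvroot, B_FALSE, B_FALSE);

	fnvlist_free(disk);
	fnvlist_free(nvroot);
	return (error);
}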
7009
7010/*
7011 * Detach a device from a mirror or replacing vdev.
d3cc8b15 7012 *
34dc7c2f
BB
7013 * If 'replace_done' is specified, only detach if the parent
7014 * is a replacing vdev.
7015 */
7016int
fb5f0bc8 7017spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
34dc7c2f
BB
7018{
7019 uint64_t txg;
fb5f0bc8 7020 int error;
2a8ba608 7021 vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
34dc7c2f
BB
7022 vdev_t *vd, *pvd, *cvd, *tvd;
7023 boolean_t unspare = B_FALSE;
d4ed6673 7024 uint64_t unspare_guid = 0;
428870ff 7025 char *vdpath;
1c27024e 7026
572e2857
BB
7027 ASSERT(spa_writeable(spa));
7028
9a49d3f3 7029 txg = spa_vdev_detach_enter(spa, guid);
34dc7c2f 7030
b128c09f 7031 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
34dc7c2f 7032
d2734cce
SD
7033 /*
7034 * Besides being called directly from the userland through the
7035 * ioctl interface, spa_vdev_detach() can be potentially called
7036 * at the end of spa_vdev_resilver_done().
7037 *
7038 * In the regular case, when we have a checkpoint this shouldn't
7039 * happen as we never empty the DTLs of a vdev during the scrub
7041 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
7041 * should never get here when we have a checkpoint.
7042 *
7043 * That said, even in the case where we checkpoint the pool exactly
7044 * as spa_vdev_resilver_done() calls this function, everything
7045 * should be fine as the resilver will return right away.
7046 */
7047 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7048 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7049 error = (spa_has_checkpoint(spa)) ?
7050 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7051 return (spa_vdev_exit(spa, NULL, txg, error));
7052 }
7053
34dc7c2f
BB
7054 if (vd == NULL)
7055 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
7056
7057 if (!vd->vdev_ops->vdev_op_leaf)
7058 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7059
7060 pvd = vd->vdev_parent;
7061
fb5f0bc8
BB
7062 /*
7063 * If the parent/child relationship is not as expected, don't do it.
7064 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
7065 * vdev that's replacing B with C. The user's intent in replacing
7066 * is to go from M(A,B) to M(A,C). If the user decides to cancel
7067 * the replace by detaching C, the expected behavior is to end up
7068 * M(A,B). But suppose that right after deciding to detach C,
7069 * the replacement of B completes. We would have M(A,C), and then
7070 * ask to detach C, which would leave us with just A -- not what
7071 * the user wanted. To prevent this, we make sure that the
7072 * parent/child relationship hasn't changed -- in this example,
7073 * that C's parent is still the replacing vdev R.
7074 */
7075 if (pvd->vdev_guid != pguid && pguid != 0)
7076 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
7077
34dc7c2f 7078 /*
572e2857 7079 * Only 'replacing' or 'spare' vdevs can be replaced.
34dc7c2f 7080 */
572e2857
BB
7081 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
7082 pvd->vdev_ops != &vdev_spare_ops)
7083 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
34dc7c2f
BB
7084
7085 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
7086 spa_version(spa) >= SPA_VERSION_SPARES);
7087
7088 /*
7089 * Only mirror, replacing, and spare vdevs support detach.
7090 */
7091 if (pvd->vdev_ops != &vdev_replacing_ops &&
7092 pvd->vdev_ops != &vdev_mirror_ops &&
7093 pvd->vdev_ops != &vdev_spare_ops)
7094 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7095
7096 /*
fb5f0bc8
BB
7097 * If this device has the only valid copy of some data,
7098 * we cannot safely detach it.
34dc7c2f 7099 */
fb5f0bc8 7100 if (vdev_dtl_required(vd))
34dc7c2f
BB
7101 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
7102
fb5f0bc8 7103 ASSERT(pvd->vdev_children >= 2);
34dc7c2f 7104
b128c09f
BB
7105 /*
7106 * If we are detaching the second disk from a replacing vdev, then
7107 * check to see if we changed the original vdev's path to have "/old"
7108 * at the end in spa_vdev_attach(). If so, undo that change now.
7109 */
572e2857
BB
7110 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
7111 vd->vdev_path != NULL) {
7112 size_t len = strlen(vd->vdev_path);
7113
1c27024e 7114 for (int c = 0; c < pvd->vdev_children; c++) {
572e2857
BB
7115 cvd = pvd->vdev_child[c];
7116
7117 if (cvd == vd || cvd->vdev_path == NULL)
7118 continue;
7119
7120 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
7121 strcmp(cvd->vdev_path + len, "/old") == 0) {
7122 spa_strfree(cvd->vdev_path);
7123 cvd->vdev_path = spa_strdup(vd->vdev_path);
7124 break;
7125 }
b128c09f
BB
7126 }
7127 }
7128
34dc7c2f 7129 /*
b2255edc
BB
7130 * If we are detaching the original disk from a normal spare, then it
7131 * implies that the spare should become a real disk, and be removed
7132 * from the active spare list for the pool. dRAID spares on the
7133 * other hand are coupled to the pool and thus should never be removed
7134 * from the spares list.
34dc7c2f 7135 */
b2255edc
BB
7136 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
7137 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
7138
7139 if (last_cvd->vdev_isspare &&
7140 last_cvd->vdev_ops != &vdev_draid_spare_ops) {
7141 unspare = B_TRUE;
7142 }
7143 }
34dc7c2f
BB
7144
7145 /*
7146 * Erase the disk labels so the disk can be used for other things.
7147 * This must be done after all other error cases are handled,
7148 * but before we disembowel vd (so we can still do I/O to it).
7149 * But if we can't do it, don't treat the error as fatal --
7150 * it may be that the unwritability of the disk is the reason
7151 * it's being detached!
7152 */
7153 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
7154
7155 /*
7156 * Remove vd from its parent and compact the parent's children.
7157 */
7158 vdev_remove_child(pvd, vd);
7159 vdev_compact_children(pvd);
7160
7161 /*
7162 * Remember one of the remaining children so we can get tvd below.
7163 */
572e2857 7164 cvd = pvd->vdev_child[pvd->vdev_children - 1];
34dc7c2f
BB
7165
7166 /*
7167 * If we need to remove the remaining child from the list of hot spares,
fb5f0bc8
BB
7168 * do it now, marking the vdev as no longer a spare in the process.
7169 * We must do this before vdev_remove_parent(), because that can
7170 * change the GUID if it creates a new toplevel GUID. For a similar
7171 * reason, we must remove the spare now, in the same txg as the detach;
7172 * otherwise someone could attach a new sibling, change the GUID, and
7173 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
34dc7c2f
BB
7174 */
7175 if (unspare) {
7176 ASSERT(cvd->vdev_isspare);
7177 spa_spare_remove(cvd);
7178 unspare_guid = cvd->vdev_guid;
fb5f0bc8 7179 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
572e2857 7180 cvd->vdev_unspare = B_TRUE;
34dc7c2f
BB
7181 }
7182
428870ff
BB
7183 /*
7184 * If the parent mirror/replacing vdev only has one child,
7185 * the parent is no longer needed. Remove it from the tree.
7186 */
572e2857
BB
7187 if (pvd->vdev_children == 1) {
7188 if (pvd->vdev_ops == &vdev_spare_ops)
7189 cvd->vdev_unspare = B_FALSE;
428870ff 7190 vdev_remove_parent(cvd);
572e2857
BB
7191 }
7192
428870ff
BB
7193 /*
7194 * We don't set tvd until now because the parent we just removed
7195 * may have been the previous top-level vdev.
7196 */
7197 tvd = cvd->vdev_top;
7198 ASSERT(tvd->vdev_parent == rvd);
7199
7200 /*
7201 * Reevaluate the parent vdev state.
7202 */
7203 vdev_propagate_state(cvd);
7204
7205 /*
7206 * If the 'autoexpand' property is set on the pool then automatically
7207 * try to expand the size of the pool. For example if the device we
7208 * just detached was smaller than the others, it may be possible to
7209 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
7210 * first so that we can obtain the updated sizes of the leaf vdevs.
7211 */
7212 if (spa->spa_autoexpand) {
7213 vdev_reopen(tvd);
7214 vdev_expand(tvd, txg);
7215 }
7216
7217 vdev_config_dirty(tvd);
7218
7219 /*
7220 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
7221 * vd->vdev_detached is set and free vd's DTL object in syncing context.
7222 * But first make sure we're not on any *other* txg's DTL list, to
7223 * prevent vd from being accessed after it's freed.
7224 */
b6ca6193 7225 vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
1c27024e 7226 for (int t = 0; t < TXG_SIZE; t++)
428870ff
BB
7227 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
7228 vd->vdev_detached = B_TRUE;
7229 vdev_dirty(tvd, VDD_DTL, vd, txg);
7230
12fa0466 7231 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
e60e158e 7232 spa_notify_waiters(spa);
428870ff 7233
572e2857
BB
7234 /* hang on to the spa before we release the lock */
7235 spa_open_ref(spa, FTAG);
7236
428870ff
BB
7237 error = spa_vdev_exit(spa, vd, txg, 0);
7238
6f1ffb06 7239 spa_history_log_internal(spa, "detach", NULL,
428870ff
BB
7240 "vdev=%s", vdpath);
7241 spa_strfree(vdpath);
7242
7243 /*
7244 * If this was the removal of the original device in a hot spare vdev,
7245 * then we want to go through and remove the device from the hot spare
7246 * list of every other pool.
7247 */
7248 if (unspare) {
572e2857
BB
7249 spa_t *altspa = NULL;
7250
428870ff 7251 mutex_enter(&spa_namespace_lock);
572e2857
BB
7252 while ((altspa = spa_next(altspa)) != NULL) {
7253 if (altspa->spa_state != POOL_STATE_ACTIVE ||
7254 altspa == spa)
428870ff 7255 continue;
572e2857
BB
7256
7257 spa_open_ref(altspa, FTAG);
428870ff 7258 mutex_exit(&spa_namespace_lock);
572e2857 7259 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
428870ff 7260 mutex_enter(&spa_namespace_lock);
572e2857 7261 spa_close(altspa, FTAG);
428870ff
BB
7262 }
7263 mutex_exit(&spa_namespace_lock);
572e2857
BB
7264
7265 /* search the rest of the vdevs for spares to remove */
7266 spa_vdev_resilver_done(spa);
428870ff
BB
7267 }
7268
572e2857
BB
7269 /* all done with the spa; OK to release */
7270 mutex_enter(&spa_namespace_lock);
7271 spa_close(spa, FTAG);
7272 mutex_exit(&spa_namespace_lock);
7273
428870ff
BB
7274 return (error);
7275}
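/*
 * Illustrative sketch (not part of spa.c): cancelling a replacement by
 * detaching the new disk, using the expected-parent guid to guard against
 * the race described in the M(A,R(B,C)) comment above.  Both guids are
 * assumed to have been captured by the caller while holding the
 * appropriate config locks; the helper name is hypothetical.
 */
static int
example_cancel_replace(spa_t *spa, uint64_t new_disk_guid,
    uint64_t replacing_vdev_guid)
{
	/*
	 * Only proceed if new_disk_guid is still parented by the
	 * 'replacing' vdev we observed; otherwise EBUSY is returned and
	 * the caller can re-evaluate the vdev tree.
	 */
	return (spa_vdev_detach(spa, new_disk_guid, replacing_vdev_guid,
	    B_TRUE));
}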
7276
c10d37dd
GW
7277static int
7278spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
7279 list_t *vd_list)
619f0976 7280{
c10d37dd
GW
7281 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7282
619f0976
GW
7283 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
7284
7285 /* Look up vdev and ensure it's a leaf. */
7286 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
7287 if (vd == NULL || vd->vdev_detached) {
7288 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
619f0976
GW
7289 return (SET_ERROR(ENODEV));
7290 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
7291 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
619f0976
GW
7292 return (SET_ERROR(EINVAL));
7293 } else if (!vdev_writeable(vd)) {
7294 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
619f0976
GW
7295 return (SET_ERROR(EROFS));
7296 }
7297 mutex_enter(&vd->vdev_initialize_lock);
7298 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7299
7300 /*
7301 * When we activate an initialize action we check to see
7302 * if the vdev_initialize_thread is NULL. We do this instead
7303 * of using the vdev_initialize_state since there might be
7304 * a previous initialization process which has completed but
7305 * whose thread has not yet exited.
7306 */
1b939560 7307 if (cmd_type == POOL_INITIALIZE_START &&
619f0976
GW
7308 (vd->vdev_initialize_thread != NULL ||
7309 vd->vdev_top->vdev_removing)) {
7310 mutex_exit(&vd->vdev_initialize_lock);
619f0976
GW
7311 return (SET_ERROR(EBUSY));
7312 } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
7313 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
7314 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
7315 mutex_exit(&vd->vdev_initialize_lock);
619f0976
GW
7316 return (SET_ERROR(ESRCH));
7317 } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
7318 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
7319 mutex_exit(&vd->vdev_initialize_lock);
619f0976
GW
7320 return (SET_ERROR(ESRCH));
7321 }
7322
7323 switch (cmd_type) {
1b939560 7324 case POOL_INITIALIZE_START:
619f0976
GW
7325 vdev_initialize(vd);
7326 break;
7327 case POOL_INITIALIZE_CANCEL:
c10d37dd 7328 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
619f0976
GW
7329 break;
7330 case POOL_INITIALIZE_SUSPEND:
c10d37dd 7331 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
619f0976
GW
7332 break;
7333 default:
7334 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
7335 }
7336 mutex_exit(&vd->vdev_initialize_lock);
7337
c10d37dd
GW
7338 return (0);
7339}
7340
7341int
7342spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
7343 nvlist_t *vdev_errlist)
7344{
7345 int total_errors = 0;
7346 list_t vd_list;
7347
7348 list_create(&vd_list, sizeof (vdev_t),
7349 offsetof(vdev_t, vdev_initialize_node));
7350
7351 /*
7352 * We hold the namespace lock through the whole function
7353 * to prevent any changes to the pool while we're starting or
7354 * stopping initialization. The config and state locks are held so that
7355 * we can properly assess the vdev state before we commit to
7356 * the initializing operation.
7357 */
7358 mutex_enter(&spa_namespace_lock);
7359
7360 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
7361 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
7362 uint64_t vdev_guid = fnvpair_value_uint64(pair);
7363
7364 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
7365 &vd_list);
7366 if (error != 0) {
7367 char guid_as_str[MAXNAMELEN];
7368
7369 (void) snprintf(guid_as_str, sizeof (guid_as_str),
7370 "%llu", (unsigned long long)vdev_guid);
7371 fnvlist_add_int64(vdev_errlist, guid_as_str, error);
7372 total_errors++;
7373 }
7374 }
7375
7376 /* Wait for all initialize threads to stop. */
7377 vdev_initialize_stop_wait(spa, &vd_list);
7378
619f0976
GW
7379 /* Sync out the initializing state */
7380 txg_wait_synced(spa->spa_dsl_pool, 0);
7381 mutex_exit(&spa_namespace_lock);
7382
c10d37dd 7383 list_destroy(&vd_list);
619f0976 7384
c10d37dd
GW
7385 return (total_errors);
7386}
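/*
 * Illustrative sketch (not part of spa.c): starting an initialize pass on
 * a set of leaf vdevs.  The request nvlist maps arbitrary names to vdev
 * guids (only the uint64 values are consumed above) and per-vdev failures
 * come back in 'vdev_errlist' keyed by the guid rendered as a string.
 * The guid array and helper name are hypothetical.
 */
static int
example_start_initialize(spa_t *spa, const uint64_t *guids, uint_t nguids)
{
	nvlist_t *nv = fnvlist_alloc();
	nvlist_t *vdev_errlist = fnvlist_alloc();

	for (uint_t i = 0; i < nguids; i++) {
		char name[MAXNAMELEN];
		(void) snprintf(name, sizeof (name), "%llu",
		    (unsigned long long)guids[i]);
		fnvlist_add_uint64(nv, name, guids[i]);
	}

	int total_errors = spa_vdev_initialize(spa, nv,
	    POOL_INITIALIZE_START, vdev_errlist);

	fnvlist_free(nv);
	fnvlist_free(vdev_errlist);
	return (total_errors);
}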
619f0976 7387
1b939560
BB
7388static int
7389spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
7390 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
7391{
7392 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7393
7394 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
7395
7396 /* Look up vdev and ensure it's a leaf. */
7397 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
7398 if (vd == NULL || vd->vdev_detached) {
7399 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7400 return (SET_ERROR(ENODEV));
7401 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
7402 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7403 return (SET_ERROR(EINVAL));
7404 } else if (!vdev_writeable(vd)) {
7405 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7406 return (SET_ERROR(EROFS));
7407 } else if (!vd->vdev_has_trim) {
7408 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7409 return (SET_ERROR(EOPNOTSUPP));
7410 } else if (secure && !vd->vdev_has_securetrim) {
7411 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7412 return (SET_ERROR(EOPNOTSUPP));
7413 }
7414 mutex_enter(&vd->vdev_trim_lock);
7415 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7416
7417 /*
7418 * When we activate a TRIM action we check to see if the
7419 * vdev_trim_thread is NULL. We do this instead of using the
7420 * vdev_trim_state since there might be a previous TRIM process
7421 * which has completed but whose thread has not yet exited.
7422 */
7423 if (cmd_type == POOL_TRIM_START &&
7424 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
7425 mutex_exit(&vd->vdev_trim_lock);
7426 return (SET_ERROR(EBUSY));
7427 } else if (cmd_type == POOL_TRIM_CANCEL &&
7428 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
7429 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
7430 mutex_exit(&vd->vdev_trim_lock);
7431 return (SET_ERROR(ESRCH));
7432 } else if (cmd_type == POOL_TRIM_SUSPEND &&
7433 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
7434 mutex_exit(&vd->vdev_trim_lock);
7435 return (SET_ERROR(ESRCH));
7436 }
7437
7438 switch (cmd_type) {
7439 case POOL_TRIM_START:
7440 vdev_trim(vd, rate, partial, secure);
7441 break;
7442 case POOL_TRIM_CANCEL:
7443 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
7444 break;
7445 case POOL_TRIM_SUSPEND:
7446 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
7447 break;
7448 default:
7449 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
7450 }
7451 mutex_exit(&vd->vdev_trim_lock);
7452
7453 return (0);
7454}
7455
7456/*
7457 * Initiates a manual TRIM for the requested vdevs. This kicks off individual
7458 * TRIM threads for each child vdev. These threads pass over all of the free
7459 * space in the vdev's metaslabs and issue TRIM commands for that space.
7460 */
7461int
7462spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
7463 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
7464{
7465 int total_errors = 0;
7466 list_t vd_list;
7467
7468 list_create(&vd_list, sizeof (vdev_t),
7469 offsetof(vdev_t, vdev_trim_node));
7470
7471 /*
7472 * We hold the namespace lock through the whole function
7473 * to prevent any changes to the pool while we're starting or
7474 * stopping TRIM. The config and state locks are held so that
7475 * we can properly assess the vdev state before we commit to
7476 * the TRIM operation.
7477 */
7478 mutex_enter(&spa_namespace_lock);
7479
7480 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
7481 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
7482 uint64_t vdev_guid = fnvpair_value_uint64(pair);
7483
7484 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
7485 rate, partial, secure, &vd_list);
7486 if (error != 0) {
7487 char guid_as_str[MAXNAMELEN];
7488
7489 (void) snprintf(guid_as_str, sizeof (guid_as_str),
7490 "%llu", (unsigned long long)vdev_guid);
7491 fnvlist_add_int64(vdev_errlist, guid_as_str, error);
7492 total_errors++;
7493 }
7494 }
7495
7496 /* Wait for all TRIM threads to stop. */
7497 vdev_trim_stop_wait(spa, &vd_list);
7498
7499 /* Sync out the TRIM state */
7500 txg_wait_synced(spa->spa_dsl_pool, 0);
7501 mutex_exit(&spa_namespace_lock);
7502
7503 list_destroy(&vd_list);
7504
7505 return (total_errors);
7506}
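/*
 * Illustrative sketch (not part of spa.c): issuing a manual TRIM for a
 * single leaf vdev.  rate = 0 is assumed to mean "no rate limit", and
 * partial/secure TRIM are left disabled.  The helper name is
 * hypothetical; as above, errors are reported per vdev guid.
 */
static int
example_start_trim(spa_t *spa, uint64_t guid)
{
	nvlist_t *nv = fnvlist_alloc();
	nvlist_t *vdev_errlist = fnvlist_alloc();
	char name[MAXNAMELEN];

	(void) snprintf(name, sizeof (name), "%llu",
	    (unsigned long long)guid);
	fnvlist_add_uint64(nv, name, guid);

	int total_errors = spa_vdev_trim(spa, nv, POOL_TRIM_START,
	    0 /* rate */, B_FALSE /* partial */, B_FALSE /* secure */,
	    vdev_errlist);

	fnvlist_free(nv);
	fnvlist_free(vdev_errlist);
	return (total_errors);
}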
7507
428870ff
BB
7508/*
7509 * Split a set of devices from their mirrors, and create a new pool from them.
7510 */
7511int
a926aab9 7512spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
428870ff
BB
7513 nvlist_t *props, boolean_t exp)
7514{
7515 int error = 0;
7516 uint64_t txg, *glist;
7517 spa_t *newspa;
7518 uint_t c, children, lastlog;
7519 nvlist_t **child, *nvl, *tmp;
7520 dmu_tx_t *tx;
7521 char *altroot = NULL;
7522 vdev_t *rvd, **vml = NULL; /* vdev modify list */
7523 boolean_t activate_slog;
7524
572e2857 7525 ASSERT(spa_writeable(spa));
428870ff
BB
7526
7527 txg = spa_vdev_enter(spa);
7528
d2734cce
SD
7529 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7530 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7531 error = (spa_has_checkpoint(spa)) ?
7532 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7533 return (spa_vdev_exit(spa, NULL, txg, error));
7534 }
7535
428870ff
BB
7536 /* clear the log and flush everything up to now */
7537 activate_slog = spa_passivate_log(spa);
7538 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
a1d477c2 7539 error = spa_reset_logs(spa);
428870ff
BB
7540 txg = spa_vdev_config_enter(spa);
7541
7542 if (activate_slog)
7543 spa_activate_log(spa);
7544
7545 if (error != 0)
7546 return (spa_vdev_exit(spa, NULL, txg, error));
7547
7548 /* check new spa name before going any further */
7549 if (spa_lookup(newname) != NULL)
7550 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
7551
7552 /*
7553 * scan through all the children to ensure they're all mirrors
7554 */
7555 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
7556 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
7557 &children) != 0)
7558 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7559
7560 /* first, check to ensure we've got the right child count */
7561 rvd = spa->spa_root_vdev;
7562 lastlog = 0;
7563 for (c = 0; c < rvd->vdev_children; c++) {
7564 vdev_t *vd = rvd->vdev_child[c];
7565
7566 /* don't count the holes & logs as children */
1b664952
GA
7567 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
7568 !vdev_is_concrete(vd))) {
428870ff
BB
7569 if (lastlog == 0)
7570 lastlog = c;
7571 continue;
7572 }
7573
7574 lastlog = 0;
7575 }
7576 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
7577 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7578
7579 /* next, ensure no spare or cache devices are part of the split */
7580 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
7581 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
7582 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7583
79c76d5b
BB
7584 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
7585 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
428870ff
BB
7586
7587 /* then, loop over each vdev and validate it */
7588 for (c = 0; c < children; c++) {
7589 uint64_t is_hole = 0;
7590
7591 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
7592 &is_hole);
7593
7594 if (is_hole != 0) {
7595 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
7596 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
7597 continue;
7598 } else {
2e528b49 7599 error = SET_ERROR(EINVAL);
428870ff
BB
7600 break;
7601 }
7602 }
7603
1b664952
GA
7604 /* deal with indirect vdevs */
7605 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
7606 &vdev_indirect_ops)
7607 continue;
7608
428870ff
BB
7609 /* which disk is going to be split? */
7610 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
7611 &glist[c]) != 0) {
2e528b49 7612 error = SET_ERROR(EINVAL);
428870ff
BB
7613 break;
7614 }
7615
7616 /* look it up in the spa */
7617 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
7618 if (vml[c] == NULL) {
2e528b49 7619 error = SET_ERROR(ENODEV);
428870ff
BB
7620 break;
7621 }
7622
7623 /* make sure there's nothing stopping the split */
7624 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
7625 vml[c]->vdev_islog ||
a1d477c2 7626 !vdev_is_concrete(vml[c]) ||
428870ff
BB
7627 vml[c]->vdev_isspare ||
7628 vml[c]->vdev_isl2cache ||
7629 !vdev_writeable(vml[c]) ||
7630 vml[c]->vdev_children != 0 ||
7631 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
7632 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
2e528b49 7633 error = SET_ERROR(EINVAL);
428870ff
BB
7634 break;
7635 }
7636
733b5722
RS
7637 if (vdev_dtl_required(vml[c]) ||
7638 vdev_resilver_needed(vml[c], NULL, NULL)) {
2e528b49 7639 error = SET_ERROR(EBUSY);
428870ff
BB
7640 break;
7641 }
7642
7643 /* we need certain info from the top level */
65ad5d11
AJ
7644 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
7645 vml[c]->vdev_top->vdev_ms_array);
7646 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
7647 vml[c]->vdev_top->vdev_ms_shift);
7648 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
7649 vml[c]->vdev_top->vdev_asize);
7650 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
7651 vml[c]->vdev_top->vdev_ashift);
e0ab3ab5
JS
7652
7653 /* transfer per-vdev ZAPs */
7654 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
7655 VERIFY0(nvlist_add_uint64(child[c],
7656 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
7657
7658 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
7659 VERIFY0(nvlist_add_uint64(child[c],
7660 ZPOOL_CONFIG_VDEV_TOP_ZAP,
7661 vml[c]->vdev_parent->vdev_top_zap));
428870ff
BB
7662 }
7663
7664 if (error != 0) {
7665 kmem_free(vml, children * sizeof (vdev_t *));
7666 kmem_free(glist, children * sizeof (uint64_t));
7667 return (spa_vdev_exit(spa, NULL, txg, error));
7668 }
7669
7670 /* stop writers from using the disks */
7671 for (c = 0; c < children; c++) {
7672 if (vml[c] != NULL)
7673 vml[c]->vdev_offline = B_TRUE;
7674 }
7675 vdev_reopen(spa->spa_root_vdev);
34dc7c2f
BB
7676
7677 /*
428870ff
BB
7678 * Temporarily record the splitting vdevs in the spa config. This
7679 * will disappear once the config is regenerated.
34dc7c2f 7680 */
65ad5d11
AJ
7681 nvl = fnvlist_alloc();
7682 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
428870ff 7683 kmem_free(glist, children * sizeof (uint64_t));
34dc7c2f 7684
428870ff 7685 mutex_enter(&spa->spa_props_lock);
65ad5d11 7686 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
428870ff
BB
7687 mutex_exit(&spa->spa_props_lock);
7688 spa->spa_config_splitting = nvl;
7689 vdev_config_dirty(spa->spa_root_vdev);
7690
7691 /* configure and create the new pool */
65ad5d11
AJ
7692 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
7693 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
7694 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
7695 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
7696 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
7697 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
7698 spa_generate_guid(NULL));
e0ab3ab5 7699 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
428870ff
BB
7700 (void) nvlist_lookup_string(props,
7701 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
34dc7c2f 7702
428870ff
BB
7703 /* add the new pool to the namespace */
7704 newspa = spa_add(newname, config, altroot);
e0ab3ab5 7705 newspa->spa_avz_action = AVZ_ACTION_REBUILD;
428870ff
BB
7706 newspa->spa_config_txg = spa->spa_config_txg;
7707 spa_set_log_state(newspa, SPA_LOG_CLEAR);
7708
7709 /* release the spa config lock, retaining the namespace lock */
7710 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
7711
7712 if (zio_injection_enabled)
7713 zio_handle_panic_injection(spa, FTAG, 1);
7714
7715 spa_activate(newspa, spa_mode_global);
7716 spa_async_suspend(newspa);
7717
c10d37dd 7718 /*
1b939560
BB
7719 * Temporarily stop the initializing and TRIM activity. We set the
7720 * state to ACTIVE so that we know to resume initializing or TRIM
7721 * once the split has completed.
c10d37dd 7722 */
1b939560
BB
7723 list_t vd_initialize_list;
7724 list_create(&vd_initialize_list, sizeof (vdev_t),
c10d37dd
GW
7725 offsetof(vdev_t, vdev_initialize_node));
7726
1b939560
BB
7727 list_t vd_trim_list;
7728 list_create(&vd_trim_list, sizeof (vdev_t),
7729 offsetof(vdev_t, vdev_trim_node));
7730
619f0976 7731 for (c = 0; c < children; c++) {
1b664952 7732 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
619f0976 7733 mutex_enter(&vml[c]->vdev_initialize_lock);
1b939560
BB
7734 vdev_initialize_stop(vml[c],
7735 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
619f0976 7736 mutex_exit(&vml[c]->vdev_initialize_lock);
1b939560
BB
7737
7738 mutex_enter(&vml[c]->vdev_trim_lock);
7739 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
7740 mutex_exit(&vml[c]->vdev_trim_lock);
619f0976
GW
7741 }
7742 }
1b939560
BB
7743
7744 vdev_initialize_stop_wait(spa, &vd_initialize_list);
7745 vdev_trim_stop_wait(spa, &vd_trim_list);
7746
7747 list_destroy(&vd_initialize_list);
7748 list_destroy(&vd_trim_list);
619f0976 7749
6cb8e530 7750 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
8b27e08e 7751 newspa->spa_is_splitting = B_TRUE;
6cb8e530 7752
428870ff 7753 /* create the new pool from the disks of the original pool */
6cb8e530 7754 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
428870ff
BB
7755 if (error)
7756 goto out;
7757
7758 /* if that worked, generate a real config for the new pool */
7759 if (newspa->spa_root_vdev != NULL) {
65ad5d11
AJ
7760 newspa->spa_config_splitting = fnvlist_alloc();
7761 fnvlist_add_uint64(newspa->spa_config_splitting,
7762 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
428870ff
BB
7763 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
7764 B_TRUE));
9babb374 7765 }
34dc7c2f 7766
428870ff
BB
7767 /* set the props */
7768 if (props != NULL) {
7769 spa_configfile_set(newspa, props, B_FALSE);
7770 error = spa_prop_set(newspa, props);
7771 if (error)
7772 goto out;
7773 }
34dc7c2f 7774
428870ff
BB
7775 /* flush everything */
7776 txg = spa_vdev_config_enter(newspa);
7777 vdev_config_dirty(newspa->spa_root_vdev);
7778 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
34dc7c2f 7779
428870ff
BB
7780 if (zio_injection_enabled)
7781 zio_handle_panic_injection(spa, FTAG, 2);
34dc7c2f 7782
428870ff 7783 spa_async_resume(newspa);
34dc7c2f 7784
428870ff
BB
7785 /* finally, update the original pool's config */
7786 txg = spa_vdev_config_enter(spa);
7787 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
7788 error = dmu_tx_assign(tx, TXG_WAIT);
7789 if (error != 0)
7790 dmu_tx_abort(tx);
7791 for (c = 0; c < children; c++) {
1b664952 7792 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
234234ca
RS
7793 vdev_t *tvd = vml[c]->vdev_top;
7794
7795 /*
7796 * Need to be sure the detachable VDEV is not
7797 * on any *other* txg's DTL list to prevent it
7798 * from being accessed after it's freed.
7799 */
7800 for (int t = 0; t < TXG_SIZE; t++) {
7801 (void) txg_list_remove_this(
7802 &tvd->vdev_dtl_list, vml[c], t);
7803 }
7804
428870ff
BB
7805 vdev_split(vml[c]);
7806 if (error == 0)
6f1ffb06
MA
7807 spa_history_log_internal(spa, "detach", tx,
7808 "vdev=%s", vml[c]->vdev_path);
e0ab3ab5 7809
428870ff 7810 vdev_free(vml[c]);
34dc7c2f 7811 }
34dc7c2f 7812 }
e0ab3ab5 7813 spa->spa_avz_action = AVZ_ACTION_REBUILD;
428870ff
BB
7814 vdev_config_dirty(spa->spa_root_vdev);
7815 spa->spa_config_splitting = NULL;
7816 nvlist_free(nvl);
7817 if (error == 0)
7818 dmu_tx_commit(tx);
7819 (void) spa_vdev_exit(spa, NULL, txg, 0);
7820
7821 if (zio_injection_enabled)
7822 zio_handle_panic_injection(spa, FTAG, 3);
7823
7824 /* split is complete; log a history record */
6f1ffb06
MA
7825 spa_history_log_internal(newspa, "split", NULL,
7826 "from pool %s", spa_name(spa));
428870ff 7827
8b27e08e 7828 newspa->spa_is_splitting = B_FALSE;
428870ff
BB
7829 kmem_free(vml, children * sizeof (vdev_t *));
7830
7831 /* if we're not going to mount the filesystems in userland, export */
7832 if (exp)
7833 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
7834 B_FALSE, B_FALSE);
7835
7836 return (error);
7837
7838out:
7839 spa_unload(newspa);
7840 spa_deactivate(newspa);
7841 spa_remove(newspa);
7842
7843 txg = spa_vdev_config_enter(spa);
7844
7845 /* re-online all offlined disks */
7846 for (c = 0; c < children; c++) {
7847 if (vml[c] != NULL)
7848 vml[c]->vdev_offline = B_FALSE;
7849 }
619f0976 7850
1b939560 7851 /* restart initializing or trimming disks as necessary */
619f0976 7852 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
1b939560
BB
7853 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
7854 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
619f0976 7855
428870ff
BB
7856 vdev_reopen(spa->spa_root_vdev);
7857
7858 nvlist_free(spa->spa_config_splitting);
7859 spa->spa_config_splitting = NULL;
7860 (void) spa_vdev_exit(spa, NULL, txg, error);
34dc7c2f 7861
428870ff 7862 kmem_free(vml, children * sizeof (vdev_t *));
34dc7c2f
BB
7863 return (error);
7864}
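/*
 * Illustrative sketch (not part of spa.c): splitting one leaf out of each
 * top-level mirror into a new pool.  The caller supplies, per top-level
 * vdev, the guid of the child it wants to take (split_guids[]); the code
 * above also pulls metaslab and per-vdev ZAP details out of each child
 * entry.  The array, pool name, and helper name are hypothetical, and a
 * real caller (the zpool split ioctl path) builds a more complete config.
 */
static int
example_split_mirrors(spa_t *spa, const char *newname,
    const uint64_t *split_guids, uint_t ntoplevel)
{
	nvlist_t *config = fnvlist_alloc();
	nvlist_t *tree = fnvlist_alloc();
	nvlist_t **child = kmem_zalloc(ntoplevel * sizeof (nvlist_t *),
	    KM_SLEEP);

	for (uint_t c = 0; c < ntoplevel; c++) {
		child[c] = fnvlist_alloc();
		/* which disk of top-level vdev 'c' goes to the new pool */
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_GUID,
		    split_guids[c]);
	}
	fnvlist_add_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
	    child, ntoplevel);
	fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, tree);

	/* no props, and keep the new pool imported (exp = B_FALSE) */
	int error = spa_vdev_split_mirror(spa, newname, config, NULL,
	    B_FALSE);

	for (uint_t c = 0; c < ntoplevel; c++)
		fnvlist_free(child[c]);
	kmem_free(child, ntoplevel * sizeof (nvlist_t *));
	fnvlist_free(tree);
	fnvlist_free(config);
	return (error);
}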
7865
34dc7c2f
BB
7866/*
7867 * Find any device that's done replacing, or a vdev marked 'unspare' that's
d3cc8b15 7868 * currently spared, so we can detach it.
34dc7c2f
BB
7869 */
7870static vdev_t *
7871spa_vdev_resilver_done_hunt(vdev_t *vd)
7872{
7873 vdev_t *newvd, *oldvd;
34dc7c2f 7874
1c27024e 7875 for (int c = 0; c < vd->vdev_children; c++) {
34dc7c2f
BB
7876 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
7877 if (oldvd != NULL)
7878 return (oldvd);
7879 }
7880
7881 /*
572e2857
BB
7882 * Check for a completed replacement. We always consider the first
7883 * vdev in the list to be the oldest vdev, and the last one to be
7884 * the newest (see spa_vdev_attach() for how that works). In
7885 * the case where the newest vdev is faulted, we will not automatically
7886 * remove it after a resilver completes. This is OK as it will require
7887 * user intervention to determine which disk the admin wishes to keep.
34dc7c2f 7888 */
572e2857
BB
7889 if (vd->vdev_ops == &vdev_replacing_ops) {
7890 ASSERT(vd->vdev_children > 1);
7891
7892 newvd = vd->vdev_child[vd->vdev_children - 1];
34dc7c2f 7893 oldvd = vd->vdev_child[0];
34dc7c2f 7894
fb5f0bc8 7895 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
428870ff 7896 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
fb5f0bc8 7897 !vdev_dtl_required(oldvd))
34dc7c2f 7898 return (oldvd);
34dc7c2f
BB
7899 }
7900
7901 /*
7902 * Check for a completed resilver with the 'unspare' flag set.
f65fbee1 7903 * Also potentially update faulted state.
34dc7c2f 7904 */
572e2857
BB
7905 if (vd->vdev_ops == &vdev_spare_ops) {
7906 vdev_t *first = vd->vdev_child[0];
7907 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
7908
7909 if (last->vdev_unspare) {
7910 oldvd = first;
7911 newvd = last;
7912 } else if (first->vdev_unspare) {
7913 oldvd = last;
7914 newvd = first;
7915 } else {
7916 oldvd = NULL;
7917 }
34dc7c2f 7918
572e2857 7919 if (oldvd != NULL &&
fb5f0bc8 7920 vdev_dtl_empty(newvd, DTL_MISSING) &&
428870ff 7921 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
572e2857 7922 !vdev_dtl_required(oldvd))
34dc7c2f 7923 return (oldvd);
572e2857 7924
f65fbee1
JJ
7925 vdev_propagate_state(vd);
7926
572e2857
BB
7927 /*
7928 * If there are more than two spares attached to a disk,
7929 * and those spares are not required, then we want to
7930 * attempt to free them up now so that they can be used
7931 * by other pools. Once we're back down to a single
7932 * disk+spare, we stop removing them.
7933 */
7934 if (vd->vdev_children > 2) {
7935 newvd = vd->vdev_child[1];
7936
7937 if (newvd->vdev_isspare && last->vdev_isspare &&
7938 vdev_dtl_empty(last, DTL_MISSING) &&
7939 vdev_dtl_empty(last, DTL_OUTAGE) &&
7940 !vdev_dtl_required(newvd))
7941 return (newvd);
34dc7c2f 7942 }
34dc7c2f
BB
7943 }
7944
7945 return (NULL);
7946}
7947
7948static void
7949spa_vdev_resilver_done(spa_t *spa)
7950{
fb5f0bc8
BB
7951 vdev_t *vd, *pvd, *ppvd;
7952 uint64_t guid, sguid, pguid, ppguid;
34dc7c2f 7953
fb5f0bc8 7954 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
7955
7956 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
fb5f0bc8
BB
7957 pvd = vd->vdev_parent;
7958 ppvd = pvd->vdev_parent;
34dc7c2f 7959 guid = vd->vdev_guid;
fb5f0bc8
BB
7960 pguid = pvd->vdev_guid;
7961 ppguid = ppvd->vdev_guid;
7962 sguid = 0;
34dc7c2f
BB
7963 /*
7964 * If we have just finished replacing a hot spared device, then
7965 * we need to detach the parent's first child (the original hot
7966 * spare) as well.
7967 */
572e2857
BB
7968 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
7969 ppvd->vdev_children == 2) {
34dc7c2f 7970 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
fb5f0bc8 7971 sguid = ppvd->vdev_child[1]->vdev_guid;
34dc7c2f 7972 }
5d1f7fb6
GW
7973 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
7974
fb5f0bc8
BB
7975 spa_config_exit(spa, SCL_ALL, FTAG);
7976 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
34dc7c2f 7977 return;
fb5f0bc8 7978 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
34dc7c2f 7979 return;
fb5f0bc8 7980 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
7981 }
7982
fb5f0bc8 7983 spa_config_exit(spa, SCL_ALL, FTAG);
9a49d3f3
BB
7984
7985 /*
7986 * If a detach was not performed above, replace waiters will not have
7987 * been notified, in which case we must do so now.
7988 */
7989 spa_notify_waiters(spa);
34dc7c2f
BB
7990}
7991
7992/*
428870ff 7993 * Update the stored path or FRU for this vdev.
34dc7c2f 7994 */
65c7cc49 7995static int
9babb374
BB
7996spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
7997 boolean_t ispath)
34dc7c2f 7998{
b128c09f 7999 vdev_t *vd;
428870ff 8000 boolean_t sync = B_FALSE;
34dc7c2f 8001
572e2857
BB
8002 ASSERT(spa_writeable(spa));
8003
428870ff 8004 spa_vdev_state_enter(spa, SCL_ALL);
34dc7c2f 8005
9babb374 8006 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
428870ff 8007 return (spa_vdev_state_exit(spa, NULL, ENOENT));
34dc7c2f
BB
8008
8009 if (!vd->vdev_ops->vdev_op_leaf)
428870ff 8010 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
34dc7c2f 8011
9babb374 8012 if (ispath) {
428870ff
BB
8013 if (strcmp(value, vd->vdev_path) != 0) {
8014 spa_strfree(vd->vdev_path);
8015 vd->vdev_path = spa_strdup(value);
8016 sync = B_TRUE;
8017 }
9babb374 8018 } else {
428870ff
BB
8019 if (vd->vdev_fru == NULL) {
8020 vd->vdev_fru = spa_strdup(value);
8021 sync = B_TRUE;
8022 } else if (strcmp(value, vd->vdev_fru) != 0) {
9babb374 8023 spa_strfree(vd->vdev_fru);
428870ff
BB
8024 vd->vdev_fru = spa_strdup(value);
8025 sync = B_TRUE;
8026 }
9babb374 8027 }
34dc7c2f 8028
428870ff 8029 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
34dc7c2f
BB
8030}
8031
9babb374
BB
8032int
8033spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
8034{
8035 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
8036}
8037
8038int
8039spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
8040{
8041 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
8042}
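/*
 * Illustrative sketch (not part of spa.c): recording a new device path
 * after a disk has been recabled.  The path and helper name are
 * hypothetical; spa_vdev_set_common() only dirties the config when the
 * stored value actually changes.
 */
static int
example_update_vdev_path(spa_t *spa, uint64_t guid)
{
	return (spa_vdev_setpath(spa, guid, "/dev/disk/by-id/new-name"));
}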
8043
34dc7c2f
BB
8044/*
8045 * ==========================================================================
428870ff 8046 * SPA Scanning
34dc7c2f
BB
8047 * ==========================================================================
8048 */
0ea05c64
AP
8049int
8050spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
8051{
8052 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
8053
8054 if (dsl_scan_resilvering(spa->spa_dsl_pool))
8055 return (SET_ERROR(EBUSY));
8056
8057 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
8058}
34dc7c2f 8059
34dc7c2f 8060int
428870ff
BB
8061spa_scan_stop(spa_t *spa)
8062{
8063 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
8064 if (dsl_scan_resilvering(spa->spa_dsl_pool))
2e528b49 8065 return (SET_ERROR(EBUSY));
428870ff
BB
8066 return (dsl_scan_cancel(spa->spa_dsl_pool));
8067}
8068
8069int
8070spa_scan(spa_t *spa, pool_scan_func_t func)
34dc7c2f 8071{
b128c09f 8072 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
34dc7c2f 8073
428870ff 8074 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
2e528b49 8075 return (SET_ERROR(ENOTSUP));
34dc7c2f 8076
fa241660
TC
8077 if (func == POOL_SCAN_RESILVER &&
8078 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
8079 return (SET_ERROR(ENOTSUP));
8080
34dc7c2f 8081 /*
b128c09f
BB
8082 * If a resilver was requested, but there is no DTL on a
8083 * writeable leaf device, we have nothing to do.
34dc7c2f 8084 */
428870ff 8085 if (func == POOL_SCAN_RESILVER &&
b128c09f
BB
8086 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
8087 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
34dc7c2f
BB
8088 return (0);
8089 }
8090
428870ff 8091 return (dsl_scan(spa->spa_dsl_pool, func));
34dc7c2f
BB
8092}
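/*
 * Illustrative sketch (not part of spa.c): kicking off a scrub and later
 * pausing it.  POOL_SCAN_SCRUB and POOL_SCRUB_PAUSE are the standard
 * pool_scan_func_t / pool_scrub_cmd_t values; the helper name and the
 * dbgmsg text are hypothetical.
 */
static void
example_scrub_then_pause(spa_t *spa)
{
	int error = spa_scan(spa, POOL_SCAN_SCRUB);
	if (error != 0) {
		zfs_dbgmsg("scrub of %s failed to start: %d",
		    spa_name(spa), error);
		return;
	}

	/* Later, an administrator-driven pause request might look like: */
	error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
	if (error == EBUSY)
		zfs_dbgmsg("%s is resilvering; pause refused", spa_name(spa));
}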
8093
8094/*
8095 * ==========================================================================
8096 * SPA async task processing
8097 * ==========================================================================
8098 */
8099
8100static void
8101spa_async_remove(spa_t *spa, vdev_t *vd)
8102{
b128c09f 8103 if (vd->vdev_remove_wanted) {
428870ff
BB
8104 vd->vdev_remove_wanted = B_FALSE;
8105 vd->vdev_delayed_close = B_FALSE;
b128c09f 8106 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
428870ff
BB
8107
8108 /*
8109 * We want to clear the stats, but we don't want to do a full
8110 * vdev_clear() as that will cause us to throw away
8111 * degraded/faulted state as well as attempt to reopen the
8112 * device, all of which is a waste.
8113 */
8114 vd->vdev_stat.vs_read_errors = 0;
8115 vd->vdev_stat.vs_write_errors = 0;
8116 vd->vdev_stat.vs_checksum_errors = 0;
8117
b128c09f 8118 vdev_state_dirty(vd->vdev_top);
0aacde2e
RM
8119
8120 /* Tell userspace that the vdev is gone. */
8121 zfs_post_remove(spa, vd);
b128c09f 8122 }
34dc7c2f 8123
1c27024e 8124 for (int c = 0; c < vd->vdev_children; c++)
b128c09f
BB
8125 spa_async_remove(spa, vd->vdev_child[c]);
8126}
8127
8128static void
8129spa_async_probe(spa_t *spa, vdev_t *vd)
8130{
8131 if (vd->vdev_probe_wanted) {
428870ff 8132 vd->vdev_probe_wanted = B_FALSE;
b128c09f 8133 vdev_reopen(vd); /* vdev_open() does the actual probe */
34dc7c2f 8134 }
b128c09f 8135
1c27024e 8136 for (int c = 0; c < vd->vdev_children; c++)
b128c09f 8137 spa_async_probe(spa, vd->vdev_child[c]);
34dc7c2f
BB
8138}
8139
9babb374
BB
8140static void
8141spa_async_autoexpand(spa_t *spa, vdev_t *vd)
8142{
9babb374
BB
8143 if (!spa->spa_autoexpand)
8144 return;
8145
1c27024e 8146 for (int c = 0; c < vd->vdev_children; c++) {
9babb374
BB
8147 vdev_t *cvd = vd->vdev_child[c];
8148 spa_async_autoexpand(spa, cvd);
8149 }
8150
8151 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
8152 return;
8153
12fa0466 8154 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
9babb374
BB
8155}
8156
460748d4 8157static __attribute__((noreturn)) void
c25b8f99 8158spa_async_thread(void *arg)
34dc7c2f 8159{
c25b8f99 8160 spa_t *spa = (spa_t *)arg;
80a91e74 8161 dsl_pool_t *dp = spa->spa_dsl_pool;
867959b5 8162 int tasks;
34dc7c2f
BB
8163
8164 ASSERT(spa->spa_sync_on);
8165
8166 mutex_enter(&spa->spa_async_lock);
8167 tasks = spa->spa_async_tasks;
8168 spa->spa_async_tasks = 0;
8169 mutex_exit(&spa->spa_async_lock);
8170
8171 /*
8172 * See if the config needs to be updated.
8173 */
8174 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
428870ff 8175 uint64_t old_space, new_space;
9babb374 8176
34dc7c2f 8177 mutex_enter(&spa_namespace_lock);
428870ff 8178 old_space = metaslab_class_get_space(spa_normal_class(spa));
cc99f275
DB
8179 old_space += metaslab_class_get_space(spa_special_class(spa));
8180 old_space += metaslab_class_get_space(spa_dedup_class(spa));
aa755b35
MA
8181 old_space += metaslab_class_get_space(
8182 spa_embedded_log_class(spa));
cc99f275 8183
34dc7c2f 8184 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
cc99f275 8185
428870ff 8186 new_space = metaslab_class_get_space(spa_normal_class(spa));
cc99f275
DB
8187 new_space += metaslab_class_get_space(spa_special_class(spa));
8188 new_space += metaslab_class_get_space(spa_dedup_class(spa));
aa755b35
MA
8189 new_space += metaslab_class_get_space(
8190 spa_embedded_log_class(spa));
34dc7c2f 8191 mutex_exit(&spa_namespace_lock);
9babb374
BB
8192
8193 /*
8194 * If the pool grew as a result of the config update,
8195 * then log an internal history event.
8196 */
428870ff 8197 if (new_space != old_space) {
6f1ffb06 8198 spa_history_log_internal(spa, "vdev online", NULL,
45d1cae3 8199 "pool '%s' size: %llu(+%llu)",
74756182
MM
8200 spa_name(spa), (u_longlong_t)new_space,
8201 (u_longlong_t)(new_space - old_space));
9babb374 8202 }
34dc7c2f
BB
8203 }
8204
8205 /*
8206 * See if any devices need to be marked REMOVED.
34dc7c2f 8207 */
b128c09f 8208 if (tasks & SPA_ASYNC_REMOVE) {
428870ff 8209 spa_vdev_state_enter(spa, SCL_NONE);
34dc7c2f 8210 spa_async_remove(spa, spa->spa_root_vdev);
867959b5 8211 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
b128c09f 8212 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
867959b5 8213 for (int i = 0; i < spa->spa_spares.sav_count; i++)
b128c09f
BB
8214 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
8215 (void) spa_vdev_state_exit(spa, NULL, 0);
34dc7c2f
BB
8216 }
8217
9babb374
BB
8218 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
8219 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8220 spa_async_autoexpand(spa, spa->spa_root_vdev);
8221 spa_config_exit(spa, SCL_CONFIG, FTAG);
8222 }
8223
34dc7c2f 8224 /*
b128c09f 8225 * See if any devices need to be probed.
34dc7c2f 8226 */
b128c09f 8227 if (tasks & SPA_ASYNC_PROBE) {
428870ff 8228 spa_vdev_state_enter(spa, SCL_NONE);
b128c09f
BB
8229 spa_async_probe(spa, spa->spa_root_vdev);
8230 (void) spa_vdev_state_exit(spa, NULL, 0);
8231 }
34dc7c2f
BB
8232
8233 /*
b128c09f 8234 * If any devices are done replacing, detach them.
34dc7c2f 8235 */
b2255edc
BB
8236 if (tasks & SPA_ASYNC_RESILVER_DONE ||
8237 tasks & SPA_ASYNC_REBUILD_DONE) {
b128c09f 8238 spa_vdev_resilver_done(spa);
9a49d3f3
BB
8239 }
8240
34dc7c2f
BB
8241 /*
8242 * Kick off a resilver.
8243 */
80a91e74 8244 if (tasks & SPA_ASYNC_RESILVER &&
9a49d3f3 8245 !vdev_rebuild_active(spa->spa_root_vdev) &&
80a91e74
TC
8246 (!dsl_scan_resilvering(dp) ||
8247 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
3c819a2c 8248 dsl_scan_restart_resilver(dp, 0);
34dc7c2f 8249
619f0976
GW
8250 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
8251 mutex_enter(&spa_namespace_lock);
8252 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8253 vdev_initialize_restart(spa->spa_root_vdev);
8254 spa_config_exit(spa, SCL_CONFIG, FTAG);
8255 mutex_exit(&spa_namespace_lock);
8256 }
8257
1b939560
BB
8258 if (tasks & SPA_ASYNC_TRIM_RESTART) {
8259 mutex_enter(&spa_namespace_lock);
8260 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8261 vdev_trim_restart(spa->spa_root_vdev);
8262 spa_config_exit(spa, SCL_CONFIG, FTAG);
8263 mutex_exit(&spa_namespace_lock);
8264 }
8265
8266 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
8267 mutex_enter(&spa_namespace_lock);
8268 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8269 vdev_autotrim_restart(spa);
8270 spa_config_exit(spa, SCL_CONFIG, FTAG);
8271 mutex_exit(&spa_namespace_lock);
8272 }
8273
b7654bd7
GA
8274 /*
8275 * Kick off L2 cache whole device TRIM.
8276 */
8277 if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
8278 mutex_enter(&spa_namespace_lock);
8279 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8280 vdev_trim_l2arc(spa);
8281 spa_config_exit(spa, SCL_CONFIG, FTAG);
8282 mutex_exit(&spa_namespace_lock);
8283 }
8284
77f6826b
GA
8285 /*
8286 * Kick off L2 cache rebuilding.
8287 */
8288 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
8289 mutex_enter(&spa_namespace_lock);
8290 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
8291 l2arc_spa_rebuild_start(spa);
8292 spa_config_exit(spa, SCL_L2ARC, FTAG);
8293 mutex_exit(&spa_namespace_lock);
8294 }
8295
34dc7c2f
BB
8296 /*
8297 * Let the world know that we're done.
8298 */
8299 mutex_enter(&spa->spa_async_lock);
8300 spa->spa_async_thread = NULL;
8301 cv_broadcast(&spa->spa_async_cv);
8302 mutex_exit(&spa->spa_async_lock);
8303 thread_exit();
8304}
8305
8306void
8307spa_async_suspend(spa_t *spa)
8308{
8309 mutex_enter(&spa->spa_async_lock);
8310 spa->spa_async_suspended++;
9d5b5245 8311 while (spa->spa_async_thread != NULL)
34dc7c2f
BB
8312 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
8313 mutex_exit(&spa->spa_async_lock);
a1d477c2
MA
8314
8315 spa_vdev_remove_suspend(spa);
9d5b5245
SD
8316
8317 zthr_t *condense_thread = spa->spa_condense_zthr;
61c3391a
SD
8318 if (condense_thread != NULL)
8319 zthr_cancel(condense_thread);
d2734cce
SD
8320
8321 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
61c3391a
SD
8322 if (discard_thread != NULL)
8323 zthr_cancel(discard_thread);
37f03da8
SH
8324
8325 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
8326 if (ll_delete_thread != NULL)
8327 zthr_cancel(ll_delete_thread);
8328
8329 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
8330 if (ll_condense_thread != NULL)
8331 zthr_cancel(ll_condense_thread);
34dc7c2f
BB
8332}
8333
8334void
8335spa_async_resume(spa_t *spa)
8336{
8337 mutex_enter(&spa->spa_async_lock);
8338 ASSERT(spa->spa_async_suspended != 0);
8339 spa->spa_async_suspended--;
8340 mutex_exit(&spa->spa_async_lock);
a1d477c2 8341 spa_restart_removal(spa);
9d5b5245
SD
8342
8343 zthr_t *condense_thread = spa->spa_condense_zthr;
61c3391a 8344 if (condense_thread != NULL)
9d5b5245 8345 zthr_resume(condense_thread);
d2734cce
SD
8346
8347 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
61c3391a 8348 if (discard_thread != NULL)
d2734cce 8349 zthr_resume(discard_thread);
37f03da8
SH
8350
8351 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
8352 if (ll_delete_thread != NULL)
8353 zthr_resume(ll_delete_thread);
8354
8355 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
8356 if (ll_condense_thread != NULL)
8357 zthr_resume(ll_condense_thread);
34dc7c2f
BB
8358}
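/*
 * An illustrative sketch (not upstream code): callers that must not race
 * with the async thread bracket their work with the suspend/resume pair
 * above, much as spa_evict_all() suspends async tasks before unloading a
 * pool. The helper name and the work in the middle are hypothetical.
 */
static void __maybe_unused
example_quiesce_async_work(spa_t *spa)
{
	spa_async_suspend(spa);	/* waits for spa_async_thread to exit */

	/* ... operate on state the async thread would otherwise touch ... */

	spa_async_resume(spa);	/* spa_async_dispatch() may restart it later */
}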
8359
e6cfd633
WA
8360static boolean_t
8361spa_async_tasks_pending(spa_t *spa)
8362{
8363 uint_t non_config_tasks;
8364 uint_t config_task;
8365 boolean_t config_task_suspended;
8366
8367 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
8368 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
8369 if (spa->spa_ccw_fail_time == 0) {
8370 config_task_suspended = B_FALSE;
8371 } else {
8372 config_task_suspended =
8373 (gethrtime() - spa->spa_ccw_fail_time) <
05852b34 8374 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
e6cfd633
WA
8375 }
8376
8377 return (non_config_tasks || (config_task && !config_task_suspended));
8378}
8379
34dc7c2f
BB
8380static void
8381spa_async_dispatch(spa_t *spa)
8382{
8383 mutex_enter(&spa->spa_async_lock);
e6cfd633
WA
8384 if (spa_async_tasks_pending(spa) &&
8385 !spa->spa_async_suspended &&
da92d5cb 8386 spa->spa_async_thread == NULL)
34dc7c2f
BB
8387 spa->spa_async_thread = thread_create(NULL, 0,
8388 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
8389 mutex_exit(&spa->spa_async_lock);
8390}
8391
8392void
8393spa_async_request(spa_t *spa, int task)
8394{
428870ff 8395 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
34dc7c2f
BB
8396 mutex_enter(&spa->spa_async_lock);
8397 spa->spa_async_tasks |= task;
8398 mutex_exit(&spa->spa_async_lock);
8399}
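/*
 * An illustrative sketch (not upstream code): requesting deferred work.
 * The call only sets a task bit under spa_async_lock; the work itself runs
 * in spa_async_thread() once spa_sync() calls spa_async_dispatch(). The
 * helper name is hypothetical.
 */
static void __maybe_unused
example_schedule_config_rewrite(spa_t *spa)
{
	/* Repeated requests for the same task collapse into a single bit. */
	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}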
8400
3c819a2c
JP
8401int
8402spa_async_tasks(spa_t *spa)
8403{
8404 return (spa->spa_async_tasks);
8405}
8406
34dc7c2f
BB
8407/*
8408 * ==========================================================================
8409 * SPA syncing routines
8410 * ==========================================================================
8411 */
8412
37f03da8 8413
428870ff 8414static int
37f03da8
SH
8415bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
8416 dmu_tx_t *tx)
34dc7c2f 8417{
428870ff 8418 bpobj_t *bpo = arg;
37f03da8 8419 bpobj_enqueue(bpo, bp, bp_freed, tx);
428870ff
BB
8420 return (0);
8421}
34dc7c2f 8422
37f03da8
SH
8423int
8424bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
8425{
8426 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
8427}
8428
8429int
8430bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
8431{
8432 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
8433}
8434
428870ff
BB
8435static int
8436spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
8437{
9cdf7b1f 8438 zio_t *pio = arg;
34dc7c2f 8439
9cdf7b1f
MA
8440 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
8441 pio->io_flags));
428870ff 8442 return (0);
34dc7c2f
BB
8443}
8444
37f03da8
SH
8445static int
8446bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
8447 dmu_tx_t *tx)
8448{
8449 ASSERT(!bp_freed);
8450 return (spa_free_sync_cb(arg, bp, tx));
8451}
8452
e8b96c60
MA
8453/*
8454 * Note: this simple function is not inlined to make it easier to dtrace the
8455 * amount of time spent syncing frees.
8456 */
8457static void
8458spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
8459{
8460 zio_t *zio = zio_root(spa, NULL, NULL, 0);
8461 bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
8462 VERIFY(zio_wait(zio) == 0);
8463}
8464
8465/*
8466 * Note: this simple function is not inlined to make it easier to dtrace the
8467 * amount of time spent syncing deferred frees.
8468 */
8469static void
8470spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
8471{
8dc2197b
SD
8472 if (spa_sync_pass(spa) != 1)
8473 return;
8474
93e28d66
SD
8475 /*
8476 * Note:
8477 * If the log space map feature is active, we stop deferring
8478 * frees to the next TXG and therefore running this function
8479 * would be considered a no-op as spa_deferred_bpobj should
8480 * not have any entries.
8481 *
8482 * That said we run this function anyway (instead of returning
8483 * immediately) for the edge-case scenario where we just
8484 * activated the log space map feature in this TXG but we have
8485 * deferred frees from the previous TXG.
8486 */
e8b96c60
MA
8487 zio_t *zio = zio_root(spa, NULL, NULL, 0);
8488 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
37f03da8 8489 bpobj_spa_free_sync_cb, zio, tx), ==, 0);
e8b96c60
MA
8490 VERIFY0(zio_wait(zio));
8491}
8492
34dc7c2f
BB
8493static void
8494spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
8495{
8496 char *packed = NULL;
b128c09f 8497 size_t bufsize;
34dc7c2f
BB
8498 size_t nvsize = 0;
8499 dmu_buf_t *db;
8500
8501 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
8502
b128c09f
BB
8503 /*
8504 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
b0bc7a84 8505 * information. This avoids the dmu_buf_will_dirty() path and
b128c09f
BB
8506 * saves us a pre-read to get data we don't actually care about.
8507 */
9ae529ec 8508 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
79c76d5b 8509 packed = vmem_alloc(bufsize, KM_SLEEP);
34dc7c2f
BB
8510
8511 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
79c76d5b 8512 KM_SLEEP) == 0);
861166b0 8513 memset(packed + nvsize, 0, bufsize - nvsize);
34dc7c2f 8514
b128c09f 8515 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
34dc7c2f 8516
00b46022 8517 vmem_free(packed, bufsize);
34dc7c2f
BB
8518
8519 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
8520 dmu_buf_will_dirty(db, tx);
8521 *(uint64_t *)db->db_data = nvsize;
8522 dmu_buf_rele(db, FTAG);
8523}
8524
8525static void
8526spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
8527 const char *config, const char *entry)
8528{
8529 nvlist_t *nvroot;
8530 nvlist_t **list;
8531 int i;
8532
8533 if (!sav->sav_sync)
8534 return;
8535
8536 /*
8537 * Update the MOS nvlist describing the list of available devices.
8538 * spa_validate_aux() will have already made sure this nvlist is
8539 * valid and the vdevs are labeled appropriately.
8540 */
8541 if (sav->sav_object == 0) {
8542 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
8543 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
8544 sizeof (uint64_t), tx);
8545 VERIFY(zap_update(spa->spa_meta_objset,
8546 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
8547 &sav->sav_object, tx) == 0);
8548 }
8549
65ad5d11 8550 nvroot = fnvlist_alloc();
34dc7c2f 8551 if (sav->sav_count == 0) {
795075e6
PD
8552 fnvlist_add_nvlist_array(nvroot, config,
8553 (const nvlist_t * const *)NULL, 0);
34dc7c2f 8554 } else {
79c76d5b 8555 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
34dc7c2f
BB
8556 for (i = 0; i < sav->sav_count; i++)
8557 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
428870ff 8558 B_FALSE, VDEV_CONFIG_L2CACHE);
795075e6
PD
8559 fnvlist_add_nvlist_array(nvroot, config,
8560 (const nvlist_t * const *)list, sav->sav_count);
34dc7c2f
BB
8561 for (i = 0; i < sav->sav_count; i++)
8562 nvlist_free(list[i]);
8563 kmem_free(list, sav->sav_count * sizeof (void *));
8564 }
8565
8566 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
8567 nvlist_free(nvroot);
8568
8569 sav->sav_sync = B_FALSE;
8570}
8571
e0ab3ab5
JS
8572/*
8573 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
8574 * The all-vdev ZAP must be empty.
8575 */
8576static void
8577spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
8578{
8579 spa_t *spa = vd->vdev_spa;
e0ab3ab5
JS
8580
8581 if (vd->vdev_top_zap != 0) {
8582 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
8583 vd->vdev_top_zap, tx));
8584 }
8585 if (vd->vdev_leaf_zap != 0) {
8586 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
8587 vd->vdev_leaf_zap, tx));
8588 }
1c27024e 8589 for (uint64_t i = 0; i < vd->vdev_children; i++) {
e0ab3ab5
JS
8590 spa_avz_build(vd->vdev_child[i], avz, tx);
8591 }
8592}
8593
34dc7c2f
BB
8594static void
8595spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
8596{
8597 nvlist_t *config;
8598
e0ab3ab5
JS
8599 /*
8600 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
8601 * its config may not be dirty but we still need to build per-vdev ZAPs.
8602 * Similarly, if the pool is being assembled (e.g. after a split), we
8603 * need to rebuild the AVZ although the config may not be dirty.
8604 */
8605 if (list_is_empty(&spa->spa_config_dirty_list) &&
8606 spa->spa_avz_action == AVZ_ACTION_NONE)
34dc7c2f
BB
8607 return;
8608
b128c09f
BB
8609 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8610
e0ab3ab5 8611 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
38640550 8612 spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
e0ab3ab5
JS
8613 spa->spa_all_vdev_zaps != 0);
8614
8615 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
e0ab3ab5
JS
8616 /* Make and build the new AVZ */
8617 uint64_t new_avz = zap_create(spa->spa_meta_objset,
8618 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
8619 spa_avz_build(spa->spa_root_vdev, new_avz, tx);
8620
8621 /* Diff old AVZ with new one */
1c27024e
DB
8622 zap_cursor_t zc;
8623 zap_attribute_t za;
8624
e0ab3ab5
JS
8625 for (zap_cursor_init(&zc, spa->spa_meta_objset,
8626 spa->spa_all_vdev_zaps);
8627 zap_cursor_retrieve(&zc, &za) == 0;
8628 zap_cursor_advance(&zc)) {
8629 uint64_t vdzap = za.za_first_integer;
8630 if (zap_lookup_int(spa->spa_meta_objset, new_avz,
8631 vdzap) == ENOENT) {
8632 /*
8633 * ZAP is listed in old AVZ but not in new one;
8634 * destroy it
8635 */
8636 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
8637 tx));
8638 }
8639 }
8640
8641 zap_cursor_fini(&zc);
8642
8643 /* Destroy the old AVZ */
8644 VERIFY0(zap_destroy(spa->spa_meta_objset,
8645 spa->spa_all_vdev_zaps, tx));
8646
8647 /* Replace the old AVZ in the dir obj with the new one */
8648 VERIFY0(zap_update(spa->spa_meta_objset,
8649 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
8650 sizeof (new_avz), 1, &new_avz, tx));
8651
8652 spa->spa_all_vdev_zaps = new_avz;
8653 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
8654 zap_cursor_t zc;
8655 zap_attribute_t za;
8656
8657 /* Walk through the AVZ and destroy all listed ZAPs */
8658 for (zap_cursor_init(&zc, spa->spa_meta_objset,
8659 spa->spa_all_vdev_zaps);
8660 zap_cursor_retrieve(&zc, &za) == 0;
8661 zap_cursor_advance(&zc)) {
8662 uint64_t zap = za.za_first_integer;
8663 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
8664 }
8665
8666 zap_cursor_fini(&zc);
8667
8668 /* Destroy and unlink the AVZ itself */
8669 VERIFY0(zap_destroy(spa->spa_meta_objset,
8670 spa->spa_all_vdev_zaps, tx));
8671 VERIFY0(zap_remove(spa->spa_meta_objset,
8672 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
8673 spa->spa_all_vdev_zaps = 0;
8674 }
8675
8676 if (spa->spa_all_vdev_zaps == 0) {
8677 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
8678 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
8679 DMU_POOL_VDEV_ZAP_MAP, tx);
8680 }
8681 spa->spa_avz_action = AVZ_ACTION_NONE;
8682
8683 /* Create ZAPs for vdevs that don't have them. */
8684 vdev_construct_zaps(spa->spa_root_vdev, tx);
8685
b128c09f
BB
8686 config = spa_config_generate(spa, spa->spa_root_vdev,
8687 dmu_tx_get_txg(tx), B_FALSE);
8688
ea0b2538
GW
8689 /*
8690 * If we're upgrading the spa version then make sure that
8691 * the config object gets updated with the correct version.
8692 */
8693 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
8694 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
8695 spa->spa_uberblock.ub_version);
8696
b128c09f 8697 spa_config_exit(spa, SCL_STATE, FTAG);
34dc7c2f 8698
8a5fc748 8699 nvlist_free(spa->spa_config_syncing);
34dc7c2f
BB
8700 spa->spa_config_syncing = config;
8701
8702 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
8703}
8704
9ae529ec 8705static void
13fe0198 8706spa_sync_version(void *arg, dmu_tx_t *tx)
9ae529ec 8707{
13fe0198
MA
8708 uint64_t *versionp = arg;
8709 uint64_t version = *versionp;
8710 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
9ae529ec
CS
8711
8712 /*
8713 * Setting the version is special cased when first creating the pool.
8714 */
8715 ASSERT(tx->tx_txg != TXG_INITIAL);
8716
8dca0a9a 8717 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
9ae529ec
CS
8718 ASSERT(version >= spa_version(spa));
8719
8720 spa->spa_uberblock.ub_version = version;
8721 vdev_config_dirty(spa->spa_root_vdev);
74756182
MM
8722 spa_history_log_internal(spa, "set", tx, "version=%lld",
8723 (longlong_t)version);
9ae529ec
CS
8724}
8725
34dc7c2f
BB
8726/*
8727 * Set zpool properties.
8728 */
8729static void
13fe0198 8730spa_sync_props(void *arg, dmu_tx_t *tx)
34dc7c2f 8731{
13fe0198
MA
8732 nvlist_t *nvp = arg;
8733 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
34dc7c2f 8734 objset_t *mos = spa->spa_meta_objset;
9ae529ec 8735 nvpair_t *elem = NULL;
b128c09f
BB
8736
8737 mutex_enter(&spa->spa_props_lock);
34dc7c2f 8738
34dc7c2f 8739 while ((elem = nvlist_next_nvpair(nvp, elem))) {
9ae529ec
CS
8740 uint64_t intval;
8741 char *strval, *fname;
8742 zpool_prop_t prop;
8743 const char *propname;
8744 zprop_type_t proptype;
fa86b5db 8745 spa_feature_t fid;
9ae529ec 8746
31864e3d
BB
8747 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
8748 case ZPOOL_PROP_INVAL:
9ae529ec
CS
8749 /*
8750 * We checked this earlier in spa_prop_validate().
8751 */
8752 ASSERT(zpool_prop_feature(nvpair_name(elem)));
8753
8754 fname = strchr(nvpair_name(elem), '@') + 1;
fa86b5db 8755 VERIFY0(zfeature_lookup_name(fname, &fid));
9ae529ec 8756
fa86b5db 8757 spa_feature_enable(spa, fid, tx);
6f1ffb06
MA
8758 spa_history_log_internal(spa, "set", tx,
8759 "%s=enabled", nvpair_name(elem));
9ae529ec
CS
8760 break;
8761
34dc7c2f 8762 case ZPOOL_PROP_VERSION:
93cf2076 8763 intval = fnvpair_value_uint64(elem);
34dc7c2f 8764 /*
4e33ba4c 8765 * The version is synced separately before other
9ae529ec 8766 * properties and should be correct by now.
34dc7c2f 8767 */
9ae529ec 8768 ASSERT3U(spa_version(spa), >=, intval);
34dc7c2f
BB
8769 break;
8770
8771 case ZPOOL_PROP_ALTROOT:
8772 /*
8773 * 'altroot' is a non-persistent property. It should
8774 * have been set temporarily at creation or import time.
8775 */
8776 ASSERT(spa->spa_root != NULL);
8777 break;
8778
572e2857 8779 case ZPOOL_PROP_READONLY:
34dc7c2f
BB
8780 case ZPOOL_PROP_CACHEFILE:
8781 /*
e1cfd73f 8782 * 'readonly' and 'cachefile' are also non-persistent
572e2857 8783 * properties.
34dc7c2f 8784 */
34dc7c2f 8785 break;
d96eb2b1 8786 case ZPOOL_PROP_COMMENT:
93cf2076 8787 strval = fnvpair_value_string(elem);
d96eb2b1
DM
8788 if (spa->spa_comment != NULL)
8789 spa_strfree(spa->spa_comment);
8790 spa->spa_comment = spa_strdup(strval);
8791 /*
8792 * We need to dirty the configuration on all the vdevs
88a48330
BB
8793 * so that their labels get updated. We also need to
8794 * update the cache file to keep it in sync with the
8795 * MOS version. It's unnecessary to do this for pool
8796 * creation since the vdev's configuration has already
8797 * been dirtied.
d96eb2b1 8798 */
88a48330 8799 if (tx->tx_txg != TXG_INITIAL) {
d96eb2b1 8800 vdev_config_dirty(spa->spa_root_vdev);
88a48330
BB
8801 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
8802 }
6f1ffb06
MA
8803 spa_history_log_internal(spa, "set", tx,
8804 "%s=%s", nvpair_name(elem), strval);
d96eb2b1 8805 break;
658fb802
CB
8806 case ZPOOL_PROP_COMPATIBILITY:
8807 strval = fnvpair_value_string(elem);
8808 if (spa->spa_compatibility != NULL)
8809 spa_strfree(spa->spa_compatibility);
8810 spa->spa_compatibility = spa_strdup(strval);
8811 /*
8812 * Dirty the configuration on vdevs as above.
8813 */
88a48330 8814 if (tx->tx_txg != TXG_INITIAL) {
658fb802 8815 vdev_config_dirty(spa->spa_root_vdev);
88a48330
BB
8816 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
8817 }
8818
658fb802
CB
8819 spa_history_log_internal(spa, "set", tx,
8820 "%s=%s", nvpair_name(elem), strval);
8821 break;
8822
34dc7c2f
BB
8823 default:
8824 /*
8825 * Set pool property values in the poolprops mos object.
8826 */
34dc7c2f 8827 if (spa->spa_pool_props_object == 0) {
9ae529ec
CS
8828 spa->spa_pool_props_object =
8829 zap_create_link(mos, DMU_OT_POOL_PROPS,
34dc7c2f 8830 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
9ae529ec 8831 tx);
34dc7c2f 8832 }
34dc7c2f
BB
8833
8834 /* normalize the property name */
8835 propname = zpool_prop_to_name(prop);
8836 proptype = zpool_prop_get_type(prop);
8837
8838 if (nvpair_type(elem) == DATA_TYPE_STRING) {
8839 ASSERT(proptype == PROP_TYPE_STRING);
93cf2076
GW
8840 strval = fnvpair_value_string(elem);
8841 VERIFY0(zap_update(mos,
34dc7c2f 8842 spa->spa_pool_props_object, propname,
93cf2076 8843 1, strlen(strval) + 1, strval, tx));
6f1ffb06
MA
8844 spa_history_log_internal(spa, "set", tx,
8845 "%s=%s", nvpair_name(elem), strval);
34dc7c2f 8846 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
93cf2076 8847 intval = fnvpair_value_uint64(elem);
34dc7c2f
BB
8848
8849 if (proptype == PROP_TYPE_INDEX) {
8850 const char *unused;
93cf2076
GW
8851 VERIFY0(zpool_prop_index_to_string(
8852 prop, intval, &unused));
34dc7c2f 8853 }
93cf2076 8854 VERIFY0(zap_update(mos,
34dc7c2f 8855 spa->spa_pool_props_object, propname,
93cf2076 8856 8, 1, &intval, tx));
6f1ffb06 8857 spa_history_log_internal(spa, "set", tx,
74756182
MM
8858 "%s=%lld", nvpair_name(elem),
8859 (longlong_t)intval);
34dc7c2f
BB
8860 } else {
8861 ASSERT(0); /* not allowed */
8862 }
8863
8864 switch (prop) {
8865 case ZPOOL_PROP_DELEGATION:
8866 spa->spa_delegation = intval;
8867 break;
8868 case ZPOOL_PROP_BOOTFS:
8869 spa->spa_bootfs = intval;
8870 break;
8871 case ZPOOL_PROP_FAILUREMODE:
8872 spa->spa_failmode = intval;
8873 break;
1b939560
BB
8874 case ZPOOL_PROP_AUTOTRIM:
8875 spa->spa_autotrim = intval;
8876 spa_async_request(spa,
8877 SPA_ASYNC_AUTOTRIM_RESTART);
8878 break;
9babb374
BB
8879 case ZPOOL_PROP_AUTOEXPAND:
8880 spa->spa_autoexpand = intval;
428870ff
BB
8881 if (tx->tx_txg != TXG_INITIAL)
8882 spa_async_request(spa,
8883 SPA_ASYNC_AUTOEXPAND);
8884 break;
379ca9cf
OF
8885 case ZPOOL_PROP_MULTIHOST:
8886 spa->spa_multihost = intval;
8887 break;
34dc7c2f
BB
8888 default:
8889 break;
8890 }
8891 }
8892
34dc7c2f 8893 }
b128c09f
BB
8894
8895 mutex_exit(&spa->spa_props_lock);
34dc7c2f
BB
8896}
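/*
 * An illustrative sketch (not upstream code): one way a property change
 * reaches spa_sync_props() as a sync-task argument, here via spa_prop_set().
 * The helper name is hypothetical, and the value 1 assumes the usual "on"
 * index for this property.
 */
static void __maybe_unused
example_set_autotrim_on(spa_t *spa)
{
	nvlist_t *props = fnvlist_alloc();

	/* Index properties are passed as their uint64 index values. */
	fnvlist_add_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_AUTOTRIM), 1);
	(void) spa_prop_set(spa, props);
	nvlist_free(props);
}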
8897
428870ff
BB
8898/*
8899 * Perform one-time upgrade on-disk changes. spa_version() does not
8900 * reflect the new version this txg, so there must be no changes this
8901 * txg to anything that the upgrade code depends on after it executes.
8902 * Therefore this must be called after dsl_pool_sync() does the sync
8903 * tasks.
8904 */
8905static void
8906spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
8907{
8dc2197b
SD
8908 if (spa_sync_pass(spa) != 1)
8909 return;
428870ff 8910
8dc2197b 8911 dsl_pool_t *dp = spa->spa_dsl_pool;
13fe0198
MA
8912 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
8913
428870ff
BB
8914 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
8915 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
8916 dsl_pool_create_origin(dp, tx);
8917
8918 /* Keeping the origin open increases spa_minref */
8919 spa->spa_minref += 3;
8920 }
8921
8922 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
8923 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
8924 dsl_pool_upgrade_clones(dp, tx);
8925 }
8926
8927 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
8928 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
8929 dsl_pool_upgrade_dir_clones(dp, tx);
8930
8931 /* Keeping the freedir open increases spa_minref */
8932 spa->spa_minref += 3;
8933 }
9ae529ec
CS
8934
8935 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
8936 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
8937 spa_feature_create_zap_objects(spa, tx);
8938 }
62bdd5eb
DL
8939
8940 /*
8941 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
8942 * when possibility to use lz4 compression for metadata was added
8943 * Old pools that have this feature enabled must be upgraded to have
8944 * this feature active
8945 */
8946 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
8947 boolean_t lz4_en = spa_feature_is_enabled(spa,
8948 SPA_FEATURE_LZ4_COMPRESS);
8949 boolean_t lz4_ac = spa_feature_is_active(spa,
8950 SPA_FEATURE_LZ4_COMPRESS);
8951
8952 if (lz4_en && !lz4_ac)
8953 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
8954 }
3c67d83a
TH
8955
8956 /*
8957 * If we haven't written the salt, do so now. Note that the
8958 * feature may not be activated yet, but that's fine since
8959 * the presence of this ZAP entry is backwards compatible.
8960 */
8961 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
8962 DMU_POOL_CHECKSUM_SALT) == ENOENT) {
8963 VERIFY0(zap_add(spa->spa_meta_objset,
8964 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
8965 sizeof (spa->spa_cksum_salt.zcs_bytes),
8966 spa->spa_cksum_salt.zcs_bytes, tx));
8967 }
8968
13fe0198 8969 rrw_exit(&dp->dp_config_rwlock, FTAG);
428870ff
BB
8970}
8971
a1d477c2
MA
8972static void
8973vdev_indirect_state_sync_verify(vdev_t *vd)
8974{
2a8ba608
MM
8975 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
8976 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
a1d477c2
MA
8977
8978 if (vd->vdev_ops == &vdev_indirect_ops) {
8979 ASSERT(vim != NULL);
8980 ASSERT(vib != NULL);
8981 }
8982
27f80e85
BB
8983 uint64_t obsolete_sm_object = 0;
8984 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
8985 if (obsolete_sm_object != 0) {
a1d477c2
MA
8986 ASSERT(vd->vdev_obsolete_sm != NULL);
8987 ASSERT(vd->vdev_removing ||
8988 vd->vdev_ops == &vdev_indirect_ops);
8989 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
8990 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
27f80e85 8991 ASSERT3U(obsolete_sm_object, ==,
a1d477c2
MA
8992 space_map_object(vd->vdev_obsolete_sm));
8993 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
8994 space_map_allocated(vd->vdev_obsolete_sm));
8995 }
8996 ASSERT(vd->vdev_obsolete_segments != NULL);
8997
8998 /*
8999 * Since frees / remaps to an indirect vdev can only
9000 * happen in syncing context, the obsolete segments
9001 * tree must be empty when we start syncing.
9002 */
9003 ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
9004}
9005
34dc7c2f 9006/*
8dc2197b
SD
9007 * Set the top-level vdev's max queue depth. Evaluate each top-level's
9008 * async write queue depth in case it changed. The max queue depth will
9009 * not change in the middle of syncing out this txg.
34dc7c2f 9010 */
8dc2197b
SD
9011static void
9012spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
34dc7c2f 9013{
8dc2197b
SD
9014 ASSERT(spa_writeable(spa));
9015
34dc7c2f 9016 vdev_t *rvd = spa->spa_root_vdev;
3dfb57a3
DB
9017 uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
9018 zfs_vdev_queue_depth_pct / 100;
8dc2197b
SD
9019 metaslab_class_t *normal = spa_normal_class(spa);
9020 metaslab_class_t *special = spa_special_class(spa);
9021 metaslab_class_t *dedup = spa_dedup_class(spa);
34dc7c2f 9022
492f64e9 9023 uint64_t slots_per_allocator = 0;
1c27024e 9024 for (int c = 0; c < rvd->vdev_children; c++) {
3dfb57a3 9025 vdev_t *tvd = rvd->vdev_child[c];
cc99f275 9026
8dc2197b 9027 metaslab_group_t *mg = tvd->vdev_mg;
cc99f275
DB
9028 if (mg == NULL || !metaslab_group_initialized(mg))
9029 continue;
3dfb57a3 9030
8dc2197b 9031 metaslab_class_t *mc = mg->mg_class;
cc99f275 9032 if (mc != normal && mc != special && mc != dedup)
3dfb57a3
DB
9033 continue;
9034
9035 /*
9036 * It is safe to do a lock-free check here because only async
9037 * allocations look at mg_max_alloc_queue_depth, and async
9038 * allocations all happen from spa_sync().
9039 */
32d805c3 9040 for (int i = 0; i < mg->mg_allocators; i++) {
424fd7c3 9041 ASSERT0(zfs_refcount_count(
32d805c3
MA
9042 &(mg->mg_allocator[i].mga_alloc_queue_depth)));
9043 }
3dfb57a3 9044 mg->mg_max_alloc_queue_depth = max_queue_depth;
492f64e9 9045
32d805c3
MA
9046 for (int i = 0; i < mg->mg_allocators; i++) {
9047 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
492f64e9
PD
9048 zfs_vdev_def_queue_depth;
9049 }
9050 slots_per_allocator += zfs_vdev_def_queue_depth;
3dfb57a3 9051 }
cc99f275 9052
492f64e9 9053 for (int i = 0; i < spa->spa_alloc_count; i++) {
f8020c93
AM
9054 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
9055 mca_alloc_slots));
9056 ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
9057 mca_alloc_slots));
9058 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
9059 mca_alloc_slots));
9060 normal->mc_allocator[i].mca_alloc_max_slots =
9061 slots_per_allocator;
9062 special->mc_allocator[i].mca_alloc_max_slots =
9063 slots_per_allocator;
9064 dedup->mc_allocator[i].mca_alloc_max_slots =
9065 slots_per_allocator;
cc99f275
DB
9066 }
9067 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
9068 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
9069 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
8dc2197b
SD
9070}
9071
9072static void
9073spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
9074{
9075 ASSERT(spa_writeable(spa));
3dfb57a3 9076
8dc2197b 9077 vdev_t *rvd = spa->spa_root_vdev;
a1d477c2
MA
9078 for (int c = 0; c < rvd->vdev_children; c++) {
9079 vdev_t *vd = rvd->vdev_child[c];
9080 vdev_indirect_state_sync_verify(vd);
9081
9082 if (vdev_indirect_should_condense(vd)) {
9083 spa_condense_indirect_start_sync(vd, tx);
9084 break;
9085 }
9086 }
8dc2197b
SD
9087}
9088
9089static void
9090spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
9091{
9092 objset_t *mos = spa->spa_meta_objset;
9093 dsl_pool_t *dp = spa->spa_dsl_pool;
9094 uint64_t txg = tx->tx_txg;
9095 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
a1d477c2 9096
34dc7c2f 9097 do {
428870ff 9098 int pass = ++spa->spa_sync_pass;
34dc7c2f
BB
9099
9100 spa_sync_config_object(spa, tx);
9101 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
9102 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
9103 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
9104 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
9105 spa_errlog_sync(spa, txg);
9106 dsl_pool_sync(dp, txg);
9107
93e28d66
SD
9108 if (pass < zfs_sync_pass_deferred_free ||
9109 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
9110 /*
9111 * If the log space map feature is active we don't
9112 * care about deferred frees and the deferred bpobj
9113 * as the log space map should effectively have the
9114 * same results (i.e. appending only to one object).
9115 */
e8b96c60 9116 spa_sync_frees(spa, free_bpl, tx);
428870ff 9117 } else {
905edb40
MA
9118 /*
9119 * We cannot defer frees in pass 1, because
9120 * we sync the deferred frees later in pass 1.
9121 */
9122 ASSERT3U(pass, >, 1);
37f03da8 9123 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
e8b96c60 9124 &spa->spa_deferred_bpobj, tx);
34dc7c2f
BB
9125 }
9126
428870ff
BB
9127 ddt_sync(spa, txg);
9128 dsl_scan_sync(dp, tx);
8dc2197b
SD
9129 svr_sync(spa, tx);
9130 spa_sync_upgrades(spa, tx);
34dc7c2f 9131
93e28d66
SD
9132 spa_flush_metaslabs(spa, tx);
9133
8dc2197b 9134 vdev_t *vd = NULL;
a1d477c2
MA
9135 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
9136 != NULL)
428870ff
BB
9137 vdev_sync(vd, txg);
9138
8dc2197b
SD
9139 /*
9140 * Note: We need to check if the MOS is dirty because we could
9141 * have marked the MOS dirty without updating the uberblock
9142 * (e.g. if we have sync tasks but no dirty user data). We need
9143 * to check the uberblock's rootbp because it is updated if we
9144 * have synced out dirty data (though in this case the MOS will
9145 * most likely also be dirty due to second order effects, we
9146 * don't want to rely on that here).
9147 */
9148 if (pass == 1 &&
9149 spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
9150 !dmu_objset_is_dirty(mos, txg)) {
905edb40 9151 /*
8dc2197b
SD
9152 * Nothing changed on the first pass, therefore this
9153 * TXG is a no-op. Avoid syncing deferred frees, so
9154 * that we can keep this TXG as a no-op.
905edb40 9155 */
8dc2197b
SD
9156 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
9157 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
9158 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
9159 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
9160 break;
905edb40 9161 }
34dc7c2f 9162
8dc2197b 9163 spa_sync_deferred_frees(spa, tx);
428870ff 9164 } while (dmu_objset_is_dirty(mos, txg));
8dc2197b 9165}
34dc7c2f 9166
8dc2197b
SD
9167/*
9168 * Rewrite the vdev configuration (which includes the uberblock) to
9169 * commit the transaction group.
9170 *
9171 * If there are no dirty vdevs, we sync the uberblock to a few random
9172 * top-level vdevs that are known to be visible in the config cache
9173 * (see spa_vdev_add() for a complete description). If there *are* dirty
9174 * vdevs, sync the uberblock to all vdevs.
9175 */
9176static void
9177spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
9178{
9179 vdev_t *rvd = spa->spa_root_vdev;
9180 uint64_t txg = tx->tx_txg;
a1d477c2 9181
b128c09f 9182 for (;;) {
8dc2197b
SD
9183 int error = 0;
9184
b128c09f
BB
9185 /*
9186 * We hold SCL_STATE to prevent vdev open/close/etc.
9187 * while we're attempting to write the vdev labels.
9188 */
9189 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
9190
9191 if (list_is_empty(&spa->spa_config_dirty_list)) {
d2734cce 9192 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
b128c09f
BB
9193 int svdcount = 0;
9194 int children = rvd->vdev_children;
29274c9f 9195 int c0 = random_in_range(children);
b128c09f 9196
1c27024e 9197 for (int c = 0; c < children; c++) {
8dc2197b
SD
9198 vdev_t *vd =
9199 rvd->vdev_child[(c0 + c) % children];
d2734cce
SD
9200
9201 /* Stop when revisiting the first vdev */
9202 if (c > 0 && svd[0] == vd)
9203 break;
9204
8dc2197b
SD
9205 if (vd->vdev_ms_array == 0 ||
9206 vd->vdev_islog ||
a1d477c2 9207 !vdev_is_concrete(vd))
b128c09f 9208 continue;
d2734cce 9209
b128c09f 9210 svd[svdcount++] = vd;
6cb8e530 9211 if (svdcount == SPA_SYNC_MIN_VDEVS)
b128c09f
BB
9212 break;
9213 }
b6fcb792 9214 error = vdev_config_sync(svd, svdcount, txg);
b128c09f
BB
9215 } else {
9216 error = vdev_config_sync(rvd->vdev_child,
b6fcb792 9217 rvd->vdev_children, txg);
34dc7c2f 9218 }
34dc7c2f 9219
3bc7e0fb
GW
9220 if (error == 0)
9221 spa->spa_last_synced_guid = rvd->vdev_guid;
9222
b128c09f
BB
9223 spa_config_exit(spa, SCL_STATE, FTAG);
9224
9225 if (error == 0)
9226 break;
cec3a0a1 9227 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
b128c09f
BB
9228 zio_resume_wait(spa);
9229 }
8dc2197b
SD
9230}
9231
9232/*
9233 * Sync the specified transaction group. New blocks may be dirtied as
9234 * part of the process, so we iterate until it converges.
9235 */
9236void
9237spa_sync(spa_t *spa, uint64_t txg)
9238{
9239 vdev_t *vd = NULL;
9240
9241 VERIFY(spa_writeable(spa));
9242
9243 /*
9244 * Wait for i/os issued in open context that need to complete
9245 * before this txg syncs.
9246 */
9247 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
9248 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
9249 ZIO_FLAG_CANFAIL);
9250
9251 /*
9252 * Lock out configuration changes.
9253 */
9254 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
9255
9256 spa->spa_syncing_txg = txg;
9257 spa->spa_sync_pass = 0;
9258
9259 for (int i = 0; i < spa->spa_alloc_count; i++) {
1b50749c
AM
9260 mutex_enter(&spa->spa_allocs[i].spaa_lock);
9261 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
9262 mutex_exit(&spa->spa_allocs[i].spaa_lock);
8dc2197b
SD
9263 }
9264
9265 /*
9266 * If there are any pending vdev state changes, convert them
9267 * into config changes that go out with this transaction group.
9268 */
9269 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
9270 while (list_head(&spa->spa_state_dirty_list) != NULL) {
9271 /*
9272 * We need the write lock here because, for aux vdevs,
9273 * calling vdev_config_dirty() modifies sav_config.
9274 * This is ugly and will become unnecessary when we
9275 * eliminate the aux vdev wart by integrating all vdevs
9276 * into the root vdev tree.
9277 */
9278 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9279 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
9280 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
9281 vdev_state_clean(vd);
9282 vdev_config_dirty(vd);
9283 }
9284 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9285 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
9286 }
9287 spa_config_exit(spa, SCL_STATE, FTAG);
9288
9289 dsl_pool_t *dp = spa->spa_dsl_pool;
9290 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
9291
9292 spa->spa_sync_starttime = gethrtime();
9293 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
9294 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
9295 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
9296 NSEC_TO_TICK(spa->spa_deadman_synctime));
9297
9298 /*
9299 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
9300 * set spa_deflate if we have no raid-z vdevs.
9301 */
9302 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
9303 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
9304 vdev_t *rvd = spa->spa_root_vdev;
9305
9306 int i;
9307 for (i = 0; i < rvd->vdev_children; i++) {
9308 vd = rvd->vdev_child[i];
9309 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
9310 break;
9311 }
9312 if (i == rvd->vdev_children) {
9313 spa->spa_deflate = TRUE;
9314 VERIFY0(zap_add(spa->spa_meta_objset,
9315 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
9316 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
9317 }
9318 }
9319
9320 spa_sync_adjust_vdev_max_queue_depth(spa);
9321
9322 spa_sync_condense_indirect(spa, tx);
9323
9324 spa_sync_iterate_to_convergence(spa, tx);
9325
9326#ifdef ZFS_DEBUG
9327 if (!list_is_empty(&spa->spa_config_dirty_list)) {
9328 /*
9329 * Make sure that the number of ZAPs for all the vdevs matches
9330 * the number of ZAPs in the per-vdev ZAP list. This only gets
9331 * called if the config is dirty; otherwise there may be
9332 * outstanding AVZ operations that weren't completed in
9333 * spa_sync_config_object.
9334 */
9335 uint64_t all_vdev_zap_entry_count;
9336 ASSERT0(zap_count(spa->spa_meta_objset,
9337 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
9338 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
9339 all_vdev_zap_entry_count);
9340 }
9341#endif
9342
9343 if (spa->spa_vdev_removal != NULL) {
9344 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
9345 }
9346
9347 spa_sync_rewrite_vdev_config(spa, tx);
34dc7c2f
BB
9348 dmu_tx_commit(tx);
9349
57ddcda1 9350 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
cc92e9d0
GW
9351 spa->spa_deadman_tqid = 0;
9352
34dc7c2f
BB
9353 /*
9354 * Clear the dirty config list.
9355 */
b128c09f 9356 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
34dc7c2f
BB
9357 vdev_config_clean(vd);
9358
9359 /*
9360 * Now that the new config has synced transactionally,
9361 * let it become visible to the config cache.
9362 */
9363 if (spa->spa_config_syncing != NULL) {
9364 spa_config_set(spa, spa->spa_config_syncing);
9365 spa->spa_config_txg = txg;
9366 spa->spa_config_syncing = NULL;
9367 }
9368
428870ff 9369 dsl_pool_sync_done(dp, txg);
34dc7c2f 9370
492f64e9 9371 for (int i = 0; i < spa->spa_alloc_count; i++) {
1b50749c
AM
9372 mutex_enter(&spa->spa_allocs[i].spaa_lock);
9373 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
9374 mutex_exit(&spa->spa_allocs[i].spaa_lock);
492f64e9 9375 }
3dfb57a3 9376
34dc7c2f
BB
9377 /*
9378 * Update usable space statistics.
9379 */
619f0976
GW
9380 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
9381 != NULL)
34dc7c2f 9382 vdev_sync_done(vd, txg);
f09fda50
PD
9383
9384 metaslab_class_evict_old(spa->spa_normal_class, txg);
9385 metaslab_class_evict_old(spa->spa_log_class, txg);
9386
93e28d66 9387 spa_sync_close_syncing_log_sm(spa);
34dc7c2f 9388
428870ff
BB
9389 spa_update_dspace(spa);
9390
34dc7c2f
BB
9391 /*
9392 * It had better be the case that we didn't dirty anything
9393 * since vdev_config_sync().
9394 */
9395 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
9396 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
9397 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
428870ff 9398
d2734cce
SD
9399 while (zfs_pause_spa_sync)
9400 delay(1);
9401
428870ff 9402 spa->spa_sync_pass = 0;
34dc7c2f 9403
55922e73
GW
9404 /*
9405 * Update the last synced uberblock here. We want to do this at
9406 * the end of spa_sync() so that consumers of spa_last_synced_txg()
9407 * will be guaranteed that all the processing associated with
9408 * that txg has been completed.
9409 */
9410 spa->spa_ubsync = spa->spa_uberblock;
b128c09f 9411 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f 9412
428870ff
BB
9413 spa_handle_ignored_writes(spa);
9414
34dc7c2f
BB
9415 /*
9416 * If any async tasks have been requested, kick them off.
9417 */
9418 spa_async_dispatch(spa);
9419}
9420
9421/*
9422 * Sync all pools. We don't want to hold the namespace lock across these
9423 * operations, so we take a reference on the spa_t and drop the lock during the
9424 * sync.
9425 */
9426void
9427spa_sync_allpools(void)
9428{
9429 spa_t *spa = NULL;
9430 mutex_enter(&spa_namespace_lock);
9431 while ((spa = spa_next(spa)) != NULL) {
572e2857
BB
9432 if (spa_state(spa) != POOL_STATE_ACTIVE ||
9433 !spa_writeable(spa) || spa_suspended(spa))
34dc7c2f
BB
9434 continue;
9435 spa_open_ref(spa, FTAG);
9436 mutex_exit(&spa_namespace_lock);
9437 txg_wait_synced(spa_get_dsl(spa), 0);
9438 mutex_enter(&spa_namespace_lock);
9439 spa_close(spa, FTAG);
9440 }
9441 mutex_exit(&spa_namespace_lock);
9442}
9443
9444/*
9445 * ==========================================================================
9446 * Miscellaneous routines
9447 * ==========================================================================
9448 */
9449
9450/*
9451 * Remove all pools in the system.
9452 */
9453void
9454spa_evict_all(void)
9455{
9456 spa_t *spa;
9457
9458 /*
9459 * Remove all cached state. All pools should be closed now,
9460 * so every spa in the AVL tree should be unreferenced.
9461 */
9462 mutex_enter(&spa_namespace_lock);
9463 while ((spa = spa_next(NULL)) != NULL) {
9464 /*
9465 * Stop async tasks. The async thread may need to detach
9466 * a device that's been replaced, which requires grabbing
9467 * spa_namespace_lock, so we must drop it here.
9468 */
9469 spa_open_ref(spa, FTAG);
9470 mutex_exit(&spa_namespace_lock);
9471 spa_async_suspend(spa);
9472 mutex_enter(&spa_namespace_lock);
34dc7c2f
BB
9473 spa_close(spa, FTAG);
9474
9475 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
9476 spa_unload(spa);
9477 spa_deactivate(spa);
9478 }
9479 spa_remove(spa);
9480 }
9481 mutex_exit(&spa_namespace_lock);
9482}
9483
9484vdev_t *
9babb374 9485spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
34dc7c2f 9486{
b128c09f
BB
9487 vdev_t *vd;
9488 int i;
9489
9490 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
9491 return (vd);
9492
9babb374 9493 if (aux) {
b128c09f
BB
9494 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
9495 vd = spa->spa_l2cache.sav_vdevs[i];
9babb374
BB
9496 if (vd->vdev_guid == guid)
9497 return (vd);
9498 }
9499
9500 for (i = 0; i < spa->spa_spares.sav_count; i++) {
9501 vd = spa->spa_spares.sav_vdevs[i];
b128c09f
BB
9502 if (vd->vdev_guid == guid)
9503 return (vd);
9504 }
9505 }
9506
9507 return (NULL);
34dc7c2f
BB
9508}
9509
9510void
9511spa_upgrade(spa_t *spa, uint64_t version)
9512{
572e2857
BB
9513 ASSERT(spa_writeable(spa));
9514
b128c09f 9515 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
9516
9517 /*
9518 * This should only be called for a non-faulted pool, and since a
9519 * future version would result in an unopenable pool, this shouldn't be
9520 * possible.
9521 */
8dca0a9a 9522 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
9b67f605 9523 ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
34dc7c2f
BB
9524
9525 spa->spa_uberblock.ub_version = version;
9526 vdev_config_dirty(spa->spa_root_vdev);
9527
b128c09f 9528 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
9529
9530 txg_wait_synced(spa_get_dsl(spa), 0);
9531}
9532
49d42425
FU
9533static boolean_t
9534spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
34dc7c2f 9535{
14e4e3cb 9536 (void) spa;
34dc7c2f 9537 int i;
49d42425 9538 uint64_t vdev_guid;
34dc7c2f
BB
9539
9540 for (i = 0; i < sav->sav_count; i++)
9541 if (sav->sav_vdevs[i]->vdev_guid == guid)
9542 return (B_TRUE);
9543
9544 for (i = 0; i < sav->sav_npending; i++) {
9545 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
49d42425 9546 &vdev_guid) == 0 && vdev_guid == guid)
34dc7c2f
BB
9547 return (B_TRUE);
9548 }
9549
9550 return (B_FALSE);
9551}
9552
49d42425
FU
9553boolean_t
9554spa_has_l2cache(spa_t *spa, uint64_t guid)
9555{
9556 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache));
9557}
9558
9559boolean_t
9560spa_has_spare(spa_t *spa, uint64_t guid)
9561{
9562 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares));
9563}
9564
b128c09f
BB
9565/*
9566 * Check if a pool has an active shared spare device.
9567 * Note: the reference count of an active spare is 2, as a spare and as a replacement.
9568 */
9569static boolean_t
9570spa_has_active_shared_spare(spa_t *spa)
9571{
9572 int i, refcnt;
9573 uint64_t pool;
9574 spa_aux_vdev_t *sav = &spa->spa_spares;
9575
9576 for (i = 0; i < sav->sav_count; i++) {
9577 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
9578 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
9579 refcnt > 2)
9580 return (B_TRUE);
9581 }
9582
9583 return (B_FALSE);
9584}
9585
93e28d66
SD
9586uint64_t
9587spa_total_metaslabs(spa_t *spa)
9588{
9589 vdev_t *rvd = spa->spa_root_vdev;
9590
9591 uint64_t m = 0;
9592 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
9593 vdev_t *vd = rvd->vdev_child[c];
9594 if (!vdev_is_concrete(vd))
9595 continue;
9596 m += vd->vdev_ms_count;
9597 }
9598 return (m);
9599}
9600
e60e158e
JG
9601/*
9602 * Notify any waiting threads that some activity has switched from being in-
9603 * progress to not-in-progress so that the thread can wake up and determine
9604 * whether it is finished waiting.
9605 */
9606void
9607spa_notify_waiters(spa_t *spa)
9608{
9609 /*
9610 * Acquiring spa_activities_lock here prevents the cv_broadcast from
9611 * happening between the waiting thread's check and cv_wait.
9612 */
9613 mutex_enter(&spa->spa_activities_lock);
9614 cv_broadcast(&spa->spa_activities_cv);
9615 mutex_exit(&spa->spa_activities_lock);
9616}
9617
9618/*
9619 * Notify any waiting threads that the pool is exporting, and then block until
9620 * they are finished using the spa_t.
9621 */
9622void
9623spa_wake_waiters(spa_t *spa)
9624{
9625 mutex_enter(&spa->spa_activities_lock);
9626 spa->spa_waiters_cancel = B_TRUE;
9627 cv_broadcast(&spa->spa_activities_cv);
9628 while (spa->spa_waiters != 0)
9629 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
9630 spa->spa_waiters_cancel = B_FALSE;
9631 mutex_exit(&spa->spa_activities_lock);
9632}
9633
2288d419 9634/* Whether the vdev or any of its descendants are being initialized/trimmed. */
e60e158e 9635static boolean_t
2288d419 9636spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
e60e158e
JG
9637{
9638 spa_t *spa = vd->vdev_spa;
e60e158e
JG
9639
9640 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
9641 ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
2288d419
BB
9642 ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
9643 activity == ZPOOL_WAIT_TRIM);
9644
9645 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
9646 &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
e60e158e
JG
9647
9648 mutex_exit(&spa->spa_activities_lock);
2288d419 9649 mutex_enter(lock);
e60e158e
JG
9650 mutex_enter(&spa->spa_activities_lock);
9651
2288d419
BB
9652 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
9653 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
9654 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
9655 mutex_exit(lock);
e60e158e 9656
2288d419 9657 if (in_progress)
e60e158e
JG
9658 return (B_TRUE);
9659
9660 for (int i = 0; i < vd->vdev_children; i++) {
2288d419
BB
9661 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
9662 activity))
e60e158e
JG
9663 return (B_TRUE);
9664 }
9665
9666 return (B_FALSE);
9667}
9668
9669/*
9670 * If use_guid is true, this checks whether the vdev specified by guid is
2288d419
BB
9671 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
9672 * is being initialized/trimmed. The caller must hold the config lock and
9673 * spa_activities_lock.
e60e158e
JG
9674 */
9675static int
2288d419
BB
9676spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
9677 zpool_wait_activity_t activity, boolean_t *in_progress)
e60e158e
JG
9678{
9679 mutex_exit(&spa->spa_activities_lock);
9680 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
9681 mutex_enter(&spa->spa_activities_lock);
9682
9683 vdev_t *vd;
9684 if (use_guid) {
9685 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
9686 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
9687 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9688 return (EINVAL);
9689 }
9690 } else {
9691 vd = spa->spa_root_vdev;
9692 }
9693
2288d419 9694 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
e60e158e
JG
9695
9696 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9697 return (0);
9698}
9699
9700/*
9701 * Locking for waiting threads
9702 * ---------------------------
9703 *
9704 * Waiting threads need a way to check whether a given activity is in progress,
9705 * and then, if it is, wait for it to complete. Each activity will have some
9706 * in-memory representation of the relevant on-disk state which can be used to
9707 * determine whether or not the activity is in progress. The in-memory state and
9708 * the locking used to protect it will be different for each activity, and may
9709 * not be suitable for use with a cvar (e.g., some state is protected by the
9710 * config lock). To allow waiting threads to wait without any races, another
9711 * lock, spa_activities_lock, is used.
9712 *
9713 * When the state is checked, both the activity-specific lock (if there is one)
9714 * and spa_activities_lock are held. In some cases, the activity-specific lock
9715 * is acquired explicitly (e.g. the config lock). In others, the locking is
9716 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
9717 * thread releases the activity-specific lock and, if the activity is in
9718 * progress, then cv_waits using spa_activities_lock.
9719 *
9720 * The waiting thread is woken when another thread, one completing some
9721 * activity, updates the state of the activity and then calls
9722 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
9723 * needs to hold its activity-specific lock when updating the state, and this
9724 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
9725 *
9726 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
9727 * and because it is held when the waiting thread checks the state of the
9728 * activity, it can never be the case that the completing thread both updates
9729 * the activity state and cv_broadcasts in between the waiting thread's check
9730 * and cv_wait. Thus, a waiting thread can never miss a wakeup.
9731 *
9732 * In order to prevent deadlock, when the waiting thread does its check, in some
9733 * cases it will temporarily drop spa_activities_lock in order to acquire the
9734 * activity-specific lock. The order in which spa_activities_lock and the
9735 * activity specific lock are acquired in the waiting thread is determined by
9736 * the order in which they are acquired in the completing thread; if the
9737 * completing thread calls spa_notify_waiters with the activity-specific lock
9738 * held, then the waiting thread must also acquire the activity-specific lock
9739 * first.
9740 */
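/*
 * An illustrative sketch (not upstream code): the "completing" side of the
 * protocol described above. The lock and flag here are hypothetical
 * stand-ins for an activity's real in-memory state.
 */
static void __maybe_unused
example_activity_done(spa_t *spa, kmutex_t *activity_lock,
    boolean_t *activity_active)
{
	mutex_enter(activity_lock);
	*activity_active = B_FALSE;	/* publish the new state */
	mutex_exit(activity_lock);

	/* Wake any thread blocked waiting on spa_activities_cv. */
	spa_notify_waiters(spa);
}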
9741
9742static int
9743spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
9744 boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
9745{
9746 int error = 0;
9747
9748 ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
9749
9750 switch (activity) {
9751 case ZPOOL_WAIT_CKPT_DISCARD:
9752 *in_progress =
9753 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
9754 zap_contains(spa_meta_objset(spa),
9755 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
9756 ENOENT);
9757 break;
9758 case ZPOOL_WAIT_FREE:
9759 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
9760 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
9761 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
9762 spa_livelist_delete_check(spa));
9763 break;
9764 case ZPOOL_WAIT_INITIALIZE:
2288d419
BB
9765 case ZPOOL_WAIT_TRIM:
9766 error = spa_vdev_activity_in_progress(spa, use_tag, tag,
9767 activity, in_progress);
e60e158e
JG
9768 break;
9769 case ZPOOL_WAIT_REPLACE:
9770 mutex_exit(&spa->spa_activities_lock);
9771 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
9772 mutex_enter(&spa->spa_activities_lock);
9773
9774 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
9775 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9776 break;
9777 case ZPOOL_WAIT_REMOVE:
9778 *in_progress = (spa->spa_removing_phys.sr_state ==
9779 DSS_SCANNING);
9780 break;
9781 case ZPOOL_WAIT_RESILVER:
9a49d3f3
BB
9782 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
9783 break;
9a70e97f 9784 zfs_fallthrough;
e60e158e
JG
9785 case ZPOOL_WAIT_SCRUB:
9786 {
9787 boolean_t scanning, paused, is_scrub;
9788 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
9789
9790 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
9791 scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
9792 paused = dsl_scan_is_paused_scrub(scn);
9793 *in_progress = (scanning && !paused &&
9794 is_scrub == (activity == ZPOOL_WAIT_SCRUB));
9795 break;
9796 }
9797 default:
9798 panic("unrecognized value for activity %d", activity);
9799 }
9800
9801 return (error);
9802}
9803
9804static int
9805spa_wait_common(const char *pool, zpool_wait_activity_t activity,
9806 boolean_t use_tag, uint64_t tag, boolean_t *waited)
9807{
9808 /*
9809 * The tag is used to distinguish between instances of an activity.
2288d419
BB
9810 * 'initialize' and 'trim' are the only activities that we use this for.
9811 * The other activities can only have a single instance in progress in a
9812 * pool at one time, making the tag unnecessary.
e60e158e
JG
9813 *
9814 * There can be multiple devices being replaced at once, but since they
9815 * all finish once resilvering finishes, we don't bother keeping track
9816 * of them individually, we just wait for them all to finish.
9817 */
2288d419
BB
9818 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
9819 activity != ZPOOL_WAIT_TRIM)
e60e158e
JG
9820 return (EINVAL);
9821
9822 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
9823 return (EINVAL);
9824
9825 spa_t *spa;
9826 int error = spa_open(pool, &spa, FTAG);
9827 if (error != 0)
9828 return (error);
9829
9830 /*
9831 * Increment the spa's waiter count so that we can call spa_close and
9832 * still ensure that the spa_t doesn't get freed before this thread is
9833 * finished with it when the pool is exported. We want to call spa_close
9834 * before we start waiting because otherwise the additional ref would
9835 * prevent the pool from being exported or destroyed throughout the
9836 * potentially long wait.
9837 */
9838 mutex_enter(&spa->spa_activities_lock);
9839 spa->spa_waiters++;
9840 spa_close(spa, FTAG);
9841
9842 *waited = B_FALSE;
9843 for (;;) {
9844 boolean_t in_progress;
9845 error = spa_activity_in_progress(spa, activity, use_tag, tag,
9846 &in_progress);
9847
b24771a8 9848 if (error || !in_progress || spa->spa_waiters_cancel)
e60e158e
JG
9849 break;
9850
9851 *waited = B_TRUE;
9852
9853 if (cv_wait_sig(&spa->spa_activities_cv,
9854 &spa->spa_activities_lock) == 0) {
9855 error = EINTR;
9856 break;
9857 }
9858 }
9859
9860 spa->spa_waiters--;
9861 cv_signal(&spa->spa_waiters_cv);
9862 mutex_exit(&spa->spa_activities_lock);
9863
9864 return (error);
9865}
9866
9867/*
9868 * Wait for a particular instance of the specified activity to complete, where
9869 * the instance is identified by 'tag'
9870 */
9871int
9872spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
9873 boolean_t *waited)
9874{
9875 return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
9876}
9877
9878/*
9880 * Wait for all instances of the specified activity to complete
9880 */
9881int
9882spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
9883{
9884
9885 return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
9886}
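/*
 * An illustrative sketch (not upstream code): blocking until any in-progress
 * scrub on a pool finishes. The pool name is hypothetical; real callers
 * normally reach spa_wait() through the 'zpool wait' ioctl path.
 */
static int __maybe_unused
example_wait_for_scrub(void)
{
	boolean_t waited;

	return (spa_wait("tank", ZPOOL_WAIT_SCRUB, &waited));
}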
9887
a1d477c2 9888sysevent_t *
12fa0466
DE
9889spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
9890{
9891 sysevent_t *ev = NULL;
9892#ifdef _KERNEL
9893 nvlist_t *resource;
9894
9895 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
9896 if (resource) {
9897 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
9898 ev->resource = resource;
9899 }
14e4e3cb
AZ
9900#else
9901 (void) spa, (void) vd, (void) hist_nvl, (void) name;
12fa0466
DE
9902#endif
9903 return (ev);
9904}
9905
a1d477c2 9906void
12fa0466
DE
9907spa_event_post(sysevent_t *ev)
9908{
9909#ifdef _KERNEL
9910 if (ev) {
9911 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
9912 kmem_free(ev, sizeof (*ev));
9913 }
14e4e3cb
AZ
9914#else
9915 (void) ev;
12fa0466
DE
9916#endif
9917}
9918
34dc7c2f 9919/*
fb390aaf
HR
9920 * Post a zevent corresponding to the given sysevent. The 'name' must be one
9921 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
34dc7c2f
BB
9922 * filled in from the spa and (optionally) the vdev. This doesn't do anything
9923 * in the userland libzpool, as we don't want consumers to misinterpret ztest
9924 * or zdb as real changes.
9925 */
9926void
12fa0466 9927spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
34dc7c2f 9928{
12fa0466 9929 spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
34dc7c2f 9930}
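
/*
 * Illustrative sketch, not part of the original file: posting a zevent for a
 * completed scrub from a hypothetical call site.  ESC_ZFS_SCRUB_FINISH is one
 * of the names defined in sys/sysevent/eventdefs.h; passing NULL for both the
 * vdev and the history nvlist is assumed to be acceptable here, as
 * spa_event_create() simply forwards them.  Kept under #if 0 so it is never
 * compiled.
 */
#if 0
static void
example_notify_scrub_finish(spa_t *spa)
{
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_FINISH);
}
#endif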
c28b2279 9931
c28b2279
BB
9932/* state manipulation functions */
9933EXPORT_SYMBOL(spa_open);
9934EXPORT_SYMBOL(spa_open_rewind);
9935EXPORT_SYMBOL(spa_get_stats);
9936EXPORT_SYMBOL(spa_create);
c28b2279
BB
9937EXPORT_SYMBOL(spa_import);
9938EXPORT_SYMBOL(spa_tryimport);
9939EXPORT_SYMBOL(spa_destroy);
9940EXPORT_SYMBOL(spa_export);
9941EXPORT_SYMBOL(spa_reset);
9942EXPORT_SYMBOL(spa_async_request);
9943EXPORT_SYMBOL(spa_async_suspend);
9944EXPORT_SYMBOL(spa_async_resume);
9945EXPORT_SYMBOL(spa_inject_addref);
9946EXPORT_SYMBOL(spa_inject_delref);
9947EXPORT_SYMBOL(spa_scan_stat_init);
9948EXPORT_SYMBOL(spa_scan_get_stats);
9949
e1cfd73f 9950/* device manipulation */
c28b2279
BB
9951EXPORT_SYMBOL(spa_vdev_add);
9952EXPORT_SYMBOL(spa_vdev_attach);
9953EXPORT_SYMBOL(spa_vdev_detach);
c28b2279
BB
9954EXPORT_SYMBOL(spa_vdev_setpath);
9955EXPORT_SYMBOL(spa_vdev_setfru);
9956EXPORT_SYMBOL(spa_vdev_split_mirror);
9957
9958/* spare state (which is global across all pools) */
9959EXPORT_SYMBOL(spa_spare_add);
9960EXPORT_SYMBOL(spa_spare_remove);
9961EXPORT_SYMBOL(spa_spare_exists);
9962EXPORT_SYMBOL(spa_spare_activate);
9963
9964/* L2ARC state (which is global across all pools) */
9965EXPORT_SYMBOL(spa_l2cache_add);
9966EXPORT_SYMBOL(spa_l2cache_remove);
9967EXPORT_SYMBOL(spa_l2cache_exists);
9968EXPORT_SYMBOL(spa_l2cache_activate);
9969EXPORT_SYMBOL(spa_l2cache_drop);
9970
9971/* scanning */
9972EXPORT_SYMBOL(spa_scan);
9973EXPORT_SYMBOL(spa_scan_stop);
9974
9975/* spa syncing */
9976EXPORT_SYMBOL(spa_sync); /* only for DMU use */
9977EXPORT_SYMBOL(spa_sync_allpools);
9978
9979/* properties */
9980EXPORT_SYMBOL(spa_prop_set);
9981EXPORT_SYMBOL(spa_prop_get);
9982EXPORT_SYMBOL(spa_prop_clear_bootfs);
9983
9984/* asynchronous event notification */
9985EXPORT_SYMBOL(spa_event_notify);
dea377c0 9986
c8242a96 9987/* BEGIN CSTYLED */
03fdcb9a 9988ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW,
458f8231 9989 "log2 fraction of arc that can be used by inflight I/Os when "
03fdcb9a 9990 "verifying pool during import");
7ada752a 9991/* END CSTYLED */
dea377c0 9992
03fdcb9a 9993ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
dea377c0
MA
9994 "Set to traverse metadata on pool import");
9995
03fdcb9a 9996ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
dea377c0 9997 "Set to traverse data on pool import");
dcb6bed1 9998
03fdcb9a 9999ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
6cb8e530
PZ
10000 "Print vdev tree to zfs_dbgmsg during pool import");
10001
03fdcb9a 10002ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
dcb6bed1
D
10003 "Percentage of CPUs to run an IO worker thread");
10004
7457b024
AM
10005ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
10006 "Number of threads per IO worker taskqueue");
10007
7ada752a 10008/* BEGIN CSTYLED */
03fdcb9a
MM
10009ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
10010 "Allow importing pool with up to this number of missing top-level "
10011 "vdevs (in read-only mode)");
7ada752a 10012/* END CSTYLED */
6cb8e530 10013
7ada752a
AZ
10014ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
10015 ZMOD_RW, "Set the livelist condense zthr to pause");
03fdcb9a 10016
7ada752a
AZ
10017ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
10018 ZMOD_RW, "Set the livelist condense synctask to pause");
37f03da8 10019
7ada752a
AZ
10020/* BEGIN CSTYLED */
10021ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
10022 INT, ZMOD_RW,
37f03da8 10023 "Whether livelist condensing was canceled in the synctask");
03fdcb9a 10024
7ada752a
AZ
10025ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
10026 INT, ZMOD_RW,
37f03da8
SH
10027 "Whether livelist condensing was canceled in the zthr function");
10028
7ada752a
AZ
10029ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
10030 ZMOD_RW,
03fdcb9a
MM
10031 "Whether extra ALLOC blkptrs were added to a livelist entry while it "
10032 "was being condensed");
37f03da8 10033/* END CSTYLED */