/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/mmp.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/bpobj.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
#include <sys/zvol.h>

#ifdef _KERNEL
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/callb.h>
#include <sys/zone.h>
#include <sys/vmsystm.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"iss", "iss_h", "int", "int_h"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
 * but with number of taskqs also scaling with number of CPUs.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
};
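
/*
 * Illustrative note (added for clarity, not part of the original source):
 * with the ZTI_* macros defined above, each table entry is just a
 * zio_taskq_info_t initializer, e.g. for the READ row:
 *
 *	ZTI_N(8)	=> { ZTI_MODE_FIXED, 8, 1 }	one taskq, 8 threads
 *	ZTI_SCALE	=> { ZTI_MODE_SCALE, 0, 1 }	sized at pool activation
 *	ZTI_NULL	=> { ZTI_MODE_NULL, 0, 0 }	no taskq created
 *
 * spa_taskqs_init() below consumes these tuples when the pool is activated.
 */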

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, spa_import_type_t type,
    const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

static uint_t zio_taskq_batch_pct = 80;		/* 1 thread per cpu in pset */
static uint_t zio_taskq_batch_tpq;		/* threads per taskq */
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
static const uint_t zio_taskq_basedc = 80;	/* base duty cycle */

static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */

/*
 * Report any spa_load_verify errors found, but do not fail spa_load.
 * This is used by zdb to analyze non-idle pools.
 */
boolean_t spa_load_verify_dryrun = B_FALSE;

/*
 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ).
 * This is used by zdb for spacemaps verification.
 */
boolean_t spa_mode_readable_spacemaps = B_FALSE;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * For debugging purposes: print out vdev tree during pool import.
 */
static int spa_load_print_vdev_tree = B_FALSE;

/*
 * A non-zero value for zfs_max_missing_tvds means that we allow importing
 * pools with missing top-level vdevs. This is strictly intended for advanced
 * pool recovery cases since missing data is almost inevitable. Pools with
 * missing devices can only be imported read-only for safety reasons, and their
 * fail-mode will be automatically set to "continue".
 *
 * With 1 missing vdev we should be able to import the pool and mount all
 * datasets. User data that was not modified after the missing device has been
 * added should be recoverable. This means that snapshots created prior to the
 * addition of that device should be completely intact.
 *
 * With 2 missing vdevs, some datasets may fail to mount since there are
 * dataset statistics that are stored as regular metadata. Some data might be
 * recoverable if those vdevs were added recently.
 *
 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
 * may be missing entirely. Chances of data recovery are very low. Note that
 * there are also risks of performing an inadvertent rewind as we might be
 * missing all the vdevs with the latest uberblocks.
 */
uint64_t zfs_max_missing_tvds = 0;

/*
 * The parameters below are similar to zfs_max_missing_tvds but are only
 * intended for a preliminary open of the pool with an untrusted config which
 * might be incomplete or out-dated.
 *
 * We are more tolerant for pools opened from a cachefile since we could have
 * an out-dated cachefile where a device removal was not registered.
 * We could have set the limit arbitrarily high but in the case where devices
 * are really missing we would want to return the proper error codes; we chose
 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
 * and we get a chance to retrieve the trusted config.
 */
uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;

/*
 * In the case where config was assembled by scanning device paths (/dev/dsks
 * by default) we are less tolerant since all the existing devices should have
 * been detected and we want spa_load to return the right error codes.
 */
uint64_t zfs_max_missing_tvds_scan = 0;

/*
 * Debugging aid that pauses spa_sync() towards the end.
 */
static const boolean_t zfs_pause_spa_sync = B_FALSE;

/*
 * Variables to indicate the livelist condense zthr func should wait at certain
 * points for the livelist to be removed - used to test condense/destroy races
 */
static int zfs_livelist_condense_zthr_pause = 0;
static int zfs_livelist_condense_sync_pause = 0;

/*
 * Variables to track whether or not condense cancellation has been
 * triggered in testing.
 */
static int zfs_livelist_condense_sync_cancel = 0;
static int zfs_livelist_condense_zthr_cancel = 0;

/*
 * Variable to track whether or not extra ALLOC blkptrs were added to a
 * livelist entry while it was being condensed (caused by the way we track
 * remapped blkptrs in dbuf_remap_impl)
 */
static int zfs_livelist_condense_new_alloc = 0;

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	propval = fnvlist_alloc();
	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);

	if (strval != NULL)
		fnvlist_add_string(propval, ZPROP_VALUE, strval);
	else
		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);

	fnvlist_add_nvlist(nvl, propname, propval);
	nvlist_free(propval);
}
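
/*
 * Illustrative sketch (added for clarity, not part of the original source):
 * a call such as
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_SIZE, NULL, size, ZPROP_SRC_NONE);
 *
 * adds a nested entry of the form
 *
 *	<zpool_prop_to_name(prop)> -> {
 *		ZPROP_SOURCE = ZPROP_SRC_NONE,
 *		ZPROP_VALUE  = <size>
 *	}
 *
 * i.e. each pool property is keyed by its name and carries its source
 * alongside either a string or a uint64 value.
 */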

/*
 * Add a user property (source=src, propname=propval) to an nvlist.
 */
static void
spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
    zprop_source_t src)
{
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
	VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	const zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(mc);
		alloc += metaslab_class_get_alloc(spa_special_class(spa));
		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
		alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));

		size = metaslab_class_get_space(mc);
		size += metaslab_class_get_space(spa_special_class(spa));
		size += metaslab_class_get_space(spa_dedup_class(spa));
		size += metaslab_class_get_space(spa_embedded_log_class(spa));

		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
		    spa->spa_checkpoint_info.sci_dspace, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == SPA_MODE_READ), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
		    brt_get_used(spa), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
		    brt_get_saved(spa), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
		    brt_get_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
			    version, ZPROP_SRC_DEFAULT);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
			    version, ZPROP_SRC_LOCAL);
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
		    NULL, spa_load_guid(spa), src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
		 * when opening pools before this version freedir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_compatibility != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	dsl_pool_t *dp;
	int err;

	err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
	if (err)
		return (err);

	dp = spa_get_dsl(spa);
	dsl_pool_config_enter(dp, FTAG);
	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0)
		goto out;

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) ==
		    ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name))
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_dataset_t *ds = NULL;

				err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds);
				if (err != 0)
					break;

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			if (prop != ZPOOL_PROP_INVAL) {
				spa_prop_add_list(*nvp, prop, strval, 0, src);
			} else {
				src = ZPROP_SRC_LOCAL;
				spa_prop_add_user(*nvp, za.za_name, strval,
				    src);
			}
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
out:
	mutex_exit(&spa->spa_props_lock);
	dsl_pool_config_exit(dp, FTAG);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
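
/*
 * Illustrative caller pattern (hypothetical, not part of the original
 * source): spa_prop_get() allocates the nvlist on behalf of the caller, so
 * a consumer is expected to free it when done, e.g.
 *
 *	nvlist_t *props = NULL;
 *	if (spa_prop_get(spa, &props) == 0) {
 *		... walk props with nvlist_next_nvpair() ...
 *		nvlist_free(props);
 *	}
 *
 * On any failure other than ENOENT (which is swallowed above), *nvp is
 * freed and set to NULL before the error is returned.
 */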

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		const char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPOOL_PROP_INVAL:
			/*
			 * Sanitize the input.
			 */
			if (zfs_prop_user(propname)) {
				if (strlen(propname) >= ZAP_MAXNAMELEN) {
					error = SET_ERROR(ENAMETOOLONG);
					break;
				}

				if (strlen(fnvpair_value_string(elem)) >=
				    ZAP_MAXVALUELEN) {
					error = SET_ERROR(E2BIG);
					break;
				}
			} else if (zpool_prop_feature(propname)) {
				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (nvpair_value_uint64(elem, &intval) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (intval != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				fname = strchr(propname, '@') + 1;
				if (zfeature_lookup_name(fname, NULL) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				has_feature = B_TRUE;
			} else {
				error = SET_ERROR(EINVAL);
				break;
			}
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
		case ZPOOL_PROP_AUTOTRIM:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_MULTIHOST:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);

			if (!error) {
				uint32_t hostid = zone_get_hostid(NULL);
				if (hostid)
					spa->spa_hostid = hostid;
				else
					error = SET_ERROR(ENOTSUP);
			}

			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				error = dmu_objset_hold(strval, FTAG, &os);
				if (error != 0)
					break;

				/* Must be ZPL. */
				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				if (!isprint(*check)) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = SET_ERROR(E2BIG);
			break;

		default:
			break;
		}

		if (error)
			break;
	}

	(void) nvlist_remove_all(props,
	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	const char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_INVAL &&
		    zfs_prop_user(nvpair_name(elem))) {
			need_sync = B_TRUE;
			break;
		}

		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
			uint64_t ver = 0;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}
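
/*
 * Illustrative sketch (hypothetical, not part of the original source): a
 * caller setting a simple 0/1 property might pass an nvlist such as
 *
 *	nvlist_t *nvp = fnvlist_alloc();
 *	fnvlist_add_uint64(nvp, zpool_prop_to_name(ZPOOL_PROP_AUTOTRIM), 1);
 *	error = spa_prop_set(spa, nvp);
 *	fnvlist_free(nvp);
 *
 * spa_prop_validate() above accepts values of 0 or 1 for such properties,
 * and anything that must reach disk is funneled through a single
 * dsl_sync_task(..., spa_sync_props, ...) call so it lands atomically in
 * one syncing txg. Properties that only affect in-core or config-file state
 * (cachefile, altroot, readonly) never by themselves force a sync task.
 */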

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid __maybe_unused = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		int error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (SET_ERROR(error));
	}

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    (u_longlong_t)oldguid, (u_longlong_t)*newguid);
}

/*
 * Change the GUID for the pool. This is done so that we can later
 * re-import a pool built from a clone of our own vdevs. We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty. Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool. We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		/*
		 * Clear the kobj flag from all the vdevs to allow
		 * vdev_cache_process_kobj_evt() to post events to all the
		 * vdevs since GUID is updated.
		 */
		vdev_clear_kobj_evt(spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);

		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
	int ret;

	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	return (TREE_ISIGN(ret));
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
	memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	uint_t cpus, flags = TASKQ_DYNAMIC;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >, 0);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = MIN(zio_taskq_batch_pct, 100);
		break;

	case ZTI_MODE_SCALE:
		flags |= TASKQ_THREADS_CPU_PCT;
		/*
		 * We want more taskqs to reduce lock contention, but we want
		 * less for better request ordering and CPU utilization.
		 */
		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		if (zio_taskq_batch_tpq > 0) {
			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
			    zio_taskq_batch_tpq);
		} else {
			/*
			 * Prefer 6 threads per taskq, but no more taskqs
			 * than threads in them on large systems. For 80%:
			 *
			 *			taskq	taskq	total
			 *	cpus	taskqs	percent	threads	threads
			 *	-------	-------	-------	-------	-------
			 *	1	1	80%	1	1
			 *	2	1	80%	1	1
			 *	4	1	80%	3	3
			 *	8	2	40%	3	6
			 *	16	3	27%	4	12
			 *	32	5	16%	5	25
			 *	64	7	11%	7	49
			 *	128	10	8%	10	100
			 *	256	14	6%	15	210
			 */
			count = 1 + cpus / 6;
			while (count * count > cpus)
				count--;
		}
		/* Limit each taskq within 100% to not trigger assertion. */
		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
		value = (zio_taskq_batch_pct + count / 2) / count;
		break;

	case ZTI_MODE_NULL:
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	ASSERT3U(count, >, 0);
	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;
		char name[32];

		if (count > 1)
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		else
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);

		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			(void) zio_taskq_basedc;
			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive. Run it at slightly less important
			 * priority than the other taskqs.
			 *
			 * Under Linux and FreeBSD this means incrementing
			 * the priority value as opposed to platforms like
			 * illumos where it should be decremented.
			 *
			 * On FreeBSD, if priorities divided by four (RQ_PPQ)
			 * are equal then a difference between them is
			 * insignificant.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
#if defined(__linux__)
				pri++;
#elif defined(__FreeBSD__)
				pri += 4;
#else
#error "unknown OS"
#endif
			}
			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
		}

		tqs->stqs_taskq[i] = tq;
	}
}
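
/*
 * Worked example (added for clarity, not part of the original source): for
 * ZTI_MODE_SCALE with the default zio_taskq_batch_pct of 80 on a 32-CPU
 * system and zio_taskq_batch_tpq left at 0:
 *
 *	cpus  = MAX(1, 32 * 80 / 100)	= 25
 *	count = 1 + 25 / 6		= 5	(5 * 5 <= 25, no decrement)
 *	count = MAX(5, (80 + 99) / 100)	= 5
 *	value = (80 + 5 / 2) / 5	= 16	(percent of CPUs per taskq)
 *
 * which matches the "32 cpus -> 5 taskqs at 16%" row in the table above.
 */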

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT3U(tqs->stqs_count, ==, 0);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

/*
 * Same as spa_taskq_dispatch_ent() but block on the task until completion.
 */
void
spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;
	taskqid_t id;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
	}

	id = taskq_dispatch(tq, func, arg, flags);
	if (id)
		taskq_wait_id(tq, id);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

/*
 * Disabled until spa_thread() can be adapted for Linux.
 */
#undef HAVE_SPA_THREAD

#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
static void
spa_thread(void *arg)
{
	psetid_t zio_taskq_psrset_bind = PS_NONE;
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

extern metaslab_ops_t *metaslab_allocator(spa_t *spa);

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, spa_mode_t mode)
{
	metaslab_ops_t *msp = metaslab_allocator(spa);
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;
	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;

	spa->spa_normal_class = metaslab_class_create(spa, msp);
	spa->spa_log_class = metaslab_class_create(spa, msp);
	spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
	spa->spa_special_class = metaslab_class_create(spa, msp);
	spa->spa_dedup_class = metaslab_class_create(spa, msp);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	(void) spa_create_process;
#ifdef HAVE_SPA_THREAD
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif /* HAVE_SPA_THREAD */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_healed,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));

	spa_activate_os(spa);

	spa_keystore_init(&spa->spa_keystore);

	/*
	 * This taskq is used to perform zvol-minor-related tasks
	 * asynchronously. This has several advantages, including easy
	 * resolution of various deadlocks.
	 *
	 * The taskq must be single threaded to ensure tasks are always
	 * processed in the order in which they were dispatched.
	 *
	 * A taskq per pool allows one to keep the pools independent.
	 * This way if one pool is suspended, it will not impact another.
	 *
	 * The preferred location to dispatch a zvol minor task is a sync
	 * task. In this context, there is easy access to the spa_t and minimal
	 * error handling is required because the sync task must succeed.
	 */
	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
	    1, INT_MAX, 0);

	/*
	 * Taskq dedicated to prefetcher threads: this is used to prevent the
	 * pool traverse code from monopolizing the global (and limited)
	 * system_taskq by inappropriately scheduling long running tasks on it.
	 */
	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	/*
	 * The taskq to upgrade datasets in this pool. Currently used by
	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
	 */
	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	if (spa->spa_zvol_taskq) {
		taskq_destroy(spa->spa_zvol_taskq);
		spa->spa_zvol_taskq = NULL;
	}

	if (spa->spa_prefetch_taskq) {
		taskq_destroy(spa->spa_prefetch_taskq);
		spa->spa_prefetch_taskq = NULL;
	}

	if (spa->spa_upgrade_taskq) {
		taskq_destroy(spa->spa_upgrade_taskq);
		spa->spa_upgrade_taskq = NULL;
	}

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	metaslab_class_destroy(spa->spa_embedded_log_class);
	spa->spa_embedded_log_class = NULL;

	metaslab_class_destroy(spa->spa_special_class);
	spa->spa_special_class = NULL;

	metaslab_class_destroy(spa->spa_dedup_class);
	spa->spa_dedup_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);
	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);
	avl_destroy(&spa->spa_errlist_healed);

	spa_keystore_fini(&spa->spa_keystore);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}

	spa_deactivate_os(spa);
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
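
/*
 * Illustrative sketch (added for clarity, not part of the original source):
 * the nvlist handed to spa_config_parse() mirrors the vdev tree, with each
 * interior vdev carrying its children under ZPOOL_CONFIG_CHILDREN, e.g.
 *
 *	root
 *	  ZPOOL_CONFIG_CHILDREN[0]: mirror
 *	    ZPOOL_CONFIG_CHILDREN[0]: leaf disk (e.g. a hypothetical /dev/sda)
 *	    ZPOOL_CONFIG_CHILDREN[1]: leaf disk (e.g. a hypothetical /dev/sdb)
 *	  ZPOOL_CONFIG_CHILDREN[1]: another top-level vdev
 *
 * The function recurses into that array; leaf vdevs terminate the recursion,
 * and a lookup failure other than ENOENT tears down the partially built tree.
 */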

static boolean_t
spa_should_flush_logs_on_unload(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return (B_FALSE);

	if (!spa_writeable(spa))
		return (B_FALSE);

	if (!spa->spa_sync_on)
		return (B_FALSE);

	if (spa_state(spa) != POOL_STATE_EXPORTED)
		return (B_FALSE);

	if (zfs_keep_log_spacemaps_at_export)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Opens a transaction that will set the flag that will instruct
 * spa_sync to attempt to flush all the metaslabs for that txg.
 */
static void
spa_unload_log_sm_flush_all(spa_t *spa)
{
	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));

	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);

	dmu_tx_commit(tx);
	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
}

static void
spa_unload_log_sm_metadata(spa_t *spa)
{
	void *cookie = NULL;
	spa_log_sm_t *sls;
	log_summary_entry_t *e;

	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
	    &cookie)) != NULL) {
		VERIFY0(sls->sls_mscount);
		kmem_free(sls, sizeof (spa_log_sm_t));
	}

	while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
		VERIFY0(e->lse_mscount);
		kmem_free(e, sizeof (log_summary_entry_t));
	}

	spa->spa_unflushed_stats.sus_nblocks = 0;
	spa->spa_unflushed_stats.sus_memused = 0;
	spa->spa_unflushed_stats.sus_blocklimit = 0;
}

static void
spa_destroy_aux_threads(spa_t *spa)
{
	if (spa->spa_condense_zthr != NULL) {
		zthr_destroy(spa->spa_condense_zthr);
		spa->spa_condense_zthr = NULL;
	}
	if (spa->spa_checkpoint_discard_zthr != NULL) {
		zthr_destroy(spa->spa_checkpoint_discard_zthr);
		spa->spa_checkpoint_discard_zthr = NULL;
	}
	if (spa->spa_livelist_delete_zthr != NULL) {
		zthr_destroy(spa->spa_livelist_delete_zthr);
		spa->spa_livelist_delete_zthr = NULL;
	}
	if (spa->spa_livelist_condense_zthr != NULL) {
		zthr_destroy(spa->spa_livelist_condense_zthr);
		spa->spa_livelist_condense_zthr = NULL;
	}
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);

	spa_import_progress_remove(spa_guid(spa));
	spa_load_note(spa, "UNLOADING");

	spa_wake_waiters(spa);

	/*
	 * If we have set the spa_final_txg, we have already performed the
	 * tasks below in spa_export_common(). We should not redo it here since
	 * we delay the final TXGs beyond what spa_final_txg is set at.
	 */
	if (spa->spa_final_txg == UINT64_MAX) {
		/*
		 * If the log space map feature is enabled and the pool is
		 * getting exported (but not destroyed), we want to spend some
		 * time flushing as many metaslabs as we can in an attempt to
		 * destroy log space maps and save import time.
		 */
		if (spa_should_flush_logs_on_unload(spa))
			spa_unload_log_sm_flush_all(spa);

		/*
		 * Stop async tasks.
		 */
		spa_async_suspend(spa);

		if (spa->spa_root_vdev) {
			vdev_t *root_vdev = spa->spa_root_vdev;
			vdev_initialize_stop_all(root_vdev,
			    VDEV_INITIALIZE_ACTIVE);
			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
			vdev_autotrim_stop_all(spa);
			vdev_rebuild_stop_all(spa);
		}
	}

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * This ensures that there is no async metaslab prefetching
	 * while we attempt to unload the spa.
	 */
	if (spa->spa_root_vdev != NULL) {
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
			vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
			if (vc->vdev_mg != NULL)
				taskq_wait(vc->vdev_mg->mg_taskq);
		}
	}

	if (spa->spa_mmp.mmp_thread)
		mmp_thread_stop(spa);

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_destroy_aux_threads(spa);

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);
	brt_unload(spa);
	spa_unload_log_sm_metadata(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	if (spa->spa_spares.sav_vdevs) {
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			vdev_free(spa->spa_spares.sav_vdevs[i]);
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	if (spa->spa_l2cache.sav_vdevs) {
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
		}
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;
BB
1795
1796 spa->spa_async_suspended = 0;
fb5f0bc8 1797
a1d477c2
MA
1798 spa->spa_indirect_vdevs_loaded = B_FALSE;
1799
d96eb2b1
DM
1800 if (spa->spa_comment != NULL) {
1801 spa_strfree(spa->spa_comment);
1802 spa->spa_comment = NULL;
1803 }
658fb802
CB
1804 if (spa->spa_compatibility != NULL) {
1805 spa_strfree(spa->spa_compatibility);
1806 spa->spa_compatibility = NULL;
1807 }
d96eb2b1 1808
619f0976 1809 spa_config_exit(spa, SCL_ALL, spa);
34dc7c2f
BB
1810}
1811
1812/*
1813 * Load (or re-load) the current list of vdevs describing the active spares for
1814 * this pool. When this is called, we have some form of basic information in
1815 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1816 * then re-generate a more complete list including status information.
1817 */
a1d477c2 1818void
34dc7c2f
BB
1819spa_load_spares(spa_t *spa)
1820{
1821 nvlist_t **spares;
1822 uint_t nspares;
1823 int i;
1824 vdev_t *vd, *tvd;
1825
d2734cce
SD
1826#ifndef _KERNEL
1827 /*
1828 * zdb opens both the current state of the pool and the
1829 * checkpointed state (if present), with a different spa_t.
1830 *
1831 * As spare vdevs are shared among open pools, we skip loading
1832 * them when we load the checkpointed state of the pool.
1833 */
1834 if (!spa_writeable(spa))
1835 return;
1836#endif
1837
b128c09f
BB
1838 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1839
34dc7c2f
BB
1840 /*
1841 * First, close and free any existing spare vdevs.
1842 */
cfb49616
RY
1843 if (spa->spa_spares.sav_vdevs) {
1844 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1845 vd = spa->spa_spares.sav_vdevs[i];
1846
1847 /* Undo the call to spa_activate() below */
1848 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1849 B_FALSE)) != NULL && tvd->vdev_isspare)
1850 spa_spare_remove(tvd);
1851 vdev_close(vd);
1852 vdev_free(vd);
1853 }
34dc7c2f 1854
34dc7c2f
BB
1855 kmem_free(spa->spa_spares.sav_vdevs,
1856 spa->spa_spares.sav_count * sizeof (void *));
cfb49616 1857 }
34dc7c2f
BB
1858
1859 if (spa->spa_spares.sav_config == NULL)
1860 nspares = 0;
1861 else
65ad5d11
AJ
1862 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1863 ZPOOL_CONFIG_SPARES, &spares, &nspares));
34dc7c2f
BB
1864
1865 spa->spa_spares.sav_count = (int)nspares;
1866 spa->spa_spares.sav_vdevs = NULL;
1867
1868 if (nspares == 0)
1869 return;
1870
1871 /*
1872 * Construct the array of vdevs, opening them to get status in the
1873 * process. For each spare, there are potentially two different vdev_t
1874 * structures associated with it: one in the list of spares (used only
1875 * for basic validation purposes) and one in the active vdev
1876 * configuration (if it's spared in). During this phase we open and
1877 * validate each vdev on the spare list. If the vdev also exists in the
1878 * active configuration, then we also mark this vdev as an active spare.
1879 */
904ea276 1880 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
79c76d5b 1881 KM_SLEEP);
34dc7c2f
BB
1882 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1883 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1884 VDEV_ALLOC_SPARE) == 0);
1885 ASSERT(vd != NULL);
1886
1887 spa->spa_spares.sav_vdevs[i] = vd;
1888
b128c09f
BB
1889 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1890 B_FALSE)) != NULL) {
34dc7c2f
BB
1891 if (!tvd->vdev_isspare)
1892 spa_spare_add(tvd);
1893
1894 /*
1895 * We only mark the spare active if we were successfully
1896 * able to load the vdev. Otherwise, importing a pool
1897 * with a bad active spare would result in strange
1898 * behavior, because multiple pools would think the spare
1899 * is actively in use.
1900 *
1901 * There is a vulnerability here to an equally bizarre
1902 * circumstance, where a dead active spare is later
1903 * brought back to life (onlined or otherwise). Given
1904 * the rarity of this scenario, and the extra complexity
1905 * it adds, we ignore the possibility.
1906 */
1907 if (!vdev_is_dead(tvd))
1908 spa_spare_activate(tvd);
1909 }
1910
b128c09f 1911 vd->vdev_top = vd;
9babb374 1912 vd->vdev_aux = &spa->spa_spares;
b128c09f 1913
34dc7c2f
BB
1914 if (vdev_open(vd) != 0)
1915 continue;
1916
34dc7c2f
BB
1917 if (vdev_validate_aux(vd) == 0)
1918 spa_spare_add(vd);
1919 }
1920
1921 /*
1922 * Recompute the stashed list of spares, with status information
1923 * this time.
1924 */
65ad5d11 1925 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);
34dc7c2f
BB
1926
1927 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
79c76d5b 1928 KM_SLEEP);
34dc7c2f
BB
1929 for (i = 0; i < spa->spa_spares.sav_count; i++)
1930 spares[i] = vdev_config_generate(spa,
428870ff 1931 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
65ad5d11 1932 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
795075e6
PD
1933 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
1934 spa->spa_spares.sav_count);
34dc7c2f
BB
1935 for (i = 0; i < spa->spa_spares.sav_count; i++)
1936 nvlist_free(spares[i]);
1937 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1938}
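/*
 * Illustrative sketch (hypothetical, not taken from this file): the
 * 'sav_config' nvlist handled above carries its spares as an nvlist array
 * under ZPOOL_CONFIG_SPARES, so a reader of such a config could walk the
 * array roughly as follows.  The function name is made up for the example.
 */
#if 0
static uint_t
example_count_spares(nvlist_t *sav_config)
{
	nvlist_t **spares;
	uint_t nspares;

	/* An absent config or missing array simply means "no spares". */
	if (sav_config == NULL ||
	    nvlist_lookup_nvlist_array(sav_config, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	return (nspares);
}
#endif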
1939
1940/*
1941 * Load (or re-load) the current list of vdevs describing the active l2cache for
1942 * this pool. When this is called, we have some form of basic information in
1943 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1944 * then re-generate a more complete list including status information.
1945 * Devices which are already active have their details maintained, and are
1946 * not re-opened.
1947 */
a1d477c2 1948void
34dc7c2f
BB
1949spa_load_l2cache(spa_t *spa)
1950{
460f239e 1951 nvlist_t **l2cache = NULL;
34dc7c2f
BB
1952 uint_t nl2cache;
1953 int i, j, oldnvdevs;
9babb374 1954 uint64_t guid;
a117a6d6 1955 vdev_t *vd, **oldvdevs, **newvdevs;
34dc7c2f
BB
1956 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1957
d2734cce
SD
1958#ifndef _KERNEL
1959 /*
1960 * zdb opens both the current state of the pool and the
1961 * checkpointed state (if present), with a different spa_t.
1962 *
1963 * As L2 caches are part of the ARC which is shared among open
1964 * pools, we skip loading them when we load the checkpointed
1965 * state of the pool.
1966 */
1967 if (!spa_writeable(spa))
1968 return;
1969#endif
1970
b128c09f
BB
1971 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1972
34dc7c2f
BB
1973 oldvdevs = sav->sav_vdevs;
1974 oldnvdevs = sav->sav_count;
1975 sav->sav_vdevs = NULL;
1976 sav->sav_count = 0;
1977
67d60824
NB
1978 if (sav->sav_config == NULL) {
1979 nl2cache = 0;
1980 newvdevs = NULL;
1981 goto out;
1982 }
1983
65ad5d11
AJ
1984 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
1985 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
67d60824
NB
1986 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1987
34dc7c2f
BB
1988 /*
1989 * Process new nvlist of vdevs.
1990 */
1991 for (i = 0; i < nl2cache; i++) {
65ad5d11 1992 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);
34dc7c2f
BB
1993
1994 newvdevs[i] = NULL;
1995 for (j = 0; j < oldnvdevs; j++) {
1996 vd = oldvdevs[j];
1997 if (vd != NULL && guid == vd->vdev_guid) {
1998 /*
1999 * Retain previous vdev for add/remove ops.
2000 */
2001 newvdevs[i] = vd;
2002 oldvdevs[j] = NULL;
2003 break;
2004 }
2005 }
2006
2007 if (newvdevs[i] == NULL) {
2008 /*
2009 * Create new vdev
2010 */
2011 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
2012 VDEV_ALLOC_L2CACHE) == 0);
2013 ASSERT(vd != NULL);
2014 newvdevs[i] = vd;
2015
2016 /*
2017 * Commit this vdev as an l2cache device,
2018 * even if it fails to open.
2019 */
2020 spa_l2cache_add(vd);
2021
b128c09f
BB
2022 vd->vdev_top = vd;
2023 vd->vdev_aux = sav;
2024
2025 spa_l2cache_activate(vd);
2026
34dc7c2f
BB
2027 if (vdev_open(vd) != 0)
2028 continue;
2029
34dc7c2f
BB
2030 (void) vdev_validate_aux(vd);
2031
9babb374
BB
2032 if (!vdev_is_dead(vd))
2033 l2arc_add_vdev(spa, vd);
b7654bd7
GA
2034
2035 /*
2036 * Upon cache device addition to a pool, or pool
2037 * creation with a cache device, or if the header
2038 * of the device is invalid, we issue an async
2039 * TRIM command for the whole device, which will
2040 * execute if l2arc_trim_ahead > 0.
2041 */
2042 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
34dc7c2f
BB
2043 }
2044 }
2045
67d60824
NB
2046 sav->sav_vdevs = newvdevs;
2047 sav->sav_count = (int)nl2cache;
2048
2049 /*
2050 * Recompute the stashed list of l2cache devices, with status
2051 * information this time.
2052 */
65ad5d11 2053 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);
67d60824 2054
460f239e
D
2055 if (sav->sav_count > 0)
2056 l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
2057 KM_SLEEP);
67d60824
NB
2058 for (i = 0; i < sav->sav_count; i++)
2059 l2cache[i] = vdev_config_generate(spa,
2060 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
795075e6
PD
2061 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
2062 (const nvlist_t * const *)l2cache, sav->sav_count);
67d60824
NB
2063
2064out:
34dc7c2f
BB
2065 /*
2066 * Purge vdevs that were dropped
2067 */
cfb49616
RY
2068 if (oldvdevs) {
2069 for (i = 0; i < oldnvdevs; i++) {
2070 uint64_t pool;
2071
2072 vd = oldvdevs[i];
2073 if (vd != NULL) {
2074 ASSERT(vd->vdev_isl2cache);
2075
2076 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2077 pool != 0ULL && l2arc_vdev_present(vd))
2078 l2arc_remove_vdev(vd);
2079 vdev_clear_stats(vd);
2080 vdev_free(vd);
2081 }
34dc7c2f 2082 }
34dc7c2f 2083
34dc7c2f 2084 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
cfb49616 2085 }
34dc7c2f 2086
34dc7c2f
BB
2087 for (i = 0; i < sav->sav_count; i++)
2088 nvlist_free(l2cache[i]);
2089 if (sav->sav_count)
2090 kmem_free(l2cache, sav->sav_count * sizeof (void *));
2091}
2092
2093static int
2094load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
2095{
2096 dmu_buf_t *db;
2097 char *packed = NULL;
2098 size_t nvsize = 0;
2099 int error;
2100 *value = NULL;
2101
c3275b56
BB
2102 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
2103 if (error)
2104 return (error);
2105
34dc7c2f
BB
2106 nvsize = *(uint64_t *)db->db_data;
2107 dmu_buf_rele(db, FTAG);
2108
77aef6f6 2109 packed = vmem_alloc(nvsize, KM_SLEEP);
9babb374
BB
2110 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
2111 DMU_READ_PREFETCH);
34dc7c2f
BB
2112 if (error == 0)
2113 error = nvlist_unpack(packed, nvsize, value, 0);
77aef6f6 2114 vmem_free(packed, nvsize);
34dc7c2f
BB
2115
2116 return (error);
2117}
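/*
 * Illustrative usage sketch (hypothetical): load_nvlist() expects an object
 * whose bonus buffer holds the packed nvlist size, as read above.  A caller
 * might consume it roughly as follows; 'obj' is assumed to reference such
 * an object and error handling is trimmed down.
 */
#if 0
	nvlist_t *nv = NULL;

	if (load_nvlist(spa, obj, &nv) == 0) {
		/* ... inspect the unpacked nvlist ... */
		nvlist_free(nv);
	}
#endif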
2118
6cb8e530
PZ
2119/*
2120 * Concrete top-level vdevs that are not missing and are not logs. At every
2121 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
2122 */
2123static uint64_t
2124spa_healthy_core_tvds(spa_t *spa)
2125{
2126 vdev_t *rvd = spa->spa_root_vdev;
2127 uint64_t tvds = 0;
2128
2129 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
2130 vdev_t *vd = rvd->vdev_child[i];
2131 if (vd->vdev_islog)
2132 continue;
2133 if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
2134 tvds++;
2135 }
2136
2137 return (tvds);
2138}
2139
34dc7c2f
BB
2140/*
2141 * Checks to see if the given vdev could not be opened, in which case we post a
2142 * sysevent to notify the autoreplace code that the device has been removed.
2143 */
2144static void
2145spa_check_removed(vdev_t *vd)
2146{
6cb8e530 2147 for (uint64_t c = 0; c < vd->vdev_children; c++)
34dc7c2f
BB
2148 spa_check_removed(vd->vdev_child[c]);
2149
7011fb60 2150 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
a1d477c2 2151 vdev_is_concrete(vd)) {
fb390aaf 2152 zfs_post_autoreplace(vd->vdev_spa, vd);
12fa0466 2153 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
34dc7c2f
BB
2154 }
2155}
2156
6cb8e530
PZ
2157static int
2158spa_check_for_missing_logs(spa_t *spa)
9babb374 2159{
6cb8e530 2160 vdev_t *rvd = spa->spa_root_vdev;
9babb374 2161
428870ff 2162 /*
572e2857 2163 * If we're doing a normal import, then build up any additional
6cb8e530 2164 * diagnostic information about missing log devices.
572e2857 2165 * We'll pass this up to the user for further processing.
428870ff 2166 */
572e2857
BB
2167 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
2168 nvlist_t **child, *nv;
2169 uint64_t idx = 0;
2170
160987b5 2171 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
79c76d5b 2172 KM_SLEEP);
65ad5d11 2173 nv = fnvlist_alloc();
572e2857 2174
6cb8e530 2175 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
572e2857 2176 vdev_t *tvd = rvd->vdev_child[c];
572e2857 2177
6cb8e530
PZ
2178 /*
2179 * We consider a device as missing only if it failed
2180 * to open (i.e. offline or faulted is not considered
2181 * as missing).
2182 */
2183 if (tvd->vdev_islog &&
2184 tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
2185 child[idx++] = vdev_config_generate(spa, tvd,
2186 B_FALSE, VDEV_CONFIG_MISSING);
2187 }
572e2857 2188 }
9babb374 2189
6cb8e530 2190 if (idx > 0) {
795075e6
PD
2191 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2192 (const nvlist_t * const *)child, idx);
6cb8e530
PZ
2193 fnvlist_add_nvlist(spa->spa_load_info,
2194 ZPOOL_CONFIG_MISSING_DEVICES, nv);
572e2857 2195
6cb8e530 2196 for (uint64_t i = 0; i < idx; i++)
572e2857
BB
2197 nvlist_free(child[i]);
2198 }
2199 nvlist_free(nv);
2200 kmem_free(child, rvd->vdev_children * sizeof (char **));
572e2857 2201
6cb8e530
PZ
2202 if (idx > 0) {
2203 spa_load_failed(spa, "some log devices are missing");
db7d07e1 2204 vdev_dbgmsg_print_tree(rvd, 2);
6cb8e530
PZ
2205 return (SET_ERROR(ENXIO));
2206 }
2207 } else {
2208 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2209 vdev_t *tvd = rvd->vdev_child[c];
a1d477c2 2210
6cb8e530
PZ
2211 if (tvd->vdev_islog &&
2212 tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
572e2857 2213 spa_set_log_state(spa, SPA_LOG_CLEAR);
6cb8e530
PZ
2214 spa_load_note(spa, "some log devices are "
2215 "missing, ZIL is dropped.");
db7d07e1 2216 vdev_dbgmsg_print_tree(rvd, 2);
6cb8e530 2217 break;
e0ab3ab5 2218 }
572e2857 2219 }
9babb374 2220 }
e0ab3ab5 2221
6cb8e530 2222 return (0);
9babb374
BB
2223}
2224
b128c09f
BB
2225/*
2226 * Check for missing log devices
2227 */
13fe0198 2228static boolean_t
b128c09f
BB
2229spa_check_logs(spa_t *spa)
2230{
13fe0198 2231 boolean_t rv = B_FALSE;
9c43027b 2232 dsl_pool_t *dp = spa_get_dsl(spa);
13fe0198 2233
b128c09f 2234 switch (spa->spa_log_state) {
e75c13c3
BB
2235 default:
2236 break;
b128c09f
BB
2237 case SPA_LOG_MISSING:
2238 /* need to recheck in case slog has been restored */
2239 case SPA_LOG_UNKNOWN:
9c43027b
AJ
2240 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2241 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
13fe0198 2242 if (rv)
428870ff 2243 spa_set_log_state(spa, SPA_LOG_MISSING);
b128c09f 2244 break;
b128c09f 2245 }
13fe0198 2246 return (rv);
b128c09f
BB
2247}
2248
aa755b35
MA
2249/*
2250 * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
2251 */
428870ff
BB
2252static boolean_t
2253spa_passivate_log(spa_t *spa)
34dc7c2f 2254{
428870ff
BB
2255 vdev_t *rvd = spa->spa_root_vdev;
2256 boolean_t slog_found = B_FALSE;
b128c09f 2257
428870ff 2258 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
fb5f0bc8 2259
1c27024e 2260 for (int c = 0; c < rvd->vdev_children; c++) {
428870ff 2261 vdev_t *tvd = rvd->vdev_child[c];
34dc7c2f 2262
428870ff 2263 if (tvd->vdev_islog) {
aa755b35
MA
2264 ASSERT3P(tvd->vdev_log_mg, ==, NULL);
2265 metaslab_group_passivate(tvd->vdev_mg);
428870ff
BB
2266 slog_found = B_TRUE;
2267 }
34dc7c2f
BB
2268 }
2269
428870ff
BB
2270 return (slog_found);
2271}
34dc7c2f 2272
aa755b35
MA
2273/*
2274 * Activate any log vdevs (note, does not apply to embedded log metaslabs).
2275 */
428870ff
BB
2276static void
2277spa_activate_log(spa_t *spa)
2278{
2279 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f 2280
428870ff
BB
2281 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2282
1c27024e 2283 for (int c = 0; c < rvd->vdev_children; c++) {
428870ff 2284 vdev_t *tvd = rvd->vdev_child[c];
428870ff 2285
aa755b35
MA
2286 if (tvd->vdev_islog) {
2287 ASSERT3P(tvd->vdev_log_mg, ==, NULL);
2288 metaslab_group_activate(tvd->vdev_mg);
2289 }
34dc7c2f 2290 }
428870ff 2291}
34dc7c2f 2292
428870ff 2293int
a1d477c2 2294spa_reset_logs(spa_t *spa)
428870ff 2295{
13fe0198 2296 int error;
9babb374 2297
a1d477c2 2298 error = dmu_objset_find(spa_name(spa), zil_reset,
13fe0198
MA
2299 NULL, DS_FIND_CHILDREN);
2300 if (error == 0) {
428870ff
BB
2301 /*
2302 * We successfully offlined the log device, sync out the
2303 * current txg so that the "stubby" block can be removed
2304 * by zil_sync().
2305 */
2306 txg_wait_synced(spa->spa_dsl_pool, 0);
2307 }
2308 return (error);
2309}
34dc7c2f 2310
428870ff
BB
2311static void
2312spa_aux_check_removed(spa_aux_vdev_t *sav)
2313{
1c27024e 2314 for (int i = 0; i < sav->sav_count; i++)
428870ff
BB
2315 spa_check_removed(sav->sav_vdevs[i]);
2316}
34dc7c2f 2317
428870ff
BB
2318void
2319spa_claim_notify(zio_t *zio)
2320{
2321 spa_t *spa = zio->io_spa;
34dc7c2f 2322
428870ff
BB
2323 if (zio->io_error)
2324 return;
34dc7c2f 2325
428870ff
BB
2326 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
2327 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
2328 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
2329 mutex_exit(&spa->spa_props_lock);
2330}
34dc7c2f 2331
428870ff 2332typedef struct spa_load_error {
f2c5bc15 2333 boolean_t sle_verify_data;
428870ff
BB
2334 uint64_t sle_meta_count;
2335 uint64_t sle_data_count;
2336} spa_load_error_t;
34dc7c2f 2337
428870ff
BB
2338static void
2339spa_load_verify_done(zio_t *zio)
2340{
2341 blkptr_t *bp = zio->io_bp;
2342 spa_load_error_t *sle = zio->io_private;
2343 dmu_object_type_t type = BP_GET_TYPE(bp);
2344 int error = zio->io_error;
dea377c0 2345 spa_t *spa = zio->io_spa;
34dc7c2f 2346
a6255b7f 2347 abd_free(zio->io_abd);
428870ff 2348 if (error) {
9ae529ec 2349 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
428870ff 2350 type != DMU_OT_INTENT_LOG)
bc89ac84 2351 atomic_inc_64(&sle->sle_meta_count);
428870ff 2352 else
bc89ac84 2353 atomic_inc_64(&sle->sle_data_count);
34dc7c2f 2354 }
dea377c0
MA
2355
2356 mutex_enter(&spa->spa_scrub_lock);
c8242a96 2357 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
dea377c0
MA
2358 cv_broadcast(&spa->spa_scrub_io_cv);
2359 mutex_exit(&spa->spa_scrub_lock);
428870ff 2360}
34dc7c2f 2361
dea377c0 2362/*
e1cfd73f 2363 * Maximum number of inflight bytes is a power-of-two fraction of the arc
c8242a96 2364 * size (arc_target_bytes() >> spa_load_verify_shift); by default 1/16th.
dea377c0 2365 */
fdc2d303 2366static uint_t spa_load_verify_shift = 4;
18168da7
AZ
2367static int spa_load_verify_metadata = B_TRUE;
2368static int spa_load_verify_data = B_TRUE;
dea377c0 2369
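/*
 * Worked example (illustrative ARC size, not a value taken from this file):
 * with an ARC target of 64 GiB and the default spa_load_verify_shift of 4,
 * the cap computed in spa_load_verify_cb() below is 64 GiB >> 4 = 4 GiB of
 * outstanding verification reads.
 */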
428870ff
BB
2370static int
2371spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
5dbd68a3 2372 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
428870ff 2373{
f2c5bc15
AM
2374 zio_t *rio = arg;
2375 spa_load_error_t *sle = rio->io_private;
2376
14e4e3cb
AZ
2377 (void) zilog, (void) dnp;
2378
dea377c0
MA
2379 /*
2380 * Note: normally this routine will not be called if
2381 * spa_load_verify_metadata is not set. However, it may be useful
2382 * to manually set the flag after the traversal has begun.
2383 */
2384 if (!spa_load_verify_metadata)
2385 return (0);
2cd0f98f
BB
2386
2387 /*
2388 * Sanity check the block pointer in order to detect obvious damage
2389 * before using the contents in subsequent checks or in zio_read().
2390 * When damaged consider it to be a metadata error since we cannot
2391 * trust the BP_GET_TYPE and BP_GET_LEVEL values.
2392 */
3095ca91 2393 if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
2cd0f98f
BB
2394 atomic_inc_64(&sle->sle_meta_count);
2395 return (0);
2396 }
2397
2398 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
2399 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
2400 return (0);
2401
f2c5bc15
AM
2402 if (!BP_IS_METADATA(bp) &&
2403 (!spa_load_verify_data || !sle->sle_verify_data))
dea377c0
MA
2404 return (0);
2405
1e527162
GW
2406 uint64_t maxinflight_bytes =
2407 arc_target_bytes() >> spa_load_verify_shift;
1c27024e 2408 size_t size = BP_GET_PSIZE(bp);
dea377c0
MA
2409
2410 mutex_enter(&spa->spa_scrub_lock);
c8242a96 2411 while (spa->spa_load_verify_bytes >= maxinflight_bytes)
dea377c0 2412 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
c8242a96 2413 spa->spa_load_verify_bytes += size;
dea377c0
MA
2414 mutex_exit(&spa->spa_scrub_lock);
2415
a6255b7f 2416 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
dea377c0
MA
2417 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2418 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2419 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
428870ff
BB
2420 return (0);
2421}
34dc7c2f 2422
65c7cc49 2423static int
d1d19c78
PD
2424verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2425{
14e4e3cb
AZ
2426 (void) dp, (void) arg;
2427
d1d19c78
PD
2428 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2429 return (SET_ERROR(ENAMETOOLONG));
2430
2431 return (0);
2432}
2433
428870ff
BB
2434static int
2435spa_load_verify(spa_t *spa)
2436{
2437 zio_t *rio;
2438 spa_load_error_t sle = { 0 };
8a393be3 2439 zpool_load_policy_t policy;
428870ff 2440 boolean_t verify_ok = B_FALSE;
dea377c0 2441 int error = 0;
34dc7c2f 2442
8a393be3 2443 zpool_get_load_policy(spa->spa_config, &policy);
34dc7c2f 2444
f2c5bc15
AM
2445 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
2446 policy.zlp_maxmeta == UINT64_MAX)
428870ff 2447 return (0);
34dc7c2f 2448
d1d19c78
PD
2449 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2450 error = dmu_objset_find_dp(spa->spa_dsl_pool,
2451 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2452 DS_FIND_CHILDREN);
2453 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2454 if (error != 0)
2455 return (error);
2456
f2c5bc15
AM
2457 /*
2458 * Verify data only if we are rewinding or error limit was set.
2459 * Otherwise nothing except dbgmsg care about it to waste time.
2460 */
2461 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
2462 (policy.zlp_maxdata < UINT64_MAX);
2463
428870ff
BB
2464 rio = zio_root(spa, NULL, &sle,
2465 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
34dc7c2f 2466
dea377c0 2467 if (spa_load_verify_metadata) {
4a0ee12a
PZ
2468 if (spa->spa_extreme_rewind) {
2469 spa_load_note(spa, "performing a complete scan of the "
2470 "pool since extreme rewind is on. This may take "
2471 "a very long time.\n (spa_load_verify_data=%u, "
2472 "spa_load_verify_metadata=%u)",
2473 spa_load_verify_data, spa_load_verify_metadata);
2474 }
c8242a96 2475
dea377c0 2476 error = traverse_pool(spa, spa->spa_verify_min_txg,
b5256303
TC
2477 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
2478 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
dea377c0 2479 }
428870ff
BB
2480
2481 (void) zio_wait(rio);
c8242a96 2482 ASSERT0(spa->spa_load_verify_bytes);
428870ff
BB
2483
2484 spa->spa_load_meta_errors = sle.sle_meta_count;
2485 spa->spa_load_data_errors = sle.sle_data_count;
2486
afd2f7b7
PZ
2487 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2488 spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2489 "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2490 (u_longlong_t)sle.sle_data_count);
2491 }
2492
2493 if (spa_load_verify_dryrun ||
8a393be3
PZ
2494 (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2495 sle.sle_data_count <= policy.zlp_maxdata)) {
572e2857
BB
2496 int64_t loss = 0;
2497
428870ff
BB
2498 verify_ok = B_TRUE;
2499 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2500 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
572e2857
BB
2501
2502 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
65ad5d11
AJ
2503 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
2504 spa->spa_load_txg_ts);
2505 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
2506 loss);
f2c5bc15
AM
2507 fnvlist_add_uint64(spa->spa_load_info,
2508 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
65ad5d11
AJ
2509 fnvlist_add_uint64(spa->spa_load_info,
2510 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
428870ff
BB
2511 } else {
2512 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2513 }
2514
afd2f7b7
PZ
2515 if (spa_load_verify_dryrun)
2516 return (0);
2517
428870ff
BB
2518 if (error) {
2519 if (error != ENXIO && error != EIO)
2e528b49 2520 error = SET_ERROR(EIO);
428870ff
BB
2521 return (error);
2522 }
2523
2524 return (verify_ok ? 0 : EIO);
2525}
2526
2527/*
2528 * Find a value in the pool props object.
2529 */
2530static void
2531spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2532{
2533 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2534 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2535}
2536
2537/*
2538 * Find a value in the pool directory object.
2539 */
2540static int
4a0ee12a 2541spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
428870ff 2542{
4a0ee12a
PZ
2543 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2544 name, sizeof (uint64_t), 1, val);
2545
2546 if (error != 0 && (error != ENOENT || log_enoent)) {
2547 spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2548 "[error=%d]", name, error);
2549 }
2550
2551 return (error);
428870ff
BB
2552}
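/*
 * Illustrative usage sketch (an assumption about a typical caller, not a
 * quote from this file): a load-path lookup of the pool config object via
 * spa_dir_prop() would look roughly like this, with 'rvd' assumed to be
 * the root vdev.
 */
#if 0
	uint64_t config_obj;

	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &config_obj, B_TRUE) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
#endif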
2553
2554static int
2555spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2556{
2557 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
a1d477c2 2558 return (SET_ERROR(err));
428870ff
BB
2559}
2560
37f03da8
SH
2561boolean_t
2562spa_livelist_delete_check(spa_t *spa)
2563{
2564 return (spa->spa_livelists_to_delete != 0);
2565}
2566
37f03da8
SH
2567static boolean_t
2568spa_livelist_delete_cb_check(void *arg, zthr_t *z)
2569{
14e4e3cb 2570 (void) z;
37f03da8
SH
2571 spa_t *spa = arg;
2572 return (spa_livelist_delete_check(spa));
2573}
2574
2575static int
2576delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2577{
2578 spa_t *spa = arg;
2579 zio_free(spa, tx->tx_txg, bp);
2580 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
2581 -bp_get_dsize_sync(spa, bp),
2582 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
2583 return (0);
2584}
2585
2586static int
2587dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
2588{
2589 int err;
2590 zap_cursor_t zc;
2591 zap_attribute_t za;
2592 zap_cursor_init(&zc, os, zap_obj);
2593 err = zap_cursor_retrieve(&zc, &za);
2594 zap_cursor_fini(&zc);
2595 if (err == 0)
2596 *llp = za.za_first_integer;
2597 return (err);
2598}
2599
2600/*
2601 * Components of livelist deletion that must be performed in syncing
2602 * context: freeing block pointers and updating the pool-wide data
2603 * structures to indicate how much work is left to do
2604 */
2605typedef struct sublist_delete_arg {
2606 spa_t *spa;
2607 dsl_deadlist_t *ll;
2608 uint64_t key;
2609 bplist_t *to_free;
2610} sublist_delete_arg_t;
2611
2612static void
2613sublist_delete_sync(void *arg, dmu_tx_t *tx)
2614{
2615 sublist_delete_arg_t *sda = arg;
2616 spa_t *spa = sda->spa;
2617 dsl_deadlist_t *ll = sda->ll;
2618 uint64_t key = sda->key;
2619 bplist_t *to_free = sda->to_free;
2620
2621 bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
2622 dsl_deadlist_remove_entry(ll, key, tx);
2623}
2624
2625typedef struct livelist_delete_arg {
2626 spa_t *spa;
2627 uint64_t ll_obj;
2628 uint64_t zap_obj;
2629} livelist_delete_arg_t;
2630
2631static void
2632livelist_delete_sync(void *arg, dmu_tx_t *tx)
2633{
2634 livelist_delete_arg_t *lda = arg;
2635 spa_t *spa = lda->spa;
2636 uint64_t ll_obj = lda->ll_obj;
2637 uint64_t zap_obj = lda->zap_obj;
2638 objset_t *mos = spa->spa_meta_objset;
2639 uint64_t count;
2640
2641 /* free the livelist and decrement the feature count */
2642 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
2643 dsl_deadlist_free(mos, ll_obj, tx);
2644 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
2645 VERIFY0(zap_count(mos, zap_obj, &count));
2646 if (count == 0) {
2647 /* no more livelists to delete */
2648 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
2649 DMU_POOL_DELETED_CLONES, tx));
2650 VERIFY0(zap_destroy(mos, zap_obj, tx));
2651 spa->spa_livelists_to_delete = 0;
e60e158e 2652 spa_notify_waiters(spa);
37f03da8
SH
2653 }
2654}
2655
2656/*
2657 * Load in the value for the livelist to be removed and open it. Then,
2658 * load its first sublist and determine which block pointers should actually
2659 * be freed. Then, call a synctask which performs the actual frees and updates
2660 * the pool-wide livelist data.
2661 */
65c7cc49 2662static void
37f03da8
SH
2663spa_livelist_delete_cb(void *arg, zthr_t *z)
2664{
2665 spa_t *spa = arg;
2666 uint64_t ll_obj = 0, count;
2667 objset_t *mos = spa->spa_meta_objset;
2668 uint64_t zap_obj = spa->spa_livelists_to_delete;
2669 /*
2670 * Determine the next livelist to delete. This function should only
2671 * be called if there is at least one deleted clone.
2672 */
2673 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
2674 VERIFY0(zap_count(mos, ll_obj, &count));
2675 if (count > 0) {
c9562576 2676 dsl_deadlist_t *ll;
37f03da8
SH
2677 dsl_deadlist_entry_t *dle;
2678 bplist_t to_free;
c9562576
PS
2679 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
2680 dsl_deadlist_open(ll, mos, ll_obj);
2681 dle = dsl_deadlist_first(ll);
37f03da8
SH
2682 ASSERT3P(dle, !=, NULL);
2683 bplist_create(&to_free);
2684 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
2685 z, NULL);
2686 if (err == 0) {
2687 sublist_delete_arg_t sync_arg = {
2688 .spa = spa,
c9562576 2689 .ll = ll,
37f03da8
SH
2690 .key = dle->dle_mintxg,
2691 .to_free = &to_free
2692 };
2693 zfs_dbgmsg("deleting sublist (id %llu) from"
8e739b2c
RE
2694 " livelist %llu, %lld remaining",
2695 (u_longlong_t)dle->dle_bpobj.bpo_object,
2696 (u_longlong_t)ll_obj, (longlong_t)count - 1);
37f03da8
SH
2697 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
2698 sublist_delete_sync, &sync_arg, 0,
2699 ZFS_SPACE_CHECK_DESTROY));
2700 } else {
d87676a9 2701 VERIFY3U(err, ==, EINTR);
37f03da8
SH
2702 }
2703 bplist_clear(&to_free);
2704 bplist_destroy(&to_free);
c9562576
PS
2705 dsl_deadlist_close(ll);
2706 kmem_free(ll, sizeof (dsl_deadlist_t));
37f03da8
SH
2707 } else {
2708 livelist_delete_arg_t sync_arg = {
2709 .spa = spa,
2710 .ll_obj = ll_obj,
2711 .zap_obj = zap_obj
2712 };
8e739b2c
RE
2713 zfs_dbgmsg("deletion of livelist %llu completed",
2714 (u_longlong_t)ll_obj);
37f03da8
SH
2715 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
2716 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
2717 }
2718}
2719
65c7cc49 2720static void
37f03da8
SH
2721spa_start_livelist_destroy_thread(spa_t *spa)
2722{
2723 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
843e9ca2
SD
2724 spa->spa_livelist_delete_zthr =
2725 zthr_create("z_livelist_destroy",
6bc61d22
TN
2726 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
2727 minclsyspri);
37f03da8
SH
2728}
2729
2730typedef struct livelist_new_arg {
2731 bplist_t *allocs;
2732 bplist_t *frees;
2733} livelist_new_arg_t;
2734
2735static int
2736livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
2737 dmu_tx_t *tx)
2738{
2739 ASSERT(tx == NULL);
2740 livelist_new_arg_t *lna = arg;
2741 if (bp_freed) {
2742 bplist_append(lna->frees, bp);
2743 } else {
2744 bplist_append(lna->allocs, bp);
2745 zfs_livelist_condense_new_alloc++;
2746 }
2747 return (0);
2748}
2749
2750typedef struct livelist_condense_arg {
2751 spa_t *spa;
2752 bplist_t to_keep;
2753 uint64_t first_size;
2754 uint64_t next_size;
2755} livelist_condense_arg_t;
2756
2757static void
2758spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
2759{
2760 livelist_condense_arg_t *lca = arg;
2761 spa_t *spa = lca->spa;
2762 bplist_t new_frees;
2763 dsl_dataset_t *ds = spa->spa_to_condense.ds;
2764
2765 /* Have we been cancelled? */
2766 if (spa->spa_to_condense.cancelled) {
2767 zfs_livelist_condense_sync_cancel++;
2768 goto out;
2769 }
2770
2771 dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
2772 dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
2773 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
2774
2775 /*
2776 * It's possible that the livelist was changed while the zthr was
2777 * running. Therefore, we need to check for new blkptrs in the two
2778 * entries being condensed and continue to track them in the livelist.
2779 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
2780 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
2781 * we need to sort them into two different bplists.
2782 */
2783 uint64_t first_obj = first->dle_bpobj.bpo_object;
2784 uint64_t next_obj = next->dle_bpobj.bpo_object;
2785 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
2786 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
2787
2788 bplist_create(&new_frees);
2789 livelist_new_arg_t new_bps = {
2790 .allocs = &lca->to_keep,
2791 .frees = &new_frees,
2792 };
2793
2794 if (cur_first_size > lca->first_size) {
2795 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
2796 livelist_track_new_cb, &new_bps, lca->first_size));
2797 }
2798 if (cur_next_size > lca->next_size) {
2799 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
2800 livelist_track_new_cb, &new_bps, lca->next_size));
2801 }
2802
2803 dsl_deadlist_clear_entry(first, ll, tx);
2804 ASSERT(bpobj_is_empty(&first->dle_bpobj));
2805 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
2806
2807 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
2808 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
2809 bplist_destroy(&new_frees);
2810
2811 char dsname[ZFS_MAX_DATASET_NAME_LEN];
2812 dsl_dataset_name(ds, dsname);
2813 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
2814 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
8e739b2c
RE
2815 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
2816 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
2817 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
2818 (u_longlong_t)cur_next_size,
2819 (u_longlong_t)first->dle_bpobj.bpo_object,
2820 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
37f03da8
SH
2821out:
2822 dmu_buf_rele(ds->ds_dbuf, spa);
2823 spa->spa_to_condense.ds = NULL;
2824 bplist_clear(&lca->to_keep);
2825 bplist_destroy(&lca->to_keep);
2826 kmem_free(lca, sizeof (livelist_condense_arg_t));
2827 spa->spa_to_condense.syncing = B_FALSE;
2828}
2829
65c7cc49 2830static void
37f03da8
SH
2831spa_livelist_condense_cb(void *arg, zthr_t *t)
2832{
2833 while (zfs_livelist_condense_zthr_pause &&
2834 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
2835 delay(1);
2836
2837 spa_t *spa = arg;
2838 dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
2839 dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
2840 uint64_t first_size, next_size;
2841
2842 livelist_condense_arg_t *lca =
2843 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
2844 bplist_create(&lca->to_keep);
2845
2846 /*
2847 * Process the livelists (matching FREEs and ALLOCs) in open context
2848 * so we have minimal work in syncing context to condense.
2849 *
2850 * We save bpobj sizes (first_size and next_size) to use later in
2851 * syncing context to determine if entries were added to these sublists
2852 * while in open context. This is possible because the clone is still
2853 * active and open for normal writes and we want to make sure the new,
2854 * unprocessed blockpointers are inserted into the livelist normally.
2855 *
2856 * Note that dsl_process_sub_livelist() both stores the size (the number
2857 * of blockpointers) and iterates over them while the bpobj's lock is
2858 * held, so the sizes returned to us are consistent with what was
2859 * actually processed.
2860 */
2861 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
2862 &first_size);
2863 if (err == 0)
2864 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
2865 t, &next_size);
2866
2867 if (err == 0) {
2868 while (zfs_livelist_condense_sync_pause &&
2869 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
2870 delay(1);
2871
2872 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
2873 dmu_tx_mark_netfree(tx);
2874 dmu_tx_hold_space(tx, 1);
2875 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
2876 if (err == 0) {
2877 /*
2878 * Prevent the condense zthr restarting before
2879 * the synctask completes.
2880 */
2881 spa->spa_to_condense.syncing = B_TRUE;
2882 lca->spa = spa;
2883 lca->first_size = first_size;
2884 lca->next_size = next_size;
2885 dsl_sync_task_nowait(spa_get_dsl(spa),
38080324 2886 spa_livelist_condense_sync, lca, tx);
37f03da8
SH
2887 dmu_tx_commit(tx);
2888 return;
2889 }
2890 }
2891 /*
2892 * Condensing cannot continue: either it was externally stopped or
2893 * we were unable to assign the tx because the pool has run out of
2894 * space. In the second case, we'll just end up trying to condense
2895 * again in a later txg.
2896 */
2897 ASSERT(err != 0);
2898 bplist_clear(&lca->to_keep);
2899 bplist_destroy(&lca->to_keep);
2900 kmem_free(lca, sizeof (livelist_condense_arg_t));
2901 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
2902 spa->spa_to_condense.ds = NULL;
2903 if (err == EINTR)
2904 zfs_livelist_condense_zthr_cancel++;
2905}
2906
37f03da8
SH
2907/*
2908 * Check that there is something to condense but that a condense is not
2909 * already in progress and that condensing has not been cancelled.
2910 */
2911static boolean_t
2912spa_livelist_condense_cb_check(void *arg, zthr_t *z)
2913{
14e4e3cb 2914 (void) z;
37f03da8
SH
2915 spa_t *spa = arg;
2916 if ((spa->spa_to_condense.ds != NULL) &&
2917 (spa->spa_to_condense.syncing == B_FALSE) &&
2918 (spa->spa_to_condense.cancelled == B_FALSE)) {
2919 return (B_TRUE);
2920 }
2921 return (B_FALSE);
2922}
2923
65c7cc49 2924static void
37f03da8
SH
2925spa_start_livelist_condensing_thread(spa_t *spa)
2926{
2927 spa->spa_to_condense.ds = NULL;
2928 spa->spa_to_condense.first = NULL;
2929 spa->spa_to_condense.next = NULL;
2930 spa->spa_to_condense.syncing = B_FALSE;
2931 spa->spa_to_condense.cancelled = B_FALSE;
2932
2933 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
843e9ca2
SD
2934 spa->spa_livelist_condense_zthr =
2935 zthr_create("z_livelist_condense",
2936 spa_livelist_condense_cb_check,
6bc61d22 2937 spa_livelist_condense_cb, spa, minclsyspri);
37f03da8
SH
2938}
2939
9d5b5245
SD
2940static void
2941spa_spawn_aux_threads(spa_t *spa)
2942{
2943 ASSERT(spa_writeable(spa));
2944
2945 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2946
2947 spa_start_indirect_condensing_thread(spa);
37f03da8
SH
2948 spa_start_livelist_destroy_thread(spa);
2949 spa_start_livelist_condensing_thread(spa);
d2734cce
SD
2950
2951 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
2952 spa->spa_checkpoint_discard_zthr =
843e9ca2
SD
2953 zthr_create("z_checkpoint_discard",
2954 spa_checkpoint_discard_thread_check,
6bc61d22 2955 spa_checkpoint_discard_thread, spa, minclsyspri);
9d5b5245
SD
2956}
2957
428870ff
BB
2958/*
2959 * Fix up config after a partly-completed split. This is done with the
2960 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
2961 * pool have that entry in their config, but only the splitting one contains
2962 * a list of all the guids of the vdevs that are being split off.
2963 *
2964 * This function determines what to do with that list: either rejoin
2965 * all the disks to the pool, or complete the splitting process. To attempt
2966 * the rejoin, each disk that is offlined is marked online again, and
2967 * we do a reopen() call. If the vdev label for every disk that was
2968 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2969 * then we call vdev_split() on each disk, and complete the split.
2970 *
2971 * Otherwise we leave the config alone, with all the vdevs in place in
2972 * the original pool.
2973 */
2974static void
2975spa_try_repair(spa_t *spa, nvlist_t *config)
2976{
2977 uint_t extracted;
2978 uint64_t *glist;
2979 uint_t i, gcount;
2980 nvlist_t *nvl;
2981 vdev_t **vd;
2982 boolean_t attempt_reopen;
2983
2984 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2985 return;
2986
2987 /* check that the config is complete */
2988 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2989 &glist, &gcount) != 0)
2990 return;
2991
79c76d5b 2992 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
428870ff
BB
2993
2994 /* attempt to online all the vdevs & validate */
2995 attempt_reopen = B_TRUE;
2996 for (i = 0; i < gcount; i++) {
2997 if (glist[i] == 0) /* vdev is hole */
2998 continue;
2999
3000 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
3001 if (vd[i] == NULL) {
3002 /*
3003 * Don't bother attempting to reopen the disks;
3004 * just do the split.
3005 */
3006 attempt_reopen = B_FALSE;
3007 } else {
3008 /* attempt to re-online it */
3009 vd[i]->vdev_offline = B_FALSE;
3010 }
3011 }
3012
3013 if (attempt_reopen) {
3014 vdev_reopen(spa->spa_root_vdev);
3015
3016 /* check each device to see what state it's in */
3017 for (extracted = 0, i = 0; i < gcount; i++) {
3018 if (vd[i] != NULL &&
3019 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
3020 break;
3021 ++extracted;
3022 }
3023 }
3024
3025 /*
3026 * If every disk has been moved to the new pool, or if we never
3027 * even attempted to look at them, then we split them off for
3028 * good.
3029 */
3030 if (!attempt_reopen || gcount == extracted) {
3031 for (i = 0; i < gcount; i++)
3032 if (vd[i] != NULL)
3033 vdev_split(vd[i]);
3034 vdev_reopen(spa->spa_root_vdev);
3035 }
3036
3037 kmem_free(vd, gcount * sizeof (vdev_t *));
3038}
3039
3040static int
6cb8e530 3041spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
428870ff 3042{
a926aab9 3043 const char *ereport = FM_EREPORT_ZFS_POOL;
428870ff 3044 int error;
428870ff 3045
6cb8e530 3046 spa->spa_load_state = state;
ca95f70d
OF
3047 (void) spa_import_progress_set_state(spa_guid(spa),
3048 spa_load_state(spa));
9ae529ec 3049
6cb8e530 3050 gethrestime(&spa->spa_loaded_ts);
d2734cce 3051 error = spa_load_impl(spa, type, &ereport);
428870ff 3052
0c66c32d
JG
3053 /*
3054 * Don't count references from objsets that are already closed
3055 * and are making their way through the eviction process.
3056 */
3057 spa_evicting_os_wait(spa);
424fd7c3 3058 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
572e2857
BB
3059 if (error) {
3060 if (error != EEXIST) {
3061 spa->spa_loaded_ts.tv_sec = 0;
3062 spa->spa_loaded_ts.tv_nsec = 0;
3063 }
3064 if (error != EBADF) {
1144586b 3065 (void) zfs_ereport_post(ereport, spa,
4f072827 3066 NULL, NULL, NULL, 0);
572e2857
BB
3067 }
3068 }
428870ff
BB
3069 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
3070 spa->spa_ena = 0;
3071
ca95f70d
OF
3072 (void) spa_import_progress_set_state(spa_guid(spa),
3073 spa_load_state(spa));
3074
428870ff
BB
3075 return (error);
3076}
3077
33cf67cd 3078#ifdef ZFS_DEBUG
e0ab3ab5
JS
3079/*
3080 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
3081 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
3082 * spa's per-vdev ZAP list.
3083 */
3084static uint64_t
3085vdev_count_verify_zaps(vdev_t *vd)
3086{
3087 spa_t *spa = vd->vdev_spa;
3088 uint64_t total = 0;
e0ab3ab5 3089
3e4ed421
RW
3090 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) &&
3091 vd->vdev_root_zap != 0) {
3092 total++;
3093 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3094 spa->spa_all_vdev_zaps, vd->vdev_root_zap));
3095 }
e0ab3ab5
JS
3096 if (vd->vdev_top_zap != 0) {
3097 total++;
3098 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3099 spa->spa_all_vdev_zaps, vd->vdev_top_zap));
3100 }
3101 if (vd->vdev_leaf_zap != 0) {
3102 total++;
3103 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3104 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
3105 }
3106
1c27024e 3107 for (uint64_t i = 0; i < vd->vdev_children; i++) {
e0ab3ab5
JS
3108 total += vdev_count_verify_zaps(vd->vdev_child[i]);
3109 }
3110
3111 return (total);
3112}
36542b06
AZ
3113#else
3114#define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
33cf67cd 3115#endif
e0ab3ab5 3116
379ca9cf
OF
3117/*
3118 * Determine whether the activity check is required.
3119 */
3120static boolean_t
bbffb59e
BB
3121spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
3122 nvlist_t *config)
379ca9cf
OF
3123{
3124 uint64_t state = 0;
3125 uint64_t hostid = 0;
3126 uint64_t tryconfig_txg = 0;
3127 uint64_t tryconfig_timestamp = 0;
060f0226 3128 uint16_t tryconfig_mmp_seq = 0;
379ca9cf
OF
3129 nvlist_t *nvinfo;
3130
3131 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
3132 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
3133 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
3134 &tryconfig_txg);
3135 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3136 &tryconfig_timestamp);
060f0226
OF
3137 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
3138 &tryconfig_mmp_seq);
379ca9cf
OF
3139 }
3140
3141 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
379ca9cf
OF
3142
3143 /*
3144 * Disable the MMP activity check - This is used by zdb which
3145 * is intended to be used on potentially active pools.
3146 */
3147 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
3148 return (B_FALSE);
3149
3150 /*
3151 * Skip the activity check when the MMP feature is disabled.
3152 */
3153 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
3154 return (B_FALSE);
ca95f70d 3155
379ca9cf 3156 /*
060f0226
OF
3157 * If the tryconfig_ values are nonzero, they are the results of an
3158 * earlier tryimport. If they all match the uberblock we just found,
3159 * then the pool has not changed and we return false so we do not test
3160 * a second time.
379ca9cf
OF
3161 */
3162 if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
060f0226
OF
3163 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
3164 tryconfig_mmp_seq && tryconfig_mmp_seq ==
3165 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
379ca9cf
OF
3166 return (B_FALSE);
3167
3168 /*
3169 * Allow the activity check to be skipped when importing the pool
bbffb59e
BB
3170 * on the same host which last imported it. Since the hostid from
3171 * configuration may be stale use the one read from the label.
379ca9cf 3172 */
bbffb59e
BB
3173 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
3174 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
3175
25f06d67 3176 if (hostid == spa_get_hostid(spa))
379ca9cf
OF
3177 return (B_FALSE);
3178
3179 /*
3180 * Skip the activity test when the pool was cleanly exported.
3181 */
3182 if (state != POOL_STATE_ACTIVE)
3183 return (B_FALSE);
3184
3185 return (B_TRUE);
3186}
3187
060f0226
OF
3188/*
3189 * Nanoseconds the activity check must watch for changes on-disk.
3190 */
3191static uint64_t
3192spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
3193{
3194 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
3195 uint64_t multihost_interval = MSEC2NSEC(
3196 MMP_INTERVAL_OK(zfs_multihost_interval));
3197 uint64_t import_delay = MAX(NANOSEC, import_intervals *
3198 multihost_interval);
3199
3200 /*
3201 * Local tunables determine a minimum duration except for the case
3202 * where we know when the remote host will suspend the pool if MMP
3203 * writes do not land.
3204 *
3205 * See Big Theory comment at the top of mmp.c for the reasoning behind
3206 * these cases and times.
3207 */
3208
3209 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
3210
3211 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
3212 MMP_FAIL_INT(ub) > 0) {
3213
3214 /* MMP on remote host will suspend pool after failed writes */
3215 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
3216 MMP_IMPORT_SAFETY_FACTOR / 100;
3217
3218 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
3219 "mmp_fails=%llu ub_mmp mmp_interval=%llu "
8e739b2c
RE
3220 "import_intervals=%llu", (u_longlong_t)import_delay,
3221 (u_longlong_t)MMP_FAIL_INT(ub),
3222 (u_longlong_t)MMP_INTERVAL(ub),
3223 (u_longlong_t)import_intervals);
060f0226
OF
3224
3225 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
3226 MMP_FAIL_INT(ub) == 0) {
3227
3228 /* MMP on remote host will never suspend pool */
3229 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
3230 ub->ub_mmp_delay) * import_intervals);
3231
3232 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
3233 "mmp_interval=%llu ub_mmp_delay=%llu "
8e739b2c
RE
3234 "import_intervals=%llu", (u_longlong_t)import_delay,
3235 (u_longlong_t)MMP_INTERVAL(ub),
3236 (u_longlong_t)ub->ub_mmp_delay,
3237 (u_longlong_t)import_intervals);
060f0226
OF
3238
3239 } else if (MMP_VALID(ub)) {
3240 /*
e1cfd73f 3241 * zfs-0.7 compatibility case
060f0226
OF
3242 */
3243
3244 import_delay = MAX(import_delay, (multihost_interval +
3245 ub->ub_mmp_delay) * import_intervals);
3246
3247 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
8e739b2c
RE
3248 "import_intervals=%llu leaves=%u",
3249 (u_longlong_t)import_delay,
3250 (u_longlong_t)ub->ub_mmp_delay,
3251 (u_longlong_t)import_intervals,
060f0226
OF
3252 vdev_count_leaves(spa));
3253 } else {
3254 /* Using local tunings is the only reasonable option */
3255 zfs_dbgmsg("pool last imported on non-MMP aware "
3256 "host using import_delay=%llu multihost_interval=%llu "
8e739b2c
RE
3257 "import_intervals=%llu", (u_longlong_t)import_delay,
3258 (u_longlong_t)multihost_interval,
3259 (u_longlong_t)import_intervals);
060f0226
OF
3260 }
3261
3262 return (import_delay);
3263}
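/*
 * Worked example (illustrative values): in the final fallback branch above,
 * where the remote uberblock carries no usable MMP timing, a multihost
 * interval of 1000 ms and zfs_multihost_import_intervals = 20 give
 * MAX(NANOSEC, 20 * 1 s) = 20 seconds, before the caller adds its random
 * 0-25% factor.
 */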
3264
379ca9cf
OF
3265/*
3266 * Perform the import activity check. If the user canceled the import or
3267 * we detected activity then fail.
3268 */
3269static int
3270spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
3271{
379ca9cf
OF
3272 uint64_t txg = ub->ub_txg;
3273 uint64_t timestamp = ub->ub_timestamp;
060f0226
OF
3274 uint64_t mmp_config = ub->ub_mmp_config;
3275 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
3276 uint64_t import_delay;
379ca9cf
OF
3277 hrtime_t import_expire;
3278 nvlist_t *mmp_label = NULL;
3279 vdev_t *rvd = spa->spa_root_vdev;
3280 kcondvar_t cv;
3281 kmutex_t mtx;
3282 int error = 0;
3283
3284 cv_init(&cv, NULL, CV_DEFAULT, NULL);
3285 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
3286 mutex_enter(&mtx);
3287
3288 /*
3289 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
3290 * during the earlier tryimport. If the txg recorded there is 0 then
3291 * the pool is known to be active on another host.
3292 *
060f0226 3293 * Otherwise, the pool might be in use on another host. Check for
379ca9cf
OF
3294 * changes in the uberblocks on disk if necessary.
3295 */
3296 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
3297 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
3298 ZPOOL_CONFIG_LOAD_INFO);
3299
3300 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
3301 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
3302 vdev_uberblock_load(rvd, ub, &mmp_label);
3303 error = SET_ERROR(EREMOTEIO);
3304 goto out;
3305 }
3306 }
3307
060f0226 3308 import_delay = spa_activity_check_duration(spa, ub);
533ea041 3309
379ca9cf 3310 /* Add a small random factor in case of simultaneous imports (0-25%) */
29274c9f 3311 import_delay += import_delay * random_in_range(250) / 1000;
ca95f70d
OF
3312
3313 import_expire = gethrtime() + import_delay;
379ca9cf
OF
3314
3315 while (gethrtime() < import_expire) {
ca95f70d
OF
3316 (void) spa_import_progress_set_mmp_check(spa_guid(spa),
3317 NSEC2SEC(import_expire - gethrtime()));
3318
379ca9cf
OF
3319 vdev_uberblock_load(rvd, ub, &mmp_label);
3320
060f0226
OF
3321 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
3322 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
3323 zfs_dbgmsg("multihost activity detected "
3324 "txg %llu ub_txg %llu "
3325 "timestamp %llu ub_timestamp %llu "
3326 "mmp_config %#llx ub_mmp_config %#llx",
8e739b2c
RE
3327 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
3328 (u_longlong_t)timestamp,
3329 (u_longlong_t)ub->ub_timestamp,
3330 (u_longlong_t)mmp_config,
3331 (u_longlong_t)ub->ub_mmp_config);
060f0226 3332
379ca9cf
OF
3333 error = SET_ERROR(EREMOTEIO);
3334 break;
3335 }
3336
3337 if (mmp_label) {
3338 nvlist_free(mmp_label);
3339 mmp_label = NULL;
3340 }
3341
3342 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
3343 if (error != -1) {
3344 error = SET_ERROR(EINTR);
3345 break;
3346 }
3347 error = 0;
3348 }
3349
3350out:
3351 mutex_exit(&mtx);
3352 mutex_destroy(&mtx);
3353 cv_destroy(&cv);
3354
3355 /*
3356 * If the pool is determined to be active, store the status in the
3357 * spa->spa_load_info nvlist. If the remote hostname or hostid are
3358 * available from the configuration read from disk, store them as well.
3359 * This allows 'zpool import' to generate a more useful message.
3360 *
3361 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory)
3362 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
3363 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
3364 */
3365 if (error == EREMOTEIO) {
a926aab9 3366 const char *hostname = "<unknown>";
379ca9cf
OF
3367 uint64_t hostid = 0;
3368
3369 if (mmp_label) {
3370 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
3371 hostname = fnvlist_lookup_string(mmp_label,
3372 ZPOOL_CONFIG_HOSTNAME);
3373 fnvlist_add_string(spa->spa_load_info,
3374 ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
3375 }
3376
3377 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
3378 hostid = fnvlist_lookup_uint64(mmp_label,
3379 ZPOOL_CONFIG_HOSTID);
3380 fnvlist_add_uint64(spa->spa_load_info,
3381 ZPOOL_CONFIG_MMP_HOSTID, hostid);
3382 }
3383 }
3384
3385 fnvlist_add_uint64(spa->spa_load_info,
3386 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
3387 fnvlist_add_uint64(spa->spa_load_info,
3388 ZPOOL_CONFIG_MMP_TXG, 0);
3389
3390 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
3391 }
3392
3393 if (mmp_label)
3394 nvlist_free(mmp_label);
3395
3396 return (error);
3397}
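/*
 * Editor's sketch (illustrative only, not part of the original source):
 * a userland consumer holding the spa_load_info nvlist recorded above
 * (called 'info' here as a placeholder name) could detect the
 * active-pool case roughly like this:
 *
 *	uint64_t mmp_state;
 *	if (nvlist_lookup_uint64(info, ZPOOL_CONFIG_MMP_STATE,
 *	    &mmp_state) == 0 && mmp_state == MMP_STATE_ACTIVE) {
 *		(void) printf("pool appears active on another host\n");
 *	}
 *
 * ZPOOL_CONFIG_MMP_HOSTNAME and ZPOOL_CONFIG_MMP_HOSTID, when present,
 * identify the other host.
 */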
3398
9eb7b46e 3399static int
6cb8e530
PZ
3400spa_verify_host(spa_t *spa, nvlist_t *mos_config)
3401{
3402 uint64_t hostid;
d1807f16 3403 const char *hostname;
6cb8e530
PZ
3404 uint64_t myhostid = 0;
3405
3406 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
3407 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
3408 hostname = fnvlist_lookup_string(mos_config,
3409 ZPOOL_CONFIG_HOSTNAME);
3410
3411 myhostid = zone_get_hostid(NULL);
3412
3413 if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
3414 cmn_err(CE_WARN, "pool '%s' could not be "
3415 "loaded as it was last accessed by "
3416 "another system (host: %s hostid: 0x%llx). "
a2f944a1
RM
3417 "See: https://openzfs.github.io/openzfs-docs/msg/"
3418 "ZFS-8000-EY",
6cb8e530
PZ
3419 spa_name(spa), hostname, (u_longlong_t)hostid);
3420 spa_load_failed(spa, "hostid verification failed: pool "
3421 "last accessed by host: %s (hostid: 0x%llx)",
3422 hostname, (u_longlong_t)hostid);
3423 return (SET_ERROR(EBADF));
3424 }
3425 }
3426
3427 return (0);
3428}
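/*
 * Editor's note: the hostid comparison above only rejects the pool when
 * both sides are known and disagree; a hostid of 0 in either the MOS
 * config or from zone_get_hostid() lets the load continue. Illustrative
 * values: a pool last written by hostid 0x00bab10c, opened on a host
 * whose hostid is 0x12345678, fails with EBADF and points the user at
 * the ZFS-8000-EY message.
 */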
3429
3430static int
3431spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
428870ff
BB
3432{
3433 int error = 0;
6cb8e530 3434 nvlist_t *nvtree, *nvl, *config = spa->spa_config;
1c27024e 3435 int parse;
9eb7b46e 3436 vdev_t *rvd;
6cb8e530 3437 uint64_t pool_guid;
d1807f16
RY
3438 const char *comment;
3439 const char *compatibility;
6cb8e530
PZ
3440
3441 /*
3442 * Versioning wasn't explicitly added to the label until later, so if
3443 * it's not present treat it as the initial version.
3444 */
3445 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
3446 &spa->spa_ubsync.ub_version) != 0)
3447 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
3448
3449 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
3450 spa_load_failed(spa, "invalid config provided: '%s' missing",
3451 ZPOOL_CONFIG_POOL_GUID);
3452 return (SET_ERROR(EINVAL));
3453 }
3454
d2734cce
SD
3455 /*
3456 * If we are doing an import, ensure that the pool is not already
3457 * imported by checking if its pool guid already exists in the
3458 * spa namespace.
3459 *
3460 * The only case that we allow an already imported pool to be
3461 * imported again, is when the pool is checkpointed and we want to
3462 * look at its checkpointed state from userland tools like zdb.
3463 */
3464#ifdef _KERNEL
3465 if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
3466 spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
3467 spa_guid_exists(pool_guid, 0)) {
3468#else
3469 if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
3470 spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
3471 spa_guid_exists(pool_guid, 0) &&
3472 !spa_importing_readonly_checkpoint(spa)) {
3473#endif
6cb8e530
PZ
3474 spa_load_failed(spa, "a pool with guid %llu is already open",
3475 (u_longlong_t)pool_guid);
3476 return (SET_ERROR(EEXIST));
3477 }
3478
3479 spa->spa_config_guid = pool_guid;
3480
3481 nvlist_free(spa->spa_load_info);
3482 spa->spa_load_info = fnvlist_alloc();
3483
3484 ASSERT(spa->spa_comment == NULL);
3485 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
3486 spa->spa_comment = spa_strdup(comment);
3487
658fb802
CB
3488 ASSERT(spa->spa_compatibility == NULL);
3489 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
3490 &compatibility) == 0)
3491 spa->spa_compatibility = spa_strdup(compatibility);
3492
6cb8e530
PZ
3493 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
3494 &spa->spa_config_txg);
3495
3496 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
3497 spa->spa_config_splitting = fnvlist_dup(nvl);
428870ff 3498
4a0ee12a
PZ
3499 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
3500 spa_load_failed(spa, "invalid config provided: '%s' missing",
3501 ZPOOL_CONFIG_VDEV_TREE);
2e528b49 3502 return (SET_ERROR(EINVAL));
4a0ee12a 3503 }
428870ff 3504
428870ff
BB
3505 /*
3506 * Create "The Godfather" zio to hold all async IOs
3507 */
e022864d
MA
3508 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
3509 KM_SLEEP);
1c27024e 3510 for (int i = 0; i < max_ncpus; i++) {
e022864d
MA
3511 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
3512 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
3513 ZIO_FLAG_GODFATHER);
3514 }
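	/*
	 * Editor's note: one root ("godfather") zio is allocated per CPU
	 * rather than a single one for the whole pool, presumably so that
	 * parenting of async I/Os is spread across CPUs instead of
	 * funneling every async zio through a single root zio's lock.
	 */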
428870ff
BB
3515
3516 /*
3517 * Parse the configuration into a vdev tree. We explicitly set the
3518 * value that will be returned by spa_version() since parsing the
3519 * configuration requires knowing the version number.
3520 */
3521 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6cb8e530
PZ
3522 parse = (type == SPA_IMPORT_EXISTING ?
3523 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
9eb7b46e 3524 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
428870ff
BB
3525 spa_config_exit(spa, SCL_ALL, FTAG);
3526
4a0ee12a
PZ
3527 if (error != 0) {
3528 spa_load_failed(spa, "unable to parse config [error=%d]",
3529 error);
428870ff 3530 return (error);
4a0ee12a 3531 }
428870ff
BB
3532
3533 ASSERT(spa->spa_root_vdev == rvd);
c3520e7f
MA
3534 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
3535 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
428870ff
BB
3536
3537 if (type != SPA_IMPORT_ASSEMBLE) {
3538 ASSERT(spa_guid(spa) == pool_guid);
3539 }
3540
9eb7b46e
PZ
3541 return (0);
3542}
3543
6cb8e530
PZ
3544/*
3545 * Recursively open all vdevs in the vdev tree. This function is called twice:
3546 * first with the untrusted config, then with the trusted config.
3547 */
9eb7b46e
PZ
3548static int
3549spa_ld_open_vdevs(spa_t *spa)
3550{
3551 int error = 0;
3552
6cb8e530
PZ
3553 /*
3554 * spa_missing_tvds_allowed defines how many top-level vdevs can be
3555	 * missing/unopenable for the root vdev to still be considered openable.
3556 */
3557 if (spa->spa_trust_config) {
3558 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
3559 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
3560 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
3561 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
3562 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
3563 } else {
3564 spa->spa_missing_tvds_allowed = 0;
3565 }
3566
3567 spa->spa_missing_tvds_allowed =
3568 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
3569
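	/*
	 * Editor's summary of the selection above (derived directly from
	 * the if/else chain):
	 *
	 *	trusted config (from MOS)     -> zfs_max_missing_tvds
	 *	untrusted, from the cachefile -> zfs_max_missing_tvds_cachefile
	 *	untrusted, from a device scan -> zfs_max_missing_tvds_scan
	 *	any other source              -> 0
	 *
	 * and the final value is never lower than zfs_max_missing_tvds.
	 */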
428870ff 3570 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
9eb7b46e 3571 error = vdev_open(spa->spa_root_vdev);
428870ff 3572 spa_config_exit(spa, SCL_ALL, FTAG);
6cb8e530
PZ
3573
3574 if (spa->spa_missing_tvds != 0) {
3575 spa_load_note(spa, "vdev tree has %lld missing top-level "
3576 "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
da92d5cb 3577 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
6cb8e530
PZ
3578 /*
3579 * Although theoretically we could allow users to open
3580 * incomplete pools in RW mode, we'd need to add a lot
3581 * of extra logic (e.g. adjust pool space to account
3582 * for missing vdevs).
3583 * This limitation also prevents users from accidentally
3584 * opening the pool in RW mode during data recovery and
3585 * damaging it further.
3586 */
3587 spa_load_note(spa, "pools with missing top-level "
3588 "vdevs can only be opened in read-only mode.");
3589 error = SET_ERROR(ENXIO);
3590 } else {
3591 spa_load_note(spa, "current settings allow for maximum "
3592 "%lld missing top-level vdevs at this stage.",
3593 (u_longlong_t)spa->spa_missing_tvds_allowed);
3594 }
3595 }
4a0ee12a
PZ
3596 if (error != 0) {
3597 spa_load_failed(spa, "unable to open vdev tree [error=%d]",
3598 error);
3599 }
6cb8e530
PZ
3600 if (spa->spa_missing_tvds != 0 || error != 0)
3601 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
9eb7b46e
PZ
3602
3603 return (error);
3604}
3605
6cb8e530
PZ
3606/*
3607 * We need to validate the vdev labels against the configuration that
3608 * we have in hand. This function is called twice: first with an untrusted
3609 * config, then with a trusted config. The validation is more strict when the
3610 * config is trusted.
3611 */
9eb7b46e 3612static int
6cb8e530 3613spa_ld_validate_vdevs(spa_t *spa)
9eb7b46e
PZ
3614{
3615 int error = 0;
3616 vdev_t *rvd = spa->spa_root_vdev;
428870ff 3617
6cb8e530
PZ
3618 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3619 error = vdev_validate(rvd);
3620 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff 3621
6cb8e530
PZ
3622 if (error != 0) {
3623 spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
3624 return (error);
3625 }
428870ff 3626
6cb8e530
PZ
3627 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
3628 spa_load_failed(spa, "cannot open vdev tree after invalidating "
3629 "some vdevs");
3630 vdev_dbgmsg_print_tree(rvd, 2);
3631 return (SET_ERROR(ENXIO));
428870ff
BB
3632 }
3633
9eb7b46e
PZ
3634 return (0);
3635}
3636
d2734cce
SD
3637static void
3638spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
3639{
3640 spa->spa_state = POOL_STATE_ACTIVE;
3641 spa->spa_ubsync = spa->spa_uberblock;
3642 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
3643 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
3644 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
3645 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
3646 spa->spa_claim_max_txg = spa->spa_first_txg;
3647 spa->spa_prev_software_version = ub->ub_software_version;
3648}
3649
9eb7b46e 3650static int
6cb8e530 3651spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
9eb7b46e
PZ
3652{
3653 vdev_t *rvd = spa->spa_root_vdev;
3654 nvlist_t *label;
3655 uberblock_t *ub = &spa->spa_uberblock;
9eb7b46e
PZ
3656 boolean_t activity_check = B_FALSE;
3657
d2734cce
SD
3658 /*
3659 * If we are opening the checkpointed state of the pool by
3660 * rewinding to it, at this point we will have written the
3661 * checkpointed uberblock to the vdev labels, so searching
3662 * the labels will find the right uberblock. However, if
3663 * we are opening the checkpointed state read-only, we have
3664 * not modified the labels. Therefore, we must ignore the
3665 * labels and continue using the spa_uberblock that was set
3666 * by spa_ld_checkpoint_rewind.
3667 *
3668 * Note that it would be fine to ignore the labels when
3669 * rewinding (opening writeable) as well. However, if we
3670 * crash just after writing the labels, we will end up
3671 * searching the labels. Doing so in the common case means
3672 * that this code path gets exercised normally, rather than
3673 * just in the edge case.
3674 */
3675 if (ub->ub_checkpoint_txg != 0 &&
3676 spa_importing_readonly_checkpoint(spa)) {
3677 spa_ld_select_uberblock_done(spa, ub);
3678 return (0);
3679 }
3680
428870ff
BB
3681 /*
3682 * Find the best uberblock.
3683 */
9ae529ec 3684 vdev_uberblock_load(rvd, ub, &label);
428870ff
BB
3685
3686 /*
3687 * If we weren't able to find a single valid uberblock, return failure.
3688 */
9ae529ec
CS
3689 if (ub->ub_txg == 0) {
3690 nvlist_free(label);
4a0ee12a 3691 spa_load_failed(spa, "no valid uberblock found");
428870ff 3692 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
9ae529ec 3693 }
428870ff 3694
ca95f70d
OF
3695 if (spa->spa_load_max_txg != UINT64_MAX) {
3696 (void) spa_import_progress_set_max_txg(spa_guid(spa),
3697 (u_longlong_t)spa->spa_load_max_txg);
3698 }
4a0ee12a
PZ
3699 spa_load_note(spa, "using uberblock with txg=%llu",
3700 (u_longlong_t)ub->ub_txg);
3701
3702
379ca9cf
OF
3703 /*
3704	 * For pools which have the multihost property enabled, determine if the
3705 * pool is truly inactive and can be safely imported. Prevent
3706 * hosts which don't have a hostid set from importing the pool.
3707 */
6cb8e530
PZ
3708 activity_check = spa_activity_check_required(spa, ub, label,
3709 spa->spa_config);
379ca9cf 3710 if (activity_check) {
379ca9cf 3711 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
25f06d67 3712 spa_get_hostid(spa) == 0) {
379ca9cf
OF
3713 nvlist_free(label);
3714 fnvlist_add_uint64(spa->spa_load_info,
3715 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
3716 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
3717 }
3718
6cb8e530 3719 int error = spa_activity_check(spa, ub, spa->spa_config);
e889f0f5
OF
3720 if (error) {
3721 nvlist_free(label);
3722 return (error);
3723 }
3724
379ca9cf
OF
3725 fnvlist_add_uint64(spa->spa_load_info,
3726 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
3727 fnvlist_add_uint64(spa->spa_load_info,
3728 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
060f0226
OF
3729 fnvlist_add_uint16(spa->spa_load_info,
3730 ZPOOL_CONFIG_MMP_SEQ,
3731 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
379ca9cf
OF
3732 }
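	/*
	 * Editor's note: the MMP_STATE/MMP_TXG/MMP_SEQ values stored in
	 * spa_load_info here are consulted again on a subsequent import:
	 * spa_activity_check() looks for them under ZPOOL_CONFIG_LOAD_INFO
	 * in the provided config (recorded, per its comment, during the
	 * earlier tryimport), and an MMP_TXG of 0 - set only in the
	 * active-pool path - marks the pool as already known active.
	 */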
3733
428870ff 3734 /*
9ae529ec 3735 * If the pool has an unsupported version we can't open it.
428870ff 3736 */
9ae529ec
CS
3737 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
3738 nvlist_free(label);
4a0ee12a
PZ
3739 spa_load_failed(spa, "version %llu is not supported",
3740 (u_longlong_t)ub->ub_version);
428870ff 3741 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
9ae529ec
CS
3742 }
3743
3744 if (ub->ub_version >= SPA_VERSION_FEATURES) {
3745 nvlist_t *features;
3746
3747 /*
3748 * If we weren't able to find what's necessary for reading the
3749 * MOS in the label, return failure.
3750 */
4a0ee12a
PZ
3751 if (label == NULL) {
3752 spa_load_failed(spa, "label config unavailable");
3753 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3754 ENXIO));
3755 }
3756
3757 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
3758 &features) != 0) {
9ae529ec 3759 nvlist_free(label);
4a0ee12a
PZ
3760 spa_load_failed(spa, "invalid label: '%s' missing",
3761 ZPOOL_CONFIG_FEATURES_FOR_READ);
9ae529ec
CS
3762 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3763 ENXIO));
3764 }
3765
3766 /*
3767 * Update our in-core representation with the definitive values
3768 * from the label.
3769 */
3770 nvlist_free(spa->spa_label_features);
65ad5d11 3771 spa->spa_label_features = fnvlist_dup(features);
9ae529ec
CS
3772 }
3773
3774 nvlist_free(label);
3775
3776 /*
3777 * Look through entries in the label nvlist's features_for_read. If
3778 * there is a feature listed there which we don't understand then we
3779 * cannot open a pool.
3780 */
3781 if (ub->ub_version >= SPA_VERSION_FEATURES) {
3782 nvlist_t *unsup_feat;
9ae529ec 3783
65ad5d11 3784 unsup_feat = fnvlist_alloc();
9ae529ec 3785
1c27024e
DB
3786 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
3787 NULL); nvp != NULL;
9ae529ec
CS
3788 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
3789 if (!zfeature_is_supported(nvpair_name(nvp))) {
65ad5d11
AJ
3790 fnvlist_add_string(unsup_feat,
3791 nvpair_name(nvp), "");
9ae529ec
CS
3792 }
3793 }
3794
3795 if (!nvlist_empty(unsup_feat)) {
65ad5d11
AJ
3796 fnvlist_add_nvlist(spa->spa_load_info,
3797 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
9ae529ec 3798 nvlist_free(unsup_feat);
4a0ee12a 3799 spa_load_failed(spa, "some features are unsupported");
9ae529ec
CS
3800 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
3801 ENOTSUP));
3802 }
3803
3804 nvlist_free(unsup_feat);
3805 }
428870ff 3806
428870ff
BB
3807 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
3808 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6cb8e530 3809 spa_try_repair(spa, spa->spa_config);
428870ff
BB
3810 spa_config_exit(spa, SCL_ALL, FTAG);
3811 nvlist_free(spa->spa_config_splitting);
3812 spa->spa_config_splitting = NULL;
3813 }
3814
3815 /*
3816 * Initialize internal SPA structures.
3817 */
d2734cce 3818 spa_ld_select_uberblock_done(spa, ub);
428870ff 3819
9eb7b46e
PZ
3820 return (0);
3821}
3822
3823static int
3824spa_ld_open_rootbp(spa_t *spa)
3825{
3826 int error = 0;
3827 vdev_t *rvd = spa->spa_root_vdev;
a1d477c2 3828
9ae529ec 3829 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
4a0ee12a
PZ
3830 if (error != 0) {
3831 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
3832 "[error=%d]", error);
428870ff 3833 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 3834 }
428870ff
BB
3835 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
3836
9eb7b46e
PZ
3837 return (0);
3838}
3839
3840static int
d2734cce 3841spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
6cb8e530 3842 boolean_t reloading)
9eb7b46e 3843{
6cb8e530
PZ
3844 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
3845 nvlist_t *nv, *mos_config, *policy;
3846 int error = 0, copy_error;
3847 uint64_t healthy_tvds, healthy_tvds_mos;
3848 uint64_t mos_config_txg;
9eb7b46e 3849
4a0ee12a
PZ
3850 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
3851 != 0)
428870ff
BB
3852 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3853
a1d477c2 3854 /*
6cb8e530
PZ
3855 * If we're assembling a pool from a split, the config provided is
3856 * already trusted so there is nothing to do.
a1d477c2 3857 */
6cb8e530
PZ
3858 if (type == SPA_IMPORT_ASSEMBLE)
3859 return (0);
3860
3861 healthy_tvds = spa_healthy_core_tvds(spa);
a1d477c2 3862
6cb8e530
PZ
3863 if (load_nvlist(spa, spa->spa_config_object, &mos_config)
3864 != 0) {
3865 spa_load_failed(spa, "unable to retrieve MOS config");
3866 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3867 }
3868
3869 /*
3870	 * If we are doing an open, the pool owner wasn't verified yet, so do
3871 * the verification here.
3872 */
3873 if (spa->spa_load_state == SPA_LOAD_OPEN) {
3874 error = spa_verify_host(spa, mos_config);
3875 if (error != 0) {
a1d477c2 3876 nvlist_free(mos_config);
6cb8e530 3877 return (error);
a1d477c2 3878 }
6cb8e530
PZ
3879 }
3880
3881 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
a1d477c2 3882
6cb8e530
PZ
3883 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3884
3885 /*
3886 * Build a new vdev tree from the trusted config
3887 */
b2255edc
BB
3888 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
3889 if (error != 0) {
3890 nvlist_free(mos_config);
3891 spa_config_exit(spa, SCL_ALL, FTAG);
3892 spa_load_failed(spa, "spa_config_parse failed [error=%d]",
3893 error);
3894 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3895 }
6cb8e530
PZ
3896
3897 /*
3898 * Vdev paths in the MOS may be obsolete. If the untrusted config was
3899 * obtained by scanning /dev/dsk, then it will have the right vdev
3900 * paths. We update the trusted MOS config with this information.
3901 * We first try to copy the paths with vdev_copy_path_strict, which
3902 * succeeds only when both configs have exactly the same vdev tree.
3903 * If that fails, we fall back to a more flexible method that has a
3904 * best effort policy.
3905 */
3906 copy_error = vdev_copy_path_strict(rvd, mrvd);
3907 if (copy_error != 0 || spa_load_print_vdev_tree) {
3908 spa_load_note(spa, "provided vdev tree:");
3909 vdev_dbgmsg_print_tree(rvd, 2);
3910 spa_load_note(spa, "MOS vdev tree:");
3911 vdev_dbgmsg_print_tree(mrvd, 2);
3912 }
3913 if (copy_error != 0) {
3914 spa_load_note(spa, "vdev_copy_path_strict failed, falling "
3915 "back to vdev_copy_path_relaxed");
3916 vdev_copy_path_relaxed(rvd, mrvd);
3917 }
3918
3919 vdev_close(rvd);
3920 vdev_free(rvd);
3921 spa->spa_root_vdev = mrvd;
3922 rvd = mrvd;
3923 spa_config_exit(spa, SCL_ALL, FTAG);
3924
3925 /*
3926 * We will use spa_config if we decide to reload the spa or if spa_load
3927 * fails and we rewind. We must thus regenerate the config using the
8a393be3
PZ
3928 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
3929 * pass settings on how to load the pool and is not stored in the MOS.
3930 * We copy it over to our new, trusted config.
6cb8e530
PZ
3931 */
3932 mos_config_txg = fnvlist_lookup_uint64(mos_config,
3933 ZPOOL_CONFIG_POOL_TXG);
3934 nvlist_free(mos_config);
3935 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
8a393be3 3936 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
6cb8e530 3937 &policy) == 0)
8a393be3 3938 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
6cb8e530
PZ
3939 spa_config_set(spa, mos_config);
3940 spa->spa_config_source = SPA_CONFIG_SRC_MOS;
3941
3942 /*
3943	 * Now that we have the config from the MOS, we should be more strict
3944 * in checking blkptrs and can make assumptions about the consistency
3945 * of the vdev tree. spa_trust_config must be set to true before opening
3946 * vdevs in order for them to be writeable.
3947 */
3948 spa->spa_trust_config = B_TRUE;
3949
3950 /*
3951 * Open and validate the new vdev tree
3952 */
3953 error = spa_ld_open_vdevs(spa);
3954 if (error != 0)
3955 return (error);
3956
3957 error = spa_ld_validate_vdevs(spa);
3958 if (error != 0)
3959 return (error);
3960
3961 if (copy_error != 0 || spa_load_print_vdev_tree) {
3962 spa_load_note(spa, "final vdev tree:");
3963 vdev_dbgmsg_print_tree(rvd, 2);
3964 }
3965
3966 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
3967 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
a1d477c2 3968 /*
6cb8e530
PZ
3969 * Sanity check to make sure that we are indeed loading the
3970 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
3971 * in the config provided and they happened to be the only ones
3972 * to have the latest uberblock, we could involuntarily perform
3973 * an extreme rewind.
a1d477c2 3974 */
6cb8e530
PZ
3975 healthy_tvds_mos = spa_healthy_core_tvds(spa);
3976 if (healthy_tvds_mos - healthy_tvds >=
3977 SPA_SYNC_MIN_VDEVS) {
3978 spa_load_note(spa, "config provided misses too many "
3979 "top-level vdevs compared to MOS (%lld vs %lld). ",
3980 (u_longlong_t)healthy_tvds,
3981 (u_longlong_t)healthy_tvds_mos);
3982 spa_load_note(spa, "vdev tree:");
3983 vdev_dbgmsg_print_tree(rvd, 2);
3984 if (reloading) {
3985 spa_load_failed(spa, "config was already "
3986 "provided from MOS. Aborting.");
3987 return (spa_vdev_err(rvd,
3988 VDEV_AUX_CORRUPT_DATA, EIO));
3989 }
3990 spa_load_note(spa, "spa must be reloaded using MOS "
3991 "config");
3992 return (SET_ERROR(EAGAIN));
4a0ee12a 3993 }
a1d477c2
MA
3994 }
3995
6cb8e530
PZ
3996 error = spa_check_for_missing_logs(spa);
3997 if (error != 0)
3998 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
3999
4000 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
4001 spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
4002 "guid sum (%llu != %llu)",
4003 (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
4004 (u_longlong_t)rvd->vdev_guid_sum);
4005 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
4006 ENXIO));
4007 }
4008
9eb7b46e
PZ
4009 return (0);
4010}
4011
4012static int
4013spa_ld_open_indirect_vdev_metadata(spa_t *spa)
4014{
4015 int error = 0;
4016 vdev_t *rvd = spa->spa_root_vdev;
4017
a1d477c2
MA
4018 /*
4019 * Everything that we read before spa_remove_init() must be stored
4020	 * on concrete vdevs. Therefore we do this as early as possible.
4021 */
4a0ee12a
PZ
4022 error = spa_remove_init(spa);
4023 if (error != 0) {
4024 spa_load_failed(spa, "spa_remove_init failed [error=%d]",
4025 error);
a1d477c2 4026 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4027 }
a1d477c2 4028
9eb7b46e
PZ
4029 /*
4030 * Retrieve information needed to condense indirect vdev mappings.
4031 */
4032 error = spa_condense_init(spa);
4033 if (error != 0) {
4a0ee12a
PZ
4034 spa_load_failed(spa, "spa_condense_init failed [error=%d]",
4035 error);
9eb7b46e
PZ
4036 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4037 }
4038
4039 return (0);
4040}
4041
4042static int
4a0ee12a 4043spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
9eb7b46e
PZ
4044{
4045 int error = 0;
4046 vdev_t *rvd = spa->spa_root_vdev;
4047
9ae529ec
CS
4048 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
4049 boolean_t missing_feat_read = B_FALSE;
b9b24bb4 4050 nvlist_t *unsup_feat, *enabled_feat;
9ae529ec
CS
4051
4052 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
4a0ee12a 4053 &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
9ae529ec
CS
4054 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4055 }
4056
4057 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
4a0ee12a 4058 &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
9ae529ec
CS
4059 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4060 }
4061
4062 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
4a0ee12a 4063 &spa->spa_feat_desc_obj, B_TRUE) != 0) {
9ae529ec
CS
4064 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4065 }
4066
b9b24bb4
CS
4067 enabled_feat = fnvlist_alloc();
4068 unsup_feat = fnvlist_alloc();
9ae529ec 4069
fa86b5db 4070 if (!spa_features_check(spa, B_FALSE,
b9b24bb4 4071 unsup_feat, enabled_feat))
9ae529ec
CS
4072 missing_feat_read = B_TRUE;
4073
4a0ee12a
PZ
4074 if (spa_writeable(spa) ||
4075 spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
fa86b5db 4076 if (!spa_features_check(spa, B_TRUE,
b9b24bb4 4077 unsup_feat, enabled_feat)) {
9eb7b46e 4078 *missing_feat_writep = B_TRUE;
b9b24bb4 4079 }
9ae529ec
CS
4080 }
4081
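		/*
		 * Editor's note: spa_features_check() is called twice above;
		 * with B_FALSE it verifies the features required for read,
		 * and with B_TRUE the features required for write. A pool
		 * that is only missing write features can still be imported
		 * read-only, which is what ZPOOL_CONFIG_CAN_RDONLY below
		 * reports back to userland.
		 */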
b9b24bb4
CS
4082 fnvlist_add_nvlist(spa->spa_load_info,
4083 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
4084
9ae529ec 4085 if (!nvlist_empty(unsup_feat)) {
b9b24bb4
CS
4086 fnvlist_add_nvlist(spa->spa_load_info,
4087 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
9ae529ec
CS
4088 }
4089
b9b24bb4
CS
4090 fnvlist_free(enabled_feat);
4091 fnvlist_free(unsup_feat);
9ae529ec
CS
4092
4093 if (!missing_feat_read) {
4094 fnvlist_add_boolean(spa->spa_load_info,
4095 ZPOOL_CONFIG_CAN_RDONLY);
4096 }
4097
4098 /*
4099 * If the state is SPA_LOAD_TRYIMPORT, our objective is
4100 * twofold: to determine whether the pool is available for
4101 * import in read-write mode and (if it is not) whether the
4102 * pool is available for import in read-only mode. If the pool
4103 * is available for import in read-write mode, it is displayed
4104 * as available in userland; if it is not available for import
4105 * in read-only mode, it is displayed as unavailable in
4106 * userland. If the pool is available for import in read-only
4107 * mode but not read-write mode, it is displayed as unavailable
4108 * in userland with a special note that the pool is actually
4109 * available for open in read-only mode.
4110 *
4111 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
4112 * missing a feature for write, we must first determine whether
4113 * the pool can be opened read-only before returning to
4114 * userland in order to know whether to display the
4115 * abovementioned note.
4116 */
9eb7b46e 4117 if (missing_feat_read || (*missing_feat_writep &&
9ae529ec 4118 spa_writeable(spa))) {
4a0ee12a 4119 spa_load_failed(spa, "pool uses unsupported features");
9ae529ec
CS
4120 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
4121 ENOTSUP));
4122 }
b0bc7a84
MG
4123
4124 /*
4125 * Load refcounts for ZFS features from disk into an in-memory
4126 * cache during SPA initialization.
4127 */
1c27024e 4128 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
b0bc7a84
MG
4129 uint64_t refcount;
4130
4131 error = feature_get_refcount_from_disk(spa,
4132 &spa_feature_table[i], &refcount);
4133 if (error == 0) {
4134 spa->spa_feat_refcount_cache[i] = refcount;
4135 } else if (error == ENOTSUP) {
4136 spa->spa_feat_refcount_cache[i] =
4137 SPA_FEATURE_DISABLED;
4138 } else {
4a0ee12a
PZ
4139 spa_load_failed(spa, "error getting refcount "
4140 "for feature %s [error=%d]",
4141 spa_feature_table[i].fi_guid, error);
b0bc7a84
MG
4142 return (spa_vdev_err(rvd,
4143 VDEV_AUX_CORRUPT_DATA, EIO));
4144 }
4145 }
4146 }
4147
4148 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
4149 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
4a0ee12a 4150 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
b0bc7a84 4151 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
9ae529ec
CS
4152 }
4153
f00ab3f2
TC
4154 /*
4155 * Encryption was added before bookmark_v2, even though bookmark_v2
4156 * is now a dependency. If this pool has encryption enabled without
4157 * bookmark_v2, trigger an errata message.
4158 */
4159 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
4160 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
4161 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
4162 }
4163
9eb7b46e
PZ
4164 return (0);
4165}
4166
4167static int
4168spa_ld_load_special_directories(spa_t *spa)
4169{
4170 int error = 0;
4171 vdev_t *rvd = spa->spa_root_vdev;
4172
9ae529ec
CS
4173 spa->spa_is_initializing = B_TRUE;
4174 error = dsl_pool_open(spa->spa_dsl_pool);
4175 spa->spa_is_initializing = B_FALSE;
4a0ee12a
PZ
4176 if (error != 0) {
4177 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
9ae529ec 4178 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4179 }
9ae529ec 4180
9eb7b46e
PZ
4181 return (0);
4182}
428870ff 4183
9eb7b46e
PZ
4184static int
4185spa_ld_get_props(spa_t *spa)
4186{
4187 int error = 0;
4188 uint64_t obj;
4189 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f 4190
3c67d83a
TH
4191 /* Grab the checksum salt from the MOS. */
4192 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4193 DMU_POOL_CHECKSUM_SALT, 1,
4194 sizeof (spa->spa_cksum_salt.zcs_bytes),
4195 spa->spa_cksum_salt.zcs_bytes);
4196 if (error == ENOENT) {
4197 /* Generate a new salt for subsequent use */
4198 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
4199 sizeof (spa->spa_cksum_salt.zcs_bytes));
4200 } else if (error != 0) {
4a0ee12a
PZ
4201 spa_load_failed(spa, "unable to retrieve checksum salt from "
4202 "MOS [error=%d]", error);
3c67d83a
TH
4203 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4204 }
4205
4a0ee12a 4206 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
428870ff
BB
4207 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4208 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
4a0ee12a
PZ
4209 if (error != 0) {
4210 spa_load_failed(spa, "error opening deferred-frees bpobj "
4211 "[error=%d]", error);
428870ff 4212 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4213 }
34dc7c2f
BB
4214
4215 /*
4216 * Load the bit that tells us to use the new accounting function
4217 * (raid-z deflation). If we have an older pool, this will not
4218 * be present.
4219 */
4a0ee12a 4220 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
428870ff
BB
4221 if (error != 0 && error != ENOENT)
4222 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4223
4224 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
4a0ee12a 4225 &spa->spa_creation_version, B_FALSE);
428870ff
BB
4226 if (error != 0 && error != ENOENT)
4227 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f
BB
4228
4229 /*
4230 * Load the persistent error log. If we have an older pool, this will
4231 * not be present.
4232 */
4a0ee12a
PZ
4233 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
4234 B_FALSE);
428870ff
BB
4235 if (error != 0 && error != ENOENT)
4236 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f 4237
428870ff 4238 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
4a0ee12a 4239 &spa->spa_errlog_scrub, B_FALSE);
428870ff
BB
4240 if (error != 0 && error != ENOENT)
4241 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f 4242
37f03da8
SH
4243 /*
4244 * Load the livelist deletion field. If a livelist is queued for
4245 * deletion, indicate that in the spa
4246 */
4247 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
4248 &spa->spa_livelists_to_delete, B_FALSE);
4249 if (error != 0 && error != ENOENT)
4250 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4251
34dc7c2f
BB
4252 /*
4253 * Load the history object. If we have an older pool, this
4254 * will not be present.
4255 */
4a0ee12a 4256 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
428870ff
BB
4257 if (error != 0 && error != ENOENT)
4258 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4259
e0ab3ab5
JS
4260 /*
4261 * Load the per-vdev ZAP map. If we have an older pool, this will not
4262 * be present; in this case, defer its creation to a later time to
4263 * avoid dirtying the MOS this early / out of sync context. See
4264 * spa_sync_config_object.
4265 */
4266
4267 /* The sentinel is only available in the MOS config. */
1c27024e 4268 nvlist_t *mos_config;
4a0ee12a
PZ
4269 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
4270 spa_load_failed(spa, "unable to retrieve MOS config");
e0ab3ab5 4271 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4272 }
e0ab3ab5
JS
4273
4274 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
4a0ee12a 4275 &spa->spa_all_vdev_zaps, B_FALSE);
e0ab3ab5 4276
38640550
DB
4277 if (error == ENOENT) {
4278 VERIFY(!nvlist_exists(mos_config,
4279 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
4280 spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
4281 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
4282 } else if (error != 0) {
cb01da68 4283 nvlist_free(mos_config);
e0ab3ab5 4284 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
38640550 4285 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
e0ab3ab5
JS
4286 /*
4287 * An older version of ZFS overwrote the sentinel value, so
4288 * we have orphaned per-vdev ZAPs in the MOS. Defer their
4289 * destruction to later; see spa_sync_config_object.
4290 */
4291 spa->spa_avz_action = AVZ_ACTION_DESTROY;
4292 /*
4293 * We're assuming that no vdevs have had their ZAPs created
4294 * before this. Better be sure of it.
4295 */
4296 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
4297 }
4298 nvlist_free(mos_config);
4299
9eb7b46e
PZ
4300 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
4301
4a0ee12a
PZ
4302 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
4303 B_FALSE);
9eb7b46e
PZ
4304 if (error && error != ENOENT)
4305 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4306
4307 if (error == 0) {
da27b8bc 4308 uint64_t autoreplace = 0;
9eb7b46e
PZ
4309
4310 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
4311 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
4312 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
4313 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
4314 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
c02c1bec 4315 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
1b939560 4316 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
9eb7b46e
PZ
4317 spa->spa_autoreplace = (autoreplace != 0);
4318 }
4319
6cb8e530
PZ
4320 /*
4321 * If we are importing a pool with missing top-level vdevs,
4322 * we enforce that the pool doesn't panic or get suspended on
4323 * error since the likelihood of missing data is extremely high.
4324 */
4325 if (spa->spa_missing_tvds > 0 &&
4326 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
4327 spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
4328 spa_load_note(spa, "forcing failmode to 'continue' "
4329 "as some top level vdevs are missing");
4330 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
4331 }
4332
9eb7b46e
PZ
4333 return (0);
4334}
4335
4336static int
4337spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
4338{
4339 int error = 0;
4340 vdev_t *rvd = spa->spa_root_vdev;
4341
428870ff
BB
4342 /*
4343 * If we're assembling the pool from the split-off vdevs of
4344 * an existing pool, we don't want to attach the spares & cache
4345 * devices.
4346 */
34dc7c2f
BB
4347
4348 /*
4349 * Load any hot spares for this pool.
4350 */
4a0ee12a
PZ
4351 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
4352 B_FALSE);
428870ff
BB
4353 if (error != 0 && error != ENOENT)
4354 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4355 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
34dc7c2f
BB
4356 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
4357 if (load_nvlist(spa, spa->spa_spares.sav_object,
4a0ee12a
PZ
4358 &spa->spa_spares.sav_config) != 0) {
4359 spa_load_failed(spa, "error loading spares nvlist");
428870ff 4360 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4361 }
34dc7c2f 4362
b128c09f 4363 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 4364 spa_load_spares(spa);
b128c09f 4365 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff
BB
4366 } else if (error == 0) {
4367 spa->spa_spares.sav_sync = B_TRUE;
34dc7c2f
BB
4368 }
4369
4370 /*
4371 * Load any level 2 ARC devices for this pool.
4372 */
428870ff 4373 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
4a0ee12a 4374 &spa->spa_l2cache.sav_object, B_FALSE);
428870ff
BB
4375 if (error != 0 && error != ENOENT)
4376 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4377 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
34dc7c2f
BB
4378 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
4379 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
4a0ee12a
PZ
4380 &spa->spa_l2cache.sav_config) != 0) {
4381 spa_load_failed(spa, "error loading l2cache nvlist");
428870ff 4382 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4383 }
34dc7c2f 4384
b128c09f 4385 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 4386 spa_load_l2cache(spa);
b128c09f 4387 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff
BB
4388 } else if (error == 0) {
4389 spa->spa_l2cache.sav_sync = B_TRUE;
b128c09f
BB
4390 }
4391
9eb7b46e
PZ
4392 return (0);
4393}
428870ff 4394
9eb7b46e 4395static int
4a0ee12a 4396spa_ld_load_vdev_metadata(spa_t *spa)
9eb7b46e
PZ
4397{
4398 int error = 0;
4399 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f 4400
379ca9cf
OF
4401 /*
4402 * If the 'multihost' property is set, then never allow a pool to
4403 * be imported when the system hostid is zero. The exception to
4404 * this rule is zdb which is always allowed to access pools.
4405 */
25f06d67 4406 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
379ca9cf
OF
4407 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
4408 fnvlist_add_uint64(spa->spa_load_info,
4409 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
4410 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
4411 }
4412
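	/*
	 * Editor's note: a hostid of 0 here simply means the system has no
	 * hostid configured; on Linux this is typically set via /etc/hostid
	 * (see zgenhostid(8)) or the spl_hostid module parameter.
	 */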
34dc7c2f
BB
4413 /*
4414 * If the 'autoreplace' property is set, then post a resource notifying
4415 * the ZFS DE that it should not issue any faults for unopenable
4416 * devices. We also iterate over the vdevs, and post a sysevent for any
4417 * unopenable vdevs so that the normal autoreplace handler can take
4418 * over.
4419 */
4a0ee12a 4420 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
34dc7c2f 4421 spa_check_removed(spa->spa_root_vdev);
428870ff
BB
4422 /*
4423 * For the import case, this is done in spa_import(), because
4424 * at this point we're using the spare definitions from
4425 * the MOS config, not necessarily from the userland config.
4426 */
4a0ee12a 4427 if (spa->spa_load_state != SPA_LOAD_IMPORT) {
428870ff
BB
4428 spa_aux_check_removed(&spa->spa_spares);
4429 spa_aux_check_removed(&spa->spa_l2cache);
4430 }
4431 }
34dc7c2f
BB
4432
4433 /*
9eb7b46e 4434 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
34dc7c2f 4435 */
a1d477c2
MA
4436 error = vdev_load(rvd);
4437 if (error != 0) {
4a0ee12a 4438 spa_load_failed(spa, "vdev_load failed [error=%d]", error);
a1d477c2
MA
4439 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4440 }
4441
93e28d66
SD
4442 error = spa_ld_log_spacemaps(spa);
4443 if (error != 0) {
600a02b8 4444 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
93e28d66
SD
4445 error);
4446 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4447 }
4448
34dc7c2f 4449 /*
9eb7b46e 4450 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
34dc7c2f 4451 */
b128c09f 4452 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
9a49d3f3 4453 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
b128c09f 4454 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f 4455
9eb7b46e
PZ
4456 return (0);
4457}
4458
4459static int
4460spa_ld_load_dedup_tables(spa_t *spa)
4461{
4462 int error = 0;
4463 vdev_t *rvd = spa->spa_root_vdev;
4464
428870ff 4465 error = ddt_load(spa);
4a0ee12a
PZ
4466 if (error != 0) {
4467 spa_load_failed(spa, "ddt_load failed [error=%d]", error);
428870ff 4468 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4a0ee12a 4469 }
428870ff 4470
9eb7b46e
PZ
4471 return (0);
4472}
4473
67a1b037
PJD
4474static int
4475spa_ld_load_brt(spa_t *spa)
4476{
4477 int error = 0;
4478 vdev_t *rvd = spa->spa_root_vdev;
4479
4480 error = brt_load(spa);
4481 if (error != 0) {
4482 spa_load_failed(spa, "brt_load failed [error=%d]", error);
4483 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4484 }
4485
4486 return (0);
4487}
4488
9eb7b46e 4489static int
a926aab9 4490spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
9eb7b46e
PZ
4491{
4492 vdev_t *rvd = spa->spa_root_vdev;
428870ff 4493
4a0ee12a
PZ
4494 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
4495 boolean_t missing = spa_check_logs(spa);
4496 if (missing) {
6cb8e530
PZ
4497 if (spa->spa_missing_tvds != 0) {
4498 spa_load_note(spa, "spa_check_logs failed "
4499 "so dropping the logs");
4500 } else {
4501 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
4502 spa_load_failed(spa, "spa_check_logs failed");
4503 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
4504 ENXIO));
4505 }
4a0ee12a 4506 }
428870ff
BB
4507 }
4508
9eb7b46e
PZ
4509 return (0);
4510}
4511
4512static int
4a0ee12a 4513spa_ld_verify_pool_data(spa_t *spa)
9eb7b46e
PZ
4514{
4515 int error = 0;
4516 vdev_t *rvd = spa->spa_root_vdev;
4517
4518 /*
4519 * We've successfully opened the pool, verify that we're ready
4520 * to start pushing transactions.
4521 */
4a0ee12a 4522 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
9eb7b46e
PZ
4523 error = spa_load_verify(spa);
4524 if (error != 0) {
4a0ee12a
PZ
4525 spa_load_failed(spa, "spa_load_verify failed "
4526 "[error=%d]", error);
9eb7b46e
PZ
4527 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
4528 error));
4529 }
4530 }
4531
4532 return (0);
4533}
4534
4535static void
4536spa_ld_claim_log_blocks(spa_t *spa)
4537{
4538 dmu_tx_t *tx;
4539 dsl_pool_t *dp = spa_get_dsl(spa);
4540
4541 /*
4542 * Claim log blocks that haven't been committed yet.
4543 * This must all happen in a single txg.
4544 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
4545 * invoked from zil_claim_log_block()'s i/o done callback.
4546 * Price of rollback is that we abandon the log.
4547 */
4548 spa->spa_claiming = B_TRUE;
4549
4550 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
4551 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
4552 zil_claim, tx, DS_FIND_CHILDREN);
4553 dmu_tx_commit(tx);
4554
4555 spa->spa_claiming = B_FALSE;
4556
4557 spa_set_log_state(spa, SPA_LOG_GOOD);
4558}
4559
4560static void
6cb8e530 4561spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
d2734cce 4562 boolean_t update_config_cache)
9eb7b46e
PZ
4563{
4564 vdev_t *rvd = spa->spa_root_vdev;
4565 int need_update = B_FALSE;
4566
4567 /*
4568 * If the config cache is stale, or we have uninitialized
4569 * metaslabs (see spa_vdev_add()), then update the config.
4570 *
4571 * If this is a verbatim import, trust the current
4572 * in-core spa_config and update the disk labels.
4573 */
d2734cce 4574 if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
4a0ee12a
PZ
4575 spa->spa_load_state == SPA_LOAD_IMPORT ||
4576 spa->spa_load_state == SPA_LOAD_RECOVER ||
9eb7b46e
PZ
4577 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
4578 need_update = B_TRUE;
4579
4580 for (int c = 0; c < rvd->vdev_children; c++)
4581 if (rvd->vdev_child[c]->vdev_ms_array == 0)
4582 need_update = B_TRUE;
4583
4584 /*
e1cfd73f 4585 * Update the config cache asynchronously in case we're the
9eb7b46e
PZ
4586 * root pool, in which case the config cache isn't writable yet.
4587 */
4588 if (need_update)
4589 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
4590}
4591
6cb8e530
PZ
4592static void
4593spa_ld_prepare_for_reload(spa_t *spa)
4594{
da92d5cb 4595 spa_mode_t mode = spa->spa_mode;
6cb8e530
PZ
4596 int async_suspended = spa->spa_async_suspended;
4597
4598 spa_unload(spa);
4599 spa_deactivate(spa);
4600 spa_activate(spa, mode);
4601
4602 /*
4603 * We save the value of spa_async_suspended as it gets reset to 0 by
4604 * spa_unload(). We want to restore it back to the original value before
4605 * returning as we might be calling spa_async_resume() later.
4606 */
4607 spa->spa_async_suspended = async_suspended;
4608}
4609
9eb7b46e 4610static int
d2734cce
SD
4611spa_ld_read_checkpoint_txg(spa_t *spa)
4612{
4613 uberblock_t checkpoint;
4614 int error = 0;
4615
4616 ASSERT0(spa->spa_checkpoint_txg);
4617 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4618
4619 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4620 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4621 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4622
4623 if (error == ENOENT)
4624 return (0);
4625
4626 if (error != 0)
4627 return (error);
4628
4629 ASSERT3U(checkpoint.ub_txg, !=, 0);
4630 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
4631 ASSERT3U(checkpoint.ub_timestamp, !=, 0);
4632 spa->spa_checkpoint_txg = checkpoint.ub_txg;
4633 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
4634
4635 return (0);
4636}
4637
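/*
 * Editor's note: the checkpointed uberblock lives in the MOS directory
 * ZAP under DMU_POOL_ZPOOL_CHECKPOINT, stored as an array of uint64_t
 * words covering a whole uberblock_t (hence the sizeof arithmetic in the
 * zap_lookup() call above). ENOENT is not an error here; it just means
 * the pool has no checkpoint and spa_checkpoint_txg stays 0.
 */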
4638static int
4639spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
9eb7b46e
PZ
4640{
4641 int error = 0;
9eb7b46e 4642
4a0ee12a 4643 ASSERT(MUTEX_HELD(&spa_namespace_lock));
6cb8e530 4644 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
4a0ee12a 4645
9eb7b46e 4646 /*
6cb8e530
PZ
4647 * Never trust the config that is provided unless we are assembling
4648 * a pool following a split.
4649 * This means don't trust blkptrs and the vdev tree in general. This
4650 * also effectively puts the spa in read-only mode since
4651 * spa_writeable() checks for spa_trust_config to be true.
4652 * We will later load a trusted config from the MOS.
9eb7b46e 4653 */
6cb8e530
PZ
4654 if (type != SPA_IMPORT_ASSEMBLE)
4655 spa->spa_trust_config = B_FALSE;
4656
9eb7b46e
PZ
4657 /*
4658 * Parse the config provided to create a vdev tree.
4659 */
6cb8e530 4660 error = spa_ld_parse_config(spa, type);
9eb7b46e
PZ
4661 if (error != 0)
4662 return (error);
4663
ca95f70d
OF
4664 spa_import_progress_add(spa);
4665
9eb7b46e
PZ
4666 /*
4667 * Now that we have the vdev tree, try to open each vdev. This involves
4668 * opening the underlying physical device, retrieving its geometry and
4669 * probing the vdev with a dummy I/O. The state of each vdev will be set
4670 * based on the success of those operations. After this we'll be ready
4671 * to read from the vdevs.
4672 */
4673 error = spa_ld_open_vdevs(spa);
4674 if (error != 0)
4675 return (error);
4676
4677 /*
4678 * Read the label of each vdev and make sure that the GUIDs stored
4679 * there match the GUIDs in the config provided.
6cb8e530
PZ
4680 * If we're assembling a new pool that's been split off from an
4681 * existing pool, the labels haven't yet been updated so we skip
4682 * validation for now.
9eb7b46e 4683 */
6cb8e530
PZ
4684 if (type != SPA_IMPORT_ASSEMBLE) {
4685 error = spa_ld_validate_vdevs(spa);
4686 if (error != 0)
4687 return (error);
4688 }
9eb7b46e
PZ
4689
4690 /*
d2734cce
SD
4691 * Read all vdev labels to find the best uberblock (i.e. latest,
4692 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
4693 * get the list of features required to read blkptrs in the MOS from
4694 * the vdev label with the best uberblock and verify that our version
4695 * of zfs supports them all.
9eb7b46e 4696 */
6cb8e530 4697 error = spa_ld_select_uberblock(spa, type);
9eb7b46e
PZ
4698 if (error != 0)
4699 return (error);
4700
4701 /*
4702 * Pass that uberblock to the dsl_pool layer which will open the root
4703 * blkptr. This blkptr points to the latest version of the MOS and will
4704 * allow us to read its contents.
4705 */
4706 error = spa_ld_open_rootbp(spa);
4707 if (error != 0)
4708 return (error);
4709
d2734cce
SD
4710 return (0);
4711}
4712
4713static int
4714spa_ld_checkpoint_rewind(spa_t *spa)
4715{
4716 uberblock_t checkpoint;
4717 int error = 0;
4718
4719 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4720 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4721
4722 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4723 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4724 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4725
4726 if (error != 0) {
4727 spa_load_failed(spa, "unable to retrieve checkpointed "
4728 "uberblock from the MOS config [error=%d]", error);
4729
4730 if (error == ENOENT)
4731 error = ZFS_ERR_NO_CHECKPOINT;
4732
4733 return (error);
4734 }
4735
4736 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
4737 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
4738
4739 /*
4740 * We need to update the txg and timestamp of the checkpointed
4741 * uberblock to be higher than the latest one. This ensures that
4742 * the checkpointed uberblock is selected if we were to close and
4743 * reopen the pool right after we've written it in the vdev labels.
4744 * (also see block comment in vdev_uberblock_compare)
4745 */
4746 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
4747 checkpoint.ub_timestamp = gethrestime_sec();
4748
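	/*
	 * Editor's note, with illustrative numbers: if the newest uberblock
	 * on disk has ub_txg 1000 and the checkpoint was taken at txg 900,
	 * the checkpointed uberblock is rewritten here with ub_txg 1001 and
	 * a current timestamp so that vdev_uberblock_compare() prefers it on
	 * the next label scan, while ub_checkpoint_txg still records the
	 * original 900 (later logged as the "rewound state" txg).
	 */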
4749 /*
4750 * Set current uberblock to be the checkpointed uberblock.
4751 */
4752 spa->spa_uberblock = checkpoint;
4753
4754 /*
4755 * If we are doing a normal rewind, then the pool is open for
4756 * writing and we sync the "updated" checkpointed uberblock to
4757 * disk. Once this is done, we've basically rewound the whole
4758 * pool and there is no way back.
4759 *
4760	 * There are cases when we don't want to attempt to sync the
4761 * checkpointed uberblock to disk because we are opening a
4762 * pool as read-only. Specifically, verifying the checkpointed
4763 * state with zdb, and importing the checkpointed state to get
4764 * a "preview" of its content.
4765 */
4766 if (spa_writeable(spa)) {
4767 vdev_t *rvd = spa->spa_root_vdev;
4768
4769 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4770 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
4771 int svdcount = 0;
4772 int children = rvd->vdev_children;
29274c9f 4773 int c0 = random_in_range(children);
d2734cce
SD
4774
4775 for (int c = 0; c < children; c++) {
4776 vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
4777
4778 /* Stop when revisiting the first vdev */
4779 if (c > 0 && svd[0] == vd)
4780 break;
4781
4782 if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
4783 !vdev_is_concrete(vd))
4784 continue;
4785
4786 svd[svdcount++] = vd;
4787 if (svdcount == SPA_SYNC_MIN_VDEVS)
4788 break;
4789 }
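		/*
		 * Editor's note, with illustrative numbers: with 5 children
		 * and a random starting offset c0 of 3, the loop above
		 * visits children 3, 4, 0, 1, 2 (via (c0 + c) % children),
		 * skipping log and non-concrete vdevs, and stops once it has
		 * collected SPA_SYNC_MIN_VDEVS candidates or wraps back to
		 * the first vdev it selected.
		 */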
4790 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
4791 if (error == 0)
4792 spa->spa_last_synced_guid = rvd->vdev_guid;
4793 spa_config_exit(spa, SCL_ALL, FTAG);
4794
4795 if (error != 0) {
4796 spa_load_failed(spa, "failed to write checkpointed "
4797 "uberblock to the vdev labels [error=%d]", error);
4798 return (error);
4799 }
4800 }
4801
4802 return (0);
4803}
4804
4805static int
4806spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
4807 boolean_t *update_config_cache)
4808{
4809 int error;
4810
4811 /*
4812 * Parse the config for pool, open and validate vdevs,
4813 * select an uberblock, and use that uberblock to open
4814 * the MOS.
4815 */
4816 error = spa_ld_mos_init(spa, type);
4817 if (error != 0)
4818 return (error);
4819
9eb7b46e 4820 /*
6cb8e530
PZ
4821 * Retrieve the trusted config stored in the MOS and use it to create
4822 * a new, exact version of the vdev tree, then reopen all vdevs.
9eb7b46e 4823 */
d2734cce 4824 error = spa_ld_trusted_config(spa, type, B_FALSE);
6cb8e530 4825 if (error == EAGAIN) {
d2734cce
SD
4826 if (update_config_cache != NULL)
4827 *update_config_cache = B_TRUE;
4828
6cb8e530
PZ
4829 /*
4830 * Redo the loading process with the trusted config if it is
4831 * too different from the untrusted config.
4832 */
4833 spa_ld_prepare_for_reload(spa);
d2734cce
SD
4834 spa_load_note(spa, "RELOADING");
4835 error = spa_ld_mos_init(spa, type);
4836 if (error != 0)
4837 return (error);
4838
4839 error = spa_ld_trusted_config(spa, type, B_TRUE);
4840 if (error != 0)
4841 return (error);
4842
6cb8e530 4843 } else if (error != 0) {
9eb7b46e 4844 return (error);
6cb8e530 4845 }
9eb7b46e 4846
d2734cce
SD
4847 return (0);
4848}
4849
4850/*
4851 * Load an existing storage pool, using the config provided. This config
4852 * describes which vdevs are part of the pool and is later validated against
4853 * partial configs present in each vdev's label and an entire copy of the
4854 * config stored in the MOS.
4855 */
4856static int
a926aab9 4857spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
d2734cce
SD
4858{
4859 int error = 0;
4860 boolean_t missing_feat_write = B_FALSE;
4861 boolean_t checkpoint_rewind =
4862 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4863 boolean_t update_config_cache = B_FALSE;
4864
4865 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4866 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
4867
4868 spa_load_note(spa, "LOADING");
4869
4870 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
4871 if (error != 0)
4872 return (error);
4873
4874 /*
4875 * If we are rewinding to the checkpoint then we need to repeat
4876 * everything we've done so far in this function but this time
4877 * selecting the checkpointed uberblock and using that to open
4878 * the MOS.
4879 */
4880 if (checkpoint_rewind) {
4881 /*
4882 * If we are rewinding to the checkpoint update config cache
4883 * anyway.
4884 */
4885 update_config_cache = B_TRUE;
4886
4887 /*
4888 * Extract the checkpointed uberblock from the current MOS
4889 * and use this as the pool's uberblock from now on. If the
4890 * pool is imported as writeable we also write the checkpoint
4891 * uberblock to the labels, making the rewind permanent.
4892 */
4893 error = spa_ld_checkpoint_rewind(spa);
4894 if (error != 0)
4895 return (error);
4896
4897 /*
e1cfd73f 4898 * Redo the loading process again with the
d2734cce
SD
4899 * checkpointed uberblock.
4900 */
4901 spa_ld_prepare_for_reload(spa);
4902 spa_load_note(spa, "LOADING checkpointed uberblock");
4903 error = spa_ld_mos_with_trusted_config(spa, type, NULL);
4904 if (error != 0)
4905 return (error);
4906 }
4907
4908 /*
4909 * Retrieve the checkpoint txg if the pool has a checkpoint.
4910 */
4911 error = spa_ld_read_checkpoint_txg(spa);
4912 if (error != 0)
4913 return (error);
4914
9eb7b46e
PZ
4915 /*
4916 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
4917 * from the pool and their contents were re-mapped to other vdevs. Note
4918 * that everything that we read before this step must have been
4919 * rewritten on concrete vdevs after the last device removal was
4920 * initiated. Otherwise we could be reading from indirect vdevs before
4921 * we have loaded their mappings.
4922 */
4923 error = spa_ld_open_indirect_vdev_metadata(spa);
4924 if (error != 0)
4925 return (error);
4926
4927 /*
4928 * Retrieve the full list of active features from the MOS and check if
4929 * they are all supported.
4930 */
4a0ee12a 4931 error = spa_ld_check_features(spa, &missing_feat_write);
9eb7b46e
PZ
4932 if (error != 0)
4933 return (error);
4934
4935 /*
4936 * Load several special directories from the MOS needed by the dsl_pool
4937 * layer.
4938 */
4939 error = spa_ld_load_special_directories(spa);
4940 if (error != 0)
4941 return (error);
4942
9eb7b46e
PZ
4943 /*
4944 * Retrieve pool properties from the MOS.
4945 */
4946 error = spa_ld_get_props(spa);
4947 if (error != 0)
4948 return (error);
4949
4950 /*
4951 * Retrieve the list of auxiliary devices - cache devices and spares -
4952 * and open them.
4953 */
4954 error = spa_ld_open_aux_vdevs(spa, type);
4955 if (error != 0)
4956 return (error);
4957
4958 /*
4959 * Load the metadata for all vdevs. Also check if unopenable devices
4960 * should be autoreplaced.
4961 */
4a0ee12a 4962 error = spa_ld_load_vdev_metadata(spa);
9eb7b46e
PZ
4963 if (error != 0)
4964 return (error);
4965
4966 error = spa_ld_load_dedup_tables(spa);
4967 if (error != 0)
4968 return (error);
4969
67a1b037
PJD
4970 error = spa_ld_load_brt(spa);
4971 if (error != 0)
4972 return (error);
4973
9eb7b46e
PZ
4974 /*
4975 * Verify the logs now to make sure we don't have any unexpected errors
4976 * when we claim log blocks later.
4977 */
4978 error = spa_ld_verify_logs(spa, type, ereport);
4979 if (error != 0)
4980 return (error);
4981
9ae529ec 4982 if (missing_feat_write) {
6cb8e530 4983 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
9ae529ec
CS
4984
4985 /*
4986 * At this point, we know that we can open the pool in
4987 * read-only mode but not read-write mode. We now have enough
4988 * information and can return to userland.
4989 */
9eb7b46e
PZ
4990 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
4991 ENOTSUP));
9ae529ec
CS
4992 }
4993
572e2857 4994 /*
9eb7b46e
PZ
 4995 * Traverse the last txgs to make sure the pool was left in a safe
4996 * state. When performing an extreme rewind, we verify the whole pool,
4997 * which can take a very long time.
572e2857 4998 */
4a0ee12a 4999 error = spa_ld_verify_pool_data(spa);
9eb7b46e
PZ
5000 if (error != 0)
5001 return (error);
572e2857 5002
9eb7b46e
PZ
5003 /*
5004 * Calculate the deflated space for the pool. This must be done before
5005 * we write anything to the pool because we'd need to update the space
5006 * accounting using the deflated sizes.
5007 */
5008 spa_update_dspace(spa);
5009
5010 /*
5011 * We have now retrieved all the information we needed to open the
5012 * pool. If we are importing the pool in read-write mode, a few
5013 * additional steps must be performed to finish the import.
5014 */
6cb8e530 5015 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
428870ff 5016 spa->spa_load_max_txg == UINT64_MAX)) {
6cb8e530
PZ
5017 uint64_t config_cache_txg = spa->spa_config_txg;
5018
5019 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
34dc7c2f 5020
d2734cce
SD
5021 /*
5022 * In case of a checkpoint rewind, log the original txg
5023 * of the checkpointed uberblock.
5024 */
5025 if (checkpoint_rewind) {
5026 spa_history_log_internal(spa, "checkpoint rewind",
5027 NULL, "rewound state to txg=%llu",
5028 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
5029 }
5030
34dc7c2f 5031 /*
9eb7b46e 5032 * Traverse the ZIL and claim all blocks.
34dc7c2f 5033 */
9eb7b46e 5034 spa_ld_claim_log_blocks(spa);
428870ff 5035
9eb7b46e
PZ
5036 /*
5037 * Kick-off the syncing thread.
5038 */
34dc7c2f
BB
5039 spa->spa_sync_on = B_TRUE;
5040 txg_sync_start(spa->spa_dsl_pool);
379ca9cf 5041 mmp_thread_start(spa);
34dc7c2f
BB
5042
5043 /*
428870ff
BB
5044 * Wait for all claims to sync. We sync up to the highest
5045 * claimed log block birth time so that claimed log blocks
5046 * don't appear to be from the future. spa_claim_max_txg
9eb7b46e
PZ
5047 * will have been set for us by ZIL traversal operations
5048 * performed above.
34dc7c2f 5049 */
428870ff 5050 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
34dc7c2f
BB
5051
5052 /*
9eb7b46e
PZ
5053 * Check if we need to request an update of the config. On the
5054 * next sync, we would update the config stored in vdev labels
5055 * and the cachefile (by default /etc/zfs/zpool.cache).
34dc7c2f 5056 */
6cb8e530 5057 spa_ld_check_for_config_update(spa, config_cache_txg,
d2734cce 5058 update_config_cache);
fb5f0bc8
BB
5059
5060 /*
9a49d3f3
BB
5061 * Check if a rebuild was in progress and if so resume it.
5062 * Then check all DTLs to see if anything needs resilvering.
5063 * The resilver will be deferred if a rebuild was started.
fb5f0bc8 5064 */
9a49d3f3
BB
5065 if (vdev_rebuild_active(spa->spa_root_vdev)) {
5066 vdev_rebuild_restart(spa);
5067 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
5068 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
fb5f0bc8 5069 spa_async_request(spa, SPA_ASYNC_RESILVER);
9a49d3f3 5070 }
428870ff 5071
6f1ffb06
MA
5072 /*
5073 * Log the fact that we booted up (so that we can detect if
5074 * we rebooted in the middle of an operation).
5075 */
d5e024cb 5076 spa_history_log_version(spa, "open", NULL);
6f1ffb06 5077
9b2266e3
SD
5078 spa_restart_removal(spa);
5079 spa_spawn_aux_threads(spa);
5080
428870ff
BB
5081 /*
5082 * Delete any inconsistent datasets.
9b2266e3
SD
5083 *
5084 * Note:
5085 * Since we may be issuing deletes for clones here,
5086 * we make sure to do so after we've spawned all the
 5087 * auxiliary threads above (of which the livelist
 5088 * deletion zthr is one).
428870ff
BB
5089 */
5090 (void) dmu_objset_find(spa_name(spa),
5091 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
5092
5093 /*
5094 * Clean up any stale temporary dataset userrefs.
5095 */
5096 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
a1d477c2 5097
619f0976
GW
5098 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5099 vdev_initialize_restart(spa->spa_root_vdev);
1b939560
BB
5100 vdev_trim_restart(spa->spa_root_vdev);
5101 vdev_autotrim_restart(spa);
619f0976 5102 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f
BB
5103 }
5104
ca95f70d 5105 spa_import_progress_remove(spa_guid(spa));
77f6826b
GA
5106 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
5107
4a0ee12a
PZ
5108 spa_load_note(spa, "LOADED");
5109
428870ff
BB
5110 return (0);
5111}
34dc7c2f 5112
428870ff 5113static int
6cb8e530 5114spa_load_retry(spa_t *spa, spa_load_state_t state)
428870ff 5115{
da92d5cb 5116 spa_mode_t mode = spa->spa_mode;
572e2857 5117
428870ff
BB
5118 spa_unload(spa);
5119 spa_deactivate(spa);
5120
dea377c0 5121 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
428870ff 5122
572e2857 5123 spa_activate(spa, mode);
428870ff
BB
5124 spa_async_suspend(spa);
5125
4a0ee12a
PZ
5126 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
5127 (u_longlong_t)spa->spa_load_max_txg);
5128
6cb8e530 5129 return (spa_load(spa, state, SPA_IMPORT_EXISTING));
428870ff
BB
5130}
5131
9ae529ec
CS
5132/*
 5133 * If spa_load() fails, this function will try loading prior txgs. If
 5134 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds, the pool
 5135 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER, this
 5136 * function will not rewind the pool and will return the same error as
5137 * spa_load().
5138 */
428870ff 5139static int
6cb8e530
PZ
5140spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
5141 int rewind_flags)
428870ff 5142{
9ae529ec 5143 nvlist_t *loadinfo = NULL;
428870ff
BB
5144 nvlist_t *config = NULL;
5145 int load_error, rewind_error;
5146 uint64_t safe_rewind_txg;
5147 uint64_t min_txg;
5148
5149 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
5150 spa->spa_load_max_txg = spa->spa_load_txg;
5151 spa_set_log_state(spa, SPA_LOG_CLEAR);
5152 } else {
5153 spa->spa_load_max_txg = max_request;
dea377c0
MA
5154 if (max_request != UINT64_MAX)
5155 spa->spa_extreme_rewind = B_TRUE;
428870ff
BB
5156 }
5157
6cb8e530 5158 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
428870ff
BB
5159 if (load_error == 0)
5160 return (0);
d2734cce
SD
5161 if (load_error == ZFS_ERR_NO_CHECKPOINT) {
5162 /*
5163 * When attempting checkpoint-rewind on a pool with no
5164 * checkpoint, we should not attempt to load uberblocks
5165 * from previous txgs when spa_load fails.
5166 */
5167 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
ca95f70d 5168 spa_import_progress_remove(spa_guid(spa));
d2734cce
SD
5169 return (load_error);
5170 }
428870ff
BB
5171
5172 if (spa->spa_root_vdev != NULL)
5173 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5174
5175 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
5176 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
5177
5178 if (rewind_flags & ZPOOL_NEVER_REWIND) {
5179 nvlist_free(config);
ca95f70d 5180 spa_import_progress_remove(spa_guid(spa));
428870ff
BB
5181 return (load_error);
5182 }
5183
9ae529ec
CS
5184 if (state == SPA_LOAD_RECOVER) {
5185 /* Price of rolling back is discarding txgs, including log */
428870ff 5186 spa_set_log_state(spa, SPA_LOG_CLEAR);
9ae529ec
CS
5187 } else {
5188 /*
5189 * If we aren't rolling back save the load info from our first
5190 * import attempt so that we can restore it after attempting
5191 * to rewind.
5192 */
5193 loadinfo = spa->spa_load_info;
5194 spa->spa_load_info = fnvlist_alloc();
5195 }
428870ff
BB
5196
5197 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
5198 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
5199 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
5200 TXG_INITIAL : safe_rewind_txg;
5201
5202 /*
5203 * Continue as long as we're finding errors, we're still within
5204 * the acceptable rewind range, and we're still finding uberblocks
5205 */
5206 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
5207 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
5208 if (spa->spa_load_max_txg < safe_rewind_txg)
5209 spa->spa_extreme_rewind = B_TRUE;
6cb8e530 5210 rewind_error = spa_load_retry(spa, state);
428870ff
BB
5211 }
5212
428870ff
BB
5213 spa->spa_extreme_rewind = B_FALSE;
5214 spa->spa_load_max_txg = UINT64_MAX;
5215
5216 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
5217 spa_config_set(spa, config);
ee6370a7 5218 else
5219 nvlist_free(config);
428870ff 5220
9ae529ec
CS
5221 if (state == SPA_LOAD_RECOVER) {
5222 ASSERT3P(loadinfo, ==, NULL);
ca95f70d 5223 spa_import_progress_remove(spa_guid(spa));
9ae529ec
CS
5224 return (rewind_error);
5225 } else {
5226 /* Store the rewind info as part of the initial load info */
5227 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
5228 spa->spa_load_info);
5229
5230 /* Restore the initial load info */
5231 fnvlist_free(spa->spa_load_info);
5232 spa->spa_load_info = loadinfo;
5233
ca95f70d 5234 spa_import_progress_remove(spa_guid(spa));
9ae529ec
CS
5235 return (load_error);
5236 }
34dc7c2f
BB
5237}
5238
5239/*
5240 * Pool Open/Import
5241 *
5242 * The import case is identical to an open except that the configuration is sent
5243 * down from userland, instead of grabbed from the configuration cache. For the
5244 * case of an open, the pool configuration will exist in the
5245 * POOL_STATE_UNINITIALIZED state.
5246 *
 5247 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 5248 * the same time we open the pool, without having to keep around the spa_t in some
5249 * ambiguous state.
5250 */
5251static int
a926aab9
AZ
5252spa_open_common(const char *pool, spa_t **spapp, const void *tag,
5253 nvlist_t *nvpolicy, nvlist_t **config)
34dc7c2f
BB
5254{
5255 spa_t *spa;
572e2857 5256 spa_load_state_t state = SPA_LOAD_OPEN;
34dc7c2f 5257 int error;
34dc7c2f 5258 int locked = B_FALSE;
526af785 5259 int firstopen = B_FALSE;
34dc7c2f
BB
5260
5261 *spapp = NULL;
5262
5263 /*
5264 * As disgusting as this is, we need to support recursive calls to this
5265 * function because dsl_dir_open() is called during spa_load(), and ends
5266 * up calling spa_open() again. The real fix is to figure out how to
5267 * avoid dsl_dir_open() calling this in the first place.
5268 */
c25b8f99 5269 if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
34dc7c2f
BB
5270 mutex_enter(&spa_namespace_lock);
5271 locked = B_TRUE;
5272 }
5273
5274 if ((spa = spa_lookup(pool)) == NULL) {
5275 if (locked)
5276 mutex_exit(&spa_namespace_lock);
2e528b49 5277 return (SET_ERROR(ENOENT));
34dc7c2f 5278 }
428870ff 5279
34dc7c2f 5280 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
8a393be3 5281 zpool_load_policy_t policy;
428870ff 5282
526af785
PJD
5283 firstopen = B_TRUE;
5284
8a393be3 5285 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
428870ff 5286 &policy);
8a393be3 5287 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
428870ff 5288 state = SPA_LOAD_RECOVER;
34dc7c2f 5289
fb5f0bc8 5290 spa_activate(spa, spa_mode_global);
34dc7c2f 5291
428870ff
BB
5292 if (state != SPA_LOAD_RECOVER)
5293 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
6cb8e530 5294 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
428870ff 5295
4a0ee12a 5296 zfs_dbgmsg("spa_open_common: opening %s", pool);
8a393be3
PZ
5297 error = spa_load_best(spa, state, policy.zlp_txg,
5298 policy.zlp_rewind);
34dc7c2f
BB
5299
5300 if (error == EBADF) {
5301 /*
5302 * If vdev_validate() returns failure (indicated by
5303 * EBADF), it indicates that one of the vdevs indicates
5304 * that the pool has been exported or destroyed. If
5305 * this is the case, the config cache is out of sync and
5306 * we should remove the pool from the namespace.
5307 */
34dc7c2f
BB
5308 spa_unload(spa);
5309 spa_deactivate(spa);
55c12724 5310 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
34dc7c2f 5311 spa_remove(spa);
34dc7c2f
BB
5312 if (locked)
5313 mutex_exit(&spa_namespace_lock);
2e528b49 5314 return (SET_ERROR(ENOENT));
34dc7c2f
BB
5315 }
5316
5317 if (error) {
5318 /*
5319 * We can't open the pool, but we still have useful
5320 * information: the state of each vdev after the
5321 * attempted vdev_open(). Return this to the user.
5322 */
572e2857 5323 if (config != NULL && spa->spa_config) {
65ad5d11
AJ
5324 *config = fnvlist_dup(spa->spa_config);
5325 fnvlist_add_nvlist(*config,
572e2857 5326 ZPOOL_CONFIG_LOAD_INFO,
65ad5d11 5327 spa->spa_load_info);
572e2857 5328 }
34dc7c2f
BB
5329 spa_unload(spa);
5330 spa_deactivate(spa);
428870ff 5331 spa->spa_last_open_failed = error;
34dc7c2f
BB
5332 if (locked)
5333 mutex_exit(&spa_namespace_lock);
5334 *spapp = NULL;
5335 return (error);
34dc7c2f 5336 }
34dc7c2f
BB
5337 }
5338
5339 spa_open_ref(spa, tag);
5340
b128c09f 5341 if (config != NULL)
34dc7c2f 5342 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
34dc7c2f 5343
572e2857
BB
5344 /*
5345 * If we've recovered the pool, pass back any information we
5346 * gathered while doing the load.
5347 */
1bd02680 5348 if (state == SPA_LOAD_RECOVER && config != NULL) {
65ad5d11
AJ
5349 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
5350 spa->spa_load_info);
572e2857
BB
5351 }
5352
428870ff
BB
5353 if (locked) {
5354 spa->spa_last_open_failed = 0;
5355 spa->spa_last_ubsync_txg = 0;
5356 spa->spa_load_txg = 0;
5357 mutex_exit(&spa_namespace_lock);
5358 }
5359
526af785 5360 if (firstopen)
ec213971 5361 zvol_create_minors_recursive(spa_name(spa));
526af785 5362
428870ff
BB
5363 *spapp = spa;
5364
34dc7c2f
BB
5365 return (0);
5366}
5367
428870ff 5368int
a926aab9
AZ
5369spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
5370 nvlist_t *policy, nvlist_t **config)
428870ff
BB
5371{
5372 return (spa_open_common(name, spapp, tag, policy, config));
5373}
5374
34dc7c2f 5375int
a926aab9 5376spa_open(const char *name, spa_t **spapp, const void *tag)
34dc7c2f 5377{
428870ff 5378 return (spa_open_common(name, spapp, tag, NULL, NULL));
34dc7c2f
BB
5379}
5380
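/*
 * A minimal sketch of the usual spa_open()/spa_close() pairing, assuming the
 * caller holds no conflicting locks.  The helper name and the pool name
 * "tank" are placeholders, not part of this file.
 */
static int
example_spa_open_usage(void)
{
    spa_t *spa;
    int error;

    /* Take a reference on the pool; FTAG identifies this caller. */
    error = spa_open("tank", &spa, FTAG);
    if (error != 0)
        return (error);

    /* ... inspect or operate on the opened pool here ... */

    /* Drop the reference taken by spa_open() using the same tag. */
    spa_close(spa, FTAG);
    return (0);
}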
5381/*
 5382 * Look up the given spa_t, incrementing the inject count in the process,
5383 * preventing it from being exported or destroyed.
5384 */
5385spa_t *
5386spa_inject_addref(char *name)
5387{
5388 spa_t *spa;
5389
5390 mutex_enter(&spa_namespace_lock);
5391 if ((spa = spa_lookup(name)) == NULL) {
5392 mutex_exit(&spa_namespace_lock);
5393 return (NULL);
5394 }
5395 spa->spa_inject_ref++;
5396 mutex_exit(&spa_namespace_lock);
5397
5398 return (spa);
5399}
5400
5401void
5402spa_inject_delref(spa_t *spa)
5403{
5404 mutex_enter(&spa_namespace_lock);
5405 spa->spa_inject_ref--;
5406 mutex_exit(&spa_namespace_lock);
5407}
5408
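/*
 * A short sketch of how fault-injection code would typically pair
 * spa_inject_addref() with spa_inject_delref(), so the pool cannot be
 * exported or destroyed while the handler holds it.  The helper name is a
 * placeholder.
 */
static void
example_inject_hold(char *poolname)
{
    spa_t *spa = spa_inject_addref(poolname);

    if (spa == NULL)
        return;    /* no such pool */

    /* ... register or drive the fault-injection handler ... */

    spa_inject_delref(spa);
}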
5409/*
5410 * Add spares device information to the nvlist.
5411 */
5412static void
5413spa_add_spares(spa_t *spa, nvlist_t *config)
5414{
5415 nvlist_t **spares;
5416 uint_t i, nspares;
5417 nvlist_t *nvroot;
5418 uint64_t guid;
5419 vdev_stat_t *vs;
5420 uint_t vsc;
5421 uint64_t pool;
5422
9babb374
BB
5423 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
5424
34dc7c2f
BB
5425 if (spa->spa_spares.sav_count == 0)
5426 return;
5427
65ad5d11
AJ
5428 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
5429 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5430 ZPOOL_CONFIG_SPARES, &spares, &nspares));
34dc7c2f 5431 if (nspares != 0) {
795075e6
PD
5432 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5433 (const nvlist_t * const *)spares, nspares);
65ad5d11
AJ
5434 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5435 &spares, &nspares));
34dc7c2f
BB
5436
5437 /*
5438 * Go through and find any spares which have since been
5439 * repurposed as an active spare. If this is the case, update
5440 * their status appropriately.
5441 */
5442 for (i = 0; i < nspares; i++) {
65ad5d11
AJ
5443 guid = fnvlist_lookup_uint64(spares[i],
5444 ZPOOL_CONFIG_GUID);
a05263b7
AH
5445 VERIFY0(nvlist_lookup_uint64_array(spares[i],
5446 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
b128c09f
BB
5447 if (spa_spare_exists(guid, &pool, NULL) &&
5448 pool != 0ULL) {
34dc7c2f
BB
5449 vs->vs_state = VDEV_STATE_CANT_OPEN;
5450 vs->vs_aux = VDEV_AUX_SPARED;
a05263b7
AH
5451 } else {
5452 vs->vs_state =
5453 spa->spa_spares.sav_vdevs[i]->vdev_state;
34dc7c2f
BB
5454 }
5455 }
5456 }
5457}
5458
5459/*
5460 * Add l2cache device information to the nvlist, including vdev stats.
5461 */
5462static void
5463spa_add_l2cache(spa_t *spa, nvlist_t *config)
5464{
5465 nvlist_t **l2cache;
5466 uint_t i, j, nl2cache;
5467 nvlist_t *nvroot;
5468 uint64_t guid;
5469 vdev_t *vd;
5470 vdev_stat_t *vs;
5471 uint_t vsc;
5472
9babb374
BB
5473 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
5474
34dc7c2f
BB
5475 if (spa->spa_l2cache.sav_count == 0)
5476 return;
5477
65ad5d11
AJ
5478 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
5479 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5480 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
34dc7c2f 5481 if (nl2cache != 0) {
795075e6
PD
5482 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5483 (const nvlist_t * const *)l2cache, nl2cache);
65ad5d11
AJ
5484 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5485 &l2cache, &nl2cache));
34dc7c2f
BB
5486
5487 /*
5488 * Update level 2 cache device stats.
5489 */
5490
5491 for (i = 0; i < nl2cache; i++) {
65ad5d11
AJ
5492 guid = fnvlist_lookup_uint64(l2cache[i],
5493 ZPOOL_CONFIG_GUID);
34dc7c2f
BB
5494
5495 vd = NULL;
5496 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
5497 if (guid ==
5498 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
5499 vd = spa->spa_l2cache.sav_vdevs[j];
5500 break;
5501 }
5502 }
5503 ASSERT(vd != NULL);
5504
65ad5d11
AJ
5505 VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
5506 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
34dc7c2f 5507 vdev_get_stats(vd, vs);
193a37cb
TH
5508 vdev_config_generate_stats(vd, l2cache[i]);
5509
34dc7c2f
BB
5510 }
5511 }
34dc7c2f
BB
5512}
5513
9ae529ec 5514static void
417104bd 5515spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
9ae529ec 5516{
9ae529ec
CS
5517 zap_cursor_t zc;
5518 zap_attribute_t za;
5519
9ae529ec
CS
5520 if (spa->spa_feat_for_read_obj != 0) {
5521 for (zap_cursor_init(&zc, spa->spa_meta_objset,
5522 spa->spa_feat_for_read_obj);
5523 zap_cursor_retrieve(&zc, &za) == 0;
5524 zap_cursor_advance(&zc)) {
5525 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
5526 za.za_num_integers == 1);
417104bd 5527 VERIFY0(nvlist_add_uint64(features, za.za_name,
9ae529ec
CS
5528 za.za_first_integer));
5529 }
5530 zap_cursor_fini(&zc);
5531 }
5532
5533 if (spa->spa_feat_for_write_obj != 0) {
5534 for (zap_cursor_init(&zc, spa->spa_meta_objset,
5535 spa->spa_feat_for_write_obj);
5536 zap_cursor_retrieve(&zc, &za) == 0;
5537 zap_cursor_advance(&zc)) {
5538 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
5539 za.za_num_integers == 1);
417104bd 5540 VERIFY0(nvlist_add_uint64(features, za.za_name,
9ae529ec
CS
5541 za.za_first_integer));
5542 }
5543 zap_cursor_fini(&zc);
5544 }
417104bd
NB
5545}
5546
5547static void
5548spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
5549{
5550 int i;
5551
5552 for (i = 0; i < SPA_FEATURES; i++) {
5553 zfeature_info_t feature = spa_feature_table[i];
5554 uint64_t refcount;
5555
5556 if (feature_get_refcount(spa, &feature, &refcount) != 0)
5557 continue;
5558
5559 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
5560 }
5561}
5562
5563/*
5564 * Store a list of pool features and their reference counts in the
5565 * config.
5566 *
5567 * The first time this is called on a spa, allocate a new nvlist, fetch
5568 * the pool features and reference counts from disk, then save the list
 5569 * in the spa. In subsequent calls on the same spa, use the saved nvlist
 5570 * and refresh its values from the cached reference counts. This
 5571 * ensures we don't block here on I/O on a suspended pool, so 'zpool
5572 * clear' can resume the pool.
5573 */
5574static void
5575spa_add_feature_stats(spa_t *spa, nvlist_t *config)
5576{
4eb30c68 5577 nvlist_t *features;
417104bd
NB
5578
5579 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
5580
4eb30c68
NB
5581 mutex_enter(&spa->spa_feat_stats_lock);
5582 features = spa->spa_feat_stats;
5583
417104bd
NB
5584 if (features != NULL) {
5585 spa_feature_stats_from_cache(spa, features);
5586 } else {
5587 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
5588 spa->spa_feat_stats = features;
5589 spa_feature_stats_from_disk(spa, features);
5590 }
9ae529ec 5591
417104bd
NB
5592 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
5593 features));
4eb30c68
NB
5594
5595 mutex_exit(&spa->spa_feat_stats_lock);
9ae529ec
CS
5596}
5597
34dc7c2f 5598int
9ae529ec
CS
5599spa_get_stats(const char *name, nvlist_t **config,
5600 char *altroot, size_t buflen)
34dc7c2f
BB
5601{
5602 int error;
5603 spa_t *spa;
5604
5605 *config = NULL;
428870ff 5606 error = spa_open_common(name, &spa, FTAG, NULL, config);
34dc7c2f 5607
9babb374
BB
5608 if (spa != NULL) {
5609 /*
5610 * This still leaves a window of inconsistency where the spares
5611 * or l2cache devices could change and the config would be
5612 * self-inconsistent.
5613 */
5614 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
34dc7c2f 5615
9babb374 5616 if (*config != NULL) {
572e2857
BB
5617 uint64_t loadtimes[2];
5618
5619 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
5620 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
65ad5d11
AJ
5621 fnvlist_add_uint64_array(*config,
5622 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);
572e2857 5623
65ad5d11 5624 fnvlist_add_uint64(*config,
9babb374 5625 ZPOOL_CONFIG_ERRCOUNT,
018f2604 5626 spa_approx_errlog_size(spa));
9babb374 5627
cec3a0a1 5628 if (spa_suspended(spa)) {
65ad5d11 5629 fnvlist_add_uint64(*config,
9babb374 5630 ZPOOL_CONFIG_SUSPENDED,
65ad5d11
AJ
5631 spa->spa_failmode);
5632 fnvlist_add_uint64(*config,
cec3a0a1 5633 ZPOOL_CONFIG_SUSPENDED_REASON,
65ad5d11 5634 spa->spa_suspended);
cec3a0a1 5635 }
b128c09f 5636
9babb374
BB
5637 spa_add_spares(spa, *config);
5638 spa_add_l2cache(spa, *config);
9ae529ec 5639 spa_add_feature_stats(spa, *config);
9babb374 5640 }
34dc7c2f
BB
5641 }
5642
5643 /*
5644 * We want to get the alternate root even for faulted pools, so we cheat
5645 * and call spa_lookup() directly.
5646 */
5647 if (altroot) {
5648 if (spa == NULL) {
5649 mutex_enter(&spa_namespace_lock);
5650 spa = spa_lookup(name);
5651 if (spa)
5652 spa_altroot(spa, altroot, buflen);
5653 else
5654 altroot[0] = '\0';
5655 spa = NULL;
5656 mutex_exit(&spa_namespace_lock);
5657 } else {
5658 spa_altroot(spa, altroot, buflen);
5659 }
5660 }
5661
9babb374
BB
5662 if (spa != NULL) {
5663 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f 5664 spa_close(spa, FTAG);
9babb374 5665 }
34dc7c2f
BB
5666
5667 return (error);
5668}
5669
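/*
 * A sketch of a typical spa_get_stats() caller.  Because the open path above
 * may still fill *config when it fails (e.g. for a faulted pool), the caller
 * checks *config independently of the return value.  The helper name is a
 * placeholder.
 */
static int
example_get_stats(const char *poolname)
{
    nvlist_t *config = NULL;
    char *altroot = kmem_alloc(MAXPATHLEN, KM_SLEEP);
    int error;

    error = spa_get_stats(poolname, &config, altroot, MAXPATHLEN);

    /* ... consume the config and/or altroot here ... */

    if (config != NULL)
        nvlist_free(config);
    kmem_free(altroot, MAXPATHLEN);
    return (error);
}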
5670/*
 5671 * Validate that the auxiliary device array is well formed. We must have an
 5672 * array of nvlists, each of which describes a valid leaf vdev. If this is an
5673 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
5674 * specified, as long as they are well-formed.
5675 */
5676static int
5677spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
5678 spa_aux_vdev_t *sav, const char *config, uint64_t version,
5679 vdev_labeltype_t label)
5680{
5681 nvlist_t **dev;
5682 uint_t i, ndev;
5683 vdev_t *vd;
5684 int error;
5685
b128c09f
BB
5686 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5687
34dc7c2f
BB
5688 /*
5689 * It's acceptable to have no devs specified.
5690 */
5691 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
5692 return (0);
5693
5694 if (ndev == 0)
2e528b49 5695 return (SET_ERROR(EINVAL));
34dc7c2f
BB
5696
5697 /*
5698 * Make sure the pool is formatted with a version that supports this
5699 * device type.
5700 */
5701 if (spa_version(spa) < version)
2e528b49 5702 return (SET_ERROR(ENOTSUP));
34dc7c2f
BB
5703
5704 /*
5705 * Set the pending device list so we correctly handle device in-use
5706 * checking.
5707 */
5708 sav->sav_pending = dev;
5709 sav->sav_npending = ndev;
5710
5711 for (i = 0; i < ndev; i++) {
5712 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
5713 mode)) != 0)
5714 goto out;
5715
5716 if (!vd->vdev_ops->vdev_op_leaf) {
5717 vdev_free(vd);
2e528b49 5718 error = SET_ERROR(EINVAL);
34dc7c2f
BB
5719 goto out;
5720 }
5721
34dc7c2f
BB
5722 vd->vdev_top = vd;
5723
5724 if ((error = vdev_open(vd)) == 0 &&
5725 (error = vdev_label_init(vd, crtxg, label)) == 0) {
65ad5d11
AJ
5726 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
5727 vd->vdev_guid);
34dc7c2f
BB
5728 }
5729
5730 vdev_free(vd);
5731
5732 if (error &&
5733 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
5734 goto out;
5735 else
5736 error = 0;
5737 }
5738
5739out:
5740 sav->sav_pending = NULL;
5741 sav->sav_npending = 0;
5742 return (error);
5743}
5744
5745static int
5746spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
5747{
5748 int error;
5749
b128c09f
BB
5750 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5751
34dc7c2f
BB
5752 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
5753 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
5754 VDEV_LABEL_SPARE)) != 0) {
5755 return (error);
5756 }
5757
5758 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
5759 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
5760 VDEV_LABEL_L2CACHE));
5761}
5762
5763static void
5764spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
5765 const char *config)
5766{
5767 int i;
5768
5769 if (sav->sav_config != NULL) {
5770 nvlist_t **olddevs;
5771 uint_t oldndevs;
5772 nvlist_t **newdevs;
5773
5774 /*
4e33ba4c 5775 * Generate a new dev list by concatenating with the
34dc7c2f
BB
5776 * current dev list.
5777 */
65ad5d11
AJ
5778 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
5779 &olddevs, &oldndevs));
34dc7c2f
BB
5780
5781 newdevs = kmem_alloc(sizeof (void *) *
79c76d5b 5782 (ndevs + oldndevs), KM_SLEEP);
34dc7c2f 5783 for (i = 0; i < oldndevs; i++)
65ad5d11 5784 newdevs[i] = fnvlist_dup(olddevs[i]);
34dc7c2f 5785 for (i = 0; i < ndevs; i++)
65ad5d11 5786 newdevs[i + oldndevs] = fnvlist_dup(devs[i]);
34dc7c2f 5787
65ad5d11 5788 fnvlist_remove(sav->sav_config, config);
34dc7c2f 5789
795075e6
PD
5790 fnvlist_add_nvlist_array(sav->sav_config, config,
5791 (const nvlist_t * const *)newdevs, ndevs + oldndevs);
34dc7c2f
BB
5792 for (i = 0; i < oldndevs + ndevs; i++)
5793 nvlist_free(newdevs[i]);
5794 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
5795 } else {
5796 /*
5797 * Generate a new dev list.
5798 */
65ad5d11 5799 sav->sav_config = fnvlist_alloc();
795075e6
PD
5800 fnvlist_add_nvlist_array(sav->sav_config, config,
5801 (const nvlist_t * const *)devs, ndevs);
34dc7c2f
BB
5802 }
5803}
5804
5805/*
5806 * Stop and drop level 2 ARC devices
5807 */
5808void
5809spa_l2cache_drop(spa_t *spa)
5810{
5811 vdev_t *vd;
5812 int i;
5813 spa_aux_vdev_t *sav = &spa->spa_l2cache;
5814
5815 for (i = 0; i < sav->sav_count; i++) {
5816 uint64_t pool;
5817
5818 vd = sav->sav_vdevs[i];
5819 ASSERT(vd != NULL);
5820
fb5f0bc8
BB
5821 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
5822 pool != 0ULL && l2arc_vdev_present(vd))
34dc7c2f 5823 l2arc_remove_vdev(vd);
34dc7c2f
BB
5824 }
5825}
5826
b5256303
TC
5827/*
5828 * Verify encryption parameters for spa creation. If we are encrypting, we must
5829 * have the encryption feature flag enabled.
5830 */
5831static int
5832spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
5833 boolean_t has_encryption)
5834{
5835 if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
5836 dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
5837 !has_encryption)
5838 return (SET_ERROR(ENOTSUP));
5839
1fff937a 5840 return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
b5256303
TC
5841}
5842
34dc7c2f
BB
5843/*
5844 * Pool Creation
5845 */
5846int
5847spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
b5256303 5848 nvlist_t *zplprops, dsl_crypto_params_t *dcp)
34dc7c2f
BB
5849{
5850 spa_t *spa;
d1807f16 5851 const char *altroot = NULL;
34dc7c2f
BB
5852 vdev_t *rvd;
5853 dsl_pool_t *dp;
5854 dmu_tx_t *tx;
9babb374 5855 int error = 0;
34dc7c2f
BB
5856 uint64_t txg = TXG_INITIAL;
5857 nvlist_t **spares, **l2cache;
5858 uint_t nspares, nl2cache;
b2255edc 5859 uint64_t version, obj, ndraid = 0;
9ae529ec 5860 boolean_t has_features;
b5256303 5861 boolean_t has_encryption;
715c996d 5862 boolean_t has_allocclass;
b5256303 5863 spa_feature_t feat;
d1807f16
RY
5864 const char *feat_name;
5865 const char *poolname;
83e9986f
RY
5866 nvlist_t *nvl;
5867
cc99f275
DB
5868 if (props == NULL ||
5869 nvlist_lookup_string(props, "tname", &poolname) != 0)
83e9986f 5870 poolname = (char *)pool;
34dc7c2f
BB
5871
5872 /*
5873 * If this pool already exists, return failure.
5874 */
5875 mutex_enter(&spa_namespace_lock);
83e9986f 5876 if (spa_lookup(poolname) != NULL) {
34dc7c2f 5877 mutex_exit(&spa_namespace_lock);
2e528b49 5878 return (SET_ERROR(EEXIST));
34dc7c2f
BB
5879 }
5880
5881 /*
5882 * Allocate a new spa_t structure.
5883 */
83e9986f
RY
5884 nvl = fnvlist_alloc();
5885 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
34dc7c2f
BB
5886 (void) nvlist_lookup_string(props,
5887 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
83e9986f
RY
5888 spa = spa_add(poolname, nvl, altroot);
5889 fnvlist_free(nvl);
fb5f0bc8 5890 spa_activate(spa, spa_mode_global);
34dc7c2f 5891
34dc7c2f 5892 if (props && (error = spa_prop_validate(spa, props))) {
34dc7c2f
BB
5893 spa_deactivate(spa);
5894 spa_remove(spa);
b128c09f 5895 mutex_exit(&spa_namespace_lock);
34dc7c2f
BB
5896 return (error);
5897 }
5898
83e9986f
RY
5899 /*
5900 * Temporary pool names should never be written to disk.
5901 */
5902 if (poolname != pool)
5903 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
5904
9ae529ec 5905 has_features = B_FALSE;
b5256303 5906 has_encryption = B_FALSE;
715c996d 5907 has_allocclass = B_FALSE;
1c27024e 5908 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
9ae529ec 5909 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
b5256303 5910 if (zpool_prop_feature(nvpair_name(elem))) {
9ae529ec 5911 has_features = B_TRUE;
b5256303
TC
5912
5913 feat_name = strchr(nvpair_name(elem), '@') + 1;
5914 VERIFY0(zfeature_lookup_name(feat_name, &feat));
5915 if (feat == SPA_FEATURE_ENCRYPTION)
5916 has_encryption = B_TRUE;
715c996d 5917 if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
5918 has_allocclass = B_TRUE;
b5256303
TC
5919 }
5920 }
5921
5922 /* verify encryption params, if they were provided */
5923 if (dcp != NULL) {
5924 error = spa_create_check_encryption_params(dcp, has_encryption);
5925 if (error != 0) {
5926 spa_deactivate(spa);
5927 spa_remove(spa);
5928 mutex_exit(&spa_namespace_lock);
5929 return (error);
5930 }
9ae529ec 5931 }
c24fa4b1 5932 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
715c996d 5933 spa_deactivate(spa);
5934 spa_remove(spa);
5935 mutex_exit(&spa_namespace_lock);
5936 return (ENOTSUP);
5937 }
9ae529ec
CS
5938
5939 if (has_features || nvlist_lookup_uint64(props,
5940 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
34dc7c2f 5941 version = SPA_VERSION;
9ae529ec
CS
5942 }
5943 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
428870ff
BB
5944
5945 spa->spa_first_txg = txg;
5946 spa->spa_uberblock.ub_txg = txg - 1;
34dc7c2f
BB
5947 spa->spa_uberblock.ub_version = version;
5948 spa->spa_ubsync = spa->spa_uberblock;
3dfb57a3 5949 spa->spa_load_state = SPA_LOAD_CREATE;
a1d477c2
MA
5950 spa->spa_removing_phys.sr_state = DSS_NONE;
5951 spa->spa_removing_phys.sr_removing_vdev = -1;
5952 spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
944a3724 5953 spa->spa_indirect_vdevs_loaded = B_TRUE;
34dc7c2f 5954
9babb374
BB
5955 /*
5956 * Create "The Godfather" zio to hold all async IOs
5957 */
e022864d
MA
5958 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
5959 KM_SLEEP);
1c27024e 5960 for (int i = 0; i < max_ncpus; i++) {
e022864d
MA
5961 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
5962 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
5963 ZIO_FLAG_GODFATHER);
5964 }
9babb374 5965
34dc7c2f
BB
5966 /*
5967 * Create the root vdev.
5968 */
b128c09f 5969 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
5970
5971 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
5972
5973 ASSERT(error != 0 || rvd != NULL);
5974 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
5975
5976 if (error == 0 && !zfs_allocatable_devs(nvroot))
2e528b49 5977 error = SET_ERROR(EINVAL);
34dc7c2f
BB
5978
5979 if (error == 0 &&
5980 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
b2255edc
BB
5981 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
5982 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
cc99f275
DB
5983 /*
 5984 * instantiate the metaslab groups (this will dirty the vdevs);
5985 * we can no longer error exit past this point
5986 */
5987 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
5988 vdev_t *vd = rvd->vdev_child[c];
5989
5990 vdev_metaslab_set_size(vd);
5991 vdev_expand(vd, txg);
9babb374 5992 }
34dc7c2f
BB
5993 }
5994
b128c09f 5995 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
5996
5997 if (error != 0) {
5998 spa_unload(spa);
5999 spa_deactivate(spa);
6000 spa_remove(spa);
6001 mutex_exit(&spa_namespace_lock);
6002 return (error);
6003 }
6004
6005 /*
6006 * Get the list of spares, if specified.
6007 */
6008 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
6009 &spares, &nspares) == 0) {
65ad5d11
AJ
6010 spa->spa_spares.sav_config = fnvlist_alloc();
6011 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
795075e6
PD
6012 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
6013 nspares);
b128c09f 6014 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6015 spa_load_spares(spa);
b128c09f 6016 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6017 spa->spa_spares.sav_sync = B_TRUE;
6018 }
6019
6020 /*
6021 * Get the list of level 2 cache devices, if specified.
6022 */
6023 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6024 &l2cache, &nl2cache) == 0) {
795075e6
PD
6025 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
6026 NV_UNIQUE_NAME, KM_SLEEP));
65ad5d11 6027 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
795075e6
PD
6028 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
6029 nl2cache);
b128c09f 6030 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6031 spa_load_l2cache(spa);
b128c09f 6032 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6033 spa->spa_l2cache.sav_sync = B_TRUE;
6034 }
6035
9ae529ec 6036 spa->spa_is_initializing = B_TRUE;
b5256303 6037 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
9ae529ec 6038 spa->spa_is_initializing = B_FALSE;
34dc7c2f 6039
428870ff
BB
6040 /*
6041 * Create DDTs (dedup tables).
6042 */
6043 ddt_create(spa);
67a1b037
PJD
6044 /*
6045 * Create BRT table and BRT table object.
6046 */
6047 brt_create(spa);
428870ff
BB
6048
6049 spa_update_dspace(spa);
6050
34dc7c2f
BB
6051 tx = dmu_tx_create_assigned(dp, txg);
6052
d5e024cb
BB
6053 /*
6054 * Create the pool's history object.
6055 */
6056 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
6057 spa_history_create_obj(spa, tx);
6058
6059 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
6060 spa_history_log_version(spa, "create", tx);
6061
34dc7c2f
BB
6062 /*
6063 * Create the pool config object.
6064 */
6065 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
b128c09f 6066 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
34dc7c2f
BB
6067 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
6068
6069 if (zap_add(spa->spa_meta_objset,
6070 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
6071 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
6072 cmn_err(CE_PANIC, "failed to add pool config");
6073 }
6074
428870ff
BB
6075 if (zap_add(spa->spa_meta_objset,
6076 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
6077 sizeof (uint64_t), 1, &version, tx) != 0) {
6078 cmn_err(CE_PANIC, "failed to add pool version");
6079 }
6080
34dc7c2f
BB
6081 /* Newly created pools with the right version are always deflated. */
6082 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
6083 spa->spa_deflate = TRUE;
6084 if (zap_add(spa->spa_meta_objset,
6085 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6086 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
6087 cmn_err(CE_PANIC, "failed to add deflate");
6088 }
6089 }
6090
6091 /*
428870ff 6092 * Create the deferred-free bpobj. Turn off compression
34dc7c2f
BB
6093 * because sync-to-convergence takes longer if the blocksize
6094 * keeps changing.
6095 */
428870ff
BB
6096 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
6097 dmu_object_set_compress(spa->spa_meta_objset, obj,
34dc7c2f 6098 ZIO_COMPRESS_OFF, tx);
34dc7c2f 6099 if (zap_add(spa->spa_meta_objset,
428870ff
BB
6100 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
6101 sizeof (uint64_t), 1, &obj, tx) != 0) {
6102 cmn_err(CE_PANIC, "failed to add bpobj");
34dc7c2f 6103 }
428870ff
BB
6104 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
6105 spa->spa_meta_objset, obj));
34dc7c2f 6106
3c67d83a
TH
6107 /*
6108 * Generate some random noise for salted checksums to operate on.
6109 */
6110 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
6111 sizeof (spa->spa_cksum_salt.zcs_bytes));
6112
34dc7c2f
BB
6113 /*
6114 * Set pool properties.
6115 */
6116 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
6117 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
6118 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
9babb374 6119 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
379ca9cf 6120 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
1b939560 6121 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
428870ff 6122
d164b209
BB
6123 if (props != NULL) {
6124 spa_configfile_set(spa, props, B_FALSE);
13fe0198 6125 spa_sync_props(props, tx);
d164b209 6126 }
34dc7c2f 6127
b2255edc
BB
6128 for (int i = 0; i < ndraid; i++)
6129 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
6130
34dc7c2f
BB
6131 dmu_tx_commit(tx);
6132
6133 spa->spa_sync_on = B_TRUE;
b5256303 6134 txg_sync_start(dp);
379ca9cf 6135 mmp_thread_start(spa);
b5256303 6136 txg_wait_synced(dp, txg);
34dc7c2f 6137
9d5b5245
SD
6138 spa_spawn_aux_threads(spa);
6139
55c12724 6140 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
34dc7c2f 6141
0c66c32d
JG
6142 /*
6143 * Don't count references from objsets that are already closed
6144 * and are making their way through the eviction process.
6145 */
6146 spa_evicting_os_wait(spa);
424fd7c3 6147 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
3dfb57a3 6148 spa->spa_load_state = SPA_LOAD_NONE;
b128c09f 6149
4759342a
JL
6150 spa_import_os(spa);
6151
d164b209
BB
6152 mutex_exit(&spa_namespace_lock);
6153
34dc7c2f
BB
6154 return (0);
6155}
6156
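/*
 * A rough sketch of the nvroot layout that spa_create() hands to
 * spa_config_parse(): a root vdev with a single disk child.  Userland
 * normally builds this (with additional fields such as guids and ashift)
 * via zpool(8); only the skeleton is shown, and the device path is a
 * placeholder.
 */
static nvlist_t *
example_make_root_vdev(void)
{
    nvlist_t *disk = fnvlist_alloc();
    nvlist_t *root = fnvlist_alloc();

    /* One leaf vdev of type "disk"; the path is a placeholder. */
    fnvlist_add_string(disk, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
    fnvlist_add_string(disk, ZPOOL_CONFIG_PATH, "/dev/sda");

    /* The root vdev carries the leaves in its "children" array. */
    fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
    fnvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
        (const nvlist_t * const *)&disk, 1);

    fnvlist_free(disk);    /* the array add made a copy */
    return (root);
}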
9babb374
BB
6157/*
6158 * Import a non-root pool into the system.
6159 */
6160int
13fe0198 6161spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
34dc7c2f
BB
6162{
6163 spa_t *spa;
d1807f16 6164 const char *altroot = NULL;
428870ff 6165 spa_load_state_t state = SPA_LOAD_IMPORT;
8a393be3 6166 zpool_load_policy_t policy;
da92d5cb 6167 spa_mode_t mode = spa_mode_global;
572e2857 6168 uint64_t readonly = B_FALSE;
9babb374 6169 int error;
34dc7c2f
BB
6170 nvlist_t *nvroot;
6171 nvlist_t **spares, **l2cache;
6172 uint_t nspares, nl2cache;
34dc7c2f
BB
6173
6174 /*
6175 * If a pool with this name exists, return failure.
6176 */
6177 mutex_enter(&spa_namespace_lock);
428870ff 6178 if (spa_lookup(pool) != NULL) {
9babb374 6179 mutex_exit(&spa_namespace_lock);
2e528b49 6180 return (SET_ERROR(EEXIST));
34dc7c2f
BB
6181 }
6182
6183 /*
6184 * Create and initialize the spa structure.
6185 */
6186 (void) nvlist_lookup_string(props,
6187 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
572e2857
BB
6188 (void) nvlist_lookup_uint64(props,
6189 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
6190 if (readonly)
da92d5cb 6191 mode = SPA_MODE_READ;
428870ff 6192 spa = spa_add(pool, config, altroot);
572e2857
BB
6193 spa->spa_import_flags = flags;
6194
6195 /*
6196 * Verbatim import - Take a pool and insert it into the namespace
6197 * as if it had been loaded at boot.
6198 */
6199 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
6200 if (props != NULL)
6201 spa_configfile_set(spa, props, B_FALSE);
6202
55c12724 6203 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
12fa0466 6204 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
4a0ee12a 6205 zfs_dbgmsg("spa_import: verbatim import of %s", pool);
572e2857 6206 mutex_exit(&spa_namespace_lock);
572e2857
BB
6207 return (0);
6208 }
6209
6210 spa_activate(spa, mode);
34dc7c2f 6211
9babb374
BB
6212 /*
6213 * Don't start async tasks until we know everything is healthy.
6214 */
6215 spa_async_suspend(spa);
b128c09f 6216
8a393be3
PZ
6217 zpool_get_load_policy(config, &policy);
6218 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
572e2857
BB
6219 state = SPA_LOAD_RECOVER;
6220
6cb8e530 6221 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
572e2857 6222
6cb8e530
PZ
6223 if (state != SPA_LOAD_RECOVER) {
6224 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
6225 zfs_dbgmsg("spa_import: importing %s", pool);
6226 } else {
6227 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
8a393be3 6228 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
6cb8e530 6229 }
8a393be3 6230 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
428870ff
BB
6231
6232 /*
572e2857
BB
6233 * Propagate anything learned while loading the pool and pass it
 6234 * back to the caller (e.g. rewind info, missing devices, etc.).
428870ff 6235 */
65ad5d11 6236 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
34dc7c2f 6237
b128c09f 6238 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6239 /*
9babb374
BB
6240 * Toss any existing sparelist, as it doesn't have any validity
6241 * anymore, and conflicts with spa_has_spare().
34dc7c2f 6242 */
9babb374 6243 if (spa->spa_spares.sav_config) {
34dc7c2f
BB
6244 nvlist_free(spa->spa_spares.sav_config);
6245 spa->spa_spares.sav_config = NULL;
6246 spa_load_spares(spa);
6247 }
9babb374 6248 if (spa->spa_l2cache.sav_config) {
34dc7c2f
BB
6249 nvlist_free(spa->spa_l2cache.sav_config);
6250 spa->spa_l2cache.sav_config = NULL;
6251 spa_load_l2cache(spa);
6252 }
6253
65ad5d11 6254 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
b128c09f 6255 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f 6256
d164b209
BB
6257 if (props != NULL)
6258 spa_configfile_set(spa, props, B_FALSE);
6259
fb5f0bc8
BB
6260 if (error != 0 || (props && spa_writeable(spa) &&
6261 (error = spa_prop_set(spa, props)))) {
9babb374
BB
6262 spa_unload(spa);
6263 spa_deactivate(spa);
6264 spa_remove(spa);
34dc7c2f
BB
6265 mutex_exit(&spa_namespace_lock);
6266 return (error);
6267 }
6268
572e2857
BB
6269 spa_async_resume(spa);
6270
34dc7c2f
BB
6271 /*
6272 * Override any spares and level 2 cache devices as specified by
6273 * the user, as these may have correct device names/devids, etc.
6274 */
6275 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
6276 &spares, &nspares) == 0) {
6277 if (spa->spa_spares.sav_config)
65ad5d11
AJ
6278 fnvlist_remove(spa->spa_spares.sav_config,
6279 ZPOOL_CONFIG_SPARES);
34dc7c2f 6280 else
65ad5d11
AJ
6281 spa->spa_spares.sav_config = fnvlist_alloc();
6282 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
795075e6
PD
6283 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
6284 nspares);
b128c09f 6285 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6286 spa_load_spares(spa);
b128c09f 6287 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6288 spa->spa_spares.sav_sync = B_TRUE;
6289 }
6290 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6291 &l2cache, &nl2cache) == 0) {
6292 if (spa->spa_l2cache.sav_config)
65ad5d11
AJ
6293 fnvlist_remove(spa->spa_l2cache.sav_config,
6294 ZPOOL_CONFIG_L2CACHE);
34dc7c2f 6295 else
65ad5d11
AJ
6296 spa->spa_l2cache.sav_config = fnvlist_alloc();
6297 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
795075e6
PD
6298 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
6299 nl2cache);
b128c09f 6300 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6301 spa_load_l2cache(spa);
b128c09f 6302 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6303 spa->spa_l2cache.sav_sync = B_TRUE;
6304 }
6305
428870ff
BB
6306 /*
6307 * Check for any removed devices.
6308 */
6309 if (spa->spa_autoreplace) {
6310 spa_aux_check_removed(&spa->spa_spares);
6311 spa_aux_check_removed(&spa->spa_l2cache);
6312 }
6313
fb5f0bc8 6314 if (spa_writeable(spa)) {
b128c09f
BB
6315 /*
6316 * Update the config cache to include the newly-imported pool.
6317 */
45d1cae3 6318 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
b128c09f 6319 }
34dc7c2f 6320
34dc7c2f 6321 /*
9babb374
BB
6322 * It's possible that the pool was expanded while it was exported.
6323 * We kick off an async task to handle this for us.
34dc7c2f 6324 */
9babb374 6325 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
b128c09f 6326
d5e024cb 6327 spa_history_log_version(spa, "import", NULL);
fb390aaf 6328
12fa0466 6329 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
fb390aaf 6330
fb390aaf
HR
6331 mutex_exit(&spa_namespace_lock);
6332
ec213971 6333 zvol_create_minors_recursive(pool);
4a22ba5b 6334
4759342a
JL
6335 spa_import_os(spa);
6336
b128c09f
BB
6337 return (0);
6338}
6339
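/*
 * A sketch of a minimal spa_import() call using a config previously obtained
 * from spa_tryimport() or the cachefile.  ZFS_IMPORT_NORMAL requests the
 * default behaviour; flags such as ZFS_IMPORT_VERBATIM alter it as described
 * above.  The helper name is a placeholder.
 */
static int
example_import(char *poolname, nvlist_t *config)
{
    /* No extra properties; default import behaviour. */
    return (spa_import(poolname, config, NULL, ZFS_IMPORT_NORMAL));
}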
34dc7c2f
BB
6340nvlist_t *
6341spa_tryimport(nvlist_t *tryconfig)
6342{
6343 nvlist_t *config = NULL;
d1807f16 6344 const char *poolname, *cachefile;
34dc7c2f
BB
6345 spa_t *spa;
6346 uint64_t state;
d164b209 6347 int error;
8a393be3 6348 zpool_load_policy_t policy;
34dc7c2f
BB
6349
6350 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
6351 return (NULL);
6352
6353 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
6354 return (NULL);
6355
6356 /*
6357 * Create and initialize the spa structure.
6358 */
6359 mutex_enter(&spa_namespace_lock);
428870ff 6360 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
da92d5cb 6361 spa_activate(spa, SPA_MODE_READ);
34dc7c2f
BB
6362
6363 /*
8a393be3 6364 * Rewind pool if a max txg was provided.
34dc7c2f 6365 */
8a393be3
PZ
6366 zpool_get_load_policy(spa->spa_config, &policy);
6367 if (policy.zlp_txg != UINT64_MAX) {
6368 spa->spa_load_max_txg = policy.zlp_txg;
6cb8e530
PZ
6369 spa->spa_extreme_rewind = B_TRUE;
6370 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
8a393be3 6371 poolname, (longlong_t)policy.zlp_txg);
6cb8e530
PZ
6372 } else {
6373 zfs_dbgmsg("spa_tryimport: importing %s", poolname);
6374 }
6375
6376 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
6377 == 0) {
6378 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
6379 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
6380 } else {
6381 spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
6382 }
6383
82ac409a
AH
6384 /*
 6385 * spa_import() relies on a pool config fetched by spa_tryimport()
 6386 * for spare/cache devices. Import flags are not passed to
 6387 * spa_tryimport(), which makes it return early due to a missing log
 6388 * device, so the cache and spare devices never get retrieved.
6389 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch
6390 * the correct configuration regardless of the missing log device.
6391 */
6392 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
6393
6cb8e530 6394 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
34dc7c2f
BB
6395
6396 /*
6397 * If 'tryconfig' was at least parsable, return the current config.
6398 */
6399 if (spa->spa_root_vdev != NULL) {
34dc7c2f 6400 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
65ad5d11
AJ
6401 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
6402 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
6403 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
6404 spa->spa_uberblock.ub_timestamp);
6405 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
6406 spa->spa_load_info);
6407 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
6408 spa->spa_errata);
34dc7c2f
BB
6409
6410 /*
6411 * If the bootfs property exists on this pool then we
6412 * copy it out so that external consumers can tell which
6413 * pools are bootable.
6414 */
d164b209 6415 if ((!error || error == EEXIST) && spa->spa_bootfs) {
79c76d5b 6416 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
34dc7c2f
BB
6417
6418 /*
6419 * We have to play games with the name since the
6420 * pool was opened as TRYIMPORT_NAME.
6421 */
b128c09f 6422 if (dsl_dsobj_to_dsname(spa_name(spa),
34dc7c2f
BB
6423 spa->spa_bootfs, tmpname) == 0) {
6424 char *cp;
d1d7e268
MK
6425 char *dsname;
6426
79c76d5b 6427 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
34dc7c2f
BB
6428
6429 cp = strchr(tmpname, '/');
6430 if (cp == NULL) {
6431 (void) strlcpy(dsname, tmpname,
6432 MAXPATHLEN);
6433 } else {
6434 (void) snprintf(dsname, MAXPATHLEN,
6435 "%s/%s", poolname, ++cp);
6436 }
65ad5d11
AJ
6437 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
6438 dsname);
34dc7c2f
BB
6439 kmem_free(dsname, MAXPATHLEN);
6440 }
6441 kmem_free(tmpname, MAXPATHLEN);
6442 }
6443
6444 /*
6445 * Add the list of hot spares and level 2 cache devices.
6446 */
9babb374 6447 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
34dc7c2f
BB
6448 spa_add_spares(spa, config);
6449 spa_add_l2cache(spa, config);
9babb374 6450 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f
BB
6451 }
6452
6453 spa_unload(spa);
6454 spa_deactivate(spa);
6455 spa_remove(spa);
6456 mutex_exit(&spa_namespace_lock);
6457
6458 return (config);
6459}
6460
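/*
 * A sketch of how a caller consumes spa_tryimport(): the tryconfig comes
 * from scanning device labels, and the returned config (if any) must be
 * freed by the caller.  The helper name is a placeholder.
 */
static void
example_tryimport(nvlist_t *label_config)
{
    nvlist_t *config = spa_tryimport(label_config);

    if (config != NULL) {
        /* Inspect pool state, load info, spares, l2cache, ... */
        nvlist_free(config);
    }
}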
6461/*
6462 * Pool export/destroy
6463 *
6464 * The act of destroying or exporting a pool is very simple. We make sure there
 6465 * is no more pending I/O and that any references to the pool are gone. Then, we
6466 * update the pool state and sync all the labels to disk, removing the
fb5f0bc8
BB
6467 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
6468 * we don't sync the labels or remove the configuration cache.
34dc7c2f
BB
6469 */
6470static int
4d55ea81 6471spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
fb5f0bc8 6472 boolean_t force, boolean_t hardforce)
34dc7c2f 6473{
f4f50a70 6474 int error;
34dc7c2f
BB
6475 spa_t *spa;
6476
6477 if (oldconfig)
6478 *oldconfig = NULL;
6479
da92d5cb 6480 if (!(spa_mode_global & SPA_MODE_WRITE))
2e528b49 6481 return (SET_ERROR(EROFS));
34dc7c2f
BB
6482
6483 mutex_enter(&spa_namespace_lock);
6484 if ((spa = spa_lookup(pool)) == NULL) {
6485 mutex_exit(&spa_namespace_lock);
2e528b49 6486 return (SET_ERROR(ENOENT));
34dc7c2f
BB
6487 }
6488
43a85362
SD
6489 if (spa->spa_is_exporting) {
6490 /* the pool is being exported by another thread */
6491 mutex_exit(&spa_namespace_lock);
6492 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
6493 }
6494 spa->spa_is_exporting = B_TRUE;
6495
34dc7c2f
BB
6496 /*
6497 * Put a hold on the pool, drop the namespace lock, stop async tasks,
6498 * reacquire the namespace lock, and see if we can export.
6499 */
6500 spa_open_ref(spa, FTAG);
6501 mutex_exit(&spa_namespace_lock);
6502 spa_async_suspend(spa);
a0bd735a
BP
6503 if (spa->spa_zvol_taskq) {
6504 zvol_remove_minors(spa, spa_name(spa), B_TRUE);
6505 taskq_wait(spa->spa_zvol_taskq);
6506 }
34dc7c2f
BB
6507 mutex_enter(&spa_namespace_lock);
6508 spa_close(spa, FTAG);
6509
d14cfd83
IH
6510 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
6511 goto export_spa;
34dc7c2f 6512 /*
d14cfd83
IH
6513 * The pool will be in core if it's openable, in which case we can
6514 * modify its state. Objsets may be open only because they're dirty,
6515 * so we have to force it to sync before checking spa_refcnt.
34dc7c2f 6516 */
0c66c32d 6517 if (spa->spa_sync_on) {
34dc7c2f 6518 txg_wait_synced(spa->spa_dsl_pool, 0);
0c66c32d
JG
6519 spa_evicting_os_wait(spa);
6520 }
34dc7c2f 6521
d14cfd83
IH
6522 /*
6523 * A pool cannot be exported or destroyed if there are active
6524 * references. If we are resetting a pool, allow references by
6525 * fault injection handlers.
6526 */
f4f50a70
WA
6527 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
6528 error = SET_ERROR(EBUSY);
6529 goto fail;
d14cfd83 6530 }
34dc7c2f 6531
d14cfd83 6532 if (spa->spa_sync_on) {
88b199c2 6533 vdev_t *rvd = spa->spa_root_vdev;
b128c09f
BB
6534 /*
6535 * A pool cannot be exported if it has an active shared spare.
 6536 * This is to prevent other pools from stealing the active spare
 6537 * from an exported pool. At the user's own discretion, such a pool
 6538 * can still be forcibly exported.
6539 */
6540 if (!force && new_state == POOL_STATE_EXPORTED &&
6541 spa_has_active_shared_spare(spa)) {
f4f50a70
WA
6542 error = SET_ERROR(EXDEV);
6543 goto fail;
b128c09f 6544 }
34dc7c2f 6545
619f0976
GW
6546 /*
6547 * We're about to export or destroy this pool. Make sure
1b939560
BB
6548 * we stop all initialization and trim activity here before
6549 * we set the spa_final_txg. This will ensure that all
619f0976
GW
6550 * dirty data resulting from the initialization is
6551 * committed to disk before we unload the pool.
6552 */
88b199c2
RY
6553 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
6554 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
6555 vdev_autotrim_stop_all(spa);
6556 vdev_rebuild_stop_all(spa);
619f0976 6557
34dc7c2f
BB
6558 /*
6559 * We want this to be reflected on every label,
6560 * so mark them all dirty. spa_unload() will do the
6561 * final sync that pushes these changes out.
6562 */
fb5f0bc8 6563 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
b128c09f 6564 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 6565 spa->spa_state = new_state;
88b199c2 6566 vdev_config_dirty(rvd);
2fb52853
GA
6567 spa_config_exit(spa, SCL_ALL, FTAG);
6568 }
6569
6570 /*
6571 * If the log space map feature is enabled and the pool is
6572 * getting exported (but not destroyed), we want to spend some
6573 * time flushing as many metaslabs as we can in an attempt to
6574 * destroy log space maps and save import time. This has to be
6575 * done before we set the spa_final_txg, otherwise
6576 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
6577 * spa_should_flush_logs_on_unload() should be called after
6578 * spa_state has been set to the new_state.
6579 */
6580 if (spa_should_flush_logs_on_unload(spa))
6581 spa_unload_log_sm_flush_all(spa);
6582
6583 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
6584 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
428870ff
BB
6585 spa->spa_final_txg = spa_last_synced_txg(spa) +
6586 TXG_DEFER_SIZE + 1;
b128c09f 6587 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6588 }
6589 }
6590
d14cfd83 6591export_spa:
4759342a
JL
6592 spa_export_os(spa);
6593
d5e024cb
BB
6594 if (new_state == POOL_STATE_DESTROYED)
6595 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
6596 else if (new_state == POOL_STATE_EXPORTED)
6597 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
34dc7c2f
BB
6598
6599 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6600 spa_unload(spa);
6601 spa_deactivate(spa);
6602 }
6603
6604 if (oldconfig && spa->spa_config)
65ad5d11 6605 *oldconfig = fnvlist_dup(spa->spa_config);
34dc7c2f
BB
6606
6607 if (new_state != POOL_STATE_UNINITIALIZED) {
fb5f0bc8 6608 if (!hardforce)
55c12724 6609 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
34dc7c2f 6610 spa_remove(spa);
43a85362
SD
6611 } else {
6612 /*
6613 * If spa_remove() is not called for this spa_t and
6614 * there is any possibility that it can be reused,
6615 * we make sure to reset the exporting flag.
6616 */
6617 spa->spa_is_exporting = B_FALSE;
34dc7c2f 6618 }
34dc7c2f 6619
43a85362 6620 mutex_exit(&spa_namespace_lock);
34dc7c2f 6621 return (0);
f4f50a70
WA
6622
6623fail:
6624 spa->spa_is_exporting = B_FALSE;
6625 spa_async_resume(spa);
6626 mutex_exit(&spa_namespace_lock);
6627 return (error);
34dc7c2f
BB
6628}
6629
6630/*
6631 * Destroy a storage pool.
6632 */
6633int
4d55ea81 6634spa_destroy(const char *pool)
34dc7c2f 6635{
fb5f0bc8
BB
6636 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
6637 B_FALSE, B_FALSE));
34dc7c2f
BB
6638}
6639
6640/*
6641 * Export a storage pool.
6642 */
6643int
4d55ea81 6644spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
fb5f0bc8 6645 boolean_t hardforce)
34dc7c2f 6646{
fb5f0bc8
BB
6647 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
6648 force, hardforce));
34dc7c2f
BB
6649}
6650
6651/*
6652 * Similar to spa_export(), this unloads the spa_t without actually removing it
6653 * from the namespace in any way.
6654 */
6655int
4d55ea81 6656spa_reset(const char *pool)
34dc7c2f 6657{
b128c09f 6658 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
fb5f0bc8 6659 B_FALSE, B_FALSE));
34dc7c2f
BB
6660}
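/*
 * Illustrative sketch (not part of spa.c): spa_destroy(), spa_export() and
 * spa_reset() above are thin wrappers around one common teardown path,
 * differing only in the target pool state and the force flags.  The
 * standalone model below uses hypothetical names (pool_teardown_common(),
 * pool_state_t) to show that shape only; it does not reproduce the real
 * spa_export_common() logic.
 */
#include <stdio.h>
#include <stdbool.h>

typedef enum {
	POOL_UNINITIALIZED,	/* reset: unload only, keep namespace entry */
	POOL_EXPORTED,		/* export */
	POOL_DESTROYED		/* destroy */
} pool_state_t;

static int
pool_teardown_common(const char *name, pool_state_t new_state,
    bool force, bool hardforce)
{
	/* One shared path: checks, unload, state change, notifications. */
	printf("%s -> state %d (force=%d hardforce=%d)\n",
	    name, new_state, force, hardforce);
	return (0);
}

static int pool_destroy(const char *name)
{ return (pool_teardown_common(name, POOL_DESTROYED, false, false)); }

static int pool_export(const char *name, bool force, bool hardforce)
{ return (pool_teardown_common(name, POOL_EXPORTED, force, hardforce)); }

static int pool_reset(const char *name)
{ return (pool_teardown_common(name, POOL_UNINITIALIZED, false, false)); }

int
main(void)
{
	(void) pool_destroy("tank");
	(void) pool_export("tank", true, false);
	(void) pool_reset("tank");
	return (0);
}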
6661
34dc7c2f
BB
6662/*
6663 * ==========================================================================
6664 * Device manipulation
6665 * ==========================================================================
6666 */
6667
b2255edc
BB
6668/*
6669 * This is called as a synctask to increment the draid feature flag
6670 */
6671static void
6672spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
6673{
6674 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6675 int draid = (int)(uintptr_t)arg;
6676
6677 for (int c = 0; c < draid; c++)
6678 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
6679}
6680
34dc7c2f
BB
6681/*
6682 * Add a device to a storage pool.
6683 */
6684int
6685spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
6686{
b2255edc 6687 uint64_t txg, ndraid = 0;
fb5f0bc8 6688 int error;
34dc7c2f
BB
6689 vdev_t *rvd = spa->spa_root_vdev;
6690 vdev_t *vd, *tvd;
6691 nvlist_t **spares, **l2cache;
6692 uint_t nspares, nl2cache;
6693
572e2857
BB
6694 ASSERT(spa_writeable(spa));
6695
34dc7c2f
BB
6696 txg = spa_vdev_enter(spa);
6697
6698 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
6699 VDEV_ALLOC_ADD)) != 0)
6700 return (spa_vdev_exit(spa, NULL, txg, error));
6701
b128c09f 6702 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
34dc7c2f
BB
6703
6704 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
6705 &nspares) != 0)
6706 nspares = 0;
6707
6708 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
6709 &nl2cache) != 0)
6710 nl2cache = 0;
6711
b128c09f 6712 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
34dc7c2f 6713 return (spa_vdev_exit(spa, vd, txg, EINVAL));
34dc7c2f 6714
b128c09f 6715 if (vd->vdev_children != 0 &&
b2255edc 6716 (error = vdev_create(vd, txg, B_FALSE)) != 0) {
b128c09f 6717 return (spa_vdev_exit(spa, vd, txg, error));
b2255edc
BB
6718 }
6719
6720 /*
6721 * The virtual dRAID spares must be added after the vdev tree is created
bf169e9f 6722 * and the vdev guids are generated. The guid of their associated
b2255edc
BB
6723 * dRAID is stored in the config and used when opening the spare.
6724 */
6725 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
6726 rvd->vdev_children)) == 0) {
6727 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
6728 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
6729 nspares = 0;
6730 } else {
6731 return (spa_vdev_exit(spa, vd, txg, error));
6732 }
34dc7c2f
BB
6733
6734 /*
6735 * We must validate the spares and l2cache devices after checking the
6736 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
6737 */
b128c09f 6738 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
34dc7c2f 6739 return (spa_vdev_exit(spa, vd, txg, error));
34dc7c2f
BB
6740
6741 /*
a1d477c2
MA
6742 * If we are in the middle of a device removal, we can only add
6743 * devices which match the existing devices in the pool.
6744 * If we are in the middle of a removal, or have some indirect
b2255edc 6745 * vdevs, we can not add raidz or dRAID top levels.
34dc7c2f 6746 */
a1d477c2
MA
6747 if (spa->spa_vdev_removal != NULL ||
6748 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
6749 for (int c = 0; c < vd->vdev_children; c++) {
6750 tvd = vd->vdev_child[c];
6751 if (spa->spa_vdev_removal != NULL &&
9e052db4 6752 tvd->vdev_ashift != spa->spa_max_ashift) {
a1d477c2
MA
6753 return (spa_vdev_exit(spa, vd, txg, EINVAL));
6754 }
b2255edc
BB
6755 /* Fail if top level vdev is raidz or a dRAID */
6756 if (vdev_get_nparity(tvd) != 0)
a1d477c2 6757 return (spa_vdev_exit(spa, vd, txg, EINVAL));
b2255edc 6758
a1d477c2
MA
6759 /*
6760 * Need the top level mirror to be
6761 * a mirror of leaf vdevs only
6762 */
6763 if (tvd->vdev_ops == &vdev_mirror_ops) {
6764 for (uint64_t cid = 0;
6765 cid < tvd->vdev_children; cid++) {
6766 vdev_t *cvd = tvd->vdev_child[cid];
6767 if (!cvd->vdev_ops->vdev_op_leaf) {
6768 return (spa_vdev_exit(spa, vd,
6769 txg, EINVAL));
6770 }
6771 }
6772 }
6773 }
6774 }
6775
1c27024e 6776 for (int c = 0; c < vd->vdev_children; c++) {
34dc7c2f
BB
6777 tvd = vd->vdev_child[c];
6778 vdev_remove_child(vd, tvd);
93e28d66 6779 tvd->vdev_id = rvd->vdev_children;
34dc7c2f
BB
6780 vdev_add_child(rvd, tvd);
6781 vdev_config_dirty(tvd);
6782 }
6783
6784 if (nspares != 0) {
6785 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
6786 ZPOOL_CONFIG_SPARES);
6787 spa_load_spares(spa);
6788 spa->spa_spares.sav_sync = B_TRUE;
6789 }
6790
6791 if (nl2cache != 0) {
6792 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
6793 ZPOOL_CONFIG_L2CACHE);
6794 spa_load_l2cache(spa);
6795 spa->spa_l2cache.sav_sync = B_TRUE;
6796 }
6797
b2255edc
BB
6798 /*
6799 * We can't increment a feature while holding spa_vdev so we
6800 * have to do it in a synctask.
6801 */
6802 if (ndraid != 0) {
6803 dmu_tx_t *tx;
6804
6805 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
6806 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
6807 (void *)(uintptr_t)ndraid, tx);
6808 dmu_tx_commit(tx);
6809 }
6810
34dc7c2f
BB
6811 /*
6812 * We have to be careful when adding new vdevs to an existing pool.
6813 * If other threads start allocating from these vdevs before we
6814 * sync the config cache, and we lose power, then upon reboot we may
6815 * fail to open the pool because there are DVAs that the config cache
6816 * can't translate. Therefore, we first add the vdevs without
6817 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
6818 * and then let spa_config_update() initialize the new metaslabs.
6819 *
6820 * spa_load() checks for added-but-not-initialized vdevs, so that
6821 * if we lose power at any point in this sequence, the remaining
6822 * steps will be completed the next time we load the pool.
6823 */
6824 (void) spa_vdev_exit(spa, vd, txg, 0);
6825
6826 mutex_enter(&spa_namespace_lock);
6827 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
12fa0466 6828 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
34dc7c2f
BB
6829 mutex_exit(&spa_namespace_lock);
6830
6831 return (0);
6832}
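/*
 * Illustrative sketch (not part of spa.c): spa_vdev_add() relies on a
 * two-phase, crash-safe sequence -- persist the new vdevs in the config
 * first, then initialize their metaslabs -- so that a power loss between the
 * two steps is repaired on the next load.  The minimal model below uses
 * hypothetical names (toy_vdev_t, toy_add_vdev(), toy_load()) to show the
 * shape of that pattern only; it does not reproduce the real metaslab or
 * config machinery.
 */
#include <stdio.h>
#include <stdbool.h>

typedef struct toy_vdev {
	const char	*tv_name;
	bool		tv_persisted;		/* phase 1: in the on-disk config */
	bool		tv_metaslabs_ready;	/* phase 2: allocatable */
} toy_vdev_t;

static void
toy_sync_config(toy_vdev_t *vd)
{
	/* Phase 1: make the vdev visible in the config before it is usable. */
	vd->tv_persisted = true;
}

static void
toy_init_metaslabs(toy_vdev_t *vd)
{
	/* Phase 2: only now may allocations land on the new vdev. */
	vd->tv_metaslabs_ready = true;
}

static void
toy_add_vdev(toy_vdev_t *vd)
{
	toy_sync_config(vd);	/* a crash here is safe ... */
	toy_init_metaslabs(vd);	/* ... because toy_load() finishes the job */
}

static void
toy_load(toy_vdev_t *vd)
{
	/* Mirrors the spa_load() repair of added-but-uninitialized vdevs. */
	if (vd->tv_persisted && !vd->tv_metaslabs_ready)
		toy_init_metaslabs(vd);
}

int
main(void)
{
	toy_vdev_t vd = { .tv_name = "new-disk" };

	toy_sync_config(&vd);		/* simulate a crash after phase 1 */
	toy_load(&vd);			/* the next import completes phase 2 */
	printf("%s ready=%d\n", vd.tv_name, vd.tv_metaslabs_ready);

	toy_vdev_t vd2 = { .tv_name = "other-disk" };
	toy_add_vdev(&vd2);		/* the normal, uninterrupted path */
	printf("%s ready=%d\n", vd2.tv_name, vd2.tv_metaslabs_ready);
	return (0);
}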
6833
6834/*
6835 * Attach a device to a mirror. The arguments are the guid of any device
6836 * in the mirror, and the nvroot for the new device. If the specified
6837 * device is not mirrored, we automatically insert the mirror vdev.
6838 *
6839 * If 'replacing' is specified, the new device is intended to replace the
6840 * existing device; in this case the two devices are made into their own
6841 * mirror using the 'replacing' vdev, which is functionally identical to
6842 * the mirror vdev (it actually reuses all the same ops) but has a few
6843 * extra rules: you can't attach to it after it's been created, and upon
6844 * completion of resilvering, the first disk (the one being replaced)
6845 * is automatically detached.
9a49d3f3
BB
6846 *
6847 * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
6848 * should be performed instead of traditional healing reconstruction. From
6849 * an administrator's perspective these are both resilver operations.
34dc7c2f
BB
6850 */
6851int
9a49d3f3
BB
6852spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
6853 int rebuild)
34dc7c2f 6854{
428870ff 6855 uint64_t txg, dtl_max_txg;
9a49d3f3 6856 vdev_t *rvd = spa->spa_root_vdev;
34dc7c2f
BB
6857 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
6858 vdev_ops_t *pvops;
b128c09f
BB
6859 char *oldvdpath, *newvdpath;
6860 int newvd_isspare;
6861 int error;
34dc7c2f 6862
572e2857
BB
6863 ASSERT(spa_writeable(spa));
6864
34dc7c2f
BB
6865 txg = spa_vdev_enter(spa);
6866
b128c09f 6867 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
34dc7c2f 6868
d2734cce
SD
6869 ASSERT(MUTEX_HELD(&spa_namespace_lock));
6870 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
6871 error = (spa_has_checkpoint(spa)) ?
6872 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
6873 return (spa_vdev_exit(spa, NULL, txg, error));
6874 }
6875
9a49d3f3
BB
6876 if (rebuild) {
6877 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
6878 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6879
9d618615
A
6880 if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
6881 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
9a49d3f3
BB
6882 return (spa_vdev_exit(spa, NULL, txg,
6883 ZFS_ERR_RESILVER_IN_PROGRESS));
9d618615 6884 }
9a49d3f3
BB
6885 } else {
6886 if (vdev_rebuild_active(rvd))
6887 return (spa_vdev_exit(spa, NULL, txg,
6888 ZFS_ERR_REBUILD_IN_PROGRESS));
6889 }
6890
9e052db4 6891 if (spa->spa_vdev_removal != NULL)
a1d477c2 6892 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
a1d477c2 6893
34dc7c2f
BB
6894 if (oldvd == NULL)
6895 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
6896
6897 if (!oldvd->vdev_ops->vdev_op_leaf)
6898 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6899
6900 pvd = oldvd->vdev_parent;
6901
6a42939f
RY
6902 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
6903 VDEV_ALLOC_ATTACH) != 0)
34dc7c2f
BB
6904 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6905
6906 if (newrootvd->vdev_children != 1)
6907 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
6908
6909 newvd = newrootvd->vdev_child[0];
6910
6911 if (!newvd->vdev_ops->vdev_op_leaf)
6912 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
6913
6914 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
6915 return (spa_vdev_exit(spa, newrootvd, txg, error));
6916
6917 /*
c23738c7 6918 * log, dedup and special vdevs should not be replaced by spares.
34dc7c2f 6919 */
c23738c7
AH
6920 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
6921 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
34dc7c2f 6922 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
c23738c7 6923 }
34dc7c2f 6924
b2255edc
BB
6925 /*
6926 * A dRAID spare can only replace a child of its parent dRAID vdev.
6927 */
6928 if (newvd->vdev_ops == &vdev_draid_spare_ops &&
6929 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
6930 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6931 }
6932
9a49d3f3
BB
6933 if (rebuild) {
6934 /*
b2255edc 6935 * For rebuilds, the top vdev must support reconstruction
9a49d3f3 6936 * using only space maps. This means the only allowable
b2255edc 6937 * vdevs types are the root vdev, a mirror, or dRAID.
9a49d3f3 6938 */
b2255edc
BB
6939 tvd = pvd;
6940 if (pvd->vdev_top != NULL)
6941 tvd = pvd->vdev_top;
6942
6943 if (tvd->vdev_ops != &vdev_mirror_ops &&
6944 tvd->vdev_ops != &vdev_root_ops &&
6945 tvd->vdev_ops != &vdev_draid_ops) {
9a49d3f3
BB
6946 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6947 }
6948 }
6949
34dc7c2f
BB
6950 if (!replacing) {
6951 /*
6952 * For attach, the only allowable parent is a mirror or the root
6953 * vdev.
6954 */
6955 if (pvd->vdev_ops != &vdev_mirror_ops &&
6956 pvd->vdev_ops != &vdev_root_ops)
6957 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6958
6959 pvops = &vdev_mirror_ops;
6960 } else {
6961 /*
6962 * Active hot spares can only be replaced by inactive hot
6963 * spares.
6964 */
6965 if (pvd->vdev_ops == &vdev_spare_ops &&
572e2857 6966 oldvd->vdev_isspare &&
34dc7c2f
BB
6967 !spa_has_spare(spa, newvd->vdev_guid))
6968 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6969
6970 /*
6971 * If the source is a hot spare, and the parent isn't already a
6972 * spare, then we want to create a new hot spare. Otherwise, we
6973 * want to create a replacing vdev. The user is not allowed to
6974 * attach to a spared vdev child unless the 'isspare' state is
6975 * the same (spare replaces spare, non-spare replaces
6976 * non-spare).
6977 */
572e2857
BB
6978 if (pvd->vdev_ops == &vdev_replacing_ops &&
6979 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
34dc7c2f 6980 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
572e2857
BB
6981 } else if (pvd->vdev_ops == &vdev_spare_ops &&
6982 newvd->vdev_isspare != oldvd->vdev_isspare) {
34dc7c2f 6983 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
572e2857
BB
6984 }
6985
6986 if (newvd->vdev_isspare)
34dc7c2f
BB
6987 pvops = &vdev_spare_ops;
6988 else
6989 pvops = &vdev_replacing_ops;
6990 }
6991
6992 /*
9babb374 6993 * Make sure the new device is big enough.
34dc7c2f 6994 */
9babb374 6995 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
34dc7c2f
BB
6996 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
6997
6998 /*
6999 * The new device cannot have a higher alignment requirement
7000 * than the top-level vdev.
7001 */
7002 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
9a49d3f3 7003 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
34dc7c2f
BB
7004
7005 /*
7006 * If this is an in-place replacement, update oldvd's path and devid
7007 * to make it distinguishable from newvd, and unopenable from now on.
7008 */
7009 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
7010 spa_strfree(oldvd->vdev_path);
7011 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
79c76d5b 7012 KM_SLEEP);
c9e319fa
JL
7013 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
7014 "%s/%s", newvd->vdev_path, "old");
34dc7c2f
BB
7015 if (oldvd->vdev_devid != NULL) {
7016 spa_strfree(oldvd->vdev_devid);
7017 oldvd->vdev_devid = NULL;
7018 }
7019 }
7020
7021 /*
7022 * If the parent is not a mirror, or if we're replacing, insert the new
7023 * mirror/replacing/spare vdev above oldvd.
7024 */
7025 if (pvd->vdev_ops != pvops)
7026 pvd = vdev_add_parent(oldvd, pvops);
7027
7028 ASSERT(pvd->vdev_top->vdev_parent == rvd);
7029 ASSERT(pvd->vdev_ops == pvops);
7030 ASSERT(oldvd->vdev_parent == pvd);
7031
7032 /*
7033 * Extract the new device from its root and add it to pvd.
7034 */
7035 vdev_remove_child(newrootvd, newvd);
7036 newvd->vdev_id = pvd->vdev_children;
428870ff 7037 newvd->vdev_crtxg = oldvd->vdev_crtxg;
34dc7c2f
BB
7038 vdev_add_child(pvd, newvd);
7039
6d82f98c
IH
7040 /*
7041 * Reevaluate the parent vdev state.
7042 */
7043 vdev_propagate_state(pvd);
7044
34dc7c2f
BB
7045 tvd = newvd->vdev_top;
7046 ASSERT(pvd->vdev_top == tvd);
7047 ASSERT(tvd->vdev_parent == rvd);
7048
7049 vdev_config_dirty(tvd);
7050
7051 /*
428870ff
BB
7052 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
7053 * for any dmu_sync-ed blocks. It will propagate upward when
7054 * spa_vdev_exit() calls vdev_dtl_reassess().
34dc7c2f 7055 */
428870ff 7056 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
34dc7c2f 7057
9a49d3f3
BB
7058 vdev_dtl_dirty(newvd, DTL_MISSING,
7059 TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
34dc7c2f 7060
9babb374 7061 if (newvd->vdev_isspare) {
34dc7c2f 7062 spa_spare_activate(newvd);
12fa0466 7063 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
9babb374
BB
7064 }
7065
b128c09f
BB
7066 oldvdpath = spa_strdup(oldvd->vdev_path);
7067 newvdpath = spa_strdup(newvd->vdev_path);
7068 newvd_isspare = newvd->vdev_isspare;
34dc7c2f
BB
7069
7070 /*
7071 * Mark newvd's DTL dirty in this txg.
7072 */
7073 vdev_dirty(tvd, VDD_DTL, newvd, txg);
7074
428870ff 7075 /*
9a49d3f3
BB
7076 * Schedule the resilver or rebuild to restart in the future. We do
7077 * this to ensure that dmu_sync-ed blocks have been stitched into the
7078 * respective datasets.
428870ff 7079 */
9a49d3f3
BB
7080 if (rebuild) {
7081 newvd->vdev_rebuild_txg = txg;
7082
7083 vdev_rebuild(tvd);
7084 } else {
7085 newvd->vdev_resilver_txg = txg;
7086
7087 if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
7088 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
7089 vdev_defer_resilver(newvd);
7090 } else {
7091 dsl_scan_restart_resilver(spa->spa_dsl_pool,
7092 dtl_max_txg);
7093 }
7094 }
428870ff 7095
fb390aaf 7096 if (spa->spa_bootfs)
12fa0466 7097 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
fb390aaf 7098
12fa0466 7099 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
fb390aaf 7100
428870ff
BB
7101 /*
7102 * Commit the config
7103 */
7104 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
34dc7c2f 7105
6f1ffb06 7106 spa_history_log_internal(spa, "vdev attach", NULL,
428870ff 7107 "%s vdev=%s %s vdev=%s",
45d1cae3
BB
7108 replacing && newvd_isspare ? "spare in" :
7109 replacing ? "replace" : "attach", newvdpath,
7110 replacing ? "for" : "to", oldvdpath);
b128c09f
BB
7111
7112 spa_strfree(oldvdpath);
7113 spa_strfree(newvdpath);
7114
34dc7c2f
BB
7115 return (0);
7116}
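/*
 * Illustrative sketch (not part of spa.c): the core of the attach/replace
 * decision above is the choice of parent vdev type.  The hypothetical
 * choose_parent_kind() below condenses that choice into a pure function:
 * a plain attach always yields a mirror parent, while a replace yields
 * either a 'spare' or a 'replacing' parent depending on whether the new
 * device is a hot spare.  This is a simplified model and omits the many
 * additional validity checks performed by spa_vdev_attach().
 */
#include <stdio.h>
#include <stdbool.h>

typedef enum {
	PARENT_MIRROR,		/* vdev_mirror_ops */
	PARENT_REPLACING,	/* vdev_replacing_ops */
	PARENT_SPARE		/* vdev_spare_ops */
} parent_kind_t;

static parent_kind_t
choose_parent_kind(bool replacing, bool newvd_isspare)
{
	if (!replacing)
		return (PARENT_MIRROR);
	return (newvd_isspare ? PARENT_SPARE : PARENT_REPLACING);
}

int
main(void)
{
	printf("attach           -> %d\n", choose_parent_kind(false, false));
	printf("replace w/ disk  -> %d\n", choose_parent_kind(true, false));
	printf("replace w/ spare -> %d\n", choose_parent_kind(true, true));
	return (0);
}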
7117
7118/*
7119 * Detach a device from a mirror or replacing vdev.
d3cc8b15 7120 *
34dc7c2f 7121 * If 'replace_done' is specified, only detach if the parent
719534ca 7122 * is a replacing or a spare vdev.
34dc7c2f
BB
7123 */
7124int
fb5f0bc8 7125spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
34dc7c2f
BB
7126{
7127 uint64_t txg;
fb5f0bc8 7128 int error;
2a8ba608 7129 vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
34dc7c2f
BB
7130 vdev_t *vd, *pvd, *cvd, *tvd;
7131 boolean_t unspare = B_FALSE;
d4ed6673 7132 uint64_t unspare_guid = 0;
428870ff 7133 char *vdpath;
1c27024e 7134
572e2857
BB
7135 ASSERT(spa_writeable(spa));
7136
9a49d3f3 7137 txg = spa_vdev_detach_enter(spa, guid);
34dc7c2f 7138
b128c09f 7139 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
34dc7c2f 7140
d2734cce
SD
7141 /*
7142 * Besides being called directly from the userland through the
7143 * ioctl interface, spa_vdev_detach() can be potentially called
7144 * at the end of spa_vdev_resilver_done().
7145 *
7146 * In the regular case, when we have a checkpoint this shouldn't
7147 * happen as we never empty the DTLs of a vdev during the scrub
7148 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
7149 * should never get here when we have a checkpoint.
7150 *
7151 * That said, even in a case when we checkpoint the pool exactly
7152 * as spa_vdev_resilver_done() calls this function everything
7153 * should be fine as the resilver will return right away.
7154 */
7155 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7156 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7157 error = (spa_has_checkpoint(spa)) ?
7158 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7159 return (spa_vdev_exit(spa, NULL, txg, error));
7160 }
7161
34dc7c2f
BB
7162 if (vd == NULL)
7163 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
7164
7165 if (!vd->vdev_ops->vdev_op_leaf)
7166 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7167
7168 pvd = vd->vdev_parent;
7169
fb5f0bc8
BB
7170 /*
7171 * If the parent/child relationship is not as expected, don't do it.
7172 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
7173 * vdev that's replacing B with C. The user's intent in replacing
7174 * is to go from M(A,B) to M(A,C). If the user decides to cancel
7175 * the replace by detaching C, the expected behavior is to end up
7176 * M(A,B). But suppose that right after deciding to detach C,
7177 * the replacement of B completes. We would have M(A,C), and then
7178 * ask to detach C, which would leave us with just A -- not what
7179 * the user wanted. To prevent this, we make sure that the
7180 * parent/child relationship hasn't changed -- in this example,
7181 * that C's parent is still the replacing vdev R.
7182 */
7183 if (pvd->vdev_guid != pguid && pguid != 0)
7184 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
7185
34dc7c2f 7186 /*
572e2857 7187 * Only 'replacing' or 'spare' vdevs can be replaced.
34dc7c2f 7188 */
572e2857
BB
7189 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
7190 pvd->vdev_ops != &vdev_spare_ops)
7191 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
34dc7c2f
BB
7192
7193 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
7194 spa_version(spa) >= SPA_VERSION_SPARES);
7195
7196 /*
7197 * Only mirror, replacing, and spare vdevs support detach.
7198 */
7199 if (pvd->vdev_ops != &vdev_replacing_ops &&
7200 pvd->vdev_ops != &vdev_mirror_ops &&
7201 pvd->vdev_ops != &vdev_spare_ops)
7202 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7203
7204 /*
fb5f0bc8
BB
7205 * If this device has the only valid copy of some data,
7206 * we cannot safely detach it.
34dc7c2f 7207 */
fb5f0bc8 7208 if (vdev_dtl_required(vd))
34dc7c2f
BB
7209 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
7210
fb5f0bc8 7211 ASSERT(pvd->vdev_children >= 2);
34dc7c2f 7212
b128c09f
BB
7213 /*
7214 * If we are detaching the second disk from a replacing vdev, then
7215 * check to see if we changed the original vdev's path to have "/old"
7216 * at the end in spa_vdev_attach(). If so, undo that change now.
7217 */
572e2857
BB
7218 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
7219 vd->vdev_path != NULL) {
7220 size_t len = strlen(vd->vdev_path);
7221
1c27024e 7222 for (int c = 0; c < pvd->vdev_children; c++) {
572e2857
BB
7223 cvd = pvd->vdev_child[c];
7224
7225 if (cvd == vd || cvd->vdev_path == NULL)
7226 continue;
7227
7228 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
7229 strcmp(cvd->vdev_path + len, "/old") == 0) {
7230 spa_strfree(cvd->vdev_path);
7231 cvd->vdev_path = spa_strdup(vd->vdev_path);
7232 break;
7233 }
b128c09f
BB
7234 }
7235 }
7236
34dc7c2f 7237 /*
b2255edc
BB
7238 * If we are detaching the original disk from a normal spare, then it
7239 * implies that the spare should become a real disk, and be removed
7240 * from the active spare list for the pool. dRAID spares on the
7241 * other hand are coupled to the pool and thus should never be removed
7242 * from the spares list.
34dc7c2f 7243 */
b2255edc
BB
7244 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
7245 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
7246
7247 if (last_cvd->vdev_isspare &&
7248 last_cvd->vdev_ops != &vdev_draid_spare_ops) {
7249 unspare = B_TRUE;
7250 }
7251 }
34dc7c2f
BB
7252
7253 /*
7254 * Erase the disk labels so the disk can be used for other things.
7255 * This must be done after all other error cases are handled,
7256 * but before we disembowel vd (so we can still do I/O to it).
7257 * But if we can't do it, don't treat the error as fatal --
7258 * it may be that the unwritability of the disk is the reason
7259 * it's being detached!
7260 */
6a42939f 7261 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
34dc7c2f
BB
7262
7263 /*
7264 * Remove vd from its parent and compact the parent's children.
7265 */
7266 vdev_remove_child(pvd, vd);
7267 vdev_compact_children(pvd);
7268
7269 /*
7270 * Remember one of the remaining children so we can get tvd below.
7271 */
572e2857 7272 cvd = pvd->vdev_child[pvd->vdev_children - 1];
34dc7c2f
BB
7273
7274 /*
7275 * If we need to remove the remaining child from the list of hot spares,
fb5f0bc8
BB
7276 * do it now, marking the vdev as no longer a spare in the process.
7277 * We must do this before vdev_remove_parent(), because that can
7278 * change the GUID if it creates a new toplevel GUID. For a similar
7279 * reason, we must remove the spare now, in the same txg as the detach;
7280 * otherwise someone could attach a new sibling, change the GUID, and
7281 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
34dc7c2f
BB
7282 */
7283 if (unspare) {
7284 ASSERT(cvd->vdev_isspare);
7285 spa_spare_remove(cvd);
7286 unspare_guid = cvd->vdev_guid;
fb5f0bc8 7287 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
572e2857 7288 cvd->vdev_unspare = B_TRUE;
34dc7c2f
BB
7289 }
7290
428870ff
BB
7291 /*
7292 * If the parent mirror/replacing vdev only has one child,
7293 * the parent is no longer needed. Remove it from the tree.
7294 */
572e2857
BB
7295 if (pvd->vdev_children == 1) {
7296 if (pvd->vdev_ops == &vdev_spare_ops)
7297 cvd->vdev_unspare = B_FALSE;
428870ff 7298 vdev_remove_parent(cvd);
572e2857
BB
7299 }
7300
428870ff
BB
7301 /*
7302 * We don't set tvd until now because the parent we just removed
7303 * may have been the previous top-level vdev.
7304 */
7305 tvd = cvd->vdev_top;
7306 ASSERT(tvd->vdev_parent == rvd);
7307
7308 /*
7309 * Reevaluate the parent vdev state.
7310 */
7311 vdev_propagate_state(cvd);
7312
7313 /*
7314 * If the 'autoexpand' property is set on the pool then automatically
7315 * try to expand the size of the pool. For example if the device we
7316 * just detached was smaller than the others, it may be possible to
7317 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
7318 * first so that we can obtain the updated sizes of the leaf vdevs.
7319 */
7320 if (spa->spa_autoexpand) {
7321 vdev_reopen(tvd);
7322 vdev_expand(tvd, txg);
7323 }
7324
7325 vdev_config_dirty(tvd);
7326
7327 /*
7328 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
7329 * vd->vdev_detached is set and free vd's DTL object in syncing context.
7330 * But first make sure we're not on any *other* txg's DTL list, to
7331 * prevent vd from being accessed after it's freed.
7332 */
b6ca6193 7333 vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
1c27024e 7334 for (int t = 0; t < TXG_SIZE; t++)
428870ff
BB
7335 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
7336 vd->vdev_detached = B_TRUE;
7337 vdev_dirty(tvd, VDD_DTL, vd, txg);
7338
12fa0466 7339 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
e60e158e 7340 spa_notify_waiters(spa);
428870ff 7341
572e2857
BB
7342 /* hang on to the spa before we release the lock */
7343 spa_open_ref(spa, FTAG);
7344
428870ff
BB
7345 error = spa_vdev_exit(spa, vd, txg, 0);
7346
6f1ffb06 7347 spa_history_log_internal(spa, "detach", NULL,
428870ff
BB
7348 "vdev=%s", vdpath);
7349 spa_strfree(vdpath);
7350
7351 /*
7352 * If this was the removal of the original device in a hot spare vdev,
7353 * then we want to go through and remove the device from the hot spare
7354 * list of every other pool.
7355 */
7356 if (unspare) {
572e2857
BB
7357 spa_t *altspa = NULL;
7358
428870ff 7359 mutex_enter(&spa_namespace_lock);
572e2857
BB
7360 while ((altspa = spa_next(altspa)) != NULL) {
7361 if (altspa->spa_state != POOL_STATE_ACTIVE ||
7362 altspa == spa)
428870ff 7363 continue;
572e2857
BB
7364
7365 spa_open_ref(altspa, FTAG);
428870ff 7366 mutex_exit(&spa_namespace_lock);
572e2857 7367 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
428870ff 7368 mutex_enter(&spa_namespace_lock);
572e2857 7369 spa_close(altspa, FTAG);
428870ff
BB
7370 }
7371 mutex_exit(&spa_namespace_lock);
572e2857
BB
7372
7373 /* search the rest of the vdevs for spares to remove */
7374 spa_vdev_resilver_done(spa);
428870ff
BB
7375 }
7376
572e2857
BB
7377 /* all done with the spa; OK to release */
7378 mutex_enter(&spa_namespace_lock);
7379 spa_close(spa, FTAG);
7380 mutex_exit(&spa_namespace_lock);
7381
428870ff
BB
7382 return (error);
7383}
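/*
 * Illustrative sketch (not part of spa.c): the M(A,R(B,C)) race described in
 * spa_vdev_detach() is avoided by passing in the parent guid the caller saw
 * and rejecting the detach if the topology changed underneath it.  The toy
 * detach_if_parent_matches() below (hypothetical names) shows just that
 * compare-before-act guard.
 */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

typedef struct toy_vdev {
	uint64_t		tv_guid;
	struct toy_vdev		*tv_parent;
} toy_vdev_t;

static int
detach_if_parent_matches(toy_vdev_t *vd, uint64_t expected_pguid)
{
	/*
	 * A pguid of 0 means "don't care", matching the pguid != 0 test in
	 * spa_vdev_detach(); otherwise the parent must still be the one the
	 * caller observed when it decided to detach.
	 */
	if (expected_pguid != 0 && vd->tv_parent->tv_guid != expected_pguid)
		return (EBUSY);
	printf("detaching vdev %llu\n", (unsigned long long)vd->tv_guid);
	return (0);
}

int
main(void)
{
	toy_vdev_t replacing = { .tv_guid = 100 };
	toy_vdev_t mirror = { .tv_guid = 200 };
	toy_vdev_t c = { .tv_guid = 3, .tv_parent = &replacing };

	/* The replace completes and C is promoted under the mirror ... */
	c.tv_parent = &mirror;

	/* ... so a detach that expected parent 100 now fails with EBUSY. */
	printf("ret = %d\n", detach_if_parent_matches(&c, 100));
	return (0);
}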
7384
c10d37dd
GW
7385static int
7386spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
7387 list_t *vd_list)
619f0976 7388{
c10d37dd
GW
7389 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7390
619f0976
GW
7391 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
7392
7393 /* Look up vdev and ensure it's a leaf. */
7394 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
7395 if (vd == NULL || vd->vdev_detached) {
7396 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
619f0976
GW
7397 return (SET_ERROR(ENODEV));
7398 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
7399 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
619f0976
GW
7400 return (SET_ERROR(EINVAL));
7401 } else if (!vdev_writeable(vd)) {
7402 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
619f0976
GW
7403 return (SET_ERROR(EROFS));
7404 }
7405 mutex_enter(&vd->vdev_initialize_lock);
7406 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7407
7408 /*
7409 * When we activate an initialize action we check to see
7410 * if the vdev_initialize_thread is NULL. We do this instead
7411 * of using the vdev_initialize_state since there might be
7412 * a previous initialization process which has completed but
7413 * whose thread has not yet exited.
7414 */
1b939560 7415 if (cmd_type == POOL_INITIALIZE_START &&
619f0976
GW
7416 (vd->vdev_initialize_thread != NULL ||
7417 vd->vdev_top->vdev_removing)) {
7418 mutex_exit(&vd->vdev_initialize_lock);
619f0976
GW
7419 return (SET_ERROR(EBUSY));
7420 } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
7421 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
7422 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
7423 mutex_exit(&vd->vdev_initialize_lock);
619f0976
GW
7424 return (SET_ERROR(ESRCH));
7425 } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
7426 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
7427 mutex_exit(&vd->vdev_initialize_lock);
619f0976 7428 return (SET_ERROR(ESRCH));
e34e15ed
BB
7429 } else if (cmd_type == POOL_INITIALIZE_UNINIT &&
7430 vd->vdev_initialize_thread != NULL) {
7431 mutex_exit(&vd->vdev_initialize_lock);
7432 return (SET_ERROR(EBUSY));
619f0976
GW
7433 }
7434
7435 switch (cmd_type) {
1b939560 7436 case POOL_INITIALIZE_START:
619f0976
GW
7437 vdev_initialize(vd);
7438 break;
7439 case POOL_INITIALIZE_CANCEL:
c10d37dd 7440 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
619f0976
GW
7441 break;
7442 case POOL_INITIALIZE_SUSPEND:
c10d37dd 7443 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
619f0976 7444 break;
e34e15ed
BB
7445 case POOL_INITIALIZE_UNINIT:
7446 vdev_uninitialize(vd);
7447 break;
619f0976
GW
7448 default:
7449 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
7450 }
7451 mutex_exit(&vd->vdev_initialize_lock);
7452
c10d37dd
GW
7453 return (0);
7454}
7455
7456int
7457spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
7458 nvlist_t *vdev_errlist)
7459{
7460 int total_errors = 0;
7461 list_t vd_list;
7462
7463 list_create(&vd_list, sizeof (vdev_t),
7464 offsetof(vdev_t, vdev_initialize_node));
7465
7466 /*
7467 * We hold the namespace lock through the whole function
7468 * to prevent any changes to the pool while we're starting or
7469 * stopping initialization. The config and state locks are held so that
7470 * we can properly assess the vdev state before we commit to
7471 * the initializing operation.
7472 */
7473 mutex_enter(&spa_namespace_lock);
7474
7475 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
7476 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
7477 uint64_t vdev_guid = fnvpair_value_uint64(pair);
7478
7479 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
7480 &vd_list);
7481 if (error != 0) {
7482 char guid_as_str[MAXNAMELEN];
7483
7484 (void) snprintf(guid_as_str, sizeof (guid_as_str),
7485 "%llu", (unsigned long long)vdev_guid);
7486 fnvlist_add_int64(vdev_errlist, guid_as_str, error);
7487 total_errors++;
7488 }
7489 }
7490
7491 /* Wait for all initialize threads to stop. */
7492 vdev_initialize_stop_wait(spa, &vd_list);
7493
619f0976
GW
7494 /* Sync out the initializing state */
7495 txg_wait_synced(spa->spa_dsl_pool, 0);
7496 mutex_exit(&spa_namespace_lock);
7497
c10d37dd 7498 list_destroy(&vd_list);
619f0976 7499
c10d37dd
GW
7500 return (total_errors);
7501}
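/*
 * Illustrative sketch (not part of spa.c): the cmd_type checks in
 * spa_vdev_initialize_impl() amount to a small state machine keyed on the
 * requested command, the current initialize state, and whether a worker
 * thread still exists.  The hypothetical validate_initialize_cmd() below
 * restates those checks as a pure function returning the same errno values.
 */
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

typedef enum { CMD_START, CMD_CANCEL, CMD_SUSPEND, CMD_UNINIT } init_cmd_t;
typedef enum { ST_NONE, ST_ACTIVE, ST_SUSPENDED, ST_COMPLETE } init_state_t;

static int
validate_initialize_cmd(init_cmd_t cmd, init_state_t state,
    bool thread_exists, bool top_removing)
{
	switch (cmd) {
	case CMD_START:
		/* Can't start while a thread lingers or the top vdev is being removed. */
		if (thread_exists || top_removing)
			return (EBUSY);
		return (0);
	case CMD_CANCEL:
		if (state != ST_ACTIVE && state != ST_SUSPENDED)
			return (ESRCH);
		return (0);
	case CMD_SUSPEND:
		if (state != ST_ACTIVE)
			return (ESRCH);
		return (0);
	case CMD_UNINIT:
		if (thread_exists)
			return (EBUSY);
		return (0);
	}
	return (EINVAL);
}

int
main(void)
{
	printf("start while active : %d\n",
	    validate_initialize_cmd(CMD_START, ST_ACTIVE, true, false));
	printf("suspend when idle  : %d\n",
	    validate_initialize_cmd(CMD_SUSPEND, ST_NONE, false, false));
	printf("start when idle    : %d\n",
	    validate_initialize_cmd(CMD_START, ST_NONE, false, false));
	return (0);
}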
619f0976 7502
1b939560
BB
7503static int
7504spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
7505 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
7506{
7507 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7508
7509 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
7510
7511 /* Look up vdev and ensure it's a leaf. */
7512 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
7513 if (vd == NULL || vd->vdev_detached) {
7514 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7515 return (SET_ERROR(ENODEV));
7516 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
7517 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7518 return (SET_ERROR(EINVAL));
7519 } else if (!vdev_writeable(vd)) {
7520 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7521 return (SET_ERROR(EROFS));
7522 } else if (!vd->vdev_has_trim) {
7523 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7524 return (SET_ERROR(EOPNOTSUPP));
7525 } else if (secure && !vd->vdev_has_securetrim) {
7526 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7527 return (SET_ERROR(EOPNOTSUPP));
7528 }
7529 mutex_enter(&vd->vdev_trim_lock);
7530 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7531
7532 /*
7533 * When we activate a TRIM action we check to see if the
7534 * vdev_trim_thread is NULL. We do this instead of using the
7535 * vdev_trim_state since there might be a previous TRIM process
7536 * which has completed but whose thread has not yet exited.
7537 */
7538 if (cmd_type == POOL_TRIM_START &&
7539 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
7540 mutex_exit(&vd->vdev_trim_lock);
7541 return (SET_ERROR(EBUSY));
7542 } else if (cmd_type == POOL_TRIM_CANCEL &&
7543 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
7544 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
7545 mutex_exit(&vd->vdev_trim_lock);
7546 return (SET_ERROR(ESRCH));
7547 } else if (cmd_type == POOL_TRIM_SUSPEND &&
7548 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
7549 mutex_exit(&vd->vdev_trim_lock);
7550 return (SET_ERROR(ESRCH));
7551 }
7552
7553 switch (cmd_type) {
7554 case POOL_TRIM_START:
7555 vdev_trim(vd, rate, partial, secure);
7556 break;
7557 case POOL_TRIM_CANCEL:
7558 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
7559 break;
7560 case POOL_TRIM_SUSPEND:
7561 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
7562 break;
7563 default:
7564 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
7565 }
7566 mutex_exit(&vd->vdev_trim_lock);
7567
7568 return (0);
7569}
7570
7571/*
7572 * Initiates a manual TRIM for the requested vdevs. This kicks off individual
7573 * TRIM threads for each child vdev. These threads pass over all of the free
7574 * space in the vdev's metaslabs and issue TRIM commands for that space.
7575 */
7576int
7577spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
7578 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
7579{
7580 int total_errors = 0;
7581 list_t vd_list;
7582
7583 list_create(&vd_list, sizeof (vdev_t),
7584 offsetof(vdev_t, vdev_trim_node));
7585
7586 /*
7587 * We hold the namespace lock through the whole function
7588 * to prevent any changes to the pool while we're starting or
7589 * stopping TRIM. The config and state locks are held so that
7590 * we can properly assess the vdev state before we commit to
7591 * the TRIM operation.
7592 */
7593 mutex_enter(&spa_namespace_lock);
7594
7595 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
7596 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
7597 uint64_t vdev_guid = fnvpair_value_uint64(pair);
7598
7599 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
7600 rate, partial, secure, &vd_list);
7601 if (error != 0) {
7602 char guid_as_str[MAXNAMELEN];
7603
7604 (void) snprintf(guid_as_str, sizeof (guid_as_str),
7605 "%llu", (unsigned long long)vdev_guid);
7606 fnvlist_add_int64(vdev_errlist, guid_as_str, error);
7607 total_errors++;
7608 }
7609 }
7610
7611 /* Wait for all TRIM threads to stop. */
7612 vdev_trim_stop_wait(spa, &vd_list);
7613
7614 /* Sync out the TRIM state */
7615 txg_wait_synced(spa->spa_dsl_pool, 0);
7616 mutex_exit(&spa_namespace_lock);
7617
7618 list_destroy(&vd_list);
7619
7620 return (total_errors);
7621}
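/*
 * Illustrative sketch (not part of spa.c): both spa_vdev_initialize() and
 * spa_vdev_trim() report per-vdev failures by keying an error list with the
 * vdev guid rendered as a decimal string ("%llu") and return only the count
 * of failures.  The toy version below uses hypothetical names and a
 * fixed-size array in place of the nvlist, purely to show that aggregation
 * pattern.
 */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define	MAX_ERRS	16

typedef struct guid_err {
	char	ge_key[32];	/* guid printed with "%llu", as in spa.c */
	int	ge_error;
} guid_err_t;

static int
trim_vdevs(const uint64_t *guids, int nguids, guid_err_t *errs, int *nerrs)
{
	int total_errors = 0;

	*nerrs = 0;
	for (int i = 0; i < nguids; i++) {
		/* Pretend every odd guid fails, purely to exercise the path. */
		int error = (guids[i] & 1) ? EBUSY : 0;

		if (error == 0)
			continue;
		total_errors++;
		if (*nerrs < MAX_ERRS) {
			(void) snprintf(errs[*nerrs].ge_key,
			    sizeof (errs[*nerrs].ge_key), "%llu",
			    (unsigned long long)guids[i]);
			errs[*nerrs].ge_error = error;
			(*nerrs)++;
		}
	}
	return (total_errors);
}

int
main(void)
{
	uint64_t guids[] = { 1001, 1002, 1003 };
	guid_err_t errs[MAX_ERRS];
	int nerrs;

	int failed = trim_vdevs(guids, 3, errs, &nerrs);
	for (int i = 0; i < nerrs; i++)
		printf("vdev %s: error %d\n", errs[i].ge_key, errs[i].ge_error);
	printf("total errors: %d\n", failed);
	return (0);
}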
7622
428870ff
BB
7623/*
7624 * Split a set of devices from their mirrors, and create a new pool from them.
7625 */
7626int
a926aab9 7627spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
428870ff
BB
7628 nvlist_t *props, boolean_t exp)
7629{
7630 int error = 0;
7631 uint64_t txg, *glist;
7632 spa_t *newspa;
7633 uint_t c, children, lastlog;
7634 nvlist_t **child, *nvl, *tmp;
7635 dmu_tx_t *tx;
d1807f16 7636 const char *altroot = NULL;
428870ff
BB
7637 vdev_t *rvd, **vml = NULL; /* vdev modify list */
7638 boolean_t activate_slog;
7639
572e2857 7640 ASSERT(spa_writeable(spa));
428870ff
BB
7641
7642 txg = spa_vdev_enter(spa);
7643
d2734cce
SD
7644 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7645 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7646 error = (spa_has_checkpoint(spa)) ?
7647 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7648 return (spa_vdev_exit(spa, NULL, txg, error));
7649 }
7650
428870ff
BB
7651 /* clear the log and flush everything up to now */
7652 activate_slog = spa_passivate_log(spa);
7653 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
a1d477c2 7654 error = spa_reset_logs(spa);
428870ff
BB
7655 txg = spa_vdev_config_enter(spa);
7656
7657 if (activate_slog)
7658 spa_activate_log(spa);
7659
7660 if (error != 0)
7661 return (spa_vdev_exit(spa, NULL, txg, error));
7662
7663 /* check new spa name before going any further */
7664 if (spa_lookup(newname) != NULL)
7665 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
7666
7667 /*
7668 * scan through all the children to ensure they're all mirrors
7669 */
7670 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
7671 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
7672 &children) != 0)
7673 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7674
7675 /* first, check to ensure we've got the right child count */
7676 rvd = spa->spa_root_vdev;
7677 lastlog = 0;
7678 for (c = 0; c < rvd->vdev_children; c++) {
7679 vdev_t *vd = rvd->vdev_child[c];
7680
7681 /* don't count the holes & logs as children */
1b664952
GA
7682 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
7683 !vdev_is_concrete(vd))) {
428870ff
BB
7684 if (lastlog == 0)
7685 lastlog = c;
7686 continue;
7687 }
7688
7689 lastlog = 0;
7690 }
7691 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
7692 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7693
7694 /* next, ensure no spare or cache devices are part of the split */
7695 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
7696 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
7697 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7698
79c76d5b
BB
7699 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
7700 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
428870ff
BB
7701
7702 /* then, loop over each vdev and validate it */
7703 for (c = 0; c < children; c++) {
7704 uint64_t is_hole = 0;
7705
7706 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
7707 &is_hole);
7708
7709 if (is_hole != 0) {
7710 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
7711 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
7712 continue;
7713 } else {
2e528b49 7714 error = SET_ERROR(EINVAL);
428870ff
BB
7715 break;
7716 }
7717 }
7718
1b664952
GA
7719 /* deal with indirect vdevs */
7720 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
7721 &vdev_indirect_ops)
7722 continue;
7723
428870ff
BB
7724 /* which disk is going to be split? */
7725 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
7726 &glist[c]) != 0) {
2e528b49 7727 error = SET_ERROR(EINVAL);
428870ff
BB
7728 break;
7729 }
7730
7731 /* look it up in the spa */
7732 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
7733 if (vml[c] == NULL) {
2e528b49 7734 error = SET_ERROR(ENODEV);
428870ff
BB
7735 break;
7736 }
7737
7738 /* make sure there's nothing stopping the split */
7739 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
7740 vml[c]->vdev_islog ||
a1d477c2 7741 !vdev_is_concrete(vml[c]) ||
428870ff
BB
7742 vml[c]->vdev_isspare ||
7743 vml[c]->vdev_isl2cache ||
7744 !vdev_writeable(vml[c]) ||
7745 vml[c]->vdev_children != 0 ||
7746 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
7747 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
2e528b49 7748 error = SET_ERROR(EINVAL);
428870ff
BB
7749 break;
7750 }
7751
733b5722
RS
7752 if (vdev_dtl_required(vml[c]) ||
7753 vdev_resilver_needed(vml[c], NULL, NULL)) {
2e528b49 7754 error = SET_ERROR(EBUSY);
428870ff
BB
7755 break;
7756 }
7757
7758 /* we need certain info from the top level */
65ad5d11
AJ
7759 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
7760 vml[c]->vdev_top->vdev_ms_array);
7761 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
7762 vml[c]->vdev_top->vdev_ms_shift);
7763 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
7764 vml[c]->vdev_top->vdev_asize);
7765 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
7766 vml[c]->vdev_top->vdev_ashift);
e0ab3ab5
JS
7767
7768 /* transfer per-vdev ZAPs */
7769 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
7770 VERIFY0(nvlist_add_uint64(child[c],
7771 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
7772
7773 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
7774 VERIFY0(nvlist_add_uint64(child[c],
7775 ZPOOL_CONFIG_VDEV_TOP_ZAP,
7776 vml[c]->vdev_parent->vdev_top_zap));
428870ff
BB
7777 }
7778
7779 if (error != 0) {
7780 kmem_free(vml, children * sizeof (vdev_t *));
7781 kmem_free(glist, children * sizeof (uint64_t));
7782 return (spa_vdev_exit(spa, NULL, txg, error));
7783 }
7784
7785 /* stop writers from using the disks */
7786 for (c = 0; c < children; c++) {
7787 if (vml[c] != NULL)
7788 vml[c]->vdev_offline = B_TRUE;
7789 }
7790 vdev_reopen(spa->spa_root_vdev);
34dc7c2f
BB
7791
7792 /*
428870ff
BB
7793 * Temporarily record the splitting vdevs in the spa config. This
7794 * will disappear once the config is regenerated.
34dc7c2f 7795 */
65ad5d11
AJ
7796 nvl = fnvlist_alloc();
7797 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
428870ff 7798 kmem_free(glist, children * sizeof (uint64_t));
34dc7c2f 7799
428870ff 7800 mutex_enter(&spa->spa_props_lock);
65ad5d11 7801 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
428870ff
BB
7802 mutex_exit(&spa->spa_props_lock);
7803 spa->spa_config_splitting = nvl;
7804 vdev_config_dirty(spa->spa_root_vdev);
7805
7806 /* configure and create the new pool */
65ad5d11
AJ
7807 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
7808 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
7809 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
7810 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
7811 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
7812 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
7813 spa_generate_guid(NULL));
e0ab3ab5 7814 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
428870ff
BB
7815 (void) nvlist_lookup_string(props,
7816 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
34dc7c2f 7817
428870ff
BB
7818 /* add the new pool to the namespace */
7819 newspa = spa_add(newname, config, altroot);
e0ab3ab5 7820 newspa->spa_avz_action = AVZ_ACTION_REBUILD;
428870ff
BB
7821 newspa->spa_config_txg = spa->spa_config_txg;
7822 spa_set_log_state(newspa, SPA_LOG_CLEAR);
7823
7824 /* release the spa config lock, retaining the namespace lock */
7825 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
7826
7827 if (zio_injection_enabled)
7828 zio_handle_panic_injection(spa, FTAG, 1);
7829
7830 spa_activate(newspa, spa_mode_global);
7831 spa_async_suspend(newspa);
7832
c10d37dd 7833 /*
1b939560
BB
7834 * Temporarily stop the initializing and TRIM activity. We set the
7835 * state to ACTIVE so that we know to resume initializing or TRIM
7836 * once the split has completed.
c10d37dd 7837 */
1b939560
BB
7838 list_t vd_initialize_list;
7839 list_create(&vd_initialize_list, sizeof (vdev_t),
c10d37dd
GW
7840 offsetof(vdev_t, vdev_initialize_node));
7841
1b939560
BB
7842 list_t vd_trim_list;
7843 list_create(&vd_trim_list, sizeof (vdev_t),
7844 offsetof(vdev_t, vdev_trim_node));
7845
619f0976 7846 for (c = 0; c < children; c++) {
1b664952 7847 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
619f0976 7848 mutex_enter(&vml[c]->vdev_initialize_lock);
1b939560
BB
7849 vdev_initialize_stop(vml[c],
7850 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
619f0976 7851 mutex_exit(&vml[c]->vdev_initialize_lock);
1b939560
BB
7852
7853 mutex_enter(&vml[c]->vdev_trim_lock);
7854 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
7855 mutex_exit(&vml[c]->vdev_trim_lock);
619f0976
GW
7856 }
7857 }
1b939560
BB
7858
7859 vdev_initialize_stop_wait(spa, &vd_initialize_list);
7860 vdev_trim_stop_wait(spa, &vd_trim_list);
7861
7862 list_destroy(&vd_initialize_list);
7863 list_destroy(&vd_trim_list);
619f0976 7864
6cb8e530 7865 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
8b27e08e 7866 newspa->spa_is_splitting = B_TRUE;
6cb8e530 7867
428870ff 7868 /* create the new pool from the disks of the original pool */
6cb8e530 7869 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
428870ff
BB
7870 if (error)
7871 goto out;
7872
7873 /* if that worked, generate a real config for the new pool */
7874 if (newspa->spa_root_vdev != NULL) {
65ad5d11
AJ
7875 newspa->spa_config_splitting = fnvlist_alloc();
7876 fnvlist_add_uint64(newspa->spa_config_splitting,
7877 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
428870ff
BB
7878 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
7879 B_TRUE));
9babb374 7880 }
34dc7c2f 7881
428870ff
BB
7882 /* set the props */
7883 if (props != NULL) {
7884 spa_configfile_set(newspa, props, B_FALSE);
7885 error = spa_prop_set(newspa, props);
7886 if (error)
7887 goto out;
7888 }
34dc7c2f 7889
428870ff
BB
7890 /* flush everything */
7891 txg = spa_vdev_config_enter(newspa);
7892 vdev_config_dirty(newspa->spa_root_vdev);
7893 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
34dc7c2f 7894
428870ff
BB
7895 if (zio_injection_enabled)
7896 zio_handle_panic_injection(spa, FTAG, 2);
34dc7c2f 7897
428870ff 7898 spa_async_resume(newspa);
34dc7c2f 7899
428870ff
BB
7900 /* finally, update the original pool's config */
7901 txg = spa_vdev_config_enter(spa);
7902 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
7903 error = dmu_tx_assign(tx, TXG_WAIT);
7904 if (error != 0)
7905 dmu_tx_abort(tx);
7906 for (c = 0; c < children; c++) {
1b664952 7907 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
234234ca
RS
7908 vdev_t *tvd = vml[c]->vdev_top;
7909
7910 /*
7911 * Need to be sure the detachable VDEV is not
7912 * on any *other* txg's DTL list to prevent it
7913 * from being accessed after it's freed.
7914 */
7915 for (int t = 0; t < TXG_SIZE; t++) {
7916 (void) txg_list_remove_this(
7917 &tvd->vdev_dtl_list, vml[c], t);
7918 }
7919
428870ff
BB
7920 vdev_split(vml[c]);
7921 if (error == 0)
6f1ffb06
MA
7922 spa_history_log_internal(spa, "detach", tx,
7923 "vdev=%s", vml[c]->vdev_path);
e0ab3ab5 7924
428870ff 7925 vdev_free(vml[c]);
34dc7c2f 7926 }
34dc7c2f 7927 }
e0ab3ab5 7928 spa->spa_avz_action = AVZ_ACTION_REBUILD;
428870ff
BB
7929 vdev_config_dirty(spa->spa_root_vdev);
7930 spa->spa_config_splitting = NULL;
7931 nvlist_free(nvl);
7932 if (error == 0)
7933 dmu_tx_commit(tx);
7934 (void) spa_vdev_exit(spa, NULL, txg, 0);
7935
7936 if (zio_injection_enabled)
7937 zio_handle_panic_injection(spa, FTAG, 3);
7938
7939 /* split is complete; log a history record */
6f1ffb06
MA
7940 spa_history_log_internal(newspa, "split", NULL,
7941 "from pool %s", spa_name(spa));
428870ff 7942
8b27e08e 7943 newspa->spa_is_splitting = B_FALSE;
428870ff
BB
7944 kmem_free(vml, children * sizeof (vdev_t *));
7945
7946 /* if we're not going to mount the filesystems in userland, export */
7947 if (exp)
7948 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
7949 B_FALSE, B_FALSE);
7950
7951 return (error);
7952
7953out:
7954 spa_unload(newspa);
7955 spa_deactivate(newspa);
7956 spa_remove(newspa);
7957
7958 txg = spa_vdev_config_enter(spa);
7959
7960 /* re-online all offlined disks */
7961 for (c = 0; c < children; c++) {
7962 if (vml[c] != NULL)
7963 vml[c]->vdev_offline = B_FALSE;
7964 }
619f0976 7965
1b939560 7966 /* restart initializing or trimming disks as necessary */
619f0976 7967 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
1b939560
BB
7968 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
7969 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
619f0976 7970
428870ff
BB
7971 vdev_reopen(spa->spa_root_vdev);
7972
7973 nvlist_free(spa->spa_config_splitting);
7974 spa->spa_config_splitting = NULL;
7975 (void) spa_vdev_exit(spa, NULL, txg, error);
34dc7c2f 7976
428870ff 7977 kmem_free(vml, children * sizeof (vdev_t *));
34dc7c2f
BB
7978 return (error);
7979}
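/*
 * Illustrative sketch (not part of spa.c): the per-disk validation loop in
 * spa_vdev_split_mirror() is essentially one long eligibility predicate.
 * The hypothetical split_candidate_ok() below lists the same conditions over
 * a boiled-down descriptor; the real code checks them against vdev_t fields
 * and reports EINVAL or EBUSY through spa_vdev_exit().
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct split_candidate {
	bool	sc_parent_is_mirror;
	bool	sc_is_log;
	bool	sc_is_concrete;
	bool	sc_is_spare;
	bool	sc_is_l2cache;
	bool	sc_writeable;
	bool	sc_is_leaf;		/* vdev_children == 0 */
	bool	sc_healthy;
	bool	sc_dtl_required;	/* still holds the only copy of data */
	bool	sc_resilver_needed;
} split_candidate_t;

static bool
split_candidate_ok(const split_candidate_t *sc)
{
	return (sc->sc_parent_is_mirror && !sc->sc_is_log &&
	    sc->sc_is_concrete && !sc->sc_is_spare && !sc->sc_is_l2cache &&
	    sc->sc_writeable && sc->sc_is_leaf && sc->sc_healthy &&
	    !sc->sc_dtl_required && !sc->sc_resilver_needed);
}

int
main(void)
{
	split_candidate_t good = {
		.sc_parent_is_mirror = true, .sc_is_concrete = true,
		.sc_writeable = true, .sc_is_leaf = true, .sc_healthy = true,
	};
	split_candidate_t busy = good;

	busy.sc_resilver_needed = true;
	printf("healthy mirror half: %s\n",
	    split_candidate_ok(&good) ? "ok to split" : "rejected");
	printf("resilvering half:    %s\n",
	    split_candidate_ok(&busy) ? "ok to split" : "rejected");
	return (0);
}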
7980
34dc7c2f
BB
7981/*
7982 * Find any device that's done replacing, or a vdev marked 'unspare' that's
d3cc8b15 7983 * currently spared, so we can detach it.
34dc7c2f
BB
7984 */
7985static vdev_t *
7986spa_vdev_resilver_done_hunt(vdev_t *vd)
7987{
7988 vdev_t *newvd, *oldvd;
34dc7c2f 7989
1c27024e 7990 for (int c = 0; c < vd->vdev_children; c++) {
34dc7c2f
BB
7991 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
7992 if (oldvd != NULL)
7993 return (oldvd);
7994 }
7995
7996 /*
572e2857
BB
7997 * Check for a completed replacement. We always consider the first
7998 * vdev in the list to be the oldest vdev, and the last one to be
7999 * the newest (see spa_vdev_attach() for how that works). In
8000 * the case where the newest vdev is faulted, we will not automatically
8001 * remove it after a resilver completes. This is OK as it will require
8002 * user intervention to determine which disk the admin wishes to keep.
34dc7c2f 8003 */
572e2857
BB
8004 if (vd->vdev_ops == &vdev_replacing_ops) {
8005 ASSERT(vd->vdev_children > 1);
8006
8007 newvd = vd->vdev_child[vd->vdev_children - 1];
34dc7c2f 8008 oldvd = vd->vdev_child[0];
34dc7c2f 8009
fb5f0bc8 8010 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
428870ff 8011 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
fb5f0bc8 8012 !vdev_dtl_required(oldvd))
34dc7c2f 8013 return (oldvd);
34dc7c2f
BB
8014 }
8015
8016 /*
8017 * Check for a completed resilver with the 'unspare' flag set.
f65fbee1 8018 * Also potentially update faulted state.
34dc7c2f 8019 */
572e2857
BB
8020 if (vd->vdev_ops == &vdev_spare_ops) {
8021 vdev_t *first = vd->vdev_child[0];
8022 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
8023
8024 if (last->vdev_unspare) {
8025 oldvd = first;
8026 newvd = last;
8027 } else if (first->vdev_unspare) {
8028 oldvd = last;
8029 newvd = first;
8030 } else {
8031 oldvd = NULL;
8032 }
34dc7c2f 8033
572e2857 8034 if (oldvd != NULL &&
fb5f0bc8 8035 vdev_dtl_empty(newvd, DTL_MISSING) &&
428870ff 8036 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
572e2857 8037 !vdev_dtl_required(oldvd))
34dc7c2f 8038 return (oldvd);
572e2857 8039
f65fbee1
JJ
8040 vdev_propagate_state(vd);
8041
572e2857
BB
8042 /*
8043 * If there are more than two spares attached to a disk,
8044 * and those spares are not required, then we want to
8045 * attempt to free them up now so that they can be used
8046 * by other pools. Once we're back down to a single
8047 * disk+spare, we stop removing them.
8048 */
8049 if (vd->vdev_children > 2) {
8050 newvd = vd->vdev_child[1];
8051
8052 if (newvd->vdev_isspare && last->vdev_isspare &&
8053 vdev_dtl_empty(last, DTL_MISSING) &&
8054 vdev_dtl_empty(last, DTL_OUTAGE) &&
8055 !vdev_dtl_required(newvd))
8056 return (newvd);
34dc7c2f 8057 }
34dc7c2f
BB
8058 }
8059
8060 return (NULL);
8061}
8062
8063static void
8064spa_vdev_resilver_done(spa_t *spa)
8065{
fb5f0bc8
BB
8066 vdev_t *vd, *pvd, *ppvd;
8067 uint64_t guid, sguid, pguid, ppguid;
34dc7c2f 8068
fb5f0bc8 8069 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
8070
8071 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
fb5f0bc8
BB
8072 pvd = vd->vdev_parent;
8073 ppvd = pvd->vdev_parent;
34dc7c2f 8074 guid = vd->vdev_guid;
fb5f0bc8
BB
8075 pguid = pvd->vdev_guid;
8076 ppguid = ppvd->vdev_guid;
8077 sguid = 0;
34dc7c2f
BB
8078 /*
8079 * If we have just finished replacing a hot spared device, then
8080 * we need to detach the parent's first child (the original hot
8081 * spare) as well.
8082 */
572e2857
BB
8083 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
8084 ppvd->vdev_children == 2) {
34dc7c2f 8085 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
fb5f0bc8 8086 sguid = ppvd->vdev_child[1]->vdev_guid;
34dc7c2f 8087 }
5d1f7fb6
GW
8088 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
8089
fb5f0bc8
BB
8090 spa_config_exit(spa, SCL_ALL, FTAG);
8091 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
34dc7c2f 8092 return;
fb5f0bc8 8093 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
34dc7c2f 8094 return;
fb5f0bc8 8095 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
8096 }
8097
fb5f0bc8 8098 spa_config_exit(spa, SCL_ALL, FTAG);
9a49d3f3
BB
8099
8100 /*
8101 * If a detach was not performed above replace waiters will not have
8102 * been notified. In which case we must do so now.
8103 */
8104 spa_notify_waiters(spa);
34dc7c2f
BB
8105}
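/*
 * Illustrative sketch (not part of spa.c): the hunt above detaches the old
 * half of a 'replacing' or 'spare' vdev only when the new disk has no
 * missing or outage DTL ranges and the old disk is no longer required.
 * The toy predicate below (hypothetical names) restates that condition.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct toy_dtl {
	bool	dtl_missing_empty;	/* vdev_dtl_empty(vd, DTL_MISSING) */
	bool	dtl_outage_empty;	/* vdev_dtl_empty(vd, DTL_OUTAGE) */
	bool	dtl_required;		/* vdev_dtl_required(vd) */
} toy_dtl_t;

static bool
safe_to_detach_old(const toy_dtl_t *newvd, const toy_dtl_t *oldvd)
{
	return (newvd->dtl_missing_empty && newvd->dtl_outage_empty &&
	    !oldvd->dtl_required);
}

int
main(void)
{
	toy_dtl_t newvd = { .dtl_missing_empty = true, .dtl_outage_empty = true };
	toy_dtl_t oldvd = { .dtl_required = false };

	printf("detach old disk: %s\n",
	    safe_to_detach_old(&newvd, &oldvd) ? "yes" : "not yet");
	return (0);
}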
8106
8107/*
428870ff 8108 * Update the stored path or FRU for this vdev.
34dc7c2f 8109 */
65c7cc49 8110static int
9babb374
BB
8111spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
8112 boolean_t ispath)
34dc7c2f 8113{
b128c09f 8114 vdev_t *vd;
428870ff 8115 boolean_t sync = B_FALSE;
34dc7c2f 8116
572e2857
BB
8117 ASSERT(spa_writeable(spa));
8118
428870ff 8119 spa_vdev_state_enter(spa, SCL_ALL);
34dc7c2f 8120
9babb374 8121 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
428870ff 8122 return (spa_vdev_state_exit(spa, NULL, ENOENT));
34dc7c2f
BB
8123
8124 if (!vd->vdev_ops->vdev_op_leaf)
428870ff 8125 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
34dc7c2f 8126
9babb374 8127 if (ispath) {
428870ff
BB
8128 if (strcmp(value, vd->vdev_path) != 0) {
8129 spa_strfree(vd->vdev_path);
8130 vd->vdev_path = spa_strdup(value);
8131 sync = B_TRUE;
8132 }
9babb374 8133 } else {
428870ff
BB
8134 if (vd->vdev_fru == NULL) {
8135 vd->vdev_fru = spa_strdup(value);
8136 sync = B_TRUE;
8137 } else if (strcmp(value, vd->vdev_fru) != 0) {
9babb374 8138 spa_strfree(vd->vdev_fru);
428870ff
BB
8139 vd->vdev_fru = spa_strdup(value);
8140 sync = B_TRUE;
8141 }
9babb374 8142 }
34dc7c2f 8143
428870ff 8144 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
34dc7c2f
BB
8145}
8146
9babb374
BB
8147int
8148spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
8149{
8150 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
8151}
8152
8153int
8154spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
8155{
8156 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
8157}
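/*
 * Illustrative sketch (not part of spa.c): spa_vdev_set_common() only marks
 * the vdev for a config sync when the stored string actually changes.  The
 * hypothetical set_string_prop() below shows that update-if-changed idiom,
 * with plain libc allocation standing in for spa_strdup()/spa_strfree().
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns true when the caller should sync out the new value. */
static bool
set_string_prop(char **slot, const char *value)
{
	if (*slot != NULL && strcmp(*slot, value) == 0)
		return (false);		/* unchanged: nothing to sync */
	free(*slot);
	*slot = strdup(value);
	return (true);
}

int
main(void)
{
	char *fru = NULL;

	printf("first set : sync=%d\n", set_string_prop(&fru, "chassis-3/bay-7"));
	printf("same value: sync=%d\n", set_string_prop(&fru, "chassis-3/bay-7"));
	printf("new value : sync=%d\n", set_string_prop(&fru, "chassis-3/bay-9"));
	free(fru);
	return (0);
}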
8158
34dc7c2f
BB
8159/*
8160 * ==========================================================================
428870ff 8161 * SPA Scanning
34dc7c2f
BB
8162 * ==========================================================================
8163 */
0ea05c64
AP
8164int
8165spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
8166{
8167 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
8168
8169 if (dsl_scan_resilvering(spa->spa_dsl_pool))
8170 return (SET_ERROR(EBUSY));
8171
8172 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
8173}
34dc7c2f 8174
34dc7c2f 8175int
428870ff
BB
8176spa_scan_stop(spa_t *spa)
8177{
8178 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
8179 if (dsl_scan_resilvering(spa->spa_dsl_pool))
2e528b49 8180 return (SET_ERROR(EBUSY));
482eeef8 8181
428870ff
BB
8182 return (dsl_scan_cancel(spa->spa_dsl_pool));
8183}
8184
8185int
8186spa_scan(spa_t *spa, pool_scan_func_t func)
34dc7c2f 8187{
b128c09f 8188 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
34dc7c2f 8189
428870ff 8190 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
2e528b49 8191 return (SET_ERROR(ENOTSUP));
34dc7c2f 8192
fa241660
TC
8193 if (func == POOL_SCAN_RESILVER &&
8194 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
8195 return (SET_ERROR(ENOTSUP));
8196
34dc7c2f 8197 /*
b128c09f
BB
8198 * If a resilver was requested, but there is no DTL on a
8199 * writeable leaf device, we have nothing to do.
34dc7c2f 8200 */
428870ff 8201 if (func == POOL_SCAN_RESILVER &&
b128c09f
BB
8202 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
8203 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
34dc7c2f
BB
8204 return (0);
8205 }
8206
482eeef8
GA
8207 if (func == POOL_SCAN_ERRORSCRUB &&
8208 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
8209 return (SET_ERROR(ENOTSUP));
8210
428870ff 8211 return (dsl_scan(spa->spa_dsl_pool, func));
34dc7c2f
BB
8212}
8213
8214/*
8215 * ==========================================================================
8216 * SPA async task processing
8217 * ==========================================================================
8218 */
8219
8220static void
8221spa_async_remove(spa_t *spa, vdev_t *vd)
8222{
b128c09f 8223 if (vd->vdev_remove_wanted) {
428870ff
BB
8224 vd->vdev_remove_wanted = B_FALSE;
8225 vd->vdev_delayed_close = B_FALSE;
b128c09f 8226 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
428870ff
BB
8227
8228 /*
8229 * We want to clear the stats, but we don't want to do a full
8230 * vdev_clear() as that will cause us to throw away
8231 * degraded/faulted state as well as attempt to reopen the
8232 * device, all of which is a waste.
8233 */
8234 vd->vdev_stat.vs_read_errors = 0;
8235 vd->vdev_stat.vs_write_errors = 0;
8236 vd->vdev_stat.vs_checksum_errors = 0;
8237
b128c09f 8238 vdev_state_dirty(vd->vdev_top);
0aacde2e
RM
8239
8240 /* Tell userspace that the vdev is gone. */
8241 zfs_post_remove(spa, vd);
b128c09f 8242 }
34dc7c2f 8243
1c27024e 8244 for (int c = 0; c < vd->vdev_children; c++)
b128c09f
BB
8245 spa_async_remove(spa, vd->vdev_child[c]);
8246}
8247
8248static void
8249spa_async_probe(spa_t *spa, vdev_t *vd)
8250{
8251 if (vd->vdev_probe_wanted) {
428870ff 8252 vd->vdev_probe_wanted = B_FALSE;
b128c09f 8253 vdev_reopen(vd); /* vdev_open() does the actual probe */
34dc7c2f 8254 }
b128c09f 8255
1c27024e 8256 for (int c = 0; c < vd->vdev_children; c++)
b128c09f 8257 spa_async_probe(spa, vd->vdev_child[c]);
34dc7c2f
BB
8258}
8259
9babb374
BB
8260static void
8261spa_async_autoexpand(spa_t *spa, vdev_t *vd)
8262{
9babb374
BB
8263 if (!spa->spa_autoexpand)
8264 return;
8265
1c27024e 8266 for (int c = 0; c < vd->vdev_children; c++) {
9babb374
BB
8267 vdev_t *cvd = vd->vdev_child[c];
8268 spa_async_autoexpand(spa, cvd);
8269 }
8270
8271 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
8272 return;
8273
12fa0466 8274 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
9babb374
BB
8275}
8276
460748d4 8277static __attribute__((noreturn)) void
c25b8f99 8278spa_async_thread(void *arg)
34dc7c2f 8279{
c25b8f99 8280 spa_t *spa = (spa_t *)arg;
80a91e74 8281 dsl_pool_t *dp = spa->spa_dsl_pool;
867959b5 8282 int tasks;
34dc7c2f
BB
8283
8284 ASSERT(spa->spa_sync_on);
8285
8286 mutex_enter(&spa->spa_async_lock);
8287 tasks = spa->spa_async_tasks;
8288 spa->spa_async_tasks = 0;
8289 mutex_exit(&spa->spa_async_lock);
8290
8291 /*
8292 * See if the config needs to be updated.
8293 */
8294 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
428870ff 8295 uint64_t old_space, new_space;
9babb374 8296
34dc7c2f 8297 mutex_enter(&spa_namespace_lock);
428870ff 8298 old_space = metaslab_class_get_space(spa_normal_class(spa));
cc99f275
DB
8299 old_space += metaslab_class_get_space(spa_special_class(spa));
8300 old_space += metaslab_class_get_space(spa_dedup_class(spa));
aa755b35
MA
8301 old_space += metaslab_class_get_space(
8302 spa_embedded_log_class(spa));
cc99f275 8303
34dc7c2f 8304 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
cc99f275 8305
428870ff 8306 new_space = metaslab_class_get_space(spa_normal_class(spa));
cc99f275
DB
8307 new_space += metaslab_class_get_space(spa_special_class(spa));
8308 new_space += metaslab_class_get_space(spa_dedup_class(spa));
aa755b35
MA
8309 new_space += metaslab_class_get_space(
8310 spa_embedded_log_class(spa));
34dc7c2f 8311 mutex_exit(&spa_namespace_lock);
9babb374
BB
8312
8313 /*
8314 * If the pool grew as a result of the config update,
8315 * then log an internal history event.
8316 */
428870ff 8317 if (new_space != old_space) {
6f1ffb06 8318 spa_history_log_internal(spa, "vdev online", NULL,
45d1cae3 8319 "pool '%s' size: %llu(+%llu)",
74756182
MM
8320 spa_name(spa), (u_longlong_t)new_space,
8321 (u_longlong_t)(new_space - old_space));
9babb374 8322 }
34dc7c2f
BB
8323 }
8324
8325 /*
8326 * See if any devices need to be marked REMOVED.
34dc7c2f 8327 */
b128c09f 8328 if (tasks & SPA_ASYNC_REMOVE) {
428870ff 8329 spa_vdev_state_enter(spa, SCL_NONE);
34dc7c2f 8330 spa_async_remove(spa, spa->spa_root_vdev);
867959b5 8331 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
b128c09f 8332 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
867959b5 8333 for (int i = 0; i < spa->spa_spares.sav_count; i++)
b128c09f
BB
8334 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
8335 (void) spa_vdev_state_exit(spa, NULL, 0);
34dc7c2f
BB
8336 }
8337
9babb374
BB
8338 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
8339 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8340 spa_async_autoexpand(spa, spa->spa_root_vdev);
8341 spa_config_exit(spa, SCL_CONFIG, FTAG);
8342 }
8343
34dc7c2f 8344 /*
b128c09f 8345 * See if any devices need to be probed.
34dc7c2f 8346 */
b128c09f 8347 if (tasks & SPA_ASYNC_PROBE) {
428870ff 8348 spa_vdev_state_enter(spa, SCL_NONE);
b128c09f
BB
8349 spa_async_probe(spa, spa->spa_root_vdev);
8350 (void) spa_vdev_state_exit(spa, NULL, 0);
8351 }
34dc7c2f
BB
8352
8353 /*
b128c09f 8354 * If any devices are done replacing, detach them.
34dc7c2f 8355 */
b2255edc 8356 if (tasks & SPA_ASYNC_RESILVER_DONE ||
719534ca
AH
8357 tasks & SPA_ASYNC_REBUILD_DONE ||
8358 tasks & SPA_ASYNC_DETACH_SPARE) {
b128c09f 8359 spa_vdev_resilver_done(spa);
9a49d3f3
BB
8360 }
8361
34dc7c2f
BB
8362 /*
8363 * Kick off a resilver.
8364 */
80a91e74 8365 if (tasks & SPA_ASYNC_RESILVER &&
9a49d3f3 8366 !vdev_rebuild_active(spa->spa_root_vdev) &&
80a91e74
TC
8367 (!dsl_scan_resilvering(dp) ||
8368 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
3c819a2c 8369 dsl_scan_restart_resilver(dp, 0);
34dc7c2f 8370
619f0976
GW
8371 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
8372 mutex_enter(&spa_namespace_lock);
8373 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8374 vdev_initialize_restart(spa->spa_root_vdev);
8375 spa_config_exit(spa, SCL_CONFIG, FTAG);
8376 mutex_exit(&spa_namespace_lock);
8377 }
8378
1b939560
BB
8379 if (tasks & SPA_ASYNC_TRIM_RESTART) {
8380 mutex_enter(&spa_namespace_lock);
8381 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8382 vdev_trim_restart(spa->spa_root_vdev);
8383 spa_config_exit(spa, SCL_CONFIG, FTAG);
8384 mutex_exit(&spa_namespace_lock);
8385 }
8386
8387 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
8388 mutex_enter(&spa_namespace_lock);
8389 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8390 vdev_autotrim_restart(spa);
8391 spa_config_exit(spa, SCL_CONFIG, FTAG);
8392 mutex_exit(&spa_namespace_lock);
8393 }
8394
b7654bd7
GA
8395 /*
8396 * Kick off L2 cache whole device TRIM.
8397 */
8398 if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
8399 mutex_enter(&spa_namespace_lock);
8400 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
8401 vdev_trim_l2arc(spa);
8402 spa_config_exit(spa, SCL_CONFIG, FTAG);
8403 mutex_exit(&spa_namespace_lock);
8404 }
8405
77f6826b
GA
8406 /*
8407 * Kick off L2 cache rebuilding.
8408 */
8409 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
8410 mutex_enter(&spa_namespace_lock);
8411 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
8412 l2arc_spa_rebuild_start(spa);
8413 spa_config_exit(spa, SCL_L2ARC, FTAG);
8414 mutex_exit(&spa_namespace_lock);
8415 }
8416
34dc7c2f
BB
8417 /*
8418 * Let the world know that we're done.
8419 */
8420 mutex_enter(&spa->spa_async_lock);
8421 spa->spa_async_thread = NULL;
8422 cv_broadcast(&spa->spa_async_cv);
8423 mutex_exit(&spa->spa_async_lock);
8424 thread_exit();
8425}
8426
8427void
8428spa_async_suspend(spa_t *spa)
8429{
8430 mutex_enter(&spa->spa_async_lock);
8431 spa->spa_async_suspended++;
9d5b5245 8432 while (spa->spa_async_thread != NULL)
34dc7c2f
BB
8433 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
8434 mutex_exit(&spa->spa_async_lock);
a1d477c2
MA
8435
8436 spa_vdev_remove_suspend(spa);
9d5b5245
SD
8437
8438 zthr_t *condense_thread = spa->spa_condense_zthr;
61c3391a
SD
8439 if (condense_thread != NULL)
8440 zthr_cancel(condense_thread);
d2734cce
SD
8441
8442 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
61c3391a
SD
8443 if (discard_thread != NULL)
8444 zthr_cancel(discard_thread);
37f03da8
SH
8445
8446 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
8447 if (ll_delete_thread != NULL)
8448 zthr_cancel(ll_delete_thread);
8449
8450 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
8451 if (ll_condense_thread != NULL)
8452 zthr_cancel(ll_condense_thread);
34dc7c2f
BB
8453}
8454
8455void
8456spa_async_resume(spa_t *spa)
8457{
8458 mutex_enter(&spa->spa_async_lock);
8459 ASSERT(spa->spa_async_suspended != 0);
8460 spa->spa_async_suspended--;
8461 mutex_exit(&spa->spa_async_lock);
a1d477c2 8462 spa_restart_removal(spa);
9d5b5245
SD
8463
8464 zthr_t *condense_thread = spa->spa_condense_zthr;
61c3391a 8465 if (condense_thread != NULL)
9d5b5245 8466 zthr_resume(condense_thread);
d2734cce
SD
8467
8468 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
61c3391a 8469 if (discard_thread != NULL)
d2734cce 8470 zthr_resume(discard_thread);
37f03da8
SH
8471
8472 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
8473 if (ll_delete_thread != NULL)
8474 zthr_resume(ll_delete_thread);
8475
8476 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
8477 if (ll_condense_thread != NULL)
8478 zthr_resume(ll_condense_thread);
34dc7c2f
BB
8479}
8480
e6cfd633
WA
8481static boolean_t
8482spa_async_tasks_pending(spa_t *spa)
8483{
8484 uint_t non_config_tasks;
8485 uint_t config_task;
8486 boolean_t config_task_suspended;
8487
8488 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
8489 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
8490 if (spa->spa_ccw_fail_time == 0) {
8491 config_task_suspended = B_FALSE;
8492 } else {
8493 config_task_suspended =
8494 (gethrtime() - spa->spa_ccw_fail_time) <
05852b34 8495 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
e6cfd633
WA
8496 }
8497
8498 return (non_config_tasks || (config_task && !config_task_suspended));
8499}
8500
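/*
 * Illustrative sketch, not part of the original file: the back-off test
 * above defers retrying a failed config (cache file) write until
 * zfs_ccw_retry_interval seconds have elapsed since the recorded failure
 * time.  The helper below restates that test with hypothetical arguments.
 */
static boolean_t
example_ccw_retry_due(hrtime_t ccw_fail_time, hrtime_t retry_interval_sec)
{
	return (ccw_fail_time == 0 ||
	    (gethrtime() - ccw_fail_time) >= retry_interval_sec * NANOSEC);
}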
34dc7c2f
BB
8501static void
8502spa_async_dispatch(spa_t *spa)
8503{
8504 mutex_enter(&spa->spa_async_lock);
e6cfd633
WA
8505 if (spa_async_tasks_pending(spa) &&
8506 !spa->spa_async_suspended &&
da92d5cb 8507 spa->spa_async_thread == NULL)
34dc7c2f
BB
8508 spa->spa_async_thread = thread_create(NULL, 0,
8509 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
8510 mutex_exit(&spa->spa_async_lock);
8511}
8512
8513void
8514spa_async_request(spa_t *spa, int task)
8515{
428870ff 8516 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
34dc7c2f
BB
8517 mutex_enter(&spa->spa_async_lock);
8518 spa->spa_async_tasks |= task;
8519 mutex_exit(&spa->spa_async_lock);
8520}
8521
3c819a2c
JP
8522int
8523spa_async_tasks(spa_t *spa)
8524{
8525 return (spa->spa_async_tasks);
8526}
8527
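/*
 * Illustrative sketch, not part of the original file: spa_async_request()
 * only records a pending task bit under spa_async_lock.  The work itself
 * runs later in spa_async_thread(), once spa_async_dispatch() is called
 * (for example at the end of spa_sync()).
 */
static void
example_request_config_update(spa_t *spa)
{
	/* Cheap and non-blocking: just marks the task as pending. */
	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

	/*
	 * Nothing has been rewritten on disk yet; the config update
	 * happens the next time the async thread is dispatched.
	 */
}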
34dc7c2f
BB
8528/*
8529 * ==========================================================================
8530 * SPA syncing routines
8531 * ==========================================================================
8532 */
8533
37f03da8 8534
428870ff 8535static int
37f03da8
SH
8536bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
8537 dmu_tx_t *tx)
34dc7c2f 8538{
428870ff 8539 bpobj_t *bpo = arg;
37f03da8 8540 bpobj_enqueue(bpo, bp, bp_freed, tx);
428870ff
BB
8541 return (0);
8542}
34dc7c2f 8543
37f03da8
SH
8544int
8545bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
8546{
8547 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
8548}
8549
8550int
8551bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
8552{
8553 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
8554}
8555
428870ff
BB
8556static int
8557spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
8558{
9cdf7b1f 8559 zio_t *pio = arg;
34dc7c2f 8560
9cdf7b1f
MA
8561 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
8562 pio->io_flags));
428870ff 8563 return (0);
34dc7c2f
BB
8564}
8565
37f03da8
SH
8566static int
8567bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
8568 dmu_tx_t *tx)
8569{
8570 ASSERT(!bp_freed);
8571 return (spa_free_sync_cb(arg, bp, tx));
8572}
8573
e8b96c60
MA
8574/*
8575 * Note: this simple function is not inlined to make it easier to dtrace the
8576 * amount of time spent syncing frees.
8577 */
8578static void
8579spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
8580{
8581 zio_t *zio = zio_root(spa, NULL, NULL, 0);
8582 bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
8583 VERIFY(zio_wait(zio) == 0);
8584}
8585
8586/*
8587 * Note: this simple function is not inlined to make it easier to dtrace the
8588 * amount of time spent syncing deferred frees.
8589 */
8590static void
8591spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
8592{
8dc2197b
SD
8593 if (spa_sync_pass(spa) != 1)
8594 return;
8595
93e28d66
SD
8596 /*
8597 * Note:
8598 * If the log space map feature is active, we stop deferring
8599 * frees to the next TXG and therefore running this function
8600 * would be considered a no-op as spa_deferred_bpobj should
8601 * not have any entries.
8602 *
8603 * That said we run this function anyway (instead of returning
8604 * immediately) for the edge-case scenario where we just
8605 * activated the log space map feature in this TXG but we have
8606 * deferred frees from the previous TXG.
8607 */
e8b96c60
MA
8608 zio_t *zio = zio_root(spa, NULL, NULL, 0);
8609 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
37f03da8 8610 bpobj_spa_free_sync_cb, zio, tx), ==, 0);
e8b96c60
MA
8611 VERIFY0(zio_wait(zio));
8612}
8613
34dc7c2f
BB
8614static void
8615spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
8616{
8617 char *packed = NULL;
b128c09f 8618 size_t bufsize;
34dc7c2f
BB
8619 size_t nvsize = 0;
8620 dmu_buf_t *db;
8621
8622 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
8623
b128c09f
BB
8624 /*
8625 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
b0bc7a84 8626 * information. This avoids the dmu_buf_will_dirty() path and
b128c09f
BB
8627 * saves us a pre-read to get data we don't actually care about.
8628 */
9ae529ec 8629 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
79c76d5b 8630 packed = vmem_alloc(bufsize, KM_SLEEP);
34dc7c2f
BB
8631
8632 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
79c76d5b 8633 KM_SLEEP) == 0);
861166b0 8634 memset(packed + nvsize, 0, bufsize - nvsize);
34dc7c2f 8635
b128c09f 8636 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
34dc7c2f 8637
00b46022 8638 vmem_free(packed, bufsize);
34dc7c2f
BB
8639
8640 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
8641 dmu_buf_will_dirty(db, tx);
8642 *(uint64_t *)db->db_data = nvsize;
8643 dmu_buf_rele(db, FTAG);
8644}
8645
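/*
 * Illustrative sketch, not part of the original file: the bufsize
 * computation above rounds the packed nvlist up to whole
 * SPA_CONFIG_BLOCKSIZE blocks so dmu_write() always covers full blocks
 * and never has to pre-read.  Assuming a 16K SPA_CONFIG_BLOCKSIZE, a
 * 5000-byte packed nvlist is written as one 16384-byte, zero-padded
 * buffer.
 */
static size_t
example_config_bufsize(size_t nvsize)
{
	/* P2ROUNDUP() rounds up to the next multiple of a power of two. */
	return (P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE));
}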
8646static void
8647spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
8648 const char *config, const char *entry)
8649{
8650 nvlist_t *nvroot;
8651 nvlist_t **list;
8652 int i;
8653
8654 if (!sav->sav_sync)
8655 return;
8656
8657 /*
8658 * Update the MOS nvlist describing the list of available devices.
8659 * spa_validate_aux() will have already made sure this nvlist is
8660 * valid and the vdevs are labeled appropriately.
8661 */
8662 if (sav->sav_object == 0) {
8663 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
8664 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
8665 sizeof (uint64_t), tx);
8666 VERIFY(zap_update(spa->spa_meta_objset,
8667 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
8668 &sav->sav_object, tx) == 0);
8669 }
8670
65ad5d11 8671 nvroot = fnvlist_alloc();
34dc7c2f 8672 if (sav->sav_count == 0) {
795075e6
PD
8673 fnvlist_add_nvlist_array(nvroot, config,
8674 (const nvlist_t * const *)NULL, 0);
34dc7c2f 8675 } else {
79c76d5b 8676 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
34dc7c2f
BB
8677 for (i = 0; i < sav->sav_count; i++)
8678 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
428870ff 8679 B_FALSE, VDEV_CONFIG_L2CACHE);
795075e6
PD
8680 fnvlist_add_nvlist_array(nvroot, config,
8681 (const nvlist_t * const *)list, sav->sav_count);
34dc7c2f
BB
8682 for (i = 0; i < sav->sav_count; i++)
8683 nvlist_free(list[i]);
8684 kmem_free(list, sav->sav_count * sizeof (void *));
8685 }
8686
8687 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
8688 nvlist_free(nvroot);
8689
8690 sav->sav_sync = B_FALSE;
8691}
8692
e0ab3ab5
JS
8693/*
8694 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
8695 * The all-vdev ZAP must be empty.
8696 */
8697static void
8698spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
8699{
8700 spa_t *spa = vd->vdev_spa;
e0ab3ab5 8701
3e4ed421
RW
8702 if (vd->vdev_root_zap != 0 &&
8703 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
8704 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
8705 vd->vdev_root_zap, tx));
8706 }
e0ab3ab5
JS
8707 if (vd->vdev_top_zap != 0) {
8708 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
8709 vd->vdev_top_zap, tx));
8710 }
8711 if (vd->vdev_leaf_zap != 0) {
8712 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
8713 vd->vdev_leaf_zap, tx));
8714 }
1c27024e 8715 for (uint64_t i = 0; i < vd->vdev_children; i++) {
e0ab3ab5
JS
8716 spa_avz_build(vd->vdev_child[i], avz, tx);
8717 }
8718}
8719
34dc7c2f
BB
8720static void
8721spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
8722{
8723 nvlist_t *config;
8724
e0ab3ab5
JS
8725 /*
8726 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
8727 * its config may not be dirty but we still need to build per-vdev ZAPs.
8728 * Similarly, if the pool is being assembled (e.g. after a split), we
8729 * need to rebuild the AVZ although the config may not be dirty.
8730 */
8731 if (list_is_empty(&spa->spa_config_dirty_list) &&
8732 spa->spa_avz_action == AVZ_ACTION_NONE)
34dc7c2f
BB
8733 return;
8734
b128c09f
BB
8735 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8736
e0ab3ab5 8737 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
38640550 8738 spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
e0ab3ab5
JS
8739 spa->spa_all_vdev_zaps != 0);
8740
8741 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
e0ab3ab5
JS
8742 /* Make and build the new AVZ */
8743 uint64_t new_avz = zap_create(spa->spa_meta_objset,
8744 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
8745 spa_avz_build(spa->spa_root_vdev, new_avz, tx);
8746
8747 /* Diff old AVZ with new one */
1c27024e
DB
8748 zap_cursor_t zc;
8749 zap_attribute_t za;
8750
e0ab3ab5
JS
8751 for (zap_cursor_init(&zc, spa->spa_meta_objset,
8752 spa->spa_all_vdev_zaps);
8753 zap_cursor_retrieve(&zc, &za) == 0;
8754 zap_cursor_advance(&zc)) {
8755 uint64_t vdzap = za.za_first_integer;
8756 if (zap_lookup_int(spa->spa_meta_objset, new_avz,
8757 vdzap) == ENOENT) {
8758 /*
8759 * ZAP is listed in old AVZ but not in new one;
8760 * destroy it
8761 */
8762 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
8763 tx));
8764 }
8765 }
8766
8767 zap_cursor_fini(&zc);
8768
8769 /* Destroy the old AVZ */
8770 VERIFY0(zap_destroy(spa->spa_meta_objset,
8771 spa->spa_all_vdev_zaps, tx));
8772
8773 /* Replace the old AVZ in the dir obj with the new one */
8774 VERIFY0(zap_update(spa->spa_meta_objset,
8775 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
8776 sizeof (new_avz), 1, &new_avz, tx));
8777
8778 spa->spa_all_vdev_zaps = new_avz;
8779 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
8780 zap_cursor_t zc;
8781 zap_attribute_t za;
8782
8783 /* Walk through the AVZ and destroy all listed ZAPs */
8784 for (zap_cursor_init(&zc, spa->spa_meta_objset,
8785 spa->spa_all_vdev_zaps);
8786 zap_cursor_retrieve(&zc, &za) == 0;
8787 zap_cursor_advance(&zc)) {
8788 uint64_t zap = za.za_first_integer;
8789 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
8790 }
8791
8792 zap_cursor_fini(&zc);
8793
8794 /* Destroy and unlink the AVZ itself */
8795 VERIFY0(zap_destroy(spa->spa_meta_objset,
8796 spa->spa_all_vdev_zaps, tx));
8797 VERIFY0(zap_remove(spa->spa_meta_objset,
8798 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
8799 spa->spa_all_vdev_zaps = 0;
8800 }
8801
8802 if (spa->spa_all_vdev_zaps == 0) {
8803 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
8804 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
8805 DMU_POOL_VDEV_ZAP_MAP, tx);
8806 }
8807 spa->spa_avz_action = AVZ_ACTION_NONE;
8808
8809 /* Create ZAPs for vdevs that don't have them. */
8810 vdev_construct_zaps(spa->spa_root_vdev, tx);
8811
b128c09f
BB
8812 config = spa_config_generate(spa, spa->spa_root_vdev,
8813 dmu_tx_get_txg(tx), B_FALSE);
8814
ea0b2538
GW
8815 /*
8816 * If we're upgrading the spa version then make sure that
8817 * the config object gets updated with the correct version.
8818 */
8819 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
8820 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
8821 spa->spa_uberblock.ub_version);
8822
b128c09f 8823 spa_config_exit(spa, SCL_STATE, FTAG);
34dc7c2f 8824
8a5fc748 8825 nvlist_free(spa->spa_config_syncing);
34dc7c2f
BB
8826 spa->spa_config_syncing = config;
8827
8828 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
8829}
8830
9ae529ec 8831static void
13fe0198 8832spa_sync_version(void *arg, dmu_tx_t *tx)
9ae529ec 8833{
13fe0198
MA
8834 uint64_t *versionp = arg;
8835 uint64_t version = *versionp;
8836 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
9ae529ec
CS
8837
8838 /*
8839 * Setting the version is special cased when first creating the pool.
8840 */
8841 ASSERT(tx->tx_txg != TXG_INITIAL);
8842
8dca0a9a 8843 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
9ae529ec
CS
8844 ASSERT(version >= spa_version(spa));
8845
8846 spa->spa_uberblock.ub_version = version;
8847 vdev_config_dirty(spa->spa_root_vdev);
74756182
MM
8848 spa_history_log_internal(spa, "set", tx, "version=%lld",
8849 (longlong_t)version);
9ae529ec
CS
8850}
8851
34dc7c2f
BB
8852/*
8853 * Set zpool properties.
8854 */
8855static void
13fe0198 8856spa_sync_props(void *arg, dmu_tx_t *tx)
34dc7c2f 8857{
13fe0198
MA
8858 nvlist_t *nvp = arg;
8859 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
34dc7c2f 8860 objset_t *mos = spa->spa_meta_objset;
9ae529ec 8861 nvpair_t *elem = NULL;
b128c09f
BB
8862
8863 mutex_enter(&spa->spa_props_lock);
34dc7c2f 8864
34dc7c2f 8865 while ((elem = nvlist_next_nvpair(nvp, elem))) {
9ae529ec 8866 uint64_t intval;
d1807f16 8867 const char *strval, *fname;
9ae529ec
CS
8868 zpool_prop_t prop;
8869 const char *propname;
8eae2d21 8870 const char *elemname = nvpair_name(elem);
9ae529ec 8871 zprop_type_t proptype;
fa86b5db 8872 spa_feature_t fid;
9ae529ec 8873
8eae2d21 8874 switch (prop = zpool_name_to_prop(elemname)) {
34dc7c2f 8875 case ZPOOL_PROP_VERSION:
93cf2076 8876 intval = fnvpair_value_uint64(elem);
34dc7c2f 8877 /*
4e33ba4c 8878 * The version is synced separately before other
9ae529ec 8879 * properties and should be correct by now.
34dc7c2f 8880 */
9ae529ec 8881 ASSERT3U(spa_version(spa), >=, intval);
34dc7c2f
BB
8882 break;
8883
8884 case ZPOOL_PROP_ALTROOT:
8885 /*
8886 * 'altroot' is a non-persistent property. It should
8887 * have been set temporarily at creation or import time.
8888 */
8889 ASSERT(spa->spa_root != NULL);
8890 break;
8891
572e2857 8892 case ZPOOL_PROP_READONLY:
34dc7c2f
BB
8893 case ZPOOL_PROP_CACHEFILE:
8894 /*
e1cfd73f 8895 * 'readonly' and 'cachefile' are also non-persistent
572e2857 8896 * properties.
34dc7c2f 8897 */
34dc7c2f 8898 break;
d96eb2b1 8899 case ZPOOL_PROP_COMMENT:
93cf2076 8900 strval = fnvpair_value_string(elem);
d96eb2b1
DM
8901 if (spa->spa_comment != NULL)
8902 spa_strfree(spa->spa_comment);
8903 spa->spa_comment = spa_strdup(strval);
8904 /*
8905 * We need to dirty the configuration on all the vdevs
88a48330
BB
8906 * so that their labels get updated. We also need to
8907 * update the cache file to keep it in sync with the
8908 * MOS version. It's unnecessary to do this for pool
8909 * creation since the vdev's configuration has already
8910 * been dirtied.
d96eb2b1 8911 */
88a48330 8912 if (tx->tx_txg != TXG_INITIAL) {
d96eb2b1 8913 vdev_config_dirty(spa->spa_root_vdev);
88a48330
BB
8914 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
8915 }
6f1ffb06 8916 spa_history_log_internal(spa, "set", tx,
8eae2d21 8917 "%s=%s", elemname, strval);
d96eb2b1 8918 break;
658fb802
CB
8919 case ZPOOL_PROP_COMPATIBILITY:
8920 strval = fnvpair_value_string(elem);
8921 if (spa->spa_compatibility != NULL)
8922 spa_strfree(spa->spa_compatibility);
8923 spa->spa_compatibility = spa_strdup(strval);
8924 /*
8925 * Dirty the configuration on vdevs as above.
8926 */
88a48330 8927 if (tx->tx_txg != TXG_INITIAL) {
658fb802 8928 vdev_config_dirty(spa->spa_root_vdev);
88a48330
BB
8929 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
8930 }
8931
658fb802
CB
8932 spa_history_log_internal(spa, "set", tx,
8933 "%s=%s", nvpair_name(elem), strval);
8934 break;
8935
8eae2d21
AJ
8936 case ZPOOL_PROP_INVAL:
8937 if (zpool_prop_feature(elemname)) {
8938 fname = strchr(elemname, '@') + 1;
8939 VERIFY0(zfeature_lookup_name(fname, &fid));
8940
8941 spa_feature_enable(spa, fid, tx);
8942 spa_history_log_internal(spa, "set", tx,
8943 "%s=enabled", elemname);
8944 break;
8945 } else if (!zfs_prop_user(elemname)) {
8946 ASSERT(zpool_prop_feature(elemname));
8947 break;
8948 }
8949 zfs_fallthrough;
34dc7c2f
BB
8950 default:
8951 /*
8952 * Set pool property values in the poolprops mos object.
8953 */
34dc7c2f 8954 if (spa->spa_pool_props_object == 0) {
9ae529ec
CS
8955 spa->spa_pool_props_object =
8956 zap_create_link(mos, DMU_OT_POOL_PROPS,
34dc7c2f 8957 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
9ae529ec 8958 tx);
34dc7c2f 8959 }
34dc7c2f
BB
8960
8961 /* normalize the property name */
ee7b71db 8962 if (prop == ZPOOL_PROP_INVAL) {
8eae2d21
AJ
8963 propname = elemname;
8964 proptype = PROP_TYPE_STRING;
ee7b71db
RY
8965 } else {
8966 propname = zpool_prop_to_name(prop);
8967 proptype = zpool_prop_get_type(prop);
8eae2d21 8968 }
34dc7c2f
BB
8969
8970 if (nvpair_type(elem) == DATA_TYPE_STRING) {
8971 ASSERT(proptype == PROP_TYPE_STRING);
93cf2076
GW
8972 strval = fnvpair_value_string(elem);
8973 VERIFY0(zap_update(mos,
34dc7c2f 8974 spa->spa_pool_props_object, propname,
93cf2076 8975 1, strlen(strval) + 1, strval, tx));
6f1ffb06 8976 spa_history_log_internal(spa, "set", tx,
8eae2d21 8977 "%s=%s", elemname, strval);
34dc7c2f 8978 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
93cf2076 8979 intval = fnvpair_value_uint64(elem);
34dc7c2f
BB
8980
8981 if (proptype == PROP_TYPE_INDEX) {
8982 const char *unused;
93cf2076
GW
8983 VERIFY0(zpool_prop_index_to_string(
8984 prop, intval, &unused));
34dc7c2f 8985 }
93cf2076 8986 VERIFY0(zap_update(mos,
34dc7c2f 8987 spa->spa_pool_props_object, propname,
93cf2076 8988 8, 1, &intval, tx));
6f1ffb06 8989 spa_history_log_internal(spa, "set", tx,
8eae2d21 8990 "%s=%lld", elemname,
74756182 8991 (longlong_t)intval);
34dc7c2f 8992
44f71818
RY
8993 switch (prop) {
8994 case ZPOOL_PROP_DELEGATION:
8995 spa->spa_delegation = intval;
8996 break;
8997 case ZPOOL_PROP_BOOTFS:
8998 spa->spa_bootfs = intval;
8999 break;
9000 case ZPOOL_PROP_FAILUREMODE:
9001 spa->spa_failmode = intval;
9002 break;
9003 case ZPOOL_PROP_AUTOTRIM:
9004 spa->spa_autotrim = intval;
428870ff 9005 spa_async_request(spa,
44f71818
RY
9006 SPA_ASYNC_AUTOTRIM_RESTART);
9007 break;
9008 case ZPOOL_PROP_AUTOEXPAND:
9009 spa->spa_autoexpand = intval;
9010 if (tx->tx_txg != TXG_INITIAL)
9011 spa_async_request(spa,
9012 SPA_ASYNC_AUTOEXPAND);
9013 break;
9014 case ZPOOL_PROP_MULTIHOST:
9015 spa->spa_multihost = intval;
9016 break;
9017 default:
9018 break;
9019 }
9020 } else {
9021 ASSERT(0); /* not allowed */
34dc7c2f
BB
9022 }
9023 }
9024
34dc7c2f 9025 }
b128c09f
BB
9026
9027 mutex_exit(&spa->spa_props_lock);
34dc7c2f
BB
9028}
9029
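/*
 * Illustrative sketch, not part of the original file: the shape of the
 * nvlist that spa_sync_props() consumes.  In the real code the list is
 * validated and handed in by the property-set sync task; the names and
 * values below are only examples.
 */
static nvlist_t *
example_build_prop_nvlist(void)
{
	nvlist_t *props = fnvlist_alloc();

	/* Index and number properties are passed as uint64 values. */
	fnvlist_add_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1);

	/* String properties, such as the pool comment, stay strings. */
	fnvlist_add_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_COMMENT), "example comment");

	return (props);
}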
428870ff
BB
9030/*
9031 * Perform one-time upgrade on-disk changes. spa_version() does not
9032 * reflect the new version this txg, so there must be no changes this
9033 * txg to anything that the upgrade code depends on after it executes.
9034 * Therefore this must be called after dsl_pool_sync() does the sync
9035 * tasks.
9036 */
9037static void
9038spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
9039{
8dc2197b
SD
9040 if (spa_sync_pass(spa) != 1)
9041 return;
428870ff 9042
8dc2197b 9043 dsl_pool_t *dp = spa->spa_dsl_pool;
13fe0198
MA
9044 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
9045
428870ff
BB
9046 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
9047 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
9048 dsl_pool_create_origin(dp, tx);
9049
9050 /* Keeping the origin open increases spa_minref */
9051 spa->spa_minref += 3;
9052 }
9053
9054 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
9055 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
9056 dsl_pool_upgrade_clones(dp, tx);
9057 }
9058
9059 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
9060 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
9061 dsl_pool_upgrade_dir_clones(dp, tx);
9062
9063 /* Keeping the freedir open increases spa_minref */
9064 spa->spa_minref += 3;
9065 }
9ae529ec
CS
9066
9067 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
9068 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
9069 spa_feature_create_zap_objects(spa, tx);
9070 }
62bdd5eb
DL
9071
9072 /*
9073 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
9074 * when the ability to use lz4 compression for metadata was added.
9075 * Old pools that have this feature enabled must be upgraded to have
9076 * this feature active.
9077 */
9078 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
9079 boolean_t lz4_en = spa_feature_is_enabled(spa,
9080 SPA_FEATURE_LZ4_COMPRESS);
9081 boolean_t lz4_ac = spa_feature_is_active(spa,
9082 SPA_FEATURE_LZ4_COMPRESS);
9083
9084 if (lz4_en && !lz4_ac)
9085 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
9086 }
3c67d83a
TH
9087
9088 /*
9089 * If we haven't written the salt, do so now. Note that the
9090 * feature may not be activated yet, but that's fine since
9091 * the presence of this ZAP entry is backwards compatible.
9092 */
9093 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
9094 DMU_POOL_CHECKSUM_SALT) == ENOENT) {
9095 VERIFY0(zap_add(spa->spa_meta_objset,
9096 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
9097 sizeof (spa->spa_cksum_salt.zcs_bytes),
9098 spa->spa_cksum_salt.zcs_bytes, tx));
9099 }
9100
13fe0198 9101 rrw_exit(&dp->dp_config_rwlock, FTAG);
428870ff
BB
9102}
9103
a1d477c2
MA
9104static void
9105vdev_indirect_state_sync_verify(vdev_t *vd)
9106{
2a8ba608
MM
9107 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
9108 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
a1d477c2
MA
9109
9110 if (vd->vdev_ops == &vdev_indirect_ops) {
9111 ASSERT(vim != NULL);
9112 ASSERT(vib != NULL);
9113 }
9114
27f80e85
BB
9115 uint64_t obsolete_sm_object = 0;
9116 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
9117 if (obsolete_sm_object != 0) {
a1d477c2
MA
9118 ASSERT(vd->vdev_obsolete_sm != NULL);
9119 ASSERT(vd->vdev_removing ||
9120 vd->vdev_ops == &vdev_indirect_ops);
9121 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
9122 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
27f80e85 9123 ASSERT3U(obsolete_sm_object, ==,
a1d477c2
MA
9124 space_map_object(vd->vdev_obsolete_sm));
9125 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
9126 space_map_allocated(vd->vdev_obsolete_sm));
9127 }
9128 ASSERT(vd->vdev_obsolete_segments != NULL);
9129
9130 /*
9131 * Since frees / remaps to an indirect vdev can only
9132 * happen in syncing context, the obsolete segments
9133 * tree must be empty when we start syncing.
9134 */
9135 ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
9136}
9137
34dc7c2f 9138/*
8dc2197b
SD
9139 * Set the top-level vdev's max queue depth. Evaluate each top-level's
9140 * async write queue depth in case it changed. The max queue depth will
9141 * not change in the middle of syncing out this txg.
34dc7c2f 9142 */
8dc2197b
SD
9143static void
9144spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
34dc7c2f 9145{
8dc2197b
SD
9146 ASSERT(spa_writeable(spa));
9147
34dc7c2f 9148 vdev_t *rvd = spa->spa_root_vdev;
3dfb57a3
DB
9149 uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
9150 zfs_vdev_queue_depth_pct / 100;
8dc2197b
SD
9151 metaslab_class_t *normal = spa_normal_class(spa);
9152 metaslab_class_t *special = spa_special_class(spa);
9153 metaslab_class_t *dedup = spa_dedup_class(spa);
34dc7c2f 9154
492f64e9 9155 uint64_t slots_per_allocator = 0;
1c27024e 9156 for (int c = 0; c < rvd->vdev_children; c++) {
3dfb57a3 9157 vdev_t *tvd = rvd->vdev_child[c];
cc99f275 9158
8dc2197b 9159 metaslab_group_t *mg = tvd->vdev_mg;
cc99f275
DB
9160 if (mg == NULL || !metaslab_group_initialized(mg))
9161 continue;
3dfb57a3 9162
8dc2197b 9163 metaslab_class_t *mc = mg->mg_class;
cc99f275 9164 if (mc != normal && mc != special && mc != dedup)
3dfb57a3
DB
9165 continue;
9166
9167 /*
9168 * It is safe to do a lock-free check here because only async
9169 * allocations look at mg_max_alloc_queue_depth, and async
9170 * allocations all happen from spa_sync().
9171 */
32d805c3 9172 for (int i = 0; i < mg->mg_allocators; i++) {
424fd7c3 9173 ASSERT0(zfs_refcount_count(
32d805c3
MA
9174 &(mg->mg_allocator[i].mga_alloc_queue_depth)));
9175 }
3dfb57a3 9176 mg->mg_max_alloc_queue_depth = max_queue_depth;
492f64e9 9177
32d805c3
MA
9178 for (int i = 0; i < mg->mg_allocators; i++) {
9179 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
492f64e9
PD
9180 zfs_vdev_def_queue_depth;
9181 }
9182 slots_per_allocator += zfs_vdev_def_queue_depth;
3dfb57a3 9183 }
cc99f275 9184
492f64e9 9185 for (int i = 0; i < spa->spa_alloc_count; i++) {
f8020c93
AM
9186 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
9187 mca_alloc_slots));
9188 ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
9189 mca_alloc_slots));
9190 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
9191 mca_alloc_slots));
9192 normal->mc_allocator[i].mca_alloc_max_slots =
9193 slots_per_allocator;
9194 special->mc_allocator[i].mca_alloc_max_slots =
9195 slots_per_allocator;
9196 dedup->mc_allocator[i].mca_alloc_max_slots =
9197 slots_per_allocator;
cc99f275
DB
9198 }
9199 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
9200 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
9201 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
8dc2197b
SD
9202}
9203
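/*
 * Illustrative sketch, not part of the original file: the per-group limit
 * above is zfs_vdev_async_write_max_active scaled by
 * zfs_vdev_queue_depth_pct.  With hypothetical values of 10 and 1000
 * (i.e. 1000%), each metaslab group would admit up to 100 queued async
 * allocations.
 */
static uint32_t
example_max_queue_depth(uint32_t async_write_max_active,
    uint32_t queue_depth_pct)
{
	return (async_write_max_active * queue_depth_pct / 100);
}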
9204static void
9205spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
9206{
9207 ASSERT(spa_writeable(spa));
3dfb57a3 9208
8dc2197b 9209 vdev_t *rvd = spa->spa_root_vdev;
a1d477c2
MA
9210 for (int c = 0; c < rvd->vdev_children; c++) {
9211 vdev_t *vd = rvd->vdev_child[c];
9212 vdev_indirect_state_sync_verify(vd);
9213
9214 if (vdev_indirect_should_condense(vd)) {
9215 spa_condense_indirect_start_sync(vd, tx);
9216 break;
9217 }
9218 }
8dc2197b
SD
9219}
9220
9221static void
9222spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
9223{
9224 objset_t *mos = spa->spa_meta_objset;
9225 dsl_pool_t *dp = spa->spa_dsl_pool;
9226 uint64_t txg = tx->tx_txg;
9227 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
a1d477c2 9228
34dc7c2f 9229 do {
428870ff 9230 int pass = ++spa->spa_sync_pass;
34dc7c2f
BB
9231
9232 spa_sync_config_object(spa, tx);
9233 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
9234 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
9235 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
9236 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
9237 spa_errlog_sync(spa, txg);
9238 dsl_pool_sync(dp, txg);
9239
93e28d66
SD
9240 if (pass < zfs_sync_pass_deferred_free ||
9241 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
9242 /*
9243 * If the log space map feature is active we don't
9244 * care about deferred frees and the deferred bpobj
9245 * as the log space map should effectively have the
9246 * same results (i.e. appending only to one object).
9247 */
e8b96c60 9248 spa_sync_frees(spa, free_bpl, tx);
428870ff 9249 } else {
905edb40
MA
9250 /*
9251 * We can not defer frees in pass 1, because
9252 * we sync the deferred frees later in pass 1.
9253 */
9254 ASSERT3U(pass, >, 1);
37f03da8 9255 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
e8b96c60 9256 &spa->spa_deferred_bpobj, tx);
34dc7c2f
BB
9257 }
9258
67a1b037 9259 brt_sync(spa, txg);
428870ff
BB
9260 ddt_sync(spa, txg);
9261 dsl_scan_sync(dp, tx);
482eeef8 9262 dsl_errorscrub_sync(dp, tx);
8dc2197b
SD
9263 svr_sync(spa, tx);
9264 spa_sync_upgrades(spa, tx);
34dc7c2f 9265
93e28d66
SD
9266 spa_flush_metaslabs(spa, tx);
9267
8dc2197b 9268 vdev_t *vd = NULL;
a1d477c2
MA
9269 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
9270 != NULL)
428870ff
BB
9271 vdev_sync(vd, txg);
9272
8dc2197b
SD
9273 /*
9274 * Note: We need to check if the MOS is dirty because we could
9275 * have marked the MOS dirty without updating the uberblock
9276 * (e.g. if we have sync tasks but no dirty user data). We need
9277 * to check the uberblock's rootbp because it is updated if we
9278 * have synced out dirty data (though in this case the MOS will
9279 * most likely also be dirty due to second order effects, we
9280 * don't want to rely on that here).
9281 */
9282 if (pass == 1 &&
9283 spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
9284 !dmu_objset_is_dirty(mos, txg)) {
905edb40 9285 /*
8dc2197b
SD
9286 * Nothing changed on the first pass, therefore this
9287 * TXG is a no-op. Avoid syncing deferred frees, so
9288 * that we can keep this TXG as a no-op.
905edb40 9289 */
8dc2197b
SD
9290 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
9291 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
9292 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
9293 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
9294 break;
905edb40 9295 }
34dc7c2f 9296
8dc2197b 9297 spa_sync_deferred_frees(spa, tx);
428870ff 9298 } while (dmu_objset_is_dirty(mos, txg));
8dc2197b 9299}
34dc7c2f 9300
8dc2197b
SD
9301/*
9302 * Rewrite the vdev configuration (which includes the uberblock) to
9303 * commit the transaction group.
9304 *
9305 * If there are no dirty vdevs, we sync the uberblock to a few random
9306 * top-level vdevs that are known to be visible in the config cache
9307 * (see spa_vdev_add() for a complete description). If there *are* dirty
9308 * vdevs, sync the uberblock to all vdevs.
9309 */
9310static void
9311spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
9312{
9313 vdev_t *rvd = spa->spa_root_vdev;
9314 uint64_t txg = tx->tx_txg;
a1d477c2 9315
b128c09f 9316 for (;;) {
8dc2197b
SD
9317 int error = 0;
9318
b128c09f
BB
9319 /*
9320 * We hold SCL_STATE to prevent vdev open/close/etc.
9321 * while we're attempting to write the vdev labels.
9322 */
9323 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
9324
9325 if (list_is_empty(&spa->spa_config_dirty_list)) {
d2734cce 9326 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
b128c09f
BB
9327 int svdcount = 0;
9328 int children = rvd->vdev_children;
29274c9f 9329 int c0 = random_in_range(children);
b128c09f 9330
1c27024e 9331 for (int c = 0; c < children; c++) {
8dc2197b
SD
9332 vdev_t *vd =
9333 rvd->vdev_child[(c0 + c) % children];
d2734cce
SD
9334
9335 /* Stop when revisiting the first vdev */
9336 if (c > 0 && svd[0] == vd)
9337 break;
9338
8dc2197b
SD
9339 if (vd->vdev_ms_array == 0 ||
9340 vd->vdev_islog ||
a1d477c2 9341 !vdev_is_concrete(vd))
b128c09f 9342 continue;
d2734cce 9343
b128c09f 9344 svd[svdcount++] = vd;
6cb8e530 9345 if (svdcount == SPA_SYNC_MIN_VDEVS)
b128c09f
BB
9346 break;
9347 }
b6fcb792 9348 error = vdev_config_sync(svd, svdcount, txg);
b128c09f
BB
9349 } else {
9350 error = vdev_config_sync(rvd->vdev_child,
b6fcb792 9351 rvd->vdev_children, txg);
34dc7c2f 9352 }
34dc7c2f 9353
3bc7e0fb
GW
9354 if (error == 0)
9355 spa->spa_last_synced_guid = rvd->vdev_guid;
9356
b128c09f
BB
9357 spa_config_exit(spa, SCL_STATE, FTAG);
9358
9359 if (error == 0)
9360 break;
cec3a0a1 9361 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
b128c09f
BB
9362 zio_resume_wait(spa);
9363 }
8dc2197b
SD
9364}
9365
9366/*
9367 * Sync the specified transaction group. New blocks may be dirtied as
9368 * part of the process, so we iterate until it converges.
9369 */
9370void
9371spa_sync(spa_t *spa, uint64_t txg)
9372{
9373 vdev_t *vd = NULL;
9374
9375 VERIFY(spa_writeable(spa));
9376
9377 /*
9378 * Wait for i/os issued in open context that need to complete
9379 * before this txg syncs.
9380 */
9381 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
9382 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
9383 ZIO_FLAG_CANFAIL);
9384
67a1b037
PJD
9385 /*
9386 * Now that no more cloning can happen in this transaction group,
9387 * but before we issue any frees, we can process pending BRT
9388 * updates.
9389 */
9390 brt_pending_apply(spa, txg);
9391
8dc2197b
SD
9392 /*
9393 * Lock out configuration changes.
9394 */
9395 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
9396
9397 spa->spa_syncing_txg = txg;
9398 spa->spa_sync_pass = 0;
9399
9400 for (int i = 0; i < spa->spa_alloc_count; i++) {
1b50749c
AM
9401 mutex_enter(&spa->spa_allocs[i].spaa_lock);
9402 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
9403 mutex_exit(&spa->spa_allocs[i].spaa_lock);
8dc2197b
SD
9404 }
9405
9406 /*
9407 * If there are any pending vdev state changes, convert them
9408 * into config changes that go out with this transaction group.
9409 */
9410 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
9f08b6e3
RY
9411 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
9412 /* Avoid holding the write lock unless actually necessary */
9413 if (vd->vdev_aux == NULL) {
9414 vdev_state_clean(vd);
9415 vdev_config_dirty(vd);
9416 continue;
9417 }
8dc2197b
SD
9418 /*
9419 * We need the write lock here because, for aux vdevs,
9420 * calling vdev_config_dirty() modifies sav_config.
9421 * This is ugly and will become unnecessary when we
9422 * eliminate the aux vdev wart by integrating all vdevs
9423 * into the root vdev tree.
9424 */
9425 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9426 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
9427 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
9428 vdev_state_clean(vd);
9429 vdev_config_dirty(vd);
9430 }
9431 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9432 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
9433 }
9434 spa_config_exit(spa, SCL_STATE, FTAG);
9435
9436 dsl_pool_t *dp = spa->spa_dsl_pool;
9437 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
9438
9439 spa->spa_sync_starttime = gethrtime();
9440 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
9441 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
9442 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
9443 NSEC_TO_TICK(spa->spa_deadman_synctime));
9444
9445 /*
9446 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
9447 * set spa_deflate if we have no raid-z vdevs.
9448 */
9449 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
9450 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
9451 vdev_t *rvd = spa->spa_root_vdev;
9452
9453 int i;
9454 for (i = 0; i < rvd->vdev_children; i++) {
9455 vd = rvd->vdev_child[i];
9456 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
9457 break;
9458 }
9459 if (i == rvd->vdev_children) {
9460 spa->spa_deflate = TRUE;
9461 VERIFY0(zap_add(spa->spa_meta_objset,
9462 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
9463 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
9464 }
9465 }
9466
9467 spa_sync_adjust_vdev_max_queue_depth(spa);
9468
9469 spa_sync_condense_indirect(spa, tx);
9470
9471 spa_sync_iterate_to_convergence(spa, tx);
9472
9473#ifdef ZFS_DEBUG
9474 if (!list_is_empty(&spa->spa_config_dirty_list)) {
9475 /*
9476 * Make sure that the number of ZAPs for all the vdevs matches
9477 * the number of ZAPs in the per-vdev ZAP list. This only gets
9478 * called if the config is dirty; otherwise there may be
9479 * outstanding AVZ operations that weren't completed in
9480 * spa_sync_config_object.
9481 */
9482 uint64_t all_vdev_zap_entry_count;
9483 ASSERT0(zap_count(spa->spa_meta_objset,
9484 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
9485 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
9486 all_vdev_zap_entry_count);
9487 }
9488#endif
9489
9490 if (spa->spa_vdev_removal != NULL) {
9491 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
9492 }
9493
9494 spa_sync_rewrite_vdev_config(spa, tx);
34dc7c2f
BB
9495 dmu_tx_commit(tx);
9496
57ddcda1 9497 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
cc92e9d0
GW
9498 spa->spa_deadman_tqid = 0;
9499
34dc7c2f
BB
9500 /*
9501 * Clear the dirty config list.
9502 */
b128c09f 9503 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
34dc7c2f
BB
9504 vdev_config_clean(vd);
9505
9506 /*
9507 * Now that the new config has synced transactionally,
9508 * let it become visible to the config cache.
9509 */
9510 if (spa->spa_config_syncing != NULL) {
9511 spa_config_set(spa, spa->spa_config_syncing);
9512 spa->spa_config_txg = txg;
9513 spa->spa_config_syncing = NULL;
9514 }
9515
428870ff 9516 dsl_pool_sync_done(dp, txg);
34dc7c2f 9517
492f64e9 9518 for (int i = 0; i < spa->spa_alloc_count; i++) {
1b50749c
AM
9519 mutex_enter(&spa->spa_allocs[i].spaa_lock);
9520 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
9521 mutex_exit(&spa->spa_allocs[i].spaa_lock);
492f64e9 9522 }
3dfb57a3 9523
34dc7c2f
BB
9524 /*
9525 * Update usable space statistics.
9526 */
619f0976
GW
9527 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
9528 != NULL)
34dc7c2f 9529 vdev_sync_done(vd, txg);
f09fda50
PD
9530
9531 metaslab_class_evict_old(spa->spa_normal_class, txg);
9532 metaslab_class_evict_old(spa->spa_log_class, txg);
9533
93e28d66 9534 spa_sync_close_syncing_log_sm(spa);
34dc7c2f 9535
428870ff
BB
9536 spa_update_dspace(spa);
9537
65d10bd8
KJ
9538 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
9539 vdev_autotrim_kick(spa);
9540
34dc7c2f
BB
9541 /*
9542 * It had better be the case that we didn't dirty anything
9543 * since vdev_config_sync().
9544 */
9545 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
9546 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
9547 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
428870ff 9548
d2734cce
SD
9549 while (zfs_pause_spa_sync)
9550 delay(1);
9551
428870ff 9552 spa->spa_sync_pass = 0;
34dc7c2f 9553
55922e73
GW
9554 /*
9555 * Update the last synced uberblock here. We want to do this at
9556 * the end of spa_sync() so that consumers of spa_last_synced_txg()
9557 * will be guaranteed that all the processing associated with
9558 * that txg has been completed.
9559 */
9560 spa->spa_ubsync = spa->spa_uberblock;
b128c09f 9561 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f 9562
428870ff
BB
9563 spa_handle_ignored_writes(spa);
9564
34dc7c2f
BB
9565 /*
9566 * If any async tasks have been requested, kick them off.
9567 */
9568 spa_async_dispatch(spa);
9569}
9570
9571/*
9572 * Sync all pools. We don't want to hold the namespace lock across these
9573 * operations, so we take a reference on the spa_t and drop the lock during the
9574 * sync.
9575 */
9576void
9577spa_sync_allpools(void)
9578{
9579 spa_t *spa = NULL;
9580 mutex_enter(&spa_namespace_lock);
9581 while ((spa = spa_next(spa)) != NULL) {
572e2857
BB
9582 if (spa_state(spa) != POOL_STATE_ACTIVE ||
9583 !spa_writeable(spa) || spa_suspended(spa))
34dc7c2f
BB
9584 continue;
9585 spa_open_ref(spa, FTAG);
9586 mutex_exit(&spa_namespace_lock);
9587 txg_wait_synced(spa_get_dsl(spa), 0);
9588 mutex_enter(&spa_namespace_lock);
9589 spa_close(spa, FTAG);
9590 }
9591 mutex_exit(&spa_namespace_lock);
9592}
9593
9594/*
9595 * ==========================================================================
9596 * Miscellaneous routines
9597 * ==========================================================================
9598 */
9599
9600/*
9601 * Remove all pools in the system.
9602 */
9603void
9604spa_evict_all(void)
9605{
9606 spa_t *spa;
9607
9608 /*
9609 * Remove all cached state. All pools should be closed now,
9610 * so every spa in the AVL tree should be unreferenced.
9611 */
9612 mutex_enter(&spa_namespace_lock);
9613 while ((spa = spa_next(NULL)) != NULL) {
9614 /*
9615 * Stop async tasks. The async thread may need to detach
9616 * a device that's been replaced, which requires grabbing
9617 * spa_namespace_lock, so we must drop it here.
9618 */
9619 spa_open_ref(spa, FTAG);
9620 mutex_exit(&spa_namespace_lock);
9621 spa_async_suspend(spa);
9622 mutex_enter(&spa_namespace_lock);
34dc7c2f
BB
9623 spa_close(spa, FTAG);
9624
9625 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
9626 spa_unload(spa);
9627 spa_deactivate(spa);
9628 }
9629 spa_remove(spa);
9630 }
9631 mutex_exit(&spa_namespace_lock);
9632}
9633
9634vdev_t *
9babb374 9635spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
34dc7c2f 9636{
b128c09f
BB
9637 vdev_t *vd;
9638 int i;
9639
9640 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
9641 return (vd);
9642
9babb374 9643 if (aux) {
b128c09f
BB
9644 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
9645 vd = spa->spa_l2cache.sav_vdevs[i];
9babb374
BB
9646 if (vd->vdev_guid == guid)
9647 return (vd);
9648 }
9649
9650 for (i = 0; i < spa->spa_spares.sav_count; i++) {
9651 vd = spa->spa_spares.sav_vdevs[i];
b128c09f
BB
9652 if (vd->vdev_guid == guid)
9653 return (vd);
9654 }
9655 }
9656
9657 return (NULL);
34dc7c2f
BB
9658}
9659
9660void
9661spa_upgrade(spa_t *spa, uint64_t version)
9662{
572e2857
BB
9663 ASSERT(spa_writeable(spa));
9664
b128c09f 9665 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
9666
9667 /*
9668 * This should only be called for a non-faulted pool, and since a
9669 * future version would result in an unopenable pool, this shouldn't be
9670 * possible.
9671 */
8dca0a9a 9672 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
9b67f605 9673 ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
34dc7c2f
BB
9674
9675 spa->spa_uberblock.ub_version = version;
9676 vdev_config_dirty(spa->spa_root_vdev);
9677
b128c09f 9678 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
9679
9680 txg_wait_synced(spa_get_dsl(spa), 0);
9681}
9682
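/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller moving a pool to a newer on-disk version.  spa_upgrade() asserts
 * that the version only moves forward and then waits for the dirtied
 * config to sync out.
 */
static void
example_upgrade_pool(spa_t *spa, uint64_t target_version)
{
	if (target_version > spa_version(spa))
		spa_upgrade(spa, target_version);
}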
49d42425
FU
9683static boolean_t
9684spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
34dc7c2f 9685{
14e4e3cb 9686 (void) spa;
34dc7c2f 9687 int i;
49d42425 9688 uint64_t vdev_guid;
34dc7c2f
BB
9689
9690 for (i = 0; i < sav->sav_count; i++)
9691 if (sav->sav_vdevs[i]->vdev_guid == guid)
9692 return (B_TRUE);
9693
9694 for (i = 0; i < sav->sav_npending; i++) {
9695 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
49d42425 9696 &vdev_guid) == 0 && vdev_guid == guid)
34dc7c2f
BB
9697 return (B_TRUE);
9698 }
9699
9700 return (B_FALSE);
9701}
9702
49d42425
FU
9703boolean_t
9704spa_has_l2cache(spa_t *spa, uint64_t guid)
9705{
9706 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache));
9707}
9708
9709boolean_t
9710spa_has_spare(spa_t *spa, uint64_t guid)
9711{
9712 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares));
9713}
9714
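/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller checking whether a guid is already known to the pool as an aux
 * device before trying to add it again.
 */
static boolean_t
example_guid_is_aux(spa_t *spa, uint64_t guid)
{
	return (spa_has_spare(spa, guid) || spa_has_l2cache(spa, guid));
}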
b128c09f
BB
9715/*
9716 * Check if a pool has an active shared spare device.
9717 * Note: reference count of an active spare is 2, as a spare and as a replace
9718 */
9719static boolean_t
9720spa_has_active_shared_spare(spa_t *spa)
9721{
9722 int i, refcnt;
9723 uint64_t pool;
9724 spa_aux_vdev_t *sav = &spa->spa_spares;
9725
9726 for (i = 0; i < sav->sav_count; i++) {
9727 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
9728 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
9729 refcnt > 2)
9730 return (B_TRUE);
9731 }
9732
9733 return (B_FALSE);
9734}
9735
93e28d66
SD
9736uint64_t
9737spa_total_metaslabs(spa_t *spa)
9738{
9739 vdev_t *rvd = spa->spa_root_vdev;
9740
9741 uint64_t m = 0;
9742 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
9743 vdev_t *vd = rvd->vdev_child[c];
9744 if (!vdev_is_concrete(vd))
9745 continue;
9746 m += vd->vdev_ms_count;
9747 }
9748 return (m);
9749}
9750
e60e158e
JG
9751/*
9752 * Notify any waiting threads that some activity has switched from being in-
9753 * progress to not-in-progress so that the thread can wake up and determine
9754 * whether it is finished waiting.
9755 */
9756void
9757spa_notify_waiters(spa_t *spa)
9758{
9759 /*
9760 * Acquiring spa_activities_lock here prevents the cv_broadcast from
9761 * happening between the waiting thread's check and cv_wait.
9762 */
9763 mutex_enter(&spa->spa_activities_lock);
9764 cv_broadcast(&spa->spa_activities_cv);
9765 mutex_exit(&spa->spa_activities_lock);
9766}
9767
9768/*
9769 * Notify any waiting threads that the pool is exporting, and then block until
9770 * they are finished using the spa_t.
9771 */
9772void
9773spa_wake_waiters(spa_t *spa)
9774{
9775 mutex_enter(&spa->spa_activities_lock);
9776 spa->spa_waiters_cancel = B_TRUE;
9777 cv_broadcast(&spa->spa_activities_cv);
9778 while (spa->spa_waiters != 0)
9779 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
9780 spa->spa_waiters_cancel = B_FALSE;
9781 mutex_exit(&spa->spa_activities_lock);
9782}
9783
2288d419 9784/* Whether the vdev or any of its descendants are being initialized/trimmed. */
e60e158e 9785static boolean_t
2288d419 9786spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
e60e158e
JG
9787{
9788 spa_t *spa = vd->vdev_spa;
e60e158e
JG
9789
9790 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
9791 ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
2288d419
BB
9792 ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
9793 activity == ZPOOL_WAIT_TRIM);
9794
9795 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
9796 &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
e60e158e
JG
9797
9798 mutex_exit(&spa->spa_activities_lock);
2288d419 9799 mutex_enter(lock);
e60e158e
JG
9800 mutex_enter(&spa->spa_activities_lock);
9801
2288d419
BB
9802 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
9803 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
9804 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
9805 mutex_exit(lock);
e60e158e 9806
2288d419 9807 if (in_progress)
e60e158e
JG
9808 return (B_TRUE);
9809
9810 for (int i = 0; i < vd->vdev_children; i++) {
2288d419
BB
9811 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
9812 activity))
e60e158e
JG
9813 return (B_TRUE);
9814 }
9815
9816 return (B_FALSE);
9817}
9818
9819/*
9820 * If use_guid is true, this checks whether the vdev specified by guid is
2288d419
BB
9821 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
9822 * is being initialized/trimmed. The caller must hold the config lock and
9823 * spa_activities_lock.
e60e158e
JG
9824 */
9825static int
2288d419
BB
9826spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
9827 zpool_wait_activity_t activity, boolean_t *in_progress)
e60e158e
JG
9828{
9829 mutex_exit(&spa->spa_activities_lock);
9830 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
9831 mutex_enter(&spa->spa_activities_lock);
9832
9833 vdev_t *vd;
9834 if (use_guid) {
9835 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
9836 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
9837 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9838 return (EINVAL);
9839 }
9840 } else {
9841 vd = spa->spa_root_vdev;
9842 }
9843
2288d419 9844 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
e60e158e
JG
9845
9846 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9847 return (0);
9848}
9849
9850/*
9851 * Locking for waiting threads
9852 * ---------------------------
9853 *
9854 * Waiting threads need a way to check whether a given activity is in progress,
9855 * and then, if it is, wait for it to complete. Each activity will have some
9856 * in-memory representation of the relevant on-disk state which can be used to
9857 * determine whether or not the activity is in progress. The in-memory state and
9858 * the locking used to protect it will be different for each activity, and may
9859 * not be suitable for use with a cvar (e.g., some state is protected by the
9860 * config lock). To allow waiting threads to wait without any races, another
9861 * lock, spa_activities_lock, is used.
9862 *
9863 * When the state is checked, both the activity-specific lock (if there is one)
9864 * and spa_activities_lock are held. In some cases, the activity-specific lock
9865 * is acquired explicitly (e.g. the config lock). In others, the locking is
9866 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
9867 * thread releases the activity-specific lock and, if the activity is in
9868 * progress, then cv_waits using spa_activities_lock.
9869 *
9870 * The waiting thread is woken when another thread, one completing some
9871 * activity, updates the state of the activity and then calls
9872 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
9873 * needs to hold its activity-specific lock when updating the state, and this
9874 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
9875 *
9876 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
9877 * and because it is held when the waiting thread checks the state of the
9878 * activity, it can never be the case that the completing thread both updates
9879 * the activity state and cv_broadcasts in between the waiting thread's check
9880 * and cv_wait. Thus, a waiting thread can never miss a wakeup.
9881 *
9882 * In order to prevent deadlock, when the waiting thread does its check, in some
9883 * cases it will temporarily drop spa_activities_lock in order to acquire the
9884 * activity-specific lock. The order in which spa_activities_lock and the
9885 * activity specific lock are acquired in the waiting thread is determined by
9886 * the order in which they are acquired in the completing thread; if the
9887 * completing thread calls spa_notify_waiters with the activity-specific lock
9888 * held, then the waiting thread must also acquire the activity-specific lock
9889 * first.
9890 */
9891
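/*
 * Illustrative sketch, not part of the original file: the waiter protocol
 * described above, reduced to its two halves.  The predicate below is
 * hypothetical and simply mirrors the ZPOOL_WAIT_REMOVE check in
 * spa_activity_in_progress(); a real activity may also have to drop and
 * retake spa_activities_lock around its own lock, as
 * spa_vdev_activity_in_progress() does above.
 */
static boolean_t
example_activity_in_progress(spa_t *spa)
{
	return (spa->spa_removing_phys.sr_state == DSS_SCANNING);
}

static void
example_wait_for_activity(spa_t *spa)
{
	mutex_enter(&spa->spa_activities_lock);
	while (example_activity_in_progress(spa))
		cv_wait(&spa->spa_activities_cv, &spa->spa_activities_lock);
	mutex_exit(&spa->spa_activities_lock);
}

static void
example_finish_activity(spa_t *spa)
{
	/* ... update the activity's in-memory state here ... */

	/* Wake any waiters; spa_notify_waiters() takes the lock itself. */
	spa_notify_waiters(spa);
}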
9892static int
9893spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
9894 boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
9895{
9896 int error = 0;
9897
9898 ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
9899
9900 switch (activity) {
9901 case ZPOOL_WAIT_CKPT_DISCARD:
9902 *in_progress =
9903 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
9904 zap_contains(spa_meta_objset(spa),
9905 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
9906 ENOENT);
9907 break;
9908 case ZPOOL_WAIT_FREE:
9909 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
9910 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
9911 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
9912 spa_livelist_delete_check(spa));
9913 break;
9914 case ZPOOL_WAIT_INITIALIZE:
2288d419
BB
9915 case ZPOOL_WAIT_TRIM:
9916 error = spa_vdev_activity_in_progress(spa, use_tag, tag,
9917 activity, in_progress);
e60e158e
JG
9918 break;
9919 case ZPOOL_WAIT_REPLACE:
9920 mutex_exit(&spa->spa_activities_lock);
9921 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
9922 mutex_enter(&spa->spa_activities_lock);
9923
9924 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
9925 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
9926 break;
9927 case ZPOOL_WAIT_REMOVE:
9928 *in_progress = (spa->spa_removing_phys.sr_state ==
9929 DSS_SCANNING);
9930 break;
9931 case ZPOOL_WAIT_RESILVER:
9a49d3f3
BB
9932 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
9933 break;
9a70e97f 9934 zfs_fallthrough;
e60e158e
JG
9935 case ZPOOL_WAIT_SCRUB:
9936 {
9937 boolean_t scanning, paused, is_scrub;
9938 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
9939
9940 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
9941 scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
9942 paused = dsl_scan_is_paused_scrub(scn);
9943 *in_progress = (scanning && !paused &&
9944 is_scrub == (activity == ZPOOL_WAIT_SCRUB));
9945 break;
9946 }
9947 default:
9948 panic("unrecognized value for activity %d", activity);
9949 }
9950
9951 return (error);
9952}
9953
9954static int
9955spa_wait_common(const char *pool, zpool_wait_activity_t activity,
9956 boolean_t use_tag, uint64_t tag, boolean_t *waited)
9957{
9958 /*
9959 * The tag is used to distinguish between instances of an activity.
2288d419
BB
9960 * 'initialize' and 'trim' are the only activities that we use this for.
9961 * The other activities can only have a single instance in progress in a
9962 * pool at one time, making the tag unnecessary.
e60e158e
JG
9963 *
9964 * There can be multiple devices being replaced at once, but since they
9965 * all finish once resilvering finishes, we don't bother keeping track
9966 * of them individually; we just wait for them all to finish.
9967 */
2288d419
BB
9968 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
9969 activity != ZPOOL_WAIT_TRIM)
e60e158e
JG
9970 return (EINVAL);
9971
9972 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
9973 return (EINVAL);
9974
9975 spa_t *spa;
9976 int error = spa_open(pool, &spa, FTAG);
9977 if (error != 0)
9978 return (error);
9979
9980 /*
9981 * Increment the spa's waiter count so that we can call spa_close and
9982 * still ensure that the spa_t doesn't get freed before this thread is
9983 * finished with it when the pool is exported. We want to call spa_close
9984 * before we start waiting because otherwise the additional ref would
9985 * prevent the pool from being exported or destroyed throughout the
9986 * potentially long wait.
9987 */
9988 mutex_enter(&spa->spa_activities_lock);
9989 spa->spa_waiters++;
9990 spa_close(spa, FTAG);
9991
9992 *waited = B_FALSE;
9993 for (;;) {
9994 boolean_t in_progress;
9995 error = spa_activity_in_progress(spa, activity, use_tag, tag,
9996 &in_progress);
9997
b24771a8 9998 if (error || !in_progress || spa->spa_waiters_cancel)
e60e158e
JG
9999 break;
10000
10001 *waited = B_TRUE;
10002
10003 if (cv_wait_sig(&spa->spa_activities_cv,
10004 &spa->spa_activities_lock) == 0) {
10005 error = EINTR;
10006 break;
10007 }
10008 }
10009
10010 spa->spa_waiters--;
10011 cv_signal(&spa->spa_waiters_cv);
10012 mutex_exit(&spa->spa_activities_lock);
10013
10014 return (error);
10015}
10016
10017/*
10018 * Wait for a particular instance of the specified activity to complete, where
10019 * the instance is identified by 'tag'
10020 */
10021int
10022spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
10023 boolean_t *waited)
10024{
10025 return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
10026}
10027
10028/*
10029 * Wait for all instances of the specified activity to complete
10030 */
10031int
10032spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
10033{
10034
10035 return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
10036}
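/*
 * Usage sketch (illustrative only): roughly how a caller such as the
 * 'zpool wait' ioctl path might use these routines. The helper names and
 * the error handling are hypothetical.
 */
#if 0	/* example only, never compiled */
static int
example_wait_for_scrub(const char *pool)
{
	boolean_t waited;
	/* Block until any in-progress scrub on 'pool' finishes. */
	int error = spa_wait(pool, ZPOOL_WAIT_SCRUB, &waited);

	if (error == 0 && waited)
		zfs_dbgmsg("waited for scrub on %s", pool);
	return (error);
}

static int
example_wait_for_trim(const char *pool, uint64_t vdev_guid)
{
	boolean_t waited;
	/* A tag (here a leaf vdev guid) selects one instance of the activity. */
	return (spa_wait_tag(pool, ZPOOL_WAIT_TRIM, vdev_guid, &waited));
}
#endif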
10037
a1d477c2 10038sysevent_t *
12fa0466
DE
10039spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
10040{
10041 sysevent_t *ev = NULL;
10042#ifdef _KERNEL
10043 nvlist_t *resource;
10044
10045 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
10046 if (resource) {
10047 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
10048 ev->resource = resource;
10049 }
14e4e3cb
AZ
10050#else
10051 (void) spa, (void) vd, (void) hist_nvl, (void) name;
12fa0466
DE
10052#endif
10053 return (ev);
10054}
10055
a1d477c2 10056void
12fa0466
DE
10057spa_event_post(sysevent_t *ev)
10058{
10059#ifdef _KERNEL
10060 if (ev) {
10061 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
10062 kmem_free(ev, sizeof (*ev));
10063 }
14e4e3cb
AZ
10064#else
10065 (void) ev;
12fa0466
DE
10066#endif
10067}
10068
34dc7c2f 10069/*
fb390aaf
HR
10070 * Post a zevent corresponding to the given sysevent. The 'name' must be one
10071 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
34dc7c2f
BB
10072 * filled in from the spa and (optionally) the vdev. This doesn't do anything
10073 * in the userland libzpool, as we don't want consumers to misinterpret ztest
10074 * or zdb as real changes.
10075 */
10076void
12fa0466 10077spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
34dc7c2f 10078{
12fa0466 10079 spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
34dc7c2f 10080}
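/*
 * Usage sketch (illustrative only): posting a vdev event in the one-shot
 * form and in the split create/post form. ESC_ZFS_VDEV_ATTACH is one of
 * the event names from sys/sysevent/eventdefs.h; the helper name is
 * hypothetical.
 */
#if 0	/* example only, never compiled */
static void
example_post_vdev_event(spa_t *spa, vdev_t *vd)
{
	/* One-shot: build the zevent and post it in a single call. */
	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ATTACH);

	/*
	 * Two-step: create the event now (e.g. while still holding a lock
	 * that protects the vdev) and post it later.
	 */
	sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_ATTACH);
	spa_event_post(ev);
}
#endif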
c28b2279 10081
c28b2279
BB
10082/* state manipulation functions */
10083EXPORT_SYMBOL(spa_open);
10084EXPORT_SYMBOL(spa_open_rewind);
10085EXPORT_SYMBOL(spa_get_stats);
10086EXPORT_SYMBOL(spa_create);
c28b2279
BB
10087EXPORT_SYMBOL(spa_import);
10088EXPORT_SYMBOL(spa_tryimport);
10089EXPORT_SYMBOL(spa_destroy);
10090EXPORT_SYMBOL(spa_export);
10091EXPORT_SYMBOL(spa_reset);
10092EXPORT_SYMBOL(spa_async_request);
10093EXPORT_SYMBOL(spa_async_suspend);
10094EXPORT_SYMBOL(spa_async_resume);
10095EXPORT_SYMBOL(spa_inject_addref);
10096EXPORT_SYMBOL(spa_inject_delref);
10097EXPORT_SYMBOL(spa_scan_stat_init);
10098EXPORT_SYMBOL(spa_scan_get_stats);
10099
e1cfd73f 10100/* device manipulation */
c28b2279
BB
10101EXPORT_SYMBOL(spa_vdev_add);
10102EXPORT_SYMBOL(spa_vdev_attach);
10103EXPORT_SYMBOL(spa_vdev_detach);
c28b2279
BB
10104EXPORT_SYMBOL(spa_vdev_setpath);
10105EXPORT_SYMBOL(spa_vdev_setfru);
10106EXPORT_SYMBOL(spa_vdev_split_mirror);
10107
10108/* spare state (which is global across all pools) */
10109EXPORT_SYMBOL(spa_spare_add);
10110EXPORT_SYMBOL(spa_spare_remove);
10111EXPORT_SYMBOL(spa_spare_exists);
10112EXPORT_SYMBOL(spa_spare_activate);
10113
10114/* L2ARC state (which is global across all pools) */
10115EXPORT_SYMBOL(spa_l2cache_add);
10116EXPORT_SYMBOL(spa_l2cache_remove);
10117EXPORT_SYMBOL(spa_l2cache_exists);
10118EXPORT_SYMBOL(spa_l2cache_activate);
10119EXPORT_SYMBOL(spa_l2cache_drop);
10120
10121/* scanning */
10122EXPORT_SYMBOL(spa_scan);
10123EXPORT_SYMBOL(spa_scan_stop);
10124
10125/* spa syncing */
10126EXPORT_SYMBOL(spa_sync); /* only for DMU use */
10127EXPORT_SYMBOL(spa_sync_allpools);
10128
10129/* properties */
10130EXPORT_SYMBOL(spa_prop_set);
10131EXPORT_SYMBOL(spa_prop_get);
10132EXPORT_SYMBOL(spa_prop_clear_bootfs);
10133
10134/* asynchronous event notification */
10135EXPORT_SYMBOL(spa_event_notify);
dea377c0 10136
c8242a96 10137/* BEGIN CSTYLED */
fdc2d303 10138ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
458f8231 10139 "log2 fraction of arc that can be used by inflight I/Os when "
03fdcb9a 10140 "verifying pool during import");
7ada752a 10141/* END CSTYLED */
dea377c0 10142
03fdcb9a 10143ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
dea377c0
MA
10144 "Set to traverse metadata on pool import");
10145
03fdcb9a 10146ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
dea377c0 10147 "Set to traverse data on pool import");
dcb6bed1 10148
03fdcb9a 10149ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
6cb8e530
PZ
10150 "Print vdev tree to zfs_dbgmsg during pool import");
10151
03fdcb9a 10152ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
dcb6bed1
D
10153 "Percentage of CPUs to run an IO worker thread");
10154
7457b024
AM
10155ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
10156 "Number of threads per IO worker taskqueue");
10157
7ada752a 10158/* BEGIN CSTYLED */
ab8d9c17 10159ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
03fdcb9a
MM
10160 "Allow importing pool with up to this number of missing top-level "
10161 "vdevs (in read-only mode)");
7ada752a 10162/* END CSTYLED */
6cb8e530 10163
7ada752a
AZ
10164ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
10165 ZMOD_RW, "Set the livelist condense zthr to pause");
03fdcb9a 10166
7ada752a
AZ
10167ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
10168 ZMOD_RW, "Set the livelist condense synctask to pause");
37f03da8 10169
7ada752a
AZ
10170/* BEGIN CSTYLED */
10171ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
10172 INT, ZMOD_RW,
37f03da8 10173 "Whether livelist condensing was canceled in the synctask");
03fdcb9a 10174
7ada752a
AZ
10175ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
10176 INT, ZMOD_RW,
37f03da8
SH
10177 "Whether livelist condensing was canceled in the zthr function");
10178
7ada752a
AZ
10179ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
10180 ZMOD_RW,
03fdcb9a
MM
10181 "Whether extra ALLOC blkptrs were added to a livelist entry while it "
10182 "was being condensed");
37f03da8 10183/* END CSTYLED */