/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
#include <sys/zvol.h>

#ifdef _KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"iss", "iss_h", "int", "int_h"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(16),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(4, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size;
	uint64_t alloc;
	uint64_t space;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	int c;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		space = 0;
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			space += tvd->vdev_max_asize - tvd->vdev_asize;
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
		    src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		dsl_dir_t *freedir = pool->dp_free_dir;

		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools before this version, freedir will be NULL.
		 */
		if (freedir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    freedir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_PUSHPAGE);
	if (err)
		return err;

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		goto out;
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if ((err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds))) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_PUSHPAGE);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_PUSHPAGE);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch ((int)prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if ((error = dmu_objset_hold(strval,FTAG,&os)))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				if (!isprint(*check)) {
					error = SET_ERROR(EINVAL);
					break;
				}
				check++;
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = SET_ERROR(E2BIG);
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;

		default:
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_PUSHPAGE);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver, 6);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;
	ASSERTV(uint64_t *newguid = arg);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool. This is done so that we can later
 * re-import a pool built from a clone of our own vdevs. We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty. Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool. We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

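/*
 * Create the taskq(s) for the given ZFS I/O type and taskq type, sized and
 * configured according to the zio_taskqs table above.
 */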
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t i, flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive. Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
		}

		tqs->stqs_taskq[i] = tq;
	}
}

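/*
 * Tear down the taskq(s) previously created by spa_taskqs_init() and free
 * the per-type taskq array.
 */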
static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	uint_t i;

	if (tqs->stqs_taskq == NULL) {
		ASSERT3U(tqs->stqs_count, ==, 0);
		return;
	}

	for (i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

/*
 * Same as spa_taskq_dispatch_ent() but block on the task until completion.
 */
void
spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;
	taskqid_t id;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
	}

	id = taskq_dispatch(tq, func, arg, flags);
	if (id)
		taskq_wait_id(tq, id);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	int t, q;

	for (t = 0; t < ZIO_TYPES; t++) {
		for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef HAVE_SPA_THREAD
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif /* HAVE_SPA_THREAD */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t, q;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);

	for (t = 0; t < ZIO_TYPES; t++) {
		for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;
	int c;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_PUSHPAGE);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_PUSHPAGE);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_PUSHPAGE);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_PUSHPAGE);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

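/*
 * Read a packed nvlist from the MOS object 'obj' and unpack it into 'value'.
 */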
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_PUSHPAGE | KM_NODEBUG);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE,
		    vd->vdev_spa, vd, NULL, 0, 0);
		spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;
	int c, i;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_PUSHPAGE);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);

		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;

	switch (spa->spa_log_state) {
	default:
		break;
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
		    NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;
	int c;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

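/*
 * Check each auxiliary (spare or l2cache) vdev for removal events.
 */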
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

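/*
 * Record the largest block birth txg seen while claiming log blocks.
 */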
void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val));
1956 name, sizeof (uint64_t), 1, val));
1957}
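
/*
 * Usage sketch, for illustration only; the local variables are placeholders.
 * Both helpers are used this way throughout spa_load_impl() below.  Note
 * that spa_dir_prop() returns ENOENT when an older pool lacks the entry,
 * while spa_prop_find() simply leaves *val untouched on a miss.
 *
 *	uint64_t bootfs = 0, deflate = 0;
 *
 *	spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &bootfs);
 *	if (spa_dir_prop(spa, DMU_POOL_DEFLATE, &deflate) != 0)
 *		deflate = 0;
 */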
1958
1959static int
1960spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1961{
1962 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1963 return (err);
1964}
1965
1966/*
1967 * Fix up config after a partly-completed split. This is done with the
1968 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
1969 * pool have that entry in their config, but only the splitting one contains
1970 * a list of all the guids of the vdevs that are being split off.
1971 *
1972 * This function determines what to do with that list: either rejoin
1973 * all the disks to the pool, or complete the splitting process. To attempt
1974 * the rejoin, each disk that is offlined is marked online again, and
1975 * we do a reopen() call. If the vdev label for every disk that was
1976 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1977 * then we call vdev_split() on each disk, and complete the split.
1978 *
1979 * Otherwise we leave the config alone, with all the vdevs in place in
1980 * the original pool.
1981 */
1982static void
1983spa_try_repair(spa_t *spa, nvlist_t *config)
1984{
1985 uint_t extracted;
1986 uint64_t *glist;
1987 uint_t i, gcount;
1988 nvlist_t *nvl;
1989 vdev_t **vd;
1990 boolean_t attempt_reopen;
1991
1992 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1993 return;
1994
1995 /* check that the config is complete */
1996 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1997 &glist, &gcount) != 0)
1998 return;
1999
b8d06fca 2000 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_PUSHPAGE);
428870ff
BB
2001
2002 /* attempt to online all the vdevs & validate */
2003 attempt_reopen = B_TRUE;
2004 for (i = 0; i < gcount; i++) {
2005 if (glist[i] == 0) /* vdev is hole */
2006 continue;
2007
2008 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2009 if (vd[i] == NULL) {
2010 /*
2011 * Don't bother attempting to reopen the disks;
2012 * just do the split.
2013 */
2014 attempt_reopen = B_FALSE;
2015 } else {
2016 /* attempt to re-online it */
2017 vd[i]->vdev_offline = B_FALSE;
2018 }
2019 }
2020
2021 if (attempt_reopen) {
2022 vdev_reopen(spa->spa_root_vdev);
2023
2024 /* check each device to see what state it's in */
2025 for (extracted = 0, i = 0; i < gcount; i++) {
2026 if (vd[i] != NULL &&
2027 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2028 break;
2029 ++extracted;
2030 }
2031 }
2032
2033 /*
2034 * If every disk has been moved to the new pool, or if we never
2035 * even attempted to look at them, then we split them off for
2036 * good.
2037 */
2038 if (!attempt_reopen || gcount == extracted) {
2039 for (i = 0; i < gcount; i++)
2040 if (vd[i] != NULL)
2041 vdev_split(vd[i]);
2042 vdev_reopen(spa->spa_root_vdev);
2043 }
2044
2045 kmem_free(vd, gcount * sizeof (vdev_t *));
2046}
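
/*
 * For reference, the nvlist shape consumed by spa_try_repair() above, as
 * implied by the lookups it performs (informal notation, not code):
 *
 *	config {
 *		ZPOOL_CONFIG_SPLIT {
 *			ZPOOL_CONFIG_SPLIT_LIST = [ guid, guid, ... ]
 *		}
 *	}
 *
 * Both pools carry the ZPOOL_CONFIG_SPLIT entry after a split, but only
 * the splitting pool carries the guid list, as noted in the block comment
 * above the function.
 */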
2047
2048static int
2049spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
2050 boolean_t mosconfig)
2051{
2052 nvlist_t *config = spa->spa_config;
2053 char *ereport = FM_EREPORT_ZFS_POOL;
d96eb2b1 2054 char *comment;
428870ff
BB
2055 int error;
2056 uint64_t pool_guid;
2057 nvlist_t *nvl;
2058
2059 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
2e528b49 2060 return (SET_ERROR(EINVAL));
428870ff 2061
d96eb2b1
DM
2062 ASSERT(spa->spa_comment == NULL);
2063 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2064 spa->spa_comment = spa_strdup(comment);
2065
428870ff
BB
2066 /*
2067 * Versioning wasn't explicitly added to the label until later, so if
2068 * it's not present, treat it as the initial version.
2069 */
2070 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2071 &spa->spa_ubsync.ub_version) != 0)
2072 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2073
2074 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2075 &spa->spa_config_txg);
2076
2077 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2078 spa_guid_exists(pool_guid, 0)) {
2e528b49 2079 error = SET_ERROR(EEXIST);
428870ff 2080 } else {
3541dc6d 2081 spa->spa_config_guid = pool_guid;
428870ff
BB
2082
2083 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2084 &nvl) == 0) {
2085 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
b8d06fca 2086 KM_PUSHPAGE) == 0);
428870ff
BB
2087 }
2088
9ae529ec
CS
2089 nvlist_free(spa->spa_load_info);
2090 spa->spa_load_info = fnvlist_alloc();
2091
572e2857 2092 gethrestime(&spa->spa_loaded_ts);
428870ff
BB
2093 error = spa_load_impl(spa, pool_guid, config, state, type,
2094 mosconfig, &ereport);
2095 }
2096
2097 spa->spa_minref = refcount_count(&spa->spa_refcount);
572e2857
BB
2098 if (error) {
2099 if (error != EEXIST) {
2100 spa->spa_loaded_ts.tv_sec = 0;
2101 spa->spa_loaded_ts.tv_nsec = 0;
2102 }
2103 if (error != EBADF) {
2104 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2105 }
2106 }
428870ff
BB
2107 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2108 spa->spa_ena = 0;
2109
2110 return (error);
2111}
2112
2113/*
2114 * Load an existing storage pool, using the pool's builtin spa_config as a
2115 * source of configuration information.
2116 */
bf701a83
BB
2117__attribute__((always_inline))
2118static inline int
428870ff
BB
2119spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2120 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2121 char **ereport)
2122{
2123 int error = 0;
2124 nvlist_t *nvroot = NULL;
9ae529ec 2125 nvlist_t *label;
428870ff
BB
2126 vdev_t *rvd;
2127 uberblock_t *ub = &spa->spa_uberblock;
572e2857 2128 uint64_t children, config_cache_txg = spa->spa_config_txg;
428870ff
BB
2129 int orig_mode = spa->spa_mode;
2130 int parse;
2131 uint64_t obj;
9ae529ec 2132 boolean_t missing_feat_write = B_FALSE;
428870ff
BB
2133
2134 /*
2135 * If this is an untrusted config, access the pool in read-only mode.
2136 * This prevents things like resilvering recently removed devices.
2137 */
2138 if (!mosconfig)
2139 spa->spa_mode = FREAD;
2140
2141 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2142
2143 spa->spa_load_state = state;
2144
2145 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2e528b49 2146 return (SET_ERROR(EINVAL));
428870ff
BB
2147
2148 parse = (type == SPA_IMPORT_EXISTING ?
2149 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2150
2151 /*
2152 * Create "The Godfather" zio to hold all async IOs
2153 */
2154 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2155 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2156
2157 /*
2158 * Parse the configuration into a vdev tree. We explicitly set the
2159 * value that will be returned by spa_version() since parsing the
2160 * configuration requires knowing the version number.
2161 */
2162 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2163 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2164 spa_config_exit(spa, SCL_ALL, FTAG);
2165
2166 if (error != 0)
2167 return (error);
2168
2169 ASSERT(spa->spa_root_vdev == rvd);
2170
2171 if (type != SPA_IMPORT_ASSEMBLE) {
2172 ASSERT(spa_guid(spa) == pool_guid);
2173 }
2174
2175 /*
2176 * Try to open all vdevs, loading each label in the process.
2177 */
2178 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2179 error = vdev_open(rvd);
2180 spa_config_exit(spa, SCL_ALL, FTAG);
2181 if (error != 0)
2182 return (error);
2183
2184 /*
2185 * We need to validate the vdev labels against the configuration that
2186 * we have in hand, which is dependent on the setting of mosconfig. If
2187 * mosconfig is true then we're validating the vdev labels based on
2188 * that config. Otherwise, we're validating against the cached config
2189 * (zpool.cache) that was read when we loaded the zfs module, and then
2190 * later we will recursively call spa_load() and validate against
2191 * the vdev config.
2192 *
2193 * If we're assembling a new pool that's been split off from an
2194 * existing pool, the labels haven't yet been updated so we skip
2195 * validation for now.
2196 */
2197 if (type != SPA_IMPORT_ASSEMBLE) {
2198 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
c7f2d69d 2199 error = vdev_validate(rvd, mosconfig);
428870ff
BB
2200 spa_config_exit(spa, SCL_ALL, FTAG);
2201
2202 if (error != 0)
2203 return (error);
2204
2205 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2e528b49 2206 return (SET_ERROR(ENXIO));
428870ff
BB
2207 }
2208
2209 /*
2210 * Find the best uberblock.
2211 */
9ae529ec 2212 vdev_uberblock_load(rvd, ub, &label);
428870ff
BB
2213
2214 /*
2215 * If we weren't able to find a single valid uberblock, return failure.
2216 */
9ae529ec
CS
2217 if (ub->ub_txg == 0) {
2218 nvlist_free(label);
428870ff 2219 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
9ae529ec 2220 }
428870ff
BB
2221
2222 /*
9ae529ec 2223 * If the pool has an unsupported version we can't open it.
428870ff 2224 */
9ae529ec
CS
2225 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2226 nvlist_free(label);
428870ff 2227 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
9ae529ec
CS
2228 }
2229
2230 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2231 nvlist_t *features;
2232
2233 /*
2234 * If we weren't able to find what's necessary for reading the
2235 * MOS in the label, return failure.
2236 */
2237 if (label == NULL || nvlist_lookup_nvlist(label,
2238 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2239 nvlist_free(label);
2240 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2241 ENXIO));
2242 }
2243
2244 /*
2245 * Update our in-core representation with the definitive values
2246 * from the label.
2247 */
2248 nvlist_free(spa->spa_label_features);
2249 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2250 }
2251
2252 nvlist_free(label);
2253
2254 /*
2255 * Look through entries in the label nvlist's features_for_read. If
2256 * there is a feature listed there which we don't understand, then we
2257 * cannot open the pool.
2258 */
2259 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2260 nvlist_t *unsup_feat;
2261 nvpair_t *nvp;
2262
2263 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2264 0);
2265
2266 for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL);
2267 nvp != NULL;
2268 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2269 if (!zfeature_is_supported(nvpair_name(nvp))) {
2270 VERIFY(nvlist_add_string(unsup_feat,
2271 nvpair_name(nvp), "") == 0);
2272 }
2273 }
2274
2275 if (!nvlist_empty(unsup_feat)) {
2276 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2277 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2278 nvlist_free(unsup_feat);
2279 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2280 ENOTSUP));
2281 }
2282
2283 nvlist_free(unsup_feat);
2284 }
428870ff
BB
2285
2286 /*
2287 * If the vdev guid sum doesn't match the uberblock, we have an
572e2857
BB
2288 * incomplete configuration. We first check to see if the pool
2289 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2290 * If it is, defer the vdev_guid_sum check till later so we
2291 * can handle missing vdevs.
428870ff 2292 */
572e2857
BB
2293 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2294 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
428870ff
BB
2295 rvd->vdev_guid_sum != ub->ub_guid_sum)
2296 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2297
2298 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2299 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2300 spa_try_repair(spa, config);
2301 spa_config_exit(spa, SCL_ALL, FTAG);
2302 nvlist_free(spa->spa_config_splitting);
2303 spa->spa_config_splitting = NULL;
2304 }
2305
2306 /*
2307 * Initialize internal SPA structures.
2308 */
2309 spa->spa_state = POOL_STATE_ACTIVE;
2310 spa->spa_ubsync = spa->spa_uberblock;
2311 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2312 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2313 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2314 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2315 spa->spa_claim_max_txg = spa->spa_first_txg;
2316 spa->spa_prev_software_version = ub->ub_software_version;
2317
9ae529ec 2318 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
428870ff
BB
2319 if (error)
2320 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2321 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2322
2323 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2324 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2325
9ae529ec
CS
2326 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2327 boolean_t missing_feat_read = B_FALSE;
b9b24bb4 2328 nvlist_t *unsup_feat, *enabled_feat;
9ae529ec
CS
2329
2330 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2331 &spa->spa_feat_for_read_obj) != 0) {
2332 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2333 }
2334
2335 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2336 &spa->spa_feat_for_write_obj) != 0) {
2337 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2338 }
2339
2340 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2341 &spa->spa_feat_desc_obj) != 0) {
2342 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2343 }
2344
b9b24bb4
CS
2345 enabled_feat = fnvlist_alloc();
2346 unsup_feat = fnvlist_alloc();
9ae529ec
CS
2347
2348 if (!feature_is_supported(spa->spa_meta_objset,
2349 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
b9b24bb4 2350 unsup_feat, enabled_feat))
9ae529ec
CS
2351 missing_feat_read = B_TRUE;
2352
2353 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2354 if (!feature_is_supported(spa->spa_meta_objset,
2355 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
b9b24bb4 2356 unsup_feat, enabled_feat)) {
9ae529ec 2357 missing_feat_write = B_TRUE;
b9b24bb4 2358 }
9ae529ec
CS
2359 }
2360
b9b24bb4
CS
2361 fnvlist_add_nvlist(spa->spa_load_info,
2362 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2363
9ae529ec 2364 if (!nvlist_empty(unsup_feat)) {
b9b24bb4
CS
2365 fnvlist_add_nvlist(spa->spa_load_info,
2366 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
9ae529ec
CS
2367 }
2368
b9b24bb4
CS
2369 fnvlist_free(enabled_feat);
2370 fnvlist_free(unsup_feat);
9ae529ec
CS
2371
2372 if (!missing_feat_read) {
2373 fnvlist_add_boolean(spa->spa_load_info,
2374 ZPOOL_CONFIG_CAN_RDONLY);
2375 }
2376
2377 /*
2378 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2379 * twofold: to determine whether the pool is available for
2380 * import in read-write mode and (if it is not) whether the
2381 * pool is available for import in read-only mode. If the pool
2382 * is available for import in read-write mode, it is displayed
2383 * as available in userland; if it is not available for import
2384 * in read-only mode, it is displayed as unavailable in
2385 * userland. If the pool is available for import in read-only
2386 * mode but not read-write mode, it is displayed as unavailable
2387 * in userland with a special note that the pool is actually
2388 * available for open in read-only mode.
2389 *
2390 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2391 * missing a feature for write, we must first determine whether
2392 * the pool can be opened read-only before returning to
2393 * userland in order to know whether to display the
2394 * abovementioned note.
2395 */
2396 if (missing_feat_read || (missing_feat_write &&
2397 spa_writeable(spa))) {
2398 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2399 ENOTSUP));
2400 }
2401 }
2402
2403 spa->spa_is_initializing = B_TRUE;
2404 error = dsl_pool_open(spa->spa_dsl_pool);
2405 spa->spa_is_initializing = B_FALSE;
2406 if (error != 0)
2407 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2408
428870ff
BB
2409 if (!mosconfig) {
2410 uint64_t hostid;
2411 nvlist_t *policy = NULL, *nvconfig;
2412
2413 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2414 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2415
2416 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
b128c09f 2417 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
34dc7c2f
BB
2418 char *hostname;
2419 unsigned long myhostid = 0;
2420
428870ff 2421 VERIFY(nvlist_lookup_string(nvconfig,
34dc7c2f
BB
2422 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2423
d164b209
BB
2424#ifdef _KERNEL
2425 myhostid = zone_get_hostid(NULL);
2426#else /* _KERNEL */
2427 /*
2428 * We're emulating the system's hostid in userland, so
2429 * we can't use zone_get_hostid().
2430 */
34dc7c2f 2431 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
d164b209 2432#endif /* _KERNEL */
34dc7c2f 2433 if (hostid != 0 && myhostid != 0 &&
d164b209 2434 hostid != myhostid) {
428870ff 2435 nvlist_free(nvconfig);
34dc7c2f
BB
2436 cmn_err(CE_WARN, "pool '%s' could not be "
2437 "loaded as it was last accessed by "
b128c09f 2438 "another system (host: %s hostid: 0x%lx). "
3cee2262 2439 "See: http://zfsonlinux.org/msg/ZFS-8000-EY",
b128c09f 2440 spa_name(spa), hostname,
34dc7c2f 2441 (unsigned long)hostid);
2e528b49 2442 return (SET_ERROR(EBADF));
34dc7c2f
BB
2443 }
2444 }
428870ff
BB
2445 if (nvlist_lookup_nvlist(spa->spa_config,
2446 ZPOOL_REWIND_POLICY, &policy) == 0)
2447 VERIFY(nvlist_add_nvlist(nvconfig,
2448 ZPOOL_REWIND_POLICY, policy) == 0);
34dc7c2f 2449
428870ff 2450 spa_config_set(spa, nvconfig);
34dc7c2f
BB
2451 spa_unload(spa);
2452 spa_deactivate(spa);
fb5f0bc8 2453 spa_activate(spa, orig_mode);
34dc7c2f 2454
428870ff 2455 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
34dc7c2f
BB
2456 }
2457
428870ff
BB
2458 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2459 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2460 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2461 if (error != 0)
2462 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f
BB
2463
2464 /*
2465 * Load the bit that tells us to use the new accounting function
2466 * (raid-z deflation). If we have an older pool, this will not
2467 * be present.
2468 */
428870ff
BB
2469 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2470 if (error != 0 && error != ENOENT)
2471 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2472
2473 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2474 &spa->spa_creation_version);
2475 if (error != 0 && error != ENOENT)
2476 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f
BB
2477
2478 /*
2479 * Load the persistent error log. If we have an older pool, this will
2480 * not be present.
2481 */
428870ff
BB
2482 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2483 if (error != 0 && error != ENOENT)
2484 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f 2485
428870ff
BB
2486 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2487 &spa->spa_errlog_scrub);
2488 if (error != 0 && error != ENOENT)
2489 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f
BB
2490
2491 /*
2492 * Load the history object. If we have an older pool, this
2493 * will not be present.
2494 */
428870ff
BB
2495 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2496 if (error != 0 && error != ENOENT)
2497 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2498
2499 /*
2500 * If we're assembling the pool from the split-off vdevs of
2501 * an existing pool, we don't want to attach the spares & cache
2502 * devices.
2503 */
34dc7c2f
BB
2504
2505 /*
2506 * Load any hot spares for this pool.
2507 */
428870ff
BB
2508 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2509 if (error != 0 && error != ENOENT)
2510 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2511 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
34dc7c2f
BB
2512 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2513 if (load_nvlist(spa, spa->spa_spares.sav_object,
428870ff
BB
2514 &spa->spa_spares.sav_config) != 0)
2515 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f 2516
b128c09f 2517 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 2518 spa_load_spares(spa);
b128c09f 2519 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff
BB
2520 } else if (error == 0) {
2521 spa->spa_spares.sav_sync = B_TRUE;
34dc7c2f
BB
2522 }
2523
2524 /*
2525 * Load any level 2 ARC devices for this pool.
2526 */
428870ff 2527 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
34dc7c2f 2528 &spa->spa_l2cache.sav_object);
428870ff
BB
2529 if (error != 0 && error != ENOENT)
2530 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2531 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
34dc7c2f
BB
2532 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2533 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
428870ff
BB
2534 &spa->spa_l2cache.sav_config) != 0)
2535 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f 2536
b128c09f 2537 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 2538 spa_load_l2cache(spa);
b128c09f 2539 spa_config_exit(spa, SCL_ALL, FTAG);
428870ff
BB
2540 } else if (error == 0) {
2541 spa->spa_l2cache.sav_sync = B_TRUE;
b128c09f
BB
2542 }
2543
34dc7c2f
BB
2544 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2545
428870ff
BB
2546 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2547 if (error && error != ENOENT)
2548 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
34dc7c2f
BB
2549
2550 if (error == 0) {
428870ff
BB
2551 uint64_t autoreplace;
2552
2553 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2554 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2555 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2556 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2557 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2558 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2559 &spa->spa_dedup_ditto);
2560
2561 spa->spa_autoreplace = (autoreplace != 0);
34dc7c2f
BB
2562 }
2563
2564 /*
2565 * If the 'autoreplace' property is set, then post a resource notifying
2566 * the ZFS DE that it should not issue any faults for unopenable
2567 * devices. We also iterate over the vdevs, and post a sysevent for any
2568 * unopenable vdevs so that the normal autoreplace handler can take
2569 * over.
2570 */
428870ff 2571 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
34dc7c2f 2572 spa_check_removed(spa->spa_root_vdev);
428870ff
BB
2573 /*
2574 * For the import case, this is done in spa_import(), because
2575 * at this point we're using the spare definitions from
2576 * the MOS config, not necessarily from the userland config.
2577 */
2578 if (state != SPA_LOAD_IMPORT) {
2579 spa_aux_check_removed(&spa->spa_spares);
2580 spa_aux_check_removed(&spa->spa_l2cache);
2581 }
2582 }
34dc7c2f
BB
2583
2584 /*
2585 * Load the vdev state for all toplevel vdevs.
2586 */
2587 vdev_load(rvd);
2588
2589 /*
2590 * Propagate the leaf DTLs we just loaded all the way up the tree.
2591 */
b128c09f 2592 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 2593 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
b128c09f 2594 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f 2595
428870ff
BB
2596 /*
2597 * Load the DDTs (dedup tables).
2598 */
2599 error = ddt_load(spa);
2600 if (error != 0)
2601 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2602
2603 spa_update_dspace(spa);
2604
428870ff 2605 /*
572e2857
BB
2606 * Validate the config, using the MOS config to fill in any
2607 * information which might be missing. If we fail to validate
2608 * the config then declare the pool unfit for use. If we're
2609 * assembling a pool from a split, the log is not transferred
2610 * over.
428870ff
BB
2611 */
2612 if (type != SPA_IMPORT_ASSEMBLE) {
2613 nvlist_t *nvconfig;
2614
2615 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2616 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2617
572e2857
BB
2618 if (!spa_config_valid(spa, nvconfig)) {
2619 nvlist_free(nvconfig);
2620 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2621 ENXIO));
2622 }
428870ff
BB
2623 nvlist_free(nvconfig);
2624
572e2857 2625 /*
9ae529ec 2626 * Now that we've validated the config, check the state of the
572e2857
BB
2627 * root vdev. If it can't be opened, it indicates one or
2628 * more toplevel vdevs are faulted.
2629 */
2630 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2e528b49 2631 return (SET_ERROR(ENXIO));
572e2857 2632
428870ff
BB
2633 if (spa_check_logs(spa)) {
2634 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2635 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2636 }
2637 }
2638
9ae529ec
CS
2639 if (missing_feat_write) {
2640 ASSERT(state == SPA_LOAD_TRYIMPORT);
2641
2642 /*
2643 * At this point, we know that we can open the pool in
2644 * read-only mode but not read-write mode. We now have enough
2645 * information and can return to userland.
2646 */
2647 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2648 }
2649
572e2857
BB
2650 /*
2651 * We've successfully opened the pool; verify that we're ready
2652 * to start pushing transactions.
2653 */
2654 if (state != SPA_LOAD_TRYIMPORT) {
c65aa5b2 2655 if ((error = spa_load_verify(spa)))
572e2857
BB
2656 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2657 error));
2658 }
2659
428870ff
BB
2660 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2661 spa->spa_load_max_txg == UINT64_MAX)) {
34dc7c2f
BB
2662 dmu_tx_t *tx;
2663 int need_update = B_FALSE;
d6320ddb 2664 int c;
fb5f0bc8
BB
2665
2666 ASSERT(state != SPA_LOAD_TRYIMPORT);
34dc7c2f
BB
2667
2668 /*
2669 * Claim log blocks that haven't been committed yet.
2670 * This must all happen in a single txg.
428870ff
BB
2671 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2672 * invoked from zil_claim_log_block()'s i/o done callback.
2673 * Price of rollback is that we abandon the log.
34dc7c2f 2674 */
428870ff
BB
2675 spa->spa_claiming = B_TRUE;
2676
34dc7c2f
BB
2677 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2678 spa_first_txg(spa));
b128c09f 2679 (void) dmu_objset_find(spa_name(spa),
34dc7c2f
BB
2680 zil_claim, tx, DS_FIND_CHILDREN);
2681 dmu_tx_commit(tx);
2682
428870ff
BB
2683 spa->spa_claiming = B_FALSE;
2684
2685 spa_set_log_state(spa, SPA_LOG_GOOD);
34dc7c2f
BB
2686 spa->spa_sync_on = B_TRUE;
2687 txg_sync_start(spa->spa_dsl_pool);
2688
2689 /*
428870ff
BB
2690 * Wait for all claims to sync. We sync up to the highest
2691 * claimed log block birth time so that claimed log blocks
2692 * don't appear to be from the future. spa_claim_max_txg
2693 * will have been set for us by either zil_check_log_chain()
2694 * (invoked from spa_check_logs()) or zil_claim() above.
34dc7c2f 2695 */
428870ff 2696 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
34dc7c2f
BB
2697
2698 /*
2699 * If the config cache is stale, or we have uninitialized
2700 * metaslabs (see spa_vdev_add()), then update the config.
45d1cae3 2701 *
572e2857 2702 * If this is a verbatim import, trust the current
45d1cae3 2703 * in-core spa_config and update the disk labels.
34dc7c2f
BB
2704 */
2705 if (config_cache_txg != spa->spa_config_txg ||
572e2857
BB
2706 state == SPA_LOAD_IMPORT ||
2707 state == SPA_LOAD_RECOVER ||
2708 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
34dc7c2f
BB
2709 need_update = B_TRUE;
2710
d6320ddb 2711 for (c = 0; c < rvd->vdev_children; c++)
34dc7c2f
BB
2712 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2713 need_update = B_TRUE;
2714
2715 /*
2716 * Update the config cache asynchronously in case we're the
2717 * root pool, in which case the config cache isn't writable yet.
2718 */
2719 if (need_update)
2720 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
fb5f0bc8
BB
2721
2722 /*
2723 * Check all DTLs to see if anything needs resilvering.
2724 */
428870ff
BB
2725 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2726 vdev_resilver_needed(rvd, NULL, NULL))
fb5f0bc8 2727 spa_async_request(spa, SPA_ASYNC_RESILVER);
428870ff 2728
6f1ffb06
MA
2729 /*
2730 * Log the fact that we booted up (so that we can detect if
2731 * we rebooted in the middle of an operation).
2732 */
2733 spa_history_log_version(spa, "open");
2734
428870ff
BB
2735 /*
2736 * Delete any inconsistent datasets.
2737 */
2738 (void) dmu_objset_find(spa_name(spa),
2739 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2740
2741 /*
2742 * Clean up any stale temporary dataset userrefs.
2743 */
2744 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
34dc7c2f
BB
2745 }
2746
428870ff
BB
2747 return (0);
2748}
34dc7c2f 2749
428870ff
BB
2750static int
2751spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2752{
572e2857
BB
2753 int mode = spa->spa_mode;
2754
428870ff
BB
2755 spa_unload(spa);
2756 spa_deactivate(spa);
2757
2758 spa->spa_load_max_txg--;
2759
572e2857 2760 spa_activate(spa, mode);
428870ff
BB
2761 spa_async_suspend(spa);
2762
2763 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2764}
2765
9ae529ec
CS
2766/*
2767 * If spa_load() fails, this function will try loading prior txgs. If
2768 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds, the pool
2769 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER, this
2770 * function will not rewind the pool and will return the same error as
2771 * spa_load().
2772 */
428870ff
BB
2773static int
2774spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2775 uint64_t max_request, int rewind_flags)
2776{
9ae529ec 2777 nvlist_t *loadinfo = NULL;
428870ff
BB
2778 nvlist_t *config = NULL;
2779 int load_error, rewind_error;
2780 uint64_t safe_rewind_txg;
2781 uint64_t min_txg;
2782
2783 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2784 spa->spa_load_max_txg = spa->spa_load_txg;
2785 spa_set_log_state(spa, SPA_LOG_CLEAR);
2786 } else {
2787 spa->spa_load_max_txg = max_request;
2788 }
2789
2790 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2791 mosconfig);
2792 if (load_error == 0)
2793 return (0);
2794
2795 if (spa->spa_root_vdev != NULL)
2796 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2797
2798 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2799 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2800
2801 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2802 nvlist_free(config);
2803 return (load_error);
2804 }
2805
9ae529ec
CS
2806 if (state == SPA_LOAD_RECOVER) {
2807 /* Price of rolling back is discarding txgs, including log */
428870ff 2808 spa_set_log_state(spa, SPA_LOG_CLEAR);
9ae529ec
CS
2809 } else {
2810 /*
2811 * If we aren't rolling back, save the load info from our first
2812 * import attempt so that we can restore it after attempting
2813 * to rewind.
2814 */
2815 loadinfo = spa->spa_load_info;
2816 spa->spa_load_info = fnvlist_alloc();
2817 }
428870ff
BB
2818
2819 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2820 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2821 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2822 TXG_INITIAL : safe_rewind_txg;
2823
2824 /*
2825 * Continue as long as we're finding errors, we're still within
2826 * the acceptable rewind range, and we're still finding uberblocks
2827 */
2828 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2829 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2830 if (spa->spa_load_max_txg < safe_rewind_txg)
2831 spa->spa_extreme_rewind = B_TRUE;
2832 rewind_error = spa_load_retry(spa, state, mosconfig);
2833 }
2834
428870ff
BB
2835 spa->spa_extreme_rewind = B_FALSE;
2836 spa->spa_load_max_txg = UINT64_MAX;
2837
2838 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2839 spa_config_set(spa, config);
2840
9ae529ec
CS
2841 if (state == SPA_LOAD_RECOVER) {
2842 ASSERT3P(loadinfo, ==, NULL);
2843 return (rewind_error);
2844 } else {
2845 /* Store the rewind info as part of the initial load info */
2846 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2847 spa->spa_load_info);
2848
2849 /* Restore the initial load info */
2850 fnvlist_free(spa->spa_load_info);
2851 spa->spa_load_info = loadinfo;
2852
2853 return (load_error);
2854 }
34dc7c2f
BB
2855}
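
/*
 * A small worked example of the rewind window above, assuming (purely for
 * illustration) spa_last_ubsync_txg == 100 and TXG_DEFER_SIZE == 2:
 *
 *	spa_load_max_txg = 100
 *	safe_rewind_txg  = 100 - 2 = 98
 *	min_txg          = (ZPOOL_EXTREME_REWIND set) ? TXG_INITIAL : 98
 *
 * Without the extreme-rewind flag, spa_load_retry() is attempted only while
 * the selected uberblock's txg stays within [min_txg, spa_load_max_txg],
 * with spa_load_max_txg starting at 100 and decreasing by one per retry.
 */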
2856
2857/*
2858 * Pool Open/Import
2859 *
2860 * The import case is identical to an open except that the configuration is
2861 * sent down from userland, instead of being grabbed from the configuration
2862 * cache. For the case of an open, the pool configuration will exist in the
2863 * POOL_STATE_UNINITIALIZED state.
2864 *
2865 * The stats information (gen/count/ustats) is used to gather vdev statistics
2866 * while opening the pool, without having to keep the spa_t around in some
2867 * ambiguous state.
2868 */
2869static int
428870ff
BB
2870spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2871 nvlist_t **config)
34dc7c2f
BB
2872{
2873 spa_t *spa;
572e2857 2874 spa_load_state_t state = SPA_LOAD_OPEN;
34dc7c2f 2875 int error;
34dc7c2f 2876 int locked = B_FALSE;
526af785 2877 int firstopen = B_FALSE;
34dc7c2f
BB
2878
2879 *spapp = NULL;
2880
2881 /*
2882 * As disgusting as this is, we need to support recursive calls to this
2883 * function because dsl_dir_open() is called during spa_load(), and ends
2884 * up calling spa_open() again. The real fix is to figure out how to
2885 * avoid dsl_dir_open() calling this in the first place.
2886 */
2887 if (mutex_owner(&spa_namespace_lock) != curthread) {
2888 mutex_enter(&spa_namespace_lock);
2889 locked = B_TRUE;
2890 }
2891
2892 if ((spa = spa_lookup(pool)) == NULL) {
2893 if (locked)
2894 mutex_exit(&spa_namespace_lock);
2e528b49 2895 return (SET_ERROR(ENOENT));
34dc7c2f 2896 }
428870ff 2897
34dc7c2f 2898 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
428870ff
BB
2899 zpool_rewind_policy_t policy;
2900
526af785
PJD
2901 firstopen = B_TRUE;
2902
428870ff
BB
2903 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2904 &policy);
2905 if (policy.zrp_request & ZPOOL_DO_REWIND)
2906 state = SPA_LOAD_RECOVER;
34dc7c2f 2907
fb5f0bc8 2908 spa_activate(spa, spa_mode_global);
34dc7c2f 2909
428870ff
BB
2910 if (state != SPA_LOAD_RECOVER)
2911 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2912
2913 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2914 policy.zrp_request);
34dc7c2f
BB
2915
2916 if (error == EBADF) {
2917 /*
2918 * If vdev_validate() returns failure (indicated by
2919 * EBADF), it means that one of the vdevs indicates
2920 * that the pool has been exported or destroyed. If
2921 * this is the case, the config cache is out of sync and
2922 * we should remove the pool from the namespace.
2923 */
34dc7c2f
BB
2924 spa_unload(spa);
2925 spa_deactivate(spa);
b128c09f 2926 spa_config_sync(spa, B_TRUE, B_TRUE);
34dc7c2f 2927 spa_remove(spa);
34dc7c2f
BB
2928 if (locked)
2929 mutex_exit(&spa_namespace_lock);
2e528b49 2930 return (SET_ERROR(ENOENT));
34dc7c2f
BB
2931 }
2932
2933 if (error) {
2934 /*
2935 * We can't open the pool, but we still have useful
2936 * information: the state of each vdev after the
2937 * attempted vdev_open(). Return this to the user.
2938 */
572e2857 2939 if (config != NULL && spa->spa_config) {
428870ff 2940 VERIFY(nvlist_dup(spa->spa_config, config,
b8d06fca 2941 KM_PUSHPAGE) == 0);
572e2857
BB
2942 VERIFY(nvlist_add_nvlist(*config,
2943 ZPOOL_CONFIG_LOAD_INFO,
2944 spa->spa_load_info) == 0);
2945 }
34dc7c2f
BB
2946 spa_unload(spa);
2947 spa_deactivate(spa);
428870ff 2948 spa->spa_last_open_failed = error;
34dc7c2f
BB
2949 if (locked)
2950 mutex_exit(&spa_namespace_lock);
2951 *spapp = NULL;
2952 return (error);
34dc7c2f 2953 }
34dc7c2f
BB
2954 }
2955
2956 spa_open_ref(spa, tag);
2957
b128c09f 2958 if (config != NULL)
34dc7c2f 2959 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
34dc7c2f 2960
572e2857
BB
2961 /*
2962 * If we've recovered the pool, pass back any information we
2963 * gathered while doing the load.
2964 */
2965 if (state == SPA_LOAD_RECOVER) {
2966 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2967 spa->spa_load_info) == 0);
2968 }
2969
428870ff
BB
2970 if (locked) {
2971 spa->spa_last_open_failed = 0;
2972 spa->spa_last_ubsync_txg = 0;
2973 spa->spa_load_txg = 0;
2974 mutex_exit(&spa_namespace_lock);
2975 }
2976
526af785
PJD
2977#ifdef _KERNEL
2978 if (firstopen)
2979 zvol_create_minors(spa->spa_name);
2980#endif
2981
428870ff
BB
2982 *spapp = spa;
2983
34dc7c2f
BB
2984 return (0);
2985}
2986
428870ff
BB
2987int
2988spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2989 nvlist_t **config)
2990{
2991 return (spa_open_common(name, spapp, tag, policy, config));
2992}
2993
34dc7c2f
BB
2994int
2995spa_open(const char *name, spa_t **spapp, void *tag)
2996{
428870ff 2997 return (spa_open_common(name, spapp, tag, NULL, NULL));
34dc7c2f
BB
2998}
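
/*
 * Minimal usage sketch for spa_open()/spa_close(); the pool name is a
 * placeholder and error handling is abbreviated.
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		(use the pool while holding the reference)
 *		spa_close(spa, FTAG);
 *	}
 */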
2999
3000/*
3001 * Lookup the given spa_t, incrementing the inject count in the process,
3002 * preventing it from being exported or destroyed.
3003 */
3004spa_t *
3005spa_inject_addref(char *name)
3006{
3007 spa_t *spa;
3008
3009 mutex_enter(&spa_namespace_lock);
3010 if ((spa = spa_lookup(name)) == NULL) {
3011 mutex_exit(&spa_namespace_lock);
3012 return (NULL);
3013 }
3014 spa->spa_inject_ref++;
3015 mutex_exit(&spa_namespace_lock);
3016
3017 return (spa);
3018}
3019
3020void
3021spa_inject_delref(spa_t *spa)
3022{
3023 mutex_enter(&spa_namespace_lock);
3024 spa->spa_inject_ref--;
3025 mutex_exit(&spa_namespace_lock);
3026}
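
/*
 * spa_inject_addref() and spa_inject_delref() are intended to be used as a
 * bracketing pair (e.g. by the fault-injection ioctls); sketch only:
 *
 *	spa_t *spa = spa_inject_addref(name);
 *
 *	if (spa != NULL) {
 *		(the pool cannot be exported or destroyed here)
 *		spa_inject_delref(spa);
 *	}
 */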
3027
3028/*
3029 * Add spares device information to the nvlist.
3030 */
3031static void
3032spa_add_spares(spa_t *spa, nvlist_t *config)
3033{
3034 nvlist_t **spares;
3035 uint_t i, nspares;
3036 nvlist_t *nvroot;
3037 uint64_t guid;
3038 vdev_stat_t *vs;
3039 uint_t vsc;
3040 uint64_t pool;
3041
9babb374
BB
3042 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3043
34dc7c2f
BB
3044 if (spa->spa_spares.sav_count == 0)
3045 return;
3046
3047 VERIFY(nvlist_lookup_nvlist(config,
3048 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3049 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3050 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3051 if (nspares != 0) {
3052 VERIFY(nvlist_add_nvlist_array(nvroot,
3053 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3054 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3055 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3056
3057 /*
3058 * Go through and find any spares which have since been
3059 * repurposed as active spares. If this is the case, update
3060 * their status appropriately.
3061 */
3062 for (i = 0; i < nspares; i++) {
3063 VERIFY(nvlist_lookup_uint64(spares[i],
3064 ZPOOL_CONFIG_GUID, &guid) == 0);
b128c09f
BB
3065 if (spa_spare_exists(guid, &pool, NULL) &&
3066 pool != 0ULL) {
34dc7c2f 3067 VERIFY(nvlist_lookup_uint64_array(
428870ff 3068 spares[i], ZPOOL_CONFIG_VDEV_STATS,
34dc7c2f
BB
3069 (uint64_t **)&vs, &vsc) == 0);
3070 vs->vs_state = VDEV_STATE_CANT_OPEN;
3071 vs->vs_aux = VDEV_AUX_SPARED;
3072 }
3073 }
3074 }
3075}
3076
3077/*
3078 * Add l2cache device information to the nvlist, including vdev stats.
3079 */
3080static void
3081spa_add_l2cache(spa_t *spa, nvlist_t *config)
3082{
3083 nvlist_t **l2cache;
3084 uint_t i, j, nl2cache;
3085 nvlist_t *nvroot;
3086 uint64_t guid;
3087 vdev_t *vd;
3088 vdev_stat_t *vs;
3089 uint_t vsc;
3090
9babb374
BB
3091 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3092
34dc7c2f
BB
3093 if (spa->spa_l2cache.sav_count == 0)
3094 return;
3095
34dc7c2f
BB
3096 VERIFY(nvlist_lookup_nvlist(config,
3097 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3098 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3099 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3100 if (nl2cache != 0) {
3101 VERIFY(nvlist_add_nvlist_array(nvroot,
3102 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3103 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3104 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3105
3106 /*
3107 * Update level 2 cache device stats.
3108 */
3109
3110 for (i = 0; i < nl2cache; i++) {
3111 VERIFY(nvlist_lookup_uint64(l2cache[i],
3112 ZPOOL_CONFIG_GUID, &guid) == 0);
3113
3114 vd = NULL;
3115 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3116 if (guid ==
3117 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3118 vd = spa->spa_l2cache.sav_vdevs[j];
3119 break;
3120 }
3121 }
3122 ASSERT(vd != NULL);
3123
3124 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
428870ff
BB
3125 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3126 == 0);
34dc7c2f
BB
3127 vdev_get_stats(vd, vs);
3128 }
3129 }
34dc7c2f
BB
3130}
3131
9ae529ec
CS
3132static void
3133spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3134{
3135 nvlist_t *features;
3136 zap_cursor_t zc;
3137 zap_attribute_t za;
3138
3139 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3140 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3141
3142 if (spa->spa_feat_for_read_obj != 0) {
3143 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3144 spa->spa_feat_for_read_obj);
3145 zap_cursor_retrieve(&zc, &za) == 0;
3146 zap_cursor_advance(&zc)) {
3147 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3148 za.za_num_integers == 1);
3149 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3150 za.za_first_integer));
3151 }
3152 zap_cursor_fini(&zc);
3153 }
3154
3155 if (spa->spa_feat_for_write_obj != 0) {
3156 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3157 spa->spa_feat_for_write_obj);
3158 zap_cursor_retrieve(&zc, &za) == 0;
3159 zap_cursor_advance(&zc)) {
3160 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3161 za.za_num_integers == 1);
3162 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3163 za.za_first_integer));
3164 }
3165 zap_cursor_fini(&zc);
3166 }
3167
3168 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3169 features) == 0);
3170 nvlist_free(features);
3171}
3172
34dc7c2f 3173int
9ae529ec
CS
3174spa_get_stats(const char *name, nvlist_t **config,
3175 char *altroot, size_t buflen)
34dc7c2f
BB
3176{
3177 int error;
3178 spa_t *spa;
3179
3180 *config = NULL;
428870ff 3181 error = spa_open_common(name, &spa, FTAG, NULL, config);
34dc7c2f 3182
9babb374
BB
3183 if (spa != NULL) {
3184 /*
3185 * This still leaves a window of inconsistency where the spares
3186 * or l2cache devices could change and the config would be
3187 * self-inconsistent.
3188 */
3189 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
34dc7c2f 3190
9babb374 3191 if (*config != NULL) {
572e2857
BB
3192 uint64_t loadtimes[2];
3193
3194 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3195 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3196 VERIFY(nvlist_add_uint64_array(*config,
3197 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3198
b128c09f 3199 VERIFY(nvlist_add_uint64(*config,
9babb374
BB
3200 ZPOOL_CONFIG_ERRCOUNT,
3201 spa_get_errlog_size(spa)) == 0);
3202
3203 if (spa_suspended(spa))
3204 VERIFY(nvlist_add_uint64(*config,
3205 ZPOOL_CONFIG_SUSPENDED,
3206 spa->spa_failmode) == 0);
b128c09f 3207
9babb374
BB
3208 spa_add_spares(spa, *config);
3209 spa_add_l2cache(spa, *config);
9ae529ec 3210 spa_add_feature_stats(spa, *config);
9babb374 3211 }
34dc7c2f
BB
3212 }
3213
3214 /*
3215 * We want to get the alternate root even for faulted pools, so we cheat
3216 * and call spa_lookup() directly.
3217 */
3218 if (altroot) {
3219 if (spa == NULL) {
3220 mutex_enter(&spa_namespace_lock);
3221 spa = spa_lookup(name);
3222 if (spa)
3223 spa_altroot(spa, altroot, buflen);
3224 else
3225 altroot[0] = '\0';
3226 spa = NULL;
3227 mutex_exit(&spa_namespace_lock);
3228 } else {
3229 spa_altroot(spa, altroot, buflen);
3230 }
3231 }
3232
9babb374
BB
3233 if (spa != NULL) {
3234 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f 3235 spa_close(spa, FTAG);
9babb374 3236 }
34dc7c2f
BB
3237
3238 return (error);
3239}
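
/*
 * Usage sketch for spa_get_stats(); "tank" and MAXPATHLEN are placeholders.
 * Note that *config can be returned even when the open itself fails, so the
 * caller is responsible for freeing it in either case.
 *
 *	nvlist_t *config;
 *	char altroot[MAXPATHLEN];
 *
 *	(void) spa_get_stats("tank", &config, altroot, sizeof (altroot));
 *	if (config != NULL)
 *		nvlist_free(config);
 */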
3240
3241/*
3242 * Validate that the auxiliary device array is well formed. We must have an
3243 * array of nvlists, each of which describes a valid leaf vdev. If this is an
3244 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3245 * specified, as long as they are well-formed.
3246 */
3247static int
3248spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3249 spa_aux_vdev_t *sav, const char *config, uint64_t version,
3250 vdev_labeltype_t label)
3251{
3252 nvlist_t **dev;
3253 uint_t i, ndev;
3254 vdev_t *vd;
3255 int error;
3256
b128c09f
BB
3257 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3258
34dc7c2f
BB
3259 /*
3260 * It's acceptable to have no devs specified.
3261 */
3262 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3263 return (0);
3264
3265 if (ndev == 0)
2e528b49 3266 return (SET_ERROR(EINVAL));
34dc7c2f
BB
3267
3268 /*
3269 * Make sure the pool is formatted with a version that supports this
3270 * device type.
3271 */
3272 if (spa_version(spa) < version)
2e528b49 3273 return (SET_ERROR(ENOTSUP));
34dc7c2f
BB
3274
3275 /*
3276 * Set the pending device list so we correctly handle device in-use
3277 * checking.
3278 */
3279 sav->sav_pending = dev;
3280 sav->sav_npending = ndev;
3281
3282 for (i = 0; i < ndev; i++) {
3283 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3284 mode)) != 0)
3285 goto out;
3286
3287 if (!vd->vdev_ops->vdev_op_leaf) {
3288 vdev_free(vd);
2e528b49 3289 error = SET_ERROR(EINVAL);
34dc7c2f
BB
3290 goto out;
3291 }
3292
3293 /*
b128c09f
BB
3294 * The L2ARC currently only supports disk devices in
3295 * kernel context. For user-level testing, we allow it.
34dc7c2f 3296 */
b128c09f 3297#ifdef _KERNEL
34dc7c2f
BB
3298 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3299 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
2e528b49 3300 error = SET_ERROR(ENOTBLK);
5ffb9d1d 3301 vdev_free(vd);
34dc7c2f
BB
3302 goto out;
3303 }
b128c09f 3304#endif
34dc7c2f
BB
3305 vd->vdev_top = vd;
3306
3307 if ((error = vdev_open(vd)) == 0 &&
3308 (error = vdev_label_init(vd, crtxg, label)) == 0) {
3309 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3310 vd->vdev_guid) == 0);
3311 }
3312
3313 vdev_free(vd);
3314
3315 if (error &&
3316 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3317 goto out;
3318 else
3319 error = 0;
3320 }
3321
3322out:
3323 sav->sav_pending = NULL;
3324 sav->sav_npending = 0;
3325 return (error);
3326}
3327
3328static int
3329spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3330{
3331 int error;
3332
b128c09f
BB
3333 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3334
34dc7c2f
BB
3335 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3336 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3337 VDEV_LABEL_SPARE)) != 0) {
3338 return (error);
3339 }
3340
3341 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3342 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3343 VDEV_LABEL_L2CACHE));
3344}
3345
3346static void
3347spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3348 const char *config)
3349{
3350 int i;
3351
3352 if (sav->sav_config != NULL) {
3353 nvlist_t **olddevs;
3354 uint_t oldndevs;
3355 nvlist_t **newdevs;
3356
3357 /*
3358 * Generate a new dev list by concatenating with the
3359 * current dev list.
3360 */
3361 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3362 &olddevs, &oldndevs) == 0);
3363
3364 newdevs = kmem_alloc(sizeof (void *) *
b8d06fca 3365 (ndevs + oldndevs), KM_PUSHPAGE);
34dc7c2f
BB
3366 for (i = 0; i < oldndevs; i++)
3367 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
b8d06fca 3368 KM_PUSHPAGE) == 0);
34dc7c2f
BB
3369 for (i = 0; i < ndevs; i++)
3370 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
b8d06fca 3371 KM_PUSHPAGE) == 0);
34dc7c2f
BB
3372
3373 VERIFY(nvlist_remove(sav->sav_config, config,
3374 DATA_TYPE_NVLIST_ARRAY) == 0);
3375
3376 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3377 config, newdevs, ndevs + oldndevs) == 0);
3378 for (i = 0; i < oldndevs + ndevs; i++)
3379 nvlist_free(newdevs[i]);
3380 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3381 } else {
3382 /*
3383 * Generate a new dev list.
3384 */
3385 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
b8d06fca 3386 KM_PUSHPAGE) == 0);
34dc7c2f
BB
3387 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3388 devs, ndevs) == 0);
3389 }
3390}
3391
3392/*
3393 * Stop and drop level 2 ARC devices
3394 */
3395void
3396spa_l2cache_drop(spa_t *spa)
3397{
3398 vdev_t *vd;
3399 int i;
3400 spa_aux_vdev_t *sav = &spa->spa_l2cache;
3401
3402 for (i = 0; i < sav->sav_count; i++) {
3403 uint64_t pool;
3404
3405 vd = sav->sav_vdevs[i];
3406 ASSERT(vd != NULL);
3407
fb5f0bc8
BB
3408 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3409 pool != 0ULL && l2arc_vdev_present(vd))
34dc7c2f 3410 l2arc_remove_vdev(vd);
34dc7c2f
BB
3411 }
3412}
3413
3414/*
3415 * Pool Creation
3416 */
3417int
3418spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
6f1ffb06 3419 nvlist_t *zplprops)
34dc7c2f
BB
3420{
3421 spa_t *spa;
3422 char *altroot = NULL;
3423 vdev_t *rvd;
3424 dsl_pool_t *dp;
3425 dmu_tx_t *tx;
9babb374 3426 int error = 0;
34dc7c2f
BB
3427 uint64_t txg = TXG_INITIAL;
3428 nvlist_t **spares, **l2cache;
3429 uint_t nspares, nl2cache;
428870ff 3430 uint64_t version, obj;
9ae529ec
CS
3431 boolean_t has_features;
3432 nvpair_t *elem;
d6320ddb 3433 int c;
34dc7c2f
BB
3434
3435 /*
3436 * If this pool already exists, return failure.
3437 */
3438 mutex_enter(&spa_namespace_lock);
3439 if (spa_lookup(pool) != NULL) {
3440 mutex_exit(&spa_namespace_lock);
2e528b49 3441 return (SET_ERROR(EEXIST));
34dc7c2f
BB
3442 }
3443
3444 /*
3445 * Allocate a new spa_t structure.
3446 */
3447 (void) nvlist_lookup_string(props,
3448 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
428870ff 3449 spa = spa_add(pool, NULL, altroot);
fb5f0bc8 3450 spa_activate(spa, spa_mode_global);
34dc7c2f 3451
34dc7c2f 3452 if (props && (error = spa_prop_validate(spa, props))) {
34dc7c2f
BB
3453 spa_deactivate(spa);
3454 spa_remove(spa);
b128c09f 3455 mutex_exit(&spa_namespace_lock);
34dc7c2f
BB
3456 return (error);
3457 }
3458
9ae529ec
CS
3459 has_features = B_FALSE;
3460 for (elem = nvlist_next_nvpair(props, NULL);
3461 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3462 if (zpool_prop_feature(nvpair_name(elem)))
3463 has_features = B_TRUE;
3464 }
3465
3466 if (has_features || nvlist_lookup_uint64(props,
3467 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
34dc7c2f 3468 version = SPA_VERSION;
9ae529ec
CS
3469 }
3470 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
428870ff
BB
3471
3472 spa->spa_first_txg = txg;
3473 spa->spa_uberblock.ub_txg = txg - 1;
34dc7c2f
BB
3474 spa->spa_uberblock.ub_version = version;
3475 spa->spa_ubsync = spa->spa_uberblock;
3476
9babb374
BB
3477 /*
3478 * Create "The Godfather" zio to hold all async IOs
3479 */
3480 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3481 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3482
34dc7c2f
BB
3483 /*
3484 * Create the root vdev.
3485 */
b128c09f 3486 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
3487
3488 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3489
3490 ASSERT(error != 0 || rvd != NULL);
3491 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3492
3493 if (error == 0 && !zfs_allocatable_devs(nvroot))
2e528b49 3494 error = SET_ERROR(EINVAL);
34dc7c2f
BB
3495
3496 if (error == 0 &&
3497 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3498 (error = spa_validate_aux(spa, nvroot, txg,
3499 VDEV_ALLOC_ADD)) == 0) {
d6320ddb 3500 for (c = 0; c < rvd->vdev_children; c++) {
9babb374
BB
3501 vdev_metaslab_set_size(rvd->vdev_child[c]);
3502 vdev_expand(rvd->vdev_child[c], txg);
3503 }
34dc7c2f
BB
3504 }
3505
b128c09f 3506 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
3507
3508 if (error != 0) {
3509 spa_unload(spa);
3510 spa_deactivate(spa);
3511 spa_remove(spa);
3512 mutex_exit(&spa_namespace_lock);
3513 return (error);
3514 }
3515
3516 /*
3517 * Get the list of spares, if specified.
3518 */
3519 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3520 &spares, &nspares) == 0) {
3521 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
b8d06fca 3522 KM_PUSHPAGE) == 0);
34dc7c2f
BB
3523 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3524 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
b128c09f 3525 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 3526 spa_load_spares(spa);
b128c09f 3527 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
3528 spa->spa_spares.sav_sync = B_TRUE;
3529 }
3530
3531 /*
3532 * Get the list of level 2 cache devices, if specified.
3533 */
3534 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3535 &l2cache, &nl2cache) == 0) {
3536 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
b8d06fca 3537 NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
34dc7c2f
BB
3538 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3539 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
b128c09f 3540 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 3541 spa_load_l2cache(spa);
b128c09f 3542 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
3543 spa->spa_l2cache.sav_sync = B_TRUE;
3544 }
3545
9ae529ec 3546 spa->spa_is_initializing = B_TRUE;
b128c09f 3547 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
34dc7c2f 3548 spa->spa_meta_objset = dp->dp_meta_objset;
9ae529ec 3549 spa->spa_is_initializing = B_FALSE;
34dc7c2f 3550
428870ff
BB
3551 /*
3552 * Create DDTs (dedup tables).
3553 */
3554 ddt_create(spa);
3555
3556 spa_update_dspace(spa);
3557
34dc7c2f
BB
3558 tx = dmu_tx_create_assigned(dp, txg);
3559
3560 /*
3561 * Create the pool config object.
3562 */
3563 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
b128c09f 3564 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
34dc7c2f
BB
3565 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3566
3567 if (zap_add(spa->spa_meta_objset,
3568 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3569 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3570 cmn_err(CE_PANIC, "failed to add pool config");
3571 }
3572
9ae529ec
CS
3573 if (spa_version(spa) >= SPA_VERSION_FEATURES)
3574 spa_feature_create_zap_objects(spa, tx);
3575
428870ff
BB
3576 if (zap_add(spa->spa_meta_objset,
3577 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3578 sizeof (uint64_t), 1, &version, tx) != 0) {
3579 cmn_err(CE_PANIC, "failed to add pool version");
3580 }
3581
34dc7c2f
BB
3582 /* Newly created pools with the right version are always deflated. */
3583 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3584 spa->spa_deflate = TRUE;
3585 if (zap_add(spa->spa_meta_objset,
3586 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3587 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3588 cmn_err(CE_PANIC, "failed to add deflate");
3589 }
3590 }
3591
3592 /*
428870ff 3593 * Create the deferred-free bpobj. Turn off compression
34dc7c2f
BB
3594 * because sync-to-convergence takes longer if the blocksize
3595 * keeps changing.
3596 */
428870ff
BB
3597 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3598 dmu_object_set_compress(spa->spa_meta_objset, obj,
34dc7c2f 3599 ZIO_COMPRESS_OFF, tx);
34dc7c2f 3600 if (zap_add(spa->spa_meta_objset,
428870ff
BB
3601 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3602 sizeof (uint64_t), 1, &obj, tx) != 0) {
3603 cmn_err(CE_PANIC, "failed to add bpobj");
34dc7c2f 3604 }
428870ff
BB
3605 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3606 spa->spa_meta_objset, obj));
34dc7c2f
BB
3607
3608 /*
3609 * Create the pool's history object.
3610 */
3611 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3612 spa_history_create_obj(spa, tx);
3613
3614 /*
3615 * Set pool properties.
3616 */
3617 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3618 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3619 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
9babb374 3620 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
428870ff 3621
d164b209
BB
3622 if (props != NULL) {
3623 spa_configfile_set(spa, props, B_FALSE);
13fe0198 3624 spa_sync_props(props, tx);
d164b209 3625 }
34dc7c2f
BB
3626
3627 dmu_tx_commit(tx);
3628
3629 spa->spa_sync_on = B_TRUE;
3630 txg_sync_start(spa->spa_dsl_pool);
3631
3632 /*
3633 * We explicitly wait for the first transaction to complete so that our
3634 * bean counters are appropriately updated.
3635 */
3636 txg_wait_synced(spa->spa_dsl_pool, txg);
3637
b128c09f 3638 spa_config_sync(spa, B_FALSE, B_TRUE);
34dc7c2f 3639
6f1ffb06 3640 spa_history_log_version(spa, "create");
34dc7c2f 3641
b128c09f
BB
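	/*
	 * Record the baseline reference count; spa_refcount_zero() compares
	 * against this value when deciding whether the pool is busy.
	 */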
3642 spa->spa_minref = refcount_count(&spa->spa_refcount);
3643
d164b209
BB
3644 mutex_exit(&spa_namespace_lock);
3645
34dc7c2f
BB
3646 return (0);
3647}
3648
9babb374 3649#ifdef _KERNEL
34dc7c2f 3650/*
9babb374
BB
3651 * Get the root pool information from the root disk, then import the root pool
3652 * at system boot time.
34dc7c2f 3653 */
9babb374
BB
3654extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3655
3656static nvlist_t *
3657spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3658{
3659 nvlist_t *config;
3660 nvlist_t *nvtop, *nvroot;
3661 uint64_t pgid;
3662
3663 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3664 return (NULL);
3665
3666 /*
3667 * Add this top-level vdev to the child array.
3668 */
3669 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3670 &nvtop) == 0);
3671 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3672 &pgid) == 0);
3673 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3674
3675 /*
3676 * Put this pool's top-level vdevs into a root vdev.
3677 */
b8d06fca 3678 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
9babb374
BB
3679 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3680 VDEV_TYPE_ROOT) == 0);
3681 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3682 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3683 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3684 &nvtop, 1) == 0);
3685
3686 /*
3687 * Replace the existing vdev_tree with the new root vdev in
3688 * this pool's configuration (remove the old, add the new).
3689 */
3690 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3691 nvlist_free(nvroot);
3692 return (config);
3693}
3694
3695/*
3696 * Walk the vdev tree and see if we can find a device with "better"
3697 * configuration. A configuration is "better" if the label on that
3698 * device has a more recent txg.
3699 */
3700static void
3701spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3702{
d6320ddb
BB
3703 int c;
3704
3705 for (c = 0; c < vd->vdev_children; c++)
9babb374
BB
3706 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3707
3708 if (vd->vdev_ops->vdev_op_leaf) {
3709 nvlist_t *label;
3710 uint64_t label_txg;
3711
3712 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3713 &label) != 0)
3714 return;
3715
3716 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3717 &label_txg) == 0);
3718
3719 /*
3720 * Do we have a better boot device?
3721 */
3722 if (label_txg > *txg) {
3723 *txg = label_txg;
3724 *avd = vd;
3725 }
3726 nvlist_free(label);
3727 }
3728}
3729
3730/*
3731 * Import a root pool.
3732 *
3733 * For x86, devpath_list will consist of the devid and/or physpath name of
3734 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3735 * The GRUB "findroot" command will return the vdev we should boot.
3736 *
3737 * For Sparc, devpath_list consists of the physpath name of the booting
3738 * device, whether the root pool is a single-device pool or a mirrored pool.
3739 * e.g.
3740 * "/pci@1f,0/ide@d/disk@0,0:a"
3741 */
3742int
3743spa_import_rootpool(char *devpath, char *devid)
3744{
3745 spa_t *spa;
3746 vdev_t *rvd, *bvd, *avd = NULL;
3747 nvlist_t *config, *nvtop;
3748 uint64_t guid, txg;
3749 char *pname;
3750 int error;
3751
3752 /*
3753 * Read the label from the boot device and generate a configuration.
3754 */
428870ff
BB
3755 config = spa_generate_rootconf(devpath, devid, &guid);
3756#if defined(_OBP) && defined(_KERNEL)
3757 if (config == NULL) {
3758 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3759 /* iscsi boot */
3760 get_iscsi_bootpath_phy(devpath);
3761 config = spa_generate_rootconf(devpath, devid, &guid);
3762 }
3763 }
3764#endif
3765 if (config == NULL) {
9ae529ec 3766 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
9babb374 3767 devpath);
2e528b49 3768 return (SET_ERROR(EIO));
9babb374
BB
3769 }
3770
3771 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3772 &pname) == 0);
3773 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3774
3775 mutex_enter(&spa_namespace_lock);
3776 if ((spa = spa_lookup(pname)) != NULL) {
3777 /*
3778 * Remove the existing root pool from the namespace so that we
3779 * can replace it with the correct config we just read in.
3780 */
3781 spa_remove(spa);
3782 }
3783
428870ff 3784 spa = spa_add(pname, config, NULL);
9babb374 3785 spa->spa_is_root = B_TRUE;
572e2857 3786 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
9babb374
BB
3787
3788 /*
3789 * Build up a vdev tree based on the boot device's label config.
3790 */
3791 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3792 &nvtop) == 0);
3793 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3794 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3795 VDEV_ALLOC_ROOTPOOL);
3796 spa_config_exit(spa, SCL_ALL, FTAG);
3797 if (error) {
3798 mutex_exit(&spa_namespace_lock);
3799 nvlist_free(config);
3800 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3801 pname);
3802 return (error);
3803 }
3804
3805 /*
3806 * Get the boot vdev.
3807 */
3808 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3809 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3810 (u_longlong_t)guid);
2e528b49 3811 error = SET_ERROR(ENOENT);
9babb374
BB
3812 goto out;
3813 }
3814
3815 /*
3816 * Determine if there is a better boot device.
3817 */
3818 avd = bvd;
3819 spa_alt_rootvdev(rvd, &avd, &txg);
3820 if (avd != bvd) {
3821 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3822 "try booting from '%s'", avd->vdev_path);
2e528b49 3823 error = SET_ERROR(EINVAL);
9babb374
BB
3824 goto out;
3825 }
3826
3827 /*
3828 * If the boot device is part of a spare vdev then ensure that
3829 * we're booting off the active spare.
3830 */
3831 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3832 !bvd->vdev_isspare) {
3833 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3834 "try booting from '%s'",
572e2857
BB
3835 bvd->vdev_parent->
3836 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
2e528b49 3837 error = SET_ERROR(EINVAL);
9babb374
BB
3838 goto out;
3839 }
3840
9babb374
BB
3841 error = 0;
3842out:
3843 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3844 vdev_free(rvd);
3845 spa_config_exit(spa, SCL_ALL, FTAG);
3846 mutex_exit(&spa_namespace_lock);
3847
3848 nvlist_free(config);
3849 return (error);
3850}
3851
3852#endif
3853
9babb374
BB
3854/*
3855 * Import a non-root pool into the system.
3856 */
3857int
13fe0198 3858spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
34dc7c2f
BB
3859{
3860 spa_t *spa;
3861 char *altroot = NULL;
428870ff
BB
3862 spa_load_state_t state = SPA_LOAD_IMPORT;
3863 zpool_rewind_policy_t policy;
572e2857
BB
3864 uint64_t mode = spa_mode_global;
3865 uint64_t readonly = B_FALSE;
9babb374 3866 int error;
34dc7c2f
BB
3867 nvlist_t *nvroot;
3868 nvlist_t **spares, **l2cache;
3869 uint_t nspares, nl2cache;
34dc7c2f
BB
3870
3871 /*
3872 * If a pool with this name exists, return failure.
3873 */
3874 mutex_enter(&spa_namespace_lock);
428870ff 3875 if (spa_lookup(pool) != NULL) {
9babb374 3876 mutex_exit(&spa_namespace_lock);
2e528b49 3877 return (SET_ERROR(EEXIST));
34dc7c2f
BB
3878 }
3879
3880 /*
3881 * Create and initialize the spa structure.
3882 */
3883 (void) nvlist_lookup_string(props,
3884 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
572e2857
BB
3885 (void) nvlist_lookup_uint64(props,
3886 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3887 if (readonly)
3888 mode = FREAD;
428870ff 3889 spa = spa_add(pool, config, altroot);
572e2857
BB
3890 spa->spa_import_flags = flags;
3891
3892 /*
3893 * Verbatim import - Take a pool and insert it into the namespace
3894 * as if it had been loaded at boot.
3895 */
3896 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3897 if (props != NULL)
3898 spa_configfile_set(spa, props, B_FALSE);
3899
3900 spa_config_sync(spa, B_FALSE, B_TRUE);
3901
3902 mutex_exit(&spa_namespace_lock);
6f1ffb06 3903 spa_history_log_version(spa, "import");
572e2857
BB
3904
3905 return (0);
3906 }
3907
3908 spa_activate(spa, mode);
34dc7c2f 3909
9babb374
BB
3910 /*
3911 * Don't start async tasks until we know everything is healthy.
3912 */
3913 spa_async_suspend(spa);
b128c09f 3914
572e2857
BB
3915 zpool_get_rewind_policy(config, &policy);
3916 if (policy.zrp_request & ZPOOL_DO_REWIND)
3917 state = SPA_LOAD_RECOVER;
3918
34dc7c2f 3919 /*
9babb374
BB
3920 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
3921 * because the user-supplied config is actually the one to trust when
b128c09f 3922 * doing an import.
34dc7c2f 3923 */
428870ff
BB
3924 if (state != SPA_LOAD_RECOVER)
3925 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
572e2857 3926
428870ff
BB
3927 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3928 policy.zrp_request);
3929
3930 /*
572e2857
BB
3931 * Propagate anything learned while loading the pool and pass it
3932 * back to caller (i.e. rewind info, missing devices, etc).
428870ff 3933 */
572e2857
BB
3934 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3935 spa->spa_load_info) == 0);
34dc7c2f 3936
b128c09f 3937 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 3938 /*
9babb374
BB
3939 * Toss any existing sparelist, as it doesn't have any validity
3940 * anymore, and conflicts with spa_has_spare().
34dc7c2f 3941 */
9babb374 3942 if (spa->spa_spares.sav_config) {
34dc7c2f
BB
3943 nvlist_free(spa->spa_spares.sav_config);
3944 spa->spa_spares.sav_config = NULL;
3945 spa_load_spares(spa);
3946 }
9babb374 3947 if (spa->spa_l2cache.sav_config) {
34dc7c2f
BB
3948 nvlist_free(spa->spa_l2cache.sav_config);
3949 spa->spa_l2cache.sav_config = NULL;
3950 spa_load_l2cache(spa);
3951 }
3952
3953 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3954 &nvroot) == 0);
3955 if (error == 0)
9babb374
BB
3956 error = spa_validate_aux(spa, nvroot, -1ULL,
3957 VDEV_ALLOC_SPARE);
34dc7c2f
BB
3958 if (error == 0)
3959 error = spa_validate_aux(spa, nvroot, -1ULL,
3960 VDEV_ALLOC_L2CACHE);
b128c09f 3961 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f 3962
d164b209
BB
3963 if (props != NULL)
3964 spa_configfile_set(spa, props, B_FALSE);
3965
fb5f0bc8
BB
3966 if (error != 0 || (props && spa_writeable(spa) &&
3967 (error = spa_prop_set(spa, props)))) {
9babb374
BB
3968 spa_unload(spa);
3969 spa_deactivate(spa);
3970 spa_remove(spa);
34dc7c2f
BB
3971 mutex_exit(&spa_namespace_lock);
3972 return (error);
3973 }
3974
572e2857
BB
3975 spa_async_resume(spa);
3976
34dc7c2f
BB
3977 /*
3978 * Override any spares and level 2 cache devices as specified by
3979 * the user, as these may have correct device names/devids, etc.
3980 */
3981 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3982 &spares, &nspares) == 0) {
3983 if (spa->spa_spares.sav_config)
3984 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3985 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3986 else
3987 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
b8d06fca 3988 NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
34dc7c2f
BB
3989 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3990 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
b128c09f 3991 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 3992 spa_load_spares(spa);
b128c09f 3993 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
3994 spa->spa_spares.sav_sync = B_TRUE;
3995 }
3996 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3997 &l2cache, &nl2cache) == 0) {
3998 if (spa->spa_l2cache.sav_config)
3999 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
4000 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
4001 else
4002 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
b8d06fca 4003 NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
34dc7c2f
BB
4004 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4005 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
b128c09f 4006 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 4007 spa_load_l2cache(spa);
b128c09f 4008 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
4009 spa->spa_l2cache.sav_sync = B_TRUE;
4010 }
4011
428870ff
BB
4012 /*
4013 * Check for any removed devices.
4014 */
4015 if (spa->spa_autoreplace) {
4016 spa_aux_check_removed(&spa->spa_spares);
4017 spa_aux_check_removed(&spa->spa_l2cache);
4018 }
4019
fb5f0bc8 4020 if (spa_writeable(spa)) {
b128c09f
BB
4021 /*
4022 * Update the config cache to include the newly-imported pool.
4023 */
45d1cae3 4024 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
b128c09f 4025 }
34dc7c2f 4026
34dc7c2f 4027 /*
9babb374
BB
4028 * It's possible that the pool was expanded while it was exported.
4029 * We kick off an async task to handle this for us.
34dc7c2f 4030 */
9babb374 4031 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
b128c09f 4032
9babb374 4033 mutex_exit(&spa_namespace_lock);
6f1ffb06 4034 spa_history_log_version(spa, "import");
b128c09f 4035
526af785
PJD
4036#ifdef _KERNEL
4037 zvol_create_minors(pool);
4038#endif
4039
b128c09f
BB
4040 return (0);
4041}
4042
34dc7c2f
BB
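/*
 * Probe a pool configuration without actually importing it: load the pool
 * read-only under the scratch name TRYIMPORT_NAME, generate its current
 * config for the caller, then unload and discard it.  This backs the
 * discovery phase of 'zpool import'.
 */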
4043nvlist_t *
4044spa_tryimport(nvlist_t *tryconfig)
4045{
4046 nvlist_t *config = NULL;
4047 char *poolname;
4048 spa_t *spa;
4049 uint64_t state;
d164b209 4050 int error;
34dc7c2f
BB
4051
4052 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4053 return (NULL);
4054
4055 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4056 return (NULL);
4057
4058 /*
4059 * Create and initialize the spa structure.
4060 */
4061 mutex_enter(&spa_namespace_lock);
428870ff 4062 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
fb5f0bc8 4063 spa_activate(spa, FREAD);
34dc7c2f
BB
4064
4065 /*
4066 * Pass off the heavy lifting to spa_load().
4067 * Pass TRUE for mosconfig because the user-supplied config
4068 * is actually the one to trust when doing an import.
4069 */
428870ff 4070 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
34dc7c2f
BB
4071
4072 /*
4073 * If 'tryconfig' was at least parsable, return the current config.
4074 */
4075 if (spa->spa_root_vdev != NULL) {
34dc7c2f 4076 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
34dc7c2f
BB
4077 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4078 poolname) == 0);
4079 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4080 state) == 0);
4081 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4082 spa->spa_uberblock.ub_timestamp) == 0);
9ae529ec
CS
4083 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4084 spa->spa_load_info) == 0);
34dc7c2f
BB
4085
4086 /*
4087 * If the bootfs property exists on this pool then we
4088 * copy it out so that external consumers can tell which
4089 * pools are bootable.
4090 */
d164b209 4091 if ((!error || error == EEXIST) && spa->spa_bootfs) {
b8d06fca 4092 char *tmpname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE);
34dc7c2f
BB
4093
4094 /*
4095 * We have to play games with the name since the
4096 * pool was opened as TRYIMPORT_NAME.
4097 */
b128c09f 4098 if (dsl_dsobj_to_dsname(spa_name(spa),
34dc7c2f
BB
4099 spa->spa_bootfs, tmpname) == 0) {
4100 char *cp;
b8d06fca 4101 char *dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE);
34dc7c2f
BB
4102
4103 cp = strchr(tmpname, '/');
4104 if (cp == NULL) {
4105 (void) strlcpy(dsname, tmpname,
4106 MAXPATHLEN);
4107 } else {
4108 (void) snprintf(dsname, MAXPATHLEN,
4109 "%s/%s", poolname, ++cp);
4110 }
4111 VERIFY(nvlist_add_string(config,
4112 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4113 kmem_free(dsname, MAXPATHLEN);
4114 }
4115 kmem_free(tmpname, MAXPATHLEN);
4116 }
4117
4118 /*
4119 * Add the list of hot spares and level 2 cache devices.
4120 */
9babb374 4121 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
34dc7c2f
BB
4122 spa_add_spares(spa, config);
4123 spa_add_l2cache(spa, config);
9babb374 4124 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f
BB
4125 }
4126
4127 spa_unload(spa);
4128 spa_deactivate(spa);
4129 spa_remove(spa);
4130 mutex_exit(&spa_namespace_lock);
4131
4132 return (config);
4133}
4134
4135/*
4136 * Pool export/destroy
4137 *
4138 * The act of destroying or exporting a pool is very simple. We make sure there
4139 * is no more pending I/O and that any references to the pool are gone. Then, we
4140 * update the pool state and sync all the labels to disk, removing the
fb5f0bc8
BB
4141 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4142 * we don't sync the labels or remove the configuration cache.
34dc7c2f
BB
4143 */
4144static int
b128c09f 4145spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
fb5f0bc8 4146 boolean_t force, boolean_t hardforce)
34dc7c2f
BB
4147{
4148 spa_t *spa;
4149
4150 if (oldconfig)
4151 *oldconfig = NULL;
4152
fb5f0bc8 4153 if (!(spa_mode_global & FWRITE))
2e528b49 4154 return (SET_ERROR(EROFS));
34dc7c2f
BB
4155
4156 mutex_enter(&spa_namespace_lock);
4157 if ((spa = spa_lookup(pool)) == NULL) {
4158 mutex_exit(&spa_namespace_lock);
2e528b49 4159 return (SET_ERROR(ENOENT));
34dc7c2f
BB
4160 }
4161
4162 /*
4163 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4164 * reacquire the namespace lock, and see if we can export.
4165 */
4166 spa_open_ref(spa, FTAG);
4167 mutex_exit(&spa_namespace_lock);
4168 spa_async_suspend(spa);
4169 mutex_enter(&spa_namespace_lock);
4170 spa_close(spa, FTAG);
4171
4172 /*
4173 * The pool will be in core if it's openable,
4174 * in which case we can modify its state.
4175 */
4176 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4177 /*
4178 * Objsets may be open only because they're dirty, so we
4179 * have to force it to sync before checking spa_refcnt.
4180 */
34dc7c2f
BB
4181 txg_wait_synced(spa->spa_dsl_pool, 0);
4182
4183 /*
4184 * A pool cannot be exported or destroyed if there are active
4185 * references. If we are resetting a pool, allow references by
4186 * fault injection handlers.
4187 */
4188 if (!spa_refcount_zero(spa) ||
4189 (spa->spa_inject_ref != 0 &&
4190 new_state != POOL_STATE_UNINITIALIZED)) {
34dc7c2f
BB
4191 spa_async_resume(spa);
4192 mutex_exit(&spa_namespace_lock);
2e528b49 4193 return (SET_ERROR(EBUSY));
34dc7c2f
BB
4194 }
4195
b128c09f
BB
4196 /*
4197 * A pool cannot be exported if it has an active shared spare.
4198 * This is to prevent other pools stealing the active spare
4199 * from an exported pool. If the user explicitly forces it, such a pool
4200 * can still be exported.
4201 */
4202 if (!force && new_state == POOL_STATE_EXPORTED &&
4203 spa_has_active_shared_spare(spa)) {
4204 spa_async_resume(spa);
4205 mutex_exit(&spa_namespace_lock);
2e528b49 4206 return (SET_ERROR(EXDEV));
b128c09f 4207 }
34dc7c2f
BB
4208
4209 /*
4210 * We want this to be reflected on every label,
4211 * so mark them all dirty. spa_unload() will do the
4212 * final sync that pushes these changes out.
4213 */
fb5f0bc8 4214 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
b128c09f 4215 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f 4216 spa->spa_state = new_state;
428870ff
BB
4217 spa->spa_final_txg = spa_last_synced_txg(spa) +
4218 TXG_DEFER_SIZE + 1;
34dc7c2f 4219 vdev_config_dirty(spa->spa_root_vdev);
b128c09f 4220 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
4221 }
4222 }
4223
26685276 4224 spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_DESTROY);
34dc7c2f
BB
4225
4226 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4227 spa_unload(spa);
4228 spa_deactivate(spa);
4229 }
4230
4231 if (oldconfig && spa->spa_config)
4232 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4233
4234 if (new_state != POOL_STATE_UNINITIALIZED) {
fb5f0bc8
BB
4235 if (!hardforce)
4236 spa_config_sync(spa, B_TRUE, B_TRUE);
34dc7c2f 4237 spa_remove(spa);
34dc7c2f
BB
4238 }
4239 mutex_exit(&spa_namespace_lock);
4240
4241 return (0);
4242}
4243
4244/*
4245 * Destroy a storage pool.
4246 */
4247int
4248spa_destroy(char *pool)
4249{
fb5f0bc8
BB
4250 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4251 B_FALSE, B_FALSE));
34dc7c2f
BB
4252}
4253
4254/*
4255 * Export a storage pool.
4256 */
4257int
fb5f0bc8
BB
4258spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4259 boolean_t hardforce)
34dc7c2f 4260{
fb5f0bc8
BB
4261 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4262 force, hardforce));
34dc7c2f
BB
4263}
4264
4265/*
4266 * Similar to spa_export(), this unloads the spa_t without actually removing it
4267 * from the namespace in any way.
4268 */
4269int
4270spa_reset(char *pool)
4271{
b128c09f 4272 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
fb5f0bc8 4273 B_FALSE, B_FALSE));
34dc7c2f
BB
4274}
4275
34dc7c2f
BB
4276/*
4277 * ==========================================================================
4278 * Device manipulation
4279 * ==========================================================================
4280 */
4281
4282/*
4283 * Add a device to a storage pool.
4284 */
4285int
4286spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4287{
428870ff 4288 uint64_t txg, id;
fb5f0bc8 4289 int error;
34dc7c2f
BB
4290 vdev_t *rvd = spa->spa_root_vdev;
4291 vdev_t *vd, *tvd;
4292 nvlist_t **spares, **l2cache;
4293 uint_t nspares, nl2cache;
d6320ddb 4294 int c;
34dc7c2f 4295
572e2857
BB
4296 ASSERT(spa_writeable(spa));
4297
34dc7c2f
BB
4298 txg = spa_vdev_enter(spa);
4299
4300 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4301 VDEV_ALLOC_ADD)) != 0)
4302 return (spa_vdev_exit(spa, NULL, txg, error));
4303
b128c09f 4304 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
34dc7c2f
BB
4305
4306 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4307 &nspares) != 0)
4308 nspares = 0;
4309
4310 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4311 &nl2cache) != 0)
4312 nl2cache = 0;
4313
b128c09f 4314 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
34dc7c2f 4315 return (spa_vdev_exit(spa, vd, txg, EINVAL));
34dc7c2f 4316
b128c09f
BB
4317 if (vd->vdev_children != 0 &&
4318 (error = vdev_create(vd, txg, B_FALSE)) != 0)
4319 return (spa_vdev_exit(spa, vd, txg, error));
34dc7c2f
BB
4320
4321 /*
4322 * We must validate the spares and l2cache devices after checking the
4323 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
4324 */
b128c09f 4325 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
34dc7c2f 4326 return (spa_vdev_exit(spa, vd, txg, error));
34dc7c2f
BB
4327
4328 /*
4329 * Transfer each new top-level vdev from vd to rvd.
4330 */
d6320ddb 4331 for (c = 0; c < vd->vdev_children; c++) {
428870ff
BB
4332
4333 /*
4334 * Set the vdev id to the first hole, if one exists.
4335 */
4336 for (id = 0; id < rvd->vdev_children; id++) {
4337 if (rvd->vdev_child[id]->vdev_ishole) {
4338 vdev_free(rvd->vdev_child[id]);
4339 break;
4340 }
4341 }
34dc7c2f
BB
4342 tvd = vd->vdev_child[c];
4343 vdev_remove_child(vd, tvd);
428870ff 4344 tvd->vdev_id = id;
34dc7c2f
BB
4345 vdev_add_child(rvd, tvd);
4346 vdev_config_dirty(tvd);
4347 }
4348
4349 if (nspares != 0) {
4350 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4351 ZPOOL_CONFIG_SPARES);
4352 spa_load_spares(spa);
4353 spa->spa_spares.sav_sync = B_TRUE;
4354 }
4355
4356 if (nl2cache != 0) {
4357 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4358 ZPOOL_CONFIG_L2CACHE);
4359 spa_load_l2cache(spa);
4360 spa->spa_l2cache.sav_sync = B_TRUE;
4361 }
4362
4363 /*
4364 * We have to be careful when adding new vdevs to an existing pool.
4365 * If other threads start allocating from these vdevs before we
4366 * sync the config cache, and we lose power, then upon reboot we may
4367 * fail to open the pool because there are DVAs that the config cache
4368 * can't translate. Therefore, we first add the vdevs without
4369 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4370 * and then let spa_config_update() initialize the new metaslabs.
4371 *
4372 * spa_load() checks for added-but-not-initialized vdevs, so that
4373 * if we lose power at any point in this sequence, the remaining
4374 * steps will be completed the next time we load the pool.
4375 */
4376 (void) spa_vdev_exit(spa, vd, txg, 0);
4377
4378 mutex_enter(&spa_namespace_lock);
4379 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4380 mutex_exit(&spa_namespace_lock);
4381
4382 return (0);
4383}
4384
4385/*
4386 * Attach a device to a mirror. The arguments are the path to any device
4387 * in the mirror, and the nvroot for the new device. If the path specifies
4388 * a device that is not mirrored, we automatically insert the mirror vdev.
4389 *
4390 * If 'replacing' is specified, the new device is intended to replace the
4391 * existing device; in this case the two devices are made into their own
4392 * mirror using the 'replacing' vdev, which is functionally identical to
4393 * the mirror vdev (it actually reuses all the same ops) but has a few
4394 * extra rules: you can't attach to it after it's been created, and upon
4395 * completion of resilvering, the first disk (the one being replaced)
4396 * is automatically detached.
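 *
 * In userland terms, 'zpool attach' reaches this function with 'replacing'
 * cleared and 'zpool replace' reaches it with 'replacing' set.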
4397 */
4398int
4399spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4400{
428870ff 4401 uint64_t txg, dtl_max_txg;
34dc7c2f
BB
4402 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4403 vdev_ops_t *pvops;
b128c09f
BB
4404 char *oldvdpath, *newvdpath;
4405 int newvd_isspare;
4406 int error;
2e528b49 4407 ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
34dc7c2f 4408
572e2857
BB
4409 ASSERT(spa_writeable(spa));
4410
34dc7c2f
BB
4411 txg = spa_vdev_enter(spa);
4412
b128c09f 4413 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
34dc7c2f
BB
4414
4415 if (oldvd == NULL)
4416 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4417
4418 if (!oldvd->vdev_ops->vdev_op_leaf)
4419 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4420
4421 pvd = oldvd->vdev_parent;
4422
4423 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
5ffb9d1d 4424 VDEV_ALLOC_ATTACH)) != 0)
34dc7c2f
BB
4425 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4426
4427 if (newrootvd->vdev_children != 1)
4428 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4429
4430 newvd = newrootvd->vdev_child[0];
4431
4432 if (!newvd->vdev_ops->vdev_op_leaf)
4433 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4434
4435 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4436 return (spa_vdev_exit(spa, newrootvd, txg, error));
4437
4438 /*
4439 * Spares can't replace logs
4440 */
b128c09f 4441 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
34dc7c2f
BB
4442 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4443
4444 if (!replacing) {
4445 /*
4446 * For attach, the only allowable parent is a mirror or the root
4447 * vdev.
4448 */
4449 if (pvd->vdev_ops != &vdev_mirror_ops &&
4450 pvd->vdev_ops != &vdev_root_ops)
4451 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4452
4453 pvops = &vdev_mirror_ops;
4454 } else {
4455 /*
4456 * Active hot spares can only be replaced by inactive hot
4457 * spares.
4458 */
4459 if (pvd->vdev_ops == &vdev_spare_ops &&
572e2857 4460 oldvd->vdev_isspare &&
34dc7c2f
BB
4461 !spa_has_spare(spa, newvd->vdev_guid))
4462 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4463
4464 /*
4465 * If the source is a hot spare, and the parent isn't already a
4466 * spare, then we want to create a new hot spare. Otherwise, we
4467 * want to create a replacing vdev. The user is not allowed to
4468 * attach to a spared vdev child unless the 'isspare' state is
4469 * the same (spare replaces spare, non-spare replaces
4470 * non-spare).
4471 */
572e2857
BB
4472 if (pvd->vdev_ops == &vdev_replacing_ops &&
4473 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
34dc7c2f 4474 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
572e2857
BB
4475 } else if (pvd->vdev_ops == &vdev_spare_ops &&
4476 newvd->vdev_isspare != oldvd->vdev_isspare) {
34dc7c2f 4477 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
572e2857
BB
4478 }
4479
4480 if (newvd->vdev_isspare)
34dc7c2f
BB
4481 pvops = &vdev_spare_ops;
4482 else
4483 pvops = &vdev_replacing_ops;
4484 }
4485
4486 /*
9babb374 4487 * Make sure the new device is big enough.
34dc7c2f 4488 */
9babb374 4489 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
34dc7c2f
BB
4490 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4491
4492 /*
4493 * The new device cannot have a higher alignment requirement
4494 * than the top-level vdev.
4495 */
4496 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4497 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4498
4499 /*
4500 * If this is an in-place replacement, update oldvd's path and devid
4501 * to make it distinguishable from newvd, and unopenable from now on.
4502 */
4503 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4504 spa_strfree(oldvd->vdev_path);
4505 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
b8d06fca 4506 KM_PUSHPAGE);
34dc7c2f
BB
4507 (void) sprintf(oldvd->vdev_path, "%s/%s",
4508 newvd->vdev_path, "old");
4509 if (oldvd->vdev_devid != NULL) {
4510 spa_strfree(oldvd->vdev_devid);
4511 oldvd->vdev_devid = NULL;
4512 }
4513 }
4514
572e2857 4515 /* mark the device being resilvered */
5d1f7fb6 4516 newvd->vdev_resilver_txg = txg;
572e2857 4517
34dc7c2f
BB
4518 /*
4519 * If the parent is not a mirror, or if we're replacing, insert the new
4520 * mirror/replacing/spare vdev above oldvd.
4521 */
4522 if (pvd->vdev_ops != pvops)
4523 pvd = vdev_add_parent(oldvd, pvops);
4524
4525 ASSERT(pvd->vdev_top->vdev_parent == rvd);
4526 ASSERT(pvd->vdev_ops == pvops);
4527 ASSERT(oldvd->vdev_parent == pvd);
4528
4529 /*
4530 * Extract the new device from its root and add it to pvd.
4531 */
4532 vdev_remove_child(newrootvd, newvd);
4533 newvd->vdev_id = pvd->vdev_children;
428870ff 4534 newvd->vdev_crtxg = oldvd->vdev_crtxg;
34dc7c2f
BB
4535 vdev_add_child(pvd, newvd);
4536
34dc7c2f
BB
4537 tvd = newvd->vdev_top;
4538 ASSERT(pvd->vdev_top == tvd);
4539 ASSERT(tvd->vdev_parent == rvd);
4540
4541 vdev_config_dirty(tvd);
4542
4543 /*
428870ff
BB
4544 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4545 * for any dmu_sync-ed blocks. It will propagate upward when
4546 * spa_vdev_exit() calls vdev_dtl_reassess().
34dc7c2f 4547 */
428870ff 4548 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
34dc7c2f 4549
428870ff
BB
4550 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4551 dtl_max_txg - TXG_INITIAL);
34dc7c2f 4552
9babb374 4553 if (newvd->vdev_isspare) {
34dc7c2f 4554 spa_spare_activate(newvd);
26685276 4555 spa_event_notify(spa, newvd, FM_EREPORT_ZFS_DEVICE_SPARE);
9babb374
BB
4556 }
4557
b128c09f
BB
4558 oldvdpath = spa_strdup(oldvd->vdev_path);
4559 newvdpath = spa_strdup(newvd->vdev_path);
4560 newvd_isspare = newvd->vdev_isspare;
34dc7c2f
BB
4561
4562 /*
4563 * Mark newvd's DTL dirty in this txg.
4564 */
4565 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4566
428870ff
BB
4567 /*
4568 * Restart the resilver
4569 */
4570 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4571
4572 /*
4573 * Commit the config
4574 */
4575 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
34dc7c2f 4576
6f1ffb06 4577 spa_history_log_internal(spa, "vdev attach", NULL,
428870ff 4578 "%s vdev=%s %s vdev=%s",
45d1cae3
BB
4579 replacing && newvd_isspare ? "spare in" :
4580 replacing ? "replace" : "attach", newvdpath,
4581 replacing ? "for" : "to", oldvdpath);
b128c09f
BB
4582
4583 spa_strfree(oldvdpath);
4584 spa_strfree(newvdpath);
4585
572e2857 4586 if (spa->spa_bootfs)
26685276 4587 spa_event_notify(spa, newvd, FM_EREPORT_ZFS_BOOTFS_VDEV_ATTACH);
572e2857 4588
34dc7c2f
BB
4589 return (0);
4590}
4591
4592/*
4593 * Detach a device from a mirror or replacing vdev.
d3cc8b15 4594 *
34dc7c2f
BB
4595 * If 'replace_done' is specified, only detach if the parent
4596 * is a replacing vdev.
4597 */
4598int
fb5f0bc8 4599spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
34dc7c2f
BB
4600{
4601 uint64_t txg;
fb5f0bc8 4602 int error;
34dc7c2f
BB
4603 vdev_t *vd, *pvd, *cvd, *tvd;
4604 boolean_t unspare = B_FALSE;
d4ed6673 4605 uint64_t unspare_guid = 0;
428870ff 4606 char *vdpath;
d6320ddb 4607 int c, t;
2e528b49 4608 ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
572e2857
BB
4609 ASSERT(spa_writeable(spa));
4610
34dc7c2f
BB
4611 txg = spa_vdev_enter(spa);
4612
b128c09f 4613 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
34dc7c2f
BB
4614
4615 if (vd == NULL)
4616 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4617
4618 if (!vd->vdev_ops->vdev_op_leaf)
4619 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4620
4621 pvd = vd->vdev_parent;
4622
fb5f0bc8
BB
4623 /*
4624 * If the parent/child relationship is not as expected, don't do it.
4625 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4626 * vdev that's replacing B with C. The user's intent in replacing
4627 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4628 * the replace by detaching C, the expected behavior is to end up
4629 * M(A,B). But suppose that right after deciding to detach C,
4630 * the replacement of B completes. We would have M(A,C), and then
4631 * ask to detach C, which would leave us with just A -- not what
4632 * the user wanted. To prevent this, we make sure that the
4633 * parent/child relationship hasn't changed -- in this example,
4634 * that C's parent is still the replacing vdev R.
4635 */
4636 if (pvd->vdev_guid != pguid && pguid != 0)
4637 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4638
34dc7c2f 4639 /*
572e2857 4640 * When 'replace_done' is set, only detach from a 'replacing' or 'spare' parent.
34dc7c2f 4641 */
572e2857
BB
4642 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4643 pvd->vdev_ops != &vdev_spare_ops)
4644 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
34dc7c2f
BB
4645
4646 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4647 spa_version(spa) >= SPA_VERSION_SPARES);
4648
4649 /*
4650 * Only mirror, replacing, and spare vdevs support detach.
4651 */
4652 if (pvd->vdev_ops != &vdev_replacing_ops &&
4653 pvd->vdev_ops != &vdev_mirror_ops &&
4654 pvd->vdev_ops != &vdev_spare_ops)
4655 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4656
4657 /*
fb5f0bc8
BB
4658 * If this device has the only valid copy of some data,
4659 * we cannot safely detach it.
34dc7c2f 4660 */
fb5f0bc8 4661 if (vdev_dtl_required(vd))
34dc7c2f
BB
4662 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4663
fb5f0bc8 4664 ASSERT(pvd->vdev_children >= 2);
34dc7c2f 4665
b128c09f
BB
4666 /*
4667 * If we are detaching the second disk from a replacing vdev, then
4668 * check to see if we changed the original vdev's path to have "/old"
4669 * at the end in spa_vdev_attach(). If so, undo that change now.
4670 */
572e2857
BB
4671 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4672 vd->vdev_path != NULL) {
4673 size_t len = strlen(vd->vdev_path);
4674
d6320ddb 4675 for (c = 0; c < pvd->vdev_children; c++) {
572e2857
BB
4676 cvd = pvd->vdev_child[c];
4677
4678 if (cvd == vd || cvd->vdev_path == NULL)
4679 continue;
4680
4681 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4682 strcmp(cvd->vdev_path + len, "/old") == 0) {
4683 spa_strfree(cvd->vdev_path);
4684 cvd->vdev_path = spa_strdup(vd->vdev_path);
4685 break;
4686 }
b128c09f
BB
4687 }
4688 }
4689
34dc7c2f
BB
4690 /*
4691 * If we are detaching the original disk from a spare, then it implies
4692 * that the spare should become a real disk, and be removed from the
4693 * active spare list for the pool.
4694 */
4695 if (pvd->vdev_ops == &vdev_spare_ops &&
572e2857
BB
4696 vd->vdev_id == 0 &&
4697 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
34dc7c2f
BB
4698 unspare = B_TRUE;
4699
4700 /*
4701 * Erase the disk labels so the disk can be used for other things.
4702 * This must be done after all other error cases are handled,
4703 * but before we disembowel vd (so we can still do I/O to it).
4704 * But if we can't do it, don't treat the error as fatal --
4705 * it may be that the unwritability of the disk is the reason
4706 * it's being detached!
4707 */
4708 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4709
4710 /*
4711 * Remove vd from its parent and compact the parent's children.
4712 */
4713 vdev_remove_child(pvd, vd);
4714 vdev_compact_children(pvd);
4715
4716 /*
4717 * Remember one of the remaining children so we can get tvd below.
4718 */
572e2857 4719 cvd = pvd->vdev_child[pvd->vdev_children - 1];
34dc7c2f
BB
4720
4721 /*
4722 * If we need to remove the remaining child from the list of hot spares,
fb5f0bc8
BB
4723 * do it now, marking the vdev as no longer a spare in the process.
4724 * We must do this before vdev_remove_parent(), because that can
4725 * change the GUID if it creates a new toplevel GUID. For a similar
4726 * reason, we must remove the spare now, in the same txg as the detach;
4727 * otherwise someone could attach a new sibling, change the GUID, and
4728 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
34dc7c2f
BB
4729 */
4730 if (unspare) {
4731 ASSERT(cvd->vdev_isspare);
4732 spa_spare_remove(cvd);
4733 unspare_guid = cvd->vdev_guid;
fb5f0bc8 4734 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
572e2857 4735 cvd->vdev_unspare = B_TRUE;
34dc7c2f
BB
4736 }
4737
428870ff
BB
4738 /*
4739 * If the parent mirror/replacing vdev only has one child,
4740 * the parent is no longer needed. Remove it from the tree.
4741 */
572e2857
BB
4742 if (pvd->vdev_children == 1) {
4743 if (pvd->vdev_ops == &vdev_spare_ops)
4744 cvd->vdev_unspare = B_FALSE;
428870ff 4745 vdev_remove_parent(cvd);
572e2857
BB
4746 }
4747
428870ff
BB
4748
4749 /*
4750 * We don't set tvd until now because the parent we just removed
4751 * may have been the previous top-level vdev.
4752 */
4753 tvd = cvd->vdev_top;
4754 ASSERT(tvd->vdev_parent == rvd);
4755
4756 /*
4757 * Reevaluate the parent vdev state.
4758 */
4759 vdev_propagate_state(cvd);
4760
4761 /*
4762 * If the 'autoexpand' property is set on the pool then automatically
4763 * try to expand the size of the pool. For example if the device we
4764 * just detached was smaller than the others, it may be possible to
4765 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4766 * first so that we can obtain the updated sizes of the leaf vdevs.
4767 */
4768 if (spa->spa_autoexpand) {
4769 vdev_reopen(tvd);
4770 vdev_expand(tvd, txg);
4771 }
4772
4773 vdev_config_dirty(tvd);
4774
4775 /*
4776 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
4777 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4778 * But first make sure we're not on any *other* txg's DTL list, to
4779 * prevent vd from being accessed after it's freed.
4780 */
4781 vdpath = spa_strdup(vd->vdev_path);
d6320ddb 4782 for (t = 0; t < TXG_SIZE; t++)
428870ff
BB
4783 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4784 vd->vdev_detached = B_TRUE;
4785 vdev_dirty(tvd, VDD_DTL, vd, txg);
4786
26685276 4787 spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_REMOVE);
428870ff 4788
572e2857
BB
4789 /* hang on to the spa before we release the lock */
4790 spa_open_ref(spa, FTAG);
4791
428870ff
BB
4792 error = spa_vdev_exit(spa, vd, txg, 0);
4793
6f1ffb06 4794 spa_history_log_internal(spa, "detach", NULL,
428870ff
BB
4795 "vdev=%s", vdpath);
4796 spa_strfree(vdpath);
4797
4798 /*
4799 * If this was the removal of the original device in a hot spare vdev,
4800 * then we want to go through and remove the device from the hot spare
4801 * list of every other pool.
4802 */
4803 if (unspare) {
572e2857
BB
4804 spa_t *altspa = NULL;
4805
428870ff 4806 mutex_enter(&spa_namespace_lock);
572e2857
BB
4807 while ((altspa = spa_next(altspa)) != NULL) {
4808 if (altspa->spa_state != POOL_STATE_ACTIVE ||
4809 altspa == spa)
428870ff 4810 continue;
572e2857
BB
4811
4812 spa_open_ref(altspa, FTAG);
428870ff 4813 mutex_exit(&spa_namespace_lock);
572e2857 4814 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
428870ff 4815 mutex_enter(&spa_namespace_lock);
572e2857 4816 spa_close(altspa, FTAG);
428870ff
BB
4817 }
4818 mutex_exit(&spa_namespace_lock);
572e2857
BB
4819
4820 /* search the rest of the vdevs for spares to remove */
4821 spa_vdev_resilver_done(spa);
428870ff
BB
4822 }
4823
572e2857
BB
4824 /* all done with the spa; OK to release */
4825 mutex_enter(&spa_namespace_lock);
4826 spa_close(spa, FTAG);
4827 mutex_exit(&spa_namespace_lock);
4828
428870ff
BB
4829 return (error);
4830}
4831
4832/*
4833 * Split a set of devices from their mirrors, and create a new pool from them.
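 * This is the kernel side of 'zpool split': one leaf is detached from each
 * top-level mirror and the detached leaves are assembled into a new,
 * independently importable pool.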
4834 */
4835int
4836spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4837 nvlist_t *props, boolean_t exp)
4838{
4839 int error = 0;
4840 uint64_t txg, *glist;
4841 spa_t *newspa;
4842 uint_t c, children, lastlog;
4843 nvlist_t **child, *nvl, *tmp;
4844 dmu_tx_t *tx;
4845 char *altroot = NULL;
4846 vdev_t *rvd, **vml = NULL; /* vdev modify list */
4847 boolean_t activate_slog;
4848
572e2857 4849 ASSERT(spa_writeable(spa));
428870ff
BB
4850
4851 txg = spa_vdev_enter(spa);
4852
4853 /* clear the log and flush everything up to now */
4854 activate_slog = spa_passivate_log(spa);
4855 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4856 error = spa_offline_log(spa);
4857 txg = spa_vdev_config_enter(spa);
4858
4859 if (activate_slog)
4860 spa_activate_log(spa);
4861
4862 if (error != 0)
4863 return (spa_vdev_exit(spa, NULL, txg, error));
4864
4865 /* check new spa name before going any further */
4866 if (spa_lookup(newname) != NULL)
4867 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4868
4869 /*
4870 * scan through all the children to ensure they're all mirrors
4871 */
4872 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4873 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4874 &children) != 0)
4875 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4876
4877 /* first, check to ensure we've got the right child count */
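	/*
	 * 'lastlog' tracks the start of a trailing run of log (or hole)
	 * vdevs; the caller's child list must cover every data vdev that
	 * precedes it.
	 */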
4878 rvd = spa->spa_root_vdev;
4879 lastlog = 0;
4880 for (c = 0; c < rvd->vdev_children; c++) {
4881 vdev_t *vd = rvd->vdev_child[c];
4882
4883 /* don't count the holes & logs as children */
4884 if (vd->vdev_islog || vd->vdev_ishole) {
4885 if (lastlog == 0)
4886 lastlog = c;
4887 continue;
4888 }
4889
4890 lastlog = 0;
4891 }
4892 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4893 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4894
4895 /* next, ensure no spare or cache devices are part of the split */
4896 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4897 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4898 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4899
b8d06fca
RY
4900 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_PUSHPAGE);
4901 glist = kmem_zalloc(children * sizeof (uint64_t), KM_PUSHPAGE);
428870ff
BB
4902
4903 /* then, loop over each vdev and validate it */
4904 for (c = 0; c < children; c++) {
4905 uint64_t is_hole = 0;
4906
4907 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4908 &is_hole);
4909
4910 if (is_hole != 0) {
4911 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4912 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4913 continue;
4914 } else {
2e528b49 4915 error = SET_ERROR(EINVAL);
428870ff
BB
4916 break;
4917 }
4918 }
4919
4920 /* which disk is going to be split? */
4921 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4922 &glist[c]) != 0) {
2e528b49 4923 error = SET_ERROR(EINVAL);
428870ff
BB
4924 break;
4925 }
4926
4927 /* look it up in the spa */
4928 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4929 if (vml[c] == NULL) {
2e528b49 4930 error = SET_ERROR(ENODEV);
428870ff
BB
4931 break;
4932 }
4933
4934 /* make sure there's nothing stopping the split */
4935 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4936 vml[c]->vdev_islog ||
4937 vml[c]->vdev_ishole ||
4938 vml[c]->vdev_isspare ||
4939 vml[c]->vdev_isl2cache ||
4940 !vdev_writeable(vml[c]) ||
4941 vml[c]->vdev_children != 0 ||
4942 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4943 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
2e528b49 4944 error = SET_ERROR(EINVAL);
428870ff
BB
4945 break;
4946 }
4947
4948 if (vdev_dtl_required(vml[c])) {
2e528b49 4949 error = SET_ERROR(EBUSY);
428870ff
BB
4950 break;
4951 }
4952
4953 /* we need certain info from the top level */
4954 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4955 vml[c]->vdev_top->vdev_ms_array) == 0);
4956 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4957 vml[c]->vdev_top->vdev_ms_shift) == 0);
4958 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4959 vml[c]->vdev_top->vdev_asize) == 0);
4960 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4961 vml[c]->vdev_top->vdev_ashift) == 0);
4962 }
4963
4964 if (error != 0) {
4965 kmem_free(vml, children * sizeof (vdev_t *));
4966 kmem_free(glist, children * sizeof (uint64_t));
4967 return (spa_vdev_exit(spa, NULL, txg, error));
4968 }
4969
4970 /* stop writers from using the disks */
4971 for (c = 0; c < children; c++) {
4972 if (vml[c] != NULL)
4973 vml[c]->vdev_offline = B_TRUE;
4974 }
4975 vdev_reopen(spa->spa_root_vdev);
34dc7c2f
BB
4976
4977 /*
428870ff
BB
4978 * Temporarily record the splitting vdevs in the spa config. This
4979 * will disappear once the config is regenerated.
34dc7c2f 4980 */
b8d06fca 4981 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
428870ff
BB
4982 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4983 glist, children) == 0);
4984 kmem_free(glist, children * sizeof (uint64_t));
34dc7c2f 4985
428870ff
BB
4986 mutex_enter(&spa->spa_props_lock);
4987 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4988 nvl) == 0);
4989 mutex_exit(&spa->spa_props_lock);
4990 spa->spa_config_splitting = nvl;
4991 vdev_config_dirty(spa->spa_root_vdev);
4992
4993 /* configure and create the new pool */
4994 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4995 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4996 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4997 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4998 spa_version(spa)) == 0);
4999 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5000 spa->spa_config_txg) == 0);
5001 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5002 spa_generate_guid(NULL)) == 0);
5003 (void) nvlist_lookup_string(props,
5004 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
34dc7c2f 5005
428870ff
BB
5006 /* add the new pool to the namespace */
5007 newspa = spa_add(newname, config, altroot);
5008 newspa->spa_config_txg = spa->spa_config_txg;
5009 spa_set_log_state(newspa, SPA_LOG_CLEAR);
5010
5011 /* release the spa config lock, retaining the namespace lock */
5012 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5013
5014 if (zio_injection_enabled)
5015 zio_handle_panic_injection(spa, FTAG, 1);
5016
5017 spa_activate(newspa, spa_mode_global);
5018 spa_async_suspend(newspa);
5019
5020 /* create the new pool from the disks of the original pool */
5021 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
5022 if (error)
5023 goto out;
5024
5025 /* if that worked, generate a real config for the new pool */
5026 if (newspa->spa_root_vdev != NULL) {
5027 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
b8d06fca 5028 NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
428870ff
BB
5029 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5030 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5031 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5032 B_TRUE));
9babb374 5033 }
34dc7c2f 5034
428870ff
BB
5035 /* set the props */
5036 if (props != NULL) {
5037 spa_configfile_set(newspa, props, B_FALSE);
5038 error = spa_prop_set(newspa, props);
5039 if (error)
5040 goto out;
5041 }
34dc7c2f 5042
428870ff
BB
5043 /* flush everything */
5044 txg = spa_vdev_config_enter(newspa);
5045 vdev_config_dirty(newspa->spa_root_vdev);
5046 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
34dc7c2f 5047
428870ff
BB
5048 if (zio_injection_enabled)
5049 zio_handle_panic_injection(spa, FTAG, 2);
34dc7c2f 5050
428870ff 5051 spa_async_resume(newspa);
34dc7c2f 5052
428870ff
BB
5053 /* finally, update the original pool's config */
5054 txg = spa_vdev_config_enter(spa);
5055 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5056 error = dmu_tx_assign(tx, TXG_WAIT);
5057 if (error != 0)
5058 dmu_tx_abort(tx);
5059 for (c = 0; c < children; c++) {
5060 if (vml[c] != NULL) {
5061 vdev_split(vml[c]);
5062 if (error == 0)
6f1ffb06
MA
5063 spa_history_log_internal(spa, "detach", tx,
5064 "vdev=%s", vml[c]->vdev_path);
428870ff 5065 vdev_free(vml[c]);
34dc7c2f 5066 }
34dc7c2f 5067 }
428870ff
BB
5068 vdev_config_dirty(spa->spa_root_vdev);
5069 spa->spa_config_splitting = NULL;
5070 nvlist_free(nvl);
5071 if (error == 0)
5072 dmu_tx_commit(tx);
5073 (void) spa_vdev_exit(spa, NULL, txg, 0);
5074
5075 if (zio_injection_enabled)
5076 zio_handle_panic_injection(spa, FTAG, 3);
5077
5078 /* split is complete; log a history record */
6f1ffb06
MA
5079 spa_history_log_internal(newspa, "split", NULL,
5080 "from pool %s", spa_name(spa));
428870ff
BB
5081
5082 kmem_free(vml, children * sizeof (vdev_t *));
5083
5084 /* if we're not going to mount the filesystems in userland, export */
5085 if (exp)
5086 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5087 B_FALSE, B_FALSE);
5088
5089 return (error);
5090
5091out:
5092 spa_unload(newspa);
5093 spa_deactivate(newspa);
5094 spa_remove(newspa);
5095
5096 txg = spa_vdev_config_enter(spa);
5097
5098 /* re-online all offlined disks */
5099 for (c = 0; c < children; c++) {
5100 if (vml[c] != NULL)
5101 vml[c]->vdev_offline = B_FALSE;
5102 }
5103 vdev_reopen(spa->spa_root_vdev);
5104
5105 nvlist_free(spa->spa_config_splitting);
5106 spa->spa_config_splitting = NULL;
5107 (void) spa_vdev_exit(spa, NULL, txg, error);
34dc7c2f 5108
428870ff 5109 kmem_free(vml, children * sizeof (vdev_t *));
34dc7c2f
BB
5110 return (error);
5111}
5112
b128c09f
BB
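/*
 * Return the nvlist in the 'nvpp' array (of length 'count') whose
 * ZPOOL_CONFIG_GUID matches 'target_guid', or NULL if there is none.
 */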
5113static nvlist_t *
5114spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
34dc7c2f 5115{
d6320ddb
BB
5116 int i;
5117
5118 for (i = 0; i < count; i++) {
b128c09f 5119 uint64_t guid;
34dc7c2f 5120
b128c09f
BB
5121 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5122 &guid) == 0);
34dc7c2f 5123
b128c09f
BB
5124 if (guid == target_guid)
5125 return (nvpp[i]);
34dc7c2f
BB
5126 }
5127
b128c09f 5128 return (NULL);
34dc7c2f
BB
5129}
5130
b128c09f
BB
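/*
 * Rewrite the 'name' nvlist array in 'config' (the spare or l2cache list)
 * with 'dev_to_remove' filtered out.
 */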
5131static void
5132spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5133 nvlist_t *dev_to_remove)
34dc7c2f 5134{
b128c09f 5135 nvlist_t **newdev = NULL;
d6320ddb 5136 int i, j;
34dc7c2f 5137
b128c09f 5138 if (count > 1)
b8d06fca 5139 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_PUSHPAGE);
34dc7c2f 5140
d6320ddb 5141 for (i = 0, j = 0; i < count; i++) {
b128c09f
BB
5142 if (dev[i] == dev_to_remove)
5143 continue;
b8d06fca 5144 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_PUSHPAGE) == 0);
34dc7c2f
BB
5145 }
5146
b128c09f
BB
5147 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5148 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
34dc7c2f 5149
d6320ddb 5150 for (i = 0; i < count - 1; i++)
b128c09f 5151 nvlist_free(newdev[i]);
34dc7c2f 5152
b128c09f
BB
5153 if (count > 1)
5154 kmem_free(newdev, (count - 1) * sizeof (void *));
34dc7c2f
BB
5155}
5156
428870ff
BB
5157/*
5158 * Evacuate the device.
5159 */
5160static int
5161spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5162{
5163 uint64_t txg;
5164 int error = 0;
5165
5166 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5167 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5168 ASSERT(vd == vd->vdev_top);
5169
5170 /*
5171 * Evacuate the device. We don't hold the config lock as writer
5172 * since we need to do I/O but we do keep the
5173 * spa_namespace_lock held. Once this completes the device
5174 * should no longer have any blocks allocated on it.
5175 */
5176 if (vd->vdev_islog) {
5177 if (vd->vdev_stat.vs_alloc != 0)
5178 error = spa_offline_log(spa);
5179 } else {
2e528b49 5180 error = SET_ERROR(ENOTSUP);
428870ff
BB
5181 }
5182
5183 if (error)
5184 return (error);
5185
5186 /*
5187 * The evacuation succeeded. Remove any remaining MOS metadata
5188 * associated with this vdev, and wait for these changes to sync.
5189 */
c99c9001 5190 ASSERT0(vd->vdev_stat.vs_alloc);
428870ff
BB
5191 txg = spa_vdev_config_enter(spa);
5192 vd->vdev_removing = B_TRUE;
5193 vdev_dirty(vd, 0, NULL, txg);
5194 vdev_config_dirty(vd);
5195 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5196
5197 return (0);
5198}
5199
5200/*
5201 * Complete the removal by cleaning up the namespace.
5202 */
5203static void
5204spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5205{
5206 vdev_t *rvd = spa->spa_root_vdev;
5207 uint64_t id = vd->vdev_id;
5208 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5209
5210 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5211 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5212 ASSERT(vd == vd->vdev_top);
5213
5214 /*
5215 * Only remove any devices which are empty.
5216 */
5217 if (vd->vdev_stat.vs_alloc != 0)
5218 return;
5219
5220 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5221
5222 if (list_link_active(&vd->vdev_state_dirty_node))
5223 vdev_state_clean(vd);
5224 if (list_link_active(&vd->vdev_config_dirty_node))
5225 vdev_config_clean(vd);
5226
5227 vdev_free(vd);
5228
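	/*
	 * Unless this was the last top-level vdev, put a hole vdev in its
	 * slot so that the ids of the remaining top-level vdevs (and the
	 * DVAs that reference them) are preserved.
	 */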
5229 if (last_vdev) {
5230 vdev_compact_children(rvd);
5231 } else {
5232 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5233 vdev_add_child(rvd, vd);
5234 }
5235 vdev_config_dirty(rvd);
5236
5237 /*
5238 * Reassess the health of our root vdev.
5239 */
5240 vdev_reopen(rvd);
5241}
5242
5243/*
5244 * Remove a device from the pool -
5245 *
5246 * Removing a device from the vdev namespace requires several steps
5247 * and can take a significant amount of time. As a result we use
5248 * the spa_vdev_config_[enter/exit] functions which allow us to
5249 * grab and release the spa_config_lock while still holding the namespace
5250 * lock. During each step the configuration is synced out.
d3cc8b15
WA
5251 *
5252 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5253 * devices.
34dc7c2f
BB
5254 */
5255int
5256spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5257{
5258 vdev_t *vd;
428870ff 5259 metaslab_group_t *mg;
b128c09f 5260 nvlist_t **spares, **l2cache, *nv;
fb5f0bc8 5261 uint64_t txg = 0;
428870ff 5262 uint_t nspares, nl2cache;
34dc7c2f 5263 int error = 0;
fb5f0bc8 5264 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
34dc7c2f 5265
572e2857
BB
5266 ASSERT(spa_writeable(spa));
5267
fb5f0bc8
BB
5268 if (!locked)
5269 txg = spa_vdev_enter(spa);
34dc7c2f 5270
b128c09f 5271 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
34dc7c2f
BB
5272
5273 if (spa->spa_spares.sav_vdevs != NULL &&
34dc7c2f 5274 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
b128c09f
BB
5275 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5276 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5277 /*
5278 * Only remove the hot spare if it's not currently in use
5279 * in this pool.
5280 */
5281 if (vd == NULL || unspare) {
5282 spa_vdev_remove_aux(spa->spa_spares.sav_config,
5283 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5284 spa_load_spares(spa);
5285 spa->spa_spares.sav_sync = B_TRUE;
5286 } else {
2e528b49 5287 error = SET_ERROR(EBUSY);
b128c09f
BB
5288 }
5289 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
34dc7c2f 5290 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
b128c09f
BB
5291 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5292 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5293 /*
5294 * Cache devices can always be removed.
5295 */
5296 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5297 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
34dc7c2f
BB
5298 spa_load_l2cache(spa);
5299 spa->spa_l2cache.sav_sync = B_TRUE;
428870ff
BB
5300 } else if (vd != NULL && vd->vdev_islog) {
5301 ASSERT(!locked);
5302 ASSERT(vd == vd->vdev_top);
5303
5304 /*
5305 * XXX - Once we have bp-rewrite this should
5306 * become the common case.
5307 */
5308
5309 mg = vd->vdev_mg;
5310
5311 /*
5312 * Stop allocating from this vdev.
5313 */
5314 metaslab_group_passivate(mg);
5315
5316 /*
5317 * Wait for the youngest allocations and frees to sync,
5318 * and then wait for the deferral of those frees to finish.
5319 */
5320 spa_vdev_config_exit(spa, NULL,
5321 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5322
5323 /*
5324 * Attempt to evacuate the vdev.
5325 */
5326 error = spa_vdev_remove_evacuate(spa, vd);
5327
5328 txg = spa_vdev_config_enter(spa);
5329
5330 /*
5331 * If we couldn't evacuate the vdev, unwind.
5332 */
5333 if (error) {
5334 metaslab_group_activate(mg);
5335 return (spa_vdev_exit(spa, NULL, txg, error));
5336 }
5337
5338 /*
5339 * Clean up the vdev namespace.
5340 */
5341 spa_vdev_remove_from_namespace(spa, vd);
5342
b128c09f
BB
5343 } else if (vd != NULL) {
5344 /*
5345 * Normal vdevs cannot be removed (yet).
5346 */
2e528b49 5347 error = SET_ERROR(ENOTSUP);
b128c09f
BB
5348 } else {
5349 /*
5350 * There is no vdev of any kind with the specified guid.
5351 */
2e528b49 5352 error = SET_ERROR(ENOENT);
34dc7c2f
BB
5353 }
5354
fb5f0bc8
BB
5355 if (!locked)
5356 return (spa_vdev_exit(spa, NULL, txg, error));
5357
5358 return (error);
34dc7c2f
BB
5359}
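/*
 * Illustrative call sketch: removing a cache device or unused hot spare
 * by guid, with the namespace lock not already held so spa_vdev_remove()
 * does the spa_vdev_enter()/spa_vdev_exit() itself:
 *
 *	error = spa_vdev_remove(spa, guid, B_FALSE);
 *
 * EBUSY means the guid names a hot spare that is currently in use,
 * ENOTSUP means it names a normal top-level vdev (which cannot be
 * removed yet), and ENOENT means no vdev with that guid exists.
 */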
5360
5361/*
5362 * Find any device that's done replacing, or a vdev marked 'unspare' that's
d3cc8b15 5363 * currently spared, so we can detach it.
34dc7c2f
BB
5364 */
5365static vdev_t *
5366spa_vdev_resilver_done_hunt(vdev_t *vd)
5367{
5368 vdev_t *newvd, *oldvd;
d6320ddb 5369 int c;
34dc7c2f 5370
d6320ddb 5371 for (c = 0; c < vd->vdev_children; c++) {
34dc7c2f
BB
5372 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5373 if (oldvd != NULL)
5374 return (oldvd);
5375 }
5376
5377 /*
572e2857
BB
5378 * Check for a completed replacement. We always consider the first
5379 * vdev in the list to be the oldest vdev, and the last one to be
5380 * the newest (see spa_vdev_attach() for how that works). In
5381 * the case where the newest vdev is faulted, we will not automatically
5382 * remove it after a resilver completes. This is OK as it will require
5383 * user intervention to determine which disk the admin wishes to keep.
34dc7c2f 5384 */
572e2857
BB
5385 if (vd->vdev_ops == &vdev_replacing_ops) {
5386 ASSERT(vd->vdev_children > 1);
5387
5388 newvd = vd->vdev_child[vd->vdev_children - 1];
34dc7c2f 5389 oldvd = vd->vdev_child[0];
34dc7c2f 5390
fb5f0bc8 5391 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
428870ff 5392 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
fb5f0bc8 5393 !vdev_dtl_required(oldvd))
34dc7c2f 5394 return (oldvd);
34dc7c2f
BB
5395 }
5396
5397 /*
5398 * Check for a completed resilver with the 'unspare' flag set.
5399 */
572e2857
BB
5400 if (vd->vdev_ops == &vdev_spare_ops) {
5401 vdev_t *first = vd->vdev_child[0];
5402 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5403
5404 if (last->vdev_unspare) {
5405 oldvd = first;
5406 newvd = last;
5407 } else if (first->vdev_unspare) {
5408 oldvd = last;
5409 newvd = first;
5410 } else {
5411 oldvd = NULL;
5412 }
34dc7c2f 5413
572e2857 5414 if (oldvd != NULL &&
fb5f0bc8 5415 vdev_dtl_empty(newvd, DTL_MISSING) &&
428870ff 5416 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
572e2857 5417 !vdev_dtl_required(oldvd))
34dc7c2f 5418 return (oldvd);
572e2857
BB
5419
5420 /*
5421 * If there are more than two spares attached to a disk,
5422 * and those spares are not required, then we want to
5423 * attempt to free them up now so that they can be used
5424 * by other pools. Once we're back down to a single
5425 * disk+spare, we stop removing them.
5426 */
5427 if (vd->vdev_children > 2) {
5428 newvd = vd->vdev_child[1];
5429
5430 if (newvd->vdev_isspare && last->vdev_isspare &&
5431 vdev_dtl_empty(last, DTL_MISSING) &&
5432 vdev_dtl_empty(last, DTL_OUTAGE) &&
5433 !vdev_dtl_required(newvd))
5434 return (newvd);
34dc7c2f 5435 }
34dc7c2f
BB
5436 }
5437
5438 return (NULL);
5439}
5440
5441static void
5442spa_vdev_resilver_done(spa_t *spa)
5443{
fb5f0bc8
BB
5444 vdev_t *vd, *pvd, *ppvd;
5445 uint64_t guid, sguid, pguid, ppguid;
34dc7c2f 5446
fb5f0bc8 5447 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
5448
5449 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
fb5f0bc8
BB
5450 pvd = vd->vdev_parent;
5451 ppvd = pvd->vdev_parent;
34dc7c2f 5452 guid = vd->vdev_guid;
fb5f0bc8
BB
5453 pguid = pvd->vdev_guid;
5454 ppguid = ppvd->vdev_guid;
5455 sguid = 0;
34dc7c2f
BB
5456 /*
5457 * If we have just finished replacing a hot spared device, then
5458 * we need to detach the parent's first child (the original hot
5459 * spare) as well.
5460 */
572e2857
BB
5461 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5462 ppvd->vdev_children == 2) {
34dc7c2f 5463 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
fb5f0bc8 5464 sguid = ppvd->vdev_child[1]->vdev_guid;
34dc7c2f 5465 }
5d1f7fb6
GW
5466 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
5467
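/*
 * Drop the config lock before detaching: spa_vdev_detach() acquires the
 * namespace and config locks on its own, and we re-take SCL_ALL below to
 * keep hunting for completed replacements.
 */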
fb5f0bc8
BB
5468 spa_config_exit(spa, SCL_ALL, FTAG);
5469 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
34dc7c2f 5470 return;
fb5f0bc8 5471 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
34dc7c2f 5472 return;
fb5f0bc8 5473 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
5474 }
5475
fb5f0bc8 5476 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
5477}
5478
5479/*
428870ff 5480 * Update the stored path or FRU for this vdev.
34dc7c2f
BB
5481 */
5482int
9babb374
BB
5483spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5484 boolean_t ispath)
34dc7c2f 5485{
b128c09f 5486 vdev_t *vd;
428870ff 5487 boolean_t sync = B_FALSE;
34dc7c2f 5488
572e2857
BB
5489 ASSERT(spa_writeable(spa));
5490
428870ff 5491 spa_vdev_state_enter(spa, SCL_ALL);
34dc7c2f 5492
9babb374 5493 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
428870ff 5494 return (spa_vdev_state_exit(spa, NULL, ENOENT));
34dc7c2f
BB
5495
5496 if (!vd->vdev_ops->vdev_op_leaf)
428870ff 5497 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
34dc7c2f 5498
9babb374 5499 if (ispath) {
428870ff
BB
5500 if (strcmp(value, vd->vdev_path) != 0) {
5501 spa_strfree(vd->vdev_path);
5502 vd->vdev_path = spa_strdup(value);
5503 sync = B_TRUE;
5504 }
9babb374 5505 } else {
428870ff
BB
5506 if (vd->vdev_fru == NULL) {
5507 vd->vdev_fru = spa_strdup(value);
5508 sync = B_TRUE;
5509 } else if (strcmp(value, vd->vdev_fru) != 0) {
9babb374 5510 spa_strfree(vd->vdev_fru);
428870ff
BB
5511 vd->vdev_fru = spa_strdup(value);
5512 sync = B_TRUE;
5513 }
9babb374 5514 }
34dc7c2f 5515
428870ff 5516 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
34dc7c2f
BB
5517}
5518
9babb374
BB
5519int
5520spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5521{
5522 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5523}
5524
5525int
5526spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5527{
5528 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5529}
5530
34dc7c2f
BB
5531/*
5532 * ==========================================================================
428870ff 5533 * SPA Scanning
34dc7c2f
BB
5534 * ==========================================================================
5535 */
5536
34dc7c2f 5537int
428870ff
BB
5538spa_scan_stop(spa_t *spa)
5539{
5540 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5541 if (dsl_scan_resilvering(spa->spa_dsl_pool))
2e528b49 5542 return (SET_ERROR(EBUSY));
428870ff
BB
5543 return (dsl_scan_cancel(spa->spa_dsl_pool));
5544}
5545
5546int
5547spa_scan(spa_t *spa, pool_scan_func_t func)
34dc7c2f 5548{
b128c09f 5549 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
34dc7c2f 5550
428870ff 5551 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
2e528b49 5552 return (SET_ERROR(ENOTSUP));
34dc7c2f 5553
34dc7c2f 5554 /*
b128c09f
BB
5555 * If a resilver was requested, but there is no DTL on a
5556 * writeable leaf device, we have nothing to do.
34dc7c2f 5557 */
428870ff 5558 if (func == POOL_SCAN_RESILVER &&
b128c09f
BB
5559 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5560 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
34dc7c2f
BB
5561 return (0);
5562 }
5563
428870ff 5564 return (dsl_scan(spa->spa_dsl_pool, func));
34dc7c2f
BB
5565}
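/*
 * Illustrative use (sketch): starting a scrub and later cancelling it
 * through the interfaces above:
 *
 *	error = spa_scan(spa, POOL_SCAN_SCRUB);
 *	...
 *	error = spa_scan_stop(spa);
 *
 * spa_scan_stop() returns EBUSY while a resilver is in progress, since
 * only a scrub, not a resilver, may be cancelled this way.
 */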
5566
5567/*
5568 * ==========================================================================
5569 * SPA async task processing
5570 * ==========================================================================
5571 */
5572
5573static void
5574spa_async_remove(spa_t *spa, vdev_t *vd)
5575{
d6320ddb
BB
5576 int c;
5577
b128c09f 5578 if (vd->vdev_remove_wanted) {
428870ff
BB
5579 vd->vdev_remove_wanted = B_FALSE;
5580 vd->vdev_delayed_close = B_FALSE;
b128c09f 5581 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
428870ff
BB
5582
5583 /*
5584 * We want to clear the stats, but we don't want to do a full
5585 * vdev_clear() as that will cause us to throw away
5586 * degraded/faulted state as well as attempt to reopen the
5587 * device, all of which is a waste.
5588 */
5589 vd->vdev_stat.vs_read_errors = 0;
5590 vd->vdev_stat.vs_write_errors = 0;
5591 vd->vdev_stat.vs_checksum_errors = 0;
5592
b128c09f
BB
5593 vdev_state_dirty(vd->vdev_top);
5594 }
34dc7c2f 5595
d6320ddb 5596 for (c = 0; c < vd->vdev_children; c++)
b128c09f
BB
5597 spa_async_remove(spa, vd->vdev_child[c]);
5598}
5599
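/*
 * Walk the vdev tree and reopen any vdev with a pending probe request;
 * the probe itself happens as a side effect of vdev_open().
 */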
5600static void
5601spa_async_probe(spa_t *spa, vdev_t *vd)
5602{
d6320ddb
BB
5603 int c;
5604
b128c09f 5605 if (vd->vdev_probe_wanted) {
428870ff 5606 vd->vdev_probe_wanted = B_FALSE;
b128c09f 5607 vdev_reopen(vd); /* vdev_open() does the actual probe */
34dc7c2f 5608 }
b128c09f 5609
d6320ddb 5610 for (c = 0; c < vd->vdev_children; c++)
b128c09f 5611 spa_async_probe(spa, vd->vdev_child[c]);
34dc7c2f
BB
5612}
5613
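/*
 * Walk the vdev tree and post an autoexpand event for every leaf vdev
 * that has a known physical path, so the underlying device can be grown
 * to its new size.  This is a no-op unless the pool's autoexpand
 * property is set.
 */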
9babb374
BB
5614static void
5615spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5616{
d6320ddb 5617 int c;
9babb374
BB
5618
5619 if (!spa->spa_autoexpand)
5620 return;
5621
d6320ddb 5622 for (c = 0; c < vd->vdev_children; c++) {
9babb374
BB
5623 vdev_t *cvd = vd->vdev_child[c];
5624 spa_async_autoexpand(spa, cvd);
5625 }
5626
5627 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5628 return;
5629
26685276 5630 spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_AUTOEXPAND);
9babb374
BB
5631}
5632
34dc7c2f
BB
5633static void
5634spa_async_thread(spa_t *spa)
5635{
d6320ddb 5636 int tasks, i;
34dc7c2f
BB
5637
5638 ASSERT(spa->spa_sync_on);
5639
5640 mutex_enter(&spa->spa_async_lock);
5641 tasks = spa->spa_async_tasks;
5642 spa->spa_async_tasks = 0;
5643 mutex_exit(&spa->spa_async_lock);
5644
5645 /*
5646 * See if the config needs to be updated.
5647 */
5648 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
428870ff 5649 uint64_t old_space, new_space;
9babb374 5650
34dc7c2f 5651 mutex_enter(&spa_namespace_lock);
428870ff 5652 old_space = metaslab_class_get_space(spa_normal_class(spa));
34dc7c2f 5653 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
428870ff 5654 new_space = metaslab_class_get_space(spa_normal_class(spa));
34dc7c2f 5655 mutex_exit(&spa_namespace_lock);
9babb374
BB
5656
5657 /*
5658 * If the pool grew as a result of the config update,
5659 * then log an internal history event.
5660 */
428870ff 5661 if (new_space != old_space) {
6f1ffb06 5662 spa_history_log_internal(spa, "vdev online", NULL,
45d1cae3 5663 "pool '%s' size: %llu(+%llu)",
428870ff 5664 spa_name(spa), new_space, new_space - old_space);
9babb374 5665 }
34dc7c2f
BB
5666 }
5667
5668 /*
5669 * See if any devices need to be marked REMOVED.
34dc7c2f 5670 */
b128c09f 5671 if (tasks & SPA_ASYNC_REMOVE) {
428870ff 5672 spa_vdev_state_enter(spa, SCL_NONE);
34dc7c2f 5673 spa_async_remove(spa, spa->spa_root_vdev);
d6320ddb 5674 for (i = 0; i < spa->spa_l2cache.sav_count; i++)
b128c09f 5675 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
d6320ddb 5676 for (i = 0; i < spa->spa_spares.sav_count; i++)
b128c09f
BB
5677 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5678 (void) spa_vdev_state_exit(spa, NULL, 0);
34dc7c2f
BB
5679 }
5680
9babb374
BB
5681 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5682 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5683 spa_async_autoexpand(spa, spa->spa_root_vdev);
5684 spa_config_exit(spa, SCL_CONFIG, FTAG);
5685 }
5686
34dc7c2f 5687 /*
b128c09f 5688 * See if any devices need to be probed.
34dc7c2f 5689 */
b128c09f 5690 if (tasks & SPA_ASYNC_PROBE) {
428870ff 5691 spa_vdev_state_enter(spa, SCL_NONE);
b128c09f
BB
5692 spa_async_probe(spa, spa->spa_root_vdev);
5693 (void) spa_vdev_state_exit(spa, NULL, 0);
5694 }
34dc7c2f
BB
5695
5696 /*
b128c09f 5697 * If any devices are done replacing, detach them.
34dc7c2f 5698 */
b128c09f
BB
5699 if (tasks & SPA_ASYNC_RESILVER_DONE)
5700 spa_vdev_resilver_done(spa);
34dc7c2f
BB
5701
5702 /*
5703 * Kick off a resilver.
5704 */
b128c09f 5705 if (tasks & SPA_ASYNC_RESILVER)
428870ff 5706 dsl_resilver_restart(spa->spa_dsl_pool, 0);
34dc7c2f
BB
5707
5708 /*
5709 * Let the world know that we're done.
5710 */
5711 mutex_enter(&spa->spa_async_lock);
5712 spa->spa_async_thread = NULL;
5713 cv_broadcast(&spa->spa_async_cv);
5714 mutex_exit(&spa->spa_async_lock);
5715 thread_exit();
5716}
5717
5718void
5719spa_async_suspend(spa_t *spa)
5720{
5721 mutex_enter(&spa->spa_async_lock);
5722 spa->spa_async_suspended++;
5723 while (spa->spa_async_thread != NULL)
5724 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5725 mutex_exit(&spa->spa_async_lock);
5726}
5727
5728void
5729spa_async_resume(spa_t *spa)
5730{
5731 mutex_enter(&spa->spa_async_lock);
5732 ASSERT(spa->spa_async_suspended != 0);
5733 spa->spa_async_suspended--;
5734 mutex_exit(&spa->spa_async_lock);
5735}
5736
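/*
 * Create the async thread if there is pending work, async processing is
 * not suspended, no async thread is already running, and the root
 * filesystem is writeable.
 */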
5737static void
5738spa_async_dispatch(spa_t *spa)
5739{
5740 mutex_enter(&spa->spa_async_lock);
5741 if (spa->spa_async_tasks && !spa->spa_async_suspended &&
5742 spa->spa_async_thread == NULL &&
5743 rootdir != NULL && !vn_is_readonly(rootdir))
5744 spa->spa_async_thread = thread_create(NULL, 0,
5745 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5746 mutex_exit(&spa->spa_async_lock);
5747}
5748
5749void
5750spa_async_request(spa_t *spa, int task)
5751{
428870ff 5752 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
34dc7c2f
BB
5753 mutex_enter(&spa->spa_async_lock);
5754 spa->spa_async_tasks |= task;
5755 mutex_exit(&spa->spa_async_lock);
5756}
5757
5758/*
5759 * ==========================================================================
5760 * SPA syncing routines
5761 * ==========================================================================
5762 */
5763
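/*
 * Iteration callbacks used when syncing frees: bpobj_enqueue_cb() defers
 * a free by appending the block pointer to a bpobj, while
 * spa_free_sync_cb() frees the block immediately through the supplied
 * parent zio.
 */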
428870ff
BB
5764static int
5765bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
34dc7c2f 5766{
428870ff
BB
5767 bpobj_t *bpo = arg;
5768 bpobj_enqueue(bpo, bp, tx);
5769 return (0);
5770}
34dc7c2f 5771
428870ff
BB
5772static int
5773spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5774{
5775 zio_t *zio = arg;
34dc7c2f 5776
428870ff
BB
5777 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5778 zio->io_flags));
5779 return (0);
34dc7c2f
BB
5780}
5781
e8b96c60
MA
5782/*
5783 * Note: this simple function is not inlined to make it easier to dtrace the
5784 * amount of time spent syncing frees.
5785 */
5786static void
5787spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
5788{
5789 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5790 bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
5791 VERIFY(zio_wait(zio) == 0);
5792}
5793
5794/*
5795 * Note: this simple function is not inlined to make it easier to dtrace the
5796 * amount of time spent syncing deferred frees.
5797 */
5798static void
5799spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
5800{
5801 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5802 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
5803 spa_free_sync_cb, zio, tx), ==, 0);
5804 VERIFY0(zio_wait(zio));
5805}
5806
34dc7c2f
BB
5807static void
5808spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5809{
5810 char *packed = NULL;
b128c09f 5811 size_t bufsize;
34dc7c2f
BB
5812 size_t nvsize = 0;
5813 dmu_buf_t *db;
5814
5815 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5816
b128c09f
BB
5817 /*
5818 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5819 * information. This avoids the dbuf_will_dirty() path and
5820 * saves us a pre-read to get data we don't actually care about.
5821 */
9ae529ec 5822 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
b8d06fca 5823 packed = vmem_alloc(bufsize, KM_PUSHPAGE);
34dc7c2f
BB
5824
5825 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
b8d06fca 5826 KM_PUSHPAGE) == 0);
b128c09f 5827 bzero(packed + nvsize, bufsize - nvsize);
34dc7c2f 5828
b128c09f 5829 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
34dc7c2f 5830
00b46022 5831 vmem_free(packed, bufsize);
34dc7c2f
BB
5832
5833 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5834 dmu_buf_will_dirty(db, tx);
5835 *(uint64_t *)db->db_data = nvsize;
5836 dmu_buf_rele(db, FTAG);
5837}
5838
5839static void
5840spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5841 const char *config, const char *entry)
5842{
5843 nvlist_t *nvroot;
5844 nvlist_t **list;
5845 int i;
5846
5847 if (!sav->sav_sync)
5848 return;
5849
5850 /*
5851 * Update the MOS nvlist describing the list of available devices.
5852 * spa_validate_aux() will have already made sure this nvlist is
5853 * valid and the vdevs are labeled appropriately.
5854 */
5855 if (sav->sav_object == 0) {
5856 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5857 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5858 sizeof (uint64_t), tx);
5859 VERIFY(zap_update(spa->spa_meta_objset,
5860 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5861 &sav->sav_object, tx) == 0);
5862 }
5863
b8d06fca 5864 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0);
34dc7c2f
BB
5865 if (sav->sav_count == 0) {
5866 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5867 } else {
b8d06fca 5868 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_PUSHPAGE);
34dc7c2f
BB
5869 for (i = 0; i < sav->sav_count; i++)
5870 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
428870ff 5871 B_FALSE, VDEV_CONFIG_L2CACHE);
34dc7c2f
BB
5872 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5873 sav->sav_count) == 0);
5874 for (i = 0; i < sav->sav_count; i++)
5875 nvlist_free(list[i]);
5876 kmem_free(list, sav->sav_count * sizeof (void *));
5877 }
5878
5879 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5880 nvlist_free(nvroot);
5881
5882 sav->sav_sync = B_FALSE;
5883}
5884
5885static void
5886spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5887{
5888 nvlist_t *config;
5889
b128c09f 5890 if (list_is_empty(&spa->spa_config_dirty_list))
34dc7c2f
BB
5891 return;
5892
b128c09f
BB
5893 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5894
5895 config = spa_config_generate(spa, spa->spa_root_vdev,
5896 dmu_tx_get_txg(tx), B_FALSE);
5897
ea0b2538
GW
5898 /*
5899 * If we're upgrading the spa version then make sure that
5900 * the config object gets updated with the correct version.
5901 */
5902 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
5903 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5904 spa->spa_uberblock.ub_version);
5905
b128c09f 5906 spa_config_exit(spa, SCL_STATE, FTAG);
34dc7c2f
BB
5907
5908 if (spa->spa_config_syncing)
5909 nvlist_free(spa->spa_config_syncing);
5910 spa->spa_config_syncing = config;
5911
5912 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5913}
5914
9ae529ec 5915static void
13fe0198 5916spa_sync_version(void *arg, dmu_tx_t *tx)
9ae529ec 5917{
13fe0198
MA
5918 uint64_t *versionp = arg;
5919 uint64_t version = *versionp;
5920 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
9ae529ec
CS
5921
5922 /*
5923 * Setting the version is special cased when first creating the pool.
5924 */
5925 ASSERT(tx->tx_txg != TXG_INITIAL);
5926
8dca0a9a 5927 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
9ae529ec
CS
5928 ASSERT(version >= spa_version(spa));
5929
5930 spa->spa_uberblock.ub_version = version;
5931 vdev_config_dirty(spa->spa_root_vdev);
6f1ffb06 5932 spa_history_log_internal(spa, "set", tx, "version=%lld", version);
9ae529ec
CS
5933}
5934
34dc7c2f
BB
5935/*
5936 * Set zpool properties.
5937 */
5938static void
13fe0198 5939spa_sync_props(void *arg, dmu_tx_t *tx)
34dc7c2f 5940{
13fe0198
MA
5941 nvlist_t *nvp = arg;
5942 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
34dc7c2f 5943 objset_t *mos = spa->spa_meta_objset;
9ae529ec 5944 nvpair_t *elem = NULL;
b128c09f
BB
5945
5946 mutex_enter(&spa->spa_props_lock);
34dc7c2f 5947
34dc7c2f 5948 while ((elem = nvlist_next_nvpair(nvp, elem))) {
9ae529ec
CS
5949 uint64_t intval;
5950 char *strval, *fname;
5951 zpool_prop_t prop;
5952 const char *propname;
5953 zprop_type_t proptype;
5954 zfeature_info_t *feature;
5955
5956 prop = zpool_name_to_prop(nvpair_name(elem));
5957 switch ((int)prop) {
5958 case ZPROP_INVAL:
5959 /*
5960 * We checked this earlier in spa_prop_validate().
5961 */
5962 ASSERT(zpool_prop_feature(nvpair_name(elem)));
5963
5964 fname = strchr(nvpair_name(elem), '@') + 1;
5965 VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
5966
5967 spa_feature_enable(spa, feature, tx);
6f1ffb06
MA
5968 spa_history_log_internal(spa, "set", tx,
5969 "%s=enabled", nvpair_name(elem));
9ae529ec
CS
5970 break;
5971
34dc7c2f 5972 case ZPOOL_PROP_VERSION:
9ae529ec 5973 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
34dc7c2f 5974 /*
9ae529ec
CS
5975			 * The version is synced separately before other
5976 * properties and should be correct by now.
34dc7c2f 5977 */
9ae529ec 5978 ASSERT3U(spa_version(spa), >=, intval);
34dc7c2f
BB
5979 break;
5980
5981 case ZPOOL_PROP_ALTROOT:
5982 /*
5983 * 'altroot' is a non-persistent property. It should
5984 * have been set temporarily at creation or import time.
5985 */
5986 ASSERT(spa->spa_root != NULL);
5987 break;
5988
572e2857 5989 case ZPOOL_PROP_READONLY:
34dc7c2f
BB
5990 case ZPOOL_PROP_CACHEFILE:
5991 /*
572e2857
BB
5992			 * 'readonly' and 'cachefile' are also non-persistent
5993 * properties.
34dc7c2f 5994 */
34dc7c2f 5995 break;
d96eb2b1
DM
5996 case ZPOOL_PROP_COMMENT:
5997 VERIFY(nvpair_value_string(elem, &strval) == 0);
5998 if (spa->spa_comment != NULL)
5999 spa_strfree(spa->spa_comment);
6000 spa->spa_comment = spa_strdup(strval);
6001 /*
6002 * We need to dirty the configuration on all the vdevs
6003 * so that their labels get updated. It's unnecessary
6004 * to do this for pool creation since the vdev's
6005			 * configuration has already been dirtied.
6006 */
6007 if (tx->tx_txg != TXG_INITIAL)
6008 vdev_config_dirty(spa->spa_root_vdev);
6f1ffb06
MA
6009 spa_history_log_internal(spa, "set", tx,
6010 "%s=%s", nvpair_name(elem), strval);
d96eb2b1 6011 break;
34dc7c2f
BB
6012 default:
6013 /*
6014 * Set pool property values in the poolprops mos object.
6015 */
34dc7c2f 6016 if (spa->spa_pool_props_object == 0) {
9ae529ec
CS
6017 spa->spa_pool_props_object =
6018 zap_create_link(mos, DMU_OT_POOL_PROPS,
34dc7c2f 6019 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
9ae529ec 6020 tx);
34dc7c2f 6021 }
34dc7c2f
BB
6022
6023 /* normalize the property name */
6024 propname = zpool_prop_to_name(prop);
6025 proptype = zpool_prop_get_type(prop);
6026
6027 if (nvpair_type(elem) == DATA_TYPE_STRING) {
6028 ASSERT(proptype == PROP_TYPE_STRING);
6029 VERIFY(nvpair_value_string(elem, &strval) == 0);
6030 VERIFY(zap_update(mos,
6031 spa->spa_pool_props_object, propname,
6032 1, strlen(strval) + 1, strval, tx) == 0);
6f1ffb06
MA
6033 spa_history_log_internal(spa, "set", tx,
6034 "%s=%s", nvpair_name(elem), strval);
34dc7c2f
BB
6035 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6036 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
6037
6038 if (proptype == PROP_TYPE_INDEX) {
6039 const char *unused;
6040 VERIFY(zpool_prop_index_to_string(
6041 prop, intval, &unused) == 0);
6042 }
6043 VERIFY(zap_update(mos,
6044 spa->spa_pool_props_object, propname,
6045 8, 1, &intval, tx) == 0);
6f1ffb06
MA
6046 spa_history_log_internal(spa, "set", tx,
6047 "%s=%lld", nvpair_name(elem), intval);
34dc7c2f
BB
6048 } else {
6049 ASSERT(0); /* not allowed */
6050 }
6051
6052 switch (prop) {
6053 case ZPOOL_PROP_DELEGATION:
6054 spa->spa_delegation = intval;
6055 break;
6056 case ZPOOL_PROP_BOOTFS:
6057 spa->spa_bootfs = intval;
6058 break;
6059 case ZPOOL_PROP_FAILUREMODE:
6060 spa->spa_failmode = intval;
6061 break;
9babb374
BB
6062 case ZPOOL_PROP_AUTOEXPAND:
6063 spa->spa_autoexpand = intval;
428870ff
BB
6064 if (tx->tx_txg != TXG_INITIAL)
6065 spa_async_request(spa,
6066 SPA_ASYNC_AUTOEXPAND);
6067 break;
6068 case ZPOOL_PROP_DEDUPDITTO:
6069 spa->spa_dedup_ditto = intval;
9babb374 6070 break;
34dc7c2f
BB
6071 default:
6072 break;
6073 }
6074 }
6075
34dc7c2f 6076 }
b128c09f
BB
6077
6078 mutex_exit(&spa->spa_props_lock);
34dc7c2f
BB
6079}
6080
428870ff
BB
6081/*
6082 * Perform one-time upgrade on-disk changes. spa_version() does not
6083 * reflect the new version this txg, so there must be no changes this
6084 * txg to anything that the upgrade code depends on after it executes.
6085 * Therefore this must be called after dsl_pool_sync() does the sync
6086 * tasks.
6087 */
6088static void
6089spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6090{
6091 dsl_pool_t *dp = spa->spa_dsl_pool;
6092
6093 ASSERT(spa->spa_sync_pass == 1);
6094
13fe0198
MA
6095 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6096
428870ff
BB
6097 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6098 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6099 dsl_pool_create_origin(dp, tx);
6100
6101 /* Keeping the origin open increases spa_minref */
6102 spa->spa_minref += 3;
6103 }
6104
6105 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6106 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6107 dsl_pool_upgrade_clones(dp, tx);
6108 }
6109
6110 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6111 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6112 dsl_pool_upgrade_dir_clones(dp, tx);
6113
6114 /* Keeping the freedir open increases spa_minref */
6115 spa->spa_minref += 3;
6116 }
9ae529ec
CS
6117
6118 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6119 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6120 spa_feature_create_zap_objects(spa, tx);
6121 }
13fe0198 6122 rrw_exit(&dp->dp_config_rwlock, FTAG);
428870ff
BB
6123}
6124
34dc7c2f
BB
6125/*
6126 * Sync the specified transaction group. New blocks may be dirtied as
6127 * part of the process, so we iterate until it converges.
6128 */
6129void
6130spa_sync(spa_t *spa, uint64_t txg)
6131{
6132 dsl_pool_t *dp = spa->spa_dsl_pool;
6133 objset_t *mos = spa->spa_meta_objset;
428870ff 6134 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
34dc7c2f
BB
6135 vdev_t *rvd = spa->spa_root_vdev;
6136 vdev_t *vd;
34dc7c2f 6137 dmu_tx_t *tx;
b128c09f 6138 int error;
d6320ddb 6139 int c;
34dc7c2f 6140
572e2857
BB
6141 VERIFY(spa_writeable(spa));
6142
34dc7c2f
BB
6143 /*
6144 * Lock out configuration changes.
6145 */
b128c09f 6146 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
34dc7c2f
BB
6147
6148 spa->spa_syncing_txg = txg;
6149 spa->spa_sync_pass = 0;
6150
b128c09f
BB
6151 /*
6152 * If there are any pending vdev state changes, convert them
6153 * into config changes that go out with this transaction group.
6154 */
6155 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
fb5f0bc8
BB
6156 while (list_head(&spa->spa_state_dirty_list) != NULL) {
6157 /*
6158 * We need the write lock here because, for aux vdevs,
6159 * calling vdev_config_dirty() modifies sav_config.
6160 * This is ugly and will become unnecessary when we
6161 * eliminate the aux vdev wart by integrating all vdevs
6162 * into the root vdev tree.
6163 */
6164 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6165 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6166 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6167 vdev_state_clean(vd);
6168 vdev_config_dirty(vd);
6169 }
6170 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6171 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
b128c09f
BB
6172 }
6173 spa_config_exit(spa, SCL_STATE, FTAG);
6174
34dc7c2f
BB
6175 tx = dmu_tx_create_assigned(dp, txg);
6176
cc92e9d0
GW
6177 spa->spa_sync_starttime = gethrtime();
6178 taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
6179 spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
cbfa294d 6180 spa_deadman, spa, TQ_PUSHPAGE, ddi_get_lbolt() +
cc92e9d0
GW
6181 NSEC_TO_TICK(spa->spa_deadman_synctime));
6182
34dc7c2f
BB
6183 /*
6184 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6185 * set spa_deflate if we have no raid-z vdevs.
6186 */
6187 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6188 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6189 int i;
6190
6191 for (i = 0; i < rvd->vdev_children; i++) {
6192 vd = rvd->vdev_child[i];
6193 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6194 break;
6195 }
6196 if (i == rvd->vdev_children) {
6197 spa->spa_deflate = TRUE;
6198 VERIFY(0 == zap_add(spa->spa_meta_objset,
6199 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6200 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6201 }
6202 }
6203
6204 /*
428870ff
BB
6205 * If anything has changed in this txg, or if someone is waiting
6206 * for this txg to sync (eg, spa_vdev_remove()), push the
6207 * deferred frees from the previous txg. If not, leave them
6208 * alone so that we don't generate work on an otherwise idle
6209 * system.
34dc7c2f
BB
6210 */
6211 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6212 !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
428870ff
BB
6213 !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6214 ((dsl_scan_active(dp->dp_scan) ||
6215 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
e8b96c60 6216 spa_sync_deferred_frees(spa, tx);
428870ff 6217 }
34dc7c2f
BB
6218
6219 /*
6220 * Iterate to convergence.
6221 */
6222 do {
428870ff 6223 int pass = ++spa->spa_sync_pass;
34dc7c2f
BB
6224
6225 spa_sync_config_object(spa, tx);
6226 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6227 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6228 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6229 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6230 spa_errlog_sync(spa, txg);
6231 dsl_pool_sync(dp, txg);
6232
55d85d5a 6233 if (pass < zfs_sync_pass_deferred_free) {
e8b96c60 6234 spa_sync_frees(spa, free_bpl, tx);
428870ff
BB
6235 } else {
6236 bplist_iterate(free_bpl, bpobj_enqueue_cb,
e8b96c60 6237 &spa->spa_deferred_bpobj, tx);
34dc7c2f
BB
6238 }
6239
428870ff
BB
6240 ddt_sync(spa, txg);
6241 dsl_scan_sync(dp, tx);
34dc7c2f 6242
c65aa5b2 6243 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)))
428870ff
BB
6244 vdev_sync(vd, txg);
6245
6246 if (pass == 1)
6247 spa_sync_upgrades(spa, tx);
34dc7c2f 6248
428870ff 6249 } while (dmu_objset_is_dirty(mos, txg));
34dc7c2f
BB
6250
6251 /*
6252 * Rewrite the vdev configuration (which includes the uberblock)
6253 * to commit the transaction group.
6254 *
6255 * If there are no dirty vdevs, we sync the uberblock to a few
6256 * random top-level vdevs that are known to be visible in the
b128c09f
BB
6257 * config cache (see spa_vdev_add() for a complete description).
6258 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
34dc7c2f 6259 */
b128c09f
BB
6260 for (;;) {
6261 /*
6262 * We hold SCL_STATE to prevent vdev open/close/etc.
6263 * while we're attempting to write the vdev labels.
6264 */
6265 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6266
6267 if (list_is_empty(&spa->spa_config_dirty_list)) {
6268 vdev_t *svd[SPA_DVAS_PER_BP];
6269 int svdcount = 0;
6270 int children = rvd->vdev_children;
6271 int c0 = spa_get_random(children);
b128c09f 6272
d6320ddb 6273 for (c = 0; c < children; c++) {
b128c09f
BB
6274 vd = rvd->vdev_child[(c0 + c) % children];
6275 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6276 continue;
6277 svd[svdcount++] = vd;
6278 if (svdcount == SPA_DVAS_PER_BP)
6279 break;
6280 }
9babb374
BB
6281 error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
6282 if (error != 0)
6283 error = vdev_config_sync(svd, svdcount, txg,
6284 B_TRUE);
b128c09f
BB
6285 } else {
6286 error = vdev_config_sync(rvd->vdev_child,
9babb374
BB
6287 rvd->vdev_children, txg, B_FALSE);
6288 if (error != 0)
6289 error = vdev_config_sync(rvd->vdev_child,
6290 rvd->vdev_children, txg, B_TRUE);
34dc7c2f 6291 }
34dc7c2f 6292
3bc7e0fb
GW
6293 if (error == 0)
6294 spa->spa_last_synced_guid = rvd->vdev_guid;
6295
b128c09f
BB
6296 spa_config_exit(spa, SCL_STATE, FTAG);
6297
6298 if (error == 0)
6299 break;
6300 zio_suspend(spa, NULL);
6301 zio_resume_wait(spa);
6302 }
34dc7c2f
BB
6303 dmu_tx_commit(tx);
6304
cc92e9d0
GW
6305 taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
6306 spa->spa_deadman_tqid = 0;
6307
34dc7c2f
BB
6308 /*
6309 * Clear the dirty config list.
6310 */
b128c09f 6311 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
34dc7c2f
BB
6312 vdev_config_clean(vd);
6313
6314 /*
6315 * Now that the new config has synced transactionally,
6316 * let it become visible to the config cache.
6317 */
6318 if (spa->spa_config_syncing != NULL) {
6319 spa_config_set(spa, spa->spa_config_syncing);
6320 spa->spa_config_txg = txg;
6321 spa->spa_config_syncing = NULL;
6322 }
6323
34dc7c2f 6324 spa->spa_ubsync = spa->spa_uberblock;
34dc7c2f 6325
428870ff 6326 dsl_pool_sync_done(dp, txg);
34dc7c2f
BB
6327
6328 /*
6329 * Update usable space statistics.
6330 */
c65aa5b2 6331 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))))
34dc7c2f
BB
6332 vdev_sync_done(vd, txg);
6333
428870ff
BB
6334 spa_update_dspace(spa);
6335
34dc7c2f
BB
6336 /*
6337 * It had better be the case that we didn't dirty anything
6338 * since vdev_config_sync().
6339 */
6340 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6341 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6342 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
428870ff
BB
6343
6344 spa->spa_sync_pass = 0;
34dc7c2f 6345
b128c09f 6346 spa_config_exit(spa, SCL_CONFIG, FTAG);
34dc7c2f 6347
428870ff
BB
6348 spa_handle_ignored_writes(spa);
6349
34dc7c2f
BB
6350 /*
6351 * If any async tasks have been requested, kick them off.
6352 */
6353 spa_async_dispatch(spa);
6354}
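/*
 * spa_sync() is normally called once per transaction group by the pool's
 * txg sync thread; the convergence loop above keeps iterating as long as
 * the current pass dirtied the MOS again.
 */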
6355
6356/*
6357 * Sync all pools. We don't want to hold the namespace lock across these
6358 * operations, so we take a reference on the spa_t and drop the lock during the
6359 * sync.
6360 */
6361void
6362spa_sync_allpools(void)
6363{
6364 spa_t *spa = NULL;
6365 mutex_enter(&spa_namespace_lock);
6366 while ((spa = spa_next(spa)) != NULL) {
572e2857
BB
6367 if (spa_state(spa) != POOL_STATE_ACTIVE ||
6368 !spa_writeable(spa) || spa_suspended(spa))
34dc7c2f
BB
6369 continue;
6370 spa_open_ref(spa, FTAG);
6371 mutex_exit(&spa_namespace_lock);
6372 txg_wait_synced(spa_get_dsl(spa), 0);
6373 mutex_enter(&spa_namespace_lock);
6374 spa_close(spa, FTAG);
6375 }
6376 mutex_exit(&spa_namespace_lock);
6377}
6378
6379/*
6380 * ==========================================================================
6381 * Miscellaneous routines
6382 * ==========================================================================
6383 */
6384
6385/*
6386 * Remove all pools in the system.
6387 */
6388void
6389spa_evict_all(void)
6390{
6391 spa_t *spa;
6392
6393 /*
6394 * Remove all cached state. All pools should be closed now,
6395 * so every spa in the AVL tree should be unreferenced.
6396 */
6397 mutex_enter(&spa_namespace_lock);
6398 while ((spa = spa_next(NULL)) != NULL) {
6399 /*
6400 * Stop async tasks. The async thread may need to detach
6401 * a device that's been replaced, which requires grabbing
6402 * spa_namespace_lock, so we must drop it here.
6403 */
6404 spa_open_ref(spa, FTAG);
6405 mutex_exit(&spa_namespace_lock);
6406 spa_async_suspend(spa);
6407 mutex_enter(&spa_namespace_lock);
34dc7c2f
BB
6408 spa_close(spa, FTAG);
6409
6410 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6411 spa_unload(spa);
6412 spa_deactivate(spa);
6413 }
6414 spa_remove(spa);
6415 }
6416 mutex_exit(&spa_namespace_lock);
6417}
6418
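/*
 * Look up a vdev anywhere in the pool by guid.  If 'aux' is set, the
 * L2ARC and hot spare auxiliary vdev lists are searched as well.
 */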
6419vdev_t *
9babb374 6420spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
34dc7c2f 6421{
b128c09f
BB
6422 vdev_t *vd;
6423 int i;
6424
6425 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6426 return (vd);
6427
9babb374 6428 if (aux) {
b128c09f
BB
6429 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6430 vd = spa->spa_l2cache.sav_vdevs[i];
9babb374
BB
6431 if (vd->vdev_guid == guid)
6432 return (vd);
6433 }
6434
6435 for (i = 0; i < spa->spa_spares.sav_count; i++) {
6436 vd = spa->spa_spares.sav_vdevs[i];
b128c09f
BB
6437 if (vd->vdev_guid == guid)
6438 return (vd);
6439 }
6440 }
6441
6442 return (NULL);
34dc7c2f
BB
6443}
6444
6445void
6446spa_upgrade(spa_t *spa, uint64_t version)
6447{
572e2857
BB
6448 ASSERT(spa_writeable(spa));
6449
b128c09f 6450 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
34dc7c2f
BB
6451
6452 /*
6453 * This should only be called for a non-faulted pool, and since a
6454 * future version would result in an unopenable pool, this shouldn't be
6455 * possible.
6456 */
8dca0a9a 6457 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
34dc7c2f
BB
6458 ASSERT(version >= spa->spa_uberblock.ub_version);
6459
6460 spa->spa_uberblock.ub_version = version;
6461 vdev_config_dirty(spa->spa_root_vdev);
6462
b128c09f 6463 spa_config_exit(spa, SCL_ALL, FTAG);
34dc7c2f
BB
6464
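/*
 * Wait for the dirtied configuration, and with it the new on-disk
 * version, to be synced out before returning.
 */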
6465 txg_wait_synced(spa_get_dsl(spa), 0);
6466}
6467
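/*
 * Check whether the given guid matches one of this pool's configured
 * (or still-pending) hot spares.
 */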
6468boolean_t
6469spa_has_spare(spa_t *spa, uint64_t guid)
6470{
6471 int i;
6472 uint64_t spareguid;
6473 spa_aux_vdev_t *sav = &spa->spa_spares;
6474
6475 for (i = 0; i < sav->sav_count; i++)
6476 if (sav->sav_vdevs[i]->vdev_guid == guid)
6477 return (B_TRUE);
6478
6479 for (i = 0; i < sav->sav_npending; i++) {
6480 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6481 &spareguid) == 0 && spareguid == guid)
6482 return (B_TRUE);
6483 }
6484
6485 return (B_FALSE);
6486}
6487
b128c09f
BB
6488/*
6489 * Check if a pool has an active shared spare device.
6490 * Note: reference count of an active spare is 2, as a spare and as a replacement
6491 */
6492static boolean_t
6493spa_has_active_shared_spare(spa_t *spa)
6494{
6495 int i, refcnt;
6496 uint64_t pool;
6497 spa_aux_vdev_t *sav = &spa->spa_spares;
6498
6499 for (i = 0; i < sav->sav_count; i++) {
6500 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6501 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6502 refcnt > 2)
6503 return (B_TRUE);
6504 }
6505
6506 return (B_FALSE);
6507}
6508
34dc7c2f 6509/*
26685276 6510 * Post a FM_EREPORT_ZFS_* event from sys/fm/fs/zfs.h. The payload will be
34dc7c2f
BB
6511 * filled in from the spa and (optionally) the vdev. This doesn't do anything
6512 * in the userland libzpool, as we don't want consumers to misinterpret ztest
6513 * or zdb as real changes.
6514 */
6515void
6516spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6517{
6518#ifdef _KERNEL
26685276 6519 zfs_ereport_post(name, spa, vd, NULL, 0, 0);
34dc7c2f
BB
6520#endif
6521}
c28b2279
BB
6522
6523#if defined(_KERNEL) && defined(HAVE_SPL)
6524/* state manipulation functions */
6525EXPORT_SYMBOL(spa_open);
6526EXPORT_SYMBOL(spa_open_rewind);
6527EXPORT_SYMBOL(spa_get_stats);
6528EXPORT_SYMBOL(spa_create);
6529EXPORT_SYMBOL(spa_import_rootpool);
6530EXPORT_SYMBOL(spa_import);
6531EXPORT_SYMBOL(spa_tryimport);
6532EXPORT_SYMBOL(spa_destroy);
6533EXPORT_SYMBOL(spa_export);
6534EXPORT_SYMBOL(spa_reset);
6535EXPORT_SYMBOL(spa_async_request);
6536EXPORT_SYMBOL(spa_async_suspend);
6537EXPORT_SYMBOL(spa_async_resume);
6538EXPORT_SYMBOL(spa_inject_addref);
6539EXPORT_SYMBOL(spa_inject_delref);
6540EXPORT_SYMBOL(spa_scan_stat_init);
6541EXPORT_SYMBOL(spa_scan_get_stats);
6542
6543/* device manipulation */
6544EXPORT_SYMBOL(spa_vdev_add);
6545EXPORT_SYMBOL(spa_vdev_attach);
6546EXPORT_SYMBOL(spa_vdev_detach);
6547EXPORT_SYMBOL(spa_vdev_remove);
6548EXPORT_SYMBOL(spa_vdev_setpath);
6549EXPORT_SYMBOL(spa_vdev_setfru);
6550EXPORT_SYMBOL(spa_vdev_split_mirror);
6551
6552/* spare state (which is global across all pools) */
6553EXPORT_SYMBOL(spa_spare_add);
6554EXPORT_SYMBOL(spa_spare_remove);
6555EXPORT_SYMBOL(spa_spare_exists);
6556EXPORT_SYMBOL(spa_spare_activate);
6557
6558/* L2ARC state (which is global across all pools) */
6559EXPORT_SYMBOL(spa_l2cache_add);
6560EXPORT_SYMBOL(spa_l2cache_remove);
6561EXPORT_SYMBOL(spa_l2cache_exists);
6562EXPORT_SYMBOL(spa_l2cache_activate);
6563EXPORT_SYMBOL(spa_l2cache_drop);
6564
6565/* scanning */
6566EXPORT_SYMBOL(spa_scan);
6567EXPORT_SYMBOL(spa_scan_stop);
6568
6569/* spa syncing */
6570EXPORT_SYMBOL(spa_sync); /* only for DMU use */
6571EXPORT_SYMBOL(spa_sync_allpools);
6572
6573/* properties */
6574EXPORT_SYMBOL(spa_prop_set);
6575EXPORT_SYMBOL(spa_prop_get);
6576EXPORT_SYMBOL(spa_prop_clear_bootfs);
6577
6578/* asynchronous event notification */
6579EXPORT_SYMBOL(spa_event_notify);
6580#endif