1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
28 */
29
30 /*
31 * SPA: Storage Pool Allocator
32 *
33 * This file contains all the routines used when modifying on-disk SPA state.
34 * This includes opening, importing, destroying, exporting a pool, and syncing a
35 * pool.
36 */
37
38 #include <sys/zfs_context.h>
39 #include <sys/fm/fs/zfs.h>
40 #include <sys/spa_impl.h>
41 #include <sys/zio.h>
42 #include <sys/zio_checksum.h>
43 #include <sys/dmu.h>
44 #include <sys/dmu_tx.h>
45 #include <sys/zap.h>
46 #include <sys/zil.h>
47 #include <sys/ddt.h>
48 #include <sys/vdev_impl.h>
49 #include <sys/vdev_disk.h>
50 #include <sys/metaslab.h>
51 #include <sys/metaslab_impl.h>
52 #include <sys/uberblock_impl.h>
53 #include <sys/txg.h>
54 #include <sys/avl.h>
55 #include <sys/dmu_traverse.h>
56 #include <sys/dmu_objset.h>
57 #include <sys/unique.h>
58 #include <sys/dsl_pool.h>
59 #include <sys/dsl_dataset.h>
60 #include <sys/dsl_dir.h>
61 #include <sys/dsl_prop.h>
62 #include <sys/dsl_synctask.h>
63 #include <sys/fs/zfs.h>
64 #include <sys/arc.h>
65 #include <sys/callb.h>
66 #include <sys/systeminfo.h>
67 #include <sys/spa_boot.h>
68 #include <sys/zfs_ioctl.h>
69 #include <sys/dsl_scan.h>
70 #include <sys/zfeature.h>
71 #include <sys/dsl_destroy.h>
72 #include <sys/zvol.h>
73
74 #ifdef _KERNEL
75 #include <sys/bootprops.h>
76 #include <sys/callb.h>
77 #include <sys/cpupart.h>
78 #include <sys/pool.h>
79 #include <sys/sysdc.h>
80 #include <sys/zone.h>
81 #endif /* _KERNEL */
82
83 #include "zfs_prop.h"
84 #include "zfs_comutil.h"
85
86 /*
87 * The interval, in seconds, at which failed configuration cache file writes
88 * should be retried.
89 */
90 static int zfs_ccw_retry_interval = 300;
91
92 typedef enum zti_modes {
93 ZTI_MODE_FIXED, /* value is # of threads (min 1) */
94 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
95 ZTI_MODE_NULL, /* don't create a taskq */
96 ZTI_NMODES
97 } zti_modes_t;
98
99 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
100 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
101 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
102 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
103
104 #define ZTI_N(n) ZTI_P(n, 1)
105 #define ZTI_ONE ZTI_N(1)
106
107 typedef struct zio_taskq_info {
108 zti_modes_t zti_mode;
109 uint_t zti_value;
110 uint_t zti_count;
111 } zio_taskq_info_t;
112
113 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
114 "iss", "iss_h", "int", "int_h"
115 };
116
117 /*
118 * This table defines the taskq settings for each ZFS I/O type. When
119 * initializing a pool, we use this table to create an appropriately sized
120 * taskq. Some operations are low volume and therefore have a small, static
121 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
122 * macros. Other operations process a large amount of data; the ZTI_BATCH
123 * macro causes us to create a taskq oriented for throughput. Some operations
124 * are so high frequency and short-lived that the taskq itself can become a
125 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
126 * additional degree of parallelism specified by the number of threads per-
127 * taskq and the number of taskqs; when dispatching an event in this case, the
128 * particular taskq is chosen at random.
129 *
130 * The different taskq priorities are to handle the different contexts (issue
131 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
132 * need to be handled with minimum delay.
133 */
134 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
135 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
136 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
137 { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
138 { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */
139 { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
140 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
141 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
142 };
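/*
 * For illustration, a minimal sketch (not compiled) of how a few entries in
 * the table above expand, given the ZTI_* macros and the zio_taskq_info_t
 * definition earlier in this file; the field order is
 * { zti_mode, zti_value, zti_count }.
 */
#if 0	/* illustrative sketch only */
static const zio_taskq_info_t zti_examples[] = {
	ZTI_P(12, 8),	/* { ZTI_MODE_FIXED, 12, 8 }: 8 taskqs, 12 threads each */
	ZTI_N(8),	/* { ZTI_MODE_FIXED, 8, 1 }: 1 taskq with 8 threads */
	ZTI_BATCH,	/* { ZTI_MODE_BATCH, 0, 1 }: 1 throughput-oriented taskq */
	ZTI_NULL,	/* { ZTI_MODE_NULL, 0, 0 }: no taskq is created */
};
#endif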
143
144 static void spa_sync_version(void *arg, dmu_tx_t *tx);
145 static void spa_sync_props(void *arg, dmu_tx_t *tx);
146 static boolean_t spa_has_active_shared_spare(spa_t *spa);
147 static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
148 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
149 char **ereport);
150 static void spa_vdev_resilver_done(spa_t *spa);
151
152 uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
153 id_t zio_taskq_psrset_bind = PS_NONE;
154 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
155 uint_t zio_taskq_basedc = 80; /* base duty cycle */
156
157 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
158
159 /*
160 * This (illegal) pool name is used when temporarily importing a spa_t in order
161 * to get the vdev stats associated with the imported devices.
162 */
163 #define TRYIMPORT_NAME "$import"
164
165 /*
166 * ==========================================================================
167 * SPA properties routines
168 * ==========================================================================
169 */
170
171 /*
172 * Add a (source=src, propname=propval) list to an nvlist.
173 */
174 static void
175 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
176 uint64_t intval, zprop_source_t src)
177 {
178 const char *propname = zpool_prop_to_name(prop);
179 nvlist_t *propval;
180
181 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
182 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
183
184 if (strval != NULL)
185 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
186 else
187 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
188
189 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
190 nvlist_free(propval);
191 }
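/*
 * A minimal usage sketch (not compiled): each call to spa_prop_add_list()
 * adds one nested nvlist to the outer list, keyed by zpool_prop_to_name(prop),
 * holding the property source and either the string or the integer value.
 * The property and value below are arbitrary examples.
 */
#if 0	/* illustrative sketch only */
	spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE);
	/* nvl now holds: "capacity" -> { ZPROP_SOURCE = none, ZPROP_VALUE = 42 } */
#endif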
192
193 /*
194 * Get property values from the spa configuration.
195 */
196 static void
197 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
198 {
199 vdev_t *rvd = spa->spa_root_vdev;
200 dsl_pool_t *pool = spa->spa_dsl_pool;
201 uint64_t size, alloc, cap, version;
202 zprop_source_t src = ZPROP_SRC_NONE;
203 spa_config_dirent_t *dp;
204 metaslab_class_t *mc = spa_normal_class(spa);
205
206 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
207
208 if (rvd != NULL) {
209 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
210 size = metaslab_class_get_space(spa_normal_class(spa));
211 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
212 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
213 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
214 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
215 size - alloc, src);
216
217 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
218 metaslab_class_fragmentation(mc), src);
219 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
220 metaslab_class_expandable_space(mc), src);
221 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
222 (spa_mode(spa) == FREAD), src);
223
224 cap = (size == 0) ? 0 : (alloc * 100 / size);
225 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
226
227 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
228 ddt_get_pool_dedup_ratio(spa), src);
229
230 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
231 rvd->vdev_state, src);
232
233 version = spa_version(spa);
234 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
235 src = ZPROP_SRC_DEFAULT;
236 else
237 src = ZPROP_SRC_LOCAL;
238 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
239 }
240
241 if (pool != NULL) {
242 /*
243 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, so
244 * when opening pools created before that version dp_free_dir will be NULL.
245 */
246 if (pool->dp_free_dir != NULL) {
247 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
248 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
249 src);
250 } else {
251 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
252 NULL, 0, src);
253 }
254
255 if (pool->dp_leak_dir != NULL) {
256 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
257 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
258 src);
259 } else {
260 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
261 NULL, 0, src);
262 }
263 }
264
265 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
266
267 if (spa->spa_comment != NULL) {
268 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
269 0, ZPROP_SRC_LOCAL);
270 }
271
272 if (spa->spa_root != NULL)
273 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
274 0, ZPROP_SRC_LOCAL);
275
276 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
277 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
278 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
279 } else {
280 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
281 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
282 }
283
284 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
285 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
286 DNODE_MAX_SIZE, ZPROP_SRC_NONE);
287 } else {
288 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
289 DNODE_MIN_SIZE, ZPROP_SRC_NONE);
290 }
291
292 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
293 if (dp->scd_path == NULL) {
294 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
295 "none", 0, ZPROP_SRC_LOCAL);
296 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
297 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
298 dp->scd_path, 0, ZPROP_SRC_LOCAL);
299 }
300 }
301 }
302
303 /*
304 * Get zpool property values.
305 */
306 int
307 spa_prop_get(spa_t *spa, nvlist_t **nvp)
308 {
309 objset_t *mos = spa->spa_meta_objset;
310 zap_cursor_t zc;
311 zap_attribute_t za;
312 int err;
313
314 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
315 if (err)
316 return (err);
317
318 mutex_enter(&spa->spa_props_lock);
319
320 /*
321 * Get properties from the spa config.
322 */
323 spa_prop_get_config(spa, nvp);
324
325 /* If there is no pool property object, there are no more props to get. */
326 if (mos == NULL || spa->spa_pool_props_object == 0) {
327 mutex_exit(&spa->spa_props_lock);
328 goto out;
329 }
330
331 /*
332 * Get properties from the MOS pool property object.
333 */
334 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
335 (err = zap_cursor_retrieve(&zc, &za)) == 0;
336 zap_cursor_advance(&zc)) {
337 uint64_t intval = 0;
338 char *strval = NULL;
339 zprop_source_t src = ZPROP_SRC_DEFAULT;
340 zpool_prop_t prop;
341
342 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
343 continue;
344
345 switch (za.za_integer_length) {
346 case 8:
347 /* integer property */
348 if (za.za_first_integer !=
349 zpool_prop_default_numeric(prop))
350 src = ZPROP_SRC_LOCAL;
351
352 if (prop == ZPOOL_PROP_BOOTFS) {
353 dsl_pool_t *dp;
354 dsl_dataset_t *ds = NULL;
355
356 dp = spa_get_dsl(spa);
357 dsl_pool_config_enter(dp, FTAG);
358 if ((err = dsl_dataset_hold_obj(dp,
359 za.za_first_integer, FTAG, &ds))) {
360 dsl_pool_config_exit(dp, FTAG);
361 break;
362 }
363
364 strval = kmem_alloc(
365 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
366 KM_SLEEP);
367 dsl_dataset_name(ds, strval);
368 dsl_dataset_rele(ds, FTAG);
369 dsl_pool_config_exit(dp, FTAG);
370 } else {
371 strval = NULL;
372 intval = za.za_first_integer;
373 }
374
375 spa_prop_add_list(*nvp, prop, strval, intval, src);
376
377 if (strval != NULL)
378 kmem_free(strval,
379 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
380
381 break;
382
383 case 1:
384 /* string property */
385 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
386 err = zap_lookup(mos, spa->spa_pool_props_object,
387 za.za_name, 1, za.za_num_integers, strval);
388 if (err) {
389 kmem_free(strval, za.za_num_integers);
390 break;
391 }
392 spa_prop_add_list(*nvp, prop, strval, 0, src);
393 kmem_free(strval, za.za_num_integers);
394 break;
395
396 default:
397 break;
398 }
399 }
400 zap_cursor_fini(&zc);
401 mutex_exit(&spa->spa_props_lock);
402 out:
403 if (err && err != ENOENT) {
404 nvlist_free(*nvp);
405 *nvp = NULL;
406 return (err);
407 }
408
409 return (0);
410 }
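/*
 * A minimal caller-side sketch (not compiled): spa_prop_get() allocates the
 * nvlist on success, so the caller is responsible for freeing it.
 */
#if 0	/* illustrative sketch only */
	nvlist_t *props = NULL;

	if (spa_prop_get(spa, &props) == 0) {
		/* ... inspect the per-property nested nvlists ... */
		nvlist_free(props);
	}
#endif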
411
412 /*
413 * Validate the given pool properties nvlist and, where necessary, modify
414 * it into the form in which the property values will be set.
415 */
416 static int
417 spa_prop_validate(spa_t *spa, nvlist_t *props)
418 {
419 nvpair_t *elem;
420 int error = 0, reset_bootfs = 0;
421 uint64_t objnum = 0;
422 boolean_t has_feature = B_FALSE;
423
424 elem = NULL;
425 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
426 uint64_t intval;
427 char *strval, *slash, *check, *fname;
428 const char *propname = nvpair_name(elem);
429 zpool_prop_t prop = zpool_name_to_prop(propname);
430
431 switch ((int)prop) {
432 case ZPROP_INVAL:
433 if (!zpool_prop_feature(propname)) {
434 error = SET_ERROR(EINVAL);
435 break;
436 }
437
438 /*
439 * Sanitize the input.
440 */
441 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
442 error = SET_ERROR(EINVAL);
443 break;
444 }
445
446 if (nvpair_value_uint64(elem, &intval) != 0) {
447 error = SET_ERROR(EINVAL);
448 break;
449 }
450
451 if (intval != 0) {
452 error = SET_ERROR(EINVAL);
453 break;
454 }
455
456 fname = strchr(propname, '@') + 1;
457 if (zfeature_lookup_name(fname, NULL) != 0) {
458 error = SET_ERROR(EINVAL);
459 break;
460 }
461
462 has_feature = B_TRUE;
463 break;
464
465 case ZPOOL_PROP_VERSION:
466 error = nvpair_value_uint64(elem, &intval);
467 if (!error &&
468 (intval < spa_version(spa) ||
469 intval > SPA_VERSION_BEFORE_FEATURES ||
470 has_feature))
471 error = SET_ERROR(EINVAL);
472 break;
473
474 case ZPOOL_PROP_DELEGATION:
475 case ZPOOL_PROP_AUTOREPLACE:
476 case ZPOOL_PROP_LISTSNAPS:
477 case ZPOOL_PROP_AUTOEXPAND:
478 error = nvpair_value_uint64(elem, &intval);
479 if (!error && intval > 1)
480 error = SET_ERROR(EINVAL);
481 break;
482
483 case ZPOOL_PROP_BOOTFS:
484 /*
485 * If the pool version is less than SPA_VERSION_BOOTFS,
486 * or the pool is still being created (version == 0),
487 * the bootfs property cannot be set.
488 */
489 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
490 error = SET_ERROR(ENOTSUP);
491 break;
492 }
493
494 /*
495 * Make sure the vdev config is bootable
496 */
497 if (!vdev_is_bootable(spa->spa_root_vdev)) {
498 error = SET_ERROR(ENOTSUP);
499 break;
500 }
501
502 reset_bootfs = 1;
503
504 error = nvpair_value_string(elem, &strval);
505
506 if (!error) {
507 objset_t *os;
508 uint64_t propval;
509
510 if (strval == NULL || strval[0] == '\0') {
511 objnum = zpool_prop_default_numeric(
512 ZPOOL_PROP_BOOTFS);
513 break;
514 }
515
516 error = dmu_objset_hold(strval, FTAG, &os);
517 if (error)
518 break;
519
520 /*
521 * Must be ZPL, and its property settings
522 * must be supported by GRUB (compression
523 * is not gzip, and large blocks or large
524 * dnodes are not used).
525 */
526
527 if (dmu_objset_type(os) != DMU_OST_ZFS) {
528 error = SET_ERROR(ENOTSUP);
529 } else if ((error =
530 dsl_prop_get_int_ds(dmu_objset_ds(os),
531 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
532 &propval)) == 0 &&
533 !BOOTFS_COMPRESS_VALID(propval)) {
534 error = SET_ERROR(ENOTSUP);
535 } else if ((error =
536 dsl_prop_get_int_ds(dmu_objset_ds(os),
537 zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
538 &propval)) == 0 &&
539 propval > SPA_OLD_MAXBLOCKSIZE) {
540 error = SET_ERROR(ENOTSUP);
541 } else if ((error =
542 dsl_prop_get_int_ds(dmu_objset_ds(os),
543 zfs_prop_to_name(ZFS_PROP_DNODESIZE),
544 &propval)) == 0 &&
545 propval != ZFS_DNSIZE_LEGACY) {
546 error = SET_ERROR(ENOTSUP);
547 } else {
548 objnum = dmu_objset_id(os);
549 }
550 dmu_objset_rele(os, FTAG);
551 }
552 break;
553
554 case ZPOOL_PROP_FAILUREMODE:
555 error = nvpair_value_uint64(elem, &intval);
556 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
557 intval > ZIO_FAILURE_MODE_PANIC))
558 error = SET_ERROR(EINVAL);
559
560 /*
561 * This is a special case which only occurs when
562 * the pool has completely failed. This allows
563 * the user to change the in-core failmode property
564 * without syncing it out to disk (I/Os might
565 * currently be blocked). We do this by returning
566 * EIO to the caller (spa_prop_set) to trick it
567 * into thinking we encountered a property validation
568 * error.
569 */
570 if (!error && spa_suspended(spa)) {
571 spa->spa_failmode = intval;
572 error = SET_ERROR(EIO);
573 }
574 break;
575
576 case ZPOOL_PROP_CACHEFILE:
577 if ((error = nvpair_value_string(elem, &strval)) != 0)
578 break;
579
580 if (strval[0] == '\0')
581 break;
582
583 if (strcmp(strval, "none") == 0)
584 break;
585
586 if (strval[0] != '/') {
587 error = SET_ERROR(EINVAL);
588 break;
589 }
590
591 slash = strrchr(strval, '/');
592 ASSERT(slash != NULL);
593
594 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
595 strcmp(slash, "/..") == 0)
596 error = SET_ERROR(EINVAL);
597 break;
598
599 case ZPOOL_PROP_COMMENT:
600 if ((error = nvpair_value_string(elem, &strval)) != 0)
601 break;
602 for (check = strval; *check != '\0'; check++) {
603 if (!isprint(*check)) {
604 error = SET_ERROR(EINVAL);
605 break;
606 }
607 }
608 if (strlen(strval) > ZPROP_MAX_COMMENT)
609 error = SET_ERROR(E2BIG);
610 break;
611
612 case ZPOOL_PROP_DEDUPDITTO:
613 if (spa_version(spa) < SPA_VERSION_DEDUP)
614 error = SET_ERROR(ENOTSUP);
615 else
616 error = nvpair_value_uint64(elem, &intval);
617 if (error == 0 &&
618 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
619 error = SET_ERROR(EINVAL);
620 break;
621
622 default:
623 break;
624 }
625
626 if (error)
627 break;
628 }
629
630 if (!error && reset_bootfs) {
631 error = nvlist_remove(props,
632 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
633
634 if (!error) {
635 error = nvlist_add_uint64(props,
636 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
637 }
638 }
639
640 return (error);
641 }
642
643 void
644 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
645 {
646 char *cachefile;
647 spa_config_dirent_t *dp;
648
649 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
650 &cachefile) != 0)
651 return;
652
653 dp = kmem_alloc(sizeof (spa_config_dirent_t),
654 KM_SLEEP);
655
656 if (cachefile[0] == '\0')
657 dp->scd_path = spa_strdup(spa_config_path);
658 else if (strcmp(cachefile, "none") == 0)
659 dp->scd_path = NULL;
660 else
661 dp->scd_path = spa_strdup(cachefile);
662
663 list_insert_head(&spa->spa_config_list, dp);
664 if (need_sync)
665 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
666 }
667
668 int
669 spa_prop_set(spa_t *spa, nvlist_t *nvp)
670 {
671 int error;
672 nvpair_t *elem = NULL;
673 boolean_t need_sync = B_FALSE;
674
675 if ((error = spa_prop_validate(spa, nvp)) != 0)
676 return (error);
677
678 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
679 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
680
681 if (prop == ZPOOL_PROP_CACHEFILE ||
682 prop == ZPOOL_PROP_ALTROOT ||
683 prop == ZPOOL_PROP_READONLY)
684 continue;
685
686 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
687 uint64_t ver;
688
689 if (prop == ZPOOL_PROP_VERSION) {
690 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
691 } else {
692 ASSERT(zpool_prop_feature(nvpair_name(elem)));
693 ver = SPA_VERSION_FEATURES;
694 need_sync = B_TRUE;
695 }
696
697 /* Save time if the version is already set. */
698 if (ver == spa_version(spa))
699 continue;
700
701 /*
702 * In addition to the pool directory object, we might
703 * create the pool properties object, the features for
704 * read object, the features for write object, or the
705 * feature descriptions object.
706 */
707 error = dsl_sync_task(spa->spa_name, NULL,
708 spa_sync_version, &ver,
709 6, ZFS_SPACE_CHECK_RESERVED);
710 if (error)
711 return (error);
712 continue;
713 }
714
715 need_sync = B_TRUE;
716 break;
717 }
718
719 if (need_sync) {
720 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
721 nvp, 6, ZFS_SPACE_CHECK_RESERVED));
722 }
723
724 return (0);
725 }
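/*
 * A minimal sketch (not compiled) of building a props nvlist for
 * spa_prop_set(); the comment text is an arbitrary example.  Entries are
 * keyed by zpool_prop_to_name(), exactly as spa_prop_validate() expects.
 */
#if 0	/* illustrative sketch only */
	nvlist_t *props;
	int err;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_COMMENT), "scratch pool") == 0);
	err = spa_prop_set(spa, props);
	nvlist_free(props);
#endif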
726
727 /*
728 * If the bootfs property value is dsobj, clear it.
729 */
730 void
731 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
732 {
733 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
734 VERIFY(zap_remove(spa->spa_meta_objset,
735 spa->spa_pool_props_object,
736 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
737 spa->spa_bootfs = 0;
738 }
739 }
740
741 /*ARGSUSED*/
742 static int
743 spa_change_guid_check(void *arg, dmu_tx_t *tx)
744 {
745 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
746 vdev_t *rvd = spa->spa_root_vdev;
747 uint64_t vdev_state;
748 ASSERTV(uint64_t *newguid = arg);
749
750 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
751 vdev_state = rvd->vdev_state;
752 spa_config_exit(spa, SCL_STATE, FTAG);
753
754 if (vdev_state != VDEV_STATE_HEALTHY)
755 return (SET_ERROR(ENXIO));
756
757 ASSERT3U(spa_guid(spa), !=, *newguid);
758
759 return (0);
760 }
761
762 static void
763 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
764 {
765 uint64_t *newguid = arg;
766 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
767 uint64_t oldguid;
768 vdev_t *rvd = spa->spa_root_vdev;
769
770 oldguid = spa_guid(spa);
771
772 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
773 rvd->vdev_guid = *newguid;
774 rvd->vdev_guid_sum += (*newguid - oldguid);
775 vdev_config_dirty(rvd);
776 spa_config_exit(spa, SCL_STATE, FTAG);
777
778 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
779 oldguid, *newguid);
780 }
781
782 /*
783 * Change the GUID for the pool. This is done so that we can later
784 * re-import a pool built from a clone of our own vdevs. We will modify
785 * the root vdev's guid, our own pool guid, and then mark all of our
786 * vdevs dirty. Note that we must make sure that all our vdevs are
787 * online when we do this, or else any vdevs that weren't present
788 * would be orphaned from our pool. We are also going to issue a
789 * sysevent to update any watchers.
790 */
791 int
792 spa_change_guid(spa_t *spa)
793 {
794 int error;
795 uint64_t guid;
796
797 mutex_enter(&spa->spa_vdev_top_lock);
798 mutex_enter(&spa_namespace_lock);
799 guid = spa_generate_guid(NULL);
800
801 error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
802 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
803
804 if (error == 0) {
805 spa_config_sync(spa, B_FALSE, B_TRUE);
806 spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID);
807 }
808
809 mutex_exit(&spa_namespace_lock);
810 mutex_exit(&spa->spa_vdev_top_lock);
811
812 return (error);
813 }
814
815 /*
816 * ==========================================================================
817 * SPA state manipulation (open/create/destroy/import/export)
818 * ==========================================================================
819 */
820
821 static int
822 spa_error_entry_compare(const void *a, const void *b)
823 {
824 spa_error_entry_t *sa = (spa_error_entry_t *)a;
825 spa_error_entry_t *sb = (spa_error_entry_t *)b;
826 int ret;
827
828 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
829 sizeof (zbookmark_phys_t));
830
831 if (ret < 0)
832 return (-1);
833 else if (ret > 0)
834 return (1);
835 else
836 return (0);
837 }
838
839 /*
840 * Utility function which retrieves copies of the current logs and
841 * re-initializes them in the process.
842 */
843 void
844 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
845 {
846 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
847
848 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
849 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
850
851 avl_create(&spa->spa_errlist_scrub,
852 spa_error_entry_compare, sizeof (spa_error_entry_t),
853 offsetof(spa_error_entry_t, se_avl));
854 avl_create(&spa->spa_errlist_last,
855 spa_error_entry_compare, sizeof (spa_error_entry_t),
856 offsetof(spa_error_entry_t, se_avl));
857 }
858
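/*
 * Note on naming (a sketch of the convention implemented below): taskqs are
 * named "<zio type>_<taskq type>", with a numeric suffix appended when a
 * type/queue pair uses more than one taskq.  With the stock zio_type_name[]
 * strings from zio.c (assumed here), the WRITE/INTR entry ZTI_P(12, 8) would
 * produce "z_wr_int_0" ... "z_wr_int_7", each with twelve threads.
 */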
859 static void
860 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
861 {
862 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
863 enum zti_modes mode = ztip->zti_mode;
864 uint_t value = ztip->zti_value;
865 uint_t count = ztip->zti_count;
866 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
867 char name[32];
868 uint_t i, flags = TASKQ_DYNAMIC;
869 boolean_t batch = B_FALSE;
870
871 if (mode == ZTI_MODE_NULL) {
872 tqs->stqs_count = 0;
873 tqs->stqs_taskq = NULL;
874 return;
875 }
876
877 ASSERT3U(count, >, 0);
878
879 tqs->stqs_count = count;
880 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
881
882 switch (mode) {
883 case ZTI_MODE_FIXED:
884 ASSERT3U(value, >=, 1);
885 value = MAX(value, 1);
886 break;
887
888 case ZTI_MODE_BATCH:
889 batch = B_TRUE;
890 flags |= TASKQ_THREADS_CPU_PCT;
891 value = MIN(zio_taskq_batch_pct, 100);
892 break;
893
894 default:
895 panic("unrecognized mode for %s_%s taskq (%u:%u) in "
896 "spa_activate()",
897 zio_type_name[t], zio_taskq_types[q], mode, value);
898 break;
899 }
900
901 for (i = 0; i < count; i++) {
902 taskq_t *tq;
903
904 if (count > 1) {
905 (void) snprintf(name, sizeof (name), "%s_%s_%u",
906 zio_type_name[t], zio_taskq_types[q], i);
907 } else {
908 (void) snprintf(name, sizeof (name), "%s_%s",
909 zio_type_name[t], zio_taskq_types[q]);
910 }
911
912 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
913 if (batch)
914 flags |= TASKQ_DC_BATCH;
915
916 tq = taskq_create_sysdc(name, value, 50, INT_MAX,
917 spa->spa_proc, zio_taskq_basedc, flags);
918 } else {
919 pri_t pri = maxclsyspri;
920 /*
921 * The write issue taskq can be extremely CPU
922 * intensive. Run it at slightly less important
923 * priority than the other taskqs. Under Linux this
924 * means incrementing the priority value on platforms
925 * like illumos it should be decremented.
926 */
927 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
928 pri++;
929
930 tq = taskq_create_proc(name, value, pri, 50,
931 INT_MAX, spa->spa_proc, flags);
932 }
933
934 tqs->stqs_taskq[i] = tq;
935 }
936 }
937
938 static void
939 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
940 {
941 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
942 uint_t i;
943
944 if (tqs->stqs_taskq == NULL) {
945 ASSERT3U(tqs->stqs_count, ==, 0);
946 return;
947 }
948
949 for (i = 0; i < tqs->stqs_count; i++) {
950 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
951 taskq_destroy(tqs->stqs_taskq[i]);
952 }
953
954 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
955 tqs->stqs_taskq = NULL;
956 }
957
958 /*
959 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
960 * Note that a type may have multiple discrete taskqs to avoid lock contention
961 * on the taskq itself. In that case we choose which taskq at random by using
962 * the low bits of gethrtime().
963 */
964 void
965 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
966 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
967 {
968 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
969 taskq_t *tq;
970
971 ASSERT3P(tqs->stqs_taskq, !=, NULL);
972 ASSERT3U(tqs->stqs_count, !=, 0);
973
974 if (tqs->stqs_count == 1) {
975 tq = tqs->stqs_taskq[0];
976 } else {
977 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
978 }
979
980 taskq_dispatch_ent(tq, func, arg, flags, ent);
981 }
982
983 /*
984 * Same as spa_taskq_dispatch_ent() but block on the task until completion.
985 */
986 void
987 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
988 task_func_t *func, void *arg, uint_t flags)
989 {
990 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
991 taskq_t *tq;
992 taskqid_t id;
993
994 ASSERT3P(tqs->stqs_taskq, !=, NULL);
995 ASSERT3U(tqs->stqs_count, !=, 0);
996
997 if (tqs->stqs_count == 1) {
998 tq = tqs->stqs_taskq[0];
999 } else {
1000 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
1001 }
1002
1003 id = taskq_dispatch(tq, func, arg, flags);
1004 if (id)
1005 taskq_wait_id(tq, id);
1006 }
1007
1008 static void
1009 spa_create_zio_taskqs(spa_t *spa)
1010 {
1011 int t, q;
1012
1013 for (t = 0; t < ZIO_TYPES; t++) {
1014 for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1015 spa_taskqs_init(spa, t, q);
1016 }
1017 }
1018 }
1019
1020 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
1021 static void
1022 spa_thread(void *arg)
1023 {
1024 callb_cpr_t cprinfo;
1025
1026 spa_t *spa = arg;
1027 user_t *pu = PTOU(curproc);
1028
1029 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1030 spa->spa_name);
1031
1032 ASSERT(curproc != &p0);
1033 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1034 "zpool-%s", spa->spa_name);
1035 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1036
1037 /* bind this thread to the requested psrset */
1038 if (zio_taskq_psrset_bind != PS_NONE) {
1039 pool_lock();
1040 mutex_enter(&cpu_lock);
1041 mutex_enter(&pidlock);
1042 mutex_enter(&curproc->p_lock);
1043
1044 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1045 0, NULL, NULL) == 0) {
1046 curthread->t_bind_pset = zio_taskq_psrset_bind;
1047 } else {
1048 cmn_err(CE_WARN,
1049 "Couldn't bind process for zfs pool \"%s\" to "
1050 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1051 }
1052
1053 mutex_exit(&curproc->p_lock);
1054 mutex_exit(&pidlock);
1055 mutex_exit(&cpu_lock);
1056 pool_unlock();
1057 }
1058
1059 if (zio_taskq_sysdc) {
1060 sysdc_thread_enter(curthread, 100, 0);
1061 }
1062
1063 spa->spa_proc = curproc;
1064 spa->spa_did = curthread->t_did;
1065
1066 spa_create_zio_taskqs(spa);
1067
1068 mutex_enter(&spa->spa_proc_lock);
1069 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1070
1071 spa->spa_proc_state = SPA_PROC_ACTIVE;
1072 cv_broadcast(&spa->spa_proc_cv);
1073
1074 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1075 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1076 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1077 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1078
1079 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1080 spa->spa_proc_state = SPA_PROC_GONE;
1081 spa->spa_proc = &p0;
1082 cv_broadcast(&spa->spa_proc_cv);
1083 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
1084
1085 mutex_enter(&curproc->p_lock);
1086 lwp_exit();
1087 }
1088 #endif
1089
1090 /*
1091 * Activate an uninitialized pool.
1092 */
1093 static void
1094 spa_activate(spa_t *spa, int mode)
1095 {
1096 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1097
1098 spa->spa_state = POOL_STATE_ACTIVE;
1099 spa->spa_mode = mode;
1100
1101 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1102 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1103
1104 /* Try to create a covering process */
1105 mutex_enter(&spa->spa_proc_lock);
1106 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1107 ASSERT(spa->spa_proc == &p0);
1108 spa->spa_did = 0;
1109
1110 #ifdef HAVE_SPA_THREAD
1111 /* Only create a process if we're going to be around a while. */
1112 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1113 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1114 NULL, 0) == 0) {
1115 spa->spa_proc_state = SPA_PROC_CREATED;
1116 while (spa->spa_proc_state == SPA_PROC_CREATED) {
1117 cv_wait(&spa->spa_proc_cv,
1118 &spa->spa_proc_lock);
1119 }
1120 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1121 ASSERT(spa->spa_proc != &p0);
1122 ASSERT(spa->spa_did != 0);
1123 } else {
1124 #ifdef _KERNEL
1125 cmn_err(CE_WARN,
1126 "Couldn't create process for zfs pool \"%s\"\n",
1127 spa->spa_name);
1128 #endif
1129 }
1130 }
1131 #endif /* HAVE_SPA_THREAD */
1132 mutex_exit(&spa->spa_proc_lock);
1133
1134 /* If we didn't create a process, we need to create our taskqs. */
1135 if (spa->spa_proc == &p0) {
1136 spa_create_zio_taskqs(spa);
1137 }
1138
1139 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1140 offsetof(vdev_t, vdev_config_dirty_node));
1141 list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1142 offsetof(objset_t, os_evicting_node));
1143 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1144 offsetof(vdev_t, vdev_state_dirty_node));
1145
1146 txg_list_create(&spa->spa_vdev_txg_list,
1147 offsetof(struct vdev, vdev_txg_node));
1148
1149 avl_create(&spa->spa_errlist_scrub,
1150 spa_error_entry_compare, sizeof (spa_error_entry_t),
1151 offsetof(spa_error_entry_t, se_avl));
1152 avl_create(&spa->spa_errlist_last,
1153 spa_error_entry_compare, sizeof (spa_error_entry_t),
1154 offsetof(spa_error_entry_t, se_avl));
1155
1156 /*
1157 * This taskq is used to perform zvol-minor-related tasks
1158 * asynchronously. This has several advantages, including easy
1159 * resolution of various deadlocks (zfsonlinux bug #3681).
1160 *
1161 * The taskq must be single threaded to ensure tasks are always
1162 * processed in the order in which they were dispatched.
1163 *
1164 * A taskq per pool allows one to keep the pools independent.
1165 * This way if one pool is suspended, it will not impact another.
1166 *
1167 * The preferred location to dispatch a zvol minor task is a sync
1168 * task. In this context, there is easy access to the spa_t and minimal
1169 * error handling is required because the sync task must succeed.
1170 */
1171 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
1172 1, INT_MAX, 0);
1173 }
1174
1175 /*
1176 * Opposite of spa_activate().
1177 */
1178 static void
1179 spa_deactivate(spa_t *spa)
1180 {
1181 int t, q;
1182
1183 ASSERT(spa->spa_sync_on == B_FALSE);
1184 ASSERT(spa->spa_dsl_pool == NULL);
1185 ASSERT(spa->spa_root_vdev == NULL);
1186 ASSERT(spa->spa_async_zio_root == NULL);
1187 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1188
1189 spa_evicting_os_wait(spa);
1190
1191 if (spa->spa_zvol_taskq) {
1192 taskq_destroy(spa->spa_zvol_taskq);
1193 spa->spa_zvol_taskq = NULL;
1194 }
1195
1196 txg_list_destroy(&spa->spa_vdev_txg_list);
1197
1198 list_destroy(&spa->spa_config_dirty_list);
1199 list_destroy(&spa->spa_evicting_os_list);
1200 list_destroy(&spa->spa_state_dirty_list);
1201
1202 taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
1203
1204 for (t = 0; t < ZIO_TYPES; t++) {
1205 for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1206 spa_taskqs_fini(spa, t, q);
1207 }
1208 }
1209
1210 metaslab_class_destroy(spa->spa_normal_class);
1211 spa->spa_normal_class = NULL;
1212
1213 metaslab_class_destroy(spa->spa_log_class);
1214 spa->spa_log_class = NULL;
1215
1216 /*
1217 * If this was part of an import or the open otherwise failed, we may
1218 * still have errors left in the queues. Empty them just in case.
1219 */
1220 spa_errlog_drain(spa);
1221
1222 avl_destroy(&spa->spa_errlist_scrub);
1223 avl_destroy(&spa->spa_errlist_last);
1224
1225 spa->spa_state = POOL_STATE_UNINITIALIZED;
1226
1227 mutex_enter(&spa->spa_proc_lock);
1228 if (spa->spa_proc_state != SPA_PROC_NONE) {
1229 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1230 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1231 cv_broadcast(&spa->spa_proc_cv);
1232 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1233 ASSERT(spa->spa_proc != &p0);
1234 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1235 }
1236 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1237 spa->spa_proc_state = SPA_PROC_NONE;
1238 }
1239 ASSERT(spa->spa_proc == &p0);
1240 mutex_exit(&spa->spa_proc_lock);
1241
1242 /*
1243 * We want to make sure spa_thread() has actually exited the ZFS
1244 * module, so that the module can't be unloaded out from underneath
1245 * it.
1246 */
1247 if (spa->spa_did != 0) {
1248 thread_join(spa->spa_did);
1249 spa->spa_did = 0;
1250 }
1251 }
1252
1253 /*
1254 * Verify a pool configuration, and construct the vdev tree appropriately. This
1255 * will create all the necessary vdevs in the appropriate layout, with each vdev
1256 * in the CLOSED state. This will prep the pool before open/creation/import.
1257 * All vdev validation is done by the vdev_alloc() routine.
1258 */
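/*
 * A rough sketch of the nvlist shape this parser walks: each vdev nvlist may
 * carry a ZPOOL_CONFIG_CHILDREN array of child vdev nvlists, and the
 * recursion below mirrors that nesting.  Keys other than
 * ZPOOL_CONFIG_CHILDREN are handled by vdev_alloc() and omitted here.
 *
 *	root vdev nvlist
 *	 `-- ZPOOL_CONFIG_CHILDREN: [ top-level vdev nvlists ... ]
 *	      `-- ZPOOL_CONFIG_CHILDREN: [ leaf vdev nvlists ... ]
 */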
1259 static int
1260 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1261 uint_t id, int atype)
1262 {
1263 nvlist_t **child;
1264 uint_t children;
1265 int error;
1266 int c;
1267
1268 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1269 return (error);
1270
1271 if ((*vdp)->vdev_ops->vdev_op_leaf)
1272 return (0);
1273
1274 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1275 &child, &children);
1276
1277 if (error == ENOENT)
1278 return (0);
1279
1280 if (error) {
1281 vdev_free(*vdp);
1282 *vdp = NULL;
1283 return (SET_ERROR(EINVAL));
1284 }
1285
1286 for (c = 0; c < children; c++) {
1287 vdev_t *vd;
1288 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1289 atype)) != 0) {
1290 vdev_free(*vdp);
1291 *vdp = NULL;
1292 return (error);
1293 }
1294 }
1295
1296 ASSERT(*vdp != NULL);
1297
1298 return (0);
1299 }
1300
1301 /*
1302 * Opposite of spa_load().
1303 */
1304 static void
1305 spa_unload(spa_t *spa)
1306 {
1307 int i;
1308
1309 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1310
1311 /*
1312 * Stop async tasks.
1313 */
1314 spa_async_suspend(spa);
1315
1316 /*
1317 * Stop syncing.
1318 */
1319 if (spa->spa_sync_on) {
1320 txg_sync_stop(spa->spa_dsl_pool);
1321 spa->spa_sync_on = B_FALSE;
1322 }
1323
1324 /*
1325 * Wait for any outstanding async I/O to complete.
1326 */
1327 if (spa->spa_async_zio_root != NULL) {
1328 for (i = 0; i < max_ncpus; i++)
1329 (void) zio_wait(spa->spa_async_zio_root[i]);
1330 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1331 spa->spa_async_zio_root = NULL;
1332 }
1333
1334 bpobj_close(&spa->spa_deferred_bpobj);
1335
1336 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1337
1338 /*
1339 * Close all vdevs.
1340 */
1341 if (spa->spa_root_vdev)
1342 vdev_free(spa->spa_root_vdev);
1343 ASSERT(spa->spa_root_vdev == NULL);
1344
1345 /*
1346 * Close the dsl pool.
1347 */
1348 if (spa->spa_dsl_pool) {
1349 dsl_pool_close(spa->spa_dsl_pool);
1350 spa->spa_dsl_pool = NULL;
1351 spa->spa_meta_objset = NULL;
1352 }
1353
1354 ddt_unload(spa);
1355
1356
1357 /*
1358 * Drop and purge level 2 cache
1359 */
1360 spa_l2cache_drop(spa);
1361
1362 for (i = 0; i < spa->spa_spares.sav_count; i++)
1363 vdev_free(spa->spa_spares.sav_vdevs[i]);
1364 if (spa->spa_spares.sav_vdevs) {
1365 kmem_free(spa->spa_spares.sav_vdevs,
1366 spa->spa_spares.sav_count * sizeof (void *));
1367 spa->spa_spares.sav_vdevs = NULL;
1368 }
1369 if (spa->spa_spares.sav_config) {
1370 nvlist_free(spa->spa_spares.sav_config);
1371 spa->spa_spares.sav_config = NULL;
1372 }
1373 spa->spa_spares.sav_count = 0;
1374
1375 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1376 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1377 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1378 }
1379 if (spa->spa_l2cache.sav_vdevs) {
1380 kmem_free(spa->spa_l2cache.sav_vdevs,
1381 spa->spa_l2cache.sav_count * sizeof (void *));
1382 spa->spa_l2cache.sav_vdevs = NULL;
1383 }
1384 if (spa->spa_l2cache.sav_config) {
1385 nvlist_free(spa->spa_l2cache.sav_config);
1386 spa->spa_l2cache.sav_config = NULL;
1387 }
1388 spa->spa_l2cache.sav_count = 0;
1389
1390 spa->spa_async_suspended = 0;
1391
1392 if (spa->spa_comment != NULL) {
1393 spa_strfree(spa->spa_comment);
1394 spa->spa_comment = NULL;
1395 }
1396
1397 spa_config_exit(spa, SCL_ALL, FTAG);
1398 }
1399
1400 /*
1401 * Load (or re-load) the current list of vdevs describing the active spares for
1402 * this pool. When this is called, we have some form of basic information in
1403 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1404 * then re-generate a more complete list including status information.
1405 */
1406 static void
1407 spa_load_spares(spa_t *spa)
1408 {
1409 nvlist_t **spares;
1410 uint_t nspares;
1411 int i;
1412 vdev_t *vd, *tvd;
1413
1414 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1415
1416 /*
1417 * First, close and free any existing spare vdevs.
1418 */
1419 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1420 vd = spa->spa_spares.sav_vdevs[i];
1421
1422 /* Undo the call to spa_activate() below */
1423 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1424 B_FALSE)) != NULL && tvd->vdev_isspare)
1425 spa_spare_remove(tvd);
1426 vdev_close(vd);
1427 vdev_free(vd);
1428 }
1429
1430 if (spa->spa_spares.sav_vdevs)
1431 kmem_free(spa->spa_spares.sav_vdevs,
1432 spa->spa_spares.sav_count * sizeof (void *));
1433
1434 if (spa->spa_spares.sav_config == NULL)
1435 nspares = 0;
1436 else
1437 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1438 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1439
1440 spa->spa_spares.sav_count = (int)nspares;
1441 spa->spa_spares.sav_vdevs = NULL;
1442
1443 if (nspares == 0)
1444 return;
1445
1446 /*
1447 * Construct the array of vdevs, opening them to get status in the
1448 * process. For each spare, there are potentially two different vdev_t
1449 * structures associated with it: one in the list of spares (used only
1450 * for basic validation purposes) and one in the active vdev
1451 * configuration (if it's spared in). During this phase we open and
1452 * validate each vdev on the spare list. If the vdev also exists in the
1453 * active configuration, then we also mark this vdev as an active spare.
1454 */
1455 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
1456 KM_SLEEP);
1457 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1458 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1459 VDEV_ALLOC_SPARE) == 0);
1460 ASSERT(vd != NULL);
1461
1462 spa->spa_spares.sav_vdevs[i] = vd;
1463
1464 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1465 B_FALSE)) != NULL) {
1466 if (!tvd->vdev_isspare)
1467 spa_spare_add(tvd);
1468
1469 /*
1470 * We only mark the spare active if we were successfully
1471 * able to load the vdev. Otherwise, importing a pool
1472 * with a bad active spare would result in strange
1473 * behavior, because multiple pools would think the spare
1474 * is actively in use.
1475 *
1476 * There is a vulnerability here to an equally bizarre
1477 * circumstance, where a dead active spare is later
1478 * brought back to life (onlined or otherwise). Given
1479 * the rarity of this scenario, and the extra complexity
1480 * it adds, we ignore the possibility.
1481 */
1482 if (!vdev_is_dead(tvd))
1483 spa_spare_activate(tvd);
1484 }
1485
1486 vd->vdev_top = vd;
1487 vd->vdev_aux = &spa->spa_spares;
1488
1489 if (vdev_open(vd) != 0)
1490 continue;
1491
1492 if (vdev_validate_aux(vd) == 0)
1493 spa_spare_add(vd);
1494 }
1495
1496 /*
1497 * Recompute the stashed list of spares, with status information
1498 * this time.
1499 */
1500 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1501 DATA_TYPE_NVLIST_ARRAY) == 0);
1502
1503 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1504 KM_SLEEP);
1505 for (i = 0; i < spa->spa_spares.sav_count; i++)
1506 spares[i] = vdev_config_generate(spa,
1507 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1508 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1509 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1510 for (i = 0; i < spa->spa_spares.sav_count; i++)
1511 nvlist_free(spares[i]);
1512 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1513 }
1514
1515 /*
1516 * Load (or re-load) the current list of vdevs describing the active l2cache for
1517 * this pool. When this is called, we have some form of basic information in
1518 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1519 * then re-generate a more complete list including status information.
1520 * Devices which are already active have their details maintained, and are
1521 * not re-opened.
1522 */
1523 static void
1524 spa_load_l2cache(spa_t *spa)
1525 {
1526 nvlist_t **l2cache;
1527 uint_t nl2cache;
1528 int i, j, oldnvdevs;
1529 uint64_t guid;
1530 vdev_t *vd, **oldvdevs, **newvdevs;
1531 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1532
1533 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1534
1535 if (sav->sav_config != NULL) {
1536 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1537 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1538 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1539 } else {
1540 nl2cache = 0;
1541 newvdevs = NULL;
1542 }
1543
1544 oldvdevs = sav->sav_vdevs;
1545 oldnvdevs = sav->sav_count;
1546 sav->sav_vdevs = NULL;
1547 sav->sav_count = 0;
1548
1549 /*
1550 * Process new nvlist of vdevs.
1551 */
1552 for (i = 0; i < nl2cache; i++) {
1553 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1554 &guid) == 0);
1555
1556 newvdevs[i] = NULL;
1557 for (j = 0; j < oldnvdevs; j++) {
1558 vd = oldvdevs[j];
1559 if (vd != NULL && guid == vd->vdev_guid) {
1560 /*
1561 * Retain previous vdev for add/remove ops.
1562 */
1563 newvdevs[i] = vd;
1564 oldvdevs[j] = NULL;
1565 break;
1566 }
1567 }
1568
1569 if (newvdevs[i] == NULL) {
1570 /*
1571 * Create new vdev
1572 */
1573 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1574 VDEV_ALLOC_L2CACHE) == 0);
1575 ASSERT(vd != NULL);
1576 newvdevs[i] = vd;
1577
1578 /*
1579 * Commit this vdev as an l2cache device,
1580 * even if it fails to open.
1581 */
1582 spa_l2cache_add(vd);
1583
1584 vd->vdev_top = vd;
1585 vd->vdev_aux = sav;
1586
1587 spa_l2cache_activate(vd);
1588
1589 if (vdev_open(vd) != 0)
1590 continue;
1591
1592 (void) vdev_validate_aux(vd);
1593
1594 if (!vdev_is_dead(vd))
1595 l2arc_add_vdev(spa, vd);
1596 }
1597 }
1598
1599 /*
1600 * Purge vdevs that were dropped
1601 */
1602 for (i = 0; i < oldnvdevs; i++) {
1603 uint64_t pool;
1604
1605 vd = oldvdevs[i];
1606 if (vd != NULL) {
1607 ASSERT(vd->vdev_isl2cache);
1608
1609 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1610 pool != 0ULL && l2arc_vdev_present(vd))
1611 l2arc_remove_vdev(vd);
1612 vdev_clear_stats(vd);
1613 vdev_free(vd);
1614 }
1615 }
1616
1617 if (oldvdevs)
1618 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1619
1620 if (sav->sav_config == NULL)
1621 goto out;
1622
1623 sav->sav_vdevs = newvdevs;
1624 sav->sav_count = (int)nl2cache;
1625
1626 /*
1627 * Recompute the stashed list of l2cache devices, with status
1628 * information this time.
1629 */
1630 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1631 DATA_TYPE_NVLIST_ARRAY) == 0);
1632
1633 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1634 for (i = 0; i < sav->sav_count; i++)
1635 l2cache[i] = vdev_config_generate(spa,
1636 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1637 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1638 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1639 out:
1640 for (i = 0; i < sav->sav_count; i++)
1641 nvlist_free(l2cache[i]);
1642 if (sav->sav_count)
1643 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1644 }
1645
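/*
 * Read a packed nvlist out of the MOS: the object's bonus buffer holds the
 * packed size as a uint64_t, the object data holds the packed bytes, and the
 * result is unpacked into *value (which the caller frees with nvlist_free()).
 */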
1646 static int
1647 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1648 {
1649 dmu_buf_t *db;
1650 char *packed = NULL;
1651 size_t nvsize = 0;
1652 int error;
1653 *value = NULL;
1654
1655 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1656 if (error)
1657 return (error);
1658
1659 nvsize = *(uint64_t *)db->db_data;
1660 dmu_buf_rele(db, FTAG);
1661
1662 packed = vmem_alloc(nvsize, KM_SLEEP);
1663 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1664 DMU_READ_PREFETCH);
1665 if (error == 0)
1666 error = nvlist_unpack(packed, nvsize, value, 0);
1667 vmem_free(packed, nvsize);
1668
1669 return (error);
1670 }
1671
1672 /*
1673 * Checks to see if the given vdev could not be opened, in which case we post a
1674 * sysevent to notify the autoreplace code that the device has been removed.
1675 */
1676 static void
1677 spa_check_removed(vdev_t *vd)
1678 {
1679 int c;
1680
1681 for (c = 0; c < vd->vdev_children; c++)
1682 spa_check_removed(vd->vdev_child[c]);
1683
1684 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1685 !vd->vdev_ishole) {
1686 zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE,
1687 vd->vdev_spa, vd, NULL, 0, 0);
1688 spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_CHECK);
1689 }
1690 }
1691
1692 static void
1693 spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
1694 {
1695 uint64_t i;
1696
1697 ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
1698
1699 vd->vdev_top_zap = mvd->vdev_top_zap;
1700 vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
1701
1702 for (i = 0; i < vd->vdev_children; i++) {
1703 spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
1704 }
1705 }
1706
1707 /*
1708 * Validate the current config against the MOS config
1709 */
1710 static boolean_t
1711 spa_config_valid(spa_t *spa, nvlist_t *config)
1712 {
1713 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1714 nvlist_t *nv;
1715 int c, i;
1716
1717 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1718
1719 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1720 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1721
1722 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1723
1724 /*
1725 * If we're doing a normal import, then build up any additional
1726 * diagnostic information about missing devices in this config.
1727 * We'll pass this up to the user for further processing.
1728 */
1729 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1730 nvlist_t **child, *nv;
1731 uint64_t idx = 0;
1732
1733 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1734 KM_SLEEP);
1735 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1736
1737 for (c = 0; c < rvd->vdev_children; c++) {
1738 vdev_t *tvd = rvd->vdev_child[c];
1739 vdev_t *mtvd = mrvd->vdev_child[c];
1740
1741 if (tvd->vdev_ops == &vdev_missing_ops &&
1742 mtvd->vdev_ops != &vdev_missing_ops &&
1743 mtvd->vdev_islog)
1744 child[idx++] = vdev_config_generate(spa, mtvd,
1745 B_FALSE, 0);
1746 }
1747
1748 if (idx) {
1749 VERIFY(nvlist_add_nvlist_array(nv,
1750 ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1751 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1752 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1753
1754 for (i = 0; i < idx; i++)
1755 nvlist_free(child[i]);
1756 }
1757 nvlist_free(nv);
1758 kmem_free(child, rvd->vdev_children * sizeof (char **));
1759 }
1760
1761 /*
1762 * Compare the root vdev tree with the information we have
1763 * from the MOS config (mrvd). Check each top-level vdev
1764 * with the corresponding MOS config top-level (mtvd).
1765 */
1766 for (c = 0; c < rvd->vdev_children; c++) {
1767 vdev_t *tvd = rvd->vdev_child[c];
1768 vdev_t *mtvd = mrvd->vdev_child[c];
1769
1770 /*
1771 * Resolve any "missing" vdevs in the current configuration.
1772 * If we find that the MOS config has more accurate information
1773 * about the top-level vdev, then use that vdev instead.
1774 */
1775 if (tvd->vdev_ops == &vdev_missing_ops &&
1776 mtvd->vdev_ops != &vdev_missing_ops) {
1777
1778 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1779 continue;
1780
1781 /*
1782 * Device specific actions.
1783 */
1784 if (mtvd->vdev_islog) {
1785 spa_set_log_state(spa, SPA_LOG_CLEAR);
1786 } else {
1787 /*
1788 * XXX - once we have 'readonly' pool
1789 * support we should be able to handle
1790 * missing data devices by transitioning
1791 * the pool to readonly.
1792 */
1793 continue;
1794 }
1795
1796 /*
1797 * Swap the missing vdev with the data we were
1798 * able to obtain from the MOS config.
1799 */
1800 vdev_remove_child(rvd, tvd);
1801 vdev_remove_child(mrvd, mtvd);
1802
1803 vdev_add_child(rvd, mtvd);
1804 vdev_add_child(mrvd, tvd);
1805
1806 spa_config_exit(spa, SCL_ALL, FTAG);
1807 vdev_load(mtvd);
1808 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1809
1810 vdev_reopen(rvd);
1811 } else {
1812 if (mtvd->vdev_islog) {
1813 /*
1814 * Load the slog device's state from the MOS
1815 * config since it's possible that the label
1816 * does not contain the most up-to-date
1817 * information.
1818 */
1819 vdev_load_log_state(tvd, mtvd);
1820 vdev_reopen(tvd);
1821 }
1822
1823 /*
1824 * Per-vdev ZAP info is stored exclusively in the MOS.
1825 */
1826 spa_config_valid_zaps(tvd, mtvd);
1827 }
1828 }
1829
1830 vdev_free(mrvd);
1831 spa_config_exit(spa, SCL_ALL, FTAG);
1832
1833 /*
1834 * Ensure we were able to validate the config.
1835 */
1836 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1837 }
1838
1839 /*
1840 * Check for missing log devices
1841 */
1842 static boolean_t
1843 spa_check_logs(spa_t *spa)
1844 {
1845 boolean_t rv = B_FALSE;
1846 dsl_pool_t *dp = spa_get_dsl(spa);
1847
1848 switch (spa->spa_log_state) {
1849 default:
1850 break;
1851 case SPA_LOG_MISSING:
1852 /* need to recheck in case slog has been restored */
1853 case SPA_LOG_UNKNOWN:
1854 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1855 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1856 if (rv)
1857 spa_set_log_state(spa, SPA_LOG_MISSING);
1858 break;
1859 }
1860 return (rv);
1861 }
1862
1863 static boolean_t
1864 spa_passivate_log(spa_t *spa)
1865 {
1866 vdev_t *rvd = spa->spa_root_vdev;
1867 boolean_t slog_found = B_FALSE;
1868 int c;
1869
1870 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1871
1872 if (!spa_has_slogs(spa))
1873 return (B_FALSE);
1874
1875 for (c = 0; c < rvd->vdev_children; c++) {
1876 vdev_t *tvd = rvd->vdev_child[c];
1877 metaslab_group_t *mg = tvd->vdev_mg;
1878
1879 if (tvd->vdev_islog) {
1880 metaslab_group_passivate(mg);
1881 slog_found = B_TRUE;
1882 }
1883 }
1884
1885 return (slog_found);
1886 }
1887
1888 static void
1889 spa_activate_log(spa_t *spa)
1890 {
1891 vdev_t *rvd = spa->spa_root_vdev;
1892 int c;
1893
1894 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1895
1896 for (c = 0; c < rvd->vdev_children; c++) {
1897 vdev_t *tvd = rvd->vdev_child[c];
1898 metaslab_group_t *mg = tvd->vdev_mg;
1899
1900 if (tvd->vdev_islog)
1901 metaslab_group_activate(mg);
1902 }
1903 }
1904
1905 int
1906 spa_offline_log(spa_t *spa)
1907 {
1908 int error;
1909
1910 error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1911 NULL, DS_FIND_CHILDREN);
1912 if (error == 0) {
1913 /*
1914 * We successfully offlined the log device, sync out the
1915 * current txg so that the "stubby" block can be removed
1916 * by zil_sync().
1917 */
1918 txg_wait_synced(spa->spa_dsl_pool, 0);
1919 }
1920 return (error);
1921 }
1922
1923 static void
1924 spa_aux_check_removed(spa_aux_vdev_t *sav)
1925 {
1926 int i;
1927
1928 for (i = 0; i < sav->sav_count; i++)
1929 spa_check_removed(sav->sav_vdevs[i]);
1930 }
1931
1932 void
1933 spa_claim_notify(zio_t *zio)
1934 {
1935 spa_t *spa = zio->io_spa;
1936
1937 if (zio->io_error)
1938 return;
1939
1940 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1941 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1942 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1943 mutex_exit(&spa->spa_props_lock);
1944 }
1945
1946 typedef struct spa_load_error {
1947 uint64_t sle_meta_count;
1948 uint64_t sle_data_count;
1949 } spa_load_error_t;
1950
1951 static void
1952 spa_load_verify_done(zio_t *zio)
1953 {
1954 blkptr_t *bp = zio->io_bp;
1955 spa_load_error_t *sle = zio->io_private;
1956 dmu_object_type_t type = BP_GET_TYPE(bp);
1957 int error = zio->io_error;
1958 spa_t *spa = zio->io_spa;
1959
1960 if (error) {
1961 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1962 type != DMU_OT_INTENT_LOG)
1963 atomic_inc_64(&sle->sle_meta_count);
1964 else
1965 atomic_inc_64(&sle->sle_data_count);
1966 }
1967 zio_data_buf_free(zio->io_data, zio->io_size);
1968
1969 mutex_enter(&spa->spa_scrub_lock);
1970 spa->spa_scrub_inflight--;
1971 cv_broadcast(&spa->spa_scrub_io_cv);
1972 mutex_exit(&spa->spa_scrub_lock);
1973 }
1974
1975 /*
1976 * Maximum number of concurrent scrub I/Os to create while verifying
1977 * a pool during import.
1978 */
1979 int spa_load_verify_maxinflight = 10000;
1980 int spa_load_verify_metadata = B_TRUE;
1981 int spa_load_verify_data = B_TRUE;
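/*
 * On Linux these three tunables are ordinarily exposed as zfs module
 * parameters of the same names (an assumption here; the module_param
 * declarations, if present, live elsewhere in this file).
 */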
1982
1983 /*ARGSUSED*/
1984 static int
1985 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1986 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
1987 {
1988 zio_t *rio;
1989 size_t size;
1990 void *data;
1991
1992 if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
1993 return (0);
1994 /*
1995 * Note: normally this routine will not be called if
1996 * spa_load_verify_metadata is not set. However, it may be useful
1997 * to manually set the flag after the traversal has begun.
1998 */
1999 if (!spa_load_verify_metadata)
2000 return (0);
2001 if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
2002 return (0);
2003
2004 rio = arg;
2005 size = BP_GET_PSIZE(bp);
2006 data = zio_data_buf_alloc(size);
2007
2008 mutex_enter(&spa->spa_scrub_lock);
2009 while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
2010 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2011 spa->spa_scrub_inflight++;
2012 mutex_exit(&spa->spa_scrub_lock);
2013
2014 zio_nowait(zio_read(rio, spa, bp, data, size,
2015 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2016 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2017 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2018 return (0);
2019 }
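
/*
 * The inflight accounting above pairs with spa_load_verify_done(): that
 * callback decrements spa_scrub_inflight and broadcasts spa_scrub_io_cv,
 * releasing the cv_wait() loop once the count drops back below
 * spa_load_verify_maxinflight.
 */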
2020
2021 static int
2022 spa_load_verify(spa_t *spa)
2023 {
2024 zio_t *rio;
2025 spa_load_error_t sle = { 0 };
2026 zpool_rewind_policy_t policy;
2027 boolean_t verify_ok = B_FALSE;
2028 int error = 0;
2029
2030 zpool_get_rewind_policy(spa->spa_config, &policy);
2031
2032 if (policy.zrp_request & ZPOOL_NEVER_REWIND)
2033 return (0);
2034
2035 rio = zio_root(spa, NULL, &sle,
2036 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2037
2038 if (spa_load_verify_metadata) {
2039 error = traverse_pool(spa, spa->spa_verify_min_txg,
2040 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
2041 spa_load_verify_cb, rio);
2042 }
2043
2044 (void) zio_wait(rio);
2045
2046 spa->spa_load_meta_errors = sle.sle_meta_count;
2047 spa->spa_load_data_errors = sle.sle_data_count;
2048
2049 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
2050 sle.sle_data_count <= policy.zrp_maxdata) {
2051 int64_t loss = 0;
2052
2053 verify_ok = B_TRUE;
2054 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2055 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2056
2057 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2058 VERIFY(nvlist_add_uint64(spa->spa_load_info,
2059 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2060 VERIFY(nvlist_add_int64(spa->spa_load_info,
2061 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2062 VERIFY(nvlist_add_uint64(spa->spa_load_info,
2063 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2064 } else {
2065 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2066 }
2067
2068 if (error) {
2069 if (error != ENXIO && error != EIO)
2070 error = SET_ERROR(EIO);
2071 return (error);
2072 }
2073
2074 return (verify_ok ? 0 : EIO);
2075 }
2076
2077 /*
2078 * Find a value in the pool props object.
2079 */
2080 static void
2081 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2082 {
2083 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2084 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2085 }
2086
2087 /*
2088 * Find a value in the pool directory object.
2089 */
2090 static int
2091 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
2092 {
2093 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2094 name, sizeof (uint64_t), 1, val));
2095 }
2096
2097 static int
2098 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2099 {
2100 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2101 return (err);
2102 }
2103
2104 /*
2105 * Fix up config after a partly-completed split. This is done with the
2106 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
2107 * pool have that entry in their config, but only the splitting one contains
2108 * a list of all the guids of the vdevs that are being split off.
2109 *
2110 * This function determines what to do with that list: either rejoin
2111 * all the disks to the pool, or complete the splitting process. To attempt
2112 * the rejoin, each disk that is offlined is marked online again, and
2113 * we do a reopen() call. If the vdev label for every disk that was
2114 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2115 * then we call vdev_split() on each disk, and complete the split.
2116 *
2117 * Otherwise we leave the config alone, with all the vdevs in place in
2118 * the original pool.
2119 */
2120 static void
2121 spa_try_repair(spa_t *spa, nvlist_t *config)
2122 {
2123 uint_t extracted;
2124 uint64_t *glist;
2125 uint_t i, gcount;
2126 nvlist_t *nvl;
2127 vdev_t **vd;
2128 boolean_t attempt_reopen;
2129
2130 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2131 return;
2132
2133 /* check that the config is complete */
2134 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2135 &glist, &gcount) != 0)
2136 return;
2137
2138 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2139
2140 /* attempt to online all the vdevs & validate */
2141 attempt_reopen = B_TRUE;
2142 for (i = 0; i < gcount; i++) {
2143 if (glist[i] == 0) /* vdev is hole */
2144 continue;
2145
2146 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2147 if (vd[i] == NULL) {
2148 /*
2149 * Don't bother attempting to reopen the disks;
2150 * just do the split.
2151 */
2152 attempt_reopen = B_FALSE;
2153 } else {
2154 /* attempt to re-online it */
2155 vd[i]->vdev_offline = B_FALSE;
2156 }
2157 }
2158
2159 if (attempt_reopen) {
2160 vdev_reopen(spa->spa_root_vdev);
2161
2162 /* check each device to see what state it's in */
2163 for (extracted = 0, i = 0; i < gcount; i++) {
2164 if (vd[i] != NULL &&
2165 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2166 break;
2167 ++extracted;
2168 }
2169 }
2170
2171 /*
2172 * If every disk has been moved to the new pool, or if we never
2173 * even attempted to look at them, then we split them off for
2174 * good.
2175 */
2176 if (!attempt_reopen || gcount == extracted) {
2177 for (i = 0; i < gcount; i++)
2178 if (vd[i] != NULL)
2179 vdev_split(vd[i]);
2180 vdev_reopen(spa->spa_root_vdev);
2181 }
2182
2183 kmem_free(vd, gcount * sizeof (vdev_t *));
2184 }
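
/*
 * A minimal sketch of the split nvlist shape that spa_try_repair() walks,
 * using the fnvlist helpers seen elsewhere in this file; the guid values
 * below are made up purely for illustration:
 *
 *	uint64_t guids[] = { 0x1111ULL, 0x2222ULL };
 *	nvlist_t *split = fnvlist_alloc();
 *
 *	fnvlist_add_uint64_array(split, ZPOOL_CONFIG_SPLIT_LIST, guids, 2);
 *	fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, split);
 *	fnvlist_free(split);
 *
 * Only the splitting pool carries ZPOOL_CONFIG_SPLIT_LIST; the split-off
 * pool has a ZPOOL_CONFIG_SPLIT entry without the guid list.
 */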
2185
2186 static int
2187 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
2188 boolean_t mosconfig)
2189 {
2190 nvlist_t *config = spa->spa_config;
2191 char *ereport = FM_EREPORT_ZFS_POOL;
2192 char *comment;
2193 int error;
2194 uint64_t pool_guid;
2195 nvlist_t *nvl;
2196
2197 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
2198 return (SET_ERROR(EINVAL));
2199
2200 ASSERT(spa->spa_comment == NULL);
2201 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2202 spa->spa_comment = spa_strdup(comment);
2203
2204 /*
2205 * Versioning wasn't explicitly added to the label until later, so if
2206 * it's not present, treat it as the initial version.
2207 */
2208 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2209 &spa->spa_ubsync.ub_version) != 0)
2210 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2211
2212 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2213 &spa->spa_config_txg);
2214
2215 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2216 spa_guid_exists(pool_guid, 0)) {
2217 error = SET_ERROR(EEXIST);
2218 } else {
2219 spa->spa_config_guid = pool_guid;
2220
2221 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2222 &nvl) == 0) {
2223 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2224 KM_SLEEP) == 0);
2225 }
2226
2227 nvlist_free(spa->spa_load_info);
2228 spa->spa_load_info = fnvlist_alloc();
2229
2230 gethrestime(&spa->spa_loaded_ts);
2231 error = spa_load_impl(spa, pool_guid, config, state, type,
2232 mosconfig, &ereport);
2233 }
2234
2235 /*
2236 * Don't count references from objsets that are already closed
2237 * and are making their way through the eviction process.
2238 */
2239 spa_evicting_os_wait(spa);
2240 spa->spa_minref = refcount_count(&spa->spa_refcount);
2241 if (error) {
2242 if (error != EEXIST) {
2243 spa->spa_loaded_ts.tv_sec = 0;
2244 spa->spa_loaded_ts.tv_nsec = 0;
2245 }
2246 if (error != EBADF) {
2247 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2248 }
2249 }
2250 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2251 spa->spa_ena = 0;
2252
2253 return (error);
2254 }
2255
2256 #ifdef ZFS_DEBUG
2257 /*
2258 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2259 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2260 * spa's per-vdev ZAP list.
2261 */
2262 static uint64_t
2263 vdev_count_verify_zaps(vdev_t *vd)
2264 {
2265 spa_t *spa = vd->vdev_spa;
2266 uint64_t total = 0;
2267 uint64_t i;
2268
2269 if (vd->vdev_top_zap != 0) {
2270 total++;
2271 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2272 spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2273 }
2274 if (vd->vdev_leaf_zap != 0) {
2275 total++;
2276 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2277 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2278 }
2279
2280 for (i = 0; i < vd->vdev_children; i++) {
2281 total += vdev_count_verify_zaps(vd->vdev_child[i]);
2282 }
2283
2284 return (total);
2285 }
2286 #endif
2287
2288 /*
2289 * Load an existing storage pool, using the pool's builtin spa_config as a
2290 * source of configuration information.
2291 */
2292 __attribute__((always_inline))
2293 static inline int
2294 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2295 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2296 char **ereport)
2297 {
2298 int error = 0;
2299 nvlist_t *nvroot = NULL;
2300 nvlist_t *label;
2301 vdev_t *rvd;
2302 uberblock_t *ub = &spa->spa_uberblock;
2303 uint64_t children, config_cache_txg = spa->spa_config_txg;
2304 int orig_mode = spa->spa_mode;
2305 int parse, i;
2306 uint64_t obj;
2307 boolean_t missing_feat_write = B_FALSE;
2308 nvlist_t *mos_config;
2309
2310 /*
2311 * If this is an untrusted config, access the pool in read-only mode.
2312 * This prevents things like resilvering recently removed devices.
2313 */
2314 if (!mosconfig)
2315 spa->spa_mode = FREAD;
2316
2317 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2318
2319 spa->spa_load_state = state;
2320
2321 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2322 return (SET_ERROR(EINVAL));
2323
2324 parse = (type == SPA_IMPORT_EXISTING ?
2325 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2326
2327 /*
2328 * Create "The Godfather" zio to hold all async IOs
2329 */
2330 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2331 KM_SLEEP);
2332 for (i = 0; i < max_ncpus; i++) {
2333 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2334 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2335 ZIO_FLAG_GODFATHER);
2336 }
2337
2338 /*
2339 * Parse the configuration into a vdev tree. We explicitly set the
2340 * value that will be returned by spa_version() since parsing the
2341 * configuration requires knowing the version number.
2342 */
2343 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2344 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2345 spa_config_exit(spa, SCL_ALL, FTAG);
2346
2347 if (error != 0)
2348 return (error);
2349
2350 ASSERT(spa->spa_root_vdev == rvd);
2351 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2352 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2353
2354 if (type != SPA_IMPORT_ASSEMBLE) {
2355 ASSERT(spa_guid(spa) == pool_guid);
2356 }
2357
2358 /*
2359 * Try to open all vdevs, loading each label in the process.
2360 */
2361 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2362 error = vdev_open(rvd);
2363 spa_config_exit(spa, SCL_ALL, FTAG);
2364 if (error != 0)
2365 return (error);
2366
2367 /*
2368 * We need to validate the vdev labels against the configuration that
2369 * we have in hand, which is dependent on the setting of mosconfig. If
2370 * mosconfig is true then we're validating the vdev labels based on
2371 * that config. Otherwise, we're validating against the cached config
2372 * (zpool.cache) that was read when we loaded the zfs module, and then
2373 * later we will recursively call spa_load() and validate against
2374 * the vdev config.
2375 *
2376 * If we're assembling a new pool that's been split off from an
2377 * existing pool, the labels haven't yet been updated so we skip
2378 * validation for now.
2379 */
2380 if (type != SPA_IMPORT_ASSEMBLE) {
2381 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2382 error = vdev_validate(rvd, mosconfig);
2383 spa_config_exit(spa, SCL_ALL, FTAG);
2384
2385 if (error != 0)
2386 return (error);
2387
2388 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2389 return (SET_ERROR(ENXIO));
2390 }
2391
2392 /*
2393 * Find the best uberblock.
2394 */
2395 vdev_uberblock_load(rvd, ub, &label);
2396
2397 /*
2398 * If we weren't able to find a single valid uberblock, return failure.
2399 */
2400 if (ub->ub_txg == 0) {
2401 nvlist_free(label);
2402 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2403 }
2404
2405 /*
2406 * If the pool has an unsupported version we can't open it.
2407 */
2408 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2409 nvlist_free(label);
2410 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2411 }
2412
2413 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2414 nvlist_t *features;
2415
2416 /*
2417 * If we weren't able to find what's necessary for reading the
2418 * MOS in the label, return failure.
2419 */
2420 if (label == NULL || nvlist_lookup_nvlist(label,
2421 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2422 nvlist_free(label);
2423 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2424 ENXIO));
2425 }
2426
2427 /*
2428 * Update our in-core representation with the definitive values
2429 * from the label.
2430 */
2431 nvlist_free(spa->spa_label_features);
2432 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2433 }
2434
2435 nvlist_free(label);
2436
2437 /*
2438 * Look through entries in the label nvlist's features_for_read. If
2439 * there is a feature listed there which we don't understand, then we
2440 * cannot open the pool.
2441 */
2442 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2443 nvlist_t *unsup_feat;
2444 nvpair_t *nvp;
2445
2446 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2447 0);
2448
2449 for (nvp = nvlist_next_nvpair(spa->spa_label_features, NULL);
2450 nvp != NULL;
2451 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2452 if (!zfeature_is_supported(nvpair_name(nvp))) {
2453 VERIFY(nvlist_add_string(unsup_feat,
2454 nvpair_name(nvp), "") == 0);
2455 }
2456 }
2457
2458 if (!nvlist_empty(unsup_feat)) {
2459 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2460 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2461 nvlist_free(unsup_feat);
2462 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2463 ENOTSUP));
2464 }
2465
2466 nvlist_free(unsup_feat);
2467 }
2468
2469 /*
2470 * If the vdev guid sum doesn't match the uberblock, we have an
2471 * incomplete configuration. We first check to see if the pool
2472 * is aware of the complete config (i.e., ZPOOL_CONFIG_VDEV_CHILDREN).
2473 * If it is, defer the vdev_guid_sum check until later so we
2474 * can handle missing vdevs.
2475 */
2476 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2477 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2478 rvd->vdev_guid_sum != ub->ub_guid_sum)
2479 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2480
2481 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2482 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2483 spa_try_repair(spa, config);
2484 spa_config_exit(spa, SCL_ALL, FTAG);
2485 nvlist_free(spa->spa_config_splitting);
2486 spa->spa_config_splitting = NULL;
2487 }
2488
2489 /*
2490 * Initialize internal SPA structures.
2491 */
2492 spa->spa_state = POOL_STATE_ACTIVE;
2493 spa->spa_ubsync = spa->spa_uberblock;
2494 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2495 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2496 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2497 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2498 spa->spa_claim_max_txg = spa->spa_first_txg;
2499 spa->spa_prev_software_version = ub->ub_software_version;
2500
2501 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2502 if (error)
2503 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2504 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2505
2506 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2507 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2508
2509 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2510 boolean_t missing_feat_read = B_FALSE;
2511 nvlist_t *unsup_feat, *enabled_feat;
2512 spa_feature_t i;
2513
2514 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2515 &spa->spa_feat_for_read_obj) != 0) {
2516 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2517 }
2518
2519 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2520 &spa->spa_feat_for_write_obj) != 0) {
2521 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2522 }
2523
2524 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2525 &spa->spa_feat_desc_obj) != 0) {
2526 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2527 }
2528
2529 enabled_feat = fnvlist_alloc();
2530 unsup_feat = fnvlist_alloc();
2531
2532 if (!spa_features_check(spa, B_FALSE,
2533 unsup_feat, enabled_feat))
2534 missing_feat_read = B_TRUE;
2535
2536 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2537 if (!spa_features_check(spa, B_TRUE,
2538 unsup_feat, enabled_feat)) {
2539 missing_feat_write = B_TRUE;
2540 }
2541 }
2542
2543 fnvlist_add_nvlist(spa->spa_load_info,
2544 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2545
2546 if (!nvlist_empty(unsup_feat)) {
2547 fnvlist_add_nvlist(spa->spa_load_info,
2548 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2549 }
2550
2551 fnvlist_free(enabled_feat);
2552 fnvlist_free(unsup_feat);
2553
2554 if (!missing_feat_read) {
2555 fnvlist_add_boolean(spa->spa_load_info,
2556 ZPOOL_CONFIG_CAN_RDONLY);
2557 }
2558
2559 /*
2560 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2561 * twofold: to determine whether the pool is available for
2562 * import in read-write mode and (if it is not) whether the
2563 * pool is available for import in read-only mode. If the pool
2564 * is available for import in read-write mode, it is displayed
2565 * as available in userland; if it is not available for import
2566 * in read-only mode, it is displayed as unavailable in
2567 * userland. If the pool is available for import in read-only
2568 * mode but not read-write mode, it is displayed as unavailable
2569 * in userland with a special note that the pool is actually
2570 * available for open in read-only mode.
2571 *
2572 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2573 * missing a feature for write, we must first determine whether
2574 * the pool can be opened read-only before returning to
2575 * userland in order to know whether to display the
2576 * abovementioned note.
2577 */
2578 if (missing_feat_read || (missing_feat_write &&
2579 spa_writeable(spa))) {
2580 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2581 ENOTSUP));
2582 }
2583
2584 /*
2585 * Load refcounts for ZFS features from disk into an in-memory
2586 * cache during SPA initialization.
2587 */
2588 for (i = 0; i < SPA_FEATURES; i++) {
2589 uint64_t refcount;
2590
2591 error = feature_get_refcount_from_disk(spa,
2592 &spa_feature_table[i], &refcount);
2593 if (error == 0) {
2594 spa->spa_feat_refcount_cache[i] = refcount;
2595 } else if (error == ENOTSUP) {
2596 spa->spa_feat_refcount_cache[i] =
2597 SPA_FEATURE_DISABLED;
2598 } else {
2599 return (spa_vdev_err(rvd,
2600 VDEV_AUX_CORRUPT_DATA, EIO));
2601 }
2602 }
2603 }
2604
2605 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
2606 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
2607 &spa->spa_feat_enabled_txg_obj) != 0)
2608 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2609 }
2610
2611 spa->spa_is_initializing = B_TRUE;
2612 error = dsl_pool_open(spa->spa_dsl_pool);
2613 spa->spa_is_initializing = B_FALSE;
2614 if (error != 0)
2615 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2616
2617 if (!mosconfig) {
2618 uint64_t hostid;
2619 nvlist_t *policy = NULL, *nvconfig;
2620
2621 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2622 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2623
2624 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2625 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2626 char *hostname;
2627 unsigned long myhostid = 0;
2628
2629 VERIFY(nvlist_lookup_string(nvconfig,
2630 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2631
2632 #ifdef _KERNEL
2633 myhostid = zone_get_hostid(NULL);
2634 #else /* _KERNEL */
2635 /*
2636 * We're emulating the system's hostid in userland, so
2637 * we can't use zone_get_hostid().
2638 */
2639 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2640 #endif /* _KERNEL */
2641 if (hostid != 0 && myhostid != 0 &&
2642 hostid != myhostid) {
2643 nvlist_free(nvconfig);
2644 cmn_err(CE_WARN, "pool '%s' could not be "
2645 "loaded as it was last accessed by another "
2646 "system (host: %s hostid: 0x%lx). See: "
2647 "http://zfsonlinux.org/msg/ZFS-8000-EY",
2648 spa_name(spa), hostname,
2649 (unsigned long)hostid);
2650 return (SET_ERROR(EBADF));
2651 }
2652 }
2653 if (nvlist_lookup_nvlist(spa->spa_config,
2654 ZPOOL_REWIND_POLICY, &policy) == 0)
2655 VERIFY(nvlist_add_nvlist(nvconfig,
2656 ZPOOL_REWIND_POLICY, policy) == 0);
2657
2658 spa_config_set(spa, nvconfig);
2659 spa_unload(spa);
2660 spa_deactivate(spa);
2661 spa_activate(spa, orig_mode);
2662
2663 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2664 }
2665
2666 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2667 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2668 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2669 if (error != 0)
2670 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2671
2672 /*
2673 * Load the bit that tells us to use the new accounting function
2674 * (raid-z deflation). If we have an older pool, this will not
2675 * be present.
2676 */
2677 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2678 if (error != 0 && error != ENOENT)
2679 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2680
2681 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2682 &spa->spa_creation_version);
2683 if (error != 0 && error != ENOENT)
2684 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2685
2686 /*
2687 * Load the persistent error log. If we have an older pool, this will
2688 * not be present.
2689 */
2690 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2691 if (error != 0 && error != ENOENT)
2692 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2693
2694 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2695 &spa->spa_errlog_scrub);
2696 if (error != 0 && error != ENOENT)
2697 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2698
2699 /*
2700 * Load the history object. If we have an older pool, this
2701 * will not be present.
2702 */
2703 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2704 if (error != 0 && error != ENOENT)
2705 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2706
2707 /*
2708 * Load the per-vdev ZAP map. If we have an older pool, this will not
2709 * be present; in this case, defer its creation to a later time to
2710 * avoid dirtying the MOS this early (i.e., out of sync context). See
2711 * spa_sync_config_object.
2712 */
2713
2714 /* The sentinel is only available in the MOS config. */
2715 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
2716 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2717
2718 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
2719 &spa->spa_all_vdev_zaps);
2720
2721 if (error != ENOENT && error != 0) {
2722 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2723 } else if (error == 0 && !nvlist_exists(mos_config,
2724 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
2725 /*
2726 * An older version of ZFS overwrote the sentinel value, so
2727 * we have orphaned per-vdev ZAPs in the MOS. Defer their
2728 * destruction to later; see spa_sync_config_object.
2729 */
2730 spa->spa_avz_action = AVZ_ACTION_DESTROY;
2731 /*
2732 * We're assuming that no vdevs have had their ZAPs created
2733 * before this. Better be sure of it.
2734 */
2735 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
2736 }
2737 nvlist_free(mos_config);
2738
2739 /*
2740 * If we're assembling the pool from the split-off vdevs of
2741 * an existing pool, we don't want to attach the spares & cache
2742 * devices.
2743 */
2744
2745 /*
2746 * Load any hot spares for this pool.
2747 */
2748 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2749 if (error != 0 && error != ENOENT)
2750 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2751 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2752 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2753 if (load_nvlist(spa, spa->spa_spares.sav_object,
2754 &spa->spa_spares.sav_config) != 0)
2755 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2756
2757 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2758 spa_load_spares(spa);
2759 spa_config_exit(spa, SCL_ALL, FTAG);
2760 } else if (error == 0) {
2761 spa->spa_spares.sav_sync = B_TRUE;
2762 }
2763
2764 /*
2765 * Load any level 2 ARC devices for this pool.
2766 */
2767 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2768 &spa->spa_l2cache.sav_object);
2769 if (error != 0 && error != ENOENT)
2770 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2771 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2772 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2773 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2774 &spa->spa_l2cache.sav_config) != 0)
2775 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2776
2777 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2778 spa_load_l2cache(spa);
2779 spa_config_exit(spa, SCL_ALL, FTAG);
2780 } else if (error == 0) {
2781 spa->spa_l2cache.sav_sync = B_TRUE;
2782 }
2783
2784 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2785
2786 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2787 if (error && error != ENOENT)
2788 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2789
2790 if (error == 0) {
2791 uint64_t autoreplace = 0;
2792
2793 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2794 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2795 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2796 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2797 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2798 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2799 &spa->spa_dedup_ditto);
2800
2801 spa->spa_autoreplace = (autoreplace != 0);
2802 }
2803
2804 /*
2805 * If the 'autoreplace' property is set, then post a resource notifying
2806 * the ZFS DE that it should not issue any faults for unopenable
2807 * devices. We also iterate over the vdevs, and post a sysevent for any
2808 * unopenable vdevs so that the normal autoreplace handler can take
2809 * over.
2810 */
2811 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2812 spa_check_removed(spa->spa_root_vdev);
2813 /*
2814 * For the import case, this is done in spa_import(), because
2815 * at this point we're using the spare definitions from
2816 * the MOS config, not necessarily from the userland config.
2817 */
2818 if (state != SPA_LOAD_IMPORT) {
2819 spa_aux_check_removed(&spa->spa_spares);
2820 spa_aux_check_removed(&spa->spa_l2cache);
2821 }
2822 }
2823
2824 /*
2825 * Load the vdev state for all toplevel vdevs.
2826 */
2827 vdev_load(rvd);
2828
2829 /*
2830 * Propagate the leaf DTLs we just loaded all the way up the tree.
2831 */
2832 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2833 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2834 spa_config_exit(spa, SCL_ALL, FTAG);
2835
2836 /*
2837 * Load the DDTs (dedup tables).
2838 */
2839 error = ddt_load(spa);
2840 if (error != 0)
2841 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2842
2843 spa_update_dspace(spa);
2844
2845 /*
2846 * Validate the config, using the MOS config to fill in any
2847 * information which might be missing. If we fail to validate
2848 * the config then declare the pool unfit for use. If we're
2849 * assembling a pool from a split, the log is not transferred
2850 * over.
2851 */
2852 if (type != SPA_IMPORT_ASSEMBLE) {
2853 nvlist_t *nvconfig;
2854
2855 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2856 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2857
2858 if (!spa_config_valid(spa, nvconfig)) {
2859 nvlist_free(nvconfig);
2860 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2861 ENXIO));
2862 }
2863 nvlist_free(nvconfig);
2864
2865 /*
2866 * Now that we've validated the config, check the state of the
2867 * root vdev. If it can't be opened, it indicates one or
2868 * more toplevel vdevs are faulted.
2869 */
2870 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2871 return (SET_ERROR(ENXIO));
2872
2873 if (spa_writeable(spa) && spa_check_logs(spa)) {
2874 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2875 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2876 }
2877 }
2878
2879 if (missing_feat_write) {
2880 ASSERT(state == SPA_LOAD_TRYIMPORT);
2881
2882 /*
2883 * At this point, we know that we can open the pool in
2884 * read-only mode but not read-write mode. We now have enough
2885 * information and can return to userland.
2886 */
2887 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2888 }
2889
2890 /*
2891 * We've successfully opened the pool, verify that we're ready
2892 * to start pushing transactions.
2893 */
2894 if (state != SPA_LOAD_TRYIMPORT) {
2895 if ((error = spa_load_verify(spa)))
2896 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2897 error));
2898 }
2899
2900 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2901 spa->spa_load_max_txg == UINT64_MAX)) {
2902 dmu_tx_t *tx;
2903 int need_update = B_FALSE;
2904 dsl_pool_t *dp = spa_get_dsl(spa);
2905 int c;
2906
2907 ASSERT(state != SPA_LOAD_TRYIMPORT);
2908
2909 /*
2910 * Claim log blocks that haven't been committed yet.
2911 * This must all happen in a single txg.
2912 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2913 * invoked from zil_claim_log_block()'s i/o done callback.
2914 * Price of rollback is that we abandon the log.
2915 */
2916 spa->spa_claiming = B_TRUE;
2917
2918 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
2919 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2920 zil_claim, tx, DS_FIND_CHILDREN);
2921 dmu_tx_commit(tx);
2922
2923 spa->spa_claiming = B_FALSE;
2924
2925 spa_set_log_state(spa, SPA_LOG_GOOD);
2926 spa->spa_sync_on = B_TRUE;
2927 txg_sync_start(spa->spa_dsl_pool);
2928
2929 /*
2930 * Wait for all claims to sync. We sync up to the highest
2931 * claimed log block birth time so that claimed log blocks
2932 * don't appear to be from the future. spa_claim_max_txg
2933 * will have been set for us by either zil_check_log_chain()
2934 * (invoked from spa_check_logs()) or zil_claim() above.
2935 */
2936 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2937
2938 /*
2939 * If the config cache is stale, or we have uninitialized
2940 * metaslabs (see spa_vdev_add()), then update the config.
2941 *
2942 * If this is a verbatim import, trust the current
2943 * in-core spa_config and update the disk labels.
2944 */
2945 if (config_cache_txg != spa->spa_config_txg ||
2946 state == SPA_LOAD_IMPORT ||
2947 state == SPA_LOAD_RECOVER ||
2948 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2949 need_update = B_TRUE;
2950
2951 for (c = 0; c < rvd->vdev_children; c++)
2952 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2953 need_update = B_TRUE;
2954
2955 /*
2956 * Update the config cache asynchronously in case we're the
2957 * root pool, in which case the config cache isn't writable yet.
2958 */
2959 if (need_update)
2960 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2961
2962 /*
2963 * Check all DTLs to see if anything needs resilvering.
2964 */
2965 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2966 vdev_resilver_needed(rvd, NULL, NULL))
2967 spa_async_request(spa, SPA_ASYNC_RESILVER);
2968
2969 /*
2970 * Log the fact that we booted up (so that we can detect if
2971 * we rebooted in the middle of an operation).
2972 */
2973 spa_history_log_version(spa, "open");
2974
2975 /*
2976 * Delete any inconsistent datasets.
2977 */
2978 (void) dmu_objset_find(spa_name(spa),
2979 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2980
2981 /*
2982 * Clean up any stale temporary dataset userrefs.
2983 */
2984 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2985 }
2986
2987 return (0);
2988 }
2989
2990 static int
2991 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2992 {
2993 int mode = spa->spa_mode;
2994
2995 spa_unload(spa);
2996 spa_deactivate(spa);
2997
2998 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
2999
3000 spa_activate(spa, mode);
3001 spa_async_suspend(spa);
3002
3003 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
3004 }
3005
3006 /*
3007 * If spa_load() fails this function will try loading prior txg's. If
3008 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
3009 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
3010 * function will not rewind the pool and will return the same error as
3011 * spa_load().
3012 */
3013 static int
3014 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
3015 uint64_t max_request, int rewind_flags)
3016 {
3017 nvlist_t *loadinfo = NULL;
3018 nvlist_t *config = NULL;
3019 int load_error, rewind_error;
3020 uint64_t safe_rewind_txg;
3021 uint64_t min_txg;
3022
3023 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
3024 spa->spa_load_max_txg = spa->spa_load_txg;
3025 spa_set_log_state(spa, SPA_LOG_CLEAR);
3026 } else {
3027 spa->spa_load_max_txg = max_request;
3028 if (max_request != UINT64_MAX)
3029 spa->spa_extreme_rewind = B_TRUE;
3030 }
3031
3032 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
3033 mosconfig);
3034 if (load_error == 0)
3035 return (0);
3036
3037 if (spa->spa_root_vdev != NULL)
3038 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3039
3040 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
3041 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
3042
3043 if (rewind_flags & ZPOOL_NEVER_REWIND) {
3044 nvlist_free(config);
3045 return (load_error);
3046 }
3047
3048 if (state == SPA_LOAD_RECOVER) {
3049 /* Price of rolling back is discarding txgs, including log */
3050 spa_set_log_state(spa, SPA_LOG_CLEAR);
3051 } else {
3052 /*
3053 * If we aren't rolling back save the load info from our first
3054 * import attempt so that we can restore it after attempting
3055 * to rewind.
3056 */
3057 loadinfo = spa->spa_load_info;
3058 spa->spa_load_info = fnvlist_alloc();
3059 }
3060
3061 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
3062 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
3063 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
3064 TXG_INITIAL : safe_rewind_txg;
3065
3066 /*
3067 * Continue as long as we're finding errors, we're still within
3068 * the acceptable rewind range, and we're still finding uberblocks
3069 */
3070 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
3071 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
3072 if (spa->spa_load_max_txg < safe_rewind_txg)
3073 spa->spa_extreme_rewind = B_TRUE;
3074 rewind_error = spa_load_retry(spa, state, mosconfig);
3075 }
3076
3077 spa->spa_extreme_rewind = B_FALSE;
3078 spa->spa_load_max_txg = UINT64_MAX;
3079
3080 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
3081 spa_config_set(spa, config);
3082
3083 if (state == SPA_LOAD_RECOVER) {
3084 ASSERT3P(loadinfo, ==, NULL);
3085 return (rewind_error);
3086 } else {
3087 /* Store the rewind info as part of the initial load info */
3088 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
3089 spa->spa_load_info);
3090
3091 /* Restore the initial load info */
3092 fnvlist_free(spa->spa_load_info);
3093 spa->spa_load_info = loadinfo;
3094
3095 return (load_error);
3096 }
3097 }
3098
3099 /*
3100 * Pool Open/Import
3101 *
3102 * The import case is identical to an open except that the configuration is sent
3103 * down from userland, instead of grabbed from the configuration cache. For the
3104 * case of an open, the pool configuration will exist in the
3105 * POOL_STATE_UNINITIALIZED state.
3106 *
3107 * The stats information (gen/count/ustats) is used to gather vdev statistics at
3108 * the same time we open the pool, without having to keep around the spa_t in some
3109 * ambiguous state.
3110 */
3111 static int
3112 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
3113 nvlist_t **config)
3114 {
3115 spa_t *spa;
3116 spa_load_state_t state = SPA_LOAD_OPEN;
3117 int error;
3118 int locked = B_FALSE;
3119 int firstopen = B_FALSE;
3120
3121 *spapp = NULL;
3122
3123 /*
3124 * As disgusting as this is, we need to support recursive calls to this
3125 * function because dsl_dir_open() is called during spa_load(), and ends
3126 * up calling spa_open() again. The real fix is to figure out how to
3127 * avoid dsl_dir_open() calling this in the first place.
3128 */
3129 if (mutex_owner(&spa_namespace_lock) != curthread) {
3130 mutex_enter(&spa_namespace_lock);
3131 locked = B_TRUE;
3132 }
3133
3134 if ((spa = spa_lookup(pool)) == NULL) {
3135 if (locked)
3136 mutex_exit(&spa_namespace_lock);
3137 return (SET_ERROR(ENOENT));
3138 }
3139
3140 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
3141 zpool_rewind_policy_t policy;
3142
3143 firstopen = B_TRUE;
3144
3145 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
3146 &policy);
3147 if (policy.zrp_request & ZPOOL_DO_REWIND)
3148 state = SPA_LOAD_RECOVER;
3149
3150 spa_activate(spa, spa_mode_global);
3151
3152 if (state != SPA_LOAD_RECOVER)
3153 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3154
3155 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
3156 policy.zrp_request);
3157
3158 if (error == EBADF) {
3159 /*
3160 * If vdev_validate() returns failure (indicated by
3161 * EBADF), it means that one of the vdevs indicates
3162 * that the pool has been exported or destroyed. If
3163 * this is the case, the config cache is out of sync and
3164 * we should remove the pool from the namespace.
3165 */
3166 spa_unload(spa);
3167 spa_deactivate(spa);
3168 spa_config_sync(spa, B_TRUE, B_TRUE);
3169 spa_remove(spa);
3170 if (locked)
3171 mutex_exit(&spa_namespace_lock);
3172 return (SET_ERROR(ENOENT));
3173 }
3174
3175 if (error) {
3176 /*
3177 * We can't open the pool, but we still have useful
3178 * information: the state of each vdev after the
3179 * attempted vdev_open(). Return this to the user.
3180 */
3181 if (config != NULL && spa->spa_config) {
3182 VERIFY(nvlist_dup(spa->spa_config, config,
3183 KM_SLEEP) == 0);
3184 VERIFY(nvlist_add_nvlist(*config,
3185 ZPOOL_CONFIG_LOAD_INFO,
3186 spa->spa_load_info) == 0);
3187 }
3188 spa_unload(spa);
3189 spa_deactivate(spa);
3190 spa->spa_last_open_failed = error;
3191 if (locked)
3192 mutex_exit(&spa_namespace_lock);
3193 *spapp = NULL;
3194 return (error);
3195 }
3196 }
3197
3198 spa_open_ref(spa, tag);
3199
3200 if (config != NULL)
3201 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3202
3203 /*
3204 * If we've recovered the pool, pass back any information we
3205 * gathered while doing the load.
3206 */
3207 if (state == SPA_LOAD_RECOVER) {
3208 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
3209 spa->spa_load_info) == 0);
3210 }
3211
3212 if (locked) {
3213 spa->spa_last_open_failed = 0;
3214 spa->spa_last_ubsync_txg = 0;
3215 spa->spa_load_txg = 0;
3216 mutex_exit(&spa_namespace_lock);
3217 }
3218
3219 if (firstopen)
3220 zvol_create_minors(spa, spa_name(spa), B_TRUE);
3221
3222 *spapp = spa;
3223
3224 return (0);
3225 }
3226
3227 int
3228 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
3229 nvlist_t **config)
3230 {
3231 return (spa_open_common(name, spapp, tag, policy, config));
3232 }
3233
3234 int
3235 spa_open(const char *name, spa_t **spapp, void *tag)
3236 {
3237 return (spa_open_common(name, spapp, tag, NULL, NULL));
3238 }
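
/*
 * A rough sketch of the usual caller pattern for spa_open()/spa_close();
 * the pool name here is hypothetical and FTAG is the conventional
 * per-function tag used throughout this file:
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open("tank", &spa, FTAG)) != 0)
 *		return (error);
 *	... use the pool while the reference is held ...
 *	spa_close(spa, FTAG);
 */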
3239
3240 /*
3241 * Lookup the given spa_t, incrementing the inject count in the process,
3242 * preventing it from being exported or destroyed.
3243 */
3244 spa_t *
3245 spa_inject_addref(char *name)
3246 {
3247 spa_t *spa;
3248
3249 mutex_enter(&spa_namespace_lock);
3250 if ((spa = spa_lookup(name)) == NULL) {
3251 mutex_exit(&spa_namespace_lock);
3252 return (NULL);
3253 }
3254 spa->spa_inject_ref++;
3255 mutex_exit(&spa_namespace_lock);
3256
3257 return (spa);
3258 }
3259
3260 void
3261 spa_inject_delref(spa_t *spa)
3262 {
3263 mutex_enter(&spa_namespace_lock);
3264 spa->spa_inject_ref--;
3265 mutex_exit(&spa_namespace_lock);
3266 }
3267
3268 /*
3269 * Add spares device information to the nvlist.
3270 */
3271 static void
3272 spa_add_spares(spa_t *spa, nvlist_t *config)
3273 {
3274 nvlist_t **spares;
3275 uint_t i, nspares;
3276 nvlist_t *nvroot;
3277 uint64_t guid;
3278 vdev_stat_t *vs;
3279 uint_t vsc;
3280 uint64_t pool;
3281
3282 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3283
3284 if (spa->spa_spares.sav_count == 0)
3285 return;
3286
3287 VERIFY(nvlist_lookup_nvlist(config,
3288 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3289 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3290 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3291 if (nspares != 0) {
3292 VERIFY(nvlist_add_nvlist_array(nvroot,
3293 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3294 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3295 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3296
3297 /*
3298 * Go through and find any spares which have since been
3299 * repurposed as active spares. If this is the case, update
3300 * their status appropriately.
3301 */
3302 for (i = 0; i < nspares; i++) {
3303 VERIFY(nvlist_lookup_uint64(spares[i],
3304 ZPOOL_CONFIG_GUID, &guid) == 0);
3305 if (spa_spare_exists(guid, &pool, NULL) &&
3306 pool != 0ULL) {
3307 VERIFY(nvlist_lookup_uint64_array(
3308 spares[i], ZPOOL_CONFIG_VDEV_STATS,
3309 (uint64_t **)&vs, &vsc) == 0);
3310 vs->vs_state = VDEV_STATE_CANT_OPEN;
3311 vs->vs_aux = VDEV_AUX_SPARED;
3312 }
3313 }
3314 }
3315 }
3316
3317 /*
3318 * Add l2cache device information to the nvlist, including vdev stats.
3319 */
3320 static void
3321 spa_add_l2cache(spa_t *spa, nvlist_t *config)
3322 {
3323 nvlist_t **l2cache;
3324 uint_t i, j, nl2cache;
3325 nvlist_t *nvroot;
3326 uint64_t guid;
3327 vdev_t *vd;
3328 vdev_stat_t *vs;
3329 uint_t vsc;
3330
3331 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3332
3333 if (spa->spa_l2cache.sav_count == 0)
3334 return;
3335
3336 VERIFY(nvlist_lookup_nvlist(config,
3337 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3338 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3339 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3340 if (nl2cache != 0) {
3341 VERIFY(nvlist_add_nvlist_array(nvroot,
3342 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3343 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3344 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3345
3346 /*
3347 * Update level 2 cache device stats.
3348 */
3349
3350 for (i = 0; i < nl2cache; i++) {
3351 VERIFY(nvlist_lookup_uint64(l2cache[i],
3352 ZPOOL_CONFIG_GUID, &guid) == 0);
3353
3354 vd = NULL;
3355 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3356 if (guid ==
3357 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3358 vd = spa->spa_l2cache.sav_vdevs[j];
3359 break;
3360 }
3361 }
3362 ASSERT(vd != NULL);
3363
3364 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3365 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3366 == 0);
3367 vdev_get_stats(vd, vs);
3368 vdev_config_generate_stats(vd, l2cache[i]);
3369
3370 }
3371 }
3372 }
3373
3374 static void
3375 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
3376 {
3377 zap_cursor_t zc;
3378 zap_attribute_t za;
3379
3380 if (spa->spa_feat_for_read_obj != 0) {
3381 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3382 spa->spa_feat_for_read_obj);
3383 zap_cursor_retrieve(&zc, &za) == 0;
3384 zap_cursor_advance(&zc)) {
3385 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3386 za.za_num_integers == 1);
3387 VERIFY0(nvlist_add_uint64(features, za.za_name,
3388 za.za_first_integer));
3389 }
3390 zap_cursor_fini(&zc);
3391 }
3392
3393 if (spa->spa_feat_for_write_obj != 0) {
3394 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3395 spa->spa_feat_for_write_obj);
3396 zap_cursor_retrieve(&zc, &za) == 0;
3397 zap_cursor_advance(&zc)) {
3398 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3399 za.za_num_integers == 1);
3400 VERIFY0(nvlist_add_uint64(features, za.za_name,
3401 za.za_first_integer));
3402 }
3403 zap_cursor_fini(&zc);
3404 }
3405 }
3406
3407 static void
3408 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
3409 {
3410 int i;
3411
3412 for (i = 0; i < SPA_FEATURES; i++) {
3413 zfeature_info_t feature = spa_feature_table[i];
3414 uint64_t refcount;
3415
3416 if (feature_get_refcount(spa, &feature, &refcount) != 0)
3417 continue;
3418
3419 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
3420 }
3421 }
3422
3423 /*
3424 * Store a list of pool features and their reference counts in the
3425 * config.
3426 *
3427 * The first time this is called on a spa, allocate a new nvlist, fetch
3428 * the pool features and reference counts from disk, then save the list
3429 * in the spa. In subsequent calls on the same spa use the saved nvlist
3430 * and refresh its values from the cached reference counts. This
3431 * ensures we don't block here on I/O on a suspended pool so 'zpool
3432 * clear' can resume the pool.
3433 */
3434 static void
3435 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3436 {
3437 nvlist_t *features;
3438
3439 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3440
3441 mutex_enter(&spa->spa_feat_stats_lock);
3442 features = spa->spa_feat_stats;
3443
3444 if (features != NULL) {
3445 spa_feature_stats_from_cache(spa, features);
3446 } else {
3447 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
3448 spa->spa_feat_stats = features;
3449 spa_feature_stats_from_disk(spa, features);
3450 }
3451
3452 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3453 features));
3454
3455 mutex_exit(&spa->spa_feat_stats_lock);
3456 }
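
/*
 * A rough sketch of how a consumer of the generated config might read the
 * stats added above; the feature guid is just an example, and a refcount
 * greater than zero indicates the feature is active (zero means merely
 * enabled):
 *
 *	nvlist_t *feat;
 *	uint64_t refcount;
 *
 *	feat = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS);
 *	if (nvlist_lookup_uint64(feat, "org.zfsonlinux:large_dnode",
 *	    &refcount) == 0 && refcount > 0)
 *		... the feature is in active use on this pool ...
 */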
3457
3458 int
3459 spa_get_stats(const char *name, nvlist_t **config,
3460 char *altroot, size_t buflen)
3461 {
3462 int error;
3463 spa_t *spa;
3464
3465 *config = NULL;
3466 error = spa_open_common(name, &spa, FTAG, NULL, config);
3467
3468 if (spa != NULL) {
3469 /*
3470 * This still leaves a window of inconsistency where the spares
3471 * or l2cache devices could change and the config would be
3472 * self-inconsistent.
3473 */
3474 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3475
3476 if (*config != NULL) {
3477 uint64_t loadtimes[2];
3478
3479 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3480 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3481 VERIFY(nvlist_add_uint64_array(*config,
3482 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3483
3484 VERIFY(nvlist_add_uint64(*config,
3485 ZPOOL_CONFIG_ERRCOUNT,
3486 spa_get_errlog_size(spa)) == 0);
3487
3488 if (spa_suspended(spa))
3489 VERIFY(nvlist_add_uint64(*config,
3490 ZPOOL_CONFIG_SUSPENDED,
3491 spa->spa_failmode) == 0);
3492
3493 spa_add_spares(spa, *config);
3494 spa_add_l2cache(spa, *config);
3495 spa_add_feature_stats(spa, *config);
3496 }
3497 }
3498
3499 /*
3500 * We want to get the alternate root even for faulted pools, so we cheat
3501 * and call spa_lookup() directly.
3502 */
3503 if (altroot) {
3504 if (spa == NULL) {
3505 mutex_enter(&spa_namespace_lock);
3506 spa = spa_lookup(name);
3507 if (spa)
3508 spa_altroot(spa, altroot, buflen);
3509 else
3510 altroot[0] = '\0';
3511 spa = NULL;
3512 mutex_exit(&spa_namespace_lock);
3513 } else {
3514 spa_altroot(spa, altroot, buflen);
3515 }
3516 }
3517
3518 if (spa != NULL) {
3519 spa_config_exit(spa, SCL_CONFIG, FTAG);
3520 spa_close(spa, FTAG);
3521 }
3522
3523 return (error);
3524 }
3525
3526 /*
3527 * Validate that the auxiliary device array is well formed. We must have an
3528 * array of nvlists, each of which describes a valid leaf vdev. If this is an
3529 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3530 * specified, as long as they are well-formed.
3531 */
3532 static int
3533 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3534 spa_aux_vdev_t *sav, const char *config, uint64_t version,
3535 vdev_labeltype_t label)
3536 {
3537 nvlist_t **dev;
3538 uint_t i, ndev;
3539 vdev_t *vd;
3540 int error;
3541
3542 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3543
3544 /*
3545 * It's acceptable to have no devs specified.
3546 */
3547 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3548 return (0);
3549
3550 if (ndev == 0)
3551 return (SET_ERROR(EINVAL));
3552
3553 /*
3554 * Make sure the pool is formatted with a version that supports this
3555 * device type.
3556 */
3557 if (spa_version(spa) < version)
3558 return (SET_ERROR(ENOTSUP));
3559
3560 /*
3561 * Set the pending device list so we correctly handle device in-use
3562 * checking.
3563 */
3564 sav->sav_pending = dev;
3565 sav->sav_npending = ndev;
3566
3567 for (i = 0; i < ndev; i++) {
3568 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3569 mode)) != 0)
3570 goto out;
3571
3572 if (!vd->vdev_ops->vdev_op_leaf) {
3573 vdev_free(vd);
3574 error = SET_ERROR(EINVAL);
3575 goto out;
3576 }
3577
3578 /*
3579 * The L2ARC currently only supports disk devices in
3580 * kernel context. For user-level testing, we allow any device type.
3581 */
3582 #ifdef _KERNEL
3583 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3584 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3585 error = SET_ERROR(ENOTBLK);
3586 vdev_free(vd);
3587 goto out;
3588 }
3589 #endif
3590 vd->vdev_top = vd;
3591
3592 if ((error = vdev_open(vd)) == 0 &&
3593 (error = vdev_label_init(vd, crtxg, label)) == 0) {
3594 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3595 vd->vdev_guid) == 0);
3596 }
3597
3598 vdev_free(vd);
3599
3600 if (error &&
3601 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3602 goto out;
3603 else
3604 error = 0;
3605 }
3606
3607 out:
3608 sav->sav_pending = NULL;
3609 sav->sav_npending = 0;
3610 return (error);
3611 }
3612
3613 static int
3614 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3615 {
3616 int error;
3617
3618 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3619
3620 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3621 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3622 VDEV_LABEL_SPARE)) != 0) {
3623 return (error);
3624 }
3625
3626 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3627 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3628 VDEV_LABEL_L2CACHE));
3629 }
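
/*
 * Roughly, the aux portion of an nvroot validated by the two calls above
 * could be assembled as follows (the device path is illustrative only):
 *
 *	nvlist_t *spare = fnvlist_alloc();
 *
 *	fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
 *	fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, "/dev/sdX");
 *	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spare, 1);
 *	fnvlist_free(spare);
 */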
3630
3631 static void
3632 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3633 const char *config)
3634 {
3635 int i;
3636
3637 if (sav->sav_config != NULL) {
3638 nvlist_t **olddevs;
3639 uint_t oldndevs;
3640 nvlist_t **newdevs;
3641
3642 /*
3643 * Generate new dev list by concatenating with the
3644 * current dev list.
3645 */
3646 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3647 &olddevs, &oldndevs) == 0);
3648
3649 newdevs = kmem_alloc(sizeof (void *) *
3650 (ndevs + oldndevs), KM_SLEEP);
3651 for (i = 0; i < oldndevs; i++)
3652 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3653 KM_SLEEP) == 0);
3654 for (i = 0; i < ndevs; i++)
3655 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3656 KM_SLEEP) == 0);
3657
3658 VERIFY(nvlist_remove(sav->sav_config, config,
3659 DATA_TYPE_NVLIST_ARRAY) == 0);
3660
3661 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3662 config, newdevs, ndevs + oldndevs) == 0);
3663 for (i = 0; i < oldndevs + ndevs; i++)
3664 nvlist_free(newdevs[i]);
3665 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3666 } else {
3667 /*
3668 * Generate a new dev list.
3669 */
3670 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3671 KM_SLEEP) == 0);
3672 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3673 devs, ndevs) == 0);
3674 }
3675 }
3676
3677 /*
3678 * Stop and drop level 2 ARC devices
3679 */
3680 void
3681 spa_l2cache_drop(spa_t *spa)
3682 {
3683 vdev_t *vd;
3684 int i;
3685 spa_aux_vdev_t *sav = &spa->spa_l2cache;
3686
3687 for (i = 0; i < sav->sav_count; i++) {
3688 uint64_t pool;
3689
3690 vd = sav->sav_vdevs[i];
3691 ASSERT(vd != NULL);
3692
3693 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3694 pool != 0ULL && l2arc_vdev_present(vd))
3695 l2arc_remove_vdev(vd);
3696 }
3697 }
3698
3699 /*
3700 * Pool Creation
3701 */
3702 int
3703 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3704 nvlist_t *zplprops)
3705 {
3706 spa_t *spa;
3707 char *altroot = NULL;
3708 vdev_t *rvd;
3709 dsl_pool_t *dp;
3710 dmu_tx_t *tx;
3711 int error = 0;
3712 uint64_t txg = TXG_INITIAL;
3713 nvlist_t **spares, **l2cache;
3714 uint_t nspares, nl2cache;
3715 uint64_t version, obj;
3716 boolean_t has_features;
3717 nvpair_t *elem;
3718 int c, i;
3719 char *poolname;
3720 nvlist_t *nvl;
3721
3722 if (nvlist_lookup_string(props, "tname", &poolname) != 0)
3723 poolname = (char *)pool;
3724
3725 /*
3726 * If this pool already exists, return failure.
3727 */
3728 mutex_enter(&spa_namespace_lock);
3729 if (spa_lookup(poolname) != NULL) {
3730 mutex_exit(&spa_namespace_lock);
3731 return (SET_ERROR(EEXIST));
3732 }
3733
3734 /*
3735 * Allocate a new spa_t structure.
3736 */
3737 nvl = fnvlist_alloc();
3738 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
3739 (void) nvlist_lookup_string(props,
3740 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3741 spa = spa_add(poolname, nvl, altroot);
3742 fnvlist_free(nvl);
3743 spa_activate(spa, spa_mode_global);
3744
3745 if (props && (error = spa_prop_validate(spa, props))) {
3746 spa_deactivate(spa);
3747 spa_remove(spa);
3748 mutex_exit(&spa_namespace_lock);
3749 return (error);
3750 }
3751
3752 /*
3753 * Temporary pool names should never be written to disk.
3754 */
3755 if (poolname != pool)
3756 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
3757
3758 has_features = B_FALSE;
3759 for (elem = nvlist_next_nvpair(props, NULL);
3760 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3761 if (zpool_prop_feature(nvpair_name(elem)))
3762 has_features = B_TRUE;
3763 }
3764
3765 if (has_features || nvlist_lookup_uint64(props,
3766 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3767 version = SPA_VERSION;
3768 }
3769 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3770
3771 spa->spa_first_txg = txg;
3772 spa->spa_uberblock.ub_txg = txg - 1;
3773 spa->spa_uberblock.ub_version = version;
3774 spa->spa_ubsync = spa->spa_uberblock;
3775
3776 /*
3777 * Create "The Godfather" zio to hold all async IOs
3778 */
3779 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
3780 KM_SLEEP);
3781 for (i = 0; i < max_ncpus; i++) {
3782 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
3783 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
3784 ZIO_FLAG_GODFATHER);
3785 }
3786
3787 /*
3788 * Create the root vdev.
3789 */
3790 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3791
3792 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3793
3794 ASSERT(error != 0 || rvd != NULL);
3795 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3796
3797 if (error == 0 && !zfs_allocatable_devs(nvroot))
3798 error = SET_ERROR(EINVAL);
3799
3800 if (error == 0 &&
3801 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3802 (error = spa_validate_aux(spa, nvroot, txg,
3803 VDEV_ALLOC_ADD)) == 0) {
3804 for (c = 0; c < rvd->vdev_children; c++) {
3805 vdev_metaslab_set_size(rvd->vdev_child[c]);
3806 vdev_expand(rvd->vdev_child[c], txg);
3807 }
3808 }
3809
3810 spa_config_exit(spa, SCL_ALL, FTAG);
3811
3812 if (error != 0) {
3813 spa_unload(spa);
3814 spa_deactivate(spa);
3815 spa_remove(spa);
3816 mutex_exit(&spa_namespace_lock);
3817 return (error);
3818 }
3819
3820 /*
3821 * Get the list of spares, if specified.
3822 */
3823 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3824 &spares, &nspares) == 0) {
3825 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3826 KM_SLEEP) == 0);
3827 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3828 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3829 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3830 spa_load_spares(spa);
3831 spa_config_exit(spa, SCL_ALL, FTAG);
3832 spa->spa_spares.sav_sync = B_TRUE;
3833 }
3834
3835 /*
3836 * Get the list of level 2 cache devices, if specified.
3837 */
3838 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3839 &l2cache, &nl2cache) == 0) {
3840 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3841 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3842 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3843 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3844 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3845 spa_load_l2cache(spa);
3846 spa_config_exit(spa, SCL_ALL, FTAG);
3847 spa->spa_l2cache.sav_sync = B_TRUE;
3848 }
3849
3850 spa->spa_is_initializing = B_TRUE;
3851 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3852 spa->spa_meta_objset = dp->dp_meta_objset;
3853 spa->spa_is_initializing = B_FALSE;
3854
3855 /*
3856 * Create DDTs (dedup tables).
3857 */
3858 ddt_create(spa);
3859
3860 spa_update_dspace(spa);
3861
3862 tx = dmu_tx_create_assigned(dp, txg);
3863
3864 /*
3865 * Create the pool config object.
3866 */
3867 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3868 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3869 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3870
3871 if (zap_add(spa->spa_meta_objset,
3872 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3873 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3874 cmn_err(CE_PANIC, "failed to add pool config");
3875 }
3876
3877 if (spa_version(spa) >= SPA_VERSION_FEATURES)
3878 spa_feature_create_zap_objects(spa, tx);
3879
3880 if (zap_add(spa->spa_meta_objset,
3881 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3882 sizeof (uint64_t), 1, &version, tx) != 0) {
3883 cmn_err(CE_PANIC, "failed to add pool version");
3884 }
3885
3886 /* Newly created pools with the right version are always deflated. */
3887 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3888 spa->spa_deflate = TRUE;
3889 if (zap_add(spa->spa_meta_objset,
3890 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3891 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3892 cmn_err(CE_PANIC, "failed to add deflate");
3893 }
3894 }
3895
3896 /*
3897 * Create the deferred-free bpobj. Turn off compression
3898 * because sync-to-convergence takes longer if the blocksize
3899 * keeps changing.
3900 */
3901 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3902 dmu_object_set_compress(spa->spa_meta_objset, obj,
3903 ZIO_COMPRESS_OFF, tx);
3904 if (zap_add(spa->spa_meta_objset,
3905 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3906 sizeof (uint64_t), 1, &obj, tx) != 0) {
3907 cmn_err(CE_PANIC, "failed to add bpobj");
3908 }
3909 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3910 spa->spa_meta_objset, obj));
3911
3912 /*
3913 * Create the pool's history object.
3914 */
3915 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3916 spa_history_create_obj(spa, tx);
3917
3918 /*
3919 * Set pool properties.
3920 */
3921 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3922 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3923 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3924 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3925
3926 if (props != NULL) {
3927 spa_configfile_set(spa, props, B_FALSE);
3928 spa_sync_props(props, tx);
3929 }
3930
3931 dmu_tx_commit(tx);
3932
3933 spa->spa_sync_on = B_TRUE;
3934 txg_sync_start(spa->spa_dsl_pool);
3935
3936 /*
3937 * We explicitly wait for the first transaction to complete so that our
3938 * bean counters are appropriately updated.
3939 */
3940 txg_wait_synced(spa->spa_dsl_pool, txg);
3941
3942 spa_config_sync(spa, B_FALSE, B_TRUE);
3943
3944 spa_history_log_version(spa, "create");
3945
3946 /*
3947 * Don't count references from objsets that are already closed
3948 * and are making their way through the eviction process.
3949 */
3950 spa_evicting_os_wait(spa);
3951 spa->spa_minref = refcount_count(&spa->spa_refcount);
3952
3953 mutex_exit(&spa_namespace_lock);
3954
3955 return (0);
3956 }
3957
3958 #ifdef _KERNEL
3959 /*
3960 * Get the root pool information from the root disk, then import the root pool
3961 * during the system boot up time.
3962 */
3963 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3964
3965 static nvlist_t *
3966 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3967 {
3968 nvlist_t *config;
3969 nvlist_t *nvtop, *nvroot;
3970 uint64_t pgid;
3971
3972 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3973 return (NULL);
3974
3975 /*
3976 * Add this top-level vdev to the child array.
3977 */
3978 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3979 &nvtop) == 0);
3980 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3981 &pgid) == 0);
3982 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3983
3984 /*
3985 * Put this pool's top-level vdevs into a root vdev.
3986 */
3987 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3988 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3989 VDEV_TYPE_ROOT) == 0);
3990 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3991 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3992 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3993 &nvtop, 1) == 0);
3994
3995 /*
3996 * Replace the existing vdev_tree with the new root vdev in
3997 * this pool's configuration (remove the old, add the new).
3998 */
3999 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
4000 nvlist_free(nvroot);
4001 return (config);
4002 }
4003
4004 /*
4005 * Walk the vdev tree and see if we can find a device with "better"
4006 * configuration. A configuration is "better" if the label on that
4007 * device has a more recent txg.
4008 */
4009 static void
4010 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
4011 {
4012 int c;
4013
4014 for (c = 0; c < vd->vdev_children; c++)
4015 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
4016
4017 if (vd->vdev_ops->vdev_op_leaf) {
4018 nvlist_t *label;
4019 uint64_t label_txg;
4020
4021 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
4022 &label) != 0)
4023 return;
4024
4025 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
4026 &label_txg) == 0);
4027
4028 /*
4029 * Do we have a better boot device?
4030 */
4031 if (label_txg > *txg) {
4032 *txg = label_txg;
4033 *avd = vd;
4034 }
4035 nvlist_free(label);
4036 }
4037 }
4038
4039 /*
4040 * Import a root pool.
4041 *
4042 * For x86, devpath_list will consist of the devid and/or physpath name of
4043 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
4044 * The GRUB "findroot" command will return the vdev we should boot.
4045 *
4046 * For Sparc, devpath_list consists of the physpath name of the booting
4047 * device, whether the rootpool is a single-device pool or a mirrored pool.
4048 * e.g.
4049 * "/pci@1f,0/ide@d/disk@0,0:a"
4050 */
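/*
 * In outline (summarizing the code below): read the label from the boot
 * device to generate a config, replace any stale namespace entry for the
 * pool, build the vdev tree, locate the boot vdev by guid, and refuse to
 * proceed if another device has a newer label txg or if the boot device
 * is currently spared.
 */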
4051 int
4052 spa_import_rootpool(char *devpath, char *devid)
4053 {
4054 spa_t *spa;
4055 vdev_t *rvd, *bvd, *avd = NULL;
4056 nvlist_t *config, *nvtop;
4057 uint64_t guid, txg;
4058 char *pname;
4059 int error;
4060
4061 /*
4062 * Read the label from the boot device and generate a configuration.
4063 */
4064 config = spa_generate_rootconf(devpath, devid, &guid);
4065 #if defined(_OBP) && defined(_KERNEL)
4066 if (config == NULL) {
4067 if (strstr(devpath, "/iscsi/ssd") != NULL) {
4068 /* iscsi boot */
4069 get_iscsi_bootpath_phy(devpath);
4070 config = spa_generate_rootconf(devpath, devid, &guid);
4071 }
4072 }
4073 #endif
4074 if (config == NULL) {
4075 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
4076 devpath);
4077 return (SET_ERROR(EIO));
4078 }
4079
4080 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
4081 &pname) == 0);
4082 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
4083
4084 mutex_enter(&spa_namespace_lock);
4085 if ((spa = spa_lookup(pname)) != NULL) {
4086 /*
4087 * Remove the existing root pool from the namespace so that we
4088 * can replace it with the correct config we just read in.
4089 */
4090 spa_remove(spa);
4091 }
4092
4093 spa = spa_add(pname, config, NULL);
4094 spa->spa_is_root = B_TRUE;
4095 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
4096
4097 /*
4098 * Build up a vdev tree based on the boot device's label config.
4099 */
4100 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4101 &nvtop) == 0);
4102 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4103 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
4104 VDEV_ALLOC_ROOTPOOL);
4105 spa_config_exit(spa, SCL_ALL, FTAG);
4106 if (error) {
4107 mutex_exit(&spa_namespace_lock);
4108 nvlist_free(config);
4109 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
4110 pname);
4111 return (error);
4112 }
4113
4114 /*
4115 * Get the boot vdev.
4116 */
4117 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
4118 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
4119 (u_longlong_t)guid);
4120 error = SET_ERROR(ENOENT);
4121 goto out;
4122 }
4123
4124 /*
4125 * Determine if there is a better boot device.
4126 */
4127 avd = bvd;
4128 spa_alt_rootvdev(rvd, &avd, &txg);
4129 if (avd != bvd) {
4130 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
4131 "try booting from '%s'", avd->vdev_path);
4132 error = SET_ERROR(EINVAL);
4133 goto out;
4134 }
4135
4136 /*
4137 * If the boot device is part of a spare vdev then ensure that
4138 * we're booting off the active spare.
4139 */
4140 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4141 !bvd->vdev_isspare) {
4142 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
4143 "try booting from '%s'",
4144 bvd->vdev_parent->
4145 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
4146 error = SET_ERROR(EINVAL);
4147 goto out;
4148 }
4149
4150 error = 0;
4151 out:
4152 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4153 vdev_free(rvd);
4154 spa_config_exit(spa, SCL_ALL, FTAG);
4155 mutex_exit(&spa_namespace_lock);
4156
4157 nvlist_free(config);
4158 return (error);
4159 }
4160
4161 #endif
4162
4163 /*
4164 * Import a non-root pool into the system.
4165 */
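/*
 * In outline (per the code below): a ZFS_IMPORT_VERBATIM request merely
 * records the config and syncs the cache file; otherwise the pool is
 * activated, loaded via spa_load_best() honoring any rewind policy, its
 * spare and l2cache lists are rebuilt from the user-supplied config, the
 * config cache is updated, and an autoexpand check plus zvol minor
 * creation are kicked off.
 */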
4166 int
4167 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
4168 {
4169 spa_t *spa;
4170 char *altroot = NULL;
4171 spa_load_state_t state = SPA_LOAD_IMPORT;
4172 zpool_rewind_policy_t policy;
4173 uint64_t mode = spa_mode_global;
4174 uint64_t readonly = B_FALSE;
4175 int error;
4176 nvlist_t *nvroot;
4177 nvlist_t **spares, **l2cache;
4178 uint_t nspares, nl2cache;
4179
4180 /*
4181 * If a pool with this name exists, return failure.
4182 */
4183 mutex_enter(&spa_namespace_lock);
4184 if (spa_lookup(pool) != NULL) {
4185 mutex_exit(&spa_namespace_lock);
4186 return (SET_ERROR(EEXIST));
4187 }
4188
4189 /*
4190 * Create and initialize the spa structure.
4191 */
4192 (void) nvlist_lookup_string(props,
4193 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4194 (void) nvlist_lookup_uint64(props,
4195 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
4196 if (readonly)
4197 mode = FREAD;
4198 spa = spa_add(pool, config, altroot);
4199 spa->spa_import_flags = flags;
4200
4201 /*
4202 * Verbatim import - Take a pool and insert it into the namespace
4203 * as if it had been loaded at boot.
4204 */
4205 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
4206 if (props != NULL)
4207 spa_configfile_set(spa, props, B_FALSE);
4208
4209 spa_config_sync(spa, B_FALSE, B_TRUE);
4210
4211 mutex_exit(&spa_namespace_lock);
4212 return (0);
4213 }
4214
4215 spa_activate(spa, mode);
4216
4217 /*
4218 * Don't start async tasks until we know everything is healthy.
4219 */
4220 spa_async_suspend(spa);
4221
4222 zpool_get_rewind_policy(config, &policy);
4223 if (policy.zrp_request & ZPOOL_DO_REWIND)
4224 state = SPA_LOAD_RECOVER;
4225
4226 /*
4227 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
4228 * because the user-supplied config is actually the one to trust when
4229 * doing an import.
4230 */
4231 if (state != SPA_LOAD_RECOVER)
4232 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4233
4234 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
4235 policy.zrp_request);
4236
4237 /*
4238 * Propagate anything learned while loading the pool and pass it
4239 * back to caller (i.e. rewind info, missing devices, etc).
4240 */
4241 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4242 spa->spa_load_info) == 0);
4243
4244 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4245 /*
4246 * Toss any existing sparelist, as it doesn't have any validity
4247 * anymore, and conflicts with spa_has_spare().
4248 */
4249 if (spa->spa_spares.sav_config) {
4250 nvlist_free(spa->spa_spares.sav_config);
4251 spa->spa_spares.sav_config = NULL;
4252 spa_load_spares(spa);
4253 }
4254 if (spa->spa_l2cache.sav_config) {
4255 nvlist_free(spa->spa_l2cache.sav_config);
4256 spa->spa_l2cache.sav_config = NULL;
4257 spa_load_l2cache(spa);
4258 }
4259
4260 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4261 &nvroot) == 0);
4262 if (error == 0)
4263 error = spa_validate_aux(spa, nvroot, -1ULL,
4264 VDEV_ALLOC_SPARE);
4265 if (error == 0)
4266 error = spa_validate_aux(spa, nvroot, -1ULL,
4267 VDEV_ALLOC_L2CACHE);
4268 spa_config_exit(spa, SCL_ALL, FTAG);
4269
4270 if (props != NULL)
4271 spa_configfile_set(spa, props, B_FALSE);
4272
4273 if (error != 0 || (props && spa_writeable(spa) &&
4274 (error = spa_prop_set(spa, props)))) {
4275 spa_unload(spa);
4276 spa_deactivate(spa);
4277 spa_remove(spa);
4278 mutex_exit(&spa_namespace_lock);
4279 return (error);
4280 }
4281
4282 spa_async_resume(spa);
4283
4284 /*
4285 * Override any spares and level 2 cache devices as specified by
4286 * the user, as these may have correct device names/devids, etc.
4287 */
4288 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4289 &spares, &nspares) == 0) {
4290 if (spa->spa_spares.sav_config)
4291 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
4292 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
4293 else
4294 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
4295 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4296 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4297 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4298 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4299 spa_load_spares(spa);
4300 spa_config_exit(spa, SCL_ALL, FTAG);
4301 spa->spa_spares.sav_sync = B_TRUE;
4302 }
4303 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4304 &l2cache, &nl2cache) == 0) {
4305 if (spa->spa_l2cache.sav_config)
4306 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
4307 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
4308 else
4309 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4310 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4311 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4312 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4313 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4314 spa_load_l2cache(spa);
4315 spa_config_exit(spa, SCL_ALL, FTAG);
4316 spa->spa_l2cache.sav_sync = B_TRUE;
4317 }
4318
4319 /*
4320 * Check for any removed devices.
4321 */
4322 if (spa->spa_autoreplace) {
4323 spa_aux_check_removed(&spa->spa_spares);
4324 spa_aux_check_removed(&spa->spa_l2cache);
4325 }
4326
4327 if (spa_writeable(spa)) {
4328 /*
4329 * Update the config cache to include the newly-imported pool.
4330 */
4331 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4332 }
4333
4334 /*
4335 * It's possible that the pool was expanded while it was exported.
4336 * We kick off an async task to handle this for us.
4337 */
4338 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4339
4340 mutex_exit(&spa_namespace_lock);
4341 spa_history_log_version(spa, "import");
4342 zvol_create_minors(spa, pool, B_TRUE);
4343
4344 return (0);
4345 }
4346
4347 nvlist_t *
4348 spa_tryimport(nvlist_t *tryconfig)
4349 {
4350 nvlist_t *config = NULL;
4351 char *poolname;
4352 spa_t *spa;
4353 uint64_t state;
4354 int error;
4355
4356 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4357 return (NULL);
4358
4359 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4360 return (NULL);
4361
4362 /*
4363 * Create and initialize the spa structure.
4364 */
4365 mutex_enter(&spa_namespace_lock);
4366 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
4367 spa_activate(spa, FREAD);
4368
4369 /*
4370 * Pass off the heavy lifting to spa_load().
4371 * Pass TRUE for mosconfig because the user-supplied config
4372 * is actually the one to trust when doing an import.
4373 */
4374 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
4375
4376 /*
4377 * If 'tryconfig' was at least parsable, return the current config.
4378 */
4379 if (spa->spa_root_vdev != NULL) {
4380 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4381 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4382 poolname) == 0);
4383 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4384 state) == 0);
4385 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4386 spa->spa_uberblock.ub_timestamp) == 0);
4387 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4388 spa->spa_load_info) == 0);
4389 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
4390 spa->spa_errata) == 0);
4391
4392 /*
4393 * If the bootfs property exists on this pool then we
4394 * copy it out so that external consumers can tell which
4395 * pools are bootable.
4396 */
4397 if ((!error || error == EEXIST) && spa->spa_bootfs) {
4398 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4399
4400 /*
4401 * We have to play games with the name since the
4402 * pool was opened as TRYIMPORT_NAME.
4403 */
4404 if (dsl_dsobj_to_dsname(spa_name(spa),
4405 spa->spa_bootfs, tmpname) == 0) {
4406 char *cp;
4407 char *dsname;
4408
4409 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4410
4411 cp = strchr(tmpname, '/');
4412 if (cp == NULL) {
4413 (void) strlcpy(dsname, tmpname,
4414 MAXPATHLEN);
4415 } else {
4416 (void) snprintf(dsname, MAXPATHLEN,
4417 "%s/%s", poolname, ++cp);
4418 }
4419 VERIFY(nvlist_add_string(config,
4420 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4421 kmem_free(dsname, MAXPATHLEN);
4422 }
4423 kmem_free(tmpname, MAXPATHLEN);
4424 }
4425
4426 /*
4427 * Add the list of hot spares and level 2 cache devices.
4428 */
4429 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4430 spa_add_spares(spa, config);
4431 spa_add_l2cache(spa, config);
4432 spa_config_exit(spa, SCL_CONFIG, FTAG);
4433 }
4434
4435 spa_unload(spa);
4436 spa_deactivate(spa);
4437 spa_remove(spa);
4438 mutex_exit(&spa_namespace_lock);
4439
4440 return (config);
4441 }
4442
4443 /*
4444 * Pool export/destroy
4445 *
4446 * The act of destroying or exporting a pool is very simple. We make sure there
4447 * is no more pending I/O and that any references to the pool are gone. Then, we
4448 * update the pool state and sync all the labels to disk, removing the
4449 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4450 * we don't sync the labels or remove the configuration cache.
4451 */
4452 static int
4453 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4454 boolean_t force, boolean_t hardforce)
4455 {
4456 spa_t *spa;
4457
4458 if (oldconfig)
4459 *oldconfig = NULL;
4460
4461 if (!(spa_mode_global & FWRITE))
4462 return (SET_ERROR(EROFS));
4463
4464 mutex_enter(&spa_namespace_lock);
4465 if ((spa = spa_lookup(pool)) == NULL) {
4466 mutex_exit(&spa_namespace_lock);
4467 return (SET_ERROR(ENOENT));
4468 }
4469
4470 /*
4471 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4472 * reacquire the namespace lock, and see if we can export.
4473 */
4474 spa_open_ref(spa, FTAG);
4475 mutex_exit(&spa_namespace_lock);
4476 spa_async_suspend(spa);
4477 if (spa->spa_zvol_taskq) {
4478 zvol_remove_minors(spa, spa_name(spa), B_TRUE);
4479 taskq_wait(spa->spa_zvol_taskq);
4480 }
4481 mutex_enter(&spa_namespace_lock);
4482 spa_close(spa, FTAG);
4483
4484 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
4485 goto export_spa;
4486 /*
4487 * The pool will be in core if it's openable, in which case we can
4488 * modify its state. Objsets may be open only because they're dirty,
4489 * so we have to force it to sync before checking spa_refcnt.
4490 */
4491 if (spa->spa_sync_on) {
4492 txg_wait_synced(spa->spa_dsl_pool, 0);
4493 spa_evicting_os_wait(spa);
4494 }
4495
4496 /*
4497 * A pool cannot be exported or destroyed if there are active
4498 * references. If we are resetting a pool, allow references by
4499 * fault injection handlers.
4500 */
4501 if (!spa_refcount_zero(spa) ||
4502 (spa->spa_inject_ref != 0 &&
4503 new_state != POOL_STATE_UNINITIALIZED)) {
4504 spa_async_resume(spa);
4505 mutex_exit(&spa_namespace_lock);
4506 return (SET_ERROR(EBUSY));
4507 }
4508
4509 if (spa->spa_sync_on) {
4510 /*
4511 * A pool cannot be exported if it has an active shared spare.
4512 * This is to prevent other pools stealing the active spare
4513 * from an exported pool. If the user so chooses, such a pool can
4514 * still be forcibly exported.
4515 */
4516 if (!force && new_state == POOL_STATE_EXPORTED &&
4517 spa_has_active_shared_spare(spa)) {
4518 spa_async_resume(spa);
4519 mutex_exit(&spa_namespace_lock);
4520 return (SET_ERROR(EXDEV));
4521 }
4522
4523 /*
4524 * We want this to be reflected on every label,
4525 * so mark them all dirty. spa_unload() will do the
4526 * final sync that pushes these changes out.
4527 */
4528 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4529 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4530 spa->spa_state = new_state;
4531 spa->spa_final_txg = spa_last_synced_txg(spa) +
4532 TXG_DEFER_SIZE + 1;
4533 vdev_config_dirty(spa->spa_root_vdev);
4534 spa_config_exit(spa, SCL_ALL, FTAG);
4535 }
4536 }
4537
4538 export_spa:
4539 spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_DESTROY);
4540
4541 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4542 spa_unload(spa);
4543 spa_deactivate(spa);
4544 }
4545
4546 if (oldconfig && spa->spa_config)
4547 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4548
4549 if (new_state != POOL_STATE_UNINITIALIZED) {
4550 if (!hardforce)
4551 spa_config_sync(spa, B_TRUE, B_TRUE);
4552 spa_remove(spa);
4553 }
4554 mutex_exit(&spa_namespace_lock);
4555
4556 return (0);
4557 }
4558
4559 /*
4560 * Destroy a storage pool.
4561 */
4562 int
4563 spa_destroy(char *pool)
4564 {
4565 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4566 B_FALSE, B_FALSE));
4567 }
4568
4569 /*
4570 * Export a storage pool.
4571 */
4572 int
4573 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4574 boolean_t hardforce)
4575 {
4576 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4577 force, hardforce));
4578 }
4579
4580 /*
4581 * Similar to spa_export(), this unloads the spa_t without actually removing it
4582 * from the namespace in any way.
4583 */
4584 int
4585 spa_reset(char *pool)
4586 {
4587 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4588 B_FALSE, B_FALSE));
4589 }
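
#if 0
/*
 * Illustrative sketch only (not part of the original source): how the
 * export/destroy/reset entry points above drive spa_export_common().
 * The caller name and the 'what' selector are hypothetical.
 */
static int
example_export_destroy_or_reset(char *pool, int what, boolean_t force)
{
	nvlist_t *oldconfig = NULL;
	int error;

	switch (what) {
	case 0:		/* destroy: POOL_STATE_DESTROYED, no force flags */
		error = spa_destroy(pool);
		break;
	case 1:		/* export: POOL_STATE_EXPORTED, honoring 'force' */
		error = spa_export(pool, &oldconfig, force, B_FALSE);
		break;
	default:	/* reset: POOL_STATE_UNINITIALIZED, spa_t kept in namespace */
		error = spa_reset(pool);
		break;
	}

	if (oldconfig != NULL)
		nvlist_free(oldconfig);
	return (error);
}
#endif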
4590
4591 /*
4592 * ==========================================================================
4593 * Device manipulation
4594 * ==========================================================================
4595 */
4596
4597 /*
4598 * Add a device to a storage pool.
4599 */
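/*
 * As implemented below: each new top-level vdev is grafted onto the root
 * vdev (reusing any hole slots), new spares and l2cache devices are merged
 * into their aux lists, and metaslab initialization is deliberately
 * deferred until after the config has been synced (see the comment near
 * the end of this function).
 */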
4600 int
4601 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4602 {
4603 uint64_t txg, id;
4604 int error;
4605 vdev_t *rvd = spa->spa_root_vdev;
4606 vdev_t *vd, *tvd;
4607 nvlist_t **spares, **l2cache;
4608 uint_t nspares, nl2cache;
4609 int c;
4610
4611 ASSERT(spa_writeable(spa));
4612
4613 txg = spa_vdev_enter(spa);
4614
4615 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4616 VDEV_ALLOC_ADD)) != 0)
4617 return (spa_vdev_exit(spa, NULL, txg, error));
4618
4619 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
4620
4621 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4622 &nspares) != 0)
4623 nspares = 0;
4624
4625 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4626 &nl2cache) != 0)
4627 nl2cache = 0;
4628
4629 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4630 return (spa_vdev_exit(spa, vd, txg, EINVAL));
4631
4632 if (vd->vdev_children != 0 &&
4633 (error = vdev_create(vd, txg, B_FALSE)) != 0)
4634 return (spa_vdev_exit(spa, vd, txg, error));
4635
4636 /*
4637 * We must validate the spares and l2cache devices after checking the
4638 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
4639 */
4640 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4641 return (spa_vdev_exit(spa, vd, txg, error));
4642
4643 /*
4644 * Transfer each new top-level vdev from vd to rvd.
4645 */
4646 for (c = 0; c < vd->vdev_children; c++) {
4647
4648 /*
4649 * Set the vdev id to the first hole, if one exists.
4650 */
4651 for (id = 0; id < rvd->vdev_children; id++) {
4652 if (rvd->vdev_child[id]->vdev_ishole) {
4653 vdev_free(rvd->vdev_child[id]);
4654 break;
4655 }
4656 }
4657 tvd = vd->vdev_child[c];
4658 vdev_remove_child(vd, tvd);
4659 tvd->vdev_id = id;
4660 vdev_add_child(rvd, tvd);
4661 vdev_config_dirty(tvd);
4662 }
4663
4664 if (nspares != 0) {
4665 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4666 ZPOOL_CONFIG_SPARES);
4667 spa_load_spares(spa);
4668 spa->spa_spares.sav_sync = B_TRUE;
4669 }
4670
4671 if (nl2cache != 0) {
4672 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4673 ZPOOL_CONFIG_L2CACHE);
4674 spa_load_l2cache(spa);
4675 spa->spa_l2cache.sav_sync = B_TRUE;
4676 }
4677
4678 /*
4679 * We have to be careful when adding new vdevs to an existing pool.
4680 * If other threads start allocating from these vdevs before we
4681 * sync the config cache, and we lose power, then upon reboot we may
4682 * fail to open the pool because there are DVAs that the config cache
4683 * can't translate. Therefore, we first add the vdevs without
4684 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4685 * and then let spa_config_update() initialize the new metaslabs.
4686 *
4687 * spa_load() checks for added-but-not-initialized vdevs, so that
4688 * if we lose power at any point in this sequence, the remaining
4689 * steps will be completed the next time we load the pool.
4690 */
4691 (void) spa_vdev_exit(spa, vd, txg, 0);
4692
4693 mutex_enter(&spa_namespace_lock);
4694 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4695 mutex_exit(&spa_namespace_lock);
4696
4697 return (0);
4698 }
4699
4700 /*
4701 * Attach a device to a mirror. The arguments are the path to any device
4702 * in the mirror, and the nvroot for the new device. If the path specifies
4703 * a device that is not mirrored, we automatically insert the mirror vdev.
4704 *
4705 * If 'replacing' is specified, the new device is intended to replace the
4706 * existing device; in this case the two devices are made into their own
4707 * mirror using the 'replacing' vdev, which is functionally identical to
4708 * the mirror vdev (it actually reuses all the same ops) but has a few
4709 * extra rules: you can't attach to it after it's been created, and upon
4710 * completion of resilvering, the first disk (the one being replaced)
4711 * is automatically detached.
4712 */
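/*
 * In terms of the code below: a plain attach requires the existing parent
 * to be a mirror or the root vdev and places the pair under
 * vdev_mirror_ops; a replace places the pair under vdev_replacing_ops, or
 * under vdev_spare_ops when the new device is a hot spare.
 */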
4713 int
4714 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4715 {
4716 uint64_t txg, dtl_max_txg;
4717 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4718 vdev_ops_t *pvops;
4719 char *oldvdpath, *newvdpath;
4720 int newvd_isspare;
4721 int error;
4722 ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
4723
4724 ASSERT(spa_writeable(spa));
4725
4726 txg = spa_vdev_enter(spa);
4727
4728 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4729
4730 if (oldvd == NULL)
4731 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4732
4733 if (!oldvd->vdev_ops->vdev_op_leaf)
4734 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4735
4736 pvd = oldvd->vdev_parent;
4737
4738 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4739 VDEV_ALLOC_ATTACH)) != 0)
4740 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4741
4742 if (newrootvd->vdev_children != 1)
4743 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4744
4745 newvd = newrootvd->vdev_child[0];
4746
4747 if (!newvd->vdev_ops->vdev_op_leaf)
4748 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4749
4750 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4751 return (spa_vdev_exit(spa, newrootvd, txg, error));
4752
4753 /*
4754 * Spares can't replace logs
4755 */
4756 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4757 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4758
4759 if (!replacing) {
4760 /*
4761 * For attach, the only allowable parent is a mirror or the root
4762 * vdev.
4763 */
4764 if (pvd->vdev_ops != &vdev_mirror_ops &&
4765 pvd->vdev_ops != &vdev_root_ops)
4766 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4767
4768 pvops = &vdev_mirror_ops;
4769 } else {
4770 /*
4771 * Active hot spares can only be replaced by inactive hot
4772 * spares.
4773 */
4774 if (pvd->vdev_ops == &vdev_spare_ops &&
4775 oldvd->vdev_isspare &&
4776 !spa_has_spare(spa, newvd->vdev_guid))
4777 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4778
4779 /*
4780 * If the source is a hot spare, and the parent isn't already a
4781 * spare, then we want to create a new hot spare. Otherwise, we
4782 * want to create a replacing vdev. The user is not allowed to
4783 * attach to a spared vdev child unless the 'isspare' state is
4784 * the same (spare replaces spare, non-spare replaces
4785 * non-spare).
4786 */
4787 if (pvd->vdev_ops == &vdev_replacing_ops &&
4788 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4789 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4790 } else if (pvd->vdev_ops == &vdev_spare_ops &&
4791 newvd->vdev_isspare != oldvd->vdev_isspare) {
4792 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4793 }
4794
4795 if (newvd->vdev_isspare)
4796 pvops = &vdev_spare_ops;
4797 else
4798 pvops = &vdev_replacing_ops;
4799 }
4800
4801 /*
4802 * Make sure the new device is big enough.
4803 */
4804 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4805 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4806
4807 /*
4808 * The new device cannot have a higher alignment requirement
4809 * than the top-level vdev.
4810 */
4811 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4812 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4813
4814 /*
4815 * If this is an in-place replacement, update oldvd's path and devid
4816 * to make it distinguishable from newvd, and unopenable from now on.
4817 */
4818 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4819 spa_strfree(oldvd->vdev_path);
4820 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4821 KM_SLEEP);
4822 (void) sprintf(oldvd->vdev_path, "%s/%s",
4823 newvd->vdev_path, "old");
4824 if (oldvd->vdev_devid != NULL) {
4825 spa_strfree(oldvd->vdev_devid);
4826 oldvd->vdev_devid = NULL;
4827 }
4828 }
4829
4830 /* mark the device being resilvered */
4831 newvd->vdev_resilver_txg = txg;
4832
4833 /*
4834 * If the parent is not a mirror, or if we're replacing, insert the new
4835 * mirror/replacing/spare vdev above oldvd.
4836 */
4837 if (pvd->vdev_ops != pvops)
4838 pvd = vdev_add_parent(oldvd, pvops);
4839
4840 ASSERT(pvd->vdev_top->vdev_parent == rvd);
4841 ASSERT(pvd->vdev_ops == pvops);
4842 ASSERT(oldvd->vdev_parent == pvd);
4843
4844 /*
4845 * Extract the new device from its root and add it to pvd.
4846 */
4847 vdev_remove_child(newrootvd, newvd);
4848 newvd->vdev_id = pvd->vdev_children;
4849 newvd->vdev_crtxg = oldvd->vdev_crtxg;
4850 vdev_add_child(pvd, newvd);
4851
4852 tvd = newvd->vdev_top;
4853 ASSERT(pvd->vdev_top == tvd);
4854 ASSERT(tvd->vdev_parent == rvd);
4855
4856 vdev_config_dirty(tvd);
4857
4858 /*
4859 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4860 * for any dmu_sync-ed blocks. It will propagate upward when
4861 * spa_vdev_exit() calls vdev_dtl_reassess().
4862 */
4863 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4864
4865 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4866 dtl_max_txg - TXG_INITIAL);
4867
4868 if (newvd->vdev_isspare) {
4869 spa_spare_activate(newvd);
4870 spa_event_notify(spa, newvd, FM_EREPORT_ZFS_DEVICE_SPARE);
4871 }
4872
4873 oldvdpath = spa_strdup(oldvd->vdev_path);
4874 newvdpath = spa_strdup(newvd->vdev_path);
4875 newvd_isspare = newvd->vdev_isspare;
4876
4877 /*
4878 * Mark newvd's DTL dirty in this txg.
4879 */
4880 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4881
4882 /*
4883 * Schedule the resilver to restart in the future. We do this to
4884 * ensure that dmu_sync-ed blocks have been stitched into the
4885 * respective datasets.
4886 */
4887 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4888
4889 /*
4890 * Commit the config
4891 */
4892 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4893
4894 spa_history_log_internal(spa, "vdev attach", NULL,
4895 "%s vdev=%s %s vdev=%s",
4896 replacing && newvd_isspare ? "spare in" :
4897 replacing ? "replace" : "attach", newvdpath,
4898 replacing ? "for" : "to", oldvdpath);
4899
4900 spa_strfree(oldvdpath);
4901 spa_strfree(newvdpath);
4902
4903 if (spa->spa_bootfs)
4904 spa_event_notify(spa, newvd, FM_EREPORT_ZFS_BOOTFS_VDEV_ATTACH);
4905
4906 return (0);
4907 }
4908
4909 /*
4910 * Detach a device from a mirror or replacing vdev.
4911 *
4912 * If 'replace_done' is specified, only detach if the parent
4913 * is a replacing vdev.
4914 */
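/*
 * Summarizing the code below: detach is only permitted when the parent is
 * a mirror, replacing, or spare vdev and the device holds no required DTL
 * data; once the child is removed, a single-child parent is collapsed,
 * and detaching the original disk out from under a spare releases that
 * spare back to every other pool sharing it.
 */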
4915 int
4916 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4917 {
4918 uint64_t txg;
4919 int error;
4920 vdev_t *vd, *pvd, *cvd, *tvd;
4921 boolean_t unspare = B_FALSE;
4922 uint64_t unspare_guid = 0;
4923 char *vdpath;
4924 int c, t;
4925 ASSERTV(vdev_t *rvd = spa->spa_root_vdev);
4926 ASSERT(spa_writeable(spa));
4927
4928 txg = spa_vdev_enter(spa);
4929
4930 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4931
4932 if (vd == NULL)
4933 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4934
4935 if (!vd->vdev_ops->vdev_op_leaf)
4936 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4937
4938 pvd = vd->vdev_parent;
4939
4940 /*
4941 * If the parent/child relationship is not as expected, don't do it.
4942 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4943 * vdev that's replacing B with C. The user's intent in replacing
4944 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4945 * the replace by detaching C, the expected behavior is to end up
4946 * M(A,B). But suppose that right after deciding to detach C,
4947 * the replacement of B completes. We would have M(A,C), and then
4948 * ask to detach C, which would leave us with just A -- not what
4949 * the user wanted. To prevent this, we make sure that the
4950 * parent/child relationship hasn't changed -- in this example,
4951 * that C's parent is still the replacing vdev R.
4952 */
4953 if (pvd->vdev_guid != pguid && pguid != 0)
4954 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4955
4956 /*
4957 * Only 'replacing' or 'spare' vdevs can be replaced.
4958 */
4959 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4960 pvd->vdev_ops != &vdev_spare_ops)
4961 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4962
4963 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4964 spa_version(spa) >= SPA_VERSION_SPARES);
4965
4966 /*
4967 * Only mirror, replacing, and spare vdevs support detach.
4968 */
4969 if (pvd->vdev_ops != &vdev_replacing_ops &&
4970 pvd->vdev_ops != &vdev_mirror_ops &&
4971 pvd->vdev_ops != &vdev_spare_ops)
4972 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4973
4974 /*
4975 * If this device has the only valid copy of some data,
4976 * we cannot safely detach it.
4977 */
4978 if (vdev_dtl_required(vd))
4979 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4980
4981 ASSERT(pvd->vdev_children >= 2);
4982
4983 /*
4984 * If we are detaching the second disk from a replacing vdev, then
4985 * check to see if we changed the original vdev's path to have "/old"
4986 * at the end in spa_vdev_attach(). If so, undo that change now.
4987 */
4988 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4989 vd->vdev_path != NULL) {
4990 size_t len = strlen(vd->vdev_path);
4991
4992 for (c = 0; c < pvd->vdev_children; c++) {
4993 cvd = pvd->vdev_child[c];
4994
4995 if (cvd == vd || cvd->vdev_path == NULL)
4996 continue;
4997
4998 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4999 strcmp(cvd->vdev_path + len, "/old") == 0) {
5000 spa_strfree(cvd->vdev_path);
5001 cvd->vdev_path = spa_strdup(vd->vdev_path);
5002 break;
5003 }
5004 }
5005 }
5006
5007 /*
5008 * If we are detaching the original disk from a spare, then it implies
5009 * that the spare should become a real disk, and be removed from the
5010 * active spare list for the pool.
5011 */
5012 if (pvd->vdev_ops == &vdev_spare_ops &&
5013 vd->vdev_id == 0 &&
5014 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
5015 unspare = B_TRUE;
5016
5017 /*
5018 * Erase the disk labels so the disk can be used for other things.
5019 * This must be done after all other error cases are handled,
5020 * but before we disembowel vd (so we can still do I/O to it).
5021 * But if we can't do it, don't treat the error as fatal --
5022 * it may be that the unwritability of the disk is the reason
5023 * it's being detached!
5024 */
5025 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5026
5027 /*
5028 * Remove vd from its parent and compact the parent's children.
5029 */
5030 vdev_remove_child(pvd, vd);
5031 vdev_compact_children(pvd);
5032
5033 /*
5034 * Remember one of the remaining children so we can get tvd below.
5035 */
5036 cvd = pvd->vdev_child[pvd->vdev_children - 1];
5037
5038 /*
5039 * If we need to remove the remaining child from the list of hot spares,
5040 * do it now, marking the vdev as no longer a spare in the process.
5041 * We must do this before vdev_remove_parent(), because that can
5042 * change the GUID if it creates a new toplevel GUID. For a similar
5043 * reason, we must remove the spare now, in the same txg as the detach;
5044 * otherwise someone could attach a new sibling, change the GUID, and
5045 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
5046 */
5047 if (unspare) {
5048 ASSERT(cvd->vdev_isspare);
5049 spa_spare_remove(cvd);
5050 unspare_guid = cvd->vdev_guid;
5051 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
5052 cvd->vdev_unspare = B_TRUE;
5053 }
5054
5055 /*
5056 * If the parent mirror/replacing vdev only has one child,
5057 * the parent is no longer needed. Remove it from the tree.
5058 */
5059 if (pvd->vdev_children == 1) {
5060 if (pvd->vdev_ops == &vdev_spare_ops)
5061 cvd->vdev_unspare = B_FALSE;
5062 vdev_remove_parent(cvd);
5063 }
5064
5065
5066 /*
5067 * We don't set tvd until now because the parent we just removed
5068 * may have been the previous top-level vdev.
5069 */
5070 tvd = cvd->vdev_top;
5071 ASSERT(tvd->vdev_parent == rvd);
5072
5073 /*
5074 * Reevaluate the parent vdev state.
5075 */
5076 vdev_propagate_state(cvd);
5077
5078 /*
5079 * If the 'autoexpand' property is set on the pool then automatically
5080 * try to expand the size of the pool. For example if the device we
5081 * just detached was smaller than the others, it may be possible to
5082 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
5083 * first so that we can obtain the updated sizes of the leaf vdevs.
5084 */
5085 if (spa->spa_autoexpand) {
5086 vdev_reopen(tvd);
5087 vdev_expand(tvd, txg);
5088 }
5089
5090 vdev_config_dirty(tvd);
5091
5092 /*
5093 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
5094 * vd->vdev_detached is set and free vd's DTL object in syncing context.
5095 * But first make sure we're not on any *other* txg's DTL list, to
5096 * prevent vd from being accessed after it's freed.
5097 */
5098 vdpath = spa_strdup(vd->vdev_path);
5099 for (t = 0; t < TXG_SIZE; t++)
5100 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
5101 vd->vdev_detached = B_TRUE;
5102 vdev_dirty(tvd, VDD_DTL, vd, txg);
5103
5104 spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_REMOVE);
5105
5106 /* hang on to the spa before we release the lock */
5107 spa_open_ref(spa, FTAG);
5108
5109 error = spa_vdev_exit(spa, vd, txg, 0);
5110
5111 spa_history_log_internal(spa, "detach", NULL,
5112 "vdev=%s", vdpath);
5113 spa_strfree(vdpath);
5114
5115 /*
5116 * If this was the removal of the original device in a hot spare vdev,
5117 * then we want to go through and remove the device from the hot spare
5118 * list of every other pool.
5119 */
5120 if (unspare) {
5121 spa_t *altspa = NULL;
5122
5123 mutex_enter(&spa_namespace_lock);
5124 while ((altspa = spa_next(altspa)) != NULL) {
5125 if (altspa->spa_state != POOL_STATE_ACTIVE ||
5126 altspa == spa)
5127 continue;
5128
5129 spa_open_ref(altspa, FTAG);
5130 mutex_exit(&spa_namespace_lock);
5131 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
5132 mutex_enter(&spa_namespace_lock);
5133 spa_close(altspa, FTAG);
5134 }
5135 mutex_exit(&spa_namespace_lock);
5136
5137 /* search the rest of the vdevs for spares to remove */
5138 spa_vdev_resilver_done(spa);
5139 }
5140
5141 /* all done with the spa; OK to release */
5142 mutex_enter(&spa_namespace_lock);
5143 spa_close(spa, FTAG);
5144 mutex_exit(&spa_namespace_lock);
5145
5146 return (error);
5147 }
5148
5149 /*
5150 * Split a set of devices from their mirrors, and create a new pool from them.
5151 */
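/*
 * Roughly, the steps below are: verify every (non-log, non-hole) top-level
 * vdev is a healthy mirror and pick one leaf from each, offline those
 * leaves, record their guids as ZPOOL_CONFIG_SPLIT_LIST in the original
 * config, assemble the new pool with spa_load(SPA_IMPORT_ASSEMBLE), and
 * finally detach the split leaves from the original pool in a single
 * config transaction.
 */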
5152 int
5153 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
5154 nvlist_t *props, boolean_t exp)
5155 {
5156 int error = 0;
5157 uint64_t txg, *glist;
5158 spa_t *newspa;
5159 uint_t c, children, lastlog;
5160 nvlist_t **child, *nvl, *tmp;
5161 dmu_tx_t *tx;
5162 char *altroot = NULL;
5163 vdev_t *rvd, **vml = NULL; /* vdev modify list */
5164 boolean_t activate_slog;
5165
5166 ASSERT(spa_writeable(spa));
5167
5168 txg = spa_vdev_enter(spa);
5169
5170 /* clear the log and flush everything up to now */
5171 activate_slog = spa_passivate_log(spa);
5172 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5173 error = spa_offline_log(spa);
5174 txg = spa_vdev_config_enter(spa);
5175
5176 if (activate_slog)
5177 spa_activate_log(spa);
5178
5179 if (error != 0)
5180 return (spa_vdev_exit(spa, NULL, txg, error));
5181
5182 /* check new spa name before going any further */
5183 if (spa_lookup(newname) != NULL)
5184 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
5185
5186 /*
5187 * scan through all the children to ensure they're all mirrors
5188 */
5189 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
5190 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
5191 &children) != 0)
5192 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5193
5194 /* first, check to ensure we've got the right child count */
5195 rvd = spa->spa_root_vdev;
5196 lastlog = 0;
5197 for (c = 0; c < rvd->vdev_children; c++) {
5198 vdev_t *vd = rvd->vdev_child[c];
5199
5200 /* don't count the holes & logs as children */
5201 if (vd->vdev_islog || vd->vdev_ishole) {
5202 if (lastlog == 0)
5203 lastlog = c;
5204 continue;
5205 }
5206
5207 lastlog = 0;
5208 }
5209 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
5210 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5211
5212 /* next, ensure no spare or cache devices are part of the split */
5213 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
5214 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
5215 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5216
5217 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
5218 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
5219
5220 /* then, loop over each vdev and validate it */
5221 for (c = 0; c < children; c++) {
5222 uint64_t is_hole = 0;
5223
5224 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
5225 &is_hole);
5226
5227 if (is_hole != 0) {
5228 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
5229 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
5230 continue;
5231 } else {
5232 error = SET_ERROR(EINVAL);
5233 break;
5234 }
5235 }
5236
5237 /* which disk is going to be split? */
5238 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
5239 &glist[c]) != 0) {
5240 error = SET_ERROR(EINVAL);
5241 break;
5242 }
5243
5244 /* look it up in the spa */
5245 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
5246 if (vml[c] == NULL) {
5247 error = SET_ERROR(ENODEV);
5248 break;
5249 }
5250
5251 /* make sure there's nothing stopping the split */
5252 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
5253 vml[c]->vdev_islog ||
5254 vml[c]->vdev_ishole ||
5255 vml[c]->vdev_isspare ||
5256 vml[c]->vdev_isl2cache ||
5257 !vdev_writeable(vml[c]) ||
5258 vml[c]->vdev_children != 0 ||
5259 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
5260 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
5261 error = SET_ERROR(EINVAL);
5262 break;
5263 }
5264
5265 if (vdev_dtl_required(vml[c])) {
5266 error = SET_ERROR(EBUSY);
5267 break;
5268 }
5269
5270 /* we need certain info from the top level */
5271 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
5272 vml[c]->vdev_top->vdev_ms_array) == 0);
5273 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
5274 vml[c]->vdev_top->vdev_ms_shift) == 0);
5275 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
5276 vml[c]->vdev_top->vdev_asize) == 0);
5277 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
5278 vml[c]->vdev_top->vdev_ashift) == 0);
5279
5280 /* transfer per-vdev ZAPs */
5281 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
5282 VERIFY0(nvlist_add_uint64(child[c],
5283 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
5284
5285 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
5286 VERIFY0(nvlist_add_uint64(child[c],
5287 ZPOOL_CONFIG_VDEV_TOP_ZAP,
5288 vml[c]->vdev_parent->vdev_top_zap));
5289 }
5290
5291 if (error != 0) {
5292 kmem_free(vml, children * sizeof (vdev_t *));
5293 kmem_free(glist, children * sizeof (uint64_t));
5294 return (spa_vdev_exit(spa, NULL, txg, error));
5295 }
5296
5297 /* stop writers from using the disks */
5298 for (c = 0; c < children; c++) {
5299 if (vml[c] != NULL)
5300 vml[c]->vdev_offline = B_TRUE;
5301 }
5302 vdev_reopen(spa->spa_root_vdev);
5303
5304 /*
5305 * Temporarily record the splitting vdevs in the spa config. This
5306 * will disappear once the config is regenerated.
5307 */
5308 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5309 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
5310 glist, children) == 0);
5311 kmem_free(glist, children * sizeof (uint64_t));
5312
5313 mutex_enter(&spa->spa_props_lock);
5314 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
5315 nvl) == 0);
5316 mutex_exit(&spa->spa_props_lock);
5317 spa->spa_config_splitting = nvl;
5318 vdev_config_dirty(spa->spa_root_vdev);
5319
5320 /* configure and create the new pool */
5321 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
5322 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5323 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
5324 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5325 spa_version(spa)) == 0);
5326 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5327 spa->spa_config_txg) == 0);
5328 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5329 spa_generate_guid(NULL)) == 0);
5330 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
5331 (void) nvlist_lookup_string(props,
5332 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5333
5334 /* add the new pool to the namespace */
5335 newspa = spa_add(newname, config, altroot);
5336 newspa->spa_avz_action = AVZ_ACTION_REBUILD;
5337 newspa->spa_config_txg = spa->spa_config_txg;
5338 spa_set_log_state(newspa, SPA_LOG_CLEAR);
5339
5340 /* release the spa config lock, retaining the namespace lock */
5341 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5342
5343 if (zio_injection_enabled)
5344 zio_handle_panic_injection(spa, FTAG, 1);
5345
5346 spa_activate(newspa, spa_mode_global);
5347 spa_async_suspend(newspa);
5348
5349 /* create the new pool from the disks of the original pool */
5350 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
5351 if (error)
5352 goto out;
5353
5354 /* if that worked, generate a real config for the new pool */
5355 if (newspa->spa_root_vdev != NULL) {
5356 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
5357 NV_UNIQUE_NAME, KM_SLEEP) == 0);
5358 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5359 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5360 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5361 B_TRUE));
5362 }
5363
5364 /* set the props */
5365 if (props != NULL) {
5366 spa_configfile_set(newspa, props, B_FALSE);
5367 error = spa_prop_set(newspa, props);
5368 if (error)
5369 goto out;
5370 }
5371
5372 /* flush everything */
5373 txg = spa_vdev_config_enter(newspa);
5374 vdev_config_dirty(newspa->spa_root_vdev);
5375 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5376
5377 if (zio_injection_enabled)
5378 zio_handle_panic_injection(spa, FTAG, 2);
5379
5380 spa_async_resume(newspa);
5381
5382 /* finally, update the original pool's config */
5383 txg = spa_vdev_config_enter(spa);
5384 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5385 error = dmu_tx_assign(tx, TXG_WAIT);
5386 if (error != 0)
5387 dmu_tx_abort(tx);
5388 for (c = 0; c < children; c++) {
5389 if (vml[c] != NULL) {
5390 vdev_split(vml[c]);
5391 if (error == 0)
5392 spa_history_log_internal(spa, "detach", tx,
5393 "vdev=%s", vml[c]->vdev_path);
5394
5395 vdev_free(vml[c]);
5396 }
5397 }
5398 spa->spa_avz_action = AVZ_ACTION_REBUILD;
5399 vdev_config_dirty(spa->spa_root_vdev);
5400 spa->spa_config_splitting = NULL;
5401 nvlist_free(nvl);
5402 if (error == 0)
5403 dmu_tx_commit(tx);
5404 (void) spa_vdev_exit(spa, NULL, txg, 0);
5405
5406 if (zio_injection_enabled)
5407 zio_handle_panic_injection(spa, FTAG, 3);
5408
5409 /* split is complete; log a history record */
5410 spa_history_log_internal(newspa, "split", NULL,
5411 "from pool %s", spa_name(spa));
5412
5413 kmem_free(vml, children * sizeof (vdev_t *));
5414
5415 /* if we're not going to mount the filesystems in userland, export */
5416 if (exp)
5417 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5418 B_FALSE, B_FALSE);
5419
5420 return (error);
5421
5422 out:
5423 spa_unload(newspa);
5424 spa_deactivate(newspa);
5425 spa_remove(newspa);
5426
5427 txg = spa_vdev_config_enter(spa);
5428
5429 /* re-online all offlined disks */
5430 for (c = 0; c < children; c++) {
5431 if (vml[c] != NULL)
5432 vml[c]->vdev_offline = B_FALSE;
5433 }
5434 vdev_reopen(spa->spa_root_vdev);
5435
5436 nvlist_free(spa->spa_config_splitting);
5437 spa->spa_config_splitting = NULL;
5438 (void) spa_vdev_exit(spa, NULL, txg, error);
5439
5440 kmem_free(vml, children * sizeof (vdev_t *));
5441 return (error);
5442 }
5443
5444 static nvlist_t *
5445 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5446 {
5447 int i;
5448
5449 for (i = 0; i < count; i++) {
5450 uint64_t guid;
5451
5452 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5453 &guid) == 0);
5454
5455 if (guid == target_guid)
5456 return (nvpp[i]);
5457 }
5458
5459 return (NULL);
5460 }
5461
5462 static void
5463 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5464 nvlist_t *dev_to_remove)
5465 {
5466 nvlist_t **newdev = NULL;
5467 int i, j;
5468
5469 if (count > 1)
5470 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5471
5472 for (i = 0, j = 0; i < count; i++) {
5473 if (dev[i] == dev_to_remove)
5474 continue;
5475 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5476 }
5477
5478 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5479 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5480
5481 for (i = 0; i < count - 1; i++)
5482 nvlist_free(newdev[i]);
5483
5484 if (count > 1)
5485 kmem_free(newdev, (count - 1) * sizeof (void *));
5486 }
5487
5488 /*
5489 * Evacuate the device.
5490 */
5491 static int
5492 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5493 {
5494 uint64_t txg;
5495 int error = 0;
5496
5497 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5498 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5499 ASSERT(vd == vd->vdev_top);
5500
5501 /*
5502 * Evacuate the device. We don't hold the config lock as writer
5503 * since we need to do I/O but we do keep the
5504 * spa_namespace_lock held. Once this completes the device
5505 * should no longer have any blocks allocated on it.
5506 */
5507 if (vd->vdev_islog) {
5508 if (vd->vdev_stat.vs_alloc != 0)
5509 error = spa_offline_log(spa);
5510 } else {
5511 error = SET_ERROR(ENOTSUP);
5512 }
5513
5514 if (error)
5515 return (error);
5516
5517 /*
5518 * The evacuation succeeded. Remove any remaining MOS metadata
5519 * associated with this vdev, and wait for these changes to sync.
5520 */
5521 ASSERT0(vd->vdev_stat.vs_alloc);
5522 txg = spa_vdev_config_enter(spa);
5523 vd->vdev_removing = B_TRUE;
5524 vdev_dirty_leaves(vd, VDD_DTL, txg);
5525 vdev_config_dirty(vd);
5526 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5527
5528 return (0);
5529 }
5530
5531 /*
5532 * Complete the removal by cleaning up the namespace.
5533 */
5534 static void
5535 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5536 {
5537 vdev_t *rvd = spa->spa_root_vdev;
5538 uint64_t id = vd->vdev_id;
5539 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5540
5541 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5542 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5543 ASSERT(vd == vd->vdev_top);
5544
5545 /*
5546 * Only remove any devices which are empty.
5547 */
5548 if (vd->vdev_stat.vs_alloc != 0)
5549 return;
5550
5551 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5552
5553 if (list_link_active(&vd->vdev_state_dirty_node))
5554 vdev_state_clean(vd);
5555 if (list_link_active(&vd->vdev_config_dirty_node))
5556 vdev_config_clean(vd);
5557
5558 vdev_free(vd);
5559
5560 if (last_vdev) {
5561 vdev_compact_children(rvd);
5562 } else {
5563 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5564 vdev_add_child(rvd, vd);
5565 }
5566 vdev_config_dirty(rvd);
5567
5568 /*
5569 * Reassess the health of our root vdev.
5570 */
5571 vdev_reopen(rvd);
5572 }
5573
5574 /*
5575 * Remove a device from the pool -
5576 *
5577 * Removing a device from the vdev namespace requires several steps
5578 * and can take a significant amount of time. As a result we use
5579 * the spa_vdev_config_[enter/exit] functions which allow us to
5580 * grab and release the spa_config_lock while still holding the namespace
5581 * lock. During each step the configuration is synced out.
5582 *
5583 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5584 * devices.
5585 */
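/*
 * Outcome summary for the code below: an unused hot spare or any cache
 * device is simply dropped from its aux list; an in-use spare returns
 * EBUSY; a slog is passivated, evacuated, and then removed from the vdev
 * namespace; any other existing vdev returns ENOTSUP; an unknown guid
 * returns ENOENT.
 */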
5586 int
5587 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5588 {
5589 vdev_t *vd;
5590 metaslab_group_t *mg;
5591 nvlist_t **spares, **l2cache, *nv;
5592 uint64_t txg = 0;
5593 uint_t nspares, nl2cache;
5594 int error = 0;
5595 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5596
5597 ASSERT(spa_writeable(spa));
5598
5599 if (!locked)
5600 txg = spa_vdev_enter(spa);
5601
5602 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5603
5604 if (spa->spa_spares.sav_vdevs != NULL &&
5605 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5606 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5607 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5608 /*
5609 * Only remove the hot spare if it's not currently in use
5610 * in this pool.
5611 */
5612 if (vd == NULL || unspare) {
5613 spa_vdev_remove_aux(spa->spa_spares.sav_config,
5614 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5615 spa_load_spares(spa);
5616 spa->spa_spares.sav_sync = B_TRUE;
5617 } else {
5618 error = SET_ERROR(EBUSY);
5619 }
5620 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5621 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5622 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5623 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5624 /*
5625 * Cache devices can always be removed.
5626 */
5627 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5628 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5629 spa_load_l2cache(spa);
5630 spa->spa_l2cache.sav_sync = B_TRUE;
5631 } else if (vd != NULL && vd->vdev_islog) {
5632 ASSERT(!locked);
5633 ASSERT(vd == vd->vdev_top);
5634
5635 mg = vd->vdev_mg;
5636
5637 /*
5638 * Stop allocating from this vdev.
5639 */
5640 metaslab_group_passivate(mg);
5641
5642 /*
5643 * Wait for the youngest allocations and frees to sync,
5644 * and then wait for the deferral of those frees to finish.
5645 */
5646 spa_vdev_config_exit(spa, NULL,
5647 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5648
5649 /*
5650 * Attempt to evacuate the vdev.
5651 */
5652 error = spa_vdev_remove_evacuate(spa, vd);
5653
5654 txg = spa_vdev_config_enter(spa);
5655
5656 /*
5657 * If we couldn't evacuate the vdev, unwind.
5658 */
5659 if (error) {
5660 metaslab_group_activate(mg);
5661 return (spa_vdev_exit(spa, NULL, txg, error));
5662 }
5663
5664 /*
5665 * Clean up the vdev namespace.
5666 */
5667 spa_vdev_remove_from_namespace(spa, vd);
5668
5669 } else if (vd != NULL) {
5670 /*
5671 * Normal vdevs cannot be removed (yet).
5672 */
5673 error = SET_ERROR(ENOTSUP);
5674 } else {
5675 /*
5676 * There is no vdev of any kind with the specified guid.
5677 */
5678 error = SET_ERROR(ENOENT);
5679 }
5680
5681 if (!locked)
5682 return (spa_vdev_exit(spa, NULL, txg, error));
5683
5684 return (error);
5685 }
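
/*
 * Editorial sketch (compiled out, not part of the original source): roughly
 * how a caller such as the ZFS_IOC_VDEV_REMOVE ioctl handler drives the
 * removal path above for a device identified by guid. The function name is
 * hypothetical and error handling is simplified.
 */
#if 0
static int
example_vdev_remove(const char *pool, uint64_t guid)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(pool, &spa, FTAG)) != 0)
		return (error);

	/* B_FALSE: do not force-remove a hot spare that is in use. */
	error = spa_vdev_remove(spa, guid, B_FALSE);
	spa_close(spa, FTAG);
	return (error);
}
#endif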
5686
5687 /*
5688 * Find any device that's done replacing, or a vdev marked 'unspare' that's
5689 * currently spared, so we can detach it.
5690 */
5691 static vdev_t *
5692 spa_vdev_resilver_done_hunt(vdev_t *vd)
5693 {
5694 vdev_t *newvd, *oldvd;
5695 int c;
5696
5697 for (c = 0; c < vd->vdev_children; c++) {
5698 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5699 if (oldvd != NULL)
5700 return (oldvd);
5701 }
5702
5703 /*
5704 * Check for a completed replacement. We always consider the first
5705 * vdev in the list to be the oldest vdev, and the last one to be
5706 * the newest (see spa_vdev_attach() for how that works). In
5707 * the case where the newest vdev is faulted, we will not automatically
5708 * remove it after a resilver completes. This is OK as it will require
5709 * user intervention to determine which disk the admin wishes to keep.
5710 */
5711 if (vd->vdev_ops == &vdev_replacing_ops) {
5712 ASSERT(vd->vdev_children > 1);
5713
5714 newvd = vd->vdev_child[vd->vdev_children - 1];
5715 oldvd = vd->vdev_child[0];
5716
5717 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5718 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5719 !vdev_dtl_required(oldvd))
5720 return (oldvd);
5721 }
5722
5723 /*
5724 * Check for a completed resilver with the 'unspare' flag set.
5725 */
5726 if (vd->vdev_ops == &vdev_spare_ops) {
5727 vdev_t *first = vd->vdev_child[0];
5728 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5729
5730 if (last->vdev_unspare) {
5731 oldvd = first;
5732 newvd = last;
5733 } else if (first->vdev_unspare) {
5734 oldvd = last;
5735 newvd = first;
5736 } else {
5737 oldvd = NULL;
5738 }
5739
5740 if (oldvd != NULL &&
5741 vdev_dtl_empty(newvd, DTL_MISSING) &&
5742 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5743 !vdev_dtl_required(oldvd))
5744 return (oldvd);
5745
5746 /*
5747 * If there are more than two spares attached to a disk,
5748 * and those spares are not required, then we want to
5749 * attempt to free them up now so that they can be used
5750 * by other pools. Once we're back down to a single
5751 * disk+spare, we stop removing them.
5752 */
5753 if (vd->vdev_children > 2) {
5754 newvd = vd->vdev_child[1];
5755
5756 if (newvd->vdev_isspare && last->vdev_isspare &&
5757 vdev_dtl_empty(last, DTL_MISSING) &&
5758 vdev_dtl_empty(last, DTL_OUTAGE) &&
5759 !vdev_dtl_required(newvd))
5760 return (newvd);
5761 }
5762 }
5763
5764 return (NULL);
5765 }
5766
5767 static void
5768 spa_vdev_resilver_done(spa_t *spa)
5769 {
5770 vdev_t *vd, *pvd, *ppvd;
5771 uint64_t guid, sguid, pguid, ppguid;
5772
5773 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5774
5775 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5776 pvd = vd->vdev_parent;
5777 ppvd = pvd->vdev_parent;
5778 guid = vd->vdev_guid;
5779 pguid = pvd->vdev_guid;
5780 ppguid = ppvd->vdev_guid;
5781 sguid = 0;
5782 /*
5783 * If we have just finished replacing a hot spared device, then
5784 * we need to detach the parent's first child (the original hot
5785 * spare) as well.
5786 */
5787 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5788 ppvd->vdev_children == 2) {
5789 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5790 sguid = ppvd->vdev_child[1]->vdev_guid;
5791 }
5792 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
5793
5794 spa_config_exit(spa, SCL_ALL, FTAG);
5795 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5796 return;
5797 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5798 return;
5799 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5800 }
5801
5802 spa_config_exit(spa, SCL_ALL, FTAG);
5803 }
5804
5805 /*
5806 * Update the stored path or FRU for this vdev.
5807 */
5808 int
5809 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5810 boolean_t ispath)
5811 {
5812 vdev_t *vd;
5813 boolean_t sync = B_FALSE;
5814
5815 ASSERT(spa_writeable(spa));
5816
5817 spa_vdev_state_enter(spa, SCL_ALL);
5818
5819 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5820 return (spa_vdev_state_exit(spa, NULL, ENOENT));
5821
5822 if (!vd->vdev_ops->vdev_op_leaf)
5823 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5824
5825 if (ispath) {
5826 if (strcmp(value, vd->vdev_path) != 0) {
5827 spa_strfree(vd->vdev_path);
5828 vd->vdev_path = spa_strdup(value);
5829 sync = B_TRUE;
5830 }
5831 } else {
5832 if (vd->vdev_fru == NULL) {
5833 vd->vdev_fru = spa_strdup(value);
5834 sync = B_TRUE;
5835 } else if (strcmp(value, vd->vdev_fru) != 0) {
5836 spa_strfree(vd->vdev_fru);
5837 vd->vdev_fru = spa_strdup(value);
5838 sync = B_TRUE;
5839 }
5840 }
5841
5842 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5843 }
5844
5845 int
5846 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5847 {
5848 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5849 }
5850
5851 int
5852 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5853 {
5854 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5855 }
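
/*
 * Editorial sketch (compiled out): updating both the stored path and FRU of
 * a leaf vdev through the wrappers above. The function name, device path,
 * and FRU string are placeholders.
 */
#if 0
static int
example_update_vdev_identity(spa_t *spa, uint64_t guid)
{
	int error;

	if ((error = spa_vdev_setpath(spa, guid,
	    "/dev/disk/by-id/ata-EXAMPLE")) != 0)
		return (error);
	return (spa_vdev_setfru(spa, guid, "Slot-3"));
}
#endif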
5856
5857 /*
5858 * ==========================================================================
5859 * SPA Scanning
5860 * ==========================================================================
5861 */
5862
5863 int
5864 spa_scan_stop(spa_t *spa)
5865 {
5866 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5867 if (dsl_scan_resilvering(spa->spa_dsl_pool))
5868 return (SET_ERROR(EBUSY));
5869 return (dsl_scan_cancel(spa->spa_dsl_pool));
5870 }
5871
5872 int
5873 spa_scan(spa_t *spa, pool_scan_func_t func)
5874 {
5875 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5876
5877 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5878 return (SET_ERROR(ENOTSUP));
5879
5880 /*
5881 * If a resilver was requested, but there is no DTL on a
5882 * writeable leaf device, we have nothing to do.
5883 */
5884 if (func == POOL_SCAN_RESILVER &&
5885 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5886 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5887 return (0);
5888 }
5889
5890 return (dsl_scan(spa->spa_dsl_pool, func));
5891 }
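
/*
 * Editorial sketch (compiled out): starting and stopping a scrub with the
 * scanning entry points above. The function name is hypothetical; the real
 * callers are the pool-scan ioctl handlers.
 */
#if 0
static int
example_scrub(spa_t *spa, boolean_t stop)
{
	if (stop)
		return (spa_scan_stop(spa));
	return (spa_scan(spa, POOL_SCAN_SCRUB));
}
#endif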
5892
5893 /*
5894 * ==========================================================================
5895 * SPA async task processing
5896 * ==========================================================================
5897 */
5898
5899 static void
5900 spa_async_remove(spa_t *spa, vdev_t *vd)
5901 {
5902 int c;
5903
5904 if (vd->vdev_remove_wanted) {
5905 vd->vdev_remove_wanted = B_FALSE;
5906 vd->vdev_delayed_close = B_FALSE;
5907 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5908
5909 /*
5910 * We want to clear the stats, but we don't want to do a full
5911 * vdev_clear() as that will cause us to throw away
5912 * degraded/faulted state as well as attempt to reopen the
5913 * device, all of which is a waste.
5914 */
5915 vd->vdev_stat.vs_read_errors = 0;
5916 vd->vdev_stat.vs_write_errors = 0;
5917 vd->vdev_stat.vs_checksum_errors = 0;
5918
5919 vdev_state_dirty(vd->vdev_top);
5920 }
5921
5922 for (c = 0; c < vd->vdev_children; c++)
5923 spa_async_remove(spa, vd->vdev_child[c]);
5924 }
5925
5926 static void
5927 spa_async_probe(spa_t *spa, vdev_t *vd)
5928 {
5929 int c;
5930
5931 if (vd->vdev_probe_wanted) {
5932 vd->vdev_probe_wanted = B_FALSE;
5933 vdev_reopen(vd); /* vdev_open() does the actual probe */
5934 }
5935
5936 for (c = 0; c < vd->vdev_children; c++)
5937 spa_async_probe(spa, vd->vdev_child[c]);
5938 }
5939
5940 static void
5941 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5942 {
5943 int c;
5944
5945 if (!spa->spa_autoexpand)
5946 return;
5947
5948 for (c = 0; c < vd->vdev_children; c++) {
5949 vdev_t *cvd = vd->vdev_child[c];
5950 spa_async_autoexpand(spa, cvd);
5951 }
5952
5953 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5954 return;
5955
5956 spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_AUTOEXPAND);
5957 }
5958
5959 static void
5960 spa_async_thread(spa_t *spa)
5961 {
5962 int tasks, i;
5963
5964 ASSERT(spa->spa_sync_on);
5965
5966 mutex_enter(&spa->spa_async_lock);
5967 tasks = spa->spa_async_tasks;
5968 spa->spa_async_tasks = 0;
5969 mutex_exit(&spa->spa_async_lock);
5970
5971 /*
5972 * See if the config needs to be updated.
5973 */
5974 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5975 uint64_t old_space, new_space;
5976
5977 mutex_enter(&spa_namespace_lock);
5978 old_space = metaslab_class_get_space(spa_normal_class(spa));
5979 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5980 new_space = metaslab_class_get_space(spa_normal_class(spa));
5981 mutex_exit(&spa_namespace_lock);
5982
5983 /*
5984 * If the pool grew as a result of the config update,
5985 * then log an internal history event.
5986 */
5987 if (new_space != old_space) {
5988 spa_history_log_internal(spa, "vdev online", NULL,
5989 "pool '%s' size: %llu(+%llu)",
5990 spa_name(spa), new_space, new_space - old_space);
5991 }
5992 }
5993
5994 /*
5995 * See if any devices need to be marked REMOVED.
5996 */
5997 if (tasks & SPA_ASYNC_REMOVE) {
5998 spa_vdev_state_enter(spa, SCL_NONE);
5999 spa_async_remove(spa, spa->spa_root_vdev);
6000 for (i = 0; i < spa->spa_l2cache.sav_count; i++)
6001 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
6002 for (i = 0; i < spa->spa_spares.sav_count; i++)
6003 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
6004 (void) spa_vdev_state_exit(spa, NULL, 0);
6005 }
6006
6007 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
6008 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6009 spa_async_autoexpand(spa, spa->spa_root_vdev);
6010 spa_config_exit(spa, SCL_CONFIG, FTAG);
6011 }
6012
6013 /*
6014 * See if any devices need to be probed.
6015 */
6016 if (tasks & SPA_ASYNC_PROBE) {
6017 spa_vdev_state_enter(spa, SCL_NONE);
6018 spa_async_probe(spa, spa->spa_root_vdev);
6019 (void) spa_vdev_state_exit(spa, NULL, 0);
6020 }
6021
6022 /*
6023 * If any devices are done replacing, detach them.
6024 */
6025 if (tasks & SPA_ASYNC_RESILVER_DONE)
6026 spa_vdev_resilver_done(spa);
6027
6028 /*
6029 * Kick off a resilver.
6030 */
6031 if (tasks & SPA_ASYNC_RESILVER)
6032 dsl_resilver_restart(spa->spa_dsl_pool, 0);
6033
6034 /*
6035 * Let the world know that we're done.
6036 */
6037 mutex_enter(&spa->spa_async_lock);
6038 spa->spa_async_thread = NULL;
6039 cv_broadcast(&spa->spa_async_cv);
6040 mutex_exit(&spa->spa_async_lock);
6041 thread_exit();
6042 }
6043
6044 void
6045 spa_async_suspend(spa_t *spa)
6046 {
6047 mutex_enter(&spa->spa_async_lock);
6048 spa->spa_async_suspended++;
6049 while (spa->spa_async_thread != NULL)
6050 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
6051 mutex_exit(&spa->spa_async_lock);
6052 }
6053
6054 void
6055 spa_async_resume(spa_t *spa)
6056 {
6057 mutex_enter(&spa->spa_async_lock);
6058 ASSERT(spa->spa_async_suspended != 0);
6059 spa->spa_async_suspended--;
6060 mutex_exit(&spa->spa_async_lock);
6061 }
6062
6063 static boolean_t
6064 spa_async_tasks_pending(spa_t *spa)
6065 {
6066 uint_t non_config_tasks;
6067 uint_t config_task;
6068 boolean_t config_task_suspended;
6069
6070 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
6071 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
6072 if (spa->spa_ccw_fail_time == 0) {
6073 config_task_suspended = B_FALSE;
6074 } else {
6075 config_task_suspended =
6076 (gethrtime() - spa->spa_ccw_fail_time) <
6077 (zfs_ccw_retry_interval * NANOSEC);
6078 }
6079
6080 return (non_config_tasks || (config_task && !config_task_suspended));
6081 }
6082
6083 static void
6084 spa_async_dispatch(spa_t *spa)
6085 {
6086 mutex_enter(&spa->spa_async_lock);
6087 if (spa_async_tasks_pending(spa) &&
6088 !spa->spa_async_suspended &&
6089 spa->spa_async_thread == NULL &&
6090 rootdir != NULL)
6091 spa->spa_async_thread = thread_create(NULL, 0,
6092 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
6093 mutex_exit(&spa->spa_async_lock);
6094 }
6095
6096 void
6097 spa_async_request(spa_t *spa, int task)
6098 {
6099 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
6100 mutex_enter(&spa->spa_async_lock);
6101 spa->spa_async_tasks |= task;
6102 mutex_exit(&spa->spa_async_lock);
6103 }
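
/*
 * Editorial sketch (compiled out): the intended usage pattern for the async
 * task interface above. Requested task bits are OR'ed into spa_async_tasks
 * and picked up by spa_async_dispatch() at the end of spa_sync();
 * suspend/resume brackets work that must not race with the async thread.
 */
#if 0
	/* Queue a config refresh to be handled asynchronously. */
	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

	/* Keep the async thread quiescent around an exclusive operation. */
	spa_async_suspend(spa);
	/* ... operation that must not race with async tasks ... */
	spa_async_resume(spa);
#endif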
6104
6105 /*
6106 * ==========================================================================
6107 * SPA syncing routines
6108 * ==========================================================================
6109 */
6110
6111 static int
6112 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6113 {
6114 bpobj_t *bpo = arg;
6115 bpobj_enqueue(bpo, bp, tx);
6116 return (0);
6117 }
6118
6119 static int
6120 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6121 {
6122 zio_t *zio = arg;
6123
6124 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
6125 zio->io_flags));
6126 return (0);
6127 }
6128
6129 /*
6130 * Note: this simple function is not inlined to make it easier to dtrace the
6131 * amount of time spent syncing frees.
6132 */
6133 static void
6134 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
6135 {
6136 zio_t *zio = zio_root(spa, NULL, NULL, 0);
6137 bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
6138 VERIFY(zio_wait(zio) == 0);
6139 }
6140
6141 /*
6142 * Note: this simple function is not inlined to make it easier to dtrace the
6143 * amount of time spent syncing deferred frees.
6144 */
6145 static void
6146 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
6147 {
6148 zio_t *zio = zio_root(spa, NULL, NULL, 0);
6149 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
6150 spa_free_sync_cb, zio, tx), ==, 0);
6151 VERIFY0(zio_wait(zio));
6152 }
6153
6154 static void
6155 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
6156 {
6157 char *packed = NULL;
6158 size_t bufsize;
6159 size_t nvsize = 0;
6160 dmu_buf_t *db;
6161
6162 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
6163
6164 /*
6165 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
6166 * information. This avoids the dmu_buf_will_dirty() path and
6167 * saves us a pre-read to get data we don't actually care about.
6168 */
6169 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
6170 packed = vmem_alloc(bufsize, KM_SLEEP);
6171
6172 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
6173 KM_SLEEP) == 0);
6174 bzero(packed + nvsize, bufsize - nvsize);
6175
6176 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
6177
6178 vmem_free(packed, bufsize);
6179
6180 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
6181 dmu_buf_will_dirty(db, tx);
6182 *(uint64_t *)db->db_data = nvsize;
6183 dmu_buf_rele(db, FTAG);
6184 }
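
/*
 * Editorial sketch (compiled out): the inverse of spa_sync_nvlist(), reading
 * the packed size back from the bonus buffer and unpacking the nvlist. This
 * mirrors what load_nvlist() does earlier in this file; the function name
 * here is hypothetical.
 */
#if 0
static int
example_read_nvlist(spa_t *spa, uint64_t obj, nvlist_t **nvp)
{
	dmu_buf_t *db;
	uint64_t nvsize;
	char *packed;
	int error;

	if ((error = dmu_bonus_hold(spa->spa_meta_objset, obj,
	    FTAG, &db)) != 0)
		return (error);
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = vmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, nvp, 0);
	vmem_free(packed, nvsize);
	return (error);
}
#endif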
6185
6186 static void
6187 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
6188 const char *config, const char *entry)
6189 {
6190 nvlist_t *nvroot;
6191 nvlist_t **list;
6192 int i;
6193
6194 if (!sav->sav_sync)
6195 return;
6196
6197 /*
6198 * Update the MOS nvlist describing the list of available devices.
6199 * spa_validate_aux() will have already made sure this nvlist is
6200 * valid and the vdevs are labeled appropriately.
6201 */
6202 if (sav->sav_object == 0) {
6203 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
6204 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
6205 sizeof (uint64_t), tx);
6206 VERIFY(zap_update(spa->spa_meta_objset,
6207 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
6208 &sav->sav_object, tx) == 0);
6209 }
6210
6211 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6212 if (sav->sav_count == 0) {
6213 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
6214 } else {
6215 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
6216 for (i = 0; i < sav->sav_count; i++)
6217 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
6218 B_FALSE, VDEV_CONFIG_L2CACHE);
6219 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
6220 sav->sav_count) == 0);
6221 for (i = 0; i < sav->sav_count; i++)
6222 nvlist_free(list[i]);
6223 kmem_free(list, sav->sav_count * sizeof (void *));
6224 }
6225
6226 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
6227 nvlist_free(nvroot);
6228
6229 sav->sav_sync = B_FALSE;
6230 }
6231
6232 /*
6233 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
6234 * The all-vdev ZAP must be empty.
6235 */
6236 static void
6237 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
6238 {
6239 spa_t *spa = vd->vdev_spa;
6240 uint64_t i;
6241
6242 if (vd->vdev_top_zap != 0) {
6243 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
6244 vd->vdev_top_zap, tx));
6245 }
6246 if (vd->vdev_leaf_zap != 0) {
6247 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
6248 vd->vdev_leaf_zap, tx));
6249 }
6250 for (i = 0; i < vd->vdev_children; i++) {
6251 spa_avz_build(vd->vdev_child[i], avz, tx);
6252 }
6253 }
6254
6255 static void
6256 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
6257 {
6258 nvlist_t *config;
6259
6260 /*
6261 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
6262 * its config may not be dirty but we still need to build per-vdev ZAPs.
6263 * Similarly, if the pool is being assembled (e.g. after a split), we
6264 * need to rebuild the AVZ although the config may not be dirty.
6265 */
6266 if (list_is_empty(&spa->spa_config_dirty_list) &&
6267 spa->spa_avz_action == AVZ_ACTION_NONE)
6268 return;
6269
6270 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6271
6272 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
6273 spa->spa_all_vdev_zaps != 0);
6274
6275 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
6276 zap_cursor_t zc;
6277 zap_attribute_t za;
6278
6279 /* Make and build the new AVZ */
6280 uint64_t new_avz = zap_create(spa->spa_meta_objset,
6281 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
6282 spa_avz_build(spa->spa_root_vdev, new_avz, tx);
6283
6284 /* Diff old AVZ with new one */
6285 for (zap_cursor_init(&zc, spa->spa_meta_objset,
6286 spa->spa_all_vdev_zaps);
6287 zap_cursor_retrieve(&zc, &za) == 0;
6288 zap_cursor_advance(&zc)) {
6289 uint64_t vdzap = za.za_first_integer;
6290 if (zap_lookup_int(spa->spa_meta_objset, new_avz,
6291 vdzap) == ENOENT) {
6292 /*
6293 * ZAP is listed in old AVZ but not in new one;
6294 * destroy it
6295 */
6296 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
6297 tx));
6298 }
6299 }
6300
6301 zap_cursor_fini(&zc);
6302
6303 /* Destroy the old AVZ */
6304 VERIFY0(zap_destroy(spa->spa_meta_objset,
6305 spa->spa_all_vdev_zaps, tx));
6306
6307 /* Replace the old AVZ in the dir obj with the new one */
6308 VERIFY0(zap_update(spa->spa_meta_objset,
6309 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
6310 sizeof (new_avz), 1, &new_avz, tx));
6311
6312 spa->spa_all_vdev_zaps = new_avz;
6313 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
6314 zap_cursor_t zc;
6315 zap_attribute_t za;
6316
6317 /* Walk through the AVZ and destroy all listed ZAPs */
6318 for (zap_cursor_init(&zc, spa->spa_meta_objset,
6319 spa->spa_all_vdev_zaps);
6320 zap_cursor_retrieve(&zc, &za) == 0;
6321 zap_cursor_advance(&zc)) {
6322 uint64_t zap = za.za_first_integer;
6323 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
6324 }
6325
6326 zap_cursor_fini(&zc);
6327
6328 /* Destroy and unlink the AVZ itself */
6329 VERIFY0(zap_destroy(spa->spa_meta_objset,
6330 spa->spa_all_vdev_zaps, tx));
6331 VERIFY0(zap_remove(spa->spa_meta_objset,
6332 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
6333 spa->spa_all_vdev_zaps = 0;
6334 }
6335
6336 if (spa->spa_all_vdev_zaps == 0) {
6337 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
6338 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
6339 DMU_POOL_VDEV_ZAP_MAP, tx);
6340 }
6341 spa->spa_avz_action = AVZ_ACTION_NONE;
6342
6343 /* Create ZAPs for vdevs that don't have them. */
6344 vdev_construct_zaps(spa->spa_root_vdev, tx);
6345
6346 config = spa_config_generate(spa, spa->spa_root_vdev,
6347 dmu_tx_get_txg(tx), B_FALSE);
6348
6349 /*
6350 * If we're upgrading the spa version then make sure that
6351 * the config object gets updated with the correct version.
6352 */
6353 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
6354 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6355 spa->spa_uberblock.ub_version);
6356
6357 spa_config_exit(spa, SCL_STATE, FTAG);
6358
6359 nvlist_free(spa->spa_config_syncing);
6360 spa->spa_config_syncing = config;
6361
6362 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
6363 }
6364
6365 static void
6366 spa_sync_version(void *arg, dmu_tx_t *tx)
6367 {
6368 uint64_t *versionp = arg;
6369 uint64_t version = *versionp;
6370 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6371
6372 /*
6373 * Setting the version is special cased when first creating the pool.
6374 */
6375 ASSERT(tx->tx_txg != TXG_INITIAL);
6376
6377 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6378 ASSERT(version >= spa_version(spa));
6379
6380 spa->spa_uberblock.ub_version = version;
6381 vdev_config_dirty(spa->spa_root_vdev);
6382 spa_history_log_internal(spa, "set", tx, "version=%lld", version);
6383 }
6384
6385 /*
6386 * Set zpool properties.
6387 */
6388 static void
6389 spa_sync_props(void *arg, dmu_tx_t *tx)
6390 {
6391 nvlist_t *nvp = arg;
6392 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6393 objset_t *mos = spa->spa_meta_objset;
6394 nvpair_t *elem = NULL;
6395
6396 mutex_enter(&spa->spa_props_lock);
6397
6398 while ((elem = nvlist_next_nvpair(nvp, elem))) {
6399 uint64_t intval;
6400 char *strval, *fname;
6401 zpool_prop_t prop;
6402 const char *propname;
6403 zprop_type_t proptype;
6404 spa_feature_t fid;
6405
6406 prop = zpool_name_to_prop(nvpair_name(elem));
6407 switch ((int)prop) {
6408 case ZPROP_INVAL:
6409 /*
6410 * We checked this earlier in spa_prop_validate().
6411 */
6412 ASSERT(zpool_prop_feature(nvpair_name(elem)));
6413
6414 fname = strchr(nvpair_name(elem), '@') + 1;
6415 VERIFY0(zfeature_lookup_name(fname, &fid));
6416
6417 spa_feature_enable(spa, fid, tx);
6418 spa_history_log_internal(spa, "set", tx,
6419 "%s=enabled", nvpair_name(elem));
6420 break;
6421
6422 case ZPOOL_PROP_VERSION:
6423 intval = fnvpair_value_uint64(elem);
6424 /*
6425 * The version is synced separately before other
6426 * properties and should be correct by now.
6427 */
6428 ASSERT3U(spa_version(spa), >=, intval);
6429 break;
6430
6431 case ZPOOL_PROP_ALTROOT:
6432 /*
6433 * 'altroot' is a non-persistent property. It should
6434 * have been set temporarily at creation or import time.
6435 */
6436 ASSERT(spa->spa_root != NULL);
6437 break;
6438
6439 case ZPOOL_PROP_READONLY:
6440 case ZPOOL_PROP_CACHEFILE:
6441 /*
6442 * 'readonly' and 'cachefile' are also non-persistent
6443 * properties.
6444 */
6445 break;
6446 case ZPOOL_PROP_COMMENT:
6447 strval = fnvpair_value_string(elem);
6448 if (spa->spa_comment != NULL)
6449 spa_strfree(spa->spa_comment);
6450 spa->spa_comment = spa_strdup(strval);
6451 /*
6452 * We need to dirty the configuration on all the vdevs
6453 * so that their labels get updated. It's unnecessary
6454 * to do this for pool creation since the vdev's
6455 * configuration has already been dirtied.
6456 */
6457 if (tx->tx_txg != TXG_INITIAL)
6458 vdev_config_dirty(spa->spa_root_vdev);
6459 spa_history_log_internal(spa, "set", tx,
6460 "%s=%s", nvpair_name(elem), strval);
6461 break;
6462 default:
6463 /*
6464 * Set pool property values in the poolprops mos object.
6465 */
6466 if (spa->spa_pool_props_object == 0) {
6467 spa->spa_pool_props_object =
6468 zap_create_link(mos, DMU_OT_POOL_PROPS,
6469 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6470 tx);
6471 }
6472
6473 /* normalize the property name */
6474 propname = zpool_prop_to_name(prop);
6475 proptype = zpool_prop_get_type(prop);
6476
6477 if (nvpair_type(elem) == DATA_TYPE_STRING) {
6478 ASSERT(proptype == PROP_TYPE_STRING);
6479 strval = fnvpair_value_string(elem);
6480 VERIFY0(zap_update(mos,
6481 spa->spa_pool_props_object, propname,
6482 1, strlen(strval) + 1, strval, tx));
6483 spa_history_log_internal(spa, "set", tx,
6484 "%s=%s", nvpair_name(elem), strval);
6485 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6486 intval = fnvpair_value_uint64(elem);
6487
6488 if (proptype == PROP_TYPE_INDEX) {
6489 const char *unused;
6490 VERIFY0(zpool_prop_index_to_string(
6491 prop, intval, &unused));
6492 }
6493 VERIFY0(zap_update(mos,
6494 spa->spa_pool_props_object, propname,
6495 8, 1, &intval, tx));
6496 spa_history_log_internal(spa, "set", tx,
6497 "%s=%lld", nvpair_name(elem), intval);
6498 } else {
6499 ASSERT(0); /* not allowed */
6500 }
6501
6502 switch (prop) {
6503 case ZPOOL_PROP_DELEGATION:
6504 spa->spa_delegation = intval;
6505 break;
6506 case ZPOOL_PROP_BOOTFS:
6507 spa->spa_bootfs = intval;
6508 break;
6509 case ZPOOL_PROP_FAILUREMODE:
6510 spa->spa_failmode = intval;
6511 break;
6512 case ZPOOL_PROP_AUTOEXPAND:
6513 spa->spa_autoexpand = intval;
6514 if (tx->tx_txg != TXG_INITIAL)
6515 spa_async_request(spa,
6516 SPA_ASYNC_AUTOEXPAND);
6517 break;
6518 case ZPOOL_PROP_DEDUPDITTO:
6519 spa->spa_dedup_ditto = intval;
6520 break;
6521 default:
6522 break;
6523 }
6524 }
6525
6526 }
6527
6528 mutex_exit(&spa->spa_props_lock);
6529 }
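
/*
 * Editorial sketch (compiled out): how a property change reaches
 * spa_sync_props(). A caller builds an nvlist of pool properties and hands
 * it to spa_prop_set(), which schedules the sync task; the function name,
 * property, and value here are only examples.
 */
#if 0
static int
example_set_autoexpand(spa_t *spa)
{
	nvlist_t *props = fnvlist_alloc();
	int error;

	fnvlist_add_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1);
	error = spa_prop_set(spa, props);
	fnvlist_free(props);
	return (error);
}
#endif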
6530
6531 /*
6532 * Perform one-time upgrade on-disk changes. spa_version() does not
6533 * reflect the new version this txg, so there must be no changes this
6534 * txg to anything that the upgrade code depends on after it executes.
6535 * Therefore this must be called after dsl_pool_sync() does the sync
6536 * tasks.
6537 */
6538 static void
6539 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6540 {
6541 dsl_pool_t *dp = spa->spa_dsl_pool;
6542
6543 ASSERT(spa->spa_sync_pass == 1);
6544
6545 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6546
6547 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6548 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6549 dsl_pool_create_origin(dp, tx);
6550
6551 /* Keeping the origin open increases spa_minref */
6552 spa->spa_minref += 3;
6553 }
6554
6555 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6556 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6557 dsl_pool_upgrade_clones(dp, tx);
6558 }
6559
6560 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6561 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6562 dsl_pool_upgrade_dir_clones(dp, tx);
6563
6564 /* Keeping the freedir open increases spa_minref */
6565 spa->spa_minref += 3;
6566 }
6567
6568 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6569 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6570 spa_feature_create_zap_objects(spa, tx);
6571 }
6572
6573 /*
6574 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
6575 * when the ability to use lz4 compression for metadata was added.
6576 * Old pools that have this feature enabled must be upgraded to have
6577 * this feature active.
6578 */
6579 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6580 boolean_t lz4_en = spa_feature_is_enabled(spa,
6581 SPA_FEATURE_LZ4_COMPRESS);
6582 boolean_t lz4_ac = spa_feature_is_active(spa,
6583 SPA_FEATURE_LZ4_COMPRESS);
6584
6585 if (lz4_en && !lz4_ac)
6586 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
6587 }
6588 rrw_exit(&dp->dp_config_rwlock, FTAG);
6589 }
6590
6591 /*
6592 * Sync the specified transaction group. New blocks may be dirtied as
6593 * part of the process, so we iterate until it converges.
6594 */
6595 void
6596 spa_sync(spa_t *spa, uint64_t txg)
6597 {
6598 dsl_pool_t *dp = spa->spa_dsl_pool;
6599 objset_t *mos = spa->spa_meta_objset;
6600 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6601 vdev_t *rvd = spa->spa_root_vdev;
6602 vdev_t *vd;
6603 dmu_tx_t *tx;
6604 int error;
6605 int c;
6606
6607 VERIFY(spa_writeable(spa));
6608
6609 /*
6610 * Lock out configuration changes.
6611 */
6612 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6613
6614 spa->spa_syncing_txg = txg;
6615 spa->spa_sync_pass = 0;
6616
6617 /*
6618 * If there are any pending vdev state changes, convert them
6619 * into config changes that go out with this transaction group.
6620 */
6621 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6622 while (list_head(&spa->spa_state_dirty_list) != NULL) {
6623 /*
6624 * We need the write lock here because, for aux vdevs,
6625 * calling vdev_config_dirty() modifies sav_config.
6626 * This is ugly and will become unnecessary when we
6627 * eliminate the aux vdev wart by integrating all vdevs
6628 * into the root vdev tree.
6629 */
6630 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6631 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6632 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6633 vdev_state_clean(vd);
6634 vdev_config_dirty(vd);
6635 }
6636 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6637 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6638 }
6639 spa_config_exit(spa, SCL_STATE, FTAG);
6640
6641 tx = dmu_tx_create_assigned(dp, txg);
6642
6643 spa->spa_sync_starttime = gethrtime();
6644 taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
6645 spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
6646 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
6647 NSEC_TO_TICK(spa->spa_deadman_synctime));
6648
6649 /*
6650 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6651 * set spa_deflate if we have no raid-z vdevs.
6652 */
6653 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6654 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6655 int i;
6656
6657 for (i = 0; i < rvd->vdev_children; i++) {
6658 vd = rvd->vdev_child[i];
6659 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6660 break;
6661 }
6662 if (i == rvd->vdev_children) {
6663 spa->spa_deflate = TRUE;
6664 VERIFY(0 == zap_add(spa->spa_meta_objset,
6665 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6666 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6667 }
6668 }
6669
6670 /*
6671 * Iterate to convergence.
6672 */
6673 do {
6674 int pass = ++spa->spa_sync_pass;
6675
6676 spa_sync_config_object(spa, tx);
6677 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6678 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6679 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6680 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6681 spa_errlog_sync(spa, txg);
6682 dsl_pool_sync(dp, txg);
6683
6684 if (pass < zfs_sync_pass_deferred_free) {
6685 spa_sync_frees(spa, free_bpl, tx);
6686 } else {
6687 /*
6688 * We can not defer frees in pass 1, because
6689 * we sync the deferred frees later in pass 1.
6690 */
6691 ASSERT3U(pass, >, 1);
6692 bplist_iterate(free_bpl, bpobj_enqueue_cb,
6693 &spa->spa_deferred_bpobj, tx);
6694 }
6695
6696 ddt_sync(spa, txg);
6697 dsl_scan_sync(dp, tx);
6698
6699 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)))
6700 vdev_sync(vd, txg);
6701
6702 if (pass == 1) {
6703 spa_sync_upgrades(spa, tx);
6704 ASSERT3U(txg, >=,
6705 spa->spa_uberblock.ub_rootbp.blk_birth);
6706 /*
6707 * Note: We need to check if the MOS is dirty
6708 * because we could have marked the MOS dirty
6709 * without updating the uberblock (e.g. if we
6710 * have sync tasks but no dirty user data). We
6711 * need to check the uberblock's rootbp because
6712 * it is updated if we have synced out dirty
6713 * data (though in this case the MOS will most
6714 * likely also be dirty due to second order
6715 * effects, we don't want to rely on that here).
6716 */
6717 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
6718 !dmu_objset_is_dirty(mos, txg)) {
6719 /*
6720 * Nothing changed on the first pass,
6721 * therefore this TXG is a no-op. Avoid
6722 * syncing deferred frees, so that we
6723 * can keep this TXG as a no-op.
6724 */
6725 ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
6726 txg));
6727 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6728 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
6729 break;
6730 }
6731 spa_sync_deferred_frees(spa, tx);
6732 }
6733
6734 } while (dmu_objset_is_dirty(mos, txg));
6735
6736 #ifdef ZFS_DEBUG
6737 if (!list_is_empty(&spa->spa_config_dirty_list)) {
6738 /*
6739 * Make sure that the number of ZAPs for all the vdevs matches
6740 * the number of ZAPs in the per-vdev ZAP list. This only gets
6741 * called if the config is dirty; otherwise there may be
6742 * outstanding AVZ operations that weren't completed in
6743 * spa_sync_config_object.
6744 */
6745 uint64_t all_vdev_zap_entry_count;
6746 ASSERT0(zap_count(spa->spa_meta_objset,
6747 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
6748 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
6749 all_vdev_zap_entry_count);
6750 }
6751 #endif
6752
6753 /*
6754 * Rewrite the vdev configuration (which includes the uberblock)
6755 * to commit the transaction group.
6756 *
6757 * If there are no dirty vdevs, we sync the uberblock to a few
6758 * random top-level vdevs that are known to be visible in the
6759 * config cache (see spa_vdev_add() for a complete description).
6760 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6761 */
6762 for (;;) {
6763 /*
6764 * We hold SCL_STATE to prevent vdev open/close/etc.
6765 * while we're attempting to write the vdev labels.
6766 */
6767 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6768
6769 if (list_is_empty(&spa->spa_config_dirty_list)) {
6770 vdev_t *svd[SPA_DVAS_PER_BP];
6771 int svdcount = 0;
6772 int children = rvd->vdev_children;
6773 int c0 = spa_get_random(children);
6774
6775 for (c = 0; c < children; c++) {
6776 vd = rvd->vdev_child[(c0 + c) % children];
6777 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6778 continue;
6779 svd[svdcount++] = vd;
6780 if (svdcount == SPA_DVAS_PER_BP)
6781 break;
6782 }
6783 error = vdev_config_sync(svd, svdcount, txg);
6784 } else {
6785 error = vdev_config_sync(rvd->vdev_child,
6786 rvd->vdev_children, txg);
6787 }
6788
6789 if (error == 0)
6790 spa->spa_last_synced_guid = rvd->vdev_guid;
6791
6792 spa_config_exit(spa, SCL_STATE, FTAG);
6793
6794 if (error == 0)
6795 break;
6796 zio_suspend(spa, NULL);
6797 zio_resume_wait(spa);
6798 }
6799 dmu_tx_commit(tx);
6800
6801 taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
6802 spa->spa_deadman_tqid = 0;
6803
6804 /*
6805 * Clear the dirty config list.
6806 */
6807 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
6808 vdev_config_clean(vd);
6809
6810 /*
6811 * Now that the new config has synced transactionally,
6812 * let it become visible to the config cache.
6813 */
6814 if (spa->spa_config_syncing != NULL) {
6815 spa_config_set(spa, spa->spa_config_syncing);
6816 spa->spa_config_txg = txg;
6817 spa->spa_config_syncing = NULL;
6818 }
6819
6820 spa->spa_ubsync = spa->spa_uberblock;
6821
6822 dsl_pool_sync_done(dp, txg);
6823
6824 /*
6825 * Update usable space statistics.
6826 */
6827 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))))
6828 vdev_sync_done(vd, txg);
6829
6830 spa_update_dspace(spa);
6831
6832 /*
6833 * It had better be the case that we didn't dirty anything
6834 * since vdev_config_sync().
6835 */
6836 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6837 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6838 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
6839
6840 spa->spa_sync_pass = 0;
6841
6842 spa_config_exit(spa, SCL_CONFIG, FTAG);
6843
6844 spa_handle_ignored_writes(spa);
6845
6846 /*
6847 * If any async tasks have been requested, kick them off.
6848 */
6849 spa_async_dispatch(spa);
6850 }
6851
6852 /*
6853 * Sync all pools. We don't want to hold the namespace lock across these
6854 * operations, so we take a reference on the spa_t and drop the lock during the
6855 * sync.
6856 */
6857 void
6858 spa_sync_allpools(void)
6859 {
6860 spa_t *spa = NULL;
6861 mutex_enter(&spa_namespace_lock);
6862 while ((spa = spa_next(spa)) != NULL) {
6863 if (spa_state(spa) != POOL_STATE_ACTIVE ||
6864 !spa_writeable(spa) || spa_suspended(spa))
6865 continue;
6866 spa_open_ref(spa, FTAG);
6867 mutex_exit(&spa_namespace_lock);
6868 txg_wait_synced(spa_get_dsl(spa), 0);
6869 mutex_enter(&spa_namespace_lock);
6870 spa_close(spa, FTAG);
6871 }
6872 mutex_exit(&spa_namespace_lock);
6873 }
6874
6875 /*
6876 * ==========================================================================
6877 * Miscellaneous routines
6878 * ==========================================================================
6879 */
6880
6881 /*
6882 * Remove all pools in the system.
6883 */
6884 void
6885 spa_evict_all(void)
6886 {
6887 spa_t *spa;
6888
6889 /*
6890 * Remove all cached state. All pools should be closed now,
6891 * so every spa in the AVL tree should be unreferenced.
6892 */
6893 mutex_enter(&spa_namespace_lock);
6894 while ((spa = spa_next(NULL)) != NULL) {
6895 /*
6896 * Stop async tasks. The async thread may need to detach
6897 * a device that's been replaced, which requires grabbing
6898 * spa_namespace_lock, so we must drop it here.
6899 */
6900 spa_open_ref(spa, FTAG);
6901 mutex_exit(&spa_namespace_lock);
6902 spa_async_suspend(spa);
6903 mutex_enter(&spa_namespace_lock);
6904 spa_close(spa, FTAG);
6905
6906 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6907 spa_unload(spa);
6908 spa_deactivate(spa);
6909 }
6910 spa_remove(spa);
6911 }
6912 mutex_exit(&spa_namespace_lock);
6913 }
6914
6915 vdev_t *
6916 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
6917 {
6918 vdev_t *vd;
6919 int i;
6920
6921 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6922 return (vd);
6923
6924 if (aux) {
6925 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6926 vd = spa->spa_l2cache.sav_vdevs[i];
6927 if (vd->vdev_guid == guid)
6928 return (vd);
6929 }
6930
6931 for (i = 0; i < spa->spa_spares.sav_count; i++) {
6932 vd = spa->spa_spares.sav_vdevs[i];
6933 if (vd->vdev_guid == guid)
6934 return (vd);
6935 }
6936 }
6937
6938 return (NULL);
6939 }
6940
6941 void
6942 spa_upgrade(spa_t *spa, uint64_t version)
6943 {
6944 ASSERT(spa_writeable(spa));
6945
6946 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6947
6948 /*
6949 * This should only be called for a non-faulted pool, and since a
6950 * future version would result in an unopenable pool, this shouldn't be
6951 * possible.
6952 */
6953 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6954 ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
6955
6956 spa->spa_uberblock.ub_version = version;
6957 vdev_config_dirty(spa->spa_root_vdev);
6958
6959 spa_config_exit(spa, SCL_ALL, FTAG);
6960
6961 txg_wait_synced(spa_get_dsl(spa), 0);
6962 }
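
/*
 * Editorial sketch (compiled out): a typical caller upgrades straight to
 * SPA_VERSION, the newest supported on-disk version, roughly what the
 * pool-upgrade ioctl does after validating the requested version.
 */
#if 0
	spa_upgrade(spa, SPA_VERSION);
#endif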
6963
6964 boolean_t
6965 spa_has_spare(spa_t *spa, uint64_t guid)
6966 {
6967 int i;
6968 uint64_t spareguid;
6969 spa_aux_vdev_t *sav = &spa->spa_spares;
6970
6971 for (i = 0; i < sav->sav_count; i++)
6972 if (sav->sav_vdevs[i]->vdev_guid == guid)
6973 return (B_TRUE);
6974
6975 for (i = 0; i < sav->sav_npending; i++) {
6976 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6977 &spareguid) == 0 && spareguid == guid)
6978 return (B_TRUE);
6979 }
6980
6981 return (B_FALSE);
6982 }
6983
6984 /*
6985 * Check if a pool has an active shared spare device.
6986 * Note: the reference count of an active spare is 2, as a spare and as a replacement
6987 */
6988 static boolean_t
6989 spa_has_active_shared_spare(spa_t *spa)
6990 {
6991 int i, refcnt;
6992 uint64_t pool;
6993 spa_aux_vdev_t *sav = &spa->spa_spares;
6994
6995 for (i = 0; i < sav->sav_count; i++) {
6996 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6997 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6998 refcnt > 2)
6999 return (B_TRUE);
7000 }
7001
7002 return (B_FALSE);
7003 }
7004
7005 /*
7006 * Post a FM_EREPORT_ZFS_* event from sys/fm/fs/zfs.h. The payload will be
7007 * filled in from the spa and (optionally) the vdev. This doesn't do anything
7008 * in the userland libzpool, as we don't want consumers to misinterpret ztest
7009 * or zdb as real changes.
7010 */
7011 void
7012 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
7013 {
7014 #ifdef _KERNEL
7015 zfs_ereport_post(name, spa, vd, NULL, 0, 0);
7016 #endif
7017 }
7018
7019 #if defined(_KERNEL) && defined(HAVE_SPL)
7020 /* state manipulation functions */
7021 EXPORT_SYMBOL(spa_open);
7022 EXPORT_SYMBOL(spa_open_rewind);
7023 EXPORT_SYMBOL(spa_get_stats);
7024 EXPORT_SYMBOL(spa_create);
7025 EXPORT_SYMBOL(spa_import_rootpool);
7026 EXPORT_SYMBOL(spa_import);
7027 EXPORT_SYMBOL(spa_tryimport);
7028 EXPORT_SYMBOL(spa_destroy);
7029 EXPORT_SYMBOL(spa_export);
7030 EXPORT_SYMBOL(spa_reset);
7031 EXPORT_SYMBOL(spa_async_request);
7032 EXPORT_SYMBOL(spa_async_suspend);
7033 EXPORT_SYMBOL(spa_async_resume);
7034 EXPORT_SYMBOL(spa_inject_addref);
7035 EXPORT_SYMBOL(spa_inject_delref);
7036 EXPORT_SYMBOL(spa_scan_stat_init);
7037 EXPORT_SYMBOL(spa_scan_get_stats);
7038
7039 /* device manipulation */
7040 EXPORT_SYMBOL(spa_vdev_add);
7041 EXPORT_SYMBOL(spa_vdev_attach);
7042 EXPORT_SYMBOL(spa_vdev_detach);
7043 EXPORT_SYMBOL(spa_vdev_remove);
7044 EXPORT_SYMBOL(spa_vdev_setpath);
7045 EXPORT_SYMBOL(spa_vdev_setfru);
7046 EXPORT_SYMBOL(spa_vdev_split_mirror);
7047
7048 /* spare state (which is global across all pools) */
7049 EXPORT_SYMBOL(spa_spare_add);
7050 EXPORT_SYMBOL(spa_spare_remove);
7051 EXPORT_SYMBOL(spa_spare_exists);
7052 EXPORT_SYMBOL(spa_spare_activate);
7053
7054 /* L2ARC state (which is global across all pools) */
7055 EXPORT_SYMBOL(spa_l2cache_add);
7056 EXPORT_SYMBOL(spa_l2cache_remove);
7057 EXPORT_SYMBOL(spa_l2cache_exists);
7058 EXPORT_SYMBOL(spa_l2cache_activate);
7059 EXPORT_SYMBOL(spa_l2cache_drop);
7060
7061 /* scanning */
7062 EXPORT_SYMBOL(spa_scan);
7063 EXPORT_SYMBOL(spa_scan_stop);
7064
7065 /* spa syncing */
7066 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
7067 EXPORT_SYMBOL(spa_sync_allpools);
7068
7069 /* properties */
7070 EXPORT_SYMBOL(spa_prop_set);
7071 EXPORT_SYMBOL(spa_prop_get);
7072 EXPORT_SYMBOL(spa_prop_clear_bootfs);
7073
7074 /* asynchronous event notification */
7075 EXPORT_SYMBOL(spa_event_notify);
7076 #endif
7077
7078 #if defined(_KERNEL) && defined(HAVE_SPL)
7079 module_param(spa_load_verify_maxinflight, int, 0644);
7080 MODULE_PARM_DESC(spa_load_verify_maxinflight,
7081 "Max concurrent traversal I/Os while verifying pool during import -X");
7082
7083 module_param(spa_load_verify_metadata, int, 0644);
7084 MODULE_PARM_DESC(spa_load_verify_metadata,
7085 "Set to traverse metadata on pool import");
7086
7087 module_param(spa_load_verify_data, int, 0644);
7088 MODULE_PARM_DESC(spa_load_verify_data,
7089 "Set to traverse data on pool import");
7090
7091 module_param(zio_taskq_batch_pct, uint, 0444);
7092 MODULE_PARM_DESC(zio_taskq_batch_pct,
7093 "Percentage of CPUs to run an IO worker thread");
7094
7095 #endif