4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25 * Copyright 2015 RackTop Systems.
26 * Copyright (c) 2016, Intel Corporation.
30 * Pool import support functions.
32 * To import a pool, we rely on reading the configuration information from the
33 * ZFS label of each device. If we successfully read the label, then we
34 * organize the configuration information in the following hierarchy:
36 * pool guid -> toplevel vdev guid -> label txg
38 * Duplicate entries matching this same tuple will be discarded. Once we have
39 * examined every device, we pick the best label txg config for each toplevel
40 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
41 * update any paths that have changed. Finally, we attempt to import the pool
42 * using our derived config, and record the results.
61 #include <sys/dktp/fdisk.h>
62 #include <sys/efi_partition.h>
63 #include <sys/vdev_impl.h>
64 #include <blkid/blkid.h>
66 #include "libzfs_impl.h"
70 * Intermediate structures used to gather configuration information.
72 typedef struct config_entry
{
75 struct config_entry
*ce_next
;
78 typedef struct vdev_entry
{
80 config_entry_t
*ve_configs
;
81 struct vdev_entry
*ve_next
;
84 typedef struct pool_entry
{
86 vdev_entry_t
*pe_vdevs
;
87 struct pool_entry
*pe_next
;
90 typedef struct name_entry
{
94 uint64_t ne_num_labels
;
95 struct name_entry
*ne_next
;
98 typedef struct pool_list
{
103 #define DEV_BYID_PATH "/dev/disk/by-id/"
106 * Linux persistent device strings for vdev labels
108 * based on libudev for consistency with libudev disk add/remove events
112 typedef struct vdev_dev_strs
{
114 char vds_devphys
[128];
118 * Obtain the persistent device id string (describes what)
120 * used by ZED vdev matching for auto-{online,expand,replace}
123 zfs_device_get_devid(struct udev_device
*dev
, char *bufptr
, size_t buflen
)
125 struct udev_list_entry
*entry
;
127 char devbyid
[MAXPATHLEN
];
129 /* The bus based by-id path is preferred */
130 bus
= udev_device_get_property_value(dev
, "ID_BUS");
136 * For multipath nodes use the persistent uuid based identifier
138 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
140 dm_uuid
= udev_device_get_property_value(dev
, "DM_UUID");
141 if (dm_uuid
!= NULL
) {
142 (void) snprintf(bufptr
, buflen
, "dm-uuid-%s", dm_uuid
);
147 * NVME 'by-id' symlinks are similar to bus case
149 struct udev_device
*parent
;
151 parent
= udev_device_get_parent_with_subsystem_devtype(dev
,
154 bus
= "nvme"; /* continue with bus symlink search */
160 * locate the bus specific by-id link
162 (void) snprintf(devbyid
, sizeof (devbyid
), "%s%s-", DEV_BYID_PATH
, bus
);
163 entry
= udev_device_get_devlinks_list_entry(dev
);
164 while (entry
!= NULL
) {
167 name
= udev_list_entry_get_name(entry
);
168 if (strncmp(name
, devbyid
, strlen(devbyid
)) == 0) {
169 name
+= strlen(DEV_BYID_PATH
);
170 (void) strlcpy(bufptr
, name
, buflen
);
173 entry
= udev_list_entry_get_next(entry
);
180 * Obtain the persistent physical location string (describes where)
182 * used by ZED vdev matching for auto-{online,expand,replace}
185 zfs_device_get_physical(struct udev_device
*dev
, char *bufptr
, size_t buflen
)
187 const char *physpath
= NULL
;
190 * Normal disks use ID_PATH for their physical path. Device mapper
191 * devices are virtual and don't have a physical path. For them we
192 * use ID_VDEV instead, which is setup via the /etc/vdev_id.conf file.
193 * ID_VDEV provides a persistent path to a virtual device. If you
194 * don't have vdev_id.conf setup, you cannot use multipath autoreplace.
196 if (!((physpath
= udev_device_get_property_value(dev
, "ID_PATH")) &&
199 udev_device_get_property_value(dev
, "ID_VDEV")) &&
205 (void) strlcpy(bufptr
, physpath
, buflen
);
211 udev_is_mpath(struct udev_device
*dev
)
213 return udev_device_get_property_value(dev
, "DM_UUID") &&
214 udev_device_get_property_value(dev
, "MPATH_SBIN_PATH");
218 * A disk is considered a multipath whole disk when:
219 * DEVNAME key value has "dm-"
220 * DM_NAME key value has "mpath" prefix
222 * ID_PART_TABLE_TYPE key does not exist or is not gpt
225 udev_mpath_whole_disk(struct udev_device
*dev
)
227 const char *devname
, *type
, *uuid
;
229 devname
= udev_device_get_property_value(dev
, "DEVNAME");
230 type
= udev_device_get_property_value(dev
, "ID_PART_TABLE_TYPE");
231 uuid
= udev_device_get_property_value(dev
, "DM_UUID");
233 if ((devname
!= NULL
&& strncmp(devname
, "/dev/dm-", 8) == 0) &&
234 ((type
== NULL
) || (strcmp(type
, "gpt") != 0)) &&
243 * Check if a disk is effectively a multipath whole disk
246 is_mpath_whole_disk(const char *path
)
249 struct udev_device
*dev
= NULL
;
250 char nodepath
[MAXPATHLEN
];
252 boolean_t wholedisk
= B_FALSE
;
254 if (realpath(path
, nodepath
) == NULL
)
256 sysname
= strrchr(nodepath
, '/') + 1;
257 if (strncmp(sysname
, "dm-", 3) != 0)
259 if ((udev
= udev_new()) == NULL
)
261 if ((dev
= udev_device_new_from_subsystem_sysname(udev
, "block",
263 udev_device_unref(dev
);
267 wholedisk
= udev_mpath_whole_disk(dev
);
269 udev_device_unref(dev
);
274 udev_device_is_ready(struct udev_device
*dev
)
276 #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
277 return (udev_device_get_is_initialized(dev
));
279 /* wait for DEVLINKS property to be initialized */
280 return (udev_device_get_property_value(dev
, "DEVLINKS") != NULL
);
285 * Wait up to timeout_ms for udev to set up the device node. The device is
286 * considered ready when libudev determines it has been initialized, all of
287 * the device links have been verified to exist, and it has been allowed to
288 * settle. At this point the device can be accessed reliably.
289 * Depending on the complexity of the udev rules this process could take
293 zpool_label_disk_wait(char *path
, int timeout_ms
)
296 struct udev_device
*dev
= NULL
;
297 char nodepath
[MAXPATHLEN
];
298 char *sysname
= NULL
;
302 hrtime_t start
, settle
;
304 if ((udev
= udev_new()) == NULL
)
311 if (sysname
== NULL
) {
312 if (realpath(path
, nodepath
) != NULL
) {
313 sysname
= strrchr(nodepath
, '/') + 1;
315 (void) usleep(sleep_ms
* MILLISEC
);
320 dev
= udev_device_new_from_subsystem_sysname(udev
,
322 if ((dev
!= NULL
) && udev_device_is_ready(dev
)) {
323 struct udev_list_entry
*links
, *link
= NULL
;
326 links
= udev_device_get_devlinks_list_entry(dev
);
328 udev_list_entry_foreach(link
, links
) {
329 struct stat64 statbuf
;
332 name
= udev_list_entry_get_name(link
);
334 if (stat64(name
, &statbuf
) == 0 && errno
== 0)
344 settle
= gethrtime();
345 } else if (NSEC2MSEC(gethrtime() - settle
) >=
347 udev_device_unref(dev
);
353 udev_device_unref(dev
);
354 (void) usleep(sleep_ms
* MILLISEC
);
356 } while (NSEC2MSEC(gethrtime() - start
) < timeout_ms
);
365 * Encode the persistent devices strings
366 * used for the vdev disk label
369 encode_device_strings(const char *path
, vdev_dev_strs_t
*ds
,
373 struct udev_device
*dev
= NULL
;
374 char nodepath
[MAXPATHLEN
];
379 if ((udev
= udev_new()) == NULL
)
382 /* resolve path to a runtime device node instance */
383 if (realpath(path
, nodepath
) == NULL
)
386 sysname
= strrchr(nodepath
, '/') + 1;
389 * Wait up to 3 seconds for udev to set up the device node context
393 dev
= udev_device_new_from_subsystem_sysname(udev
, "block",
397 if (udev_device_is_ready(dev
))
398 break; /* udev ready */
400 udev_device_unref(dev
);
403 if (NSEC2MSEC(gethrtime() - start
) < 10)
404 (void) sched_yield(); /* yield/busy wait up to 10ms */
406 (void) usleep(10 * MILLISEC
);
408 } while (NSEC2MSEC(gethrtime() - start
) < (3 * MILLISEC
));
414 * Only whole disks require extra device strings
416 if (!wholedisk
&& !udev_mpath_whole_disk(dev
))
419 ret
= zfs_device_get_devid(dev
, ds
->vds_devid
, sizeof (ds
->vds_devid
));
423 /* physical location string (optional) */
424 if (zfs_device_get_physical(dev
, ds
->vds_devphys
,
425 sizeof (ds
->vds_devphys
)) != 0) {
426 ds
->vds_devphys
[0] = '\0'; /* empty string --> not available */
430 udev_device_unref(dev
);
438 * Update a leaf vdev's persistent device strings (Linux only)
440 * - only applies for a dedicated leaf vdev (aka whole disk)
441 * - updated during pool create|add|attach|import
442 * - used for device matching during auto-{online,expand,replace}
443 * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
444 * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
446 * single device node example:
447 * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
448 * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
450 * multipath device node example:
451 * devid: 'dm-uuid-mpath-35000c5006304de3f'
453 * We also store the enclosure sysfs path for turning on enclosure LEDs
455 * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
458 update_vdev_config_dev_strs(nvlist_t
*nv
)
461 char *env
, *type
, *path
;
462 uint64_t wholedisk
= 0;
466 * For the benefit of legacy ZFS implementations, allow
467 * for opting out of devid strings in the vdev label.
470 * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
473 * Older ZFS on Linux implementations had issues when attempting to
474 * display pool config VDEV names if a "devid" NVP value is present
475 * in the pool's config.
477 * For example, a pool that originated on illumos platform would
478 * have a devid value in the config and "zpool status" would fail
479 * when listing the config.
481 * A pool can be stripped of any "devid" values on import or
482 * prevented from adding them on zpool create|add by setting
483 * ZFS_VDEV_DEVID_OPT_OUT.
485 env
= getenv("ZFS_VDEV_DEVID_OPT_OUT");
486 if (env
&& (strtoul(env
, NULL
, 0) > 0 ||
487 !strncasecmp(env
, "YES", 3) || !strncasecmp(env
, "ON", 2))) {
488 (void) nvlist_remove_all(nv
, ZPOOL_CONFIG_DEVID
);
489 (void) nvlist_remove_all(nv
, ZPOOL_CONFIG_PHYS_PATH
);
493 if (nvlist_lookup_string(nv
, ZPOOL_CONFIG_TYPE
, &type
) != 0 ||
494 strcmp(type
, VDEV_TYPE_DISK
) != 0) {
497 if (nvlist_lookup_string(nv
, ZPOOL_CONFIG_PATH
, &path
) != 0)
499 (void) nvlist_lookup_uint64(nv
, ZPOOL_CONFIG_WHOLE_DISK
, &wholedisk
);
502 * Update device string values in config nvlist
504 if (encode_device_strings(path
, &vds
, (boolean_t
)wholedisk
) == 0) {
505 (void) nvlist_add_string(nv
, ZPOOL_CONFIG_DEVID
, vds
.vds_devid
);
506 if (vds
.vds_devphys
[0] != '\0') {
507 (void) nvlist_add_string(nv
, ZPOOL_CONFIG_PHYS_PATH
,
511 /* Add enclosure sysfs path (if disk is in an enclosure) */
512 upath
= zfs_get_underlying_path(path
);
513 spath
= zfs_get_enclosure_sysfs_path(upath
);
515 nvlist_add_string(nv
, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH
,
518 nvlist_remove_all(nv
, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH
);
523 /* clear out any stale entries */
524 (void) nvlist_remove_all(nv
, ZPOOL_CONFIG_DEVID
);
525 (void) nvlist_remove_all(nv
, ZPOOL_CONFIG_PHYS_PATH
);
526 (void) nvlist_remove_all(nv
, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH
);
532 is_mpath_whole_disk(const char *path
)
538 * Wait up to timeout_ms for udev to set up the device node. The device is
539 * considered ready when the provided path has been verified to exist and
540 * it has been allowed to settle. At this point the device can
541 * be accessed reliably. Depending on the complexity of the udev rules this
542 * process could take several seconds.
545 zpool_label_disk_wait(char *path
, int timeout_ms
)
549 hrtime_t start
, settle
;
550 struct stat64 statbuf
;
557 if ((stat64(path
, &statbuf
) == 0) && (errno
== 0)) {
559 settle
= gethrtime();
560 else if (NSEC2MSEC(gethrtime() - settle
) >= settle_ms
)
562 } else if (errno
!= ENOENT
) {
566 usleep(sleep_ms
* MILLISEC
);
567 } while (NSEC2MSEC(gethrtime() - start
) < timeout_ms
);
573 update_vdev_config_dev_strs(nvlist_t
*nv
)
577 #endif /* HAVE_LIBUDEV */
580 * Go through and fix up any path and/or devid information for the given vdev
584 fix_paths(nvlist_t
*nv
, name_entry_t
*names
)
589 name_entry_t
*ne
, *best
;
592 if (nvlist_lookup_nvlist_array(nv
, ZPOOL_CONFIG_CHILDREN
,
593 &child
, &children
) == 0) {
594 for (c
= 0; c
< children
; c
++)
595 if (fix_paths(child
[c
], names
) != 0)
601 * This is a leaf (file or disk) vdev. In either case, go through
602 * the name list and see if we find a matching guid. If so, replace
603 * the path and see if we can calculate a new devid.
605 * There may be multiple names associated with a particular guid, in
606 * which case we have overlapping partitions or multiple paths to the
607 * same disk. In this case we prefer to use the path name which
608 * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we
609 * use the lowest order device which corresponds to the first match
610 * while traversing the ZPOOL_IMPORT_PATH search path.
612 verify(nvlist_lookup_uint64(nv
, ZPOOL_CONFIG_GUID
, &guid
) == 0);
613 if (nvlist_lookup_string(nv
, ZPOOL_CONFIG_PATH
, &path
) != 0)
617 for (ne
= names
; ne
!= NULL
; ne
= ne
->ne_next
) {
618 if (ne
->ne_guid
== guid
) {
624 if ((strlen(path
) == strlen(ne
->ne_name
)) &&
625 strncmp(path
, ne
->ne_name
, strlen(path
)) == 0) {
635 /* Prefer paths with more vdev labels. */
636 if (ne
->ne_num_labels
> best
->ne_num_labels
) {
641 /* Prefer paths earlier in the search order. */
642 if (ne
->ne_num_labels
== best
->ne_num_labels
&&
643 ne
->ne_order
< best
->ne_order
) {
653 if (nvlist_add_string(nv
, ZPOOL_CONFIG_PATH
, best
->ne_name
) != 0)
656 /* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
657 update_vdev_config_dev_strs(nv
);
663 * Add the given configuration to the list of known devices.
666 add_config(libzfs_handle_t
*hdl
, pool_list_t
*pl
, const char *path
,
667 int order
, int num_labels
, nvlist_t
*config
)
669 uint64_t pool_guid
, vdev_guid
, top_guid
, txg
, state
;
676 * If this is a hot spare not currently in use or level 2 cache
677 * device, add it to the list of names to translate, but don't do
680 if (nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_STATE
,
682 (state
== POOL_STATE_SPARE
|| state
== POOL_STATE_L2CACHE
) &&
683 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
, &vdev_guid
) == 0) {
684 if ((ne
= zfs_alloc(hdl
, sizeof (name_entry_t
))) == NULL
) {
689 if ((ne
->ne_name
= zfs_strdup(hdl
, path
)) == NULL
) {
694 ne
->ne_guid
= vdev_guid
;
695 ne
->ne_order
= order
;
696 ne
->ne_num_labels
= num_labels
;
697 ne
->ne_next
= pl
->names
;
704 * If we have a valid config but cannot read any of these fields, then
705 * it means we have a half-initialized label. In vdev_label_init()
706 * we write a label with txg == 0 so that we can identify the device
707 * in case the user refers to the same disk later on. If we fail to
708 * create the pool, we'll be left with a label in this state
709 * which should not be considered part of a valid pool.
711 if (nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
713 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
,
715 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_TOP_GUID
,
717 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_TXG
,
718 &txg
) != 0 || txg
== 0) {
724 * First, see if we know about this pool. If not, then add it to the
725 * list of known pools.
727 for (pe
= pl
->pools
; pe
!= NULL
; pe
= pe
->pe_next
) {
728 if (pe
->pe_guid
== pool_guid
)
733 if ((pe
= zfs_alloc(hdl
, sizeof (pool_entry_t
))) == NULL
) {
737 pe
->pe_guid
= pool_guid
;
738 pe
->pe_next
= pl
->pools
;
743 * Second, see if we know about this toplevel vdev. Add it if its
746 for (ve
= pe
->pe_vdevs
; ve
!= NULL
; ve
= ve
->ve_next
) {
747 if (ve
->ve_guid
== top_guid
)
752 if ((ve
= zfs_alloc(hdl
, sizeof (vdev_entry_t
))) == NULL
) {
756 ve
->ve_guid
= top_guid
;
757 ve
->ve_next
= pe
->pe_vdevs
;
762 * Third, see if we have a config with a matching transaction group. If
763 * so, then we do nothing. Otherwise, add it to the list of known
766 for (ce
= ve
->ve_configs
; ce
!= NULL
; ce
= ce
->ce_next
) {
767 if (ce
->ce_txg
== txg
)
772 if ((ce
= zfs_alloc(hdl
, sizeof (config_entry_t
))) == NULL
) {
777 ce
->ce_config
= config
;
778 ce
->ce_next
= ve
->ve_configs
;
785 * At this point we've successfully added our config to the list of
786 * known configs. The last thing to do is add the vdev guid -> path
787 * mappings so that we can fix up the configuration as necessary before
790 if ((ne
= zfs_alloc(hdl
, sizeof (name_entry_t
))) == NULL
)
793 if ((ne
->ne_name
= zfs_strdup(hdl
, path
)) == NULL
) {
798 ne
->ne_guid
= vdev_guid
;
799 ne
->ne_order
= order
;
800 ne
->ne_num_labels
= num_labels
;
801 ne
->ne_next
= pl
->names
;
808 * Returns true if the named pool matches the given GUID.
811 pool_active(libzfs_handle_t
*hdl
, const char *name
, uint64_t guid
,
817 if (zpool_open_silent(hdl
, name
, &zhp
) != 0)
825 verify(nvlist_lookup_uint64(zhp
->zpool_config
, ZPOOL_CONFIG_POOL_GUID
,
830 *isactive
= (theguid
== guid
);
835 refresh_config(libzfs_handle_t
*hdl
, nvlist_t
*config
)
838 zfs_cmd_t zc
= {"\0"};
839 int err
, dstbuf_size
;
841 if (zcmd_write_conf_nvlist(hdl
, &zc
, config
) != 0)
844 dstbuf_size
= MAX(CONFIG_BUF_MINSIZE
, zc
.zc_nvlist_conf_size
* 4);
846 if (zcmd_alloc_dst_nvlist(hdl
, &zc
, dstbuf_size
) != 0) {
847 zcmd_free_nvlists(&zc
);
851 while ((err
= ioctl(hdl
->libzfs_fd
, ZFS_IOC_POOL_TRYIMPORT
,
852 &zc
)) != 0 && errno
== ENOMEM
) {
853 if (zcmd_expand_dst_nvlist(hdl
, &zc
) != 0) {
854 zcmd_free_nvlists(&zc
);
860 zcmd_free_nvlists(&zc
);
864 if (zcmd_read_dst_nvlist(hdl
, &zc
, &nvl
) != 0) {
865 zcmd_free_nvlists(&zc
);
869 zcmd_free_nvlists(&zc
);
874 * Determine if the vdev id is a hole in the namespace.
877 vdev_is_hole(uint64_t *hole_array
, uint_t holes
, uint_t id
)
881 for (c
= 0; c
< holes
; c
++) {
883 /* Top-level is a hole */
884 if (hole_array
[c
] == id
)
891 * Convert our list of pools into the definitive set of configurations. We
892 * start by picking the best config for each toplevel vdev. Once that's done,
893 * we assemble the toplevel vdevs into a full config for the pool. We make a
894 * pass to fix up any incorrect paths, and then add it to the main list to
895 * return to the user.
898 get_configs(libzfs_handle_t
*hdl
, pool_list_t
*pl
, boolean_t active_ok
)
903 nvlist_t
*ret
= NULL
, *config
= NULL
, *tmp
= NULL
, *nvtop
, *nvroot
;
904 nvlist_t
**spares
, **l2cache
;
905 uint_t i
, nspares
, nl2cache
;
906 boolean_t config_seen
;
908 char *name
, *hostname
= NULL
;
911 nvlist_t
**child
= NULL
;
913 uint64_t *hole_array
, max_id
;
918 boolean_t valid_top_config
= B_FALSE
;
920 if (nvlist_alloc(&ret
, 0, 0) != 0)
923 for (pe
= pl
->pools
; pe
!= NULL
; pe
= pe
->pe_next
) {
924 uint64_t id
, max_txg
= 0;
926 if (nvlist_alloc(&config
, NV_UNIQUE_NAME
, 0) != 0)
928 config_seen
= B_FALSE
;
931 * Iterate over all toplevel vdevs. Grab the pool configuration
932 * from the first one we find, and then go through the rest and
933 * add them as necessary to the 'vdevs' member of the config.
935 for (ve
= pe
->pe_vdevs
; ve
!= NULL
; ve
= ve
->ve_next
) {
938 * Determine the best configuration for this vdev by
939 * selecting the config with the latest transaction
943 for (ce
= ve
->ve_configs
; ce
!= NULL
;
946 if (ce
->ce_txg
> best_txg
) {
948 best_txg
= ce
->ce_txg
;
953 * We rely on the fact that the max txg for the
954 * pool will contain the most up-to-date information
955 * about the valid top-levels in the vdev namespace.
957 if (best_txg
> max_txg
) {
958 (void) nvlist_remove(config
,
959 ZPOOL_CONFIG_VDEV_CHILDREN
,
961 (void) nvlist_remove(config
,
962 ZPOOL_CONFIG_HOLE_ARRAY
,
963 DATA_TYPE_UINT64_ARRAY
);
969 valid_top_config
= B_FALSE
;
971 if (nvlist_lookup_uint64(tmp
,
972 ZPOOL_CONFIG_VDEV_CHILDREN
, &max_id
) == 0) {
973 verify(nvlist_add_uint64(config
,
974 ZPOOL_CONFIG_VDEV_CHILDREN
,
976 valid_top_config
= B_TRUE
;
979 if (nvlist_lookup_uint64_array(tmp
,
980 ZPOOL_CONFIG_HOLE_ARRAY
, &hole_array
,
982 verify(nvlist_add_uint64_array(config
,
983 ZPOOL_CONFIG_HOLE_ARRAY
,
984 hole_array
, holes
) == 0);
990 * Copy the relevant pieces of data to the pool
996 * comment (if available)
998 * hostid (if available)
999 * hostname (if available)
1001 uint64_t state
, version
;
1002 char *comment
= NULL
;
1004 version
= fnvlist_lookup_uint64(tmp
,
1005 ZPOOL_CONFIG_VERSION
);
1006 fnvlist_add_uint64(config
,
1007 ZPOOL_CONFIG_VERSION
, version
);
1008 guid
= fnvlist_lookup_uint64(tmp
,
1009 ZPOOL_CONFIG_POOL_GUID
);
1010 fnvlist_add_uint64(config
,
1011 ZPOOL_CONFIG_POOL_GUID
, guid
);
1012 name
= fnvlist_lookup_string(tmp
,
1013 ZPOOL_CONFIG_POOL_NAME
);
1014 fnvlist_add_string(config
,
1015 ZPOOL_CONFIG_POOL_NAME
, name
);
1017 if (nvlist_lookup_string(tmp
,
1018 ZPOOL_CONFIG_COMMENT
, &comment
) == 0)
1019 fnvlist_add_string(config
,
1020 ZPOOL_CONFIG_COMMENT
, comment
);
1022 state
= fnvlist_lookup_uint64(tmp
,
1023 ZPOOL_CONFIG_POOL_STATE
);
1024 fnvlist_add_uint64(config
,
1025 ZPOOL_CONFIG_POOL_STATE
, state
);
1028 if (nvlist_lookup_uint64(tmp
,
1029 ZPOOL_CONFIG_HOSTID
, &hostid
) == 0) {
1030 fnvlist_add_uint64(config
,
1031 ZPOOL_CONFIG_HOSTID
, hostid
);
1032 hostname
= fnvlist_lookup_string(tmp
,
1033 ZPOOL_CONFIG_HOSTNAME
);
1034 fnvlist_add_string(config
,
1035 ZPOOL_CONFIG_HOSTNAME
, hostname
);
1038 config_seen
= B_TRUE
;
1042 * Add this top-level vdev to the child array.
1044 verify(nvlist_lookup_nvlist(tmp
,
1045 ZPOOL_CONFIG_VDEV_TREE
, &nvtop
) == 0);
1046 verify(nvlist_lookup_uint64(nvtop
, ZPOOL_CONFIG_ID
,
1049 if (id
>= children
) {
1050 nvlist_t
**newchild
;
1052 newchild
= zfs_alloc(hdl
, (id
+ 1) *
1053 sizeof (nvlist_t
*));
1054 if (newchild
== NULL
)
1057 for (c
= 0; c
< children
; c
++)
1058 newchild
[c
] = child
[c
];
1064 if (nvlist_dup(nvtop
, &child
[id
], 0) != 0)
1070 * If we have information about all the top-levels then
1071 * clean up the nvlist which we've constructed. This
1072 * means removing any extraneous devices that are
1073 * beyond the valid range or adding devices to the end
1074 * of our array which appear to be missing.
1076 if (valid_top_config
) {
1077 if (max_id
< children
) {
1078 for (c
= max_id
; c
< children
; c
++)
1079 nvlist_free(child
[c
]);
1081 } else if (max_id
> children
) {
1082 nvlist_t
**newchild
;
1084 newchild
= zfs_alloc(hdl
, (max_id
) *
1085 sizeof (nvlist_t
*));
1086 if (newchild
== NULL
)
1089 for (c
= 0; c
< children
; c
++)
1090 newchild
[c
] = child
[c
];
1098 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
1102 * The vdev namespace may contain holes as a result of
1103 * device removal. We must add them back into the vdev
1104 * tree before we process any missing devices.
1107 ASSERT(valid_top_config
);
1109 for (c
= 0; c
< children
; c
++) {
1112 if (child
[c
] != NULL
||
1113 !vdev_is_hole(hole_array
, holes
, c
))
1116 if (nvlist_alloc(&holey
, NV_UNIQUE_NAME
,
1121 * Holes in the namespace are treated as
1122 * "hole" top-level vdevs and have a
1123 * special flag set on them.
1125 if (nvlist_add_string(holey
,
1127 VDEV_TYPE_HOLE
) != 0 ||
1128 nvlist_add_uint64(holey
,
1129 ZPOOL_CONFIG_ID
, c
) != 0 ||
1130 nvlist_add_uint64(holey
,
1131 ZPOOL_CONFIG_GUID
, 0ULL) != 0) {
1140 * Look for any missing top-level vdevs. If this is the case,
1141 * create a faked up 'missing' vdev as a placeholder. We cannot
1142 * simply compress the child array, because the kernel performs
1143 * certain checks to make sure the vdev IDs match their location
1144 * in the configuration.
1146 for (c
= 0; c
< children
; c
++) {
1147 if (child
[c
] == NULL
) {
1149 if (nvlist_alloc(&missing
, NV_UNIQUE_NAME
,
1152 if (nvlist_add_string(missing
,
1154 VDEV_TYPE_MISSING
) != 0 ||
1155 nvlist_add_uint64(missing
,
1156 ZPOOL_CONFIG_ID
, c
) != 0 ||
1157 nvlist_add_uint64(missing
,
1158 ZPOOL_CONFIG_GUID
, 0ULL) != 0) {
1159 nvlist_free(missing
);
1167 * Put all of this pool's top-level vdevs into a root vdev.
1169 if (nvlist_alloc(&nvroot
, NV_UNIQUE_NAME
, 0) != 0)
1171 if (nvlist_add_string(nvroot
, ZPOOL_CONFIG_TYPE
,
1172 VDEV_TYPE_ROOT
) != 0 ||
1173 nvlist_add_uint64(nvroot
, ZPOOL_CONFIG_ID
, 0ULL) != 0 ||
1174 nvlist_add_uint64(nvroot
, ZPOOL_CONFIG_GUID
, guid
) != 0 ||
1175 nvlist_add_nvlist_array(nvroot
, ZPOOL_CONFIG_CHILDREN
,
1176 child
, children
) != 0) {
1177 nvlist_free(nvroot
);
1181 for (c
= 0; c
< children
; c
++)
1182 nvlist_free(child
[c
]);
1188 * Go through and fix up any paths and/or devids based on our
1189 * known list of vdev GUID -> path mappings.
1191 if (fix_paths(nvroot
, pl
->names
) != 0) {
1192 nvlist_free(nvroot
);
1197 * Add the root vdev to this pool's configuration.
1199 if (nvlist_add_nvlist(config
, ZPOOL_CONFIG_VDEV_TREE
,
1201 nvlist_free(nvroot
);
1204 nvlist_free(nvroot
);
1207 * zdb uses this path to report on active pools that were
1208 * imported or created using -R.
1214 * Determine if this pool is currently active, in which case we
1215 * can't actually import it.
1217 verify(nvlist_lookup_string(config
, ZPOOL_CONFIG_POOL_NAME
,
1219 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
1222 if (pool_active(hdl
, name
, guid
, &isactive
) != 0)
1226 nvlist_free(config
);
1231 if ((nvl
= refresh_config(hdl
, config
)) == NULL
) {
1232 nvlist_free(config
);
1237 nvlist_free(config
);
1241 * Go through and update the paths for spares, now that we have
1244 verify(nvlist_lookup_nvlist(config
, ZPOOL_CONFIG_VDEV_TREE
,
1246 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_SPARES
,
1247 &spares
, &nspares
) == 0) {
1248 for (i
= 0; i
< nspares
; i
++) {
1249 if (fix_paths(spares
[i
], pl
->names
) != 0)
1255 * Update the paths for l2cache devices.
1257 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_L2CACHE
,
1258 &l2cache
, &nl2cache
) == 0) {
1259 for (i
= 0; i
< nl2cache
; i
++) {
1260 if (fix_paths(l2cache
[i
], pl
->names
) != 0)
1266 * Restore the original information read from the actual label.
1268 (void) nvlist_remove(config
, ZPOOL_CONFIG_HOSTID
,
1270 (void) nvlist_remove(config
, ZPOOL_CONFIG_HOSTNAME
,
1273 verify(nvlist_add_uint64(config
, ZPOOL_CONFIG_HOSTID
,
1275 verify(nvlist_add_string(config
, ZPOOL_CONFIG_HOSTNAME
,
1281 * Add this pool to the list of configs.
1283 verify(nvlist_lookup_string(config
, ZPOOL_CONFIG_POOL_NAME
,
1285 if (nvlist_add_nvlist(ret
, name
, config
) != 0)
1288 nvlist_free(config
);
1295 (void) no_memory(hdl
);
1297 nvlist_free(config
);
1299 for (c
= 0; c
< children
; c
++)
1300 nvlist_free(child
[c
]);
1307 * Return the offset of the given label.
1310 label_offset(uint64_t size
, int l
)
1312 ASSERT(P2PHASE_TYPED(size
, sizeof (vdev_label_t
), uint64_t) == 0);
1313 return (l
* sizeof (vdev_label_t
) + (l
< VDEV_LABELS
/ 2 ?
1314 0 : size
- VDEV_LABELS
* sizeof (vdev_label_t
)));
1318 * Given a file descriptor, read the label information and return an nvlist
1319 * describing the configuration, if there is one. The number of valid
1320 * labels found will be returned in num_labels when non-NULL.
1323 zpool_read_label(int fd
, nvlist_t
**config
, int *num_labels
)
1325 struct stat64 statbuf
;
1327 vdev_label_t
*label
;
1328 nvlist_t
*expected_config
= NULL
;
1329 uint64_t expected_guid
= 0, size
;
1334 if (fstat64_blk(fd
, &statbuf
) == -1)
1336 size
= P2ALIGN_TYPED(statbuf
.st_size
, sizeof (vdev_label_t
), uint64_t);
1338 error
= posix_memalign((void **)&label
, PAGESIZE
, sizeof (*label
));
1342 for (l
= 0; l
< VDEV_LABELS
; l
++) {
1343 uint64_t state
, guid
, txg
;
1345 if (pread64(fd
, label
, sizeof (vdev_label_t
),
1346 label_offset(size
, l
)) != sizeof (vdev_label_t
))
1349 if (nvlist_unpack(label
->vl_vdev_phys
.vp_nvlist
,
1350 sizeof (label
->vl_vdev_phys
.vp_nvlist
), config
, 0) != 0)
1353 if (nvlist_lookup_uint64(*config
, ZPOOL_CONFIG_GUID
,
1354 &guid
) != 0 || guid
== 0) {
1355 nvlist_free(*config
);
1359 if (nvlist_lookup_uint64(*config
, ZPOOL_CONFIG_POOL_STATE
,
1360 &state
) != 0 || state
> POOL_STATE_L2CACHE
) {
1361 nvlist_free(*config
);
1365 if (state
!= POOL_STATE_SPARE
&& state
!= POOL_STATE_L2CACHE
&&
1366 (nvlist_lookup_uint64(*config
, ZPOOL_CONFIG_POOL_TXG
,
1367 &txg
) != 0 || txg
== 0)) {
1368 nvlist_free(*config
);
1372 if (expected_guid
) {
1373 if (expected_guid
== guid
)
1376 nvlist_free(*config
);
1378 expected_config
= *config
;
1379 expected_guid
= guid
;
1384 if (num_labels
!= NULL
)
1385 *num_labels
= count
;
1388 *config
= expected_config
;
1393 typedef struct rdsk_node
{
1394 char *rn_name
; /* Full path to device */
1395 int rn_order
; /* Preferred order (low to high) */
1396 int rn_num_labels
; /* Number of valid labels */
1397 uint64_t rn_vdev_guid
; /* Expected vdev guid when set */
1398 libzfs_handle_t
*rn_hdl
;
1399 nvlist_t
*rn_config
; /* Label config */
1403 boolean_t rn_labelpaths
;
1407 * Sorted by vdev guid and full path to allow for multiple entries with
1408 * the same full path name. This is required because it's possible to
1409 * have multiple block devices with labels that refer to the same
1410 * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
1411 * entries need to be added to the cache. Scenarios where this can occur
1412 * include overwritten pool labels, devices which are visible from multiple
1413 * hosts and multipath devices.
1416 slice_cache_compare(const void *arg1
, const void *arg2
)
1418 const char *nm1
= ((rdsk_node_t
*)arg1
)->rn_name
;
1419 const char *nm2
= ((rdsk_node_t
*)arg2
)->rn_name
;
1420 uint64_t guid1
= ((rdsk_node_t
*)arg1
)->rn_vdev_guid
;
1421 uint64_t guid2
= ((rdsk_node_t
*)arg2
)->rn_vdev_guid
;
1424 rv
= AVL_CMP(guid1
, guid2
);
1428 return (AVL_ISIGN(strcmp(nm1
, nm2
)));
1432 is_watchdog_dev(char *dev
)
1434 /* For 'watchdog' dev */
1435 if (strcmp(dev
, "watchdog") == 0)
1438 /* For 'watchdog<digit><whatever>' */
1439 if (strstr(dev
, "watchdog") == dev
&& isdigit(dev
[8]))
1446 label_paths_impl(libzfs_handle_t
*hdl
, nvlist_t
*nvroot
, uint64_t pool_guid
,
1447 uint64_t vdev_guid
, char **path
, char **devid
)
1455 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_CHILDREN
,
1456 &child
, &children
) == 0) {
1457 for (c
= 0; c
< children
; c
++) {
1458 error
= label_paths_impl(hdl
, child
[c
],
1459 pool_guid
, vdev_guid
, path
, devid
);
1469 error
= nvlist_lookup_uint64(nvroot
, ZPOOL_CONFIG_GUID
, &guid
);
1470 if ((error
!= 0) || (guid
!= vdev_guid
))
1473 error
= nvlist_lookup_string(nvroot
, ZPOOL_CONFIG_PATH
, &val
);
1477 error
= nvlist_lookup_string(nvroot
, ZPOOL_CONFIG_DEVID
, &val
);
1485 * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
1486 * and store these strings as config_path and devid_path respectively.
1487 * The returned pointers are only valid as long as label remains valid.
1490 label_paths(libzfs_handle_t
*hdl
, nvlist_t
*label
, char **path
, char **devid
)
1499 if (nvlist_lookup_nvlist(label
, ZPOOL_CONFIG_VDEV_TREE
, &nvroot
) ||
1500 nvlist_lookup_uint64(label
, ZPOOL_CONFIG_POOL_GUID
, &pool_guid
) ||
1501 nvlist_lookup_uint64(label
, ZPOOL_CONFIG_GUID
, &vdev_guid
))
1504 return (label_paths_impl(hdl
, nvroot
, pool_guid
, vdev_guid
, path
,
1509 zpool_open_func(void *arg
)
1511 rdsk_node_t
*rn
= arg
;
1512 libzfs_handle_t
*hdl
= rn
->rn_hdl
;
1513 struct stat64 statbuf
;
1515 char *bname
, *dupname
;
1516 uint64_t vdev_guid
= 0;
1522 * Skip devices with well known prefixes there can be side effects
1523 * when opening devices which need to be avoided.
1525 * hpet - High Precision Event Timer
1526 * watchdog - Watchdog must be closed in a special way.
1528 dupname
= zfs_strdup(hdl
, rn
->rn_name
);
1529 bname
= basename(dupname
);
1530 error
= ((strcmp(bname
, "hpet") == 0) || is_watchdog_dev(bname
));
1536 * Ignore failed stats. We only want regular files and block devices.
1538 if (stat64(rn
->rn_name
, &statbuf
) != 0 ||
1539 (!S_ISREG(statbuf
.st_mode
) && !S_ISBLK(statbuf
.st_mode
)))
1543 * Preferentially open using O_DIRECT to bypass the block device
1544 * cache which may be stale for multipath devices. An EINVAL errno
1545 * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
1547 fd
= open(rn
->rn_name
, O_RDONLY
| O_DIRECT
);
1548 if ((fd
< 0) && (errno
== EINVAL
))
1549 fd
= open(rn
->rn_name
, O_RDONLY
);
1555 * This file is too small to hold a zpool
1557 if (S_ISREG(statbuf
.st_mode
) && statbuf
.st_size
< SPA_MINDEVSIZE
) {
1562 error
= zpool_read_label(fd
, &config
, &num_labels
);
1568 if (num_labels
== 0) {
1570 nvlist_free(config
);
1575 * Check that the vdev is for the expected guid. Additional entries
1576 * are speculatively added based on the paths stored in the labels.
1577 * Entries with valid paths but incorrect guids must be removed.
1579 error
= nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
, &vdev_guid
);
1580 if (error
|| (rn
->rn_vdev_guid
&& rn
->rn_vdev_guid
!= vdev_guid
)) {
1582 nvlist_free(config
);
1588 rn
->rn_config
= config
;
1589 rn
->rn_num_labels
= num_labels
;
1592 * Add additional entries for paths described by this label.
1594 if (rn
->rn_labelpaths
) {
1601 if (label_paths(rn
->rn_hdl
, rn
->rn_config
, &path
, &devid
))
1605 * Allow devlinks to stabilize so all paths are available.
1607 zpool_label_disk_wait(rn
->rn_name
, DISK_LABEL_WAIT
);
1610 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1611 slice
->rn_name
= zfs_strdup(hdl
, path
);
1612 slice
->rn_vdev_guid
= vdev_guid
;
1613 slice
->rn_avl
= rn
->rn_avl
;
1614 slice
->rn_hdl
= hdl
;
1615 slice
->rn_order
= IMPORT_ORDER_PREFERRED_1
;
1616 slice
->rn_labelpaths
= B_FALSE
;
1617 mutex_enter(rn
->rn_lock
);
1618 if (avl_find(rn
->rn_avl
, slice
, &where
)) {
1619 mutex_exit(rn
->rn_lock
);
1620 free(slice
->rn_name
);
1623 avl_insert(rn
->rn_avl
, slice
, where
);
1624 mutex_exit(rn
->rn_lock
);
1625 zpool_open_func(slice
);
1629 if (devid
!= NULL
) {
1630 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1631 error
= asprintf(&slice
->rn_name
, "%s%s",
1632 DEV_BYID_PATH
, devid
);
1638 slice
->rn_vdev_guid
= vdev_guid
;
1639 slice
->rn_avl
= rn
->rn_avl
;
1640 slice
->rn_hdl
= hdl
;
1641 slice
->rn_order
= IMPORT_ORDER_PREFERRED_2
;
1642 slice
->rn_labelpaths
= B_FALSE
;
1643 mutex_enter(rn
->rn_lock
);
1644 if (avl_find(rn
->rn_avl
, slice
, &where
)) {
1645 mutex_exit(rn
->rn_lock
);
1646 free(slice
->rn_name
);
1649 avl_insert(rn
->rn_avl
, slice
, where
);
1650 mutex_exit(rn
->rn_lock
);
1651 zpool_open_func(slice
);
1658 * Given a file descriptor, clear (zero) the label information. This function
1659 * is used in the appliance stack as part of the ZFS sysevent module and
1660 * to implement the "zpool labelclear" command.
1663 zpool_clear_label(int fd
)
1665 struct stat64 statbuf
;
1667 vdev_label_t
*label
;
1670 if (fstat64_blk(fd
, &statbuf
) == -1)
1672 size
= P2ALIGN_TYPED(statbuf
.st_size
, sizeof (vdev_label_t
), uint64_t);
1674 if ((label
= calloc(sizeof (vdev_label_t
), 1)) == NULL
)
1677 for (l
= 0; l
< VDEV_LABELS
; l
++) {
1678 if (pwrite64(fd
, label
, sizeof (vdev_label_t
),
1679 label_offset(size
, l
)) != sizeof (vdev_label_t
)) {
1690 * Scan a list of directories for zfs devices.
1693 zpool_find_import_scan(libzfs_handle_t
*hdl
, kmutex_t
*lock
,
1694 avl_tree_t
**slice_cache
, char **dir
, int dirs
)
1701 *slice_cache
= NULL
;
1702 cache
= zfs_alloc(hdl
, sizeof (avl_tree_t
));
1703 avl_create(cache
, slice_cache_compare
, sizeof (rdsk_node_t
),
1704 offsetof(rdsk_node_t
, rn_node
));
1706 for (i
= 0; i
< dirs
; i
++) {
1707 char path
[MAXPATHLEN
];
1708 struct dirent64
*dp
;
1711 if (realpath(dir
[i
], path
) == NULL
) {
1713 if (error
== ENOENT
)
1716 zfs_error_aux(hdl
, strerror(error
));
1717 (void) zfs_error_fmt(hdl
, EZFS_BADPATH
, dgettext(
1718 TEXT_DOMAIN
, "cannot resolve path '%s'"), dir
[i
]);
1722 dirp
= opendir(path
);
1725 zfs_error_aux(hdl
, strerror(error
));
1726 (void) zfs_error_fmt(hdl
, EZFS_BADPATH
,
1727 dgettext(TEXT_DOMAIN
, "cannot open '%s'"), path
);
1731 while ((dp
= readdir64(dirp
)) != NULL
) {
1732 const char *name
= dp
->d_name
;
1733 if (name
[0] == '.' &&
1734 (name
[1] == 0 || (name
[1] == '.' && name
[2] == 0)))
1737 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1738 error
= asprintf(&slice
->rn_name
, "%s/%s", path
, name
);
1743 slice
->rn_vdev_guid
= 0;
1744 slice
->rn_lock
= lock
;
1745 slice
->rn_avl
= cache
;
1746 slice
->rn_hdl
= hdl
;
1747 slice
->rn_order
= i
+ IMPORT_ORDER_SCAN_OFFSET
;
1748 slice
->rn_labelpaths
= B_FALSE
;
1750 avl_add(cache
, slice
);
1754 (void) closedir(dirp
);
1757 *slice_cache
= cache
;
1762 while ((slice
= avl_destroy_nodes(cache
, &cookie
)) != NULL
) {
1763 free(slice
->rn_name
);
1772 * Use libblkid to quickly enumerate all known zfs devices.
1775 zpool_find_import_blkid(libzfs_handle_t
*hdl
, kmutex_t
*lock
,
1776 avl_tree_t
**slice_cache
)
1780 blkid_dev_iterate iter
;
1785 *slice_cache
= NULL
;
1787 error
= blkid_get_cache(&cache
, NULL
);
1791 error
= blkid_probe_all_new(cache
);
1793 blkid_put_cache(cache
);
1797 iter
= blkid_dev_iterate_begin(cache
);
1799 blkid_put_cache(cache
);
1803 error
= blkid_dev_set_search(iter
, "TYPE", "zfs_member");
1805 blkid_dev_iterate_end(iter
);
1806 blkid_put_cache(cache
);
1810 *slice_cache
= zfs_alloc(hdl
, sizeof (avl_tree_t
));
1811 avl_create(*slice_cache
, slice_cache_compare
, sizeof (rdsk_node_t
),
1812 offsetof(rdsk_node_t
, rn_node
));
1814 while (blkid_dev_next(iter
, &dev
) == 0) {
1815 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1816 slice
->rn_name
= zfs_strdup(hdl
, blkid_dev_devname(dev
));
1817 slice
->rn_vdev_guid
= 0;
1818 slice
->rn_lock
= lock
;
1819 slice
->rn_avl
= *slice_cache
;
1820 slice
->rn_hdl
= hdl
;
1821 slice
->rn_labelpaths
= B_TRUE
;
1823 error
= zfs_path_order(slice
->rn_name
, &slice
->rn_order
);
1825 slice
->rn_order
+= IMPORT_ORDER_SCAN_OFFSET
;
1827 slice
->rn_order
= IMPORT_ORDER_DEFAULT
;
1830 if (avl_find(*slice_cache
, slice
, &where
)) {
1831 free(slice
->rn_name
);
1834 avl_insert(*slice_cache
, slice
, where
);
1839 blkid_dev_iterate_end(iter
);
1840 blkid_put_cache(cache
);
1846 zpool_default_import_path
[DEFAULT_IMPORT_PATH_SIZE
] = {
1847 "/dev/disk/by-vdev", /* Custom rules, use first if they exist */
1848 "/dev/mapper", /* Use multipath devices before components */
1849 "/dev/disk/by-partlabel", /* Single unique entry set by user */
1850 "/dev/disk/by-partuuid", /* Generated partition uuid */
1851 "/dev/disk/by-label", /* Custom persistent labels */
1852 "/dev/disk/by-uuid", /* Single unique entry and persistent */
1853 "/dev/disk/by-id", /* May be multiple entries and persistent */
1854 "/dev/disk/by-path", /* Encodes physical location and persistent */
1855 "/dev" /* UNSAFE device names will change */
1859 * Given a list of directories to search, find all pools stored on disk. This
1860 * includes partial pools which are not available to import. If no args are
1861 * given (argc is 0), then the default directory (/dev/dsk) is searched.
1862 * poolname or guid (but not both) are provided by the caller when trying
1863 * to import a specific pool.
1866 zpool_find_import_impl(libzfs_handle_t
*hdl
, importargs_t
*iarg
)
1868 nvlist_t
*ret
= NULL
;
1869 pool_list_t pools
= { 0 };
1870 pool_entry_t
*pe
, *penext
;
1871 vdev_entry_t
*ve
, *venext
;
1872 config_entry_t
*ce
, *cenext
;
1873 name_entry_t
*ne
, *nenext
;
1880 verify(iarg
->poolname
== NULL
|| iarg
->guid
== 0);
1881 mutex_init(&lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1884 * Locate pool member vdevs using libblkid or by directory scanning.
1885 * On success a newly allocated AVL tree which is populated with an
1886 * entry for each discovered vdev will be returned as the cache.
1887 * It's the callers responsibility to consume and destroy this tree.
1889 if (iarg
->scan
|| iarg
->paths
!= 0) {
1890 int dirs
= iarg
->paths
;
1891 char **dir
= iarg
->path
;
1894 dir
= zpool_default_import_path
;
1895 dirs
= DEFAULT_IMPORT_PATH_SIZE
;
1898 if (zpool_find_import_scan(hdl
, &lock
, &cache
, dir
, dirs
) != 0)
1901 if (zpool_find_import_blkid(hdl
, &lock
, &cache
) != 0)
1906 * Create a thread pool to parallelize the process of reading and
1907 * validating labels, a large number of threads can be used due to
1908 * minimal contention.
1910 t
= taskq_create("z_import", 2 * boot_ncpus
, defclsyspri
,
1911 2 * boot_ncpus
, INT_MAX
, TASKQ_PREPOPULATE
);
1913 for (slice
= avl_first(cache
); slice
;
1914 (slice
= avl_walk(cache
, slice
, AVL_AFTER
)))
1915 (void) taskq_dispatch(t
, zpool_open_func
, slice
, TQ_SLEEP
);
1921 * Process the cache filtering out any entries which are not
1922 * for the specificed pool then adding matching label configs.
1925 while ((slice
= avl_destroy_nodes(cache
, &cookie
)) != NULL
) {
1926 if (slice
->rn_config
!= NULL
) {
1927 nvlist_t
*config
= slice
->rn_config
;
1928 boolean_t matched
= B_TRUE
;
1929 boolean_t aux
= B_FALSE
;
1933 * Check if it's a spare or l2cache device. If it is,
1934 * we need to skip the name and guid check since they
1935 * don't exist on aux device label.
1937 if (iarg
->poolname
!= NULL
|| iarg
->guid
!= 0) {
1939 aux
= nvlist_lookup_uint64(config
,
1940 ZPOOL_CONFIG_POOL_STATE
, &state
) == 0 &&
1941 (state
== POOL_STATE_SPARE
||
1942 state
== POOL_STATE_L2CACHE
);
1945 if (iarg
->poolname
!= NULL
&& !aux
) {
1948 matched
= nvlist_lookup_string(config
,
1949 ZPOOL_CONFIG_POOL_NAME
, &pname
) == 0 &&
1950 strcmp(iarg
->poolname
, pname
) == 0;
1951 } else if (iarg
->guid
!= 0 && !aux
) {
1954 matched
= nvlist_lookup_uint64(config
,
1955 ZPOOL_CONFIG_POOL_GUID
, &this_guid
) == 0 &&
1956 iarg
->guid
== this_guid
;
1959 nvlist_free(config
);
1962 * Verify all remaining entries can be opened
1963 * exclusively. This will prune all underlying
1964 * multipath devices which otherwise could
1965 * result in the vdev appearing as UNAVAIL.
1967 * Under zdb, this step isn't required and
1968 * would prevent a zdb -e of active pools with
1971 fd
= open(slice
->rn_name
, O_RDONLY
| O_EXCL
);
1972 if (fd
>= 0 || iarg
->can_be_active
) {
1975 add_config(hdl
, &pools
,
1976 slice
->rn_name
, slice
->rn_order
,
1977 slice
->rn_num_labels
, config
);
1979 nvlist_free(config
);
1983 free(slice
->rn_name
);
1988 mutex_destroy(&lock
);
1990 ret
= get_configs(hdl
, &pools
, iarg
->can_be_active
);
1992 for (pe
= pools
.pools
; pe
!= NULL
; pe
= penext
) {
1993 penext
= pe
->pe_next
;
1994 for (ve
= pe
->pe_vdevs
; ve
!= NULL
; ve
= venext
) {
1995 venext
= ve
->ve_next
;
1996 for (ce
= ve
->ve_configs
; ce
!= NULL
; ce
= cenext
) {
1997 cenext
= ce
->ce_next
;
1998 nvlist_free(ce
->ce_config
);
2006 for (ne
= pools
.names
; ne
!= NULL
; ne
= nenext
) {
2007 nenext
= ne
->ne_next
;
2016 zpool_find_import(libzfs_handle_t
*hdl
, int argc
, char **argv
)
2018 importargs_t iarg
= { 0 };
2023 return (zpool_find_import_impl(hdl
, &iarg
));
2027 * Given a cache file, return the contents as a list of importable pools.
2028 * poolname or guid (but not both) are provided by the caller when trying
2029 * to import a specific pool.
2032 zpool_find_import_cached(libzfs_handle_t
*hdl
, const char *cachefile
,
2033 char *poolname
, uint64_t guid
)
2037 struct stat64 statbuf
;
2038 nvlist_t
*raw
, *src
, *dst
;
2045 verify(poolname
== NULL
|| guid
== 0);
2047 if ((fd
= open(cachefile
, O_RDONLY
)) < 0) {
2048 zfs_error_aux(hdl
, "%s", strerror(errno
));
2049 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2050 dgettext(TEXT_DOMAIN
, "failed to open cache file"));
2054 if (fstat64(fd
, &statbuf
) != 0) {
2055 zfs_error_aux(hdl
, "%s", strerror(errno
));
2057 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2058 dgettext(TEXT_DOMAIN
, "failed to get size of cache file"));
2062 if ((buf
= zfs_alloc(hdl
, statbuf
.st_size
)) == NULL
) {
2067 if (read(fd
, buf
, statbuf
.st_size
) != statbuf
.st_size
) {
2070 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2071 dgettext(TEXT_DOMAIN
,
2072 "failed to read cache file contents"));
2078 if (nvlist_unpack(buf
, statbuf
.st_size
, &raw
, 0) != 0) {
2080 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2081 dgettext(TEXT_DOMAIN
,
2082 "invalid or corrupt cache file contents"));
2089 * Go through and get the current state of the pools and refresh their
2092 if (nvlist_alloc(&pools
, 0, 0) != 0) {
2093 (void) no_memory(hdl
);
2099 while ((elem
= nvlist_next_nvpair(raw
, elem
)) != NULL
) {
2100 src
= fnvpair_value_nvlist(elem
);
2102 name
= fnvlist_lookup_string(src
, ZPOOL_CONFIG_POOL_NAME
);
2103 if (poolname
!= NULL
&& strcmp(poolname
, name
) != 0)
2106 this_guid
= fnvlist_lookup_uint64(src
, ZPOOL_CONFIG_POOL_GUID
);
2107 if (guid
!= 0 && guid
!= this_guid
)
2110 if (pool_active(hdl
, name
, this_guid
, &active
) != 0) {
2119 if ((dst
= refresh_config(hdl
, src
)) == NULL
) {
2125 if (nvlist_add_nvlist(pools
, nvpair_name(elem
), dst
) != 0) {
2126 (void) no_memory(hdl
);
2140 name_or_guid_exists(zpool_handle_t
*zhp
, void *data
)
2142 importargs_t
*import
= data
;
2145 if (import
->poolname
!= NULL
) {
2148 verify(nvlist_lookup_string(zhp
->zpool_config
,
2149 ZPOOL_CONFIG_POOL_NAME
, &pool_name
) == 0);
2150 if (strcmp(pool_name
, import
->poolname
) == 0)
2155 verify(nvlist_lookup_uint64(zhp
->zpool_config
,
2156 ZPOOL_CONFIG_POOL_GUID
, &pool_guid
) == 0);
2157 if (pool_guid
== import
->guid
)
2166 zpool_search_import(libzfs_handle_t
*hdl
, importargs_t
*import
)
2168 verify(import
->poolname
== NULL
|| import
->guid
== 0);
2171 import
->exists
= zpool_iter(hdl
, name_or_guid_exists
, import
);
2173 if (import
->cachefile
!= NULL
)
2174 return (zpool_find_import_cached(hdl
, import
->cachefile
,
2175 import
->poolname
, import
->guid
));
2177 return (zpool_find_import_impl(hdl
, import
));
2181 pool_match(nvlist_t
*cfg
, char *tgt
)
2183 uint64_t v
, guid
= strtoull(tgt
, NULL
, 0);
2187 if (nvlist_lookup_uint64(cfg
, ZPOOL_CONFIG_POOL_GUID
, &v
) == 0)
2190 if (nvlist_lookup_string(cfg
, ZPOOL_CONFIG_POOL_NAME
, &s
) == 0)
2191 return (strcmp(s
, tgt
) == 0);
2197 zpool_tryimport(libzfs_handle_t
*hdl
, char *target
, nvlist_t
**configp
,
2201 nvlist_t
*match
= NULL
;
2202 nvlist_t
*config
= NULL
;
2203 char *name
= NULL
, *sepp
= NULL
;
2206 char *targetdup
= strdup(target
);
2210 if ((sepp
= strpbrk(targetdup
, "/@")) != NULL
) {
2215 pools
= zpool_search_import(hdl
, args
);
2217 if (pools
!= NULL
) {
2218 nvpair_t
*elem
= NULL
;
2219 while ((elem
= nvlist_next_nvpair(pools
, elem
)) != NULL
) {
2220 VERIFY0(nvpair_value_nvlist(elem
, &config
));
2221 if (pool_match(config
, targetdup
)) {
2223 if (match
!= NULL
) {
2224 /* multiple matches found */
2228 name
= nvpair_name(elem
);
2235 (void) zfs_error_aux(hdl
, dgettext(TEXT_DOMAIN
,
2242 (void) zfs_error_aux(hdl
, dgettext(TEXT_DOMAIN
,
2243 "%d pools found, use pool GUID\n"), count
);
2255 find_guid(nvlist_t
*nv
, uint64_t guid
)
2261 verify(nvlist_lookup_uint64(nv
, ZPOOL_CONFIG_GUID
, &tmp
) == 0);
2265 if (nvlist_lookup_nvlist_array(nv
, ZPOOL_CONFIG_CHILDREN
,
2266 &child
, &children
) == 0) {
2267 for (c
= 0; c
< children
; c
++)
2268 if (find_guid(child
[c
], guid
))
2275 typedef struct aux_cbdata
{
2276 const char *cb_type
;
2278 zpool_handle_t
*cb_zhp
;
2282 find_aux(zpool_handle_t
*zhp
, void *data
)
2284 aux_cbdata_t
*cbp
= data
;
2290 verify(nvlist_lookup_nvlist(zhp
->zpool_config
, ZPOOL_CONFIG_VDEV_TREE
,
2293 if (nvlist_lookup_nvlist_array(nvroot
, cbp
->cb_type
,
2294 &list
, &count
) == 0) {
2295 for (i
= 0; i
< count
; i
++) {
2296 verify(nvlist_lookup_uint64(list
[i
],
2297 ZPOOL_CONFIG_GUID
, &guid
) == 0);
2298 if (guid
== cbp
->cb_guid
) {
2310 * Determines if the pool is in use. If so, it returns true and the state of
2311 * the pool as well as the name of the pool. Name string is allocated and
2312 * must be freed by the caller.
2315 zpool_in_use(libzfs_handle_t
*hdl
, int fd
, pool_state_t
*state
, char **namestr
,
2321 uint64_t guid
, vdev_guid
;
2322 zpool_handle_t
*zhp
;
2323 nvlist_t
*pool_config
;
2324 uint64_t stateval
, isspare
;
2325 aux_cbdata_t cb
= { 0 };
2330 if (zpool_read_label(fd
, &config
, NULL
) != 0) {
2331 (void) no_memory(hdl
);
2338 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_STATE
,
2340 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
,
2343 if (stateval
!= POOL_STATE_SPARE
&& stateval
!= POOL_STATE_L2CACHE
) {
2344 verify(nvlist_lookup_string(config
, ZPOOL_CONFIG_POOL_NAME
,
2346 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
2351 case POOL_STATE_EXPORTED
:
2353 * A pool with an exported state may in fact be imported
2354 * read-only, so check the in-core state to see if it's
2355 * active and imported read-only. If it is, set
2356 * its state to active.
2358 if (pool_active(hdl
, name
, guid
, &isactive
) == 0 && isactive
&&
2359 (zhp
= zpool_open_canfail(hdl
, name
)) != NULL
) {
2360 if (zpool_get_prop_int(zhp
, ZPOOL_PROP_READONLY
, NULL
))
2361 stateval
= POOL_STATE_ACTIVE
;
2364 * All we needed the zpool handle for is the
2365 * readonly prop check.
2373 case POOL_STATE_ACTIVE
:
2375 * For an active pool, we have to determine if it's really part
2376 * of a currently active pool (in which case the pool will exist
2377 * and the guid will be the same), or whether it's part of an
2378 * active pool that was disconnected without being explicitly
2381 if (pool_active(hdl
, name
, guid
, &isactive
) != 0) {
2382 nvlist_free(config
);
2388 * Because the device may have been removed while
2389 * offlined, we only report it as active if the vdev is
2390 * still present in the config. Otherwise, pretend like
2393 if ((zhp
= zpool_open_canfail(hdl
, name
)) != NULL
&&
2394 (pool_config
= zpool_get_config(zhp
, NULL
))
2398 verify(nvlist_lookup_nvlist(pool_config
,
2399 ZPOOL_CONFIG_VDEV_TREE
, &nvroot
) == 0);
2400 ret
= find_guid(nvroot
, vdev_guid
);
2406 * If this is an active spare within another pool, we
2407 * treat it like an unused hot spare. This allows the
2408 * user to create a pool with a hot spare that currently
2409 * in use within another pool. Since we return B_TRUE,
2410 * libdiskmgt will continue to prevent generic consumers
2411 * from using the device.
2413 if (ret
&& nvlist_lookup_uint64(config
,
2414 ZPOOL_CONFIG_IS_SPARE
, &isspare
) == 0 && isspare
)
2415 stateval
= POOL_STATE_SPARE
;
2420 stateval
= POOL_STATE_POTENTIALLY_ACTIVE
;
2425 case POOL_STATE_SPARE
:
2427 * For a hot spare, it can be either definitively in use, or
2428 * potentially active. To determine if it's in use, we iterate
2429 * over all pools in the system and search for one with a spare
2430 * with a matching guid.
2432 * Due to the shared nature of spares, we don't actually report
2433 * the potentially active case as in use. This means the user
2434 * can freely create pools on the hot spares of exported pools,
2435 * but to do otherwise makes the resulting code complicated, and
2436 * we end up having to deal with this case anyway.
2439 cb
.cb_guid
= vdev_guid
;
2440 cb
.cb_type
= ZPOOL_CONFIG_SPARES
;
2441 if (zpool_iter(hdl
, find_aux
, &cb
) == 1) {
2442 name
= (char *)zpool_get_name(cb
.cb_zhp
);
2449 case POOL_STATE_L2CACHE
:
2452 * Check if any pool is currently using this l2cache device.
2455 cb
.cb_guid
= vdev_guid
;
2456 cb
.cb_type
= ZPOOL_CONFIG_L2CACHE
;
2457 if (zpool_iter(hdl
, find_aux
, &cb
) == 1) {
2458 name
= (char *)zpool_get_name(cb
.cb_zhp
);
2471 if ((*namestr
= zfs_strdup(hdl
, name
)) == NULL
) {
2473 zpool_close(cb
.cb_zhp
);
2474 nvlist_free(config
);
2477 *state
= (pool_state_t
)stateval
;
2481 zpool_close(cb
.cb_zhp
);
2483 nvlist_free(config
);