4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25 * Copyright 2015 RackTop Systems.
26 * Copyright (c) 2016, Intel Corporation.
30 * Pool import support functions.
32 * To import a pool, we rely on reading the configuration information from the
33 * ZFS label of each device. If we successfully read the label, then we
34 * organize the configuration information in the following hierarchy:
36 * pool guid -> toplevel vdev guid -> label txg
38 * Duplicate entries matching this same tuple will be discarded. Once we have
39 * examined every device, we pick the best label txg config for each toplevel
40 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
41 * update any paths that have changed. Finally, we attempt to import the pool
42 * using our derived config, and record the results.
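 *
 * As an illustrative sketch (hypothetical guids and txgs, not taken from a
 * real pool), the hierarchy built while examining three labels of a single
 * pool might look like:
 *
 *	pool guid 0x1111
 *	    toplevel vdev guid 0xaaaa
 *		label txg 100 -> config A	(kept, highest txg)
 *		label txg  90 -> config B	(discarded duplicate)
 *	    toplevel vdev guid 0xbbbb
 *		label txg 100 -> config C	(kept)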
62 #include <sys/dktp/fdisk.h>
63 #include <sys/efi_partition.h>
64 #include <thread_pool.h>
65 #include <sys/vdev_impl.h>
66 #include <blkid/blkid.h>
68 #include "libzfs_impl.h"
72 * Intermediate structures used to gather configuration information.
74 typedef struct config_entry {
77 	struct config_entry *ce_next;
80 typedef struct vdev_entry {
82 	config_entry_t *ve_configs;
83 	struct vdev_entry *ve_next;
86 typedef struct pool_entry {
88 	vdev_entry_t *pe_vdevs;
89 	struct pool_entry *pe_next;
92 typedef struct name_entry {
96 	uint64_t ne_num_labels;
97 	struct name_entry *ne_next;
100 typedef struct pool_list {
105 #define DEV_BYID_PATH "/dev/disk/by-id/"
108 * Linux persistent device strings for vdev labels
110 * based on libudev for consistency with libudev disk add/remove events
114 typedef struct vdev_dev_strs {
116 	char vds_devphys[128];
120 * Obtain the persistent device id string (describes what)
122 * used by ZED vdev matching for auto-{online,expand,replace}
125 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
127 	struct udev_list_entry *entry;
129 	char devbyid[MAXPATHLEN];
131 	/* The bus based by-id path is preferred */
132 	bus = udev_device_get_property_value(dev, "ID_BUS");
138 * For multipath nodes use the persistent uuid based identifier
140 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
142 	dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
143 	if (dm_uuid != NULL) {
144 		(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
149 * For volumes use the persistent /dev/zvol/dataset identifier
151 	entry = udev_device_get_devlinks_list_entry(dev);
152 	while (entry != NULL) {
155 		name = udev_list_entry_get_name(entry);
156 		if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
157 			(void) strlcpy(bufptr, name, buflen);
160 		entry = udev_list_entry_get_next(entry);
164 * NVME 'by-id' symlinks are similar to bus case
166 	struct udev_device *parent;
168 	parent = udev_device_get_parent_with_subsystem_devtype(dev,
171 	bus = "nvme"; /* continue with bus symlink search */
177 * locate the bus specific by-id link
179 	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
180 	entry = udev_device_get_devlinks_list_entry(dev);
181 	while (entry != NULL) {
184 		name = udev_list_entry_get_name(entry);
185 		if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
186 			name += strlen(DEV_BYID_PATH);
187 			(void) strlcpy(bufptr, name, buflen);
190 		entry = udev_list_entry_get_next(entry);
197 * Obtain the persistent physical location string (describes where)
199 * used by ZED vdev matching for auto-{online,expand,replace}
202 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
204 	const char *physpath = NULL;
205 	struct udev_list_entry *entry;
208 * Normal disks use ID_PATH for their physical path.
210 	physpath = udev_device_get_property_value(dev, "ID_PATH");
211 	if (physpath != NULL && strlen(physpath) > 0) {
212 		(void) strlcpy(bufptr, physpath, buflen);
217 * Device mapper devices are virtual and don't have a physical
218 * path. For them we use ID_VDEV instead, which is setup via the
219 * /etc/vdev_id.conf file. ID_VDEV provides a persistent path
220 * to a virtual device. If you don't have vdev_id.conf setup,
221 * you cannot use multipath autoreplace with device mapper.
223 	physpath = udev_device_get_property_value(dev, "ID_VDEV");
224 	if (physpath != NULL && strlen(physpath) > 0) {
225 		(void) strlcpy(bufptr, physpath, buflen);
230 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
232 	entry = udev_device_get_devlinks_list_entry(dev);
233 	while (entry != NULL) {
234 		physpath = udev_list_entry_get_name(entry);
235 		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
236 			(void) strlcpy(bufptr, physpath, buflen);
239 		entry = udev_list_entry_get_next(entry);
243 * For all other devices fallback to using the by-uuid name.
245 	entry = udev_device_get_devlinks_list_entry(dev);
246 	while (entry != NULL) {
247 		physpath = udev_list_entry_get_name(entry);
248 		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
249 			(void) strlcpy(bufptr, physpath, buflen);
252 		entry = udev_list_entry_get_next(entry);
259 udev_is_mpath(struct udev_device *dev)
261 	return udev_device_get_property_value(dev, "DM_UUID") &&
262 	    udev_device_get_property_value(dev, "MPATH_SBIN_PATH");
266 * A disk is considered a multipath whole disk when:
267 * DEVNAME key value has "dm-"
268 * DM_NAME key value has "mpath" prefix
270 * ID_PART_TABLE_TYPE key does not exist or is not gpt
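 *
 * For example (hypothetical property values, shown for illustration only):
 *	DEVNAME="/dev/dm-0", DM_UUID="mpath-35000c5006304de3f",
 *	and no ID_PART_TABLE_TYPE key
 * would be treated as a multipath whole disk.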
273 udev_mpath_whole_disk(struct udev_device *dev)
275 	const char *devname, *type, *uuid;
277 	devname = udev_device_get_property_value(dev, "DEVNAME");
278 	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
279 	uuid = udev_device_get_property_value(dev, "DM_UUID");
281 	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
282 	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
291 * Check if a disk is effectively a multipath whole disk
294 is_mpath_whole_disk(const char *path)
297 	struct udev_device *dev = NULL;
298 	char nodepath[MAXPATHLEN];
300 	boolean_t wholedisk = B_FALSE;
302 	if (realpath(path, nodepath) == NULL)
304 	sysname = strrchr(nodepath, '/') + 1;
305 	if (strncmp(sysname, "dm-", 3) != 0)
307 	if ((udev = udev_new()) == NULL)
309 	if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
311 		udev_device_unref(dev);
315 	wholedisk = udev_mpath_whole_disk(dev);
317 	udev_device_unref(dev);
322 udev_device_is_ready(struct udev_device *dev)
324 #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
325 	return (udev_device_get_is_initialized(dev));
327 	/* wait for DEVLINKS property to be initialized */
328 	return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
333 * Wait up to timeout_ms for udev to set up the device node. The device is
334 * considered ready when libudev determines it has been initialized, all of
335 * the device links have been verified to exist, and it has been allowed to
336 * settle. At this point the device can be accessed reliably.
337 * Depending on the complexity of the udev rules this process could take several seconds.
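 *
 * Illustrative calling sketch (the device path is hypothetical); callers
 * typically wait on the expected partition link after labeling a disk:
 *
 *	error = zpool_label_disk_wait("/dev/disk/by-id/scsi-example-part1",
 *	    DISK_LABEL_WAIT);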
341 zpool_label_disk_wait(char *path, int timeout_ms)
344 	struct udev_device *dev = NULL;
345 	char nodepath[MAXPATHLEN];
346 	char *sysname = NULL;
350 	hrtime_t start, settle;
352 	if ((udev = udev_new()) == NULL)
359 		if (sysname == NULL) {
360 			if (realpath(path, nodepath) != NULL) {
361 				sysname = strrchr(nodepath, '/') + 1;
363 				(void) usleep(sleep_ms * MILLISEC);
368 		dev = udev_device_new_from_subsystem_sysname(udev,
370 		if ((dev != NULL) && udev_device_is_ready(dev)) {
371 			struct udev_list_entry *links, *link = NULL;
374 			links = udev_device_get_devlinks_list_entry(dev);
376 			udev_list_entry_foreach(link, links) {
377 				struct stat64 statbuf;
380 				name = udev_list_entry_get_name(link);
382 				if (stat64(name, &statbuf) == 0 && errno == 0)
392 				settle = gethrtime();
393 			} else if (NSEC2MSEC(gethrtime() - settle) >=
395 				udev_device_unref(dev);
401 		udev_device_unref(dev);
402 		(void) usleep(sleep_ms * MILLISEC);
404 	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
413 * Encode the persistent device strings
414 * used for the vdev disk label
417 encode_device_strings(const char *path, vdev_dev_strs_t *ds,
421 	struct udev_device *dev = NULL;
422 	char nodepath[MAXPATHLEN];
427 	if ((udev = udev_new()) == NULL)
430 	/* resolve path to a runtime device node instance */
431 	if (realpath(path, nodepath) == NULL)
434 	sysname = strrchr(nodepath, '/') + 1;
437 * Wait up to 3 seconds for udev to set up the device node context
441 		dev = udev_device_new_from_subsystem_sysname(udev, "block",
445 		if (udev_device_is_ready(dev))
446 			break; /* udev ready */
448 		udev_device_unref(dev);
451 		if (NSEC2MSEC(gethrtime() - start) < 10)
452 			(void) sched_yield(); /* yield/busy wait up to 10ms */
454 			(void) usleep(10 * MILLISEC);
456 	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
462 * Only whole disks require extra device strings
464 	if (!wholedisk && !udev_mpath_whole_disk(dev))
467 	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
471 	/* physical location string (optional) */
472 	if (zfs_device_get_physical(dev, ds->vds_devphys,
473 	    sizeof (ds->vds_devphys)) != 0) {
474 		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
478 	udev_device_unref(dev);
486 * Update a leaf vdev's persistent device strings (Linux only)
488 * - only applies for a dedicated leaf vdev (aka whole disk)
489 * - updated during pool create|add|attach|import
490 * - used for device matching during auto-{online,expand,replace}
491 * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
492 * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
494 * single device node example:
495 * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
496 * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
498 * multipath device node example:
499 * devid: 'dm-uuid-mpath-35000c5006304de3f'
501 * We also store the enclosure sysfs path for turning on enclosure LEDs
503 * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
506 update_vdev_config_dev_strs(nvlist_t *nv)
509 	char *env, *type, *path;
510 	uint64_t wholedisk = 0;
514 * For the benefit of legacy ZFS implementations, allow
515 * for opting out of devid strings in the vdev label.
518 * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
521 * Older ZFS on Linux implementations had issues when attempting to
522 * display pool config VDEV names if a "devid" NVP value is present
523 * in the pool's config.
525 * For example, a pool that originated on illumos platform would
526 * have a devid value in the config and "zpool status" would fail
527 * when listing the config.
529 * A pool can be stripped of any "devid" values on import or
530 * prevented from adding them on zpool create|add by setting
531 * ZFS_VDEV_DEVID_OPT_OUT.
533 	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
534 	if (env && (strtoul(env, NULL, 0) > 0 ||
535 	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
536 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
537 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
541 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
542 	    strcmp(type, VDEV_TYPE_DISK) != 0) {
545 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
547 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
550 * Update device string values in config nvlist
552 	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
553 		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
554 		if (vds.vds_devphys[0] != '\0') {
555 			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
559 		/* Add enclosure sysfs path (if disk is in an enclosure) */
560 		upath = zfs_get_underlying_path(path);
561 		spath = zfs_get_enclosure_sysfs_path(upath);
563 			nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
566 			nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
571 		/* clear out any stale entries */
572 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
573 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
574 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
580 is_mpath_whole_disk(const char *path)
586 * Wait up to timeout_ms for udev to set up the device node. The device is
587 * considered ready when the provided path has been verified to exist and
588 * it has been allowed to settle. At this point the device can
589 * be accessed reliably. Depending on the complexity of the udev rules this
590 * process could take several seconds.
593 zpool_label_disk_wait(char *path, int timeout_ms)
597 	hrtime_t start, settle;
598 	struct stat64 statbuf;
605 		if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
607 				settle = gethrtime();
608 			else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
610 		} else if (errno != ENOENT) {
614 		usleep(sleep_ms * MILLISEC);
615 	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
621 update_vdev_config_dev_strs(nvlist_t *nv)
625 #endif /* HAVE_LIBUDEV */
628 * Go through and fix up any path and/or devid information for the given vdev
632 fix_paths(nvlist_t *nv, name_entry_t *names)
637 	name_entry_t *ne, *best;
640 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
641 	    &child, &children) == 0) {
642 		for (c = 0; c < children; c++)
643 			if (fix_paths(child[c], names) != 0)
649 * This is a leaf (file or disk) vdev. In either case, go through
650 * the name list and see if we find a matching guid. If so, replace
651 * the path and see if we can calculate a new devid.
653 * There may be multiple names associated with a particular guid, in
654 * which case we have overlapping partitions or multiple paths to the
655 * same disk. In this case we prefer to use the path name which
656 * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we
657 * use the lowest order device which corresponds to the first match
658 * while traversing the ZPOOL_IMPORT_PATH search path.
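 *
 * As an illustrative example (hypothetical paths): if the same guid was seen
 * at /dev/sdb1 with 2 valid labels (search order 5) and at
 * /dev/disk/by-id/scsi-example-part1 with 4 valid labels (search order 1),
 * the by-id name is chosen since it has more valid labels; it would also win
 * the tie-break on the lower search order.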
660 	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
661 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
665 	for (ne = names; ne != NULL; ne = ne->ne_next) {
666 		if (ne->ne_guid == guid) {
672 			if ((strlen(path) == strlen(ne->ne_name)) &&
673 			    strncmp(path, ne->ne_name, strlen(path)) == 0) {
683 			/* Prefer paths with more vdev labels. */
684 			if (ne->ne_num_labels > best->ne_num_labels) {
689 			/* Prefer paths earlier in the search order. */
690 			if (ne->ne_num_labels == best->ne_num_labels &&
691 			    ne->ne_order < best->ne_order) {
701 	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
704 	/* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
705 	update_vdev_config_dev_strs(nv);
711 * Add the given configuration to the list of known devices.
714 add_config(libzfs_handle_t
*hdl
, pool_list_t
*pl
, const char *path
,
715 int order
, int num_labels
, nvlist_t
*config
)
717 uint64_t pool_guid
, vdev_guid
, top_guid
, txg
, state
;
724 * If this is a hot spare not currently in use or level 2 cache
725 * device, add it to the list of names to translate, but don't do
728 if (nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_STATE
,
730 (state
== POOL_STATE_SPARE
|| state
== POOL_STATE_L2CACHE
) &&
731 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
, &vdev_guid
) == 0) {
732 if ((ne
= zfs_alloc(hdl
, sizeof (name_entry_t
))) == NULL
)
735 if ((ne
->ne_name
= zfs_strdup(hdl
, path
)) == NULL
) {
739 ne
->ne_guid
= vdev_guid
;
740 ne
->ne_order
= order
;
741 ne
->ne_num_labels
= num_labels
;
742 ne
->ne_next
= pl
->names
;
749 * If we have a valid config but cannot read any of these fields, then
750 * it means we have a half-initialized label. In vdev_label_init()
751 * we write a label with txg == 0 so that we can identify the device
752 * in case the user refers to the same disk later on. If we fail to
753 * create the pool, we'll be left with a label in this state
754 * which should not be considered part of a valid pool.
756 if (nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
758 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
,
760 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_TOP_GUID
,
762 nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_TXG
,
763 &txg
) != 0 || txg
== 0) {
768 * First, see if we know about this pool. If not, then add it to the
769 * list of known pools.
771 for (pe
= pl
->pools
; pe
!= NULL
; pe
= pe
->pe_next
) {
772 if (pe
->pe_guid
== pool_guid
)
777 if ((pe
= zfs_alloc(hdl
, sizeof (pool_entry_t
))) == NULL
) {
780 pe
->pe_guid
= pool_guid
;
781 pe
->pe_next
= pl
->pools
;
786 * Second, see if we know about this toplevel vdev. Add it if its
789 for (ve
= pe
->pe_vdevs
; ve
!= NULL
; ve
= ve
->ve_next
) {
790 if (ve
->ve_guid
== top_guid
)
795 if ((ve
= zfs_alloc(hdl
, sizeof (vdev_entry_t
))) == NULL
) {
798 ve
->ve_guid
= top_guid
;
799 ve
->ve_next
= pe
->pe_vdevs
;
804 * Third, see if we have a config with a matching transaction group. If
805 * so, then we do nothing. Otherwise, add it to the list of known
808 for (ce
= ve
->ve_configs
; ce
!= NULL
; ce
= ce
->ce_next
) {
809 if (ce
->ce_txg
== txg
)
814 if ((ce
= zfs_alloc(hdl
, sizeof (config_entry_t
))) == NULL
) {
818 ce
->ce_config
= fnvlist_dup(config
);
819 ce
->ce_next
= ve
->ve_configs
;
824 * At this point we've successfully added our config to the list of
825 * known configs. The last thing to do is add the vdev guid -> path
826 * mappings so that we can fix up the configuration as necessary before
829 if ((ne
= zfs_alloc(hdl
, sizeof (name_entry_t
))) == NULL
)
832 if ((ne
->ne_name
= zfs_strdup(hdl
, path
)) == NULL
) {
837 ne
->ne_guid
= vdev_guid
;
838 ne
->ne_order
= order
;
839 ne
->ne_num_labels
= num_labels
;
840 ne
->ne_next
= pl
->names
;
847 * Returns true if the named pool matches the given GUID.
850 pool_active(libzfs_handle_t
*hdl
, const char *name
, uint64_t guid
,
856 if (zpool_open_silent(hdl
, name
, &zhp
) != 0)
864 verify(nvlist_lookup_uint64(zhp
->zpool_config
, ZPOOL_CONFIG_POOL_GUID
,
869 *isactive
= (theguid
== guid
);
874 refresh_config(libzfs_handle_t
*hdl
, nvlist_t
*config
)
877 zfs_cmd_t zc
= {"\0"};
878 int err
, dstbuf_size
;
880 if (zcmd_write_conf_nvlist(hdl
, &zc
, config
) != 0)
883 dstbuf_size
= MAX(CONFIG_BUF_MINSIZE
, zc
.zc_nvlist_conf_size
* 4);
885 if (zcmd_alloc_dst_nvlist(hdl
, &zc
, dstbuf_size
) != 0) {
886 zcmd_free_nvlists(&zc
);
890 while ((err
= ioctl(hdl
->libzfs_fd
, ZFS_IOC_POOL_TRYIMPORT
,
891 &zc
)) != 0 && errno
== ENOMEM
) {
892 if (zcmd_expand_dst_nvlist(hdl
, &zc
) != 0) {
893 zcmd_free_nvlists(&zc
);
899 zcmd_free_nvlists(&zc
);
903 if (zcmd_read_dst_nvlist(hdl
, &zc
, &nvl
) != 0) {
904 zcmd_free_nvlists(&zc
);
908 zcmd_free_nvlists(&zc
);
913 * Determine if the vdev id is a hole in the namespace.
916 vdev_is_hole(uint64_t *hole_array
, uint_t holes
, uint_t id
)
920 for (c
= 0; c
< holes
; c
++) {
922 /* Top-level is a hole */
923 if (hole_array
[c
] == id
)
930 * Convert our list of pools into the definitive set of configurations. We
931 * start by picking the best config for each toplevel vdev. Once that's done,
932 * we assemble the toplevel vdevs into a full config for the pool. We make a
933 * pass to fix up any incorrect paths, and then add it to the main list to
934 * return to the user.
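 *
 * Sketch of the resulting config for a hypothetical two-vdev pool in which
 * top-level vdev 1 was never discovered (illustrative only):
 *
 *	name: 'tank'
 *	vdev_tree:
 *	    type: 'root'
 *	    children[0]: best (highest label txg) config found for vdev 0
 *	    children[1]: 'missing' placeholder vdev, guid 0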
937 get_configs(libzfs_handle_t
*hdl
, pool_list_t
*pl
, boolean_t active_ok
,
943 nvlist_t
*ret
= NULL
, *config
= NULL
, *tmp
= NULL
, *nvtop
, *nvroot
;
944 nvlist_t
**spares
, **l2cache
;
945 uint_t i
, nspares
, nl2cache
;
946 boolean_t config_seen
;
948 char *name
, *hostname
= NULL
;
951 nvlist_t
**child
= NULL
;
953 uint64_t *hole_array
, max_id
;
958 boolean_t valid_top_config
= B_FALSE
;
960 if (nvlist_alloc(&ret
, 0, 0) != 0)
963 for (pe
= pl
->pools
; pe
!= NULL
; pe
= pe
->pe_next
) {
964 uint64_t id
, max_txg
= 0;
966 if (nvlist_alloc(&config
, NV_UNIQUE_NAME
, 0) != 0)
968 config_seen
= B_FALSE
;
971 * Iterate over all toplevel vdevs. Grab the pool configuration
972 * from the first one we find, and then go through the rest and
973 * add them as necessary to the 'vdevs' member of the config.
975 for (ve
= pe
->pe_vdevs
; ve
!= NULL
; ve
= ve
->ve_next
) {
978 * Determine the best configuration for this vdev by
979 * selecting the config with the latest transaction
983 for (ce
= ve
->ve_configs
; ce
!= NULL
;
986 if (ce
->ce_txg
> best_txg
) {
988 best_txg
= ce
->ce_txg
;
993 * We rely on the fact that the max txg for the
994 * pool will contain the most up-to-date information
995 * about the valid top-levels in the vdev namespace.
997 if (best_txg
> max_txg
) {
998 (void) nvlist_remove(config
,
999 ZPOOL_CONFIG_VDEV_CHILDREN
,
1001 (void) nvlist_remove(config
,
1002 ZPOOL_CONFIG_HOLE_ARRAY
,
1003 DATA_TYPE_UINT64_ARRAY
);
1009 valid_top_config
= B_FALSE
;
1011 if (nvlist_lookup_uint64(tmp
,
1012 ZPOOL_CONFIG_VDEV_CHILDREN
, &max_id
) == 0) {
1013 verify(nvlist_add_uint64(config
,
1014 ZPOOL_CONFIG_VDEV_CHILDREN
,
1016 valid_top_config
= B_TRUE
;
1019 if (nvlist_lookup_uint64_array(tmp
,
1020 ZPOOL_CONFIG_HOLE_ARRAY
, &hole_array
,
1022 verify(nvlist_add_uint64_array(config
,
1023 ZPOOL_CONFIG_HOLE_ARRAY
,
1024 hole_array
, holes
) == 0);
1030 * Copy the relevant pieces of data to the pool
1036 * comment (if available)
1038 * hostid (if available)
1039 * hostname (if available)
1041 uint64_t state
, version
;
1042 char *comment
= NULL
;
1044 version
= fnvlist_lookup_uint64(tmp
,
1045 ZPOOL_CONFIG_VERSION
);
1046 fnvlist_add_uint64(config
,
1047 ZPOOL_CONFIG_VERSION
, version
);
1048 guid
= fnvlist_lookup_uint64(tmp
,
1049 ZPOOL_CONFIG_POOL_GUID
);
1050 fnvlist_add_uint64(config
,
1051 ZPOOL_CONFIG_POOL_GUID
, guid
);
1052 name
= fnvlist_lookup_string(tmp
,
1053 ZPOOL_CONFIG_POOL_NAME
);
1054 fnvlist_add_string(config
,
1055 ZPOOL_CONFIG_POOL_NAME
, name
);
1057 if (nvlist_lookup_string(tmp
,
1058 ZPOOL_CONFIG_COMMENT
, &comment
) == 0)
1059 fnvlist_add_string(config
,
1060 ZPOOL_CONFIG_COMMENT
, comment
);
1062 state
= fnvlist_lookup_uint64(tmp
,
1063 ZPOOL_CONFIG_POOL_STATE
);
1064 fnvlist_add_uint64(config
,
1065 ZPOOL_CONFIG_POOL_STATE
, state
);
1068 if (nvlist_lookup_uint64(tmp
,
1069 ZPOOL_CONFIG_HOSTID
, &hostid
) == 0) {
1070 fnvlist_add_uint64(config
,
1071 ZPOOL_CONFIG_HOSTID
, hostid
);
1072 hostname
= fnvlist_lookup_string(tmp
,
1073 ZPOOL_CONFIG_HOSTNAME
);
1074 fnvlist_add_string(config
,
1075 ZPOOL_CONFIG_HOSTNAME
, hostname
);
1078 config_seen
= B_TRUE
;
1082 * Add this top-level vdev to the child array.
1084 verify(nvlist_lookup_nvlist(tmp
,
1085 ZPOOL_CONFIG_VDEV_TREE
, &nvtop
) == 0);
1086 verify(nvlist_lookup_uint64(nvtop
, ZPOOL_CONFIG_ID
,
1089 if (id
>= children
) {
1090 nvlist_t
**newchild
;
1092 newchild
= zfs_alloc(hdl
, (id
+ 1) *
1093 sizeof (nvlist_t
*));
1094 if (newchild
== NULL
)
1097 for (c
= 0; c
< children
; c
++)
1098 newchild
[c
] = child
[c
];
1104 if (nvlist_dup(nvtop
, &child
[id
], 0) != 0)
1110 * If we have information about all the top-levels then
1111 * clean up the nvlist which we've constructed. This
1112 * means removing any extraneous devices that are
1113 * beyond the valid range or adding devices to the end
1114 * of our array which appear to be missing.
1116 if (valid_top_config
) {
1117 if (max_id
< children
) {
1118 for (c
= max_id
; c
< children
; c
++)
1119 nvlist_free(child
[c
]);
1121 } else if (max_id
> children
) {
1122 nvlist_t
**newchild
;
1124 newchild
= zfs_alloc(hdl
, (max_id
) *
1125 sizeof (nvlist_t
*));
1126 if (newchild
== NULL
)
1129 for (c
= 0; c
< children
; c
++)
1130 newchild
[c
] = child
[c
];
1138 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
1142 * The vdev namespace may contain holes as a result of
1143 * device removal. We must add them back into the vdev
1144 * tree before we process any missing devices.
1147 ASSERT(valid_top_config
);
1149 for (c
= 0; c
< children
; c
++) {
1152 if (child
[c
] != NULL
||
1153 !vdev_is_hole(hole_array
, holes
, c
))
1156 if (nvlist_alloc(&holey
, NV_UNIQUE_NAME
,
1161 * Holes in the namespace are treated as
1162 * "hole" top-level vdevs and have a
1163 * special flag set on them.
1165 if (nvlist_add_string(holey
,
1167 VDEV_TYPE_HOLE
) != 0 ||
1168 nvlist_add_uint64(holey
,
1169 ZPOOL_CONFIG_ID
, c
) != 0 ||
1170 nvlist_add_uint64(holey
,
1171 ZPOOL_CONFIG_GUID
, 0ULL) != 0) {
1180 * Look for any missing top-level vdevs. If this is the case,
1181 * create a faked up 'missing' vdev as a placeholder. We cannot
1182 * simply compress the child array, because the kernel performs
1183 * certain checks to make sure the vdev IDs match their location
1184 * in the configuration.
1186 for (c
= 0; c
< children
; c
++) {
1187 if (child
[c
] == NULL
) {
1189 if (nvlist_alloc(&missing
, NV_UNIQUE_NAME
,
1192 if (nvlist_add_string(missing
,
1194 VDEV_TYPE_MISSING
) != 0 ||
1195 nvlist_add_uint64(missing
,
1196 ZPOOL_CONFIG_ID
, c
) != 0 ||
1197 nvlist_add_uint64(missing
,
1198 ZPOOL_CONFIG_GUID
, 0ULL) != 0) {
1199 nvlist_free(missing
);
1207 * Put all of this pool's top-level vdevs into a root vdev.
1209 if (nvlist_alloc(&nvroot
, NV_UNIQUE_NAME
, 0) != 0)
1211 if (nvlist_add_string(nvroot
, ZPOOL_CONFIG_TYPE
,
1212 VDEV_TYPE_ROOT
) != 0 ||
1213 nvlist_add_uint64(nvroot
, ZPOOL_CONFIG_ID
, 0ULL) != 0 ||
1214 nvlist_add_uint64(nvroot
, ZPOOL_CONFIG_GUID
, guid
) != 0 ||
1215 nvlist_add_nvlist_array(nvroot
, ZPOOL_CONFIG_CHILDREN
,
1216 child
, children
) != 0) {
1217 nvlist_free(nvroot
);
1221 for (c
= 0; c
< children
; c
++)
1222 nvlist_free(child
[c
]);
1228 * Go through and fix up any paths and/or devids based on our
1229 * known list of vdev GUID -> path mappings.
1231 if (fix_paths(nvroot
, pl
->names
) != 0) {
1232 nvlist_free(nvroot
);
1237 * Add the root vdev to this pool's configuration.
1239 if (nvlist_add_nvlist(config
, ZPOOL_CONFIG_VDEV_TREE
,
1241 nvlist_free(nvroot
);
1244 nvlist_free(nvroot
);
1247 * zdb uses this path to report on active pools that were
1248 * imported or created using -R.
1254 * Determine if this pool is currently active, in which case we
1255 * can't actually import it.
1257 verify(nvlist_lookup_string(config
, ZPOOL_CONFIG_POOL_NAME
,
1259 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
1262 if (pool_active(hdl
, name
, guid
, &isactive
) != 0)
1266 nvlist_free(config
);
1271 if (policy
!= NULL
) {
1272 if (nvlist_add_nvlist(config
, ZPOOL_LOAD_POLICY
,
1277 if ((nvl
= refresh_config(hdl
, config
)) == NULL
) {
1278 nvlist_free(config
);
1283 nvlist_free(config
);
1287 * Go through and update the paths for spares, now that we have
1290 verify(nvlist_lookup_nvlist(config
, ZPOOL_CONFIG_VDEV_TREE
,
1292 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_SPARES
,
1293 &spares
, &nspares
) == 0) {
1294 for (i
= 0; i
< nspares
; i
++) {
1295 if (fix_paths(spares
[i
], pl
->names
) != 0)
1301 * Update the paths for l2cache devices.
1303 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_L2CACHE
,
1304 &l2cache
, &nl2cache
) == 0) {
1305 for (i
= 0; i
< nl2cache
; i
++) {
1306 if (fix_paths(l2cache
[i
], pl
->names
) != 0)
1312 * Restore the original information read from the actual label.
1314 (void) nvlist_remove(config
, ZPOOL_CONFIG_HOSTID
,
1316 (void) nvlist_remove(config
, ZPOOL_CONFIG_HOSTNAME
,
1319 verify(nvlist_add_uint64(config
, ZPOOL_CONFIG_HOSTID
,
1321 verify(nvlist_add_string(config
, ZPOOL_CONFIG_HOSTNAME
,
1327 * Add this pool to the list of configs.
1329 verify(nvlist_lookup_string(config
, ZPOOL_CONFIG_POOL_NAME
,
1331 if (nvlist_add_nvlist(ret
, name
, config
) != 0)
1334 nvlist_free(config
);
1341 (void) no_memory(hdl
);
1343 nvlist_free(config
);
1345 for (c
= 0; c
< children
; c
++)
1346 nvlist_free(child
[c
]);
1353 * Return the offset of the given label.
1356 label_offset(uint64_t size, int l)
1358 	ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
1359 	return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
1360 	    0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
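/*
 * Worked example (hypothetical device, assuming the 256K on-disk label size
 * and VDEV_LABELS == 4): labels 0 and 1 are read from offsets 0 and 256K at
 * the front of the device, while labels 2 and 3 are read from size - 512K
 * and size - 256K at the end of the aligned device size.
 */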
1364 * Given a file descriptor, read the label information and return an nvlist
1365 * describing the configuration, if there is one. The number of valid
1366 * labels found will be returned in num_labels when non-NULL.
1369 zpool_read_label(int fd
, nvlist_t
**config
, int *num_labels
)
1371 struct stat64 statbuf
;
1373 vdev_label_t
*label
;
1374 nvlist_t
*expected_config
= NULL
;
1375 uint64_t expected_guid
= 0, size
;
1380 if (fstat64_blk(fd
, &statbuf
) == -1)
1382 size
= P2ALIGN_TYPED(statbuf
.st_size
, sizeof (vdev_label_t
), uint64_t);
1384 error
= posix_memalign((void **)&label
, PAGESIZE
, sizeof (*label
));
1388 for (l
= 0; l
< VDEV_LABELS
; l
++) {
1389 uint64_t state
, guid
, txg
;
1391 if (pread64(fd
, label
, sizeof (vdev_label_t
),
1392 label_offset(size
, l
)) != sizeof (vdev_label_t
))
1395 if (nvlist_unpack(label
->vl_vdev_phys
.vp_nvlist
,
1396 sizeof (label
->vl_vdev_phys
.vp_nvlist
), config
, 0) != 0)
1399 if (nvlist_lookup_uint64(*config
, ZPOOL_CONFIG_GUID
,
1400 &guid
) != 0 || guid
== 0) {
1401 nvlist_free(*config
);
1405 if (nvlist_lookup_uint64(*config
, ZPOOL_CONFIG_POOL_STATE
,
1406 &state
) != 0 || state
> POOL_STATE_L2CACHE
) {
1407 nvlist_free(*config
);
1411 if (state
!= POOL_STATE_SPARE
&& state
!= POOL_STATE_L2CACHE
&&
1412 (nvlist_lookup_uint64(*config
, ZPOOL_CONFIG_POOL_TXG
,
1413 &txg
) != 0 || txg
== 0)) {
1414 nvlist_free(*config
);
1418 if (expected_guid
) {
1419 if (expected_guid
== guid
)
1422 nvlist_free(*config
);
1424 expected_config
= *config
;
1425 expected_guid
= guid
;
1430 if (num_labels
!= NULL
)
1431 *num_labels
= count
;
1434 *config
= expected_config
;
1439 typedef struct rdsk_node
{
1440 char *rn_name
; /* Full path to device */
1441 int rn_order
; /* Preferred order (low to high) */
1442 int rn_num_labels
; /* Number of valid labels */
1443 uint64_t rn_vdev_guid
; /* Expected vdev guid when set */
1444 libzfs_handle_t
*rn_hdl
;
1445 nvlist_t
*rn_config
; /* Label config */
1448 pthread_mutex_t
*rn_lock
;
1449 boolean_t rn_labelpaths
;
1453 * Sorted by vdev guid and full path to allow for multiple entries with
1454 * the same full path name. This is required because it's possible to
1455 * have multiple block devices with labels that refer to the same
1456 * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
1457 * entries need to be added to the cache. Scenarios where this can occur
1458 * include overwritten pool labels, devices which are visible from multiple
1459 * hosts and multipath devices.
1462 slice_cache_compare(const void *arg1, const void *arg2)
1464 	const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
1465 	const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
1466 	uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
1467 	uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
1470 	rv = AVL_CMP(guid1, guid2);
1474 	return (AVL_ISIGN(strcmp(nm1, nm2)));
1478 is_watchdog_dev(char *dev)
1480 	/* For 'watchdog' dev */
1481 	if (strcmp(dev, "watchdog") == 0)
1484 	/* For 'watchdog<digit><whatever>' */
1485 	if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
1492 label_paths_impl(libzfs_handle_t
*hdl
, nvlist_t
*nvroot
, uint64_t pool_guid
,
1493 uint64_t vdev_guid
, char **path
, char **devid
)
1501 if (nvlist_lookup_nvlist_array(nvroot
, ZPOOL_CONFIG_CHILDREN
,
1502 &child
, &children
) == 0) {
1503 for (c
= 0; c
< children
; c
++) {
1504 error
= label_paths_impl(hdl
, child
[c
],
1505 pool_guid
, vdev_guid
, path
, devid
);
1515 error
= nvlist_lookup_uint64(nvroot
, ZPOOL_CONFIG_GUID
, &guid
);
1516 if ((error
!= 0) || (guid
!= vdev_guid
))
1519 error
= nvlist_lookup_string(nvroot
, ZPOOL_CONFIG_PATH
, &val
);
1523 error
= nvlist_lookup_string(nvroot
, ZPOOL_CONFIG_DEVID
, &val
);
1531 * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
1532 * and store these strings as config_path and devid_path respectively.
1533 * The returned pointers are only valid as long as label remains valid.
1536 label_paths(libzfs_handle_t
*hdl
, nvlist_t
*label
, char **path
, char **devid
)
1545 if (nvlist_lookup_nvlist(label
, ZPOOL_CONFIG_VDEV_TREE
, &nvroot
) ||
1546 nvlist_lookup_uint64(label
, ZPOOL_CONFIG_POOL_GUID
, &pool_guid
) ||
1547 nvlist_lookup_uint64(label
, ZPOOL_CONFIG_GUID
, &vdev_guid
))
1550 return (label_paths_impl(hdl
, nvroot
, pool_guid
, vdev_guid
, path
,
1555 zpool_open_func(void *arg
)
1557 rdsk_node_t
*rn
= arg
;
1558 libzfs_handle_t
*hdl
= rn
->rn_hdl
;
1559 struct stat64 statbuf
;
1561 char *bname
, *dupname
;
1562 uint64_t vdev_guid
= 0;
1568 * Skip devices with well known prefixes; there can be side effects
1569 * when opening these devices which need to be avoided.
1571 * hpet - High Precision Event Timer
1572 * watchdog - Watchdog must be closed in a special way.
1574 dupname
= zfs_strdup(hdl
, rn
->rn_name
);
1575 bname
= basename(dupname
);
1576 error
= ((strcmp(bname
, "hpet") == 0) || is_watchdog_dev(bname
));
1582 * Ignore failed stats. We only want regular files and block devices.
1584 if (stat64(rn
->rn_name
, &statbuf
) != 0 ||
1585 (!S_ISREG(statbuf
.st_mode
) && !S_ISBLK(statbuf
.st_mode
)))
1589 * Preferentially open using O_DIRECT to bypass the block device
1590 * cache which may be stale for multipath devices. An EINVAL errno
1591 * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
1593 fd
= open(rn
->rn_name
, O_RDONLY
| O_DIRECT
);
1594 if ((fd
< 0) && (errno
== EINVAL
))
1595 fd
= open(rn
->rn_name
, O_RDONLY
);
1601 * This file is too small to hold a zpool
1603 if (S_ISREG(statbuf
.st_mode
) && statbuf
.st_size
< SPA_MINDEVSIZE
) {
1608 error
= zpool_read_label(fd
, &config
, &num_labels
);
1614 if (num_labels
== 0) {
1616 nvlist_free(config
);
1621 * Check that the vdev is for the expected guid. Additional entries
1622 * are speculatively added based on the paths stored in the labels.
1623 * Entries with valid paths but incorrect guids must be removed.
1625 error
= nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
, &vdev_guid
);
1626 if (error
|| (rn
->rn_vdev_guid
&& rn
->rn_vdev_guid
!= vdev_guid
)) {
1628 nvlist_free(config
);
1634 rn
->rn_config
= config
;
1635 rn
->rn_num_labels
= num_labels
;
1638 * Add additional entries for paths described by this label.
1640 if (rn
->rn_labelpaths
) {
1647 if (label_paths(rn
->rn_hdl
, rn
->rn_config
, &path
, &devid
))
1651 * Allow devlinks to stabilize so all paths are available.
1653 zpool_label_disk_wait(rn
->rn_name
, DISK_LABEL_WAIT
);
1656 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1657 slice
->rn_name
= zfs_strdup(hdl
, path
);
1658 slice
->rn_vdev_guid
= vdev_guid
;
1659 slice
->rn_avl
= rn
->rn_avl
;
1660 slice
->rn_hdl
= hdl
;
1661 slice
->rn_order
= IMPORT_ORDER_PREFERRED_1
;
1662 slice
->rn_labelpaths
= B_FALSE
;
1663 pthread_mutex_lock(rn
->rn_lock
);
1664 if (avl_find(rn
->rn_avl
, slice
, &where
)) {
1665 pthread_mutex_unlock(rn
->rn_lock
);
1666 free(slice
->rn_name
);
1669 avl_insert(rn
->rn_avl
, slice
, where
);
1670 pthread_mutex_unlock(rn
->rn_lock
);
1671 zpool_open_func(slice
);
1675 if (devid
!= NULL
) {
1676 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1677 error
= asprintf(&slice
->rn_name
, "%s%s",
1678 DEV_BYID_PATH
, devid
);
1684 slice
->rn_vdev_guid
= vdev_guid
;
1685 slice
->rn_avl
= rn
->rn_avl
;
1686 slice
->rn_hdl
= hdl
;
1687 slice
->rn_order
= IMPORT_ORDER_PREFERRED_2
;
1688 slice
->rn_labelpaths
= B_FALSE
;
1689 pthread_mutex_lock(rn
->rn_lock
);
1690 if (avl_find(rn
->rn_avl
, slice
, &where
)) {
1691 pthread_mutex_unlock(rn
->rn_lock
);
1692 free(slice
->rn_name
);
1695 avl_insert(rn
->rn_avl
, slice
, where
);
1696 pthread_mutex_unlock(rn
->rn_lock
);
1697 zpool_open_func(slice
);
1704 * Given a file descriptor, clear (zero) the label information. This function
1705 * is used in the appliance stack as part of the ZFS sysevent module and
1706 * to implement the "zpool labelclear" command.
1709 zpool_clear_label(int fd
)
1711 struct stat64 statbuf
;
1713 vdev_label_t
*label
;
1716 if (fstat64_blk(fd
, &statbuf
) == -1)
1718 size
= P2ALIGN_TYPED(statbuf
.st_size
, sizeof (vdev_label_t
), uint64_t);
1720 if ((label
= calloc(1, sizeof (vdev_label_t
))) == NULL
)
1723 for (l
= 0; l
< VDEV_LABELS
; l
++) {
1724 if (pwrite64(fd
, label
, sizeof (vdev_label_t
),
1725 label_offset(size
, l
)) != sizeof (vdev_label_t
)) {
1736 zpool_find_import_scan_add_slice(libzfs_handle_t
*hdl
, pthread_mutex_t
*lock
,
1737 avl_tree_t
*cache
, char *path
, const char *name
, int order
)
1742 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1743 if (asprintf(&slice
->rn_name
, "%s/%s", path
, name
) == -1) {
1747 slice
->rn_vdev_guid
= 0;
1748 slice
->rn_lock
= lock
;
1749 slice
->rn_avl
= cache
;
1750 slice
->rn_hdl
= hdl
;
1751 slice
->rn_order
= order
+ IMPORT_ORDER_SCAN_OFFSET
;
1752 slice
->rn_labelpaths
= B_FALSE
;
1754 pthread_mutex_lock(lock
);
1755 if (avl_find(cache
, slice
, &where
)) {
1756 free(slice
->rn_name
);
1759 avl_insert(cache
, slice
, where
);
1761 pthread_mutex_unlock(lock
);
1765 zpool_find_import_scan_dir(libzfs_handle_t
*hdl
, pthread_mutex_t
*lock
,
1766 avl_tree_t
*cache
, char *dir
, int order
)
1769 char path
[MAXPATHLEN
];
1770 struct dirent64
*dp
;
1773 if (realpath(dir
, path
) == NULL
) {
1775 if (error
== ENOENT
)
1778 zfs_error_aux(hdl
, strerror(error
));
1779 (void) zfs_error_fmt(hdl
, EZFS_BADPATH
, dgettext(
1780 TEXT_DOMAIN
, "cannot resolve path '%s'"), dir
);
1784 dirp
= opendir(path
);
1787 zfs_error_aux(hdl
, strerror(error
));
1788 (void) zfs_error_fmt(hdl
, EZFS_BADPATH
,
1789 dgettext(TEXT_DOMAIN
, "cannot open '%s'"), path
);
1793 while ((dp
= readdir64(dirp
)) != NULL
) {
1794 const char *name
= dp
->d_name
;
1795 if (name
[0] == '.' &&
1796 (name
[1] == 0 || (name
[1] == '.' && name
[2] == 0)))
1799 zpool_find_import_scan_add_slice(hdl
, lock
, cache
, path
, name
,
1803 (void) closedir(dirp
);
1808 zpool_find_import_scan_path(libzfs_handle_t
*hdl
, pthread_mutex_t
*lock
,
1809 avl_tree_t
*cache
, char *dir
, int order
)
1812 char path
[MAXPATHLEN
];
1817 * Separate the directory part and last part of the
1818 * path. We do this so that we can get the realpath of
1819 * the directory. We don't get the realpath on the
1820 * whole path because if it's a symlink, we want the
1821 * path of the symlink not where it points to.
1823 d
= zfs_strdup(hdl
, dir
);
1824 b
= zfs_strdup(hdl
, dir
);
1828 if (realpath(dpath
, path
) == NULL
) {
1830 if (error
== ENOENT
) {
1835 zfs_error_aux(hdl
, strerror(error
));
1836 (void) zfs_error_fmt(hdl
, EZFS_BADPATH
, dgettext(
1837 TEXT_DOMAIN
, "cannot resolve path '%s'"), dir
);
1841 zpool_find_import_scan_add_slice(hdl
, lock
, cache
, path
, name
, order
);
1850 * Scan a list of directories for zfs devices.
1853 zpool_find_import_scan(libzfs_handle_t
*hdl
, pthread_mutex_t
*lock
,
1854 avl_tree_t
**slice_cache
, char **dir
, int dirs
)
1861 *slice_cache
= NULL
;
1862 cache
= zfs_alloc(hdl
, sizeof (avl_tree_t
));
1863 avl_create(cache
, slice_cache_compare
, sizeof (rdsk_node_t
),
1864 offsetof(rdsk_node_t
, rn_node
));
1866 for (i
= 0; i
< dirs
; i
++) {
1869 if (stat(dir
[i
], &sbuf
) != 0) {
1871 if (error
== ENOENT
)
1874 zfs_error_aux(hdl
, strerror(error
));
1875 (void) zfs_error_fmt(hdl
, EZFS_BADPATH
, dgettext(
1876 TEXT_DOMAIN
, "cannot resolve path '%s'"), dir
[i
]);
1881 * If dir[i] is a directory, we walk through it and add all
1882 * the entries to the cache. If it's not a directory, we just
1883 * add it to the cache.
1885 if (S_ISDIR(sbuf
.st_mode
)) {
1886 if ((error
= zpool_find_import_scan_dir(hdl
, lock
,
1887 cache
, dir
[i
], i
)) != 0)
1890 if ((error
= zpool_find_import_scan_path(hdl
, lock
,
1891 cache
, dir
[i
], i
)) != 0)
1896 *slice_cache
= cache
;
1901 while ((slice
= avl_destroy_nodes(cache
, &cookie
)) != NULL
) {
1902 free(slice
->rn_name
);
1911 * Use libblkid to quickly enumerate all known zfs devices.
1914 zpool_find_import_blkid(libzfs_handle_t
*hdl
, pthread_mutex_t
*lock
,
1915 avl_tree_t
**slice_cache
)
1919 blkid_dev_iterate iter
;
1924 *slice_cache
= NULL
;
1926 error
= blkid_get_cache(&cache
, NULL
);
1930 error
= blkid_probe_all_new(cache
);
1932 blkid_put_cache(cache
);
1936 iter
= blkid_dev_iterate_begin(cache
);
1938 blkid_put_cache(cache
);
1942 error
= blkid_dev_set_search(iter
, "TYPE", "zfs_member");
1944 blkid_dev_iterate_end(iter
);
1945 blkid_put_cache(cache
);
1949 *slice_cache
= zfs_alloc(hdl
, sizeof (avl_tree_t
));
1950 avl_create(*slice_cache
, slice_cache_compare
, sizeof (rdsk_node_t
),
1951 offsetof(rdsk_node_t
, rn_node
));
1953 while (blkid_dev_next(iter
, &dev
) == 0) {
1954 slice
= zfs_alloc(hdl
, sizeof (rdsk_node_t
));
1955 slice
->rn_name
= zfs_strdup(hdl
, blkid_dev_devname(dev
));
1956 slice
->rn_vdev_guid
= 0;
1957 slice
->rn_lock
= lock
;
1958 slice
->rn_avl
= *slice_cache
;
1959 slice
->rn_hdl
= hdl
;
1960 slice
->rn_labelpaths
= B_TRUE
;
1962 error
= zfs_path_order(slice
->rn_name
, &slice
->rn_order
);
1964 slice
->rn_order
+= IMPORT_ORDER_SCAN_OFFSET
;
1966 slice
->rn_order
= IMPORT_ORDER_DEFAULT
;
1968 pthread_mutex_lock(lock
);
1969 if (avl_find(*slice_cache
, slice
, &where
)) {
1970 free(slice
->rn_name
);
1973 avl_insert(*slice_cache
, slice
, where
);
1975 pthread_mutex_unlock(lock
);
1978 blkid_dev_iterate_end(iter
);
1979 blkid_put_cache(cache
);
1985 zpool_default_import_path
[DEFAULT_IMPORT_PATH_SIZE
] = {
1986 "/dev/disk/by-vdev", /* Custom rules, use first if they exist */
1987 "/dev/mapper", /* Use multipath devices before components */
1988 "/dev/disk/by-partlabel", /* Single unique entry set by user */
1989 "/dev/disk/by-partuuid", /* Generated partition uuid */
1990 "/dev/disk/by-label", /* Custom persistent labels */
1991 "/dev/disk/by-uuid", /* Single unique entry and persistent */
1992 "/dev/disk/by-id", /* May be multiple entries and persistent */
1993 "/dev/disk/by-path", /* Encodes physical location and persistent */
1994 "/dev" /* UNSAFE device names will change */
1998 * Given a list of directories to search, find all pools stored on disk. This
1999 * includes partial pools which are not available to import. If no args are
2000 * given (argc is 0), then the default import search paths listed above are used.
2001 * poolname or guid (but not both) are provided by the caller when trying
2002 * to import a specific pool.
2005 zpool_find_import_impl(libzfs_handle_t
*hdl
, importargs_t
*iarg
)
2007 nvlist_t
*ret
= NULL
;
2008 pool_list_t pools
= { 0 };
2009 pool_entry_t
*pe
, *penext
;
2010 vdev_entry_t
*ve
, *venext
;
2011 config_entry_t
*ce
, *cenext
;
2012 name_entry_t
*ne
, *nenext
;
2013 pthread_mutex_t lock
;
2019 verify(iarg
->poolname
== NULL
|| iarg
->guid
== 0);
2020 pthread_mutex_init(&lock
, NULL
);
2023 * Locate pool member vdevs using libblkid or by directory scanning.
2024 * On success a newly allocated AVL tree which is populated with an
2025 * entry for each discovered vdev will be returned as the cache.
2026 * It's the caller's responsibility to consume and destroy this tree.
2028 if (iarg
->scan
|| iarg
->paths
!= 0) {
2029 int dirs
= iarg
->paths
;
2030 char **dir
= iarg
->path
;
2033 dir
= zpool_default_import_path
;
2034 dirs
= DEFAULT_IMPORT_PATH_SIZE
;
2037 if (zpool_find_import_scan(hdl
, &lock
, &cache
, dir
, dirs
) != 0)
2040 if (zpool_find_import_blkid(hdl
, &lock
, &cache
) != 0)
2045 * Create a thread pool to parallelize the process of reading and
2046 * validating labels, a large number of threads can be used due to
2047 * minimal contention.
2049 t
= tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN
), 0, NULL
);
2050 for (slice
= avl_first(cache
); slice
;
2051 (slice
= avl_walk(cache
, slice
, AVL_AFTER
)))
2052 (void) tpool_dispatch(t
, zpool_open_func
, slice
);
2058 * Process the cache filtering out any entries which are not
2059 * for the specified pool, then adding matching label configs.
2062 while ((slice
= avl_destroy_nodes(cache
, &cookie
)) != NULL
) {
2063 if (slice
->rn_config
!= NULL
) {
2064 nvlist_t
*config
= slice
->rn_config
;
2065 boolean_t matched
= B_TRUE
;
2066 boolean_t aux
= B_FALSE
;
2070 * Check if it's a spare or l2cache device. If it is,
2071 * we need to skip the name and guid check since they
2072 * don't exist on aux device label.
2074 if (iarg
->poolname
!= NULL
|| iarg
->guid
!= 0) {
2076 aux
= nvlist_lookup_uint64(config
,
2077 ZPOOL_CONFIG_POOL_STATE
, &state
) == 0 &&
2078 (state
== POOL_STATE_SPARE
||
2079 state
== POOL_STATE_L2CACHE
);
2082 if (iarg
->poolname
!= NULL
&& !aux
) {
2085 matched
= nvlist_lookup_string(config
,
2086 ZPOOL_CONFIG_POOL_NAME
, &pname
) == 0 &&
2087 strcmp(iarg
->poolname
, pname
) == 0;
2088 } else if (iarg
->guid
!= 0 && !aux
) {
2091 matched
= nvlist_lookup_uint64(config
,
2092 ZPOOL_CONFIG_POOL_GUID
, &this_guid
) == 0 &&
2093 iarg
->guid
== this_guid
;
2097 * Verify all remaining entries can be opened
2098 * exclusively. This will prune all underlying
2099 * multipath devices which otherwise could
2100 * result in the vdev appearing as UNAVAIL.
2102 * Under zdb, this step isn't required and
2103 * would prevent a zdb -e of active pools with
2106 fd
= open(slice
->rn_name
, O_RDONLY
| O_EXCL
);
2107 if (fd
>= 0 || iarg
->can_be_active
) {
2110 add_config(hdl
, &pools
,
2111 slice
->rn_name
, slice
->rn_order
,
2112 slice
->rn_num_labels
, config
);
2115 nvlist_free(config
);
2117 free(slice
->rn_name
);
2122 pthread_mutex_destroy(&lock
);
2124 ret
= get_configs(hdl
, &pools
, iarg
->can_be_active
, iarg
->policy
);
2126 for (pe
= pools
.pools
; pe
!= NULL
; pe
= penext
) {
2127 penext
= pe
->pe_next
;
2128 for (ve
= pe
->pe_vdevs
; ve
!= NULL
; ve
= venext
) {
2129 venext
= ve
->ve_next
;
2130 for (ce
= ve
->ve_configs
; ce
!= NULL
; ce
= cenext
) {
2131 cenext
= ce
->ce_next
;
2132 nvlist_free(ce
->ce_config
);
2140 for (ne
= pools
.names
; ne
!= NULL
; ne
= nenext
) {
2141 nenext
= ne
->ne_next
;
2150 zpool_find_import(libzfs_handle_t
*hdl
, int argc
, char **argv
)
2152 importargs_t iarg
= { 0 };
2157 return (zpool_find_import_impl(hdl
, &iarg
));
2161 * Given a cache file, return the contents as a list of importable pools.
2162 * poolname or guid (but not both) are provided by the caller when trying
2163 * to import a specific pool.
2166 zpool_find_import_cached(libzfs_handle_t
*hdl
, const char *cachefile
,
2167 char *poolname
, uint64_t guid
)
2171 struct stat64 statbuf
;
2172 nvlist_t
*raw
, *src
, *dst
;
2179 verify(poolname
== NULL
|| guid
== 0);
2181 if ((fd
= open(cachefile
, O_RDONLY
)) < 0) {
2182 zfs_error_aux(hdl
, "%s", strerror(errno
));
2183 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2184 dgettext(TEXT_DOMAIN
, "failed to open cache file"));
2188 if (fstat64(fd
, &statbuf
) != 0) {
2189 zfs_error_aux(hdl
, "%s", strerror(errno
));
2191 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2192 dgettext(TEXT_DOMAIN
, "failed to get size of cache file"));
2196 if ((buf
= zfs_alloc(hdl
, statbuf
.st_size
)) == NULL
) {
2201 if (read(fd
, buf
, statbuf
.st_size
) != statbuf
.st_size
) {
2204 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2205 dgettext(TEXT_DOMAIN
,
2206 "failed to read cache file contents"));
2212 if (nvlist_unpack(buf
, statbuf
.st_size
, &raw
, 0) != 0) {
2214 (void) zfs_error(hdl
, EZFS_BADCACHE
,
2215 dgettext(TEXT_DOMAIN
,
2216 "invalid or corrupt cache file contents"));
2223 * Go through and get the current state of the pools and refresh their
2226 if (nvlist_alloc(&pools
, 0, 0) != 0) {
2227 (void) no_memory(hdl
);
2233 while ((elem
= nvlist_next_nvpair(raw
, elem
)) != NULL
) {
2234 src
= fnvpair_value_nvlist(elem
);
2236 name
= fnvlist_lookup_string(src
, ZPOOL_CONFIG_POOL_NAME
);
2237 if (poolname
!= NULL
&& strcmp(poolname
, name
) != 0)
2240 this_guid
= fnvlist_lookup_uint64(src
, ZPOOL_CONFIG_POOL_GUID
);
2241 if (guid
!= 0 && guid
!= this_guid
)
2244 if (pool_active(hdl
, name
, this_guid
, &active
) != 0) {
2253 if (nvlist_add_string(src
, ZPOOL_CONFIG_CACHEFILE
,
2255 (void) no_memory(hdl
);
2261 if ((dst
= refresh_config(hdl
, src
)) == NULL
) {
2267 if (nvlist_add_nvlist(pools
, nvpair_name(elem
), dst
) != 0) {
2268 (void) no_memory(hdl
);
2282 name_or_guid_exists(zpool_handle_t
*zhp
, void *data
)
2284 importargs_t
*import
= data
;
2287 if (import
->poolname
!= NULL
) {
2290 verify(nvlist_lookup_string(zhp
->zpool_config
,
2291 ZPOOL_CONFIG_POOL_NAME
, &pool_name
) == 0);
2292 if (strcmp(pool_name
, import
->poolname
) == 0)
2297 verify(nvlist_lookup_uint64(zhp
->zpool_config
,
2298 ZPOOL_CONFIG_POOL_GUID
, &pool_guid
) == 0);
2299 if (pool_guid
== import
->guid
)
2308 zpool_search_import(libzfs_handle_t
*hdl
, importargs_t
*import
)
2310 verify(import
->poolname
== NULL
|| import
->guid
== 0);
2313 import
->exists
= zpool_iter(hdl
, name_or_guid_exists
, import
);
2315 if (import
->cachefile
!= NULL
)
2316 return (zpool_find_import_cached(hdl
, import
->cachefile
,
2317 import
->poolname
, import
->guid
));
2319 return (zpool_find_import_impl(hdl
, import
));
2323 pool_match(nvlist_t
*cfg
, char *tgt
)
2325 uint64_t v
, guid
= strtoull(tgt
, NULL
, 0);
2329 if (nvlist_lookup_uint64(cfg
, ZPOOL_CONFIG_POOL_GUID
, &v
) == 0)
2332 if (nvlist_lookup_string(cfg
, ZPOOL_CONFIG_POOL_NAME
, &s
) == 0)
2333 return (strcmp(s
, tgt
) == 0);
2339 zpool_tryimport(libzfs_handle_t
*hdl
, char *target
, nvlist_t
**configp
,
2343 nvlist_t
*match
= NULL
;
2344 nvlist_t
*config
= NULL
;
2345 char *name
= NULL
, *sepp
= NULL
;
2348 char *targetdup
= strdup(target
);
2352 if ((sepp
= strpbrk(targetdup
, "/@")) != NULL
) {
2357 pools
= zpool_search_import(hdl
, args
);
2359 if (pools
!= NULL
) {
2360 nvpair_t
*elem
= NULL
;
2361 while ((elem
= nvlist_next_nvpair(pools
, elem
)) != NULL
) {
2362 VERIFY0(nvpair_value_nvlist(elem
, &config
));
2363 if (pool_match(config
, targetdup
)) {
2365 if (match
!= NULL
) {
2366 /* multiple matches found */
2370 name
= nvpair_name(elem
);
2377 (void) zfs_error_aux(hdl
, dgettext(TEXT_DOMAIN
,
2384 (void) zfs_error_aux(hdl
, dgettext(TEXT_DOMAIN
,
2385 "%d pools found, use pool GUID\n"), count
);
2397 find_guid(nvlist_t
*nv
, uint64_t guid
)
2403 verify(nvlist_lookup_uint64(nv
, ZPOOL_CONFIG_GUID
, &tmp
) == 0);
2407 if (nvlist_lookup_nvlist_array(nv
, ZPOOL_CONFIG_CHILDREN
,
2408 &child
, &children
) == 0) {
2409 for (c
= 0; c
< children
; c
++)
2410 if (find_guid(child
[c
], guid
))
2417 typedef struct aux_cbdata
{
2418 const char *cb_type
;
2420 zpool_handle_t
*cb_zhp
;
2424 find_aux(zpool_handle_t
*zhp
, void *data
)
2426 aux_cbdata_t
*cbp
= data
;
2432 verify(nvlist_lookup_nvlist(zhp
->zpool_config
, ZPOOL_CONFIG_VDEV_TREE
,
2435 if (nvlist_lookup_nvlist_array(nvroot
, cbp
->cb_type
,
2436 &list
, &count
) == 0) {
2437 for (i
= 0; i
< count
; i
++) {
2438 verify(nvlist_lookup_uint64(list
[i
],
2439 ZPOOL_CONFIG_GUID
, &guid
) == 0);
2440 if (guid
== cbp
->cb_guid
) {
2452 * Determines if the pool is in use. If so, it returns true and the state of
2453 * the pool as well as the name of the pool. Name string is allocated and
2454 * must be freed by the caller.
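 *
 * Illustrative calling sketch (error handling omitted, fd is a hypothetical
 * open device descriptor):
 *
 *	pool_state_t state;
 *	boolean_t inuse;
 *	char *name;
 *
 *	if (zpool_in_use(hdl, fd, &state, &name, &inuse) == 0 && inuse) {
 *		(void) printf("in use by pool '%s'\n", name);
 *		free(name);
 *	}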
2457 zpool_in_use(libzfs_handle_t
*hdl
, int fd
, pool_state_t
*state
, char **namestr
,
2463 uint64_t guid
, vdev_guid
;
2464 zpool_handle_t
*zhp
;
2465 nvlist_t
*pool_config
;
2466 uint64_t stateval
, isspare
;
2467 aux_cbdata_t cb
= { 0 };
2472 if (zpool_read_label(fd
, &config
, NULL
) != 0) {
2473 (void) no_memory(hdl
);
2480 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_STATE
,
2482 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_GUID
,
2485 if (stateval
!= POOL_STATE_SPARE
&& stateval
!= POOL_STATE_L2CACHE
) {
2486 verify(nvlist_lookup_string(config
, ZPOOL_CONFIG_POOL_NAME
,
2488 verify(nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_GUID
,
2493 case POOL_STATE_EXPORTED
:
2495 * A pool with an exported state may in fact be imported
2496 * read-only, so check the in-core state to see if it's
2497 * active and imported read-only. If it is, set
2498 * its state to active.
2500 if (pool_active(hdl
, name
, guid
, &isactive
) == 0 && isactive
&&
2501 (zhp
= zpool_open_canfail(hdl
, name
)) != NULL
) {
2502 if (zpool_get_prop_int(zhp
, ZPOOL_PROP_READONLY
, NULL
))
2503 stateval
= POOL_STATE_ACTIVE
;
2506 * All we needed the zpool handle for is the
2507 * readonly prop check.
2515 case POOL_STATE_ACTIVE
:
2517 * For an active pool, we have to determine if it's really part
2518 * of a currently active pool (in which case the pool will exist
2519 * and the guid will be the same), or whether it's part of an
2520 * active pool that was disconnected without being explicitly
2523 if (pool_active(hdl
, name
, guid
, &isactive
) != 0) {
2524 nvlist_free(config
);
2530 * Because the device may have been removed while
2531 * offlined, we only report it as active if the vdev is
2532 * still present in the config. Otherwise, pretend like
2535 if ((zhp
= zpool_open_canfail(hdl
, name
)) != NULL
&&
2536 (pool_config
= zpool_get_config(zhp
, NULL
))
2540 verify(nvlist_lookup_nvlist(pool_config
,
2541 ZPOOL_CONFIG_VDEV_TREE
, &nvroot
) == 0);
2542 ret
= find_guid(nvroot
, vdev_guid
);
2548 * If this is an active spare within another pool, we
2549 * treat it like an unused hot spare. This allows the
2550 * user to create a pool with a hot spare that is currently
2551 * in use within another pool. Since we return B_TRUE,
2552 * libdiskmgt will continue to prevent generic consumers
2553 * from using the device.
2555 if (ret
&& nvlist_lookup_uint64(config
,
2556 ZPOOL_CONFIG_IS_SPARE
, &isspare
) == 0 && isspare
)
2557 stateval
= POOL_STATE_SPARE
;
2562 stateval
= POOL_STATE_POTENTIALLY_ACTIVE
;
2567 case POOL_STATE_SPARE
:
2569 * For a hot spare, it can be either definitively in use, or
2570 * potentially active. To determine if it's in use, we iterate
2571 * over all pools in the system and search for one with a spare
2572 * with a matching guid.
2574 * Due to the shared nature of spares, we don't actually report
2575 * the potentially active case as in use. This means the user
2576 * can freely create pools on the hot spares of exported pools,
2577 * but to do otherwise makes the resulting code complicated, and
2578 * we end up having to deal with this case anyway.
2581 cb
.cb_guid
= vdev_guid
;
2582 cb
.cb_type
= ZPOOL_CONFIG_SPARES
;
2583 if (zpool_iter(hdl
, find_aux
, &cb
) == 1) {
2584 name
= (char *)zpool_get_name(cb
.cb_zhp
);
2591 case POOL_STATE_L2CACHE
:
2594 * Check if any pool is currently using this l2cache device.
2597 cb
.cb_guid
= vdev_guid
;
2598 cb
.cb_type
= ZPOOL_CONFIG_L2CACHE
;
2599 if (zpool_iter(hdl
, find_aux
, &cb
) == 1) {
2600 name
= (char *)zpool_get_name(cb
.cb_zhp
);
2613 if ((*namestr
= zfs_strdup(hdl
, name
)) == NULL
) {
2615 zpool_close(cb
.cb_zhp
);
2616 nvlist_free(config
);
2619 *state
= (pool_state_t
)stateval
;
2623 zpool_close(cb
.cb_zhp
);
2625 nvlist_free(config
);