]> git.proxmox.com Git - mirror_zfs.git/blob - lib/libzfs/libzfs_import.c
Fix allocation_classes GUID in zpool-features(5)
[mirror_zfs.git] / lib / libzfs / libzfs_import.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25 * Copyright 2015 RackTop Systems.
26 * Copyright (c) 2016, Intel Corporation.
27 */
28
29 /*
30 * Pool import support functions.
31 *
32 * To import a pool, we rely on reading the configuration information from the
33 * ZFS label of each device. If we successfully read the label, then we
34 * organize the configuration information in the following hierarchy:
35 *
36 * pool guid -> toplevel vdev guid -> label txg
37 *
38 * Duplicate entries matching this same tuple will be discarded. Once we have
39 * examined every device, we pick the best label txg config for each toplevel
40 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
41 * update any paths that have changed. Finally, we attempt to import the pool
42 * using our derived config, and record the results.
43 */
44
45 #include <ctype.h>
46 #include <devid.h>
47 #include <dirent.h>
48 #include <errno.h>
49 #include <libintl.h>
50 #include <libgen.h>
51 #ifdef HAVE_LIBUDEV
52 #include <libudev.h>
53 #include <sched.h>
54 #endif
55 #include <stddef.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <sys/stat.h>
59 #include <unistd.h>
60 #include <fcntl.h>
61 #include <sys/vtoc.h>
62 #include <sys/dktp/fdisk.h>
63 #include <sys/efi_partition.h>
64 #include <thread_pool.h>
65 #include <sys/vdev_impl.h>
66 #include <blkid/blkid.h>
67 #include "libzfs.h"
68 #include "libzfs_impl.h"
69 #include <libzfs.h>
70
71 /*
72 * Intermediate structures used to gather configuration information.
73 */
74 typedef struct config_entry {
75 uint64_t ce_txg;
76 nvlist_t *ce_config;
77 struct config_entry *ce_next;
78 } config_entry_t;
79
80 typedef struct vdev_entry {
81 uint64_t ve_guid;
82 config_entry_t *ve_configs;
83 struct vdev_entry *ve_next;
84 } vdev_entry_t;
85
86 typedef struct pool_entry {
87 uint64_t pe_guid;
88 vdev_entry_t *pe_vdevs;
89 struct pool_entry *pe_next;
90 } pool_entry_t;
91
92 typedef struct name_entry {
93 char *ne_name;
94 uint64_t ne_guid;
95 uint64_t ne_order;
96 uint64_t ne_num_labels;
97 struct name_entry *ne_next;
98 } name_entry_t;
99
100 typedef struct pool_list {
101 pool_entry_t *pools;
102 name_entry_t *names;
103 } pool_list_t;
104
105 #define DEV_BYID_PATH "/dev/disk/by-id/"
106
107 /*
108 * Linux persistent device strings for vdev labels
109 *
110 * based on libudev for consistency with libudev disk add/remove events
111 */
112 #ifdef HAVE_LIBUDEV
113
114 typedef struct vdev_dev_strs {
115 char vds_devid[128];
116 char vds_devphys[128];
117 } vdev_dev_strs_t;
118
119 /*
120 * Obtain the persistent device id string (describes what)
121 *
122 * used by ZED vdev matching for auto-{online,expand,replace}
123 */
int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	struct udev_list_entry *entry;
	const char *bus;
	char devbyid[MAXPATHLEN];

	/*
	 * Returns 0 and fills bufptr with a persistent identifier on
	 * success, ENODATA when no suitable identifier could be found.
	 */

	/* The bus based by-id path is preferred */
	bus = udev_device_get_property_value(dev, "ID_BUS");

	if (bus == NULL) {
		const char *dm_uuid;

		/*
		 * For multipath nodes use the persistent uuid based identifier
		 *
		 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
		 */
		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
		if (dm_uuid != NULL) {
			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
			return (0);
		}

		/*
		 * For volumes use the persistent /dev/zvol/dataset identifier
		 */
		entry = udev_device_get_devlinks_list_entry(dev);
		while (entry != NULL) {
			const char *name;

			name = udev_list_entry_get_name(entry);
			if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
				(void) strlcpy(bufptr, name, buflen);
				return (0);
			}
			entry = udev_list_entry_get_next(entry);
		}

		/*
		 * NVME 'by-id' symlinks are similar to bus case
		 */
		struct udev_device *parent;

		parent = udev_device_get_parent_with_subsystem_devtype(dev,
		    "nvme", NULL);
		if (parent != NULL)
			bus = "nvme";	/* continue with bus symlink search */
		else
			return (ENODATA);
	}

	/*
	 * locate the bus specific by-id link
	 *
	 * Build the "/dev/disk/by-id/<bus>-" prefix, then scan the device
	 * links for the first one that matches it.
	 */
	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		const char *name;

		name = udev_list_entry_get_name(entry);
		if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
			/* strip the directory prefix, keep "<bus>-<id>" */
			name += strlen(DEV_BYID_PATH);
			(void) strlcpy(bufptr, name, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}
195
196 /*
197 * Obtain the persistent physical location string (describes where)
198 *
199 * used by ZED vdev matching for auto-{online,expand,replace}
200 */
int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	const char *physpath = NULL;
	struct udev_list_entry *entry;

	/*
	 * Returns 0 and fills bufptr with a physical-location string on
	 * success, ENODATA when none of the fallbacks below produced one.
	 */

	/*
	 * Normal disks use ID_PATH for their physical path.
	 */
	physpath = udev_device_get_property_value(dev, "ID_PATH");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * Device mapper devices are virtual and don't have a physical
	 * path. For them we use ID_VDEV instead, which is setup via the
	 * /etc/vdev_id.conf file. ID_VDEV provides a persistent path
	 * to a virtual device. If you don't have vdev_id.conf setup,
	 * you cannot use multipath autoreplace with device mapper.
	 */
	physpath = udev_device_get_property_value(dev, "ID_VDEV");
	if (physpath != NULL && strlen(physpath) > 0) {
		(void) strlcpy(bufptr, physpath, buflen);
		return (0);
	}

	/*
	 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	/*
	 * For all other devices fallback to using the by-uuid name.
	 */
	entry = udev_device_get_devlinks_list_entry(dev);
	while (entry != NULL) {
		physpath = udev_list_entry_get_name(entry);
		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
			(void) strlcpy(bufptr, physpath, buflen);
			return (0);
		}
		entry = udev_list_entry_get_next(entry);
	}

	return (ENODATA);
}
257
258 boolean_t
259 udev_is_mpath(struct udev_device *dev)
260 {
261 return udev_device_get_property_value(dev, "DM_UUID") &&
262 udev_device_get_property_value(dev, "MPATH_SBIN_PATH");
263 }
264
265 /*
266 * A disk is considered a multipath whole disk when:
267 * DEVNAME key value has "dm-"
268 * DM_NAME key value has "mpath" prefix
269 * DM_UUID key exists
270 * ID_PART_TABLE_TYPE key does not exist or is not gpt
271 */
272 static boolean_t
273 udev_mpath_whole_disk(struct udev_device *dev)
274 {
275 const char *devname, *type, *uuid;
276
277 devname = udev_device_get_property_value(dev, "DEVNAME");
278 type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
279 uuid = udev_device_get_property_value(dev, "DM_UUID");
280
281 if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
282 ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
283 (uuid != NULL)) {
284 return (B_TRUE);
285 }
286
287 return (B_FALSE);
288 }
289
290 /*
291 * Check if a disk is effectively a multipath whole disk
292 */
293 boolean_t
294 is_mpath_whole_disk(const char *path)
295 {
296 struct udev *udev;
297 struct udev_device *dev = NULL;
298 char nodepath[MAXPATHLEN];
299 char *sysname;
300 boolean_t wholedisk = B_FALSE;
301
302 if (realpath(path, nodepath) == NULL)
303 return (B_FALSE);
304 sysname = strrchr(nodepath, '/') + 1;
305 if (strncmp(sysname, "dm-", 3) != 0)
306 return (B_FALSE);
307 if ((udev = udev_new()) == NULL)
308 return (B_FALSE);
309 if ((dev = udev_device_new_from_subsystem_sysname(udev, "block",
310 sysname)) == NULL) {
311 udev_device_unref(dev);
312 return (B_FALSE);
313 }
314
315 wholedisk = udev_mpath_whole_disk(dev);
316
317 udev_device_unref(dev);
318 return (wholedisk);
319 }
320
/*
 * Return non-zero once udev has finished processing the device's rules.
 */
static int
udev_device_is_ready(struct udev_device *dev)
{
#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
	return (udev_device_get_is_initialized(dev));
#else
	/* wait for DEVLINKS property to be initialized */
	return (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
#endif
}
331
/*
 * Wait up to timeout_ms for udev to set up the device node. The device is
 * considered ready when libudev determines it has been initialized, all of
 * the device links have been verified to exist, and it has been allowed to
 * settle. At this point the device can be accessed reliably. Depending on
 * the complexity of the udev rules this process could take several seconds.
 */
int
zpool_label_disk_wait(char *path, int timeout_ms)
{
	struct udev *udev;
	struct udev_device *dev = NULL;
	char nodepath[MAXPATHLEN];
	char *sysname = NULL;
	int ret = ENODEV;
	int settle_ms = 50;	/* device must stay ready this long */
	long sleep_ms = 10;	/* poll interval */
	hrtime_t start, settle;

	if ((udev = udev_new()) == NULL)
		return (ENXIO);

	start = gethrtime();
	settle = 0;

	do {
		/* Resolve the path to a kernel sysname once it exists */
		if (sysname == NULL) {
			if (realpath(path, nodepath) != NULL) {
				sysname = strrchr(nodepath, '/') + 1;
			} else {
				(void) usleep(sleep_ms * MILLISEC);
				continue;
			}
		}

		dev = udev_device_new_from_subsystem_sysname(udev,
		    "block", sysname);
		if ((dev != NULL) && udev_device_is_ready(dev)) {
			struct udev_list_entry *links, *link = NULL;

			ret = 0;
			links = udev_device_get_devlinks_list_entry(dev);

			/*
			 * Every advertised device link must actually exist;
			 * a missing link resets the settle timer.
			 */
			udev_list_entry_foreach(link, links) {
				struct stat64 statbuf;
				const char *name;

				name = udev_list_entry_get_name(link);
				errno = 0;
				if (stat64(name, &statbuf) == 0 && errno == 0)
					continue;

				settle = 0;
				ret = ENODEV;
				break;
			}

			if (ret == 0) {
				if (settle == 0) {
					/* first success; start settling */
					settle = gethrtime();
				} else if (NSEC2MSEC(gethrtime() - settle) >=
				    settle_ms) {
					udev_device_unref(dev);
					break;
				}
			}
		}

		/*
		 * NOTE(review): dev may be NULL here; libudev documents
		 * udev_device_unref(NULL) as a safe no-op — confirm for
		 * the minimum supported libudev version.
		 */
		udev_device_unref(dev);
		(void) usleep(sleep_ms * MILLISEC);

	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);

	udev_unref(udev);

	return (ret);
}
410
411
412 /*
413 * Encode the persistent devices strings
414 * used for the vdev disk label
415 */
416 static int
417 encode_device_strings(const char *path, vdev_dev_strs_t *ds,
418 boolean_t wholedisk)
419 {
420 struct udev *udev;
421 struct udev_device *dev = NULL;
422 char nodepath[MAXPATHLEN];
423 char *sysname;
424 int ret = ENODEV;
425 hrtime_t start;
426
427 if ((udev = udev_new()) == NULL)
428 return (ENXIO);
429
430 /* resolve path to a runtime device node instance */
431 if (realpath(path, nodepath) == NULL)
432 goto no_dev;
433
434 sysname = strrchr(nodepath, '/') + 1;
435
436 /*
437 * Wait up to 3 seconds for udev to set up the device node context
438 */
439 start = gethrtime();
440 do {
441 dev = udev_device_new_from_subsystem_sysname(udev, "block",
442 sysname);
443 if (dev == NULL)
444 goto no_dev;
445 if (udev_device_is_ready(dev))
446 break; /* udev ready */
447
448 udev_device_unref(dev);
449 dev = NULL;
450
451 if (NSEC2MSEC(gethrtime() - start) < 10)
452 (void) sched_yield(); /* yield/busy wait up to 10ms */
453 else
454 (void) usleep(10 * MILLISEC);
455
456 } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
457
458 if (dev == NULL)
459 goto no_dev;
460
461 /*
462 * Only whole disks require extra device strings
463 */
464 if (!wholedisk && !udev_mpath_whole_disk(dev))
465 goto no_dev;
466
467 ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
468 if (ret != 0)
469 goto no_dev_ref;
470
471 /* physical location string (optional) */
472 if (zfs_device_get_physical(dev, ds->vds_devphys,
473 sizeof (ds->vds_devphys)) != 0) {
474 ds->vds_devphys[0] = '\0'; /* empty string --> not available */
475 }
476
477 no_dev_ref:
478 udev_device_unref(dev);
479 no_dev:
480 udev_unref(udev);
481
482 return (ret);
483 }
484
485 /*
486 * Update a leaf vdev's persistent device strings (Linux only)
487 *
488 * - only applies for a dedicated leaf vdev (aka whole disk)
489 * - updated during pool create|add|attach|import
490 * - used for matching device matching during auto-{online,expand,replace}
491 * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
492 * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
493 *
494 * single device node example:
495 * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1'
496 * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
497 *
498 * multipath device node example:
499 * devid: 'dm-uuid-mpath-35000c5006304de3f'
500 *
501 * We also store the enclosure sysfs path for turning on enclosure LEDs
502 * (if applicable):
503 * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
504 */
void
update_vdev_config_dev_strs(nvlist_t *nv)
{
	vdev_dev_strs_t vds;
	char *env, *type, *path;
	uint64_t wholedisk = 0;
	char *upath, *spath;

	/*
	 * For the benefit of legacy ZFS implementations, allow
	 * for opting out of devid strings in the vdev label.
	 *
	 * example use:
	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
	 *
	 * explanation:
	 * Older ZFS on Linux implementations had issues when attempting to
	 * display pool config VDEV names if a "devid" NVP value is present
	 * in the pool's config.
	 *
	 * For example, a pool that originated on illumos platform would
	 * have a devid value in the config and "zpool status" would fail
	 * when listing the config.
	 *
	 * A pool can be stripped of any "devid" values on import or
	 * prevented from adding them on zpool create|add by setting
	 * ZFS_VDEV_DEVID_OPT_OUT.
	 */
	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
	if (env && (strtoul(env, NULL, 0) > 0 ||
	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		return;
	}

	/* Only dedicated leaf "disk" vdevs carry device strings */
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
	    strcmp(type, VDEV_TYPE_DISK) != 0) {
		return;
	}
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		return;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);

	/*
	 * Update device string values in config nvlist
	 */
	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
		if (vds.vds_devphys[0] != '\0') {
			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
			    vds.vds_devphys);
		}

		/* Add enclosure sysfs path (if disk is in an enclosure) */
		upath = zfs_get_underlying_path(path);
		spath = zfs_get_enclosure_sysfs_path(upath);
		if (spath)
			nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
			    spath);
		else
			nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);

		free(upath);
		free(spath);
	} else {
		/* clear out any stale entries */
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
	}
}
577 #else
578
579 boolean_t
580 is_mpath_whole_disk(const char *path)
581 {
582 return (B_FALSE);
583 }
584
/*
 * Wait up to timeout_ms for the device node at the provided path to be
 * created and to settle. At that point the device can be accessed
 * reliably. Depending on the complexity of the udev rules this process
 * could take several seconds.
 */
592 int
593 zpool_label_disk_wait(char *path, int timeout_ms)
594 {
595 int settle_ms = 50;
596 long sleep_ms = 10;
597 hrtime_t start, settle;
598 struct stat64 statbuf;
599
600 start = gethrtime();
601 settle = 0;
602
603 do {
604 errno = 0;
605 if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
606 if (settle == 0)
607 settle = gethrtime();
608 else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
609 return (0);
610 } else if (errno != ENOENT) {
611 return (errno);
612 }
613
614 usleep(sleep_ms * MILLISEC);
615 } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
616
617 return (ENODEV);
618 }
619
/*
 * No-op when built without libudev; persistent device strings (devid,
 * phys_path, enclosure sysfs path) are simply not maintained.
 */
void
update_vdev_config_dev_strs(nvlist_t *nv)
{
}
624
625 #endif /* HAVE_LIBUDEV */
626
627 /*
628 * Go through and fix up any path and/or devid information for the given vdev
629 * configuration.
630 */
/*
 * Recursively walk the vdev tree and, for each leaf, rewrite its
 * ZPOOL_CONFIG_PATH with the best-known name gathered during discovery.
 * Returns 0 on success, -1 on nvlist failure.
 */
static int
fix_paths(nvlist_t *nv, name_entry_t *names)
{
	nvlist_t **child;
	uint_t c, children;
	uint64_t guid;
	name_entry_t *ne, *best;
	char *path;

	/* Interior vdev: recurse into each child */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			if (fix_paths(child[c], names) != 0)
				return (-1);
		return (0);
	}

	/*
	 * This is a leaf (file or disk) vdev. In either case, go through
	 * the name list and see if we find a matching guid. If so, replace
	 * the path and see if we can calculate a new devid.
	 *
	 * There may be multiple names associated with a particular guid, in
	 * which case we have overlapping partitions or multiple paths to the
	 * same disk. In this case we prefer to use the path name which
	 * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we
	 * use the lowest order device which corresponds to the first match
	 * while traversing the ZPOOL_IMPORT_PATH search path.
	 */
	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		path = NULL;

	best = NULL;
	for (ne = names; ne != NULL; ne = ne->ne_next) {
		if (ne->ne_guid == guid) {
			/* With no existing path, the first match wins */
			if (path == NULL) {
				best = ne;
				break;
			}

			/* An exact path match always wins */
			if ((strlen(path) == strlen(ne->ne_name)) &&
			    strncmp(path, ne->ne_name, strlen(path)) == 0) {
				best = ne;
				break;
			}

			if (best == NULL) {
				best = ne;
				continue;
			}

			/* Prefer paths with more vdev labels. */
			if (ne->ne_num_labels > best->ne_num_labels) {
				best = ne;
				continue;
			}

			/* Prefer paths earlier in the search order. */
			if (ne->ne_num_labels == best->ne_num_labels &&
			    ne->ne_order < best->ne_order) {
				best = ne;
				continue;
			}
		}
	}

	if (best == NULL)
		return (0);

	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
		return (-1);

	/* Linux only - update ZPOOL_CONFIG_DEVID and ZPOOL_CONFIG_PHYS_PATH */
	update_vdev_config_dev_strs(nv);

	return (0);
}
709
710 /*
711 * Add the given configuration to the list of known devices.
712 */
/*
 * Add the given label configuration to the pool list, filing it under
 * pool guid -> top-level vdev guid -> label txg, and record the
 * vdev guid -> path mapping used later by fix_paths().
 * Returns 0 on success, -1 on allocation failure.
 */
static int
add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
    int order, int num_labels, nvlist_t *config)
{
	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
	pool_entry_t *pe;
	vdev_entry_t *ve;
	config_entry_t *ce;
	name_entry_t *ne;

	/*
	 * If this is a hot spare not currently in use or level 2 cache
	 * device, add it to the list of names to translate, but don't do
	 * anything else.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &state) == 0 &&
	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
		if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
			return (-1);

		if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
			free(ne);
			return (-1);
		}
		ne->ne_guid = vdev_guid;
		ne->ne_order = order;
		ne->ne_num_labels = num_labels;
		ne->ne_next = pl->names;
		pl->names = ne;

		return (0);
	}

	/*
	 * If we have a valid config but cannot read any of these fields, then
	 * it means we have a half-initialized label. In vdev_label_init()
	 * we write a label with txg == 0 so that we can identify the device
	 * in case the user refers to the same disk later on. If we fail to
	 * create the pool, we'll be left with a label in this state
	 * which should not be considered part of a valid pool.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pool_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
	    &vdev_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
	    &top_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &txg) != 0 || txg == 0) {
		return (0);
	}

	/*
	 * First, see if we know about this pool. If not, then add it to the
	 * list of known pools.
	 */
	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
		if (pe->pe_guid == pool_guid)
			break;
	}

	if (pe == NULL) {
		if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
			return (-1);
		}
		pe->pe_guid = pool_guid;
		pe->pe_next = pl->pools;
		pl->pools = pe;
	}

	/*
	 * Second, see if we know about this toplevel vdev. Add it if its
	 * missing.
	 */
	for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
		if (ve->ve_guid == top_guid)
			break;
	}

	if (ve == NULL) {
		if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
			return (-1);
		}
		ve->ve_guid = top_guid;
		ve->ve_next = pe->pe_vdevs;
		pe->pe_vdevs = ve;
	}

	/*
	 * Third, see if we have a config with a matching transaction group. If
	 * so, then we do nothing. Otherwise, add it to the list of known
	 * configs.
	 */
	for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
		if (ce->ce_txg == txg)
			break;
	}

	if (ce == NULL) {
		if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) {
			return (-1);
		}
		ce->ce_txg = txg;
		/* take a private copy; the caller retains config */
		ce->ce_config = fnvlist_dup(config);
		ce->ce_next = ve->ve_configs;
		ve->ve_configs = ce;
	}

	/*
	 * At this point we've successfully added our config to the list of
	 * known configs. The last thing to do is add the vdev guid -> path
	 * mappings so that we can fix up the configuration as necessary before
	 * doing the import.
	 */
	if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
		return (-1);

	if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
		free(ne);
		return (-1);
	}

	ne->ne_guid = vdev_guid;
	ne->ne_order = order;
	ne->ne_num_labels = num_labels;
	ne->ne_next = pl->names;
	pl->names = ne;

	return (0);
}
845
846 /*
847 * Returns true if the named pool matches the given GUID.
848 */
849 static int
850 pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid,
851 boolean_t *isactive)
852 {
853 zpool_handle_t *zhp;
854 uint64_t theguid;
855
856 if (zpool_open_silent(hdl, name, &zhp) != 0)
857 return (-1);
858
859 if (zhp == NULL) {
860 *isactive = B_FALSE;
861 return (0);
862 }
863
864 verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID,
865 &theguid) == 0);
866
867 zpool_close(zhp);
868
869 *isactive = (theguid == guid);
870 return (0);
871 }
872
873 static nvlist_t *
874 refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
875 {
876 nvlist_t *nvl;
877 zfs_cmd_t zc = {"\0"};
878 int err, dstbuf_size;
879
880 if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0)
881 return (NULL);
882
883 dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4);
884
885 if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) {
886 zcmd_free_nvlists(&zc);
887 return (NULL);
888 }
889
890 while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT,
891 &zc)) != 0 && errno == ENOMEM) {
892 if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
893 zcmd_free_nvlists(&zc);
894 return (NULL);
895 }
896 }
897
898 if (err) {
899 zcmd_free_nvlists(&zc);
900 return (NULL);
901 }
902
903 if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) {
904 zcmd_free_nvlists(&zc);
905 return (NULL);
906 }
907
908 zcmd_free_nvlists(&zc);
909 return (nvl);
910 }
911
912 /*
913 * Determine if the vdev id is a hole in the namespace.
914 */
915 boolean_t
916 vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
917 {
918 int c;
919
920 for (c = 0; c < holes; c++) {
921
922 /* Top-level is a hole */
923 if (hole_array[c] == id)
924 return (B_TRUE);
925 }
926 return (B_FALSE);
927 }
928
929 /*
930 * Convert our list of pools into the definitive set of configurations. We
931 * start by picking the best config for each toplevel vdev. Once that's done,
932 * we assemble the toplevel vdevs into a full config for the pool. We make a
933 * pass to fix up any incorrect paths, and then add it to the main list to
934 * return to the user.
935 */
936 static nvlist_t *
937 get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
938 nvlist_t *policy)
939 {
940 pool_entry_t *pe;
941 vdev_entry_t *ve;
942 config_entry_t *ce;
943 nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
944 nvlist_t **spares, **l2cache;
945 uint_t i, nspares, nl2cache;
946 boolean_t config_seen;
947 uint64_t best_txg;
948 char *name, *hostname = NULL;
949 uint64_t guid;
950 uint_t children = 0;
951 nvlist_t **child = NULL;
952 uint_t holes;
953 uint64_t *hole_array, max_id;
954 uint_t c;
955 boolean_t isactive;
956 uint64_t hostid;
957 nvlist_t *nvl;
958 boolean_t valid_top_config = B_FALSE;
959
960 if (nvlist_alloc(&ret, 0, 0) != 0)
961 goto nomem;
962
963 for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
964 uint64_t id, max_txg = 0;
965
966 if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
967 goto nomem;
968 config_seen = B_FALSE;
969
970 /*
971 * Iterate over all toplevel vdevs. Grab the pool configuration
972 * from the first one we find, and then go through the rest and
973 * add them as necessary to the 'vdevs' member of the config.
974 */
975 for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
976
977 /*
978 * Determine the best configuration for this vdev by
979 * selecting the config with the latest transaction
980 * group.
981 */
982 best_txg = 0;
983 for (ce = ve->ve_configs; ce != NULL;
984 ce = ce->ce_next) {
985
986 if (ce->ce_txg > best_txg) {
987 tmp = ce->ce_config;
988 best_txg = ce->ce_txg;
989 }
990 }
991
992 /*
993 * We rely on the fact that the max txg for the
994 * pool will contain the most up-to-date information
995 * about the valid top-levels in the vdev namespace.
996 */
997 if (best_txg > max_txg) {
998 (void) nvlist_remove(config,
999 ZPOOL_CONFIG_VDEV_CHILDREN,
1000 DATA_TYPE_UINT64);
1001 (void) nvlist_remove(config,
1002 ZPOOL_CONFIG_HOLE_ARRAY,
1003 DATA_TYPE_UINT64_ARRAY);
1004
1005 max_txg = best_txg;
1006 hole_array = NULL;
1007 holes = 0;
1008 max_id = 0;
1009 valid_top_config = B_FALSE;
1010
1011 if (nvlist_lookup_uint64(tmp,
1012 ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
1013 verify(nvlist_add_uint64(config,
1014 ZPOOL_CONFIG_VDEV_CHILDREN,
1015 max_id) == 0);
1016 valid_top_config = B_TRUE;
1017 }
1018
1019 if (nvlist_lookup_uint64_array(tmp,
1020 ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
1021 &holes) == 0) {
1022 verify(nvlist_add_uint64_array(config,
1023 ZPOOL_CONFIG_HOLE_ARRAY,
1024 hole_array, holes) == 0);
1025 }
1026 }
1027
1028 if (!config_seen) {
1029 /*
1030 * Copy the relevant pieces of data to the pool
1031 * configuration:
1032 *
1033 * version
1034 * pool guid
1035 * name
1036 * comment (if available)
1037 * pool state
1038 * hostid (if available)
1039 * hostname (if available)
1040 */
1041 uint64_t state, version;
1042 char *comment = NULL;
1043
1044 version = fnvlist_lookup_uint64(tmp,
1045 ZPOOL_CONFIG_VERSION);
1046 fnvlist_add_uint64(config,
1047 ZPOOL_CONFIG_VERSION, version);
1048 guid = fnvlist_lookup_uint64(tmp,
1049 ZPOOL_CONFIG_POOL_GUID);
1050 fnvlist_add_uint64(config,
1051 ZPOOL_CONFIG_POOL_GUID, guid);
1052 name = fnvlist_lookup_string(tmp,
1053 ZPOOL_CONFIG_POOL_NAME);
1054 fnvlist_add_string(config,
1055 ZPOOL_CONFIG_POOL_NAME, name);
1056
1057 if (nvlist_lookup_string(tmp,
1058 ZPOOL_CONFIG_COMMENT, &comment) == 0)
1059 fnvlist_add_string(config,
1060 ZPOOL_CONFIG_COMMENT, comment);
1061
1062 state = fnvlist_lookup_uint64(tmp,
1063 ZPOOL_CONFIG_POOL_STATE);
1064 fnvlist_add_uint64(config,
1065 ZPOOL_CONFIG_POOL_STATE, state);
1066
1067 hostid = 0;
1068 if (nvlist_lookup_uint64(tmp,
1069 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
1070 fnvlist_add_uint64(config,
1071 ZPOOL_CONFIG_HOSTID, hostid);
1072 hostname = fnvlist_lookup_string(tmp,
1073 ZPOOL_CONFIG_HOSTNAME);
1074 fnvlist_add_string(config,
1075 ZPOOL_CONFIG_HOSTNAME, hostname);
1076 }
1077
1078 config_seen = B_TRUE;
1079 }
1080
1081 /*
1082 * Add this top-level vdev to the child array.
1083 */
1084 verify(nvlist_lookup_nvlist(tmp,
1085 ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
1086 verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
1087 &id) == 0);
1088
1089 if (id >= children) {
1090 nvlist_t **newchild;
1091
1092 newchild = zfs_alloc(hdl, (id + 1) *
1093 sizeof (nvlist_t *));
1094 if (newchild == NULL)
1095 goto nomem;
1096
1097 for (c = 0; c < children; c++)
1098 newchild[c] = child[c];
1099
1100 free(child);
1101 child = newchild;
1102 children = id + 1;
1103 }
1104 if (nvlist_dup(nvtop, &child[id], 0) != 0)
1105 goto nomem;
1106
1107 }
1108
1109 /*
1110 * If we have information about all the top-levels then
1111 * clean up the nvlist which we've constructed. This
1112 * means removing any extraneous devices that are
1113 * beyond the valid range or adding devices to the end
1114 * of our array which appear to be missing.
1115 */
1116 if (valid_top_config) {
1117 if (max_id < children) {
1118 for (c = max_id; c < children; c++)
1119 nvlist_free(child[c]);
1120 children = max_id;
1121 } else if (max_id > children) {
1122 nvlist_t **newchild;
1123
1124 newchild = zfs_alloc(hdl, (max_id) *
1125 sizeof (nvlist_t *));
1126 if (newchild == NULL)
1127 goto nomem;
1128
1129 for (c = 0; c < children; c++)
1130 newchild[c] = child[c];
1131
1132 free(child);
1133 child = newchild;
1134 children = max_id;
1135 }
1136 }
1137
1138 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
1139 &guid) == 0);
1140
1141 /*
1142 * The vdev namespace may contain holes as a result of
1143 * device removal. We must add them back into the vdev
1144 * tree before we process any missing devices.
1145 */
1146 if (holes > 0) {
1147 ASSERT(valid_top_config);
1148
1149 for (c = 0; c < children; c++) {
1150 nvlist_t *holey;
1151
1152 if (child[c] != NULL ||
1153 !vdev_is_hole(hole_array, holes, c))
1154 continue;
1155
1156 if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
1157 0) != 0)
1158 goto nomem;
1159
1160 /*
1161 * Holes in the namespace are treated as
1162 * "hole" top-level vdevs and have a
1163 * special flag set on them.
1164 */
1165 if (nvlist_add_string(holey,
1166 ZPOOL_CONFIG_TYPE,
1167 VDEV_TYPE_HOLE) != 0 ||
1168 nvlist_add_uint64(holey,
1169 ZPOOL_CONFIG_ID, c) != 0 ||
1170 nvlist_add_uint64(holey,
1171 ZPOOL_CONFIG_GUID, 0ULL) != 0) {
1172 nvlist_free(holey);
1173 goto nomem;
1174 }
1175 child[c] = holey;
1176 }
1177 }
1178
1179 /*
1180 * Look for any missing top-level vdevs. If this is the case,
1181 * create a faked up 'missing' vdev as a placeholder. We cannot
1182 * simply compress the child array, because the kernel performs
1183 * certain checks to make sure the vdev IDs match their location
1184 * in the configuration.
1185 */
1186 for (c = 0; c < children; c++) {
1187 if (child[c] == NULL) {
1188 nvlist_t *missing;
1189 if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
1190 0) != 0)
1191 goto nomem;
1192 if (nvlist_add_string(missing,
1193 ZPOOL_CONFIG_TYPE,
1194 VDEV_TYPE_MISSING) != 0 ||
1195 nvlist_add_uint64(missing,
1196 ZPOOL_CONFIG_ID, c) != 0 ||
1197 nvlist_add_uint64(missing,
1198 ZPOOL_CONFIG_GUID, 0ULL) != 0) {
1199 nvlist_free(missing);
1200 goto nomem;
1201 }
1202 child[c] = missing;
1203 }
1204 }
1205
1206 /*
1207 * Put all of this pool's top-level vdevs into a root vdev.
1208 */
1209 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
1210 goto nomem;
1211 if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1212 VDEV_TYPE_ROOT) != 0 ||
1213 nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
1214 nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
1215 nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1216 child, children) != 0) {
1217 nvlist_free(nvroot);
1218 goto nomem;
1219 }
1220
1221 for (c = 0; c < children; c++)
1222 nvlist_free(child[c]);
1223 free(child);
1224 children = 0;
1225 child = NULL;
1226
1227 /*
1228 * Go through and fix up any paths and/or devids based on our
1229 * known list of vdev GUID -> path mappings.
1230 */
1231 if (fix_paths(nvroot, pl->names) != 0) {
1232 nvlist_free(nvroot);
1233 goto nomem;
1234 }
1235
1236 /*
1237 * Add the root vdev to this pool's configuration.
1238 */
1239 if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1240 nvroot) != 0) {
1241 nvlist_free(nvroot);
1242 goto nomem;
1243 }
1244 nvlist_free(nvroot);
1245
1246 /*
1247 * zdb uses this path to report on active pools that were
1248 * imported or created using -R.
1249 */
1250 if (active_ok)
1251 goto add_pool;
1252
1253 /*
1254 * Determine if this pool is currently active, in which case we
1255 * can't actually import it.
1256 */
1257 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
1258 &name) == 0);
1259 verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
1260 &guid) == 0);
1261
1262 if (pool_active(hdl, name, guid, &isactive) != 0)
1263 goto error;
1264
1265 if (isactive) {
1266 nvlist_free(config);
1267 config = NULL;
1268 continue;
1269 }
1270
1271 if (policy != NULL) {
1272 if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
1273 policy) != 0)
1274 goto nomem;
1275 }
1276
1277 if ((nvl = refresh_config(hdl, config)) == NULL) {
1278 nvlist_free(config);
1279 config = NULL;
1280 continue;
1281 }
1282
1283 nvlist_free(config);
1284 config = nvl;
1285
1286 /*
1287 * Go through and update the paths for spares, now that we have
1288 * them.
1289 */
1290 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1291 &nvroot) == 0);
1292 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1293 &spares, &nspares) == 0) {
1294 for (i = 0; i < nspares; i++) {
1295 if (fix_paths(spares[i], pl->names) != 0)
1296 goto nomem;
1297 }
1298 }
1299
1300 /*
1301 * Update the paths for l2cache devices.
1302 */
1303 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1304 &l2cache, &nl2cache) == 0) {
1305 for (i = 0; i < nl2cache; i++) {
1306 if (fix_paths(l2cache[i], pl->names) != 0)
1307 goto nomem;
1308 }
1309 }
1310
1311 /*
1312 * Restore the original information read from the actual label.
1313 */
1314 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
1315 DATA_TYPE_UINT64);
1316 (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
1317 DATA_TYPE_STRING);
1318 if (hostid != 0) {
1319 verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
1320 hostid) == 0);
1321 verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
1322 hostname) == 0);
1323 }
1324
1325 add_pool:
1326 /*
1327 * Add this pool to the list of configs.
1328 */
1329 verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
1330 &name) == 0);
1331 if (nvlist_add_nvlist(ret, name, config) != 0)
1332 goto nomem;
1333
1334 nvlist_free(config);
1335 config = NULL;
1336 }
1337
1338 return (ret);
1339
1340 nomem:
1341 (void) no_memory(hdl);
1342 error:
1343 nvlist_free(config);
1344 nvlist_free(ret);
1345 for (c = 0; c < children; c++)
1346 nvlist_free(child[c]);
1347 free(child);
1348
1349 return (NULL);
1350 }
1351
1352 /*
1353 * Return the offset of the given label.
1354 */
1355 static uint64_t
1356 label_offset(uint64_t size, int l)
1357 {
1358 ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
1359 return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
1360 0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
1361 }
1362
1363 /*
1364 * Given a file descriptor, read the label information and return an nvlist
1365 * describing the configuration, if there is one. The number of valid
1366 * labels found will be returned in num_labels when non-NULL.
1367 */
int
zpool_read_label(int fd, nvlist_t **config, int *num_labels)
{
	struct stat64 statbuf;
	int l, count = 0;
	vdev_label_t *label;
	nvlist_t *expected_config = NULL;
	uint64_t expected_guid = 0, size;
	int error;

	*config = NULL;

	/* If the descriptor cannot be sized, report zero labels found. */
	if (fstat64_blk(fd, &statbuf) == -1)
		return (0);
	/* Round the device size down to a multiple of the label size. */
	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);

	/* Page-aligned buffer so reads also work on O_DIRECT descriptors. */
	error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
	if (error)
		return (-1);

	/* Examine all labels; count those agreeing on the vdev guid. */
	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t state, guid, txg;

		/* Short or failed read: skip this label slot. */
		if (pread64(fd, label, sizeof (vdev_label_t),
		    label_offset(size, l)) != sizeof (vdev_label_t))
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
			continue;

		/* A label without a non-zero vdev guid is unusable. */
		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid == 0) {
			nvlist_free(*config);
			continue;
		}

		/* Reject labels with an unknown pool state. */
		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			continue;
		}

		/* Aux vdevs (spare/l2cache) legitimately carry no txg. */
		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			continue;
		}

		/*
		 * Keep the first valid config; later labels only bump the
		 * count when their guid matches it, and are then discarded.
		 */
		if (expected_guid) {
			if (expected_guid == guid)
				count++;

			nvlist_free(*config);
		} else {
			expected_config = *config;
			expected_guid = guid;
			count++;
		}
	}

	if (num_labels != NULL)
		*num_labels = count;

	free(label);
	/* Hand the surviving config (possibly NULL) back to the caller. */
	*config = expected_config;

	return (0);
}
1438
/* Cache entry describing one candidate device path to probe for labels. */
typedef struct rdsk_node {
	char *rn_name;			/* Full path to device */
	int rn_order;			/* Preferred order (low to high) */
	int rn_num_labels;		/* Number of valid labels */
	uint64_t rn_vdev_guid;		/* Expected vdev guid when set */
	libzfs_handle_t *rn_hdl;	/* Handle used for allocations */
	nvlist_t *rn_config;		/* Label config */
	avl_tree_t *rn_avl;		/* Cache tree this node belongs to */
	avl_node_t rn_node;		/* AVL linkage */
	pthread_mutex_t *rn_lock;	/* Serializes rn_avl insertions */
	boolean_t rn_labelpaths;	/* Add label-derived paths to cache */
} rdsk_node_t;
1451
1452 /*
1453 * Sorted by vdev guid and full path to allow for multiple entries with
1454 * the same full path name. This is required because it's possible to
1455 * have multiple block devices with labels that refer to the same
1456 * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
1457 * entries need to be added to the cache. Scenarios where this can occur
1458 * include overwritten pool labels, devices which are visible from multiple
1459 * hosts and multipath devices.
1460 */
1461 static int
1462 slice_cache_compare(const void *arg1, const void *arg2)
1463 {
1464 const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
1465 const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
1466 uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
1467 uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
1468 int rv;
1469
1470 rv = AVL_CMP(guid1, guid2);
1471 if (rv)
1472 return (rv);
1473
1474 return (AVL_ISIGN(strcmp(nm1, nm2)));
1475 }
1476
1477 static boolean_t
1478 is_watchdog_dev(char *dev)
1479 {
1480 /* For 'watchdog' dev */
1481 if (strcmp(dev, "watchdog") == 0)
1482 return (B_TRUE);
1483
1484 /* For 'watchdog<digit><whatever> */
1485 if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
1486 return (B_TRUE);
1487
1488 return (B_FALSE);
1489 }
1490
1491 static int
1492 label_paths_impl(libzfs_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
1493 uint64_t vdev_guid, char **path, char **devid)
1494 {
1495 nvlist_t **child;
1496 uint_t c, children;
1497 uint64_t guid;
1498 char *val;
1499 int error;
1500
1501 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1502 &child, &children) == 0) {
1503 for (c = 0; c < children; c++) {
1504 error = label_paths_impl(hdl, child[c],
1505 pool_guid, vdev_guid, path, devid);
1506 if (error)
1507 return (error);
1508 }
1509 return (0);
1510 }
1511
1512 if (nvroot == NULL)
1513 return (0);
1514
1515 error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
1516 if ((error != 0) || (guid != vdev_guid))
1517 return (0);
1518
1519 error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
1520 if (error == 0)
1521 *path = val;
1522
1523 error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
1524 if (error == 0)
1525 *devid = val;
1526
1527 return (0);
1528 }
1529
1530 /*
1531 * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
1532 * and store these strings as config_path and devid_path respectively.
1533 * The returned pointers are only valid as long as label remains valid.
1534 */
1535 static int
1536 label_paths(libzfs_handle_t *hdl, nvlist_t *label, char **path, char **devid)
1537 {
1538 nvlist_t *nvroot;
1539 uint64_t pool_guid;
1540 uint64_t vdev_guid;
1541
1542 *path = NULL;
1543 *devid = NULL;
1544
1545 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1546 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
1547 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid))
1548 return (ENOENT);
1549
1550 return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
1551 devid));
1552 }
1553
/*
 * Thread-pool worker: probe one candidate device for ZFS labels.  On
 * success the label config and valid-label count are stored in the
 * rdsk_node_t.  When rn_labelpaths is set, additional cache entries are
 * speculatively created for the path and devid recorded in the label,
 * and each new entry is probed in turn (with rn_labelpaths cleared, so
 * the recursion is at most one level deep).
 */
static void
zpool_open_func(void *arg)
{
	rdsk_node_t *rn = arg;
	libzfs_handle_t *hdl = rn->rn_hdl;
	struct stat64 statbuf;
	nvlist_t *config;
	char *bname, *dupname;
	uint64_t vdev_guid = 0;
	int error;
	int num_labels;
	int fd;

	/*
	 * Skip devices with well known prefixes there can be side effects
	 * when opening devices which need to be avoided.
	 *
	 * hpet     - High Precision Event Timer
	 * watchdog - Watchdog must be closed in a special way.
	 */
	dupname = zfs_strdup(hdl, rn->rn_name);
	bname = basename(dupname);
	error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
	free(dupname);
	if (error)
		return;

	/*
	 * Ignore failed stats.  We only want regular files and block devices.
	 */
	if (stat64(rn->rn_name, &statbuf) != 0 ||
	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
		return;

	/*
	 * Preferentially open using O_DIRECT to bypass the block device
	 * cache which may be stale for multipath devices.  An EINVAL errno
	 * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
	 */
	fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
	if ((fd < 0) && (errno == EINVAL))
		fd = open(rn->rn_name, O_RDONLY);

	if (fd < 0)
		return;

	/*
	 * This file is too small to hold a zpool
	 */
	if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
		(void) close(fd);
		return;
	}

	error = zpool_read_label(fd, &config, &num_labels);
	if (error != 0) {
		(void) close(fd);
		return;
	}

	/* No valid labels on this device; nothing to record. */
	if (num_labels == 0) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	/*
	 * Check that the vdev is for the expected guid.  Additional entries
	 * are speculatively added based on the paths stored in the labels.
	 * Entries with valid paths but incorrect guids must be removed.
	 */
	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
		(void) close(fd);
		nvlist_free(config);
		return;
	}

	(void) close(fd);

	/* Success: the node now owns the label config. */
	rn->rn_config = config;
	rn->rn_num_labels = num_labels;

	/*
	 * Add additional entries for paths described by this label.
	 */
	if (rn->rn_labelpaths) {
		char *path = NULL;
		char *devid = NULL;
		rdsk_node_t *slice;
		avl_index_t where;
		int error;

		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
			return;

		/*
		 * Allow devlinks to stabilize so all paths are available.
		 */
		zpool_label_disk_wait(rn->rn_name, DISK_LABEL_WAIT);

		/*
		 * NOTE(review): the new slices below never copy rn_lock
		 * from 'rn'.  It is unused on these nodes because their
		 * rn_labelpaths is B_FALSE, but confirm zfs_alloc zeroes
		 * allocations before relying on that field elsewhere.
		 */
		if (path != NULL) {
			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
			slice->rn_name = zfs_strdup(hdl, path);
			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_1;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				/* Duplicate guid/name pair; discard. */
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}

		if (devid != NULL) {
			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
			error = asprintf(&slice->rn_name, "%s%s",
			    DEV_BYID_PATH, devid);
			if (error == -1) {
				free(slice);
				return;
			}

			slice->rn_vdev_guid = vdev_guid;
			slice->rn_avl = rn->rn_avl;
			slice->rn_hdl = hdl;
			slice->rn_order = IMPORT_ORDER_PREFERRED_2;
			slice->rn_labelpaths = B_FALSE;
			pthread_mutex_lock(rn->rn_lock);
			if (avl_find(rn->rn_avl, slice, &where)) {
				/* Duplicate guid/name pair; discard. */
				pthread_mutex_unlock(rn->rn_lock);
				free(slice->rn_name);
				free(slice);
			} else {
				avl_insert(rn->rn_avl, slice, where);
				pthread_mutex_unlock(rn->rn_lock);
				zpool_open_func(slice);
			}
		}
	}
}
1702
1703 /*
1704 * Given a file descriptor, clear (zero) the label information. This function
1705 * is used in the appliance stack as part of the ZFS sysevent module and
1706 * to implement the "zpool labelclear" command.
1707 */
1708 int
1709 zpool_clear_label(int fd)
1710 {
1711 struct stat64 statbuf;
1712 int l;
1713 vdev_label_t *label;
1714 uint64_t size;
1715
1716 if (fstat64_blk(fd, &statbuf) == -1)
1717 return (0);
1718 size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
1719
1720 if ((label = calloc(1, sizeof (vdev_label_t))) == NULL)
1721 return (-1);
1722
1723 for (l = 0; l < VDEV_LABELS; l++) {
1724 if (pwrite64(fd, label, sizeof (vdev_label_t),
1725 label_offset(size, l)) != sizeof (vdev_label_t)) {
1726 free(label);
1727 return (-1);
1728 }
1729 }
1730
1731 free(label);
1732 return (0);
1733 }
1734
1735 static void
1736 zpool_find_import_scan_add_slice(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1737 avl_tree_t *cache, char *path, const char *name, int order)
1738 {
1739 avl_index_t where;
1740 rdsk_node_t *slice;
1741
1742 slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1743 if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
1744 free(slice);
1745 return;
1746 }
1747 slice->rn_vdev_guid = 0;
1748 slice->rn_lock = lock;
1749 slice->rn_avl = cache;
1750 slice->rn_hdl = hdl;
1751 slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
1752 slice->rn_labelpaths = B_FALSE;
1753
1754 pthread_mutex_lock(lock);
1755 if (avl_find(cache, slice, &where)) {
1756 free(slice->rn_name);
1757 free(slice);
1758 } else {
1759 avl_insert(cache, slice, where);
1760 }
1761 pthread_mutex_unlock(lock);
1762 }
1763
1764 static int
1765 zpool_find_import_scan_dir(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1766 avl_tree_t *cache, char *dir, int order)
1767 {
1768 int error;
1769 char path[MAXPATHLEN];
1770 struct dirent64 *dp;
1771 DIR *dirp;
1772
1773 if (realpath(dir, path) == NULL) {
1774 error = errno;
1775 if (error == ENOENT)
1776 return (0);
1777
1778 zfs_error_aux(hdl, strerror(error));
1779 (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1780 TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
1781 return (error);
1782 }
1783
1784 dirp = opendir(path);
1785 if (dirp == NULL) {
1786 error = errno;
1787 zfs_error_aux(hdl, strerror(error));
1788 (void) zfs_error_fmt(hdl, EZFS_BADPATH,
1789 dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
1790 return (error);
1791 }
1792
1793 while ((dp = readdir64(dirp)) != NULL) {
1794 const char *name = dp->d_name;
1795 if (name[0] == '.' &&
1796 (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
1797 continue;
1798
1799 zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
1800 order);
1801 }
1802
1803 (void) closedir(dirp);
1804 return (0);
1805 }
1806
1807 static int
1808 zpool_find_import_scan_path(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1809 avl_tree_t *cache, char *dir, int order)
1810 {
1811 int error = 0;
1812 char path[MAXPATHLEN];
1813 char *d, *b;
1814 char *dpath, *name;
1815
1816 /*
1817 * Seperate the directory part and last part of the
1818 * path. We do this so that we can get the realpath of
1819 * the directory. We don't get the realpath on the
1820 * whole path because if it's a symlink, we want the
1821 * path of the symlink not where it points to.
1822 */
1823 d = zfs_strdup(hdl, dir);
1824 b = zfs_strdup(hdl, dir);
1825 dpath = dirname(d);
1826 name = basename(b);
1827
1828 if (realpath(dpath, path) == NULL) {
1829 error = errno;
1830 if (error == ENOENT) {
1831 error = 0;
1832 goto out;
1833 }
1834
1835 zfs_error_aux(hdl, strerror(error));
1836 (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1837 TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
1838 goto out;
1839 }
1840
1841 zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);
1842
1843 out:
1844 free(b);
1845 free(d);
1846 return (error);
1847 }
1848
1849 /*
1850 * Scan a list of directories for zfs devices.
1851 */
1852 static int
1853 zpool_find_import_scan(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1854 avl_tree_t **slice_cache, char **dir, int dirs)
1855 {
1856 avl_tree_t *cache;
1857 rdsk_node_t *slice;
1858 void *cookie;
1859 int i, error;
1860
1861 *slice_cache = NULL;
1862 cache = zfs_alloc(hdl, sizeof (avl_tree_t));
1863 avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
1864 offsetof(rdsk_node_t, rn_node));
1865
1866 for (i = 0; i < dirs; i++) {
1867 struct stat sbuf;
1868
1869 if (stat(dir[i], &sbuf) != 0) {
1870 error = errno;
1871 if (error == ENOENT)
1872 continue;
1873
1874 zfs_error_aux(hdl, strerror(error));
1875 (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(
1876 TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
1877 goto error;
1878 }
1879
1880 /*
1881 * If dir[i] is a directory, we walk through it and add all
1882 * the entry to the cache. If it's not a directory, we just
1883 * add it to the cache.
1884 */
1885 if (S_ISDIR(sbuf.st_mode)) {
1886 if ((error = zpool_find_import_scan_dir(hdl, lock,
1887 cache, dir[i], i)) != 0)
1888 goto error;
1889 } else {
1890 if ((error = zpool_find_import_scan_path(hdl, lock,
1891 cache, dir[i], i)) != 0)
1892 goto error;
1893 }
1894 }
1895
1896 *slice_cache = cache;
1897 return (0);
1898
1899 error:
1900 cookie = NULL;
1901 while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
1902 free(slice->rn_name);
1903 free(slice);
1904 }
1905 free(cache);
1906
1907 return (error);
1908 }
1909
1910 /*
1911 * Use libblkid to quickly enumerate all known zfs devices.
1912 */
1913 static int
1914 zpool_find_import_blkid(libzfs_handle_t *hdl, pthread_mutex_t *lock,
1915 avl_tree_t **slice_cache)
1916 {
1917 rdsk_node_t *slice;
1918 blkid_cache cache;
1919 blkid_dev_iterate iter;
1920 blkid_dev dev;
1921 avl_index_t where;
1922 int error;
1923
1924 *slice_cache = NULL;
1925
1926 error = blkid_get_cache(&cache, NULL);
1927 if (error != 0)
1928 return (error);
1929
1930 error = blkid_probe_all_new(cache);
1931 if (error != 0) {
1932 blkid_put_cache(cache);
1933 return (error);
1934 }
1935
1936 iter = blkid_dev_iterate_begin(cache);
1937 if (iter == NULL) {
1938 blkid_put_cache(cache);
1939 return (EINVAL);
1940 }
1941
1942 error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
1943 if (error != 0) {
1944 blkid_dev_iterate_end(iter);
1945 blkid_put_cache(cache);
1946 return (error);
1947 }
1948
1949 *slice_cache = zfs_alloc(hdl, sizeof (avl_tree_t));
1950 avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
1951 offsetof(rdsk_node_t, rn_node));
1952
1953 while (blkid_dev_next(iter, &dev) == 0) {
1954 slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
1955 slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
1956 slice->rn_vdev_guid = 0;
1957 slice->rn_lock = lock;
1958 slice->rn_avl = *slice_cache;
1959 slice->rn_hdl = hdl;
1960 slice->rn_labelpaths = B_TRUE;
1961
1962 error = zfs_path_order(slice->rn_name, &slice->rn_order);
1963 if (error == 0)
1964 slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
1965 else
1966 slice->rn_order = IMPORT_ORDER_DEFAULT;
1967
1968 pthread_mutex_lock(lock);
1969 if (avl_find(*slice_cache, slice, &where)) {
1970 free(slice->rn_name);
1971 free(slice);
1972 } else {
1973 avl_insert(*slice_cache, slice, where);
1974 }
1975 pthread_mutex_unlock(lock);
1976 }
1977
1978 blkid_dev_iterate_end(iter);
1979 blkid_put_cache(cache);
1980
1981 return (0);
1982 }
1983
/*
 * Default device search paths, ordered most preferred first; entries
 * earlier in the list win when a vdev is visible under several names.
 */
char *
zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
	"/dev/mapper",		/* Use multipath devices before components */
	"/dev/disk/by-partlabel", /* Single unique entry set by user */
	"/dev/disk/by-partuuid", /* Generated partition uuid */
	"/dev/disk/by-label",	/* Custom persistent labels */
	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
	"/dev/disk/by-id",	/* May be multiple entries and persistent */
	"/dev/disk/by-path",	/* Encodes physical location and persistent */
	"/dev"			/* UNSAFE device names will change */
};
1996
1997 /*
1998 * Given a list of directories to search, find all pools stored on disk. This
1999 * includes partial pools which are not available to import. If no args are
2000 * given (argc is 0), then the default directory (/dev/dsk) is searched.
2001 * poolname or guid (but not both) are provided by the caller when trying
2002 * to import a specific pool.
2003 */
2004 static nvlist_t *
2005 zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
2006 {
2007 nvlist_t *ret = NULL;
2008 pool_list_t pools = { 0 };
2009 pool_entry_t *pe, *penext;
2010 vdev_entry_t *ve, *venext;
2011 config_entry_t *ce, *cenext;
2012 name_entry_t *ne, *nenext;
2013 pthread_mutex_t lock;
2014 avl_tree_t *cache;
2015 rdsk_node_t *slice;
2016 void *cookie;
2017 tpool_t *t;
2018
2019 verify(iarg->poolname == NULL || iarg->guid == 0);
2020 pthread_mutex_init(&lock, NULL);
2021
2022 /*
2023 * Locate pool member vdevs using libblkid or by directory scanning.
2024 * On success a newly allocated AVL tree which is populated with an
2025 * entry for each discovered vdev will be returned as the cache.
2026 * It's the callers responsibility to consume and destroy this tree.
2027 */
2028 if (iarg->scan || iarg->paths != 0) {
2029 int dirs = iarg->paths;
2030 char **dir = iarg->path;
2031
2032 if (dirs == 0) {
2033 dir = zpool_default_import_path;
2034 dirs = DEFAULT_IMPORT_PATH_SIZE;
2035 }
2036
2037 if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0)
2038 return (NULL);
2039 } else {
2040 if (zpool_find_import_blkid(hdl, &lock, &cache) != 0)
2041 return (NULL);
2042 }
2043
2044 /*
2045 * Create a thread pool to parallelize the process of reading and
2046 * validating labels, a large number of threads can be used due to
2047 * minimal contention.
2048 */
2049 t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
2050 for (slice = avl_first(cache); slice;
2051 (slice = avl_walk(cache, slice, AVL_AFTER)))
2052 (void) tpool_dispatch(t, zpool_open_func, slice);
2053
2054 tpool_wait(t);
2055 tpool_destroy(t);
2056
2057 /*
2058 * Process the cache filtering out any entries which are not
2059 * for the specificed pool then adding matching label configs.
2060 */
2061 cookie = NULL;
2062 while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
2063 if (slice->rn_config != NULL) {
2064 nvlist_t *config = slice->rn_config;
2065 boolean_t matched = B_TRUE;
2066 boolean_t aux = B_FALSE;
2067 int fd;
2068
2069 /*
2070 * Check if it's a spare or l2cache device. If it is,
2071 * we need to skip the name and guid check since they
2072 * don't exist on aux device label.
2073 */
2074 if (iarg->poolname != NULL || iarg->guid != 0) {
2075 uint64_t state;
2076 aux = nvlist_lookup_uint64(config,
2077 ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
2078 (state == POOL_STATE_SPARE ||
2079 state == POOL_STATE_L2CACHE);
2080 }
2081
2082 if (iarg->poolname != NULL && !aux) {
2083 char *pname;
2084
2085 matched = nvlist_lookup_string(config,
2086 ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
2087 strcmp(iarg->poolname, pname) == 0;
2088 } else if (iarg->guid != 0 && !aux) {
2089 uint64_t this_guid;
2090
2091 matched = nvlist_lookup_uint64(config,
2092 ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
2093 iarg->guid == this_guid;
2094 }
2095 if (matched) {
2096 /*
2097 * Verify all remaining entries can be opened
2098 * exclusively. This will prune all underlying
2099 * multipath devices which otherwise could
2100 * result in the vdev appearing as UNAVAIL.
2101 *
2102 * Under zdb, this step isn't required and
2103 * would prevent a zdb -e of active pools with
2104 * no cachefile.
2105 */
2106 fd = open(slice->rn_name, O_RDONLY | O_EXCL);
2107 if (fd >= 0 || iarg->can_be_active) {
2108 if (fd >= 0)
2109 close(fd);
2110 add_config(hdl, &pools,
2111 slice->rn_name, slice->rn_order,
2112 slice->rn_num_labels, config);
2113 }
2114 }
2115 nvlist_free(config);
2116 }
2117 free(slice->rn_name);
2118 free(slice);
2119 }
2120 avl_destroy(cache);
2121 free(cache);
2122 pthread_mutex_destroy(&lock);
2123
2124 ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
2125
2126 for (pe = pools.pools; pe != NULL; pe = penext) {
2127 penext = pe->pe_next;
2128 for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
2129 venext = ve->ve_next;
2130 for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
2131 cenext = ce->ce_next;
2132 nvlist_free(ce->ce_config);
2133 free(ce);
2134 }
2135 free(ve);
2136 }
2137 free(pe);
2138 }
2139
2140 for (ne = pools.names; ne != NULL; ne = nenext) {
2141 nenext = ne->ne_next;
2142 free(ne->ne_name);
2143 free(ne);
2144 }
2145
2146 return (ret);
2147 }
2148
2149 nvlist_t *
2150 zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
2151 {
2152 importargs_t iarg = { 0 };
2153
2154 iarg.paths = argc;
2155 iarg.path = argv;
2156
2157 return (zpool_find_import_impl(hdl, &iarg));
2158 }
2159
2160 /*
2161 * Given a cache file, return the contents as a list of importable pools.
2162 * poolname or guid (but not both) are provided by the caller when trying
2163 * to import a specific pool.
2164 */
2165 nvlist_t *
2166 zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
2167 char *poolname, uint64_t guid)
2168 {
2169 char *buf;
2170 int fd;
2171 struct stat64 statbuf;
2172 nvlist_t *raw, *src, *dst;
2173 nvlist_t *pools;
2174 nvpair_t *elem;
2175 char *name;
2176 uint64_t this_guid;
2177 boolean_t active;
2178
2179 verify(poolname == NULL || guid == 0);
2180
2181 if ((fd = open(cachefile, O_RDONLY)) < 0) {
2182 zfs_error_aux(hdl, "%s", strerror(errno));
2183 (void) zfs_error(hdl, EZFS_BADCACHE,
2184 dgettext(TEXT_DOMAIN, "failed to open cache file"));
2185 return (NULL);
2186 }
2187
2188 if (fstat64(fd, &statbuf) != 0) {
2189 zfs_error_aux(hdl, "%s", strerror(errno));
2190 (void) close(fd);
2191 (void) zfs_error(hdl, EZFS_BADCACHE,
2192 dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
2193 return (NULL);
2194 }
2195
2196 if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) {
2197 (void) close(fd);
2198 return (NULL);
2199 }
2200
2201 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
2202 (void) close(fd);
2203 free(buf);
2204 (void) zfs_error(hdl, EZFS_BADCACHE,
2205 dgettext(TEXT_DOMAIN,
2206 "failed to read cache file contents"));
2207 return (NULL);
2208 }
2209
2210 (void) close(fd);
2211
2212 if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
2213 free(buf);
2214 (void) zfs_error(hdl, EZFS_BADCACHE,
2215 dgettext(TEXT_DOMAIN,
2216 "invalid or corrupt cache file contents"));
2217 return (NULL);
2218 }
2219
2220 free(buf);
2221
2222 /*
2223 * Go through and get the current state of the pools and refresh their
2224 * state.
2225 */
2226 if (nvlist_alloc(&pools, 0, 0) != 0) {
2227 (void) no_memory(hdl);
2228 nvlist_free(raw);
2229 return (NULL);
2230 }
2231
2232 elem = NULL;
2233 while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
2234 src = fnvpair_value_nvlist(elem);
2235
2236 name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
2237 if (poolname != NULL && strcmp(poolname, name) != 0)
2238 continue;
2239
2240 this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
2241 if (guid != 0 && guid != this_guid)
2242 continue;
2243
2244 if (pool_active(hdl, name, this_guid, &active) != 0) {
2245 nvlist_free(raw);
2246 nvlist_free(pools);
2247 return (NULL);
2248 }
2249
2250 if (active)
2251 continue;
2252
2253 if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
2254 cachefile) != 0) {
2255 (void) no_memory(hdl);
2256 nvlist_free(raw);
2257 nvlist_free(pools);
2258 return (NULL);
2259 }
2260
2261 if ((dst = refresh_config(hdl, src)) == NULL) {
2262 nvlist_free(raw);
2263 nvlist_free(pools);
2264 return (NULL);
2265 }
2266
2267 if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
2268 (void) no_memory(hdl);
2269 nvlist_free(dst);
2270 nvlist_free(raw);
2271 nvlist_free(pools);
2272 return (NULL);
2273 }
2274 nvlist_free(dst);
2275 }
2276
2277 nvlist_free(raw);
2278 return (pools);
2279 }
2280
2281 static int
2282 name_or_guid_exists(zpool_handle_t *zhp, void *data)
2283 {
2284 importargs_t *import = data;
2285 int found = 0;
2286
2287 if (import->poolname != NULL) {
2288 char *pool_name;
2289
2290 verify(nvlist_lookup_string(zhp->zpool_config,
2291 ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
2292 if (strcmp(pool_name, import->poolname) == 0)
2293 found = 1;
2294 } else {
2295 uint64_t pool_guid;
2296
2297 verify(nvlist_lookup_uint64(zhp->zpool_config,
2298 ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
2299 if (pool_guid == import->guid)
2300 found = 1;
2301 }
2302
2303 zpool_close(zhp);
2304 return (found);
2305 }
2306
2307 nvlist_t *
2308 zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
2309 {
2310 verify(import->poolname == NULL || import->guid == 0);
2311
2312 if (import->unique)
2313 import->exists = zpool_iter(hdl, name_or_guid_exists, import);
2314
2315 if (import->cachefile != NULL)
2316 return (zpool_find_import_cached(hdl, import->cachefile,
2317 import->poolname, import->guid));
2318
2319 return (zpool_find_import_impl(hdl, import));
2320 }
2321
2322 static boolean_t
2323 pool_match(nvlist_t *cfg, char *tgt)
2324 {
2325 uint64_t v, guid = strtoull(tgt, NULL, 0);
2326 char *s;
2327
2328 if (guid != 0) {
2329 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
2330 return (v == guid);
2331 } else {
2332 if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
2333 return (strcmp(s, tgt) == 0);
2334 }
2335 return (B_FALSE);
2336 }
2337
2338 int
2339 zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp,
2340 importargs_t *args)
2341 {
2342 nvlist_t *pools;
2343 nvlist_t *match = NULL;
2344 nvlist_t *config = NULL;
2345 char *name = NULL, *sepp = NULL;
2346 char sep = '\0';
2347 int count = 0;
2348 char *targetdup = strdup(target);
2349
2350 *configp = NULL;
2351
2352 if ((sepp = strpbrk(targetdup, "/@")) != NULL) {
2353 sep = *sepp;
2354 *sepp = '\0';
2355 }
2356
2357 pools = zpool_search_import(hdl, args);
2358
2359 if (pools != NULL) {
2360 nvpair_t *elem = NULL;
2361 while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
2362 VERIFY0(nvpair_value_nvlist(elem, &config));
2363 if (pool_match(config, targetdup)) {
2364 count++;
2365 if (match != NULL) {
2366 /* multiple matches found */
2367 continue;
2368 } else {
2369 match = config;
2370 name = nvpair_name(elem);
2371 }
2372 }
2373 }
2374 }
2375
2376 if (count == 0) {
2377 (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2378 "no pools found"));
2379 free(targetdup);
2380 return (ENOENT);
2381 }
2382
2383 if (count > 1) {
2384 (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2385 "%d pools found, use pool GUID\n"), count);
2386 free(targetdup);
2387 return (EINVAL);
2388 }
2389
2390 *configp = match;
2391 free(targetdup);
2392
2393 return (0);
2394 }
2395
2396 boolean_t
2397 find_guid(nvlist_t *nv, uint64_t guid)
2398 {
2399 uint64_t tmp;
2400 nvlist_t **child;
2401 uint_t c, children;
2402
2403 verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0);
2404 if (tmp == guid)
2405 return (B_TRUE);
2406
2407 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2408 &child, &children) == 0) {
2409 for (c = 0; c < children; c++)
2410 if (find_guid(child[c], guid))
2411 return (B_TRUE);
2412 }
2413
2414 return (B_FALSE);
2415 }
2416
/*
 * Callback state for find_aux(): which aux vdev array to search
 * (ZPOOL_CONFIG_SPARES or ZPOOL_CONFIG_L2CACHE), the vdev guid to look
 * for, and — on a successful match — the still-open handle of the pool
 * that owns the vdev (the caller is responsible for closing it).
 */
typedef struct aux_cbdata {
	const char *cb_type;	/* nvlist array name to search */
	uint64_t cb_guid;	/* vdev guid being searched for */
	zpool_handle_t *cb_zhp;	/* owning pool handle, set on match */
} aux_cbdata_t;
2422
2423 static int
2424 find_aux(zpool_handle_t *zhp, void *data)
2425 {
2426 aux_cbdata_t *cbp = data;
2427 nvlist_t **list;
2428 uint_t i, count;
2429 uint64_t guid;
2430 nvlist_t *nvroot;
2431
2432 verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
2433 &nvroot) == 0);
2434
2435 if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type,
2436 &list, &count) == 0) {
2437 for (i = 0; i < count; i++) {
2438 verify(nvlist_lookup_uint64(list[i],
2439 ZPOOL_CONFIG_GUID, &guid) == 0);
2440 if (guid == cbp->cb_guid) {
2441 cbp->cb_zhp = zhp;
2442 return (1);
2443 }
2444 }
2445 }
2446
2447 zpool_close(zhp);
2448 return (0);
2449 }
2450
/*
 * Determines if the pool is in use.  If so, it returns true and the state of
 * the pool as well as the name of the pool.  Name string is allocated and
 * must be freed by the caller.
 *
 * Reads the ZFS label from the device open on 'fd' and classifies it.
 * Returns -1 on failure of an underlying libzfs call, 0 otherwise.
 * *inuse is set to B_TRUE only when the device should be treated as
 * belonging to some pool, in which case *state and *namestr are also
 * filled in.
 */
int
zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
    boolean_t *inuse)
{
	nvlist_t *config;
	char *name;
	boolean_t ret;
	uint64_t guid, vdev_guid;
	zpool_handle_t *zhp;
	nvlist_t *pool_config;
	uint64_t stateval, isspare;
	aux_cbdata_t cb = { 0 };
	boolean_t isactive;

	*inuse = B_FALSE;

	if (zpool_read_label(fd, &config, NULL) != 0) {
		(void) no_memory(hdl);
		return (-1);
	}

	/* No label at all: the device is not in use by any pool. */
	if (config == NULL)
		return (0);

	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &stateval) == 0);
	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
	    &vdev_guid) == 0);

	/*
	 * Spare and l2cache labels carry no pool name/guid, so 'name'
	 * and 'guid' are only valid for the other states below.
	 */
	if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) {
		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &name) == 0);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &guid) == 0);
	}

	switch (stateval) {
	case POOL_STATE_EXPORTED:
		/*
		 * A pool with an exported state may in fact be imported
		 * read-only, so check the in-core state to see if it's
		 * active and imported read-only.  If it is, set
		 * its state to active.
		 */
		if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
		    (zhp = zpool_open_canfail(hdl, name)) != NULL) {
			if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
				stateval = POOL_STATE_ACTIVE;

			/*
			 * All we needed the zpool handle for is the
			 * readonly prop check.
			 */
			zpool_close(zhp);
		}

		ret = B_TRUE;
		break;

	case POOL_STATE_ACTIVE:
		/*
		 * For an active pool, we have to determine if it's really part
		 * of a currently active pool (in which case the pool will exist
		 * and the guid will be the same), or whether it's part of an
		 * active pool that was disconnected without being explicitly
		 * exported.
		 */
		if (pool_active(hdl, name, guid, &isactive) != 0) {
			nvlist_free(config);
			return (-1);
		}

		if (isactive) {
			/*
			 * Because the device may have been removed while
			 * offlined, we only report it as active if the vdev is
			 * still present in the config.  Otherwise, pretend like
			 * it's not in use.
			 */
			if ((zhp = zpool_open_canfail(hdl, name)) != NULL &&
			    (pool_config = zpool_get_config(zhp, NULL))
			    != NULL) {
				nvlist_t *nvroot;

				verify(nvlist_lookup_nvlist(pool_config,
				    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
				ret = find_guid(nvroot, vdev_guid);
			} else {
				ret = B_FALSE;
			}

			/*
			 * If this is an active spare within another pool, we
			 * treat it like an unused hot spare.  This allows the
			 * user to create a pool with a hot spare that currently
			 * in use within another pool.  Since we return B_TRUE,
			 * libdiskmgt will continue to prevent generic consumers
			 * from using the device.
			 */
			if (ret && nvlist_lookup_uint64(config,
			    ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
				stateval = POOL_STATE_SPARE;

			if (zhp != NULL)
				zpool_close(zhp);
		} else {
			stateval = POOL_STATE_POTENTIALLY_ACTIVE;
			ret = B_TRUE;
		}
		break;

	case POOL_STATE_SPARE:
		/*
		 * For a hot spare, it can be either definitively in use, or
		 * potentially active.  To determine if it's in use, we iterate
		 * over all pools in the system and search for one with a spare
		 * with a matching guid.
		 *
		 * Due to the shared nature of spares, we don't actually report
		 * the potentially active case as in use.  This means the user
		 * can freely create pools on the hot spares of exported pools,
		 * but to do otherwise makes the resulting code complicated, and
		 * we end up having to deal with this case anyway.
		 */
		cb.cb_zhp = NULL;
		cb.cb_guid = vdev_guid;
		cb.cb_type = ZPOOL_CONFIG_SPARES;
		if (zpool_iter(hdl, find_aux, &cb) == 1) {
			/* 'name' belongs to cb.cb_zhp, closed below */
			name = (char *)zpool_get_name(cb.cb_zhp);
			ret = B_TRUE;
		} else {
			ret = B_FALSE;
		}
		break;

	case POOL_STATE_L2CACHE:

		/*
		 * Check if any pool is currently using this l2cache device.
		 */
		cb.cb_zhp = NULL;
		cb.cb_guid = vdev_guid;
		cb.cb_type = ZPOOL_CONFIG_L2CACHE;
		if (zpool_iter(hdl, find_aux, &cb) == 1) {
			/* 'name' belongs to cb.cb_zhp, closed below */
			name = (char *)zpool_get_name(cb.cb_zhp);
			ret = B_TRUE;
		} else {
			ret = B_FALSE;
		}
		break;

	default:
		ret = B_FALSE;
	}


	if (ret) {
		/* Copy 'name' before cb.cb_zhp (which may own it) is closed. */
		if ((*namestr = zfs_strdup(hdl, name)) == NULL) {
			if (cb.cb_zhp)
				zpool_close(cb.cb_zhp);
			nvlist_free(config);
			return (-1);
		}
		*state = (pool_state_t)stateval;
	}

	if (cb.cb_zhp)
		zpool_close(cb.cb_zhp);

	nvlist_free(config);
	*inuse = ret;
	return (0);
}