]> git.proxmox.com Git - mirror_zfs-debian.git/blob - cmd/zpool/zpool_vdev.c
Imported Upstream version 0.6.5.3
[mirror_zfs-debian.git] / cmd / zpool / zpool_vdev.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 /*
28 * Functions to convert between a list of vdevs and an nvlist representing the
29 * configuration. Each entry in the list can be one of:
30 *
31 * Device vdevs
32 * disk=(path=..., devid=...)
33 * file=(path=...)
34 *
35 * Group vdevs
36 * raidz[1|2]=(...)
37 * mirror=(...)
38 *
39 * Hot spares
40 *
41 * While the underlying implementation supports it, group vdevs cannot contain
42 * other group vdevs. All userland verification of devices is contained within
43 * this file. If successful, the nvlist returned can be passed directly to the
44 * kernel; we've done as much verification as possible in userland.
45 *
46 * Hot spares are a special case, and passed down as an array of disk vdevs, at
47 * the same level as the root of the vdev tree.
48 *
49 * The only function exported by this file is 'make_root_vdev'. The
50 * function performs several passes:
51 *
52 * 1. Construct the vdev specification. Performs syntax validation and
53 * makes sure each device is valid.
54 * 2. Check for devices in use. Using libblkid to make sure that no
55 * devices are also in use. Some can be overridden using the 'force'
56 * flag, others cannot.
57 * 3. Check for replication errors if the 'force' flag is not specified.
58 * validates that the replication level is consistent across the
59 * entire pool.
60 * 4. Call libzfs to label any whole disks with an EFI label.
61 */
62
63 #include <assert.h>
64 #include <ctype.h>
65 #include <devid.h>
66 #include <errno.h>
67 #include <fcntl.h>
68 #include <libintl.h>
69 #include <libnvpair.h>
70 #include <limits.h>
71 #include <scsi/scsi.h>
72 #include <scsi/sg.h>
73 #include <stdio.h>
74 #include <string.h>
75 #include <unistd.h>
76 #include <sys/efi_partition.h>
77 #include <sys/stat.h>
78 #include <sys/vtoc.h>
79 #include <sys/mntent.h>
80 #include <uuid/uuid.h>
81 #ifdef HAVE_LIBBLKID
82 #include <blkid/blkid.h>
83 #else
84 #define blkid_cache void *
85 #endif /* HAVE_LIBBLKID */
86
87 #include "zpool_util.h"
88 #include <sys/zfs_context.h>
89
/*
 * For any given vdev specification, we can have multiple errors.  The
 * vdev_error() function keeps track of whether we have seen an error yet, and
 * prints out a header if it's the first error we've seen.
 */
boolean_t error_seen;	/* set once vdev_error() has printed its banner */
boolean_t is_force;	/* mirrors the 'force' flag; selects the banner text */
97
/*
 * One entry in the table of devices whose reported physical sector size
 * is known to be wrong.
 */
typedef struct vdev_disk_db_entry
{
	char id[24];		/* INQUIRY vendor+product id (reply bytes 8-31) */
	int sector_size;	/* correct physical sector size, in bytes */
} vdev_disk_db_entry_t;

/*
 * Database of block devices that lie about physical sector sizes.  The
 * identification string must be precisely 24 characters to avoid false
 * negatives
 */
static vdev_disk_db_entry_t vdev_disk_database[] = {
	{"ATA     ADATA SSD S396 3", 8192},
	{"ATA     APPLE SSD SM128E", 8192},
	{"ATA     APPLE SSD SM256E", 8192},
	{"ATA     APPLE SSD SM512E", 8192},
	{"ATA     APPLE SSD SM768E", 8192},
	{"ATA     C400-MTFDDAC064M", 8192},
	{"ATA     C400-MTFDDAC128M", 8192},
	{"ATA     C400-MTFDDAC256M", 8192},
	{"ATA     C400-MTFDDAC512M", 8192},
	{"ATA     Corsair Force 3 ", 8192},
	{"ATA     Corsair Force GS", 8192},
	{"ATA     INTEL SSDSA2CT04", 8192},
	{"ATA     INTEL SSDSA2BZ10", 8192},
	{"ATA     INTEL SSDSA2BZ20", 8192},
	{"ATA     INTEL SSDSA2BZ30", 8192},
	{"ATA     INTEL SSDSA2CW04", 8192},
	{"ATA     INTEL SSDSA2CW08", 8192},
	{"ATA     INTEL SSDSA2CW12", 8192},
	{"ATA     INTEL SSDSA2CW16", 8192},
	{"ATA     INTEL SSDSA2CW30", 8192},
	{"ATA     INTEL SSDSA2CW60", 8192},
	{"ATA     INTEL SSDSC2CT06", 8192},
	{"ATA     INTEL SSDSC2CT12", 8192},
	{"ATA     INTEL SSDSC2CT18", 8192},
	{"ATA     INTEL SSDSC2CT24", 8192},
	{"ATA     INTEL SSDSC2CW06", 8192},
	{"ATA     INTEL SSDSC2CW12", 8192},
	{"ATA     INTEL SSDSC2CW18", 8192},
	{"ATA     INTEL SSDSC2CW24", 8192},
	{"ATA     INTEL SSDSC2CW48", 8192},
	{"ATA     KINGSTON SH100S3", 8192},
	{"ATA     KINGSTON SH103S3", 8192},
	{"ATA     M4-CT064M4SSD2  ", 8192},
	{"ATA     M4-CT128M4SSD2  ", 8192},
	{"ATA     M4-CT256M4SSD2  ", 8192},
	{"ATA     M4-CT512M4SSD2  ", 8192},
	{"ATA     OCZ-AGILITY2    ", 8192},
	{"ATA     OCZ-AGILITY3    ", 8192},
	{"ATA     OCZ-VERTEX2 3.5 ", 8192},
	{"ATA     OCZ-VERTEX3     ", 8192},
	{"ATA     OCZ-VERTEX3 LT  ", 8192},
	{"ATA     OCZ-VERTEX3 MI  ", 8192},
	{"ATA     OCZ-VERTEX4     ", 8192},
	{"ATA     SAMSUNG MZ7WD120", 8192},
	{"ATA     SAMSUNG MZ7WD240", 8192},
	{"ATA     SAMSUNG MZ7WD480", 8192},
	{"ATA     SAMSUNG MZ7WD960", 8192},
	{"ATA     SAMSUNG SSD 830 ", 8192},
	{"ATA     Samsung SSD 840 ", 8192},
	{"ATA     SanDisk SSD U100", 8192},
	{"ATA     TOSHIBA THNSNH06", 8192},
	{"ATA     TOSHIBA THNSNH12", 8192},
	{"ATA     TOSHIBA THNSNH25", 8192},
	{"ATA     TOSHIBA THNSNH51", 8192},
	{"ATA     APPLE SSD TS064C", 4096},
	{"ATA     APPLE SSD TS128C", 4096},
	{"ATA     APPLE SSD TS256C", 4096},
	{"ATA     APPLE SSD TS512C", 4096},
	{"ATA     INTEL SSDSA2M040", 4096},
	{"ATA     INTEL SSDSA2M080", 4096},
	{"ATA     INTEL SSDSA2M160", 4096},
	{"ATA     INTEL SSDSC2MH12", 4096},
	{"ATA     INTEL SSDSC2MH25", 4096},
	{"ATA     OCZ CORE_SSD    ", 4096},
	{"ATA     OCZ-VERTEX      ", 4096},
	{"ATA     SAMSUNG MCCOE32G", 4096},
	{"ATA     SAMSUNG MCCOE64G", 4096},
	{"ATA     SAMSUNG SSD PM80", 4096},
	/* Flash drives optimized for 4KB IOs on larger pages */
	{"ATA     INTEL SSDSC2BA10", 4096},
	{"ATA     INTEL SSDSC2BA20", 4096},
	{"ATA     INTEL SSDSC2BA40", 4096},
	{"ATA     INTEL SSDSC2BA80", 4096},
	{"ATA     INTEL SSDSC2BB08", 4096},
	{"ATA     INTEL SSDSC2BB12", 4096},
	{"ATA     INTEL SSDSC2BB16", 4096},
	{"ATA     INTEL SSDSC2BB24", 4096},
	{"ATA     INTEL SSDSC2BB30", 4096},
	{"ATA     INTEL SSDSC2BB40", 4096},
	{"ATA     INTEL SSDSC2BB48", 4096},
	{"ATA     INTEL SSDSC2BB60", 4096},
	{"ATA     INTEL SSDSC2BB80", 4096},
	{"ATA     INTEL SSDSC2BW24", 4096},
	{"ATA     INTEL SSDSC2BP24", 4096},
	{"ATA     INTEL SSDSC2BP48", 4096},
	{"NA      SmrtStorSDLKAE9W", 4096},
	/* Imported from Open Solaris */
	{"ATA     MARVELL SD88SA02", 4096},
	/* Advanced format Hard drives */
	{"ATA     Hitachi HDS5C303", 4096},
	{"ATA     SAMSUNG HD204UI ", 4096},
	{"ATA     ST2000DL004 HD20", 4096},
	{"ATA     WDC WD10EARS-00M", 4096},
	{"ATA     WDC WD10EARS-00S", 4096},
	{"ATA     WDC WD10EARS-00Z", 4096},
	{"ATA     WDC WD15EARS-00M", 4096},
	{"ATA     WDC WD15EARS-00S", 4096},
	{"ATA     WDC WD15EARS-00Z", 4096},
	{"ATA     WDC WD20EARS-00M", 4096},
	{"ATA     WDC WD20EARS-00S", 4096},
	{"ATA     WDC WD20EARS-00Z", 4096},
	{"ATA     WDC WD1600BEVT-0", 4096},
	{"ATA     WDC WD2500BEVT-0", 4096},
	{"ATA     WDC WD3200BEVT-0", 4096},
	{"ATA     WDC WD5000BEVT-0", 4096},
	/* Virtual disks: Assume zvols with default volblocksize */
#if 0
	{"ATA     QEMU HARDDISK   ", 8192},
	{"IET     VIRTUAL-DISK    ", 8192},
	{"OI      COMSTAR         ", 8192},
	{"SUN     COMSTAR         ", 8192},
	{"NETAPP  LUN             ", 8192},
#endif
};

/* Number of entries in vdev_disk_database[] */
static const int vdev_disk_database_size =
	sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
227
#define	INQ_REPLY_LEN	96
#define	INQ_CMD_LEN	6

/*
 * Issue a SCSI INQUIRY to the device at 'path' and look its 24-byte
 * vendor/product identification string up in vdev_disk_database[].  On a
 * match, store the known-correct physical sector size in '*sector_size'
 * and return B_TRUE; on any failure or miss, return B_FALSE.
 */
static boolean_t
check_sector_size_database(char *path, int *sector_size)
{
	unsigned char inq_buff[INQ_REPLY_LEN];
	unsigned char sense_buffer[32];
	unsigned char inq_cmd_blk[INQ_CMD_LEN] =
	    {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
	sg_io_hdr_t io_hdr;
	int error;
	int fd;
	int i;

	/* Prepare INQUIRY command */
	memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
	io_hdr.interface_id = 'S';
	io_hdr.cmd_len = sizeof (inq_cmd_blk);
	io_hdr.mx_sb_len = sizeof (sense_buffer);
	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	io_hdr.dxfer_len = INQ_REPLY_LEN;
	io_hdr.dxferp = inq_buff;
	io_hdr.cmdp = inq_cmd_blk;
	io_hdr.sbp = sense_buffer;
	/*
	 * NOTE(review): the SG_IO timeout field is in milliseconds, so this
	 * allows only 10ms for the INQUIRY to complete -- confirm intended.
	 */
	io_hdr.timeout = 10; /* 10 milliseconds is ample time */

	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
		return (B_FALSE);

	error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);

	(void) close(fd);

	if (error < 0)
		return (B_FALSE);

	/* The ioctl succeeded, but the SCSI command itself may have failed */
	if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
		return (B_FALSE);

	/* Bytes 8-31 of the INQUIRY reply hold the 24-byte id we match on */
	for (i = 0; i < vdev_disk_database_size; i++) {
		if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
			continue;

		*sector_size = vdev_disk_database[i].sector_size;
		return (B_TRUE);
	}

	return (B_FALSE);
}
278
279 /*PRINTFLIKE1*/
280 static void
281 vdev_error(const char *fmt, ...)
282 {
283 va_list ap;
284
285 if (!error_seen) {
286 (void) fprintf(stderr, gettext("invalid vdev specification\n"));
287 if (!is_force)
288 (void) fprintf(stderr, gettext("use '-f' to override "
289 "the following errors:\n"));
290 else
291 (void) fprintf(stderr, gettext("the following errors "
292 "must be manually repaired:\n"));
293 error_seen = B_TRUE;
294 }
295
296 va_start(ap, fmt);
297 (void) vfprintf(stderr, fmt, ap);
298 va_end(ap);
299 }
300
301 /*
302 * Check that a file is valid. All we can do in this case is check that it's
303 * not in use by another pool, and not in use by swap.
304 */
305 static int
306 check_file(const char *file, boolean_t force, boolean_t isspare)
307 {
308 char *name;
309 int fd;
310 int ret = 0;
311 pool_state_t state;
312 boolean_t inuse;
313
314 if ((fd = open(file, O_RDONLY)) < 0)
315 return (0);
316
317 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
318 const char *desc;
319
320 switch (state) {
321 case POOL_STATE_ACTIVE:
322 desc = gettext("active");
323 break;
324
325 case POOL_STATE_EXPORTED:
326 desc = gettext("exported");
327 break;
328
329 case POOL_STATE_POTENTIALLY_ACTIVE:
330 desc = gettext("potentially active");
331 break;
332
333 default:
334 desc = gettext("unknown");
335 break;
336 }
337
338 /*
339 * Allow hot spares to be shared between pools.
340 */
341 if (state == POOL_STATE_SPARE && isspare)
342 return (0);
343
344 if (state == POOL_STATE_ACTIVE ||
345 state == POOL_STATE_SPARE || !force) {
346 switch (state) {
347 case POOL_STATE_SPARE:
348 vdev_error(gettext("%s is reserved as a hot "
349 "spare for pool %s\n"), file, name);
350 break;
351 default:
352 vdev_error(gettext("%s is part of %s pool "
353 "'%s'\n"), file, desc, name);
354 break;
355 }
356 ret = -1;
357 }
358
359 free(name);
360 }
361
362 (void) close(fd);
363 return (ret);
364 }
365
/*
 * Emit a non-fatal warning explaining why the device in-use check
 * could not be carried out.
 */
static void
check_error(int err)
{
	const char *reason = strerror(err);

	(void) fprintf(stderr, gettext("warning: device in use checking "
	    "failed: %s\n"), reason);
}
372
/*
 * Check whether a single slice/partition is safe to add to the pool.
 *
 * With libblkid we ask the cache what the partition contains; an existing
 * ZFS member gets the finer-grained check_file() treatment, while any other
 * recognized filesystem is only usable with 'force'.  Without libblkid all
 * we can do is defer to check_file().  Returns 0 when safe, -1 otherwise.
 */
static int
check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
{
	int err;
#ifdef HAVE_LIBBLKID
	char *value;

	/* No valid type detected device is safe to use */
	value = blkid_get_tag_value(cache, "TYPE", path);
	if (value == NULL)
		return (0);

	/*
	 * If libblkid detects a ZFS device, we check the device
	 * using check_file() to see if it's safe.  The one safe
	 * case is a spare device shared between multiple pools.
	 */
	if (strcmp(value, "zfs_member") == 0) {
		err = check_file(path, force, isspare);
	} else {
		if (force) {
			err = 0;
		} else {
			err = -1;
			vdev_error(gettext("%s contains a filesystem of "
			    "type '%s'\n"), path, value);
		}
	}

	/* blkid_get_tag_value() returns an allocated string */
	free(value);
#else
	err = check_file(path, force, isspare);
#endif /* HAVE_LIBBLKID */

	return (err);
}
409
/*
 * Validate a whole disk.  Iterate over all slices on the disk and make sure
 * that none is in use by calling check_slice().  Returns 0 when the disk is
 * safe to use, -1 otherwise.
 */
static int
check_disk(const char *path, blkid_cache cache, int force,
    boolean_t isspare, boolean_t iswholedisk)
{
	struct dk_gpt *vtoc;
	char slice_path[MAXPATHLEN];
	int err = 0;
	int fd, i;

	/* This is not a wholedisk we only check the given partition */
	if (!iswholedisk)
		return (check_slice(path, cache, force, isspare));

	/*
	 * When the device is a whole disk try to read the efi partition
	 * label.  If this is successful we safely check all of the
	 * partitions.  However, when it fails it may simply be because
	 * the disk is partitioned via the MBR.  Since we currently can
	 * not easily decode the MBR return a failure and prompt to the
	 * user to use force option since we cannot check the partitions.
	 */
	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) {
		check_error(errno);
		return (-1);
	}

	if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
		(void) close(fd);

		if (force) {
			return (0);
		} else {
			vdev_error(gettext("%s does not contain an EFI "
			    "label but it may contain partition\n"
			    "information in the MBR.\n"), path);
			return (-1);
		}
	}

	/*
	 * The primary efi partition label is damaged however the secondary
	 * label at the end of the device is intact.  Rather than use this
	 * label we should play it safe and treat this as a non efi device.
	 */
	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
		efi_free(vtoc);
		(void) close(fd);

		if (force) {
			/* Partitions will not be created using the backup */
			return (0);
		} else {
			vdev_error(gettext("%s contains a corrupt primary "
			    "EFI label.\n"), path);
			return (-1);
		}
	}

	for (i = 0; i < vtoc->efi_nparts; i++) {

		/* Skip table slots that hold no actual partition */
		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
		    uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
			continue;

		/*
		 * Build the per-partition name: paths under UDISK_ROOT get a
		 * "-part<n>" suffix; other paths append "p<n>" when the base
		 * name already ends in a digit, otherwise just "<n>".
		 */
		if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
			(void) snprintf(slice_path, sizeof (slice_path),
			    "%s%s%d", path, "-part", i+1);
		else
			(void) snprintf(slice_path, sizeof (slice_path),
			    "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
			    "p" : "", i+1);

		err = check_slice(slice_path, cache, force, isspare);
		if (err)
			break;
	}

	efi_free(vtoc);
	(void) close(fd);

	return (err);
}
496
497 static int
498 check_device(const char *path, boolean_t force,
499 boolean_t isspare, boolean_t iswholedisk)
500 {
501 static blkid_cache cache = NULL;
502
503 #ifdef HAVE_LIBBLKID
504 /*
505 * There is no easy way to add a correct blkid_put_cache() call,
506 * memory will be reclaimed when the command exits.
507 */
508 if (cache == NULL) {
509 int err;
510
511 if ((err = blkid_get_cache(&cache, NULL)) != 0) {
512 check_error(err);
513 return (-1);
514 }
515
516 if ((err = blkid_probe_all(cache)) != 0) {
517 blkid_put_cache(cache);
518 check_error(err);
519 return (-1);
520 }
521 }
522 #endif /* HAVE_LIBBLKID */
523
524 return (check_disk(path, cache, force, isspare, iswholedisk));
525 }
526
527 /*
528 * By "whole disk" we mean an entire physical disk (something we can
529 * label, toggle the write cache on, etc.) as opposed to the full
530 * capacity of a pseudo-device such as lofi or did. We act as if we
531 * are labeling the disk, which should be a pretty good test of whether
532 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
533 * it isn't.
534 */
535 static boolean_t
536 is_whole_disk(const char *path)
537 {
538 struct dk_gpt *label;
539 int fd;
540
541 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
542 return (B_FALSE);
543 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
544 (void) close(fd);
545 return (B_FALSE);
546 }
547 efi_free(label);
548 (void) close(fd);
549 return (B_TRUE);
550 }
551
552 /*
553 * This may be a shorthand device path or it could be total gibberish.
554 * Check to see if it is a known device available in zfs_vdev_paths.
555 * As part of this check, see if we've been given an entire disk
556 * (minus the slice number).
557 */
558 static int
559 is_shorthand_path(const char *arg, char *path,
560 struct stat64 *statbuf, boolean_t *wholedisk)
561 {
562 int error;
563
564 error = zfs_resolve_shortname(arg, path, MAXPATHLEN);
565 if (error == 0) {
566 *wholedisk = is_whole_disk(path);
567 if (*wholedisk || (stat64(path, statbuf) == 0))
568 return (0);
569 }
570
571 strlcpy(path, arg, sizeof (path));
572 memset(statbuf, 0, sizeof (*statbuf));
573 *wholedisk = B_FALSE;
574
575 return (error);
576 }
577
578 /*
579 * Determine if the given path is a hot spare within the given configuration.
580 * If no configuration is given we rely solely on the label.
581 */
582 static boolean_t
583 is_spare(nvlist_t *config, const char *path)
584 {
585 int fd;
586 pool_state_t state;
587 char *name = NULL;
588 nvlist_t *label;
589 uint64_t guid, spareguid;
590 nvlist_t *nvroot;
591 nvlist_t **spares;
592 uint_t i, nspares;
593 boolean_t inuse;
594
595 if ((fd = open(path, O_RDONLY)) < 0)
596 return (B_FALSE);
597
598 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
599 !inuse ||
600 state != POOL_STATE_SPARE ||
601 zpool_read_label(fd, &label, NULL) != 0) {
602 free(name);
603 (void) close(fd);
604 return (B_FALSE);
605 }
606 free(name);
607 (void) close(fd);
608
609 if (config == NULL)
610 return (B_TRUE);
611
612 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
613 nvlist_free(label);
614
615 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
616 &nvroot) == 0);
617 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
618 &spares, &nspares) == 0) {
619 for (i = 0; i < nspares; i++) {
620 verify(nvlist_lookup_uint64(spares[i],
621 ZPOOL_CONFIG_GUID, &spareguid) == 0);
622 if (spareguid == guid)
623 return (B_TRUE);
624 }
625 }
626
627 return (B_FALSE);
628 }
629
/*
 * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
 * device, fill in the device id to make a complete nvlist.  Valid forms for a
 * leaf vdev are:
 *
 *	/dev/xxx	Complete disk path
 *	/xxx		Full path to file
 *	xxx		Shorthand for <zfs_vdev_paths>/xxx
 *
 * Returns the newly allocated vdev nvlist on success, or NULL (after
 * printing a diagnostic to stderr) on failure.
 */
static nvlist_t *
make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
{
	char path[MAXPATHLEN];
	struct stat64 statbuf;
	nvlist_t *vdev = NULL;
	char *type = NULL;
	boolean_t wholedisk = B_FALSE;
	uint64_t ashift = 0;
	int err;

	/*
	 * Determine what type of vdev this is, and put the full path into
	 * 'path'.  We detect whether this is a device of file afterwards by
	 * checking the st_mode of the file.
	 */
	if (arg[0] == '/') {
		/*
		 * Complete device or file path.  Exact type is determined by
		 * examining the file descriptor afterwards.  Symbolic links
		 * are resolved to their real paths for the is_whole_disk()
		 * and S_ISBLK/S_ISREG type checks.  However, we are careful
		 * to store the given path as ZPOOL_CONFIG_PATH to ensure we
		 * can leverage udev's persistent device labels.
		 */
		if (realpath(arg, path) == NULL) {
			(void) fprintf(stderr,
			    gettext("cannot resolve path '%s'\n"), arg);
			return (NULL);
		}

		/* 'statbuf' is only filled in when the stat64() runs */
		wholedisk = is_whole_disk(path);
		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
			(void) fprintf(stderr,
			    gettext("cannot open '%s': %s\n"),
			    path, strerror(errno));
			return (NULL);
		}

		/* After is_whole_disk() check restore original passed path */
		strlcpy(path, arg, MAXPATHLEN);
	} else {
		err = is_shorthand_path(arg, path, &statbuf, &wholedisk);
		if (err != 0) {
			/*
			 * If we got ENOENT, then the user gave us
			 * gibberish, so try to direct them with a
			 * reasonable error message.  Otherwise,
			 * regurgitate strerror() since it's the best we
			 * can do.
			 */
			if (err == ENOENT) {
				(void) fprintf(stderr,
				    gettext("cannot open '%s': no such "
				    "device in %s\n"), arg, DISK_ROOT);
				(void) fprintf(stderr,
				    gettext("must be a full path or "
				    "shorthand device name\n"));
				return (NULL);
			} else {
				(void) fprintf(stderr,
				    gettext("cannot open '%s': %s\n"),
				    path, strerror(errno));
				return (NULL);
			}
		}
	}

	/*
	 * Determine whether this is a device or a file.
	 */
	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
		type = VDEV_TYPE_DISK;
	} else if (S_ISREG(statbuf.st_mode)) {
		type = VDEV_TYPE_FILE;
	} else {
		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
		    "block device or regular file\n"), path);
		return (NULL);
	}

	/*
	 * Finally, we have the complete device or file, and we know that it is
	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
	 * vdevs have a 'path' element, and devices also have a 'devid' element.
	 */
	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
	if (strcmp(type, VDEV_TYPE_DISK) == 0)
		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
		    (uint64_t)wholedisk) == 0);

	/*
	 * Override defaults if custom properties are provided.
	 */
	if (props != NULL) {
		char *value = NULL;

		if (nvlist_lookup_string(props,
		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0)
			zfs_nicestrtonum(NULL, value, &ashift);
	}

	/*
	 * If the device is known to incorrectly report its physical sector
	 * size explicitly provide the known correct value.
	 */
	if (ashift == 0) {
		int sector_size;

		/* e.g. 8192-byte sectors give highbit64(8192)-1 == 13 */
		if (check_sector_size_database(path, &sector_size) == B_TRUE)
			ashift = highbit64(sector_size) - 1;
	}

	if (ashift > 0)
		nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);

	return (vdev);
}
760
/*
 * Go through and verify the replication level of the pool is consistent.
 * Performs the following checks:
 *
 *	For the new spec, verifies that devices in mirrors and raidz are the
 *	same size.
 *
 *	If the current configuration already has inconsistent replication
 *	levels, ignore any other potential problems in the new spec.
 *
 *	Otherwise, make sure that the current spec (if there is one) and the new
 *	spec have consistent replication levels.
 */
typedef struct replication_level {
	char *zprl_type;	/* vdev type name, e.g. "disk" or "raidz" */
	uint64_t zprl_children;	/* number of children in the toplevel vdev */
	uint64_t zprl_parity;	/* raidz parity count; 0 for non-raidz */
} replication_level_t;

/* Maximum tolerated size difference between devices in one vdev (bytes) */
#define	ZPOOL_FUZZ	(16 * 1024 * 1024)
781
782 /*
783 * Given a list of toplevel vdevs, return the current replication level. If
784 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then
785 * an error message will be displayed for each self-inconsistent vdev.
786 */
787 static replication_level_t *
788 get_replication(nvlist_t *nvroot, boolean_t fatal)
789 {
790 nvlist_t **top;
791 uint_t t, toplevels;
792 nvlist_t **child;
793 uint_t c, children;
794 nvlist_t *nv;
795 char *type;
796 replication_level_t lastrep = { 0 }, rep, *ret;
797 boolean_t dontreport;
798
799 ret = safe_malloc(sizeof (replication_level_t));
800
801 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
802 &top, &toplevels) == 0);
803
804 lastrep.zprl_type = NULL;
805 for (t = 0; t < toplevels; t++) {
806 uint64_t is_log = B_FALSE;
807
808 nv = top[t];
809
810 /*
811 * For separate logs we ignore the top level vdev replication
812 * constraints.
813 */
814 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
815 if (is_log)
816 continue;
817
818 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
819 &type) == 0);
820 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
821 &child, &children) != 0) {
822 /*
823 * This is a 'file' or 'disk' vdev.
824 */
825 rep.zprl_type = type;
826 rep.zprl_children = 1;
827 rep.zprl_parity = 0;
828 } else {
829 uint64_t vdev_size;
830
831 /*
832 * This is a mirror or RAID-Z vdev. Go through and make
833 * sure the contents are all the same (files vs. disks),
834 * keeping track of the number of elements in the
835 * process.
836 *
837 * We also check that the size of each vdev (if it can
838 * be determined) is the same.
839 */
840 rep.zprl_type = type;
841 rep.zprl_children = 0;
842
843 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
844 verify(nvlist_lookup_uint64(nv,
845 ZPOOL_CONFIG_NPARITY,
846 &rep.zprl_parity) == 0);
847 assert(rep.zprl_parity != 0);
848 } else {
849 rep.zprl_parity = 0;
850 }
851
852 /*
853 * The 'dontreport' variable indicates that we've
854 * already reported an error for this spec, so don't
855 * bother doing it again.
856 */
857 type = NULL;
858 dontreport = 0;
859 vdev_size = -1ULL;
860 for (c = 0; c < children; c++) {
861 nvlist_t *cnv = child[c];
862 char *path;
863 struct stat64 statbuf;
864 uint64_t size = -1ULL;
865 char *childtype;
866 int fd, err;
867
868 rep.zprl_children++;
869
870 verify(nvlist_lookup_string(cnv,
871 ZPOOL_CONFIG_TYPE, &childtype) == 0);
872
873 /*
874 * If this is a replacing or spare vdev, then
875 * get the real first child of the vdev.
876 */
877 if (strcmp(childtype,
878 VDEV_TYPE_REPLACING) == 0 ||
879 strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
880 nvlist_t **rchild;
881 uint_t rchildren;
882
883 verify(nvlist_lookup_nvlist_array(cnv,
884 ZPOOL_CONFIG_CHILDREN, &rchild,
885 &rchildren) == 0);
886 assert(rchildren == 2);
887 cnv = rchild[0];
888
889 verify(nvlist_lookup_string(cnv,
890 ZPOOL_CONFIG_TYPE,
891 &childtype) == 0);
892 }
893
894 verify(nvlist_lookup_string(cnv,
895 ZPOOL_CONFIG_PATH, &path) == 0);
896
897 /*
898 * If we have a raidz/mirror that combines disks
899 * with files, report it as an error.
900 */
901 if (!dontreport && type != NULL &&
902 strcmp(type, childtype) != 0) {
903 if (ret != NULL)
904 free(ret);
905 ret = NULL;
906 if (fatal)
907 vdev_error(gettext(
908 "mismatched replication "
909 "level: %s contains both "
910 "files and devices\n"),
911 rep.zprl_type);
912 else
913 return (NULL);
914 dontreport = B_TRUE;
915 }
916
917 /*
918 * According to stat(2), the value of 'st_size'
919 * is undefined for block devices and character
920 * devices. But there is no effective way to
921 * determine the real size in userland.
922 *
923 * Instead, we'll take advantage of an
924 * implementation detail of spec_size(). If the
925 * device is currently open, then we (should)
926 * return a valid size.
927 *
928 * If we still don't get a valid size (indicated
929 * by a size of 0 or MAXOFFSET_T), then ignore
930 * this device altogether.
931 */
932 if ((fd = open(path, O_RDONLY)) >= 0) {
933 err = fstat64(fd, &statbuf);
934 (void) close(fd);
935 } else {
936 err = stat64(path, &statbuf);
937 }
938
939 if (err != 0 ||
940 statbuf.st_size == 0 ||
941 statbuf.st_size == MAXOFFSET_T)
942 continue;
943
944 size = statbuf.st_size;
945
946 /*
947 * Also make sure that devices and
948 * slices have a consistent size. If
949 * they differ by a significant amount
950 * (~16MB) then report an error.
951 */
952 if (!dontreport &&
953 (vdev_size != -1ULL &&
954 (labs(size - vdev_size) >
955 ZPOOL_FUZZ))) {
956 if (ret != NULL)
957 free(ret);
958 ret = NULL;
959 if (fatal)
960 vdev_error(gettext(
961 "%s contains devices of "
962 "different sizes\n"),
963 rep.zprl_type);
964 else
965 return (NULL);
966 dontreport = B_TRUE;
967 }
968
969 type = childtype;
970 vdev_size = size;
971 }
972 }
973
974 /*
975 * At this point, we have the replication of the last toplevel
976 * vdev in 'rep'. Compare it to 'lastrep' to see if its
977 * different.
978 */
979 if (lastrep.zprl_type != NULL) {
980 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
981 if (ret != NULL)
982 free(ret);
983 ret = NULL;
984 if (fatal)
985 vdev_error(gettext(
986 "mismatched replication level: "
987 "both %s and %s vdevs are "
988 "present\n"),
989 lastrep.zprl_type, rep.zprl_type);
990 else
991 return (NULL);
992 } else if (lastrep.zprl_parity != rep.zprl_parity) {
993 if (ret)
994 free(ret);
995 ret = NULL;
996 if (fatal)
997 vdev_error(gettext(
998 "mismatched replication level: "
999 "both %llu and %llu device parity "
1000 "%s vdevs are present\n"),
1001 lastrep.zprl_parity,
1002 rep.zprl_parity,
1003 rep.zprl_type);
1004 else
1005 return (NULL);
1006 } else if (lastrep.zprl_children != rep.zprl_children) {
1007 if (ret)
1008 free(ret);
1009 ret = NULL;
1010 if (fatal)
1011 vdev_error(gettext(
1012 "mismatched replication level: "
1013 "both %llu-way and %llu-way %s "
1014 "vdevs are present\n"),
1015 lastrep.zprl_children,
1016 rep.zprl_children,
1017 rep.zprl_type);
1018 else
1019 return (NULL);
1020 }
1021 }
1022 lastrep = rep;
1023 }
1024
1025 if (ret != NULL)
1026 *ret = rep;
1027
1028 return (ret);
1029 }
1030
1031 /*
1032 * Check the replication level of the vdev spec against the current pool. Calls
1033 * get_replication() to make sure the new spec is self-consistent. If the pool
1034 * has a consistent replication level, then we ignore any errors. Otherwise,
1035 * report any difference between the two.
1036 */
1037 static int
1038 check_replication(nvlist_t *config, nvlist_t *newroot)
1039 {
1040 nvlist_t **child;
1041 uint_t children;
1042 replication_level_t *current = NULL, *new;
1043 int ret;
1044
1045 /*
1046 * If we have a current pool configuration, check to see if it's
1047 * self-consistent. If not, simply return success.
1048 */
1049 if (config != NULL) {
1050 nvlist_t *nvroot;
1051
1052 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1053 &nvroot) == 0);
1054 if ((current = get_replication(nvroot, B_FALSE)) == NULL)
1055 return (0);
1056 }
1057 /*
1058 * for spares there may be no children, and therefore no
1059 * replication level to check
1060 */
1061 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
1062 &child, &children) != 0) || (children == 0)) {
1063 free(current);
1064 return (0);
1065 }
1066
1067 /*
1068 * If all we have is logs then there's no replication level to check.
1069 */
1070 if (num_logs(newroot) == children) {
1071 free(current);
1072 return (0);
1073 }
1074
1075 /*
1076 * Get the replication level of the new vdev spec, reporting any
1077 * inconsistencies found.
1078 */
1079 if ((new = get_replication(newroot, B_TRUE)) == NULL) {
1080 free(current);
1081 return (-1);
1082 }
1083
1084 /*
1085 * Check to see if the new vdev spec matches the replication level of
1086 * the current pool.
1087 */
1088 ret = 0;
1089 if (current != NULL) {
1090 if (strcmp(current->zprl_type, new->zprl_type) != 0) {
1091 vdev_error(gettext(
1092 "mismatched replication level: pool uses %s "
1093 "and new vdev is %s\n"),
1094 current->zprl_type, new->zprl_type);
1095 ret = -1;
1096 } else if (current->zprl_parity != new->zprl_parity) {
1097 vdev_error(gettext(
1098 "mismatched replication level: pool uses %llu "
1099 "device parity and new vdev uses %llu\n"),
1100 current->zprl_parity, new->zprl_parity);
1101 ret = -1;
1102 } else if (current->zprl_children != new->zprl_children) {
1103 vdev_error(gettext(
1104 "mismatched replication level: pool uses %llu-way "
1105 "%s and new vdev uses %llu-way %s\n"),
1106 current->zprl_children, current->zprl_type,
1107 new->zprl_children, new->zprl_type);
1108 ret = -1;
1109 }
1110 }
1111
1112 free(new);
1113 if (current != NULL)
1114 free(current);
1115
1116 return (ret);
1117 }
1118
/*
 * Overwrite the first 4k of the device or file at 'path' with zeros so that
 * stale filesystem or label magic cannot be misidentified later.  Returns 0
 * on success and -1 (with a message printed to stderr) on failure.
 */
static int
zero_label(char *path)
{
	const int size = 4096;
	char zeros[size];
	int nwritten, fd;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(errno));
		return (-1);
	}

	memset(zeros, 0, size);
	nwritten = write(fd, zeros, size);
	(void) fdatasync(fd);
	(void) close(fd);

	if (nwritten == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), size, path, strerror(errno));
		return (-1);
	}

	if (nwritten != size) {
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), nwritten, size, path);
		return (-1);
	}

	return (0);
}
1151
1152 /*
1153 * Go through and find any whole disks in the vdev specification, labelling them
1154 * as appropriate. When constructing the vdev spec, we were unable to open this
1155 * device in order to provide a devid. Now that we have labelled the disk and
1156 * know that slice 0 is valid, we can construct the devid now.
1157 *
1158 * If the disk was already labeled with an EFI label, we will have gotten the
1159 * devid already (because we were able to open the whole disk). Otherwise, we
1160 * need to get the devid after we label the disk.
1161 */
1162 static int
1163 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
1164 {
1165 nvlist_t **child;
1166 uint_t c, children;
1167 char *type, *path;
1168 char devpath[MAXPATHLEN];
1169 char udevpath[MAXPATHLEN];
1170 uint64_t wholedisk;
1171 struct stat64 statbuf;
1172 int is_exclusive = 0;
1173 int fd;
1174 int ret;
1175
1176 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1177
1178 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1179 &child, &children) != 0) {
1180
1181 if (strcmp(type, VDEV_TYPE_DISK) != 0)
1182 return (0);
1183
1184 /*
1185 * We have a disk device. If this is a whole disk write
1186 * out the efi partition table, otherwise write zero's to
1187 * the first 4k of the partition. This is to ensure that
1188 * libblkid will not misidentify the partition due to a
1189 * magic value left by the previous filesystem.
1190 */
1191 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1192 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1193 &wholedisk));
1194
1195 if (!wholedisk) {
1196 (void) zero_label(path);
1197 return (0);
1198 }
1199
1200 if (realpath(path, devpath) == NULL) {
1201 ret = errno;
1202 (void) fprintf(stderr,
1203 gettext("cannot resolve path '%s'\n"), path);
1204 return (ret);
1205 }
1206
1207 /*
1208 * Remove any previously existing symlink from a udev path to
1209 * the device before labeling the disk. This makes
1210 * zpool_label_disk_wait() truly wait for the new link to show
1211 * up instead of returning if it finds an old link still in
1212 * place. Otherwise there is a window between when udev
1213 * deletes and recreates the link during which access attempts
1214 * will fail with ENOENT.
1215 */
1216 strncpy(udevpath, path, MAXPATHLEN);
1217 (void) zfs_append_partition(udevpath, MAXPATHLEN);
1218
1219 fd = open(devpath, O_RDWR|O_EXCL);
1220 if (fd == -1) {
1221 if (errno == EBUSY)
1222 is_exclusive = 1;
1223 } else {
1224 (void) close(fd);
1225 }
1226
1227 /*
1228 * If the partition exists, contains a valid spare label,
1229 * and is opened exclusively there is no need to partition
1230 * it. Hot spares have already been partitioned and are
1231 * held open exclusively by the kernel as a safety measure.
1232 *
1233 * If the provided path is for a /dev/disk/ device its
1234 * symbolic link will be removed, partition table created,
1235 * and then block until udev creates the new link.
1236 */
1237 if (!is_exclusive || !is_spare(NULL, udevpath)) {
1238 ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
1239 if (ret == 0) {
1240 ret = lstat64(udevpath, &statbuf);
1241 if (ret == 0 && S_ISLNK(statbuf.st_mode))
1242 (void) unlink(udevpath);
1243 }
1244
1245 if (zpool_label_disk(g_zfs, zhp,
1246 strrchr(devpath, '/') + 1) == -1)
1247 return (-1);
1248
1249 ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
1250 if (ret) {
1251 (void) fprintf(stderr, gettext("cannot "
1252 "resolve path '%s': %d\n"), udevpath, ret);
1253 return (-1);
1254 }
1255
1256 (void) zero_label(udevpath);
1257 }
1258
1259 /*
1260 * Update the path to refer to the partition. The presence of
1261 * the 'whole_disk' field indicates to the CLI that we should
1262 * chop off the partition number when displaying the device in
1263 * future output.
1264 */
1265 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
1266
1267 return (0);
1268 }
1269
1270 for (c = 0; c < children; c++)
1271 if ((ret = make_disks(zhp, child[c])) != 0)
1272 return (ret);
1273
1274 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1275 &child, &children) == 0)
1276 for (c = 0; c < children; c++)
1277 if ((ret = make_disks(zhp, child[c])) != 0)
1278 return (ret);
1279
1280 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1281 &child, &children) == 0)
1282 for (c = 0; c < children; c++)
1283 if ((ret = make_disks(zhp, child[c])) != 0)
1284 return (ret);
1285
1286 return (0);
1287 }
1288
1289 /*
1290 * Go through and find any devices that are in use. We rely on libdiskmgt for
1291 * the majority of this task.
1292 */
1293 static boolean_t
1294 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1295 boolean_t replacing, boolean_t isspare)
1296 {
1297 nvlist_t **child;
1298 uint_t c, children;
1299 char *type, *path;
1300 int ret = 0;
1301 char buf[MAXPATHLEN];
1302 uint64_t wholedisk = B_FALSE;
1303 boolean_t anyinuse = B_FALSE;
1304
1305 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1306
1307 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1308 &child, &children) != 0) {
1309
1310 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1311 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1312 verify(!nvlist_lookup_uint64(nv,
1313 ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
1314
1315 /*
1316 * As a generic check, we look to see if this is a replace of a
1317 * hot spare within the same pool. If so, we allow it
1318 * regardless of what libblkid or zpool_in_use() says.
1319 */
1320 if (replacing) {
1321 (void) strlcpy(buf, path, sizeof (buf));
1322 if (wholedisk) {
1323 ret = zfs_append_partition(buf, sizeof (buf));
1324 if (ret == -1)
1325 return (-1);
1326 }
1327
1328 if (is_spare(config, buf))
1329 return (B_FALSE);
1330 }
1331
1332 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1333 ret = check_device(path, force, isspare, wholedisk);
1334
1335 else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1336 ret = check_file(path, force, isspare);
1337
1338 return (ret != 0);
1339 }
1340
1341 for (c = 0; c < children; c++)
1342 if (is_device_in_use(config, child[c], force, replacing,
1343 B_FALSE))
1344 anyinuse = B_TRUE;
1345
1346 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1347 &child, &children) == 0)
1348 for (c = 0; c < children; c++)
1349 if (is_device_in_use(config, child[c], force, replacing,
1350 B_TRUE))
1351 anyinuse = B_TRUE;
1352
1353 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1354 &child, &children) == 0)
1355 for (c = 0; c < children; c++)
1356 if (is_device_in_use(config, child[c], force, replacing,
1357 B_FALSE))
1358 anyinuse = B_TRUE;
1359
1360 return (anyinuse);
1361 }
1362
1363 static const char *
1364 is_grouping(const char *type, int *mindev, int *maxdev)
1365 {
1366 if (strncmp(type, "raidz", 5) == 0) {
1367 const char *p = type + 5;
1368 char *end;
1369 long nparity;
1370
1371 if (*p == '\0') {
1372 nparity = 1;
1373 } else if (*p == '0') {
1374 return (NULL); /* no zero prefixes allowed */
1375 } else {
1376 errno = 0;
1377 nparity = strtol(p, &end, 10);
1378 if (errno != 0 || nparity < 1 || nparity >= 255 ||
1379 *end != '\0')
1380 return (NULL);
1381 }
1382
1383 if (mindev != NULL)
1384 *mindev = nparity + 1;
1385 if (maxdev != NULL)
1386 *maxdev = 255;
1387 return (VDEV_TYPE_RAIDZ);
1388 }
1389
1390 if (maxdev != NULL)
1391 *maxdev = INT_MAX;
1392
1393 if (strcmp(type, "mirror") == 0) {
1394 if (mindev != NULL)
1395 *mindev = 2;
1396 return (VDEV_TYPE_MIRROR);
1397 }
1398
1399 if (strcmp(type, "spare") == 0) {
1400 if (mindev != NULL)
1401 *mindev = 1;
1402 return (VDEV_TYPE_SPARE);
1403 }
1404
1405 if (strcmp(type, "log") == 0) {
1406 if (mindev != NULL)
1407 *mindev = 1;
1408 return (VDEV_TYPE_LOG);
1409 }
1410
1411 if (strcmp(type, "cache") == 0) {
1412 if (mindev != NULL)
1413 *mindev = 1;
1414 return (VDEV_TYPE_L2CACHE);
1415 }
1416
1417 return (NULL);
1418 }
1419
/*
 * Construct a syntactically valid vdev specification from the command-line
 * arguments, and ensure that all devices and files exist and can be opened.
 * Returns a newly allocated root nvlist, or NULL (after printing an error)
 * on any invalid specification.
 * Note: we don't bother freeing anything in the error paths
 * because the program is just going to exit anyway.
 */
nvlist_t *
construct_spec(nvlist_t *props, int argc, char **argv)
{
	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
	const char *type;
	uint64_t is_log;
	boolean_t seen_logs;

	top = NULL;
	toplevels = 0;
	spares = NULL;
	l2cache = NULL;
	nspares = 0;
	nlogs = 0;
	nl2cache = 0;
	is_log = B_FALSE;	/* sticky: set by "log", cleared by spare/cache */
	seen_logs = B_FALSE;

	/* each iteration consumes one top-level vdev (or one keyword) */
	while (argc > 0) {
		nv = NULL;

		/*
		 * If it's a mirror or raidz, the subsequent arguments are
		 * its leaves -- until we encounter the next mirror or raidz.
		 */
		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
			nvlist_t **child = NULL;
			int c, children = 0;

			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
				if (spares != NULL) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'spare' can be "
					    "specified only once\n"));
					return (NULL);
				}
				is_log = B_FALSE;
			}

			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
				if (seen_logs) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'log' can be "
					    "specified only once\n"));
					return (NULL);
				}
				seen_logs = B_TRUE;
				is_log = B_TRUE;
				argc--;
				argv++;
				/*
				 * A log is not a real grouping device.
				 * We just set is_log and continue.
				 */
				continue;
			}

			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
				if (l2cache != NULL) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: 'cache' can be "
					    "specified only once\n"));
					return (NULL);
				}
				is_log = B_FALSE;
			}

			/* only mirrors may be used as log devices */
			if (is_log) {
				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
					(void) fprintf(stderr,
					    gettext("invalid vdev "
					    "specification: unsupported 'log' "
					    "device: %s\n"), type);
					return (NULL);
				}
				nlogs++;
			}

			/*
			 * Gather leaf vdevs for this grouping until the next
			 * grouping keyword (or the end of the arguments).
			 */
			for (c = 1; c < argc; c++) {
				if (is_grouping(argv[c], NULL, NULL) != NULL)
					break;
				children++;
				child = realloc(child,
				    children * sizeof (nvlist_t *));
				if (child == NULL)
					zpool_no_memory();
				if ((nv = make_leaf_vdev(props, argv[c],
				    B_FALSE)) == NULL)
					return (NULL);
				child[children - 1] = nv;
			}

			if (children < mindev) {
				(void) fprintf(stderr, gettext("invalid vdev "
				    "specification: %s requires at least %d "
				    "devices\n"), argv[0], mindev);
				return (NULL);
			}

			if (children > maxdev) {
				(void) fprintf(stderr, gettext("invalid vdev "
				    "specification: %s supports no more than "
				    "%d devices\n"), argv[0], maxdev);
				return (NULL);
			}

			/* consume the keyword plus the leaves we gathered */
			argc -= c;
			argv += c;

			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
				/* ownership of 'child' moves to 'spares' */
				spares = child;
				nspares = children;
				continue;
			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
				/* ownership of 'child' moves to 'l2cache' */
				l2cache = child;
				nl2cache = children;
				continue;
			} else {
				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
				    0) == 0);
				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
				    type) == 0);
				verify(nvlist_add_uint64(nv,
				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
				/* mindev for raidzN is nparity + 1 */
				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
					verify(nvlist_add_uint64(nv,
					    ZPOOL_CONFIG_NPARITY,
					    mindev - 1) == 0);
				}
				verify(nvlist_add_nvlist_array(nv,
				    ZPOOL_CONFIG_CHILDREN, child,
				    children) == 0);

				/* the array call copies; release our refs */
				for (c = 0; c < children; c++)
					nvlist_free(child[c]);
				free(child);
			}
		} else {
			/*
			 * We have a device. Pass off to make_leaf_vdev() to
			 * construct the appropriate nvlist describing the vdev.
			 */
			if ((nv = make_leaf_vdev(props, argv[0],
			    is_log)) == NULL)
				return (NULL);
			if (is_log)
				nlogs++;
			argc--;
			argv++;
		}

		toplevels++;
		top = realloc(top, toplevels * sizeof (nvlist_t *));
		if (top == NULL)
			zpool_no_memory();
		top[toplevels - 1] = nv;
	}

	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
		(void) fprintf(stderr, gettext("invalid vdev "
		    "specification: at least one toplevel vdev must be "
		    "specified\n"));
		return (NULL);
	}

	/* a trailing bare "log" keyword with no devices is an error */
	if (seen_logs && nlogs == 0) {
		(void) fprintf(stderr, gettext("invalid vdev specification: "
		    "log requires at least 1 device\n"));
		return (NULL);
	}

	/*
	 * Finally, create nvroot and add all top-level vdevs to it.
	 */
	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    top, toplevels) == 0);
	if (nspares != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, nspares) == 0);
	if (nl2cache != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
		    l2cache, nl2cache) == 0);

	/* the nvlist_add_nvlist_array() calls copy; free our references */
	for (t = 0; t < toplevels; t++)
		nvlist_free(top[t]);
	for (t = 0; t < nspares; t++)
		nvlist_free(spares[t]);
	for (t = 0; t < nl2cache; t++)
		nvlist_free(l2cache[t]);
	if (spares)
		free(spares);
	if (l2cache)
		free(l2cache);
	free(top);

	return (nvroot);
}
1630
1631 nvlist_t *
1632 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1633 splitflags_t flags, int argc, char **argv)
1634 {
1635 nvlist_t *newroot = NULL, **child;
1636 uint_t c, children;
1637
1638 if (argc > 0) {
1639 if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1640 (void) fprintf(stderr, gettext("Unable to build a "
1641 "pool from the specified devices\n"));
1642 return (NULL);
1643 }
1644
1645 if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1646 nvlist_free(newroot);
1647 return (NULL);
1648 }
1649
1650 /* avoid any tricks in the spec */
1651 verify(nvlist_lookup_nvlist_array(newroot,
1652 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1653 for (c = 0; c < children; c++) {
1654 char *path;
1655 const char *type;
1656 int min, max;
1657
1658 verify(nvlist_lookup_string(child[c],
1659 ZPOOL_CONFIG_PATH, &path) == 0);
1660 if ((type = is_grouping(path, &min, &max)) != NULL) {
1661 (void) fprintf(stderr, gettext("Cannot use "
1662 "'%s' as a device for splitting\n"), type);
1663 nvlist_free(newroot);
1664 return (NULL);
1665 }
1666 }
1667 }
1668
1669 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1670 if (newroot != NULL)
1671 nvlist_free(newroot);
1672 return (NULL);
1673 }
1674
1675 return (newroot);
1676 }
1677
1678 /*
1679 * Get and validate the contents of the given vdev specification. This ensures
1680 * that the nvlist returned is well-formed, that all the devices exist, and that
1681 * they are not currently in use by any other known consumer. The 'poolconfig'
1682 * parameter is the current configuration of the pool when adding devices
1683 * existing pool, and is used to perform additional checks, such as changing the
1684 * replication level of the pool. It can be 'NULL' to indicate that this is a
1685 * new pool. The 'force' flag controls whether devices should be forcefully
1686 * added, even if they appear in use.
1687 */
1688 nvlist_t *
1689 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1690 boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1691 {
1692 nvlist_t *newroot;
1693 nvlist_t *poolconfig = NULL;
1694 is_force = force;
1695
1696 /*
1697 * Construct the vdev specification. If this is successful, we know
1698 * that we have a valid specification, and that all devices can be
1699 * opened.
1700 */
1701 if ((newroot = construct_spec(props, argc, argv)) == NULL)
1702 return (NULL);
1703
1704 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
1705 nvlist_free(newroot);
1706 return (NULL);
1707 }
1708
1709 /*
1710 * Validate each device to make sure that its not shared with another
1711 * subsystem. We do this even if 'force' is set, because there are some
1712 * uses (such as a dedicated dump device) that even '-f' cannot
1713 * override.
1714 */
1715 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1716 nvlist_free(newroot);
1717 return (NULL);
1718 }
1719
1720 /*
1721 * Check the replication level of the given vdevs and report any errors
1722 * found. We include the existing pool spec, if any, as we need to
1723 * catch changes against the existing replication level.
1724 */
1725 if (check_rep && check_replication(poolconfig, newroot) != 0) {
1726 nvlist_free(newroot);
1727 return (NULL);
1728 }
1729
1730 /*
1731 * Run through the vdev specification and label any whole disks found.
1732 */
1733 if (!dryrun && make_disks(zhp, newroot) != 0) {
1734 nvlist_free(newroot);
1735 return (NULL);
1736 }
1737
1738 return (newroot);
1739 }