]> git.proxmox.com Git - mirror_zfs-debian.git/blame - cmd/zpool/zpool_vdev.c
Imported Upstream version 0.6.5.7
[mirror_zfs-debian.git] / cmd / zpool / zpool_vdev.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
428870ff 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
e10b0808 24 * Copyright (c) 2013 by Delphix. All rights reserved.
34dc7c2f
BB
25 */
26
34dc7c2f
BB
27/*
28 * Functions to convert between a list of vdevs and an nvlist representing the
29 * configuration. Each entry in the list can be one of:
30 *
31 * Device vdevs
32 * disk=(path=..., devid=...)
33 * file=(path=...)
34 *
35 * Group vdevs
36 * raidz[1|2]=(...)
37 * mirror=(...)
38 *
39 * Hot spares
40 *
41 * While the underlying implementation supports it, group vdevs cannot contain
42 * other group vdevs. All userland verification of devices is contained within
43 * this file. If successful, the nvlist returned can be passed directly to the
44 * kernel; we've done as much verification as possible in userland.
45 *
46 * Hot spares are a special case, and passed down as an array of disk vdevs, at
47 * the same level as the root of the vdev tree.
48 *
49 * The only function exported by this file is 'make_root_vdev'. The
50 * function performs several passes:
51 *
52 * 1. Construct the vdev specification. Performs syntax validation and
53 * makes sure each device is valid.
d603ed6c 54 * 2. Check for devices in use. Using libblkid to make sure that no
34dc7c2f
BB
55 * devices are also in use. Some can be overridden using the 'force'
56 * flag, others cannot.
57 * 3. Check for replication errors if the 'force' flag is not specified.
58 * Validates that the replication level is consistent across the
59 * entire pool.
60 * 4. Call libzfs to label any whole disks with an EFI label.
61 */
62
63#include <assert.h>
d603ed6c 64#include <ctype.h>
34dc7c2f
BB
65#include <devid.h>
66#include <errno.h>
67#include <fcntl.h>
34dc7c2f
BB
68#include <libintl.h>
69#include <libnvpair.h>
45d1cae3 70#include <limits.h>
c06d4368
AX
71#include <scsi/scsi.h>
72#include <scsi/sg.h>
34dc7c2f
BB
73#include <stdio.h>
74#include <string.h>
75#include <unistd.h>
76#include <sys/efi_partition.h>
77#include <sys/stat.h>
78#include <sys/vtoc.h>
79#include <sys/mntent.h>
d603ed6c
BB
80#include <uuid/uuid.h>
81#ifdef HAVE_LIBBLKID
82#include <blkid/blkid.h>
83#else
a08ee875 84#define blkid_cache void *
d603ed6c 85#endif /* HAVE_LIBBLKID */
34dc7c2f
BB
86
87#include "zpool_util.h"
c06d4368 88#include <sys/zfs_context.h>
34dc7c2f 89
34dc7c2f
BB
90/*
91 * For any given vdev specification, we can have multiple errors. The
92 * vdev_error() function keeps track of whether we have seen an error yet, and
93 * prints out a header if it's the first error we've seen.
94 */
95boolean_t error_seen;
96boolean_t is_force;
97
c06d4368
AX
/*
 * One row of the sector-size override table: the identification bytes of
 * a device and the sector size, in bytes, that should be assumed for it.
 */
typedef struct vdev_disk_db_entry {
	char id[24];		/* exactly 24 bytes, no room for a NUL */
	int sector_size;	/* sector size to use, in bytes */
} vdev_disk_db_entry_t;
103
104/*
105 * Database of block devices that lie about physical sector sizes. The
106 * identification string must be precisely 24 characters (it is matched
107 * against bytes 8-31 of the INQUIRY reply) to avoid false negatives.
108 */
109static vdev_disk_db_entry_t vdev_disk_database[] = {
a08ee875
LG
110 {"ATA ADATA SSD S396 3", 8192},
111 {"ATA APPLE SSD SM128E", 8192},
112 {"ATA APPLE SSD SM256E", 8192},
113 {"ATA APPLE SSD SM512E", 8192},
114 {"ATA APPLE SSD SM768E", 8192},
115 {"ATA C400-MTFDDAC064M", 8192},
116 {"ATA C400-MTFDDAC128M", 8192},
117 {"ATA C400-MTFDDAC256M", 8192},
118 {"ATA C400-MTFDDAC512M", 8192},
c06d4368 119 {"ATA Corsair Force 3 ", 8192},
a08ee875 120 {"ATA Corsair Force GS", 8192},
c06d4368 121 {"ATA INTEL SSDSA2CT04", 8192},
a08ee875
LG
122 {"ATA INTEL SSDSA2BZ10", 8192},
123 {"ATA INTEL SSDSA2BZ20", 8192},
124 {"ATA INTEL SSDSA2BZ30", 8192},
125 {"ATA INTEL SSDSA2CW04", 8192},
126 {"ATA INTEL SSDSA2CW08", 8192},
127 {"ATA INTEL SSDSA2CW12", 8192},
c06d4368 128 {"ATA INTEL SSDSA2CW16", 8192},
a08ee875
LG
129 {"ATA INTEL SSDSA2CW30", 8192},
130 {"ATA INTEL SSDSA2CW60", 8192},
a08ee875
LG
131 {"ATA INTEL SSDSC2CT06", 8192},
132 {"ATA INTEL SSDSC2CT12", 8192},
c06d4368 133 {"ATA INTEL SSDSC2CT18", 8192},
a08ee875
LG
134 {"ATA INTEL SSDSC2CT24", 8192},
135 {"ATA INTEL SSDSC2CW06", 8192},
c06d4368 136 {"ATA INTEL SSDSC2CW12", 8192},
a08ee875
LG
137 {"ATA INTEL SSDSC2CW18", 8192},
138 {"ATA INTEL SSDSC2CW24", 8192},
139 {"ATA INTEL SSDSC2CW48", 8192},
c06d4368 140 {"ATA KINGSTON SH100S3", 8192},
a08ee875 141 {"ATA KINGSTON SH103S3", 8192},
c06d4368
AX
142 {"ATA M4-CT064M4SSD2 ", 8192},
143 {"ATA M4-CT128M4SSD2 ", 8192},
144 {"ATA M4-CT256M4SSD2 ", 8192},
145 {"ATA M4-CT512M4SSD2 ", 8192},
146 {"ATA OCZ-AGILITY2 ", 8192},
a08ee875 147 {"ATA OCZ-AGILITY3 ", 8192},
c06d4368
AX
148 {"ATA OCZ-VERTEX2 3.5 ", 8192},
149 {"ATA OCZ-VERTEX3 ", 8192},
150 {"ATA OCZ-VERTEX3 LT ", 8192},
151 {"ATA OCZ-VERTEX3 MI ", 8192},
a08ee875
LG
152 {"ATA OCZ-VERTEX4 ", 8192},
153 {"ATA SAMSUNG MZ7WD120", 8192},
154 {"ATA SAMSUNG MZ7WD240", 8192},
155 {"ATA SAMSUNG MZ7WD480", 8192},
156 {"ATA SAMSUNG MZ7WD960", 8192},
c06d4368
AX
157 {"ATA SAMSUNG SSD 830 ", 8192},
158 {"ATA Samsung SSD 840 ", 8192},
a08ee875
LG
159 {"ATA SanDisk SSD U100", 8192},
160 {"ATA TOSHIBA THNSNH06", 8192},
161 {"ATA TOSHIBA THNSNH12", 8192},
162 {"ATA TOSHIBA THNSNH25", 8192},
163 {"ATA TOSHIBA THNSNH51", 8192},
164 {"ATA APPLE SSD TS064C", 4096},
165 {"ATA APPLE SSD TS128C", 4096},
166 {"ATA APPLE SSD TS256C", 4096},
167 {"ATA APPLE SSD TS512C", 4096},
c06d4368
AX
168 {"ATA INTEL SSDSA2M040", 4096},
169 {"ATA INTEL SSDSA2M080", 4096},
170 {"ATA INTEL SSDSA2M160", 4096},
a08ee875
LG
171 {"ATA INTEL SSDSC2MH12", 4096},
172 {"ATA INTEL SSDSC2MH25", 4096},
173 {"ATA OCZ CORE_SSD ", 4096},
174 {"ATA OCZ-VERTEX ", 4096},
175 {"ATA SAMSUNG MCCOE32G", 4096},
176 {"ATA SAMSUNG MCCOE64G", 4096},
177 {"ATA SAMSUNG SSD PM80", 4096},
ea04106b
AX
178 /* Flash drives optimized for 4KB IOs on larger pages */
179 {"ATA INTEL SSDSC2BA10", 4096},
180 {"ATA INTEL SSDSC2BA20", 4096},
181 {"ATA INTEL SSDSC2BA40", 4096},
182 {"ATA INTEL SSDSC2BA80", 4096},
183 {"ATA INTEL SSDSC2BB08", 4096},
184 {"ATA INTEL SSDSC2BB12", 4096},
185 {"ATA INTEL SSDSC2BB16", 4096},
186 {"ATA INTEL SSDSC2BB24", 4096},
187 {"ATA INTEL SSDSC2BB30", 4096},
188 {"ATA INTEL SSDSC2BB40", 4096},
189 {"ATA INTEL SSDSC2BB48", 4096},
190 {"ATA INTEL SSDSC2BB60", 4096},
191 {"ATA INTEL SSDSC2BB80", 4096},
192 {"ATA INTEL SSDSC2BW24", 4096},
193 {"ATA INTEL SSDSC2BP24", 4096},
194 {"ATA INTEL SSDSC2BP48", 4096},
195 {"NA SmrtStorSDLKAE9W", 4096},
a08ee875 196 /* Imported from Open Solaris */
c06d4368
AX
197 {"ATA MARVELL SD88SA02", 4096},
198 /* Advanced format Hard drives */
199 {"ATA Hitachi HDS5C303", 4096},
200 {"ATA SAMSUNG HD204UI ", 4096},
201 {"ATA ST2000DL004 HD20", 4096},
202 {"ATA WDC WD10EARS-00M", 4096},
203 {"ATA WDC WD10EARS-00S", 4096},
204 {"ATA WDC WD10EARS-00Z", 4096},
205 {"ATA WDC WD15EARS-00M", 4096},
206 {"ATA WDC WD15EARS-00S", 4096},
207 {"ATA WDC WD15EARS-00Z", 4096},
208 {"ATA WDC WD20EARS-00M", 4096},
209 {"ATA WDC WD20EARS-00S", 4096},
210 {"ATA WDC WD20EARS-00Z", 4096},
ea04106b
AX
211 {"ATA WDC WD1600BEVT-0", 4096},
212 {"ATA WDC WD2500BEVT-0", 4096},
213 {"ATA WDC WD3200BEVT-0", 4096},
214 {"ATA WDC WD5000BEVT-0", 4096},
c06d4368
AX
215 /* Virtual disks: Assume zvols with default volblocksize */
216#if 0
217 {"ATA QEMU HARDDISK ", 8192},
218 {"IET VIRTUAL-DISK ", 8192},
219 {"OI COMSTAR ", 8192},
a08ee875
LG
220 {"SUN COMSTAR ", 8192},
221 {"NETAPP LUN ", 8192},
c06d4368
AX
222#endif
223};
224
225static const int vdev_disk_database_size =
226 sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
227
228#define INQ_REPLY_LEN 96
229#define INQ_CMD_LEN 6
230
231static boolean_t
232check_sector_size_database(char *path, int *sector_size)
233{
234 unsigned char inq_buff[INQ_REPLY_LEN];
235 unsigned char sense_buffer[32];
236 unsigned char inq_cmd_blk[INQ_CMD_LEN] =
237 {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
238 sg_io_hdr_t io_hdr;
239 int error;
240 int fd;
241 int i;
242
243 /* Prepare INQUIRY command */
a08ee875 244 memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
c06d4368 245 io_hdr.interface_id = 'S';
a08ee875
LG
246 io_hdr.cmd_len = sizeof (inq_cmd_blk);
247 io_hdr.mx_sb_len = sizeof (sense_buffer);
c06d4368
AX
248 io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
249 io_hdr.dxfer_len = INQ_REPLY_LEN;
250 io_hdr.dxferp = inq_buff;
251 io_hdr.cmdp = inq_cmd_blk;
252 io_hdr.sbp = sense_buffer;
a08ee875 253 io_hdr.timeout = 10; /* 10 milliseconds is ample time */
c06d4368
AX
254
255 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
256 return (B_FALSE);
257
258 error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
259
260 (void) close(fd);
261
262 if (error < 0)
263 return (B_FALSE);
264
265 if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
266 return (B_FALSE);
267
268 for (i = 0; i < vdev_disk_database_size; i++) {
269 if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
270 continue;
271
272 *sector_size = vdev_disk_database[i].sector_size;
273 return (B_TRUE);
274 }
275
276 return (B_FALSE);
277}
278
34dc7c2f
BB
279/*PRINTFLIKE1*/
280static void
281vdev_error(const char *fmt, ...)
282{
283 va_list ap;
284
285 if (!error_seen) {
286 (void) fprintf(stderr, gettext("invalid vdev specification\n"));
287 if (!is_force)
288 (void) fprintf(stderr, gettext("use '-f' to override "
289 "the following errors:\n"));
290 else
291 (void) fprintf(stderr, gettext("the following errors "
292 "must be manually repaired:\n"));
293 error_seen = B_TRUE;
294 }
295
296 va_start(ap, fmt);
297 (void) vfprintf(stderr, fmt, ap);
298 va_end(ap);
299}
300
34dc7c2f
BB
301/*
302 * Check that a file is valid. All we can do in this case is check that it's
303 * not in use by another pool, and not in use by swap.
304 */
305static int
306check_file(const char *file, boolean_t force, boolean_t isspare)
307{
308 char *name;
309 int fd;
310 int ret = 0;
34dc7c2f
BB
311 pool_state_t state;
312 boolean_t inuse;
313
34dc7c2f
BB
314 if ((fd = open(file, O_RDONLY)) < 0)
315 return (0);
316
317 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
318 const char *desc;
319
320 switch (state) {
321 case POOL_STATE_ACTIVE:
322 desc = gettext("active");
323 break;
324
325 case POOL_STATE_EXPORTED:
326 desc = gettext("exported");
327 break;
328
329 case POOL_STATE_POTENTIALLY_ACTIVE:
330 desc = gettext("potentially active");
331 break;
332
333 default:
334 desc = gettext("unknown");
335 break;
336 }
337
338 /*
339 * Allow hot spares to be shared between pools.
340 */
341 if (state == POOL_STATE_SPARE && isspare)
342 return (0);
343
344 if (state == POOL_STATE_ACTIVE ||
345 state == POOL_STATE_SPARE || !force) {
346 switch (state) {
347 case POOL_STATE_SPARE:
348 vdev_error(gettext("%s is reserved as a hot "
349 "spare for pool %s\n"), file, name);
350 break;
351 default:
352 vdev_error(gettext("%s is part of %s pool "
353 "'%s'\n"), file, desc, name);
354 break;
355 }
356 ret = -1;
357 }
358
359 free(name);
360 }
361
362 (void) close(fd);
363 return (ret);
364}
365
d603ed6c
BB
/*
 * Warn on stderr that the device-in-use check could not be performed,
 * appending the strerror() text for 'err'.
 */
static void
check_error(int err)
{
	const char *reason = strerror(err);

	(void) fprintf(stderr,
	    gettext("warning: device in use checking "
	    "failed: %s\n"), reason);
}
372
373static int
374check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
375{
d603ed6c
BB
376 int err;
377#ifdef HAVE_LIBBLKID
378 char *value;
d603ed6c 379
d603ed6c
BB
380 /* No valid type detected device is safe to use */
381 value = blkid_get_tag_value(cache, "TYPE", path);
382 if (value == NULL)
383 return (0);
384
385 /*
386 * If libblkid detects a ZFS device, we check the device
387 * using check_file() to see if it's safe. The one safe
388 * case is a spare device shared between multiple pools.
389 */
ea04106b 390 if (strcmp(value, "zfs_member") == 0) {
d603ed6c
BB
391 err = check_file(path, force, isspare);
392 } else {
393 if (force) {
394 err = 0;
395 } else {
396 err = -1;
397 vdev_error(gettext("%s contains a filesystem of "
a08ee875 398 "type '%s'\n"), path, value);
d603ed6c
BB
399 }
400 }
401
402 free(value);
403#else
404 err = check_file(path, force, isspare);
405#endif /* HAVE_LIBBLKID */
406
407 return (err);
408}
409
410/*
411 * Validate a whole disk. Iterate over all slices on the disk and make sure
412 * that none is in use by calling check_slice().
413 */
414static int
415check_disk(const char *path, blkid_cache cache, int force,
a08ee875 416 boolean_t isspare, boolean_t iswholedisk)
d603ed6c
BB
417{
418 struct dk_gpt *vtoc;
419 char slice_path[MAXPATHLEN];
420 int err = 0;
421 int fd, i;
422
423 /* This is not a wholedisk we only check the given partition */
424 if (!iswholedisk)
a08ee875 425 return (check_slice(path, cache, force, isspare));
d603ed6c
BB
426
427 /*
428 * When the device is a whole disk try to read the efi partition
429 * label. If this is successful we safely check the all of the
430 * partitions. However, when it fails it may simply be because
431 * the disk is partitioned via the MBR. Since we currently can
432 * not easily decode the MBR return a failure and prompt to the
433 * user to use force option since we cannot check the partitions.
434 */
8128bd89 435 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) {
d603ed6c 436 check_error(errno);
a08ee875 437 return (-1);
d603ed6c
BB
438 }
439
440 if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
441 (void) close(fd);
442
443 if (force) {
a08ee875 444 return (0);
d603ed6c
BB
445 } else {
446 vdev_error(gettext("%s does not contain an EFI "
447 "label but it may contain partition\n"
448 "information in the MBR.\n"), path);
a08ee875 449 return (-1);
d603ed6c
BB
450 }
451 }
452
453 /*
454 * The primary efi partition label is damaged however the secondary
455 * label at the end of the device is intact. Rather than use this
456 * label we should play it safe and treat this as a non efi device.
457 */
458 if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
459 efi_free(vtoc);
460 (void) close(fd);
461
462 if (force) {
463 /* Partitions will no be created using the backup */
a08ee875 464 return (0);
d603ed6c
BB
465 } else {
466 vdev_error(gettext("%s contains a corrupt primary "
467 "EFI label.\n"), path);
a08ee875 468 return (-1);
d603ed6c
BB
469 }
470 }
471
472 for (i = 0; i < vtoc->efi_nparts; i++) {
473
474 if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
475 uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
476 continue;
477
478 if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
479 (void) snprintf(slice_path, sizeof (slice_path),
480 "%s%s%d", path, "-part", i+1);
481 else
482 (void) snprintf(slice_path, sizeof (slice_path),
483 "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
484 "p" : "", i+1);
485
486 err = check_slice(slice_path, cache, force, isspare);
487 if (err)
488 break;
489 }
490
491 efi_free(vtoc);
492 (void) close(fd);
493
8128bd89 494 return (err);
d603ed6c
BB
495}
496
497static int
498check_device(const char *path, boolean_t force,
a08ee875 499 boolean_t isspare, boolean_t iswholedisk)
d603ed6c
BB
500{
501 static blkid_cache cache = NULL;
502
503#ifdef HAVE_LIBBLKID
504 /*
505 * There is no easy way to add a correct blkid_put_cache() call,
506 * memory will be reclaimed when the command exits.
507 */
508 if (cache == NULL) {
509 int err;
510
511 if ((err = blkid_get_cache(&cache, NULL)) != 0) {
512 check_error(err);
a08ee875 513 return (-1);
d603ed6c
BB
514 }
515
516 if ((err = blkid_probe_all(cache)) != 0) {
517 blkid_put_cache(cache);
518 check_error(err);
a08ee875 519 return (-1);
d603ed6c
BB
520 }
521 }
522#endif /* HAVE_LIBBLKID */
523
a08ee875 524 return (check_disk(path, cache, force, isspare, iswholedisk));
d603ed6c 525}
34dc7c2f
BB
526
527/*
528 * By "whole disk" we mean an entire physical disk (something we can
529 * label, toggle the write cache on, etc.) as opposed to the full
530 * capacity of a pseudo-device such as lofi or did. We act as if we
531 * are labeling the disk, which should be a pretty good test of whether
532 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
533 * it isn't.
534 */
535static boolean_t
d603ed6c 536is_whole_disk(const char *path)
34dc7c2f
BB
537{
538 struct dk_gpt *label;
a08ee875 539 int fd;
34dc7c2f 540
8128bd89 541 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
34dc7c2f
BB
542 return (B_FALSE);
543 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
544 (void) close(fd);
545 return (B_FALSE);
546 }
547 efi_free(label);
548 (void) close(fd);
549 return (B_TRUE);
550}
551
d603ed6c
BB
552/*
553 * This may be a shorthand device path or it could be total gibberish.
eac47204
BB
554 * Check to see if it is a known device available in zfs_vdev_paths.
555 * As part of this check, see if we've been given an entire disk
556 * (minus the slice number).
d603ed6c
BB
557 */
558static int
559is_shorthand_path(const char *arg, char *path,
a08ee875 560 struct stat64 *statbuf, boolean_t *wholedisk)
d603ed6c 561{
eac47204
BB
562 int error;
563
564 error = zfs_resolve_shortname(arg, path, MAXPATHLEN);
565 if (error == 0) {
d603ed6c 566 *wholedisk = is_whole_disk(path);
79e7242a 567 if (*wholedisk || (stat64(path, statbuf) == 0))
d603ed6c
BB
568 return (0);
569 }
570
a08ee875
LG
571 strlcpy(path, arg, sizeof (path));
572 memset(statbuf, 0, sizeof (*statbuf));
d603ed6c
BB
573 *wholedisk = B_FALSE;
574
eac47204 575 return (error);
d603ed6c
BB
576}
577
8128bd89
BB
578/*
579 * Determine if the given path is a hot spare within the given configuration.
580 * If no configuration is given we rely solely on the label.
581 */
582static boolean_t
583is_spare(nvlist_t *config, const char *path)
584{
585 int fd;
586 pool_state_t state;
587 char *name = NULL;
588 nvlist_t *label;
589 uint64_t guid, spareguid;
590 nvlist_t *nvroot;
591 nvlist_t **spares;
592 uint_t i, nspares;
593 boolean_t inuse;
594
595 if ((fd = open(path, O_RDONLY)) < 0)
596 return (B_FALSE);
597
598 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
599 !inuse ||
600 state != POOL_STATE_SPARE ||
ea04106b 601 zpool_read_label(fd, &label, NULL) != 0) {
8128bd89
BB
602 free(name);
603 (void) close(fd);
604 return (B_FALSE);
605 }
606 free(name);
607 (void) close(fd);
608
609 if (config == NULL)
610 return (B_TRUE);
611
612 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
613 nvlist_free(label);
614
615 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
616 &nvroot) == 0);
617 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
618 &spares, &nspares) == 0) {
619 for (i = 0; i < nspares; i++) {
620 verify(nvlist_lookup_uint64(spares[i],
621 ZPOOL_CONFIG_GUID, &spareguid) == 0);
622 if (spareguid == guid)
623 return (B_TRUE);
624 }
625 }
626
627 return (B_FALSE);
628}
629
34dc7c2f
BB
630/*
631 * Create a leaf vdev. Determine if this is a file or a device. If it's a
632 * device, fill in the device id to make a complete nvlist. Valid forms for a
633 * leaf vdev are:
634 *
eac47204
BB
635 * /dev/xxx Complete disk path
636 * /xxx Full path to file
637 * xxx Shorthand for <zfs_vdev_paths>/xxx
34dc7c2f
BB
638 */
639static nvlist_t *
df30f566 640make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
34dc7c2f
BB
641{
642 char path[MAXPATHLEN];
643 struct stat64 statbuf;
644 nvlist_t *vdev = NULL;
645 char *type = NULL;
646 boolean_t wholedisk = B_FALSE;
c06d4368 647 uint64_t ashift = 0;
d603ed6c 648 int err;
34dc7c2f
BB
649
650 /*
651 * Determine what type of vdev this is, and put the full path into
652 * 'path'. We detect whether this is a device of file afterwards by
653 * checking the st_mode of the file.
654 */
655 if (arg[0] == '/') {
656 /*
657 * Complete device or file path. Exact type is determined by
d603ed6c
BB
658 * examining the file descriptor afterwards. Symbolic links
659 * are resolved to their real paths for the is_whole_disk()
660 * and S_ISBLK/S_ISREG type checks. However, we are careful
661 * to store the given path as ZPOOL_CONFIG_PATH to ensure we
662 * can leverage udev's persistent device labels.
34dc7c2f 663 */
d603ed6c 664 if (realpath(arg, path) == NULL) {
34dc7c2f 665 (void) fprintf(stderr,
d603ed6c 666 gettext("cannot resolve path '%s'\n"), arg);
34dc7c2f
BB
667 return (NULL);
668 }
669
34dc7c2f
BB
670 wholedisk = is_whole_disk(path);
671 if (!wholedisk && (stat64(path, &statbuf) != 0)) {
d603ed6c
BB
672 (void) fprintf(stderr,
673 gettext("cannot open '%s': %s\n"),
674 path, strerror(errno));
675 return (NULL);
676 }
677
678 /* After is_whole_disk() check restore original passed path */
679 strlcpy(path, arg, MAXPATHLEN);
680 } else {
681 err = is_shorthand_path(arg, path, &statbuf, &wholedisk);
682 if (err != 0) {
34dc7c2f
BB
683 /*
684 * If we got ENOENT, then the user gave us
685 * gibberish, so try to direct them with a
686 * reasonable error message. Otherwise,
687 * regurgitate strerror() since it's the best we
688 * can do.
689 */
d603ed6c 690 if (err == ENOENT) {
34dc7c2f
BB
691 (void) fprintf(stderr,
692 gettext("cannot open '%s': no such "
693 "device in %s\n"), arg, DISK_ROOT);
694 (void) fprintf(stderr,
695 gettext("must be a full path or "
696 "shorthand device name\n"));
697 return (NULL);
698 } else {
699 (void) fprintf(stderr,
700 gettext("cannot open '%s': %s\n"),
701 path, strerror(errno));
702 return (NULL);
703 }
704 }
705 }
706
707 /*
708 * Determine whether this is a device or a file.
709 */
710 if (wholedisk || S_ISBLK(statbuf.st_mode)) {
711 type = VDEV_TYPE_DISK;
712 } else if (S_ISREG(statbuf.st_mode)) {
713 type = VDEV_TYPE_FILE;
714 } else {
715 (void) fprintf(stderr, gettext("cannot use '%s': must be a "
716 "block device or regular file\n"), path);
717 return (NULL);
718 }
719
720 /*
721 * Finally, we have the complete device or file, and we know that it is
722 * acceptable to use. Construct the nvlist to describe this vdev. All
723 * vdevs have a 'path' element, and devices also have a 'devid' element.
724 */
725 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
726 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
727 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
728 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
729 if (strcmp(type, VDEV_TYPE_DISK) == 0)
730 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
731 (uint64_t)wholedisk) == 0);
732
c06d4368
AX
733 /*
734 * Override defaults if custom properties are provided.
735 */
df30f566 736 if (props != NULL) {
df30f566
CK
737 char *value = NULL;
738
739 if (nvlist_lookup_string(props,
740 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0)
741 zfs_nicestrtonum(NULL, value, &ashift);
c06d4368 742 }
df30f566 743
c06d4368
AX
744 /*
745 * If the device is known to incorrectly report its physical sector
746 * size explicitly provide the known correct value.
747 */
748 if (ashift == 0) {
749 int sector_size;
750
751 if (check_sector_size_database(path, &sector_size) == B_TRUE)
ea04106b 752 ashift = highbit64(sector_size) - 1;
df30f566
CK
753 }
754
c06d4368
AX
755 if (ashift > 0)
756 nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
757
34dc7c2f
BB
758 return (vdev);
759}
760
761/*
762 * Go through and verify the replication level of the pool is consistent.
763 * Performs the following checks:
764 *
765 * For the new spec, verifies that devices in mirrors and raidz are the
766 * same size.
767 *
768 * If the current configuration already has inconsistent replication
769 * levels, ignore any other potential problems in the new spec.
770 *
771 * Otherwise, make sure that the current spec (if there is one) and the new
772 * spec have consistent replication levels.
773 */
/*
 * Replication level of a top-level vdev: its type string, how many
 * children it has and its parity.
 */
typedef struct replication_level {
	char *zprl_type;		/* vdev type string */
	uint64_t zprl_children;		/* number of children */
	uint64_t zprl_parity;		/* parity, 0 unless raidz */
} replication_level_t;
779
780#define ZPOOL_FUZZ (16 * 1024 * 1024)
781
782/*
783 * Given a list of toplevel vdevs, return the current replication level. If
784 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then
785 * an error message will be displayed for each self-inconsistent vdev.
786 */
787static replication_level_t *
788get_replication(nvlist_t *nvroot, boolean_t fatal)
789{
790 nvlist_t **top;
791 uint_t t, toplevels;
792 nvlist_t **child;
793 uint_t c, children;
794 nvlist_t *nv;
795 char *type;
d4ed6673 796 replication_level_t lastrep = { 0 }, rep, *ret;
34dc7c2f
BB
797 boolean_t dontreport;
798
799 ret = safe_malloc(sizeof (replication_level_t));
800
801 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
802 &top, &toplevels) == 0);
803
804 lastrep.zprl_type = NULL;
805 for (t = 0; t < toplevels; t++) {
806 uint64_t is_log = B_FALSE;
807
808 nv = top[t];
809
810 /*
811 * For separate logs we ignore the top level vdev replication
812 * constraints.
813 */
814 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
815 if (is_log)
816 continue;
817
818 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
819 &type) == 0);
820 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
821 &child, &children) != 0) {
822 /*
823 * This is a 'file' or 'disk' vdev.
824 */
825 rep.zprl_type = type;
826 rep.zprl_children = 1;
827 rep.zprl_parity = 0;
828 } else {
829 uint64_t vdev_size;
830
831 /*
832 * This is a mirror or RAID-Z vdev. Go through and make
833 * sure the contents are all the same (files vs. disks),
834 * keeping track of the number of elements in the
835 * process.
836 *
837 * We also check that the size of each vdev (if it can
838 * be determined) is the same.
839 */
840 rep.zprl_type = type;
841 rep.zprl_children = 0;
842
843 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
844 verify(nvlist_lookup_uint64(nv,
845 ZPOOL_CONFIG_NPARITY,
846 &rep.zprl_parity) == 0);
847 assert(rep.zprl_parity != 0);
848 } else {
849 rep.zprl_parity = 0;
850 }
851
852 /*
853 * The 'dontreport' variable indicates that we've
854 * already reported an error for this spec, so don't
855 * bother doing it again.
856 */
857 type = NULL;
858 dontreport = 0;
859 vdev_size = -1ULL;
860 for (c = 0; c < children; c++) {
861 nvlist_t *cnv = child[c];
862 char *path;
863 struct stat64 statbuf;
864 uint64_t size = -1ULL;
865 char *childtype;
866 int fd, err;
867
868 rep.zprl_children++;
869
870 verify(nvlist_lookup_string(cnv,
871 ZPOOL_CONFIG_TYPE, &childtype) == 0);
872
873 /*
874 * If this is a replacing or spare vdev, then
875 * get the real first child of the vdev.
876 */
877 if (strcmp(childtype,
878 VDEV_TYPE_REPLACING) == 0 ||
879 strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
880 nvlist_t **rchild;
881 uint_t rchildren;
882
883 verify(nvlist_lookup_nvlist_array(cnv,
884 ZPOOL_CONFIG_CHILDREN, &rchild,
885 &rchildren) == 0);
886 assert(rchildren == 2);
887 cnv = rchild[0];
888
889 verify(nvlist_lookup_string(cnv,
890 ZPOOL_CONFIG_TYPE,
891 &childtype) == 0);
892 }
893
894 verify(nvlist_lookup_string(cnv,
895 ZPOOL_CONFIG_PATH, &path) == 0);
896
897 /*
898 * If we have a raidz/mirror that combines disks
899 * with files, report it as an error.
900 */
901 if (!dontreport && type != NULL &&
902 strcmp(type, childtype) != 0) {
903 if (ret != NULL)
904 free(ret);
905 ret = NULL;
906 if (fatal)
907 vdev_error(gettext(
908 "mismatched replication "
909 "level: %s contains both "
910 "files and devices\n"),
911 rep.zprl_type);
912 else
913 return (NULL);
914 dontreport = B_TRUE;
915 }
916
917 /*
918 * According to stat(2), the value of 'st_size'
919 * is undefined for block devices and character
920 * devices. But there is no effective way to
921 * determine the real size in userland.
922 *
923 * Instead, we'll take advantage of an
924 * implementation detail of spec_size(). If the
925 * device is currently open, then we (should)
926 * return a valid size.
927 *
928 * If we still don't get a valid size (indicated
929 * by a size of 0 or MAXOFFSET_T), then ignore
930 * this device altogether.
931 */
932 if ((fd = open(path, O_RDONLY)) >= 0) {
933 err = fstat64(fd, &statbuf);
934 (void) close(fd);
935 } else {
936 err = stat64(path, &statbuf);
937 }
938
939 if (err != 0 ||
940 statbuf.st_size == 0 ||
941 statbuf.st_size == MAXOFFSET_T)
942 continue;
943
944 size = statbuf.st_size;
945
946 /*
947 * Also make sure that devices and
948 * slices have a consistent size. If
949 * they differ by a significant amount
950 * (~16MB) then report an error.
951 */
952 if (!dontreport &&
953 (vdev_size != -1ULL &&
954 (labs(size - vdev_size) >
955 ZPOOL_FUZZ))) {
956 if (ret != NULL)
957 free(ret);
958 ret = NULL;
959 if (fatal)
960 vdev_error(gettext(
961 "%s contains devices of "
962 "different sizes\n"),
963 rep.zprl_type);
964 else
965 return (NULL);
966 dontreport = B_TRUE;
967 }
968
969 type = childtype;
970 vdev_size = size;
971 }
972 }
973
974 /*
975 * At this point, we have the replication of the last toplevel
976 * vdev in 'rep'. Compare it to 'lastrep' to see if its
977 * different.
978 */
979 if (lastrep.zprl_type != NULL) {
980 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
981 if (ret != NULL)
982 free(ret);
983 ret = NULL;
984 if (fatal)
985 vdev_error(gettext(
986 "mismatched replication level: "
987 "both %s and %s vdevs are "
988 "present\n"),
989 lastrep.zprl_type, rep.zprl_type);
990 else
991 return (NULL);
992 } else if (lastrep.zprl_parity != rep.zprl_parity) {
993 if (ret)
994 free(ret);
995 ret = NULL;
996 if (fatal)
997 vdev_error(gettext(
998 "mismatched replication level: "
999 "both %llu and %llu device parity "
1000 "%s vdevs are present\n"),
1001 lastrep.zprl_parity,
1002 rep.zprl_parity,
1003 rep.zprl_type);
1004 else
1005 return (NULL);
1006 } else if (lastrep.zprl_children != rep.zprl_children) {
1007 if (ret)
1008 free(ret);
1009 ret = NULL;
1010 if (fatal)
1011 vdev_error(gettext(
1012 "mismatched replication level: "
1013 "both %llu-way and %llu-way %s "
1014 "vdevs are present\n"),
1015 lastrep.zprl_children,
1016 rep.zprl_children,
1017 rep.zprl_type);
1018 else
1019 return (NULL);
1020 }
1021 }
1022 lastrep = rep;
1023 }
1024
1025 if (ret != NULL)
1026 *ret = rep;
1027
1028 return (ret);
1029}
1030
1031/*
1032 * Check the replication level of the vdev spec against the current pool. Calls
1033 * get_replication() to make sure the new spec is self-consistent. If the pool
1034 * has a consistent replication level, then we ignore any errors. Otherwise,
1035 * report any difference between the two.
1036 */
1037static int
1038check_replication(nvlist_t *config, nvlist_t *newroot)
1039{
1040 nvlist_t **child;
1041 uint_t children;
1042 replication_level_t *current = NULL, *new;
1043 int ret;
1044
1045 /*
1046 * If we have a current pool configuration, check to see if it's
1047 * self-consistent. If not, simply return success.
1048 */
1049 if (config != NULL) {
1050 nvlist_t *nvroot;
1051
1052 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1053 &nvroot) == 0);
1054 if ((current = get_replication(nvroot, B_FALSE)) == NULL)
1055 return (0);
1056 }
1057 /*
1058 * for spares there may be no children, and therefore no
1059 * replication level to check
1060 */
1061 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
1062 &child, &children) != 0) || (children == 0)) {
1063 free(current);
1064 return (0);
1065 }
1066
1067 /*
1068 * If all we have is logs then there's no replication level to check.
1069 */
1070 if (num_logs(newroot) == children) {
1071 free(current);
1072 return (0);
1073 }
1074
1075 /*
1076 * Get the replication level of the new vdev spec, reporting any
1077 * inconsistencies found.
1078 */
1079 if ((new = get_replication(newroot, B_TRUE)) == NULL) {
1080 free(current);
1081 return (-1);
1082 }
1083
1084 /*
1085 * Check to see if the new vdev spec matches the replication level of
1086 * the current pool.
1087 */
1088 ret = 0;
1089 if (current != NULL) {
1090 if (strcmp(current->zprl_type, new->zprl_type) != 0) {
1091 vdev_error(gettext(
1092 "mismatched replication level: pool uses %s "
1093 "and new vdev is %s\n"),
1094 current->zprl_type, new->zprl_type);
1095 ret = -1;
1096 } else if (current->zprl_parity != new->zprl_parity) {
1097 vdev_error(gettext(
1098 "mismatched replication level: pool uses %llu "
1099 "device parity and new vdev uses %llu\n"),
1100 current->zprl_parity, new->zprl_parity);
1101 ret = -1;
1102 } else if (current->zprl_children != new->zprl_children) {
1103 vdev_error(gettext(
1104 "mismatched replication level: pool uses %llu-way "
1105 "%s and new vdev uses %llu-way %s\n"),
1106 current->zprl_children, current->zprl_type,
1107 new->zprl_children, new->zprl_type);
1108 ret = -1;
1109 }
1110 }
1111
1112 free(new);
1113 if (current != NULL)
1114 free(current);
1115
1116 return (ret);
1117}
1118
d603ed6c
BB
/*
 * Overwrite the first 4096 bytes of 'path' with zeros.
 *
 * The target is opened write-only with O_EXCL, a single write() of a zeroed
 * buffer is issued, and the descriptor is synced and closed.  A failed or
 * short write is reported on stderr.
 *
 * Returns 0 if all bytes were written and -1 otherwise.
 */
static int
zero_label(char *path)
{
	enum { ZERO_LABEL_SIZE = 4096 };
	char buf[ZERO_LABEL_SIZE];
	ssize_t written;
	int write_errno;
	int fd;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(errno));
		return (-1);
	}

	memset(buf, 0, sizeof (buf));
	written = write(fd, buf, sizeof (buf));
	/*
	 * Capture errno right away: fdatasync() and close() below may
	 * overwrite it before the write error is reported.
	 */
	write_errno = errno;
	(void) fdatasync(fd);
	(void) close(fd);

	if (written == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), ZERO_LABEL_SIZE, path,
		    strerror(write_errno));
		return (-1);
	}

	if (written != ZERO_LABEL_SIZE) {
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), (int)written, ZERO_LABEL_SIZE, path);
		return (-1);
	}

	return (0);
}
1151
34dc7c2f
BB
1152/*
1153 * Go through and find any whole disks in the vdev specification, labelling them
1154 * as appropriate. When constructing the vdev spec, we were unable to open this
1155 * device in order to provide a devid. Now that we have labelled the disk and
1156 * know that slice 0 is valid, we can construct the devid now.
1157 *
1158 * If the disk was already labeled with an EFI label, we will have gotten the
1159 * devid already (because we were able to open the whole disk). Otherwise, we
1160 * need to get the devid after we label the disk.
1161 */
1162static int
1163make_disks(zpool_handle_t *zhp, nvlist_t *nv)
1164{
1165 nvlist_t **child;
1166 uint_t c, children;
8128bd89 1167 char *type, *path;
d877ac6b
NB
1168 char devpath[MAXPATHLEN];
1169 char udevpath[MAXPATHLEN];
34dc7c2f 1170 uint64_t wholedisk;
d877ac6b 1171 struct stat64 statbuf;
8128bd89
BB
1172 int is_exclusive = 0;
1173 int fd;
34dc7c2f 1174 int ret;
34dc7c2f
BB
1175
1176 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1177
1178 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1179 &child, &children) != 0) {
1180
1181 if (strcmp(type, VDEV_TYPE_DISK) != 0)
1182 return (0);
1183
1184 /*
d603ed6c
BB
1185 * We have a disk device. If this is a whole disk write
1186 * out the efi partition table, otherwise write zero's to
1187 * the first 4k of the partition. This is to ensure that
1188 * libblkid will not misidentify the partition due to a
1189 * magic value left by the previous filesystem.
34dc7c2f 1190 */
d603ed6c
BB
1191 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1192 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1193 &wholedisk));
1194
1195 if (!wholedisk) {
8128bd89
BB
1196 (void) zero_label(path);
1197 return (0);
d603ed6c
BB
1198 }
1199
d877ac6b 1200 if (realpath(path, devpath) == NULL) {
d603ed6c
BB
1201 ret = errno;
1202 (void) fprintf(stderr,
1203 gettext("cannot resolve path '%s'\n"), path);
1204 return (ret);
1205 }
34dc7c2f 1206
d877ac6b
NB
1207 /*
1208 * Remove any previously existing symlink from a udev path to
5eacc075
AX
1209 * the device before labeling the disk. This ensures that
1210 * only newly created links are used. Otherwise there is a
1211 * window between when udev deletes and recreates the link
1212 * during which access attempts will fail with ENOENT.
d877ac6b 1213 */
eac47204
BB
1214 strncpy(udevpath, path, MAXPATHLEN);
1215 (void) zfs_append_partition(udevpath, MAXPATHLEN);
1216
8128bd89
BB
1217 fd = open(devpath, O_RDWR|O_EXCL);
1218 if (fd == -1) {
1219 if (errno == EBUSY)
1220 is_exclusive = 1;
1221 } else {
1222 (void) close(fd);
1223 }
34dc7c2f
BB
1224
1225 /*
8128bd89
BB
1226 * If the partition exists, contains a valid spare label,
1227 * and is opened exclusively there is no need to partition
1228 * it. Hot spares have already been partitioned and are
1229 * held open exclusively by the kernel as a safety measure.
1230 *
1231 * If the provided path is for a /dev/disk/ device its
1232 * symbolic link will be removed, partition table created,
1233 * and then block until udev creates the new link.
34dc7c2f 1234 */
8128bd89 1235 if (!is_exclusive || !is_spare(NULL, udevpath)) {
5eacc075
AX
1236 char *devnode = strrchr(devpath, '/') + 1;
1237
a08ee875 1238 ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
8128bd89
BB
1239 if (ret == 0) {
1240 ret = lstat64(udevpath, &statbuf);
1241 if (ret == 0 && S_ISLNK(statbuf.st_mode))
1242 (void) unlink(udevpath);
1243 }
1244
5eacc075
AX
1245 /*
1246 * When labeling a pool the raw device node name
1247 * is provided as it appears under /dev/.
1248 */
1249 if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
8128bd89
BB
1250 return (-1);
1251
5eacc075
AX
1252 /*
1253 * Wait for udev to signal the device is available
1254 * by the provided path.
1255 */
a08ee875 1256 ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
8128bd89 1257 if (ret) {
5eacc075
AX
1258 (void) fprintf(stderr,
1259 gettext("missing link: %s was "
1260 "partitioned but %s is missing\n"),
1261 devnode, udevpath);
1262 return (ret);
8128bd89
BB
1263 }
1264
5eacc075
AX
1265 ret = zero_label(udevpath);
1266 if (ret)
1267 return (ret);
34dc7c2f
BB
1268 }
1269
34dc7c2f 1270 /*
eac47204 1271 * Update the path to refer to the partition. The presence of
34dc7c2f 1272 * the 'whole_disk' field indicates to the CLI that we should
eac47204 1273 * chop off the partition number when displaying the device in
34dc7c2f
BB
1274 * future output.
1275 */
d877ac6b 1276 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
34dc7c2f 1277
34dc7c2f
BB
1278 return (0);
1279 }
1280
1281 for (c = 0; c < children; c++)
1282 if ((ret = make_disks(zhp, child[c])) != 0)
1283 return (ret);
1284
1285 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1286 &child, &children) == 0)
1287 for (c = 0; c < children; c++)
1288 if ((ret = make_disks(zhp, child[c])) != 0)
1289 return (ret);
1290
1291 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1292 &child, &children) == 0)
1293 for (c = 0; c < children; c++)
1294 if ((ret = make_disks(zhp, child[c])) != 0)
1295 return (ret);
1296
1297 return (0);
1298}
1299
34dc7c2f
BB
1300/*
1301 * Go through and find any devices that are in use. We rely on libdiskmgt for
1302 * the majority of this task.
1303 */
e10b0808
AX
1304static boolean_t
1305is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
428870ff 1306 boolean_t replacing, boolean_t isspare)
34dc7c2f
BB
1307{
1308 nvlist_t **child;
1309 uint_t c, children;
1310 char *type, *path;
d603ed6c 1311 int ret = 0;
34dc7c2f 1312 char buf[MAXPATHLEN];
d603ed6c 1313 uint64_t wholedisk = B_FALSE;
e10b0808 1314 boolean_t anyinuse = B_FALSE;
34dc7c2f
BB
1315
1316 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1317
1318 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1319 &child, &children) != 0) {
1320
d603ed6c
BB
1321 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1322 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1323 verify(!nvlist_lookup_uint64(nv,
a08ee875 1324 ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
34dc7c2f
BB
1325
1326 /*
1327 * As a generic check, we look to see if this is a replace of a
1328 * hot spare within the same pool. If so, we allow it
d603ed6c 1329 * regardless of what libblkid or zpool_in_use() says.
34dc7c2f 1330 */
428870ff 1331 if (replacing) {
8128bd89
BB
1332 (void) strlcpy(buf, path, sizeof (buf));
1333 if (wholedisk) {
1334 ret = zfs_append_partition(buf, sizeof (buf));
1335 if (ret == -1)
1336 return (-1);
1337 }
428870ff 1338
34dc7c2f 1339 if (is_spare(config, buf))
e10b0808 1340 return (B_FALSE);
34dc7c2f
BB
1341 }
1342
1343 if (strcmp(type, VDEV_TYPE_DISK) == 0)
d603ed6c 1344 ret = check_device(path, force, isspare, wholedisk);
34dc7c2f 1345
e10b0808 1346 else if (strcmp(type, VDEV_TYPE_FILE) == 0)
34dc7c2f
BB
1347 ret = check_file(path, force, isspare);
1348
e10b0808 1349 return (ret != 0);
34dc7c2f
BB
1350 }
1351
1352 for (c = 0; c < children; c++)
e10b0808
AX
1353 if (is_device_in_use(config, child[c], force, replacing,
1354 B_FALSE))
1355 anyinuse = B_TRUE;
34dc7c2f
BB
1356
1357 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1358 &child, &children) == 0)
1359 for (c = 0; c < children; c++)
e10b0808
AX
1360 if (is_device_in_use(config, child[c], force, replacing,
1361 B_TRUE))
1362 anyinuse = B_TRUE;
34dc7c2f
BB
1363
1364 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1365 &child, &children) == 0)
1366 for (c = 0; c < children; c++)
e10b0808
AX
1367 if (is_device_in_use(config, child[c], force, replacing,
1368 B_FALSE))
1369 anyinuse = B_TRUE;
34dc7c2f 1370
e10b0808 1371 return (anyinuse);
34dc7c2f
BB
1372}
1373
1374static const char *
45d1cae3 1375is_grouping(const char *type, int *mindev, int *maxdev)
34dc7c2f 1376{
45d1cae3
BB
1377 if (strncmp(type, "raidz", 5) == 0) {
1378 const char *p = type + 5;
1379 char *end;
1380 long nparity;
1381
1382 if (*p == '\0') {
1383 nparity = 1;
1384 } else if (*p == '0') {
1385 return (NULL); /* no zero prefixes allowed */
1386 } else {
1387 errno = 0;
1388 nparity = strtol(p, &end, 10);
1389 if (errno != 0 || nparity < 1 || nparity >= 255 ||
1390 *end != '\0')
1391 return (NULL);
1392 }
34dc7c2f 1393
34dc7c2f 1394 if (mindev != NULL)
45d1cae3
BB
1395 *mindev = nparity + 1;
1396 if (maxdev != NULL)
1397 *maxdev = 255;
34dc7c2f
BB
1398 return (VDEV_TYPE_RAIDZ);
1399 }
1400
45d1cae3
BB
1401 if (maxdev != NULL)
1402 *maxdev = INT_MAX;
1403
34dc7c2f
BB
1404 if (strcmp(type, "mirror") == 0) {
1405 if (mindev != NULL)
1406 *mindev = 2;
1407 return (VDEV_TYPE_MIRROR);
1408 }
1409
1410 if (strcmp(type, "spare") == 0) {
1411 if (mindev != NULL)
1412 *mindev = 1;
1413 return (VDEV_TYPE_SPARE);
1414 }
1415
1416 if (strcmp(type, "log") == 0) {
1417 if (mindev != NULL)
1418 *mindev = 1;
1419 return (VDEV_TYPE_LOG);
1420 }
1421
1422 if (strcmp(type, "cache") == 0) {
1423 if (mindev != NULL)
1424 *mindev = 1;
1425 return (VDEV_TYPE_L2CACHE);
1426 }
1427
1428 return (NULL);
1429}
1430
1431/*
1432 * Construct a syntactically valid vdev specification,
1433 * and ensure that all devices and files exist and can be opened.
1434 * Note: we don't bother freeing anything in the error paths
1435 * because the program is just going to exit anyway.
1436 */
1437nvlist_t *
df30f566 1438construct_spec(nvlist_t *props, int argc, char **argv)
34dc7c2f
BB
1439{
1440 nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
45d1cae3 1441 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
34dc7c2f
BB
1442 const char *type;
1443 uint64_t is_log;
1444 boolean_t seen_logs;
1445
1446 top = NULL;
1447 toplevels = 0;
1448 spares = NULL;
1449 l2cache = NULL;
1450 nspares = 0;
1451 nlogs = 0;
1452 nl2cache = 0;
1453 is_log = B_FALSE;
1454 seen_logs = B_FALSE;
1455
1456 while (argc > 0) {
1457 nv = NULL;
1458
1459 /*
1460 * If it's a mirror or raidz, the subsequent arguments are
1461 * its leaves -- until we encounter the next mirror or raidz.
1462 */
45d1cae3 1463 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
34dc7c2f
BB
1464 nvlist_t **child = NULL;
1465 int c, children = 0;
1466
1467 if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1468 if (spares != NULL) {
1469 (void) fprintf(stderr,
1470 gettext("invalid vdev "
1471 "specification: 'spare' can be "
1472 "specified only once\n"));
1473 return (NULL);
1474 }
1475 is_log = B_FALSE;
1476 }
1477
1478 if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1479 if (seen_logs) {
1480 (void) fprintf(stderr,
1481 gettext("invalid vdev "
1482 "specification: 'log' can be "
1483 "specified only once\n"));
1484 return (NULL);
1485 }
1486 seen_logs = B_TRUE;
1487 is_log = B_TRUE;
1488 argc--;
1489 argv++;
1490 /*
1491 * A log is not a real grouping device.
1492 * We just set is_log and continue.
1493 */
1494 continue;
1495 }
1496
1497 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1498 if (l2cache != NULL) {
1499 (void) fprintf(stderr,
1500 gettext("invalid vdev "
1501 "specification: 'cache' can be "
1502 "specified only once\n"));
1503 return (NULL);
1504 }
1505 is_log = B_FALSE;
1506 }
1507
1508 if (is_log) {
1509 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1510 (void) fprintf(stderr,
1511 gettext("invalid vdev "
1512 "specification: unsupported 'log' "
1513 "device: %s\n"), type);
1514 return (NULL);
1515 }
1516 nlogs++;
1517 }
1518
1519 for (c = 1; c < argc; c++) {
45d1cae3 1520 if (is_grouping(argv[c], NULL, NULL) != NULL)
34dc7c2f
BB
1521 break;
1522 children++;
1523 child = realloc(child,
1524 children * sizeof (nvlist_t *));
1525 if (child == NULL)
1526 zpool_no_memory();
a08ee875
LG
1527 if ((nv = make_leaf_vdev(props, argv[c],
1528 B_FALSE)) == NULL)
34dc7c2f
BB
1529 return (NULL);
1530 child[children - 1] = nv;
1531 }
1532
1533 if (children < mindev) {
1534 (void) fprintf(stderr, gettext("invalid vdev "
1535 "specification: %s requires at least %d "
1536 "devices\n"), argv[0], mindev);
1537 return (NULL);
1538 }
1539
45d1cae3
BB
1540 if (children > maxdev) {
1541 (void) fprintf(stderr, gettext("invalid vdev "
1542 "specification: %s supports no more than "
1543 "%d devices\n"), argv[0], maxdev);
1544 return (NULL);
1545 }
1546
34dc7c2f
BB
1547 argc -= c;
1548 argv += c;
1549
1550 if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1551 spares = child;
1552 nspares = children;
1553 continue;
1554 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1555 l2cache = child;
1556 nl2cache = children;
1557 continue;
1558 } else {
1559 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1560 0) == 0);
1561 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1562 type) == 0);
1563 verify(nvlist_add_uint64(nv,
1564 ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1565 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1566 verify(nvlist_add_uint64(nv,
1567 ZPOOL_CONFIG_NPARITY,
1568 mindev - 1) == 0);
1569 }
1570 verify(nvlist_add_nvlist_array(nv,
1571 ZPOOL_CONFIG_CHILDREN, child,
1572 children) == 0);
1573
1574 for (c = 0; c < children; c++)
1575 nvlist_free(child[c]);
1576 free(child);
1577 }
1578 } else {
1579 /*
1580 * We have a device. Pass off to make_leaf_vdev() to
1581 * construct the appropriate nvlist describing the vdev.
1582 */
a08ee875
LG
1583 if ((nv = make_leaf_vdev(props, argv[0],
1584 is_log)) == NULL)
34dc7c2f
BB
1585 return (NULL);
1586 if (is_log)
1587 nlogs++;
1588 argc--;
1589 argv++;
1590 }
1591
1592 toplevels++;
1593 top = realloc(top, toplevels * sizeof (nvlist_t *));
1594 if (top == NULL)
1595 zpool_no_memory();
1596 top[toplevels - 1] = nv;
1597 }
1598
1599 if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1600 (void) fprintf(stderr, gettext("invalid vdev "
1601 "specification: at least one toplevel vdev must be "
1602 "specified\n"));
1603 return (NULL);
1604 }
1605
1606 if (seen_logs && nlogs == 0) {
1607 (void) fprintf(stderr, gettext("invalid vdev specification: "
1608 "log requires at least 1 device\n"));
1609 return (NULL);
1610 }
1611
1612 /*
1613 * Finally, create nvroot and add all top-level vdevs to it.
1614 */
1615 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1616 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1617 VDEV_TYPE_ROOT) == 0);
1618 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1619 top, toplevels) == 0);
1620 if (nspares != 0)
1621 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1622 spares, nspares) == 0);
1623 if (nl2cache != 0)
1624 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1625 l2cache, nl2cache) == 0);
1626
1627 for (t = 0; t < toplevels; t++)
1628 nvlist_free(top[t]);
1629 for (t = 0; t < nspares; t++)
1630 nvlist_free(spares[t]);
1631 for (t = 0; t < nl2cache; t++)
1632 nvlist_free(l2cache[t]);
1633 if (spares)
1634 free(spares);
1635 if (l2cache)
1636 free(l2cache);
1637 free(top);
1638
1639 return (nvroot);
1640}
1641
428870ff
BB
1642nvlist_t *
1643split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1644 splitflags_t flags, int argc, char **argv)
1645{
1646 nvlist_t *newroot = NULL, **child;
1647 uint_t c, children;
1648
1649 if (argc > 0) {
df30f566 1650 if ((newroot = construct_spec(props, argc, argv)) == NULL) {
428870ff
BB
1651 (void) fprintf(stderr, gettext("Unable to build a "
1652 "pool from the specified devices\n"));
1653 return (NULL);
1654 }
1655
1656 if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1657 nvlist_free(newroot);
1658 return (NULL);
1659 }
1660
1661 /* avoid any tricks in the spec */
1662 verify(nvlist_lookup_nvlist_array(newroot,
1663 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1664 for (c = 0; c < children; c++) {
1665 char *path;
1666 const char *type;
1667 int min, max;
1668
1669 verify(nvlist_lookup_string(child[c],
1670 ZPOOL_CONFIG_PATH, &path) == 0);
1671 if ((type = is_grouping(path, &min, &max)) != NULL) {
1672 (void) fprintf(stderr, gettext("Cannot use "
1673 "'%s' as a device for splitting\n"), type);
1674 nvlist_free(newroot);
1675 return (NULL);
1676 }
1677 }
1678 }
1679
1680 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1681 if (newroot != NULL)
1682 nvlist_free(newroot);
1683 return (NULL);
1684 }
1685
1686 return (newroot);
1687}
34dc7c2f
BB
1688
1689/*
1690 * Get and validate the contents of the given vdev specification. This ensures
1691 * that the nvlist returned is well-formed, that all the devices exist, and that
1692 * they are not currently in use by any other known consumer. The 'poolconfig'
1693 * parameter is the current configuration of the pool when adding devices
1694 * existing pool, and is used to perform additional checks, such as changing the
1695 * replication level of the pool. It can be 'NULL' to indicate that this is a
1696 * new pool. The 'force' flag controls whether devices should be forcefully
1697 * added, even if they appear in use.
1698 */
1699nvlist_t *
df30f566 1700make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
428870ff 1701 boolean_t replacing, boolean_t dryrun, int argc, char **argv)
34dc7c2f
BB
1702{
1703 nvlist_t *newroot;
1704 nvlist_t *poolconfig = NULL;
1705 is_force = force;
1706
1707 /*
1708 * Construct the vdev specification. If this is successful, we know
1709 * that we have a valid specification, and that all devices can be
1710 * opened.
1711 */
df30f566 1712 if ((newroot = construct_spec(props, argc, argv)) == NULL)
34dc7c2f
BB
1713 return (NULL);
1714
e10b0808
AX
1715 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
1716 nvlist_free(newroot);
34dc7c2f 1717 return (NULL);
e10b0808 1718 }
34dc7c2f
BB
1719
1720 /*
1721 * Validate each device to make sure that its not shared with another
1722 * subsystem. We do this even if 'force' is set, because there are some
1723 * uses (such as a dedicated dump device) that even '-f' cannot
1724 * override.
1725 */
e10b0808 1726 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
34dc7c2f
BB
1727 nvlist_free(newroot);
1728 return (NULL);
1729 }
1730
1731 /*
1732 * Check the replication level of the given vdevs and report any errors
1733 * found. We include the existing pool spec, if any, as we need to
1734 * catch changes against the existing replication level.
1735 */
1736 if (check_rep && check_replication(poolconfig, newroot) != 0) {
1737 nvlist_free(newroot);
1738 return (NULL);
1739 }
1740
1741 /*
1742 * Run through the vdev specification and label any whole disks found.
1743 */
b128c09f 1744 if (!dryrun && make_disks(zhp, newroot) != 0) {
34dc7c2f
BB
1745 nvlist_free(newroot);
1746 return (NULL);
1747 }
1748
1749 return (newroot);
1750}