]>
git.proxmox.com Git - mirror_zfs-debian.git/blob - zfs/lib/libzpool/vdev_disk.c
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "@(#)vdev_disk.c 1.15 08/04/09 SMI"
28 #include <sys/zfs_context.h>
30 #include <sys/refcount.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/fs/zfs.h>
35 #include <sys/sunldi.h>
38 * Virtual device vector for disks.
41 extern ldi_ident_t zfs_li
;
43 typedef struct vdev_disk_buf
{
49 vdev_disk_open_common(vdev_t
*vd
)
56 * We must have a pathname, and it must be absolute.
58 if (vd
->vdev_path
== NULL
|| vd
->vdev_path
[0] != '/') {
59 vd
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
63 dvd
= vd
->vdev_tsd
= kmem_zalloc(sizeof (vdev_disk_t
), KM_SLEEP
);
66 * When opening a disk device, we want to preserve the user's original
67 * intent. We always want to open the device by the path the user gave
68 * us, even if it is one of multiple paths to the same device. But we
69 * also want to be able to survive disks being removed/recabled.
70 * Therefore the sequence of opening devices is:
72 * 1. Try opening the device by path. For legacy pools without the
73 * 'whole_disk' property, attempt to fix the path by appending 's0'.
75 * 2. If the devid of the device matches the stored value, return
78 * 3. Otherwise, the device may have moved. Try opening the device
79 * by the devid instead.
82 if (vd
->vdev_devid
!= NULL
) {
83 if (ddi_devid_str_decode(vd
->vdev_devid
, &dvd
->vd_devid
,
84 &dvd
->vd_minor
) != 0) {
85 vd
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
90 error
= EINVAL
; /* presume failure */
92 if (vd
->vdev_path
!= NULL
) {
95 if (vd
->vdev_wholedisk
== -1ULL) {
96 size_t len
= strlen(vd
->vdev_path
) + 3;
97 char *buf
= kmem_alloc(len
, KM_SLEEP
);
100 (void) snprintf(buf
, len
, "%ss0", vd
->vdev_path
);
102 if (ldi_open_by_name(buf
, spa_mode
, kcred
,
104 spa_strfree(vd
->vdev_path
);
106 vd
->vdev_wholedisk
= 1ULL;
107 (void) ldi_close(lh
, spa_mode
, kcred
);
113 error
= ldi_open_by_name(vd
->vdev_path
, spa_mode
, kcred
,
114 &dvd
->vd_lh
, zfs_li
);
117 * Compare the devid to the stored value.
119 if (error
== 0 && vd
->vdev_devid
!= NULL
&&
120 ldi_get_devid(dvd
->vd_lh
, &devid
) == 0) {
121 if (ddi_devid_compare(devid
, dvd
->vd_devid
) != 0) {
123 (void) ldi_close(dvd
->vd_lh
, spa_mode
, kcred
);
126 ddi_devid_free(devid
);
130 * If we succeeded in opening the device, but 'vdev_wholedisk'
131 * is not yet set, then this must be a slice.
133 if (error
== 0 && vd
->vdev_wholedisk
== -1ULL)
134 vd
->vdev_wholedisk
= 0;
138 * If we were unable to open by path, or the devid check fails, open by devid instead.
141 if (error
!= 0 && vd
->vdev_devid
!= NULL
)
142 error
= ldi_open_by_devid(dvd
->vd_devid
, dvd
->vd_minor
,
143 spa_mode
, kcred
, &dvd
->vd_lh
, zfs_li
);
146 * If all else fails, then try opening by physical path (if available)
147 * or the logical path (if we failed due to the devid check). While not
148 * as reliable as the devid, this will give us something, and the higher
149 * level vdev validation will prevent us from opening the wrong device.
152 if (vd
->vdev_physpath
!= NULL
&&
153 (dev
= ddi_pathname_to_dev_t(vd
->vdev_physpath
)) != ENODEV
)
154 error
= ldi_open_by_dev(&dev
, OTYP_BLK
, spa_mode
,
155 kcred
, &dvd
->vd_lh
, zfs_li
);
158 * Note that we don't support the legacy auto-wholedisk support
159 * as above. This hasn't been used in a very long time and we
160 * don't need to propagate its oddities to this edge condition.
162 if (error
&& vd
->vdev_path
!= NULL
)
163 error
= ldi_open_by_name(vd
->vdev_path
, spa_mode
, kcred
,
164 &dvd
->vd_lh
, zfs_li
);
168 vd
->vdev_stat
.vs_aux
= VDEV_AUX_OPEN_FAILED
;
174 vdev_disk_open(vdev_t
*vd
, uint64_t *psize
, uint64_t *ashift
)
182 error
= vdev_disk_open_common(vd
);
188 * Once a device is opened, verify that the physical device path (if
189 * available) is up to date.
191 if (ldi_get_dev(dvd
->vd_lh
, &dev
) == 0 &&
192 ldi_get_otyp(dvd
->vd_lh
, &otyp
) == 0) {
193 char *physpath
, *minorname
;
195 physpath
= kmem_alloc(MAXPATHLEN
, KM_SLEEP
);
197 if (ddi_dev_pathname(dev
, otyp
, physpath
) == 0 &&
198 ldi_get_minor_name(dvd
->vd_lh
, &minorname
) == 0 &&
199 (vd
->vdev_physpath
== NULL
||
200 strcmp(vd
->vdev_physpath
, physpath
) != 0)) {
201 if (vd
->vdev_physpath
)
202 spa_strfree(vd
->vdev_physpath
);
203 (void) strlcat(physpath
, ":", MAXPATHLEN
);
204 (void) strlcat(physpath
, minorname
, MAXPATHLEN
);
205 vd
->vdev_physpath
= spa_strdup(physpath
);
208 kmem_free(minorname
, strlen(minorname
) + 1);
209 kmem_free(physpath
, MAXPATHLEN
);
213 * Determine the actual size of the device.
215 if (ldi_get_size(dvd
->vd_lh
, psize
) != 0) {
216 vd
->vdev_stat
.vs_aux
= VDEV_AUX_OPEN_FAILED
;
221 * If we own the whole disk, try to enable disk write caching.
222 * We ignore errors because it's OK if we can't do it.
224 if (vd
->vdev_wholedisk
== 1) {
226 (void) ldi_ioctl(dvd
->vd_lh
, DKIOCSETWCE
, (intptr_t)&wce
,
227 FKIOCTL
, kcred
, NULL
);
231 * Determine the device's minimum transfer size.
232 * If the ioctl isn't supported, assume DEV_BSIZE.
234 if (ldi_ioctl(dvd
->vd_lh
, DKIOCGMEDIAINFO
, (intptr_t)&dkm
,
235 FKIOCTL
, kcred
, NULL
) != 0)
236 dkm
.dki_lbsize
= DEV_BSIZE
;
238 *ashift
= highbit(MAX(dkm
.dki_lbsize
, SPA_MINBLOCKSIZE
)) - 1;
241 * Clear the nowritecache bit, so that on a vdev_reopen() we will
244 vd
->vdev_nowritecache
= B_FALSE
;
250 vdev_disk_close(vdev_t
*vd
)
252 vdev_disk_t
*dvd
= vd
->vdev_tsd
;
257 if (dvd
->vd_minor
!= NULL
)
258 ddi_devid_str_free(dvd
->vd_minor
);
260 if (dvd
->vd_devid
!= NULL
)
261 ddi_devid_free(dvd
->vd_devid
);
263 if (dvd
->vd_lh
!= NULL
)
264 (void) ldi_close(dvd
->vd_lh
, spa_mode
, kcred
);
266 kmem_free(dvd
, sizeof (vdev_disk_t
));
271 vdev_disk_physio(ldi_handle_t vd_lh
, caddr_t data
, size_t size
,
272 uint64_t offset
, int flags
)
280 ASSERT(flags
& B_READ
|| flags
& B_WRITE
);
282 bp
= getrbuf(KM_SLEEP
);
283 bp
->b_flags
= flags
| B_BUSY
| B_NOCACHE
| B_FAILFAST
;
285 bp
->b_un
.b_addr
= (void *)data
;
286 bp
->b_lblkno
= lbtodb(offset
);
287 bp
->b_bufsize
= size
;
289 error
= ldi_strategy(vd_lh
, bp
);
291 if ((error
= biowait(bp
)) == 0 && bp
->b_resid
!= 0)
299 vdev_disk_probe_io(vdev_t
*vd
, caddr_t data
, size_t size
, uint64_t offset
,
303 vdev_disk_t
*dvd
= vd
->vdev_tsd
;
305 if (vd
== NULL
|| dvd
== NULL
|| dvd
->vd_lh
== NULL
)
308 error
= vdev_disk_physio(dvd
->vd_lh
, data
, size
, offset
, flags
);
310 if (zio_injection_enabled
&& error
== 0)
311 error
= zio_handle_device_injection(vd
, EIO
);
317 * Determine if the underlying device is accessible by reading and writing
318 * to a known location. We must be able to do this during syncing context
319 * and thus we cannot set the vdev state directly.
322 vdev_disk_probe(vdev_t
*vd
)
326 int l
, error
= 0, retries
= 0;
332 /* Hijack the current vdev */
336 * Pick a random label to rewrite.
338 l
= spa_get_random(VDEV_LABELS
);
339 ASSERT(l
< VDEV_LABELS
);
341 offset
= vdev_label_offset(vd
->vdev_psize
, l
,
342 offsetof(vdev_label_t
, vl_pad
));
344 vl_pad
= kmem_alloc(VDEV_SKIP_SIZE
, KM_SLEEP
);
347 * Try to read and write to a special location on the
348 * label. We use the existing vdev initially and only
349 * try to create and reopen it if we encounter a failure.
351 while ((error
= vdev_disk_probe_io(nvd
, vl_pad
, VDEV_SKIP_SIZE
,
352 offset
, B_READ
)) != 0 && retries
== 0) {
354 nvd
= kmem_zalloc(sizeof (vdev_t
), KM_SLEEP
);
356 nvd
->vdev_path
= spa_strdup(vd
->vdev_path
);
357 if (vd
->vdev_physpath
)
358 nvd
->vdev_physpath
= spa_strdup(vd
->vdev_physpath
);
360 nvd
->vdev_devid
= spa_strdup(vd
->vdev_devid
);
361 nvd
->vdev_wholedisk
= vd
->vdev_wholedisk
;
362 nvd
->vdev_guid
= vd
->vdev_guid
;
365 error
= vdev_disk_open_common(nvd
);
371 error
= vdev_disk_probe_io(nvd
, vl_pad
, VDEV_SKIP_SIZE
,
375 /* Clean up if we allocated a new vdev */
377 vdev_disk_close(nvd
);
379 spa_strfree(nvd
->vdev_path
);
380 if (nvd
->vdev_physpath
)
381 spa_strfree(nvd
->vdev_physpath
);
383 spa_strfree(nvd
->vdev_devid
);
384 kmem_free(nvd
, sizeof (vdev_t
));
386 kmem_free(vl_pad
, VDEV_SKIP_SIZE
);
388 /* Reset the failing flag */
390 vd
->vdev_is_failing
= B_FALSE
;
396 vdev_disk_io_intr(buf_t
*bp
)
398 vdev_disk_buf_t
*vdb
= (vdev_disk_buf_t
*)bp
;
399 zio_t
*zio
= vdb
->vdb_io
;
401 if ((zio
->io_error
= geterror(bp
)) == 0 && bp
->b_resid
!= 0)
404 kmem_free(vdb
, sizeof (vdev_disk_buf_t
));
410 vdev_disk_ioctl_done(void *zio_arg
, int error
)
412 zio_t
*zio
= zio_arg
;
414 zio
->io_error
= error
;
420 vdev_disk_io_start(zio_t
*zio
)
422 vdev_t
*vd
= zio
->io_vd
;
423 vdev_disk_t
*dvd
= vd
->vdev_tsd
;
424 vdev_disk_buf_t
*vdb
;
428 if (zio
->io_type
== ZIO_TYPE_IOCTL
) {
429 zio_vdev_io_bypass(zio
);
432 if (!vdev_readable(vd
)) {
433 zio
->io_error
= ENXIO
;
434 return (ZIO_PIPELINE_CONTINUE
);
437 switch (zio
->io_cmd
) {
439 case DKIOCFLUSHWRITECACHE
:
441 if (zfs_nocacheflush
)
444 if (vd
->vdev_nowritecache
) {
445 zio
->io_error
= ENOTSUP
;
449 zio
->io_dk_callback
.dkc_callback
= vdev_disk_ioctl_done
;
450 zio
->io_dk_callback
.dkc_flag
= FLUSH_VOLATILE
;
451 zio
->io_dk_callback
.dkc_cookie
= zio
;
453 error
= ldi_ioctl(dvd
->vd_lh
, zio
->io_cmd
,
454 (uintptr_t)&zio
->io_dk_callback
,
455 FKIOCTL
, kcred
, NULL
);
459 * The ioctl will be done asynchronously,
460 * and will call vdev_disk_ioctl_done()
463 return (ZIO_PIPELINE_STOP
);
466 if (error
== ENOTSUP
|| error
== ENOTTY
) {
468 * If we get ENOTSUP or ENOTTY, we know that
469 * no future attempts will ever succeed.
470 * In this case we set a persistent bit so
471 * that we don't bother with the ioctl in the
474 vd
->vdev_nowritecache
= B_TRUE
;
476 zio
->io_error
= error
;
481 zio
->io_error
= ENOTSUP
;
484 return (ZIO_PIPELINE_CONTINUE
);
487 if (zio
->io_type
== ZIO_TYPE_READ
&& vdev_cache_read(zio
) == 0)
488 return (ZIO_PIPELINE_STOP
);
490 if ((zio
= vdev_queue_io(zio
)) == NULL
)
491 return (ZIO_PIPELINE_STOP
);
493 if (zio
->io_type
== ZIO_TYPE_WRITE
)
494 error
= vdev_writeable(vd
) ? vdev_error_inject(vd
, zio
) : ENXIO
;
496 error
= vdev_readable(vd
) ? vdev_error_inject(vd
, zio
) : ENXIO
;
497 error
= (vd
->vdev_remove_wanted
|| vd
->vdev_is_failing
) ? ENXIO
: error
;
500 zio
->io_error
= error
;
502 return (ZIO_PIPELINE_STOP
);
505 flags
= (zio
->io_type
== ZIO_TYPE_READ
? B_READ
: B_WRITE
);
506 flags
|= B_BUSY
| B_NOCACHE
;
507 if (zio
->io_flags
& ZIO_FLAG_FAILFAST
)
510 vdb
= kmem_alloc(sizeof (vdev_disk_buf_t
), KM_SLEEP
);
517 bp
->b_bcount
= zio
->io_size
;
518 bp
->b_un
.b_addr
= zio
->io_data
;
519 bp
->b_lblkno
= lbtodb(zio
->io_offset
);
520 bp
->b_bufsize
= zio
->io_size
;
521 bp
->b_iodone
= (int (*)())vdev_disk_io_intr
;
523 error
= ldi_strategy(dvd
->vd_lh
, bp
);
524 /* ldi_strategy() will return non-zero only on programming errors */
527 return (ZIO_PIPELINE_STOP
);
531 vdev_disk_io_done(zio_t
*zio
)
533 vdev_queue_io_done(zio
);
535 if (zio
->io_type
== ZIO_TYPE_WRITE
)
536 vdev_cache_write(zio
);
538 if (zio_injection_enabled
&& zio
->io_error
== 0)
539 zio
->io_error
= zio_handle_device_injection(zio
->io_vd
, EIO
);
542 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
543 * the device has been removed. If this is the case, then we trigger an
544 * asynchronous removal of the device. Otherwise, probe the device and
545 * make sure it's still accessible.
547 if (zio
->io_error
== EIO
) {
548 vdev_t
*vd
= zio
->io_vd
;
549 vdev_disk_t
*dvd
= vd
->vdev_tsd
;
553 if (dvd
&& ldi_ioctl(dvd
->vd_lh
, DKIOCSTATE
, (intptr_t)&state
,
554 FKIOCTL
, kcred
, NULL
) == 0 &&
555 state
!= DKIO_INSERTED
) {
556 vd
->vdev_remove_wanted
= B_TRUE
;
557 spa_async_request(zio
->io_spa
, SPA_ASYNC_REMOVE
);
558 } else if (vdev_probe(vd
) != 0) {
559 ASSERT(vd
->vdev_ops
->vdev_op_leaf
);
560 vd
->vdev_is_failing
= B_TRUE
;
564 return (ZIO_PIPELINE_CONTINUE
);
567 vdev_ops_t vdev_disk_ops
= {
575 VDEV_TYPE_DISK
, /* name of this vdev type */
576 B_TRUE
/* leaf vdev */
580 * Given the root disk device pathname, read the label from the device,
581 * and construct a configuration nvlist.
584 vdev_disk_read_rootlabel(char *devpath
)
586 nvlist_t
*config
= NULL
;
593 * Read the device label and build the nvlist.
595 if (ldi_open_by_name(devpath
, FREAD
, kcred
, &vd_lh
, zfs_li
))
598 if (ldi_get_size(vd_lh
, &s
))
601 size
= P2ALIGN_TYPED(s
, sizeof (vdev_label_t
), uint64_t);
602 label
= kmem_alloc(sizeof (vdev_label_t
), KM_SLEEP
);
604 for (l
= 0; l
< VDEV_LABELS
; l
++) {
605 uint64_t offset
, state
, txg
= 0;
607 /* read vdev label */
608 offset
= vdev_label_offset(size
, l
, 0);
609 if (vdev_disk_physio(vd_lh
, (caddr_t
)label
,
610 VDEV_SKIP_SIZE
+ VDEV_BOOT_HEADER_SIZE
+
611 VDEV_PHYS_SIZE
, offset
, B_READ
) != 0)
614 if (nvlist_unpack(label
->vl_vdev_phys
.vp_nvlist
,
615 sizeof (label
->vl_vdev_phys
.vp_nvlist
), &config
, 0) != 0) {
620 if (nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_STATE
,
621 &state
) != 0 || state
>= POOL_STATE_DESTROYED
) {
627 if (nvlist_lookup_uint64(config
, ZPOOL_CONFIG_POOL_TXG
,
628 &txg
) != 0 || txg
== 0) {
637 kmem_free(label
, sizeof (vdev_label_t
));