4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
26 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
29 #include <sys/zfs_context.h>
30 #include <sys/spa_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/vdev_trim.h>
35 #include <sys/fs/zfs.h>
37 #include <linux/mod_compat.h>
38 #include <linux/msdos_fs.h>
39 #include <linux/vfs_compat.h>
41 char *zfs_vdev_scheduler
= VDEV_SCHEDULER
;
42 static void *zfs_vdev_holder
= VDEV_HOLDER
;
/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)
48 * Virtual device vector for disks.
50 typedef struct dio_request
{
51 zio_t
*dr_zio
; /* Parent ZIO */
52 atomic_t dr_ref
; /* References */
53 int dr_error
; /* Bio error */
54 int dr_bio_count
; /* Count of bio's */
55 struct bio
*dr_bio
[0]; /* Attached bio's */
59 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
61 vdev_bdev_mode(int smode
)
65 ASSERT3S(smode
& (FREAD
| FWRITE
), !=, 0);
77 vdev_bdev_mode(int smode
)
81 ASSERT3S(smode
& (FREAD
| FWRITE
), !=, 0);
83 if ((smode
& FREAD
) && !(smode
& FWRITE
))
88 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
91 * Returns the usable capacity (in bytes) for the partition or disk.
94 bdev_capacity(struct block_device
*bdev
)
96 return (i_size_read(bdev
->bd_inode
));
100 * Returns the maximum expansion capacity of the block device (in bytes).
102 * It is possible to expand a vdev when it has been created as a wholedisk
103 * and the containing block device has increased in capacity. Or when the
104 * partition containing the pool has been manually increased in size.
106 * This function is only responsible for calculating the potential expansion
107 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
108 * responsible for verifying the expected partition layout in the wholedisk
109 * case, and updating the partition table if appropriate. Once the partition
110 * size has been increased the additional capacity will be visible using
113 * The returned maximum expansion capacity is always expected to be larger, or
114 * at the very least equal, to its usable capacity to prevent overestimating
115 * the pool expandsize.
118 bdev_max_capacity(struct block_device
*bdev
, uint64_t wholedisk
)
123 if (wholedisk
&& bdev
->bd_part
!= NULL
&& bdev
!= bdev
->bd_contains
) {
125 * When reporting maximum expansion capacity for a wholedisk
126 * deduct any capacity which is expected to be lost due to
127 * alignment restrictions. Over reporting this value isn't
128 * harmful and would only result in slightly less capacity
129 * than expected post expansion.
130 * The estimated available space may be slightly smaller than
131 * bdev_capacity() for devices where the number of sectors is
132 * not a multiple of the alignment size and the partition layout
133 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
134 * "reserved" EFI partition: in such cases return the device
137 available
= i_size_read(bdev
->bd_contains
->bd_inode
) -
138 ((EFI_MIN_RESV_SIZE
+ NEW_START_BLOCK
+
139 PARTITION_END_ALIGNMENT
) << SECTOR_BITS
);
140 psize
= MAX(available
, bdev_capacity(bdev
));
142 psize
= bdev_capacity(bdev
);
149 vdev_disk_error(zio_t
*zio
)
152 * This function can be called in interrupt context, for instance while
153 * handling IRQs coming from a misbehaving disk device; use printk()
154 * which is safe from any context.
156 printk(KERN_WARNING
"zio pool=%s vdev=%s error=%d type=%d "
157 "offset=%llu size=%llu flags=%x\n", spa_name(zio
->io_spa
),
158 zio
->io_vd
->vdev_path
, zio
->io_error
, zio
->io_type
,
159 (u_longlong_t
)zio
->io_offset
, (u_longlong_t
)zio
->io_size
,
164 * Use the Linux 'noop' elevator for zfs managed block devices. This
165 * strikes the ideal balance by allowing the zfs elevator to do all
166 * request ordering and prioritization. While allowing the Linux
167 * elevator to do the maximum front/back merging allowed by the
168 * physical device. This yields the largest possible requests for
169 * the device with the lowest total overhead.
172 vdev_elevator_switch(vdev_t
*v
, char *elevator
)
174 vdev_disk_t
*vd
= v
->vdev_tsd
;
175 struct request_queue
*q
;
179 for (int c
= 0; c
< v
->vdev_children
; c
++)
180 vdev_elevator_switch(v
->vdev_child
[c
], elevator
);
182 if (!v
->vdev_ops
->vdev_op_leaf
|| vd
->vd_bdev
== NULL
)
185 q
= bdev_get_queue(vd
->vd_bdev
);
186 device
= vd
->vd_bdev
->bd_disk
->disk_name
;
189 * Skip devices which are not whole disks (partitions).
190 * Device-mapper devices are excepted since they may be whole
191 * disks despite the vdev_wholedisk flag, in which case we can
192 * and should switch the elevator. If the device-mapper device
193 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
194 * "Skip devices without schedulers" check below will fail.
196 if (!v
->vdev_wholedisk
&& strncmp(device
, "dm-", 3) != 0)
199 /* Leave existing scheduler when set to "none" */
200 if ((strncmp(elevator
, "none", 4) == 0) && (strlen(elevator
) == 4))
204 * The elevator_change() function was available in kernels from
205 * 2.6.36 to 4.11. When not available fall back to using the user
206 * mode helper functionality to set the elevator via sysfs. This
207 * requires /bin/echo and sysfs to be mounted which may not be true
208 * early in the boot process.
210 #ifdef HAVE_ELEVATOR_CHANGE
211 error
= elevator_change(q
, elevator
);
213 #define SET_SCHEDULER_CMD \
214 "exec 0</dev/null " \
215 " 1>/sys/block/%s/queue/scheduler " \
219 char *argv
[] = { "/bin/sh", "-c", NULL
, NULL
};
220 char *envp
[] = { NULL
};
222 argv
[2] = kmem_asprintf(SET_SCHEDULER_CMD
, device
, elevator
);
223 error
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
225 #endif /* HAVE_ELEVATOR_CHANGE */
227 zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
228 elevator
, v
->vdev_path
, device
, error
);
233 vdev_disk_open(vdev_t
*v
, uint64_t *psize
, uint64_t *max_psize
,
236 struct block_device
*bdev
;
237 fmode_t mode
= vdev_bdev_mode(spa_mode(v
->vdev_spa
));
238 int count
= 0, block_size
;
239 int bdev_retry_count
= 50;
242 /* Must have a pathname and it must be absolute. */
243 if (v
->vdev_path
== NULL
|| v
->vdev_path
[0] != '/') {
244 v
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
245 vdev_dbgmsg(v
, "invalid vdev_path");
246 return (SET_ERROR(EINVAL
));
250 * Reopen the device if it is currently open. When expanding a
251 * partition force re-scanning the partition table while closed
252 * in order to get an accurate updated block device size. Then
253 * since udev may need to recreate the device links increase the
254 * open retry count before reporting the device as unavailable.
258 char disk_name
[BDEVNAME_SIZE
+ 6] = "/dev/";
259 boolean_t reread_part
= B_FALSE
;
261 rw_enter(&vd
->vd_lock
, RW_WRITER
);
266 if (v
->vdev_expanding
&& bdev
!= bdev
->bd_contains
) {
267 bdevname(bdev
->bd_contains
, disk_name
+ 5);
268 reread_part
= B_TRUE
;
271 vdev_bdev_close(bdev
, mode
);
275 bdev
= vdev_bdev_open(disk_name
, mode
, zfs_vdev_holder
);
277 int error
= vdev_bdev_reread_part(bdev
);
278 vdev_bdev_close(bdev
, mode
);
280 bdev_retry_count
= 100;
284 vd
= kmem_zalloc(sizeof (vdev_disk_t
), KM_SLEEP
);
286 rw_init(&vd
->vd_lock
, NULL
, RW_DEFAULT
, NULL
);
287 rw_enter(&vd
->vd_lock
, RW_WRITER
);
291 * Devices are always opened by the path provided at configuration
292 * time. This means that if the provided path is a udev by-id path
293 * then drives may be re-cabled without an issue. If the provided
294 * path is a udev by-path path, then the physical location information
295 * will be preserved. This can be critical for more complicated
296 * configurations where drives are located in specific physical
297 * locations to maximize the systems tolerance to component failure.
299 * Alternatively, you can provide your own udev rule to flexibly map
300 * the drives as you see fit. It is not advised that you use the
301 * /dev/[hd]d devices which may be reordered due to probing order.
302 * Devices in the wrong locations will be detected by the higher
303 * level vdev validation.
305 * The specified paths may be briefly removed and recreated in
306 * response to udev events. This should be exceptionally unlikely
307 * because the zpool command makes every effort to verify these paths
308 * have already settled prior to reaching this point. Therefore,
309 * a ENOENT failure at this point is highly likely to be transient
310 * and it is reasonable to sleep and retry before giving up. In
311 * practice delays have been observed to be on the order of 100ms.
313 bdev
= ERR_PTR(-ENXIO
);
314 while (IS_ERR(bdev
) && count
< bdev_retry_count
) {
315 bdev
= vdev_bdev_open(v
->vdev_path
, mode
, zfs_vdev_holder
);
316 if (unlikely(PTR_ERR(bdev
) == -ENOENT
)) {
317 schedule_timeout(MSEC_TO_TICK(10));
319 } else if (IS_ERR(bdev
)) {
325 int error
= -PTR_ERR(bdev
);
326 vdev_dbgmsg(v
, "open error=%d count=%d", error
, count
);
329 rw_exit(&vd
->vd_lock
);
330 return (SET_ERROR(error
));
334 rw_exit(&vd
->vd_lock
);
337 struct request_queue
*q
= bdev_get_queue(vd
->vd_bdev
);
339 /* Determine the physical block size */
340 block_size
= vdev_bdev_block_size(vd
->vd_bdev
);
342 /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
343 v
->vdev_nowritecache
= B_FALSE
;
345 /* Set when device reports it supports TRIM. */
346 v
->vdev_has_trim
= !!blk_queue_discard(q
);
348 /* Set when device reports it supports secure TRIM. */
349 v
->vdev_has_securetrim
= !!blk_queue_discard_secure(q
);
351 /* Inform the ZIO pipeline that we are non-rotational */
352 v
->vdev_nonrot
= blk_queue_nonrot(q
);
354 /* Physical volume size in bytes for the partition */
355 *psize
= bdev_capacity(vd
->vd_bdev
);
357 /* Physical volume size in bytes including possible expansion space */
358 *max_psize
= bdev_max_capacity(vd
->vd_bdev
, v
->vdev_wholedisk
);
360 /* Based on the minimum sector size set the block size */
361 *ashift
= highbit64(MAX(block_size
, SPA_MINBLOCKSIZE
)) - 1;
363 /* Try to set the io scheduler elevator algorithm */
364 (void) vdev_elevator_switch(v
, zfs_vdev_scheduler
);
370 vdev_disk_close(vdev_t
*v
)
372 vdev_disk_t
*vd
= v
->vdev_tsd
;
374 if (v
->vdev_reopening
|| vd
== NULL
)
377 if (vd
->vd_bdev
!= NULL
) {
378 vdev_bdev_close(vd
->vd_bdev
,
379 vdev_bdev_mode(spa_mode(v
->vdev_spa
)));
382 rw_destroy(&vd
->vd_lock
);
383 kmem_free(vd
, sizeof (vdev_disk_t
));
387 static dio_request_t
*
388 vdev_disk_dio_alloc(int bio_count
)
393 dr
= kmem_zalloc(sizeof (dio_request_t
) +
394 sizeof (struct bio
*) * bio_count
, KM_SLEEP
);
396 atomic_set(&dr
->dr_ref
, 0);
397 dr
->dr_bio_count
= bio_count
;
400 for (i
= 0; i
< dr
->dr_bio_count
; i
++)
401 dr
->dr_bio
[i
] = NULL
;
408 vdev_disk_dio_free(dio_request_t
*dr
)
412 for (i
= 0; i
< dr
->dr_bio_count
; i
++)
414 bio_put(dr
->dr_bio
[i
]);
416 kmem_free(dr
, sizeof (dio_request_t
) +
417 sizeof (struct bio
*) * dr
->dr_bio_count
);
421 vdev_disk_dio_get(dio_request_t
*dr
)
423 atomic_inc(&dr
->dr_ref
);
427 vdev_disk_dio_put(dio_request_t
*dr
)
429 int rc
= atomic_dec_return(&dr
->dr_ref
);
432 * Free the dio_request when the last reference is dropped and
433 * ensure zio_interpret is called only once with the correct zio
436 zio_t
*zio
= dr
->dr_zio
;
437 int error
= dr
->dr_error
;
439 vdev_disk_dio_free(dr
);
442 zio
->io_error
= error
;
443 ASSERT3S(zio
->io_error
, >=, 0);
445 vdev_disk_error(zio
);
447 zio_delay_interrupt(zio
);
454 BIO_END_IO_PROTO(vdev_disk_physio_completion
, bio
, error
)
456 dio_request_t
*dr
= bio
->bi_private
;
459 if (dr
->dr_error
== 0) {
460 #ifdef HAVE_1ARG_BIO_END_IO_T
461 dr
->dr_error
= BIO_END_IO_ERROR(bio
);
464 dr
->dr_error
= -(error
);
465 else if (!test_bit(BIO_UPTODATE
, &bio
->bi_flags
))
470 /* Drop reference acquired by __vdev_disk_physio */
471 rc
= vdev_disk_dio_put(dr
);
475 bio_map(struct bio
*bio
, void *bio_ptr
, unsigned int bio_size
)
477 unsigned int offset
, size
, i
;
480 offset
= offset_in_page(bio_ptr
);
481 for (i
= 0; i
< bio
->bi_max_vecs
; i
++) {
482 size
= PAGE_SIZE
- offset
;
490 if (is_vmalloc_addr(bio_ptr
))
491 page
= vmalloc_to_page(bio_ptr
);
493 page
= virt_to_page(bio_ptr
);
496 * Some network related block device uses tcp_sendpage, which
497 * doesn't behave well when using 0-count page, this is a
498 * safety net to catch them.
500 ASSERT3S(page_count(page
), >, 0);
502 if (bio_add_page(bio
, page
, size
, offset
) != size
)
514 bio_map_abd_off(struct bio
*bio
, abd_t
*abd
, unsigned int size
, size_t off
)
516 if (abd_is_linear(abd
))
517 return (bio_map(bio
, ((char *)abd_to_buf(abd
)) + off
, size
));
519 return (abd_scatter_bio_map_off(bio
, abd
, size
, off
));
/* Submit a bio using whichever submit_bio() signature the kernel provides. */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}
532 #ifdef HAVE_BIO_SET_DEV
533 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
535 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
536 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
537 * the entire macro. Provide a minimal version which always assigns the
538 * request queue's root_blkg to the bio.
541 vdev_bio_associate_blkg(struct bio
*bio
)
543 struct request_queue
*q
= bio
->bi_disk
->queue
;
545 ASSERT3P(q
, !=, NULL
);
546 ASSERT3P(bio
->bi_blkg
, ==, NULL
);
548 if (blkg_tryget(q
->root_blkg
))
549 bio
->bi_blkg
= q
->root_blkg
;
551 #define bio_associate_blkg vdev_bio_associate_blkg
555 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
558 bio_set_dev(struct bio
*bio
, struct block_device
*bdev
)
562 #endif /* HAVE_BIO_SET_DEV */
565 vdev_submit_bio(struct bio
*bio
)
567 #ifdef HAVE_CURRENT_BIO_TAIL
568 struct bio
**bio_tail
= current
->bio_tail
;
569 current
->bio_tail
= NULL
;
570 vdev_submit_bio_impl(bio
);
571 current
->bio_tail
= bio_tail
;
573 struct bio_list
*bio_list
= current
->bio_list
;
574 current
->bio_list
= NULL
;
575 vdev_submit_bio_impl(bio
);
576 current
->bio_list
= bio_list
;
581 __vdev_disk_physio(struct block_device
*bdev
, zio_t
*zio
,
582 size_t io_size
, uint64_t io_offset
, int rw
, int flags
)
587 int bio_size
, bio_count
= 16;
588 int i
= 0, error
= 0;
589 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
590 struct blk_plug plug
;
593 * Accessing outside the block device is never allowed.
595 if (io_offset
+ io_size
> bdev
->bd_inode
->i_size
) {
596 vdev_dbgmsg(zio
->io_vd
,
597 "Illegal access %llu size %llu, device size %llu",
598 io_offset
, io_size
, i_size_read(bdev
->bd_inode
));
599 return (SET_ERROR(EIO
));
603 dr
= vdev_disk_dio_alloc(bio_count
);
605 return (SET_ERROR(ENOMEM
));
607 if (zio
&& !(zio
->io_flags
& (ZIO_FLAG_IO_RETRY
| ZIO_FLAG_TRYHARD
)))
608 bio_set_flags_failfast(bdev
, &flags
);
613 * When the IO size exceeds the maximum bio size for the request
614 * queue we are forced to break the IO in multiple bio's and wait
615 * for them all to complete. Ideally, all pool users will set
616 * their volume block size to match the maximum request size and
617 * the common case will be one bio per vdev IO request.
621 bio_offset
= io_offset
;
623 for (i
= 0; i
<= dr
->dr_bio_count
; i
++) {
625 /* Finished constructing bio's for given buffer */
630 * By default only 'bio_count' bio's per dio are allowed.
631 * However, if we find ourselves in a situation where more
632 * are needed we allocate a larger dio and warn the user.
634 if (dr
->dr_bio_count
== i
) {
635 vdev_disk_dio_free(dr
);
640 /* bio_alloc() with __GFP_WAIT never returns NULL */
641 dr
->dr_bio
[i
] = bio_alloc(GFP_NOIO
,
642 MIN(abd_nr_pages_off(zio
->io_abd
, bio_size
, abd_offset
),
644 if (unlikely(dr
->dr_bio
[i
] == NULL
)) {
645 vdev_disk_dio_free(dr
);
646 return (SET_ERROR(ENOMEM
));
649 /* Matching put called by vdev_disk_physio_completion */
650 vdev_disk_dio_get(dr
);
652 bio_set_dev(dr
->dr_bio
[i
], bdev
);
653 BIO_BI_SECTOR(dr
->dr_bio
[i
]) = bio_offset
>> 9;
654 dr
->dr_bio
[i
]->bi_end_io
= vdev_disk_physio_completion
;
655 dr
->dr_bio
[i
]->bi_private
= dr
;
656 bio_set_op_attrs(dr
->dr_bio
[i
], rw
, flags
);
658 /* Remaining size is returned to become the new size */
659 bio_size
= bio_map_abd_off(dr
->dr_bio
[i
], zio
->io_abd
,
660 bio_size
, abd_offset
);
662 /* Advance in buffer and construct another bio if needed */
663 abd_offset
+= BIO_BI_SIZE(dr
->dr_bio
[i
]);
664 bio_offset
+= BIO_BI_SIZE(dr
->dr_bio
[i
]);
667 /* Extra reference to protect dio_request during vdev_submit_bio */
668 vdev_disk_dio_get(dr
);
670 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
671 if (dr
->dr_bio_count
> 1)
672 blk_start_plug(&plug
);
675 /* Submit all bio's associated with this dio */
676 for (i
= 0; i
< dr
->dr_bio_count
; i
++)
678 vdev_submit_bio(dr
->dr_bio
[i
]);
680 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
681 if (dr
->dr_bio_count
> 1)
682 blk_finish_plug(&plug
);
685 (void) vdev_disk_dio_put(dr
);
690 BIO_END_IO_PROTO(vdev_disk_io_flush_completion
, bio
, error
)
692 zio_t
*zio
= bio
->bi_private
;
693 #ifdef HAVE_1ARG_BIO_END_IO_T
694 zio
->io_error
= BIO_END_IO_ERROR(bio
);
696 zio
->io_error
= -error
;
699 if (zio
->io_error
&& (zio
->io_error
== EOPNOTSUPP
))
700 zio
->io_vd
->vdev_nowritecache
= B_TRUE
;
703 ASSERT3S(zio
->io_error
, >=, 0);
705 vdev_disk_error(zio
);
710 vdev_disk_io_flush(struct block_device
*bdev
, zio_t
*zio
)
712 struct request_queue
*q
;
715 q
= bdev_get_queue(bdev
);
717 return (SET_ERROR(ENXIO
));
719 bio
= bio_alloc(GFP_NOIO
, 0);
720 /* bio_alloc() with __GFP_WAIT never returns NULL */
721 if (unlikely(bio
== NULL
))
722 return (SET_ERROR(ENOMEM
));
724 bio
->bi_end_io
= vdev_disk_io_flush_completion
;
725 bio
->bi_private
= zio
;
726 bio_set_dev(bio
, bdev
);
728 vdev_submit_bio(bio
);
729 invalidate_bdev(bdev
);
735 vdev_disk_io_start(zio_t
*zio
)
737 vdev_t
*v
= zio
->io_vd
;
738 vdev_disk_t
*vd
= v
->vdev_tsd
;
739 unsigned long trim_flags
= 0;
740 int rw
, flags
, error
;
743 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
744 * Nothing to be done here but return failure.
747 zio
->io_error
= ENXIO
;
752 rw_enter(&vd
->vd_lock
, RW_READER
);
755 * If the vdev is closed, it's likely due to a failed reopen and is
756 * in the UNAVAIL state. Nothing to be done here but return failure.
758 if (vd
->vd_bdev
== NULL
) {
759 rw_exit(&vd
->vd_lock
);
760 zio
->io_error
= ENXIO
;
765 switch (zio
->io_type
) {
768 if (!vdev_readable(v
)) {
769 rw_exit(&vd
->vd_lock
);
770 zio
->io_error
= SET_ERROR(ENXIO
);
775 switch (zio
->io_cmd
) {
776 case DKIOCFLUSHWRITECACHE
:
778 if (zfs_nocacheflush
)
781 if (v
->vdev_nowritecache
) {
782 zio
->io_error
= SET_ERROR(ENOTSUP
);
786 error
= vdev_disk_io_flush(vd
->vd_bdev
, zio
);
788 rw_exit(&vd
->vd_lock
);
792 zio
->io_error
= error
;
797 zio
->io_error
= SET_ERROR(ENOTSUP
);
800 rw_exit(&vd
->vd_lock
);
805 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
806 flags
= (1 << BIO_RW_UNPLUG
);
807 #elif defined(REQ_UNPLUG)
816 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
817 flags
= (1 << BIO_RW_UNPLUG
);
818 #elif defined(REQ_UNPLUG)
826 #if defined(BLKDEV_DISCARD_SECURE)
827 if (zio
->io_trim_flags
& ZIO_TRIM_SECURE
)
828 trim_flags
|= BLKDEV_DISCARD_SECURE
;
830 zio
->io_error
= -blkdev_issue_discard(vd
->vd_bdev
,
831 zio
->io_offset
>> 9, zio
->io_size
>> 9, GFP_NOFS
,
834 rw_exit(&vd
->vd_lock
);
839 rw_exit(&vd
->vd_lock
);
840 zio
->io_error
= SET_ERROR(ENOTSUP
);
845 zio
->io_target_timestamp
= zio_handle_io_delay(zio
);
846 error
= __vdev_disk_physio(vd
->vd_bdev
, zio
,
847 zio
->io_size
, zio
->io_offset
, rw
, flags
);
848 rw_exit(&vd
->vd_lock
);
851 zio
->io_error
= error
;
858 vdev_disk_io_done(zio_t
*zio
)
861 * If the device returned EIO, we revalidate the media. If it is
862 * determined the media has changed this triggers the asynchronous
863 * removal of the device from the configuration.
865 if (zio
->io_error
== EIO
) {
866 vdev_t
*v
= zio
->io_vd
;
867 vdev_disk_t
*vd
= v
->vdev_tsd
;
869 if (check_disk_change(vd
->vd_bdev
)) {
870 vdev_bdev_invalidate(vd
->vd_bdev
);
871 v
->vdev_remove_wanted
= B_TRUE
;
872 spa_async_request(zio
->io_spa
, SPA_ASYNC_REMOVE
);
878 vdev_disk_hold(vdev_t
*vd
)
880 ASSERT(spa_config_held(vd
->vdev_spa
, SCL_STATE
, RW_WRITER
));
882 /* We must have a pathname, and it must be absolute. */
883 if (vd
->vdev_path
== NULL
|| vd
->vdev_path
[0] != '/')
887 * Only prefetch path and devid info if the device has
890 if (vd
->vdev_tsd
!= NULL
)
893 /* XXX: Implement me as a vnode lookup for the device */
894 vd
->vdev_name_vp
= NULL
;
895 vd
->vdev_devid_vp
= NULL
;
899 vdev_disk_rele(vdev_t
*vd
)
901 ASSERT(spa_config_held(vd
->vdev_spa
, SCL_STATE
, RW_WRITER
));
903 /* XXX: Implement me as a vnode rele for the device */
907 param_set_vdev_scheduler(const char *val
, zfs_kernel_param_t
*kp
)
913 return (SET_ERROR(-EINVAL
));
915 if ((p
= strchr(val
, '\n')) != NULL
)
918 if (spa_mode_global
!= 0) {
919 mutex_enter(&spa_namespace_lock
);
920 while ((spa
= spa_next(spa
)) != NULL
) {
921 if (spa_state(spa
) != POOL_STATE_ACTIVE
||
922 !spa_writeable(spa
) || spa_suspended(spa
))
925 spa_open_ref(spa
, FTAG
);
926 mutex_exit(&spa_namespace_lock
);
927 vdev_elevator_switch(spa
->spa_root_vdev
, (char *)val
);
928 mutex_enter(&spa_namespace_lock
);
929 spa_close(spa
, FTAG
);
931 mutex_exit(&spa_namespace_lock
);
934 return (param_set_charp(val
, kp
));
937 vdev_ops_t vdev_disk_ops
= {
949 VDEV_TYPE_DISK
, /* name of this vdev type */
950 B_TRUE
/* leaf vdev */
953 module_param_call(zfs_vdev_scheduler
, param_set_vdev_scheduler
,
954 param_get_charp
, &zfs_vdev_scheduler
, 0644);
955 MODULE_PARM_DESC(zfs_vdev_scheduler
, "I/O scheduler");