4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
26 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
29 #include <sys/zfs_context.h>
30 #include <sys/spa_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/vdev_trim.h>
35 #include <sys/fs/zfs.h>
37 #include <linux/mod_compat.h>
38 #include <linux/msdos_fs.h>
39 #include <linux/vfs_compat.h>
/*
 * Requested I/O elevator for whole-disk vdevs; applied by
 * vdev_elevator_switch() and exposed as a module parameter below.
 */
char *zfs_vdev_scheduler = VDEV_SCHEDULER;

/* Holder token identifying ZFS to vdev_bdev_open()/vdev_bdev_close(). */
static void *zfs_vdev_holder = VDEV_HOLDER;
/* size of the "reserved" partition, in blocks */
/* Used by bdev_max_capacity() when estimating wholedisk expansion headroom. */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)
48 * Virtual device vector for disks.
50 typedef struct dio_request
{
51 zio_t
*dr_zio
; /* Parent ZIO */
52 atomic_t dr_ref
; /* References */
53 int dr_error
; /* Bio error */
54 int dr_bio_count
; /* Count of bio's */
55 struct bio
*dr_bio
[0]; /* Attached bio's */
59 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
61 vdev_bdev_mode(int smode
)
65 ASSERT3S(smode
& (FREAD
| FWRITE
), !=, 0);
77 vdev_bdev_mode(int smode
)
81 ASSERT3S(smode
& (FREAD
| FWRITE
), !=, 0);
83 if ((smode
& FREAD
) && !(smode
& FWRITE
))
88 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
91 * Returns the usable capacity (in bytes) for the partition or disk.
94 bdev_capacity(struct block_device
*bdev
)
96 return (i_size_read(bdev
->bd_inode
));
100 * Returns the maximum expansion capacity of the block device (in bytes).
102 * It is possible to expand a vdev when it has been created as a wholedisk
103 * and the containing block device has increased in capacity. Or when the
104 * partition containing the pool has been manually increased in size.
106 * This function is only responsible for calculating the potential expansion
107 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
108 * responsible for verifying the expected partition layout in the wholedisk
109 * case, and updating the partition table if appropriate. Once the partition
110 * size has been increased the additional capacity will be visible using
113 * The returned maximum expansion capacity is always expected to be larger, or
114 * at the very least equal, to its usable capacity to prevent overestimating
115 * the pool expandsize.
118 bdev_max_capacity(struct block_device
*bdev
, uint64_t wholedisk
)
123 if (wholedisk
&& bdev
->bd_part
!= NULL
&& bdev
!= bdev
->bd_contains
) {
125 * When reporting maximum expansion capacity for a wholedisk
126 * deduct any capacity which is expected to be lost due to
127 * alignment restrictions. Over reporting this value isn't
128 * harmful and would only result in slightly less capacity
129 * than expected post expansion.
130 * The estimated available space may be slightly smaller than
131 * bdev_capacity() for devices where the number of sectors is
132 * not a multiple of the alignment size and the partition layout
133 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
134 * "reserved" EFI partition: in such cases return the device
137 available
= i_size_read(bdev
->bd_contains
->bd_inode
) -
138 ((EFI_MIN_RESV_SIZE
+ NEW_START_BLOCK
+
139 PARTITION_END_ALIGNMENT
) << SECTOR_BITS
);
140 psize
= MAX(available
, bdev_capacity(bdev
));
142 psize
= bdev_capacity(bdev
);
149 vdev_disk_error(zio_t
*zio
)
152 * This function can be called in interrupt context, for instance while
153 * handling IRQs coming from a misbehaving disk device; use printk()
154 * which is safe from any context.
156 printk(KERN_WARNING
"zio pool=%s vdev=%s error=%d type=%d "
157 "offset=%llu size=%llu flags=%x\n", spa_name(zio
->io_spa
),
158 zio
->io_vd
->vdev_path
, zio
->io_error
, zio
->io_type
,
159 (u_longlong_t
)zio
->io_offset
, (u_longlong_t
)zio
->io_size
,
164 * Use the Linux 'noop' elevator for zfs managed block devices. This
165 * strikes the ideal balance by allowing the zfs elevator to do all
166 * request ordering and prioritization. While allowing the Linux
167 * elevator to do the maximum front/back merging allowed by the
168 * physical device. This yields the largest possible requests for
169 * the device with the lowest total overhead.
172 vdev_elevator_switch(vdev_t
*v
, char *elevator
)
174 vdev_disk_t
*vd
= v
->vdev_tsd
;
175 struct request_queue
*q
;
179 for (int c
= 0; c
< v
->vdev_children
; c
++)
180 vdev_elevator_switch(v
->vdev_child
[c
], elevator
);
182 if (!v
->vdev_ops
->vdev_op_leaf
|| vd
->vd_bdev
== NULL
)
185 q
= bdev_get_queue(vd
->vd_bdev
);
186 device
= vd
->vd_bdev
->bd_disk
->disk_name
;
189 * Skip devices which are not whole disks (partitions).
190 * Device-mapper devices are excepted since they may be whole
191 * disks despite the vdev_wholedisk flag, in which case we can
192 * and should switch the elevator. If the device-mapper device
193 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
194 * "Skip devices without schedulers" check below will fail.
196 if (!v
->vdev_wholedisk
&& strncmp(device
, "dm-", 3) != 0)
199 /* Leave existing scheduler when set to "none" */
200 if ((strncmp(elevator
, "none", 4) == 0) && (strlen(elevator
) == 4))
204 * The elevator_change() function was available in kernels from
205 * 2.6.36 to 4.11. When not available fall back to using the user
206 * mode helper functionality to set the elevator via sysfs. This
207 * requires /bin/echo and sysfs to be mounted which may not be true
208 * early in the boot process.
210 #ifdef HAVE_ELEVATOR_CHANGE
211 error
= elevator_change(q
, elevator
);
213 #define SET_SCHEDULER_CMD \
214 "exec 0</dev/null " \
215 " 1>/sys/block/%s/queue/scheduler " \
219 char *argv
[] = { "/bin/sh", "-c", NULL
, NULL
};
220 char *envp
[] = { NULL
};
222 argv
[2] = kmem_asprintf(SET_SCHEDULER_CMD
, device
, elevator
);
223 error
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
225 #endif /* HAVE_ELEVATOR_CHANGE */
227 zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
228 elevator
, v
->vdev_path
, device
, error
);
233 vdev_disk_open(vdev_t
*v
, uint64_t *psize
, uint64_t *max_psize
,
236 struct block_device
*bdev
;
237 fmode_t mode
= vdev_bdev_mode(spa_mode(v
->vdev_spa
));
238 int count
= 0, block_size
;
239 int bdev_retry_count
= 50;
242 /* Must have a pathname and it must be absolute. */
243 if (v
->vdev_path
== NULL
|| v
->vdev_path
[0] != '/') {
244 v
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
245 vdev_dbgmsg(v
, "invalid vdev_path");
246 return (SET_ERROR(EINVAL
));
250 * Reopen the device if it is currently open. When expanding a
251 * partition force re-scanning the partition table while closed
252 * in order to get an accurate updated block device size. Then
253 * since udev may need to recreate the device links increase the
254 * open retry count before reporting the device as unavailable.
258 char disk_name
[BDEVNAME_SIZE
+ 6] = "/dev/";
259 boolean_t reread_part
= B_FALSE
;
261 rw_enter(&vd
->vd_lock
, RW_WRITER
);
266 if (v
->vdev_expanding
&& bdev
!= bdev
->bd_contains
) {
267 bdevname(bdev
->bd_contains
, disk_name
+ 5);
268 reread_part
= B_TRUE
;
271 vdev_bdev_close(bdev
, mode
);
275 bdev
= vdev_bdev_open(disk_name
, mode
, zfs_vdev_holder
);
277 int error
= vdev_bdev_reread_part(bdev
);
278 vdev_bdev_close(bdev
, mode
);
280 bdev_retry_count
= 100;
284 vd
= kmem_zalloc(sizeof (vdev_disk_t
), KM_SLEEP
);
286 rw_init(&vd
->vd_lock
, NULL
, RW_DEFAULT
, NULL
);
287 rw_enter(&vd
->vd_lock
, RW_WRITER
);
291 * Devices are always opened by the path provided at configuration
292 * time. This means that if the provided path is a udev by-id path
293 * then drives may be re-cabled without an issue. If the provided
294 * path is a udev by-path path, then the physical location information
295 * will be preserved. This can be critical for more complicated
296 * configurations where drives are located in specific physical
297 * locations to maximize the systems tolerance to component failure.
299 * Alternatively, you can provide your own udev rule to flexibly map
300 * the drives as you see fit. It is not advised that you use the
301 * /dev/[hd]d devices which may be reordered due to probing order.
302 * Devices in the wrong locations will be detected by the higher
303 * level vdev validation.
305 * The specified paths may be briefly removed and recreated in
306 * response to udev events. This should be exceptionally unlikely
307 * because the zpool command makes every effort to verify these paths
308 * have already settled prior to reaching this point. Therefore,
309 * a ENOENT failure at this point is highly likely to be transient
310 * and it is reasonable to sleep and retry before giving up. In
311 * practice delays have been observed to be on the order of 100ms.
313 bdev
= ERR_PTR(-ENXIO
);
314 while (IS_ERR(bdev
) && count
< bdev_retry_count
) {
315 bdev
= vdev_bdev_open(v
->vdev_path
, mode
, zfs_vdev_holder
);
316 if (unlikely(PTR_ERR(bdev
) == -ENOENT
)) {
317 schedule_timeout(MSEC_TO_TICK(10));
319 } else if (IS_ERR(bdev
)) {
325 int error
= -PTR_ERR(bdev
);
326 vdev_dbgmsg(v
, "open error=%d count=%d", error
, count
);
329 rw_exit(&vd
->vd_lock
);
330 return (SET_ERROR(error
));
334 rw_exit(&vd
->vd_lock
);
337 struct request_queue
*q
= bdev_get_queue(vd
->vd_bdev
);
339 /* Determine the physical block size */
340 block_size
= vdev_bdev_block_size(vd
->vd_bdev
);
342 /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
343 v
->vdev_nowritecache
= B_FALSE
;
345 /* Set when device reports it supports TRIM. */
346 v
->vdev_has_trim
= !!blk_queue_discard(q
);
348 /* Set when device reports it supports secure TRIM. */
349 v
->vdev_has_securetrim
= !!blk_queue_discard_secure(q
);
351 /* Inform the ZIO pipeline that we are non-rotational */
352 v
->vdev_nonrot
= blk_queue_nonrot(q
);
354 /* Physical volume size in bytes for the partition */
355 *psize
= bdev_capacity(vd
->vd_bdev
);
357 /* Physical volume size in bytes including possible expansion space */
358 *max_psize
= bdev_max_capacity(vd
->vd_bdev
, v
->vdev_wholedisk
);
360 /* Based on the minimum sector size set the block size */
361 *ashift
= highbit64(MAX(block_size
, SPA_MINBLOCKSIZE
)) - 1;
363 /* Try to set the io scheduler elevator algorithm */
364 (void) vdev_elevator_switch(v
, zfs_vdev_scheduler
);
370 vdev_disk_close(vdev_t
*v
)
372 vdev_disk_t
*vd
= v
->vdev_tsd
;
374 if (v
->vdev_reopening
|| vd
== NULL
)
377 if (vd
->vd_bdev
!= NULL
) {
378 vdev_bdev_close(vd
->vd_bdev
,
379 vdev_bdev_mode(spa_mode(v
->vdev_spa
)));
382 rw_destroy(&vd
->vd_lock
);
383 kmem_free(vd
, sizeof (vdev_disk_t
));
387 static dio_request_t
*
388 vdev_disk_dio_alloc(int bio_count
)
393 dr
= kmem_zalloc(sizeof (dio_request_t
) +
394 sizeof (struct bio
*) * bio_count
, KM_SLEEP
);
396 atomic_set(&dr
->dr_ref
, 0);
397 dr
->dr_bio_count
= bio_count
;
400 for (i
= 0; i
< dr
->dr_bio_count
; i
++)
401 dr
->dr_bio
[i
] = NULL
;
408 vdev_disk_dio_free(dio_request_t
*dr
)
412 for (i
= 0; i
< dr
->dr_bio_count
; i
++)
414 bio_put(dr
->dr_bio
[i
]);
416 kmem_free(dr
, sizeof (dio_request_t
) +
417 sizeof (struct bio
*) * dr
->dr_bio_count
);
421 vdev_disk_dio_get(dio_request_t
*dr
)
423 atomic_inc(&dr
->dr_ref
);
427 vdev_disk_dio_put(dio_request_t
*dr
)
429 int rc
= atomic_dec_return(&dr
->dr_ref
);
432 * Free the dio_request when the last reference is dropped and
433 * ensure zio_interpret is called only once with the correct zio
436 zio_t
*zio
= dr
->dr_zio
;
437 int error
= dr
->dr_error
;
439 vdev_disk_dio_free(dr
);
442 zio
->io_error
= error
;
443 ASSERT3S(zio
->io_error
, >=, 0);
445 vdev_disk_error(zio
);
447 zio_delay_interrupt(zio
);
454 BIO_END_IO_PROTO(vdev_disk_physio_completion
, bio
, error
)
456 dio_request_t
*dr
= bio
->bi_private
;
459 if (dr
->dr_error
== 0) {
460 #ifdef HAVE_1ARG_BIO_END_IO_T
461 dr
->dr_error
= BIO_END_IO_ERROR(bio
);
464 dr
->dr_error
= -(error
);
465 else if (!test_bit(BIO_UPTODATE
, &bio
->bi_flags
))
470 /* Drop reference acquired by __vdev_disk_physio */
471 rc
= vdev_disk_dio_put(dr
);
475 bio_map(struct bio
*bio
, void *bio_ptr
, unsigned int bio_size
)
477 unsigned int offset
, size
, i
;
480 offset
= offset_in_page(bio_ptr
);
481 for (i
= 0; i
< bio
->bi_max_vecs
; i
++) {
482 size
= PAGE_SIZE
- offset
;
490 if (is_vmalloc_addr(bio_ptr
))
491 page
= vmalloc_to_page(bio_ptr
);
493 page
= virt_to_page(bio_ptr
);
496 * Some network related block device uses tcp_sendpage, which
497 * doesn't behave well when using 0-count page, this is a
498 * safety net to catch them.
500 ASSERT3S(page_count(page
), >, 0);
502 if (bio_add_page(bio
, page
, size
, offset
) != size
)
514 bio_map_abd_off(struct bio
*bio
, abd_t
*abd
, unsigned int size
, size_t off
)
516 if (abd_is_linear(abd
))
517 return (bio_map(bio
, ((char *)abd_to_buf(abd
)) + off
, size
));
519 return (abd_scatter_bio_map_off(bio
, abd
, size
, off
));
/* Submit a bio using whichever submit_bio() signature the kernel provides. */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}
532 #ifdef HAVE_BIO_SET_DEV
533 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
535 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
536 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
537 * the entire macro. Provide a minimal version which always assigns the
538 * request queue's root_blkg to the bio.
541 vdev_bio_associate_blkg(struct bio
*bio
)
543 struct request_queue
*q
= bio
->bi_disk
->queue
;
545 ASSERT3P(q
, !=, NULL
);
546 ASSERT3P(q
->root_blkg
, !=, NULL
);
547 ASSERT3P(bio
->bi_blkg
, ==, NULL
);
549 if (blkg_tryget(q
->root_blkg
))
550 bio
->bi_blkg
= q
->root_blkg
;
552 #define bio_associate_blkg vdev_bio_associate_blkg
556 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
559 bio_set_dev(struct bio
*bio
, struct block_device
*bdev
)
563 #endif /* HAVE_BIO_SET_DEV */
566 vdev_submit_bio(struct bio
*bio
)
568 #ifdef HAVE_CURRENT_BIO_TAIL
569 struct bio
**bio_tail
= current
->bio_tail
;
570 current
->bio_tail
= NULL
;
571 vdev_submit_bio_impl(bio
);
572 current
->bio_tail
= bio_tail
;
574 struct bio_list
*bio_list
= current
->bio_list
;
575 current
->bio_list
= NULL
;
576 vdev_submit_bio_impl(bio
);
577 current
->bio_list
= bio_list
;
582 __vdev_disk_physio(struct block_device
*bdev
, zio_t
*zio
,
583 size_t io_size
, uint64_t io_offset
, int rw
, int flags
)
588 int bio_size
, bio_count
= 16;
589 int i
= 0, error
= 0;
590 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
591 struct blk_plug plug
;
594 * Accessing outside the block device is never allowed.
596 if (io_offset
+ io_size
> bdev
->bd_inode
->i_size
) {
597 vdev_dbgmsg(zio
->io_vd
,
598 "Illegal access %llu size %llu, device size %llu",
599 io_offset
, io_size
, i_size_read(bdev
->bd_inode
));
600 return (SET_ERROR(EIO
));
604 dr
= vdev_disk_dio_alloc(bio_count
);
606 return (SET_ERROR(ENOMEM
));
608 if (zio
&& !(zio
->io_flags
& (ZIO_FLAG_IO_RETRY
| ZIO_FLAG_TRYHARD
)))
609 bio_set_flags_failfast(bdev
, &flags
);
614 * When the IO size exceeds the maximum bio size for the request
615 * queue we are forced to break the IO in multiple bio's and wait
616 * for them all to complete. Ideally, all pool users will set
617 * their volume block size to match the maximum request size and
618 * the common case will be one bio per vdev IO request.
622 bio_offset
= io_offset
;
624 for (i
= 0; i
<= dr
->dr_bio_count
; i
++) {
626 /* Finished constructing bio's for given buffer */
631 * By default only 'bio_count' bio's per dio are allowed.
632 * However, if we find ourselves in a situation where more
633 * are needed we allocate a larger dio and warn the user.
635 if (dr
->dr_bio_count
== i
) {
636 vdev_disk_dio_free(dr
);
641 /* bio_alloc() with __GFP_WAIT never returns NULL */
642 dr
->dr_bio
[i
] = bio_alloc(GFP_NOIO
,
643 MIN(abd_nr_pages_off(zio
->io_abd
, bio_size
, abd_offset
),
645 if (unlikely(dr
->dr_bio
[i
] == NULL
)) {
646 vdev_disk_dio_free(dr
);
647 return (SET_ERROR(ENOMEM
));
650 /* Matching put called by vdev_disk_physio_completion */
651 vdev_disk_dio_get(dr
);
653 bio_set_dev(dr
->dr_bio
[i
], bdev
);
654 BIO_BI_SECTOR(dr
->dr_bio
[i
]) = bio_offset
>> 9;
655 dr
->dr_bio
[i
]->bi_end_io
= vdev_disk_physio_completion
;
656 dr
->dr_bio
[i
]->bi_private
= dr
;
657 bio_set_op_attrs(dr
->dr_bio
[i
], rw
, flags
);
659 /* Remaining size is returned to become the new size */
660 bio_size
= bio_map_abd_off(dr
->dr_bio
[i
], zio
->io_abd
,
661 bio_size
, abd_offset
);
663 /* Advance in buffer and construct another bio if needed */
664 abd_offset
+= BIO_BI_SIZE(dr
->dr_bio
[i
]);
665 bio_offset
+= BIO_BI_SIZE(dr
->dr_bio
[i
]);
668 /* Extra reference to protect dio_request during vdev_submit_bio */
669 vdev_disk_dio_get(dr
);
671 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
672 if (dr
->dr_bio_count
> 1)
673 blk_start_plug(&plug
);
676 /* Submit all bio's associated with this dio */
677 for (i
= 0; i
< dr
->dr_bio_count
; i
++)
679 vdev_submit_bio(dr
->dr_bio
[i
]);
681 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
682 if (dr
->dr_bio_count
> 1)
683 blk_finish_plug(&plug
);
686 (void) vdev_disk_dio_put(dr
);
691 BIO_END_IO_PROTO(vdev_disk_io_flush_completion
, bio
, error
)
693 zio_t
*zio
= bio
->bi_private
;
694 #ifdef HAVE_1ARG_BIO_END_IO_T
695 zio
->io_error
= BIO_END_IO_ERROR(bio
);
697 zio
->io_error
= -error
;
700 if (zio
->io_error
&& (zio
->io_error
== EOPNOTSUPP
))
701 zio
->io_vd
->vdev_nowritecache
= B_TRUE
;
704 ASSERT3S(zio
->io_error
, >=, 0);
706 vdev_disk_error(zio
);
711 vdev_disk_io_flush(struct block_device
*bdev
, zio_t
*zio
)
713 struct request_queue
*q
;
716 q
= bdev_get_queue(bdev
);
718 return (SET_ERROR(ENXIO
));
720 bio
= bio_alloc(GFP_NOIO
, 0);
721 /* bio_alloc() with __GFP_WAIT never returns NULL */
722 if (unlikely(bio
== NULL
))
723 return (SET_ERROR(ENOMEM
));
725 bio
->bi_end_io
= vdev_disk_io_flush_completion
;
726 bio
->bi_private
= zio
;
727 bio_set_dev(bio
, bdev
);
729 vdev_submit_bio(bio
);
730 invalidate_bdev(bdev
);
736 vdev_disk_io_start(zio_t
*zio
)
738 vdev_t
*v
= zio
->io_vd
;
739 vdev_disk_t
*vd
= v
->vdev_tsd
;
740 unsigned long trim_flags
= 0;
741 int rw
, flags
, error
;
744 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
745 * Nothing to be done here but return failure.
748 zio
->io_error
= ENXIO
;
753 rw_enter(&vd
->vd_lock
, RW_READER
);
756 * If the vdev is closed, it's likely due to a failed reopen and is
757 * in the UNAVAIL state. Nothing to be done here but return failure.
759 if (vd
->vd_bdev
== NULL
) {
760 rw_exit(&vd
->vd_lock
);
761 zio
->io_error
= ENXIO
;
766 switch (zio
->io_type
) {
769 if (!vdev_readable(v
)) {
770 rw_exit(&vd
->vd_lock
);
771 zio
->io_error
= SET_ERROR(ENXIO
);
776 switch (zio
->io_cmd
) {
777 case DKIOCFLUSHWRITECACHE
:
779 if (zfs_nocacheflush
)
782 if (v
->vdev_nowritecache
) {
783 zio
->io_error
= SET_ERROR(ENOTSUP
);
787 error
= vdev_disk_io_flush(vd
->vd_bdev
, zio
);
789 rw_exit(&vd
->vd_lock
);
793 zio
->io_error
= error
;
798 zio
->io_error
= SET_ERROR(ENOTSUP
);
801 rw_exit(&vd
->vd_lock
);
806 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
807 flags
= (1 << BIO_RW_UNPLUG
);
808 #elif defined(REQ_UNPLUG)
817 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
818 flags
= (1 << BIO_RW_UNPLUG
);
819 #elif defined(REQ_UNPLUG)
827 #if defined(BLKDEV_DISCARD_SECURE)
828 if (zio
->io_trim_flags
& ZIO_TRIM_SECURE
)
829 trim_flags
|= BLKDEV_DISCARD_SECURE
;
831 zio
->io_error
= -blkdev_issue_discard(vd
->vd_bdev
,
832 zio
->io_offset
>> 9, zio
->io_size
>> 9, GFP_NOFS
,
835 rw_exit(&vd
->vd_lock
);
840 rw_exit(&vd
->vd_lock
);
841 zio
->io_error
= SET_ERROR(ENOTSUP
);
846 zio
->io_target_timestamp
= zio_handle_io_delay(zio
);
847 error
= __vdev_disk_physio(vd
->vd_bdev
, zio
,
848 zio
->io_size
, zio
->io_offset
, rw
, flags
);
849 rw_exit(&vd
->vd_lock
);
852 zio
->io_error
= error
;
859 vdev_disk_io_done(zio_t
*zio
)
862 * If the device returned EIO, we revalidate the media. If it is
863 * determined the media has changed this triggers the asynchronous
864 * removal of the device from the configuration.
866 if (zio
->io_error
== EIO
) {
867 vdev_t
*v
= zio
->io_vd
;
868 vdev_disk_t
*vd
= v
->vdev_tsd
;
870 if (check_disk_change(vd
->vd_bdev
)) {
871 vdev_bdev_invalidate(vd
->vd_bdev
);
872 v
->vdev_remove_wanted
= B_TRUE
;
873 spa_async_request(zio
->io_spa
, SPA_ASYNC_REMOVE
);
879 vdev_disk_hold(vdev_t
*vd
)
881 ASSERT(spa_config_held(vd
->vdev_spa
, SCL_STATE
, RW_WRITER
));
883 /* We must have a pathname, and it must be absolute. */
884 if (vd
->vdev_path
== NULL
|| vd
->vdev_path
[0] != '/')
888 * Only prefetch path and devid info if the device has
891 if (vd
->vdev_tsd
!= NULL
)
894 /* XXX: Implement me as a vnode lookup for the device */
895 vd
->vdev_name_vp
= NULL
;
896 vd
->vdev_devid_vp
= NULL
;
900 vdev_disk_rele(vdev_t
*vd
)
902 ASSERT(spa_config_held(vd
->vdev_spa
, SCL_STATE
, RW_WRITER
));
904 /* XXX: Implement me as a vnode rele for the device */
908 param_set_vdev_scheduler(const char *val
, zfs_kernel_param_t
*kp
)
914 return (SET_ERROR(-EINVAL
));
916 if ((p
= strchr(val
, '\n')) != NULL
)
919 if (spa_mode_global
!= 0) {
920 mutex_enter(&spa_namespace_lock
);
921 while ((spa
= spa_next(spa
)) != NULL
) {
922 if (spa_state(spa
) != POOL_STATE_ACTIVE
||
923 !spa_writeable(spa
) || spa_suspended(spa
))
926 spa_open_ref(spa
, FTAG
);
927 mutex_exit(&spa_namespace_lock
);
928 vdev_elevator_switch(spa
->spa_root_vdev
, (char *)val
);
929 mutex_enter(&spa_namespace_lock
);
930 spa_close(spa
, FTAG
);
932 mutex_exit(&spa_namespace_lock
);
935 return (param_set_charp(val
, kp
));
938 vdev_ops_t vdev_disk_ops
= {
950 VDEV_TYPE_DISK
, /* name of this vdev type */
951 B_TRUE
/* leaf vdev */
/*
 * Expose zfs_vdev_scheduler as a writable (0644) module parameter; writes
 * go through param_set_vdev_scheduler() so active pools are updated too.
 */
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");