mirror_zfs.git: module/zfs/vdev_disk.c (commit c53a0aa0fcbe8b95927a36337200a0c0db3631fe)
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
25 * LLNL-CODE-403049.
26 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/abd.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/zio.h>
36 #include <linux/mod_compat.h>
37 #include <linux/msdos_fs.h>
38 #include <linux/vfs_compat.h>
39
40 char *zfs_vdev_scheduler = VDEV_SCHEDULER;
41 static void *zfs_vdev_holder = VDEV_HOLDER;
42
43 /* size of the "reserved" partition, in blocks */
44 #define EFI_MIN_RESV_SIZE (16 * 1024)
45
46 /*
47 * Virtual device vector for disks.
48 */
49 typedef struct dio_request {
50 zio_t *dr_zio; /* Parent ZIO */
51 atomic_t dr_ref; /* References */
52 int dr_error; /* Bio error */
53 int dr_bio_count; /* Count of bios */
54 struct bio *dr_bio[0]; /* Attached bios */
55 } dio_request_t;
56
57
58 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
59 static fmode_t
60 vdev_bdev_mode(int smode)
61 {
62 fmode_t mode = 0;
63
64 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
65
66 if (smode & FREAD)
67 mode |= FMODE_READ;
68
69 if (smode & FWRITE)
70 mode |= FMODE_WRITE;
71
72 return (mode);
73 }
74 #else
75 static int
76 vdev_bdev_mode(int smode)
77 {
78 int mode = 0;
79
80 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
81
82 if ((smode & FREAD) && !(smode & FWRITE))
83 mode = SB_RDONLY;
84
85 return (mode);
86 }
87 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
88
89 /*
90 * Returns the usable capacity (in bytes) for the partition or disk.
91 */
92 static uint64_t
93 bdev_capacity(struct block_device *bdev)
94 {
95 return (i_size_read(bdev->bd_inode));
96 }
97
98 /*
99 * Returns the maximum expansion capacity of the block device (in bytes).
100 *
101 * It is possible to expand a vdev when it has been created as a wholedisk
102 * and the containing block device has increased in capacity, or when the
103 * partition containing the pool has been manually increased in size.
104 *
105 * This function is only responsible for calculating the potential expansion
106 * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
107 * responsible for verifying the expected partition layout in the wholedisk
108 * case, and for updating the partition table if appropriate. Once the
109 * partition size has been increased the additional capacity will be visible
110 * using bdev_capacity().
111 */
112 static uint64_t
113 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
114 {
115 uint64_t psize;
116 int64_t available;
117
118 if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
119 /*
120 * When reporting maximum expansion capacity for a wholedisk,
121 * deduct any capacity which is expected to be lost due to
122 * alignment restrictions. Over-reporting this value isn't
123 * harmful and would only result in slightly less capacity
124 * than expected post-expansion.
125 */
126 available = i_size_read(bdev->bd_contains->bd_inode) -
127 ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
128 PARTITION_END_ALIGNMENT) << SECTOR_BITS);
129 if (available > 0)
130 psize = available;
131 else
132 psize = bdev_capacity(bdev);
133 } else {
134 psize = bdev_capacity(bdev);
135 }
136
137 return (psize);
138 }
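
/*
 * Illustrative sketch (not part of the original file): the wholedisk
 * deduction above, isolated.  The slack subtracted from the containing
 * device is (EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT)
 * 512-byte sectors; with EFI_MIN_RESV_SIZE alone being 16384 blocks, the
 * deduction is at least 16384 << 9 == 8 MiB.  NEW_START_BLOCK and
 * PARTITION_END_ALIGNMENT come from the EFI partition headers.
 */
static uint64_t
bdev_wholedisk_slack_sketch(void)
{
        /* Reserved sectors converted to bytes (SECTOR_BITS == 9). */
        return ((uint64_t)(EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
            PARTITION_END_ALIGNMENT) << SECTOR_BITS);
}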
139
140 static void
141 vdev_disk_error(zio_t *zio)
142 {
143 /*
144 * This function can be called in interrupt context, for instance while
145 * handling IRQs coming from a misbehaving disk device; use printk()
146 * which is safe from any context.
147 */
148 printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
149 "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
150 zio->io_vd->vdev_path, zio->io_error, zio->io_type,
151 (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
152 zio->io_flags);
153 }
154
155 /*
156 * Use the Linux 'noop' elevator for zfs managed block devices. This
157 * strikes the ideal balance by allowing the zfs elevator to do all
158 * request ordering and prioritization, while allowing the Linux
159 * elevator to do the maximum front/back merging allowed by the
160 * physical device. This yields the largest possible requests for
161 * the device with the lowest total overhead.
162 */
163 static void
164 vdev_elevator_switch(vdev_t *v, char *elevator)
165 {
166 vdev_disk_t *vd = v->vdev_tsd;
167 struct request_queue *q;
168 char *device;
169 int error;
170
171 for (int c = 0; c < v->vdev_children; c++)
172 vdev_elevator_switch(v->vdev_child[c], elevator);
173
174 if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
175 return;
176
177 q = bdev_get_queue(vd->vd_bdev);
178 device = vd->vd_bdev->bd_disk->disk_name;
179
180 /*
181 * Skip devices which are not whole disks (partitions).
182 * Device-mapper devices are excepted since they may be whole
183 * disks despite the vdev_wholedisk flag, in which case we can
184 * and should switch the elevator. If the device-mapper device
185 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
186 * "Skip devices without schedulers" check below will fail.
187 */
188 if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
189 return;
190
191 /* Leave existing scheduler when set to "none" */
192 if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
193 return;
194
195 /*
196 * The elevator_change() function was available in kernels from
197 * 2.6.36 to 4.11. When not available, fall back to using the user
198 * mode helper functionality to set the elevator via sysfs. This
199 * requires /bin/echo and sysfs to be mounted, which may not be true
200 * early in the boot process.
201 */
202 #ifdef HAVE_ELEVATOR_CHANGE
203 error = elevator_change(q, elevator);
204 #else
205 #define SET_SCHEDULER_CMD \
206 "exec 0</dev/null " \
207 " 1>/sys/block/%s/queue/scheduler " \
208 " 2>/dev/null; " \
209 "echo %s"
210
211 char *argv[] = { "/bin/sh", "-c", NULL, NULL };
212 char *envp[] = { NULL };
213
214 argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
215 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
216 strfree(argv[2]);
217 #endif /* HAVE_ELEVATOR_CHANGE */
218 if (error) {
219 zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
220 elevator, v->vdev_path, device, error);
221 }
222 }
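
/*
 * Illustrative sketch (not part of the original file): what the user mode
 * helper fallback above effectively runs for a hypothetical whole-disk
 * device "sdb" and the default "noop" scheduler.  The same change can be
 * made by hand with:
 *
 *	echo noop > /sys/block/sdb/queue/scheduler
 */
static int
vdev_elevator_switch_sketch(void)
{
        char *argv[] = { "/bin/sh", "-c",
            "exec 0</dev/null 1>/sys/block/sdb/queue/scheduler 2>/dev/null; "
            "echo noop", NULL };
        char *envp[] = { NULL };

        return (call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC));
}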
223
224 static int
225 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
226 uint64_t *ashift)
227 {
228 struct block_device *bdev;
229 fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
230 int count = 0, block_size;
231 int bdev_retry_count = 50;
232 vdev_disk_t *vd;
233
234 /* Must have a pathname and it must be absolute. */
235 if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
236 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
237 vdev_dbgmsg(v, "invalid vdev_path");
238 return (SET_ERROR(EINVAL));
239 }
240
241 /*
242 * Reopen the device if it is currently open. When expanding a
243 * partition, force a re-scan of the partition table while the device
244 * is closed in order to get an accurate updated block device size.
245 * Then, since udev may need to recreate the device links, increase
246 * the open retry count before reporting the device as unavailable.
247 */
248 vd = v->vdev_tsd;
249 if (vd) {
250 char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
251 boolean_t reread_part = B_FALSE;
252
253 rw_enter(&vd->vd_lock, RW_WRITER);
254 bdev = vd->vd_bdev;
255 vd->vd_bdev = NULL;
256
257 if (bdev) {
258 if (v->vdev_expanding && bdev != bdev->bd_contains) {
259 bdevname(bdev->bd_contains, disk_name + 5);
260 reread_part = B_TRUE;
261 }
262
263 vdev_bdev_close(bdev, mode);
264 }
265
266 if (reread_part) {
267 bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
268 if (!IS_ERR(bdev)) {
269 int error = vdev_bdev_reread_part(bdev);
270 vdev_bdev_close(bdev, mode);
271 if (error == 0)
272 bdev_retry_count = 100;
273 }
274 }
275 } else {
276 vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
277
278 rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
279 rw_enter(&vd->vd_lock, RW_WRITER);
280 }
281
282 /*
283 * Devices are always opened by the path provided at configuration
284 * time. This means that if the provided path is a udev by-id path
285 * then drives may be re-cabled without an issue. If the provided
286 * path is a udev by-path path, then the physical location information
287 * will be preserved. This can be critical for more complicated
288 * configurations where drives are located in specific physical
289 * locations to maximize the system's tolerance to component failure.
290 *
291 * Alternatively, you can provide your own udev rule to flexibly map
292 * the drives as you see fit. It is not advised that you use the
293 * /dev/[hd]d devices which may be reordered due to probing order.
294 * Devices in the wrong locations will be detected by the higher
295 * level vdev validation.
296 *
297 * The specified paths may be briefly removed and recreated in
298 * response to udev events. This should be exceptionally unlikely
299 * because the zpool command makes every effort to verify these paths
300 * have already settled prior to reaching this point. Therefore,
301 * an ENOENT failure at this point is highly likely to be transient
302 * and it is reasonable to sleep and retry before giving up. In
303 * practice delays have been observed to be on the order of 100ms.
304 */
305 bdev = ERR_PTR(-ENXIO);
306 while (IS_ERR(bdev) && count < bdev_retry_count) {
307 bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
308 if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
309 schedule_timeout(MSEC_TO_TICK(10));
310 count++;
311 } else if (IS_ERR(bdev)) {
312 break;
313 }
314 }
315
316 if (IS_ERR(bdev)) {
317 int error = -PTR_ERR(bdev);
318 vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
319 vd->vd_bdev = NULL;
320 v->vdev_tsd = vd;
321 rw_exit(&vd->vd_lock);
322 return (SET_ERROR(error));
323 } else {
324 vd->vd_bdev = bdev;
325 v->vdev_tsd = vd;
326 rw_exit(&vd->vd_lock);
327 }
328
329 /* Determine the physical block size */
330 block_size = vdev_bdev_block_size(vd->vd_bdev);
331
332 /* Clear the nowritecache bit; this causes vdev_reopen() to try again. */
333 v->vdev_nowritecache = B_FALSE;
334
335 /* Inform the ZIO pipeline that we are non-rotational */
336 v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
337
338 /* Physical volume size in bytes for the partition */
339 *psize = bdev_capacity(vd->vd_bdev);
340
341 /* Physical volume size in bytes including possible expansion space */
342 *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
343
344 /* Based on the minimum sector size set the ashift (log2 of the block size) */
345 *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
346
347 /* Try to set the io scheduler elevator algorithm */
348 (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
349
350 return (0);
351 }
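
/*
 * Illustrative sketch (not part of the original file): how the *ashift
 * value computed in vdev_disk_open() follows from the reported block
 * size.  highbit64() returns the 1-based index of the highest set bit,
 * so a 512-byte sector gives highbit64(512) - 1 == 9 and a 4096-byte
 * sector gives highbit64(4096) - 1 == 12.
 */
static uint64_t
vdev_disk_ashift_sketch(int block_size)
{
        /* Mirrors the *ashift assignment in vdev_disk_open(). */
        return (highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1);
}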
352
353 static void
354 vdev_disk_close(vdev_t *v)
355 {
356 vdev_disk_t *vd = v->vdev_tsd;
357
358 if (v->vdev_reopening || vd == NULL)
359 return;
360
361 if (vd->vd_bdev != NULL) {
362 vdev_bdev_close(vd->vd_bdev,
363 vdev_bdev_mode(spa_mode(v->vdev_spa)));
364 }
365
366 rw_destroy(&vd->vd_lock);
367 kmem_free(vd, sizeof (vdev_disk_t));
368 v->vdev_tsd = NULL;
369 }
370
371 static dio_request_t *
372 vdev_disk_dio_alloc(int bio_count)
373 {
374 dio_request_t *dr;
375 int i;
376
377 dr = kmem_zalloc(sizeof (dio_request_t) +
378 sizeof (struct bio *) * bio_count, KM_SLEEP);
379 if (dr) {
380 atomic_set(&dr->dr_ref, 0);
381 dr->dr_bio_count = bio_count;
382 dr->dr_error = 0;
383
384 for (i = 0; i < dr->dr_bio_count; i++)
385 dr->dr_bio[i] = NULL;
386 }
387
388 return (dr);
389 }
390
391 static void
392 vdev_disk_dio_free(dio_request_t *dr)
393 {
394 int i;
395
396 for (i = 0; i < dr->dr_bio_count; i++)
397 if (dr->dr_bio[i])
398 bio_put(dr->dr_bio[i]);
399
400 kmem_free(dr, sizeof (dio_request_t) +
401 sizeof (struct bio *) * dr->dr_bio_count);
402 }
403
404 static void
405 vdev_disk_dio_get(dio_request_t *dr)
406 {
407 atomic_inc(&dr->dr_ref);
408 }
409
410 static int
411 vdev_disk_dio_put(dio_request_t *dr)
412 {
413 int rc = atomic_dec_return(&dr->dr_ref);
414
415 /*
416 * Free the dio_request when the last reference is dropped and
417 * ensure zio_interpret is called only once with the correct zio
418 */
419 if (rc == 0) {
420 zio_t *zio = dr->dr_zio;
421 int error = dr->dr_error;
422
423 vdev_disk_dio_free(dr);
424
425 if (zio) {
426 zio->io_error = error;
427 ASSERT3S(zio->io_error, >=, 0);
428 if (zio->io_error)
429 vdev_disk_error(zio);
430
431 zio_delay_interrupt(zio);
432 }
433 }
434
435 return (rc);
436 }
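
/*
 * Illustrative sketch (not part of the original file): the reference
 * counting protocol used by the dio helpers above.  Each attached bio
 * holds one reference and the submitter holds one extra reference, so
 * the parent zio is completed exactly once, from whichever context
 * happens to drop the final reference.
 */
static void
vdev_disk_dio_refcount_sketch(dio_request_t *dr)
{
        vdev_disk_dio_get(dr);          /* reference held by one bio */
        vdev_disk_dio_get(dr);          /* submitter's extra reference */

        (void) vdev_disk_dio_put(dr);   /* bio completion drops its ref */
        (void) vdev_disk_dio_put(dr);   /* final put frees dr, ends the zio */
}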
437
438 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
439 {
440 dio_request_t *dr = bio->bi_private;
441 int rc;
442
443 if (dr->dr_error == 0) {
444 #ifdef HAVE_1ARG_BIO_END_IO_T
445 dr->dr_error = BIO_END_IO_ERROR(bio);
446 #else
447 if (error)
448 dr->dr_error = -(error);
449 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
450 dr->dr_error = EIO;
451 #endif
452 }
453
454 /* Drop reference acquired by __vdev_disk_physio */
455 rc = vdev_disk_dio_put(dr);
456 }
457
458 static unsigned int
459 bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
460 {
461 unsigned int offset, size, i;
462 struct page *page;
463
464 offset = offset_in_page(bio_ptr);
465 for (i = 0; i < bio->bi_max_vecs; i++) {
466 size = PAGE_SIZE - offset;
467
468 if (bio_size <= 0)
469 break;
470
471 if (size > bio_size)
472 size = bio_size;
473
474 if (is_vmalloc_addr(bio_ptr))
475 page = vmalloc_to_page(bio_ptr);
476 else
477 page = virt_to_page(bio_ptr);
478
479 /*
480 * Some network related block devices use tcp_sendpage(), which
481 * doesn't behave well when given a 0-count page; this is a
482 * safety net to catch them.
483 */
484 ASSERT3S(page_count(page), >, 0);
485
486 if (bio_add_page(bio, page, size, offset) != size)
487 break;
488
489 bio_ptr += size;
490 bio_size -= size;
491 offset = 0;
492 }
493
494 return (bio_size);
495 }
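
/*
 * Illustrative sketch (not part of the original file): the per-page
 * splitting performed by bio_map() above.  For a hypothetical 10 KiB
 * buffer that begins 1 KiB into a 4 KiB page, the bio receives segments
 * of 3 KiB, 4 KiB and 3 KiB; only the first segment carries a non-zero
 * in-page offset.  The first segment length is simply:
 */
static unsigned int
bio_map_first_segment_sketch(void *bio_ptr, unsigned int bio_size)
{
        unsigned int offset = offset_in_page(bio_ptr);
        unsigned int size = PAGE_SIZE - offset;

        return (size > bio_size ? bio_size : size);
}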
496
497 static unsigned int
498 bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
499 {
500 if (abd_is_linear(abd))
501 return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
502
503 return (abd_scatter_bio_map_off(bio, abd, size, off));
504 }
505
506 static inline void
507 vdev_submit_bio_impl(struct bio *bio)
508 {
509 #ifdef HAVE_1ARG_SUBMIT_BIO
510 submit_bio(bio);
511 #else
512 submit_bio(0, bio);
513 #endif
514 }
515
516 #ifndef HAVE_BIO_SET_DEV
517 static inline void
518 bio_set_dev(struct bio *bio, struct block_device *bdev)
519 {
520 bio->bi_bdev = bdev;
521 }
522 #endif /* !HAVE_BIO_SET_DEV */
523
524 static inline void
525 vdev_submit_bio(struct bio *bio)
526 {
527 #ifdef HAVE_CURRENT_BIO_TAIL
528 struct bio **bio_tail = current->bio_tail;
529 current->bio_tail = NULL;
530 vdev_submit_bio_impl(bio);
531 current->bio_tail = bio_tail;
532 #else
533 struct bio_list *bio_list = current->bio_list;
534 current->bio_list = NULL;
535 vdev_submit_bio_impl(bio);
536 current->bio_list = bio_list;
537 #endif
538 }
539
540 static int
541 __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
542 size_t io_size, uint64_t io_offset, int rw, int flags)
543 {
544 dio_request_t *dr;
545 uint64_t abd_offset;
546 uint64_t bio_offset;
547 int bio_size, bio_count = 16;
548 int i = 0, error = 0;
549 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
550 struct blk_plug plug;
551 #endif
552 /*
553 * Accessing outside the block device is never allowed.
554 */
555 if (io_offset + io_size > bdev->bd_inode->i_size) {
556 vdev_dbgmsg(zio->io_vd,
557 "Illegal access %llu size %llu, device size %llu",
558 io_offset, io_size, i_size_read(bdev->bd_inode));
559 return (SET_ERROR(EIO));
560 }
561
562 retry:
563 dr = vdev_disk_dio_alloc(bio_count);
564 if (dr == NULL)
565 return (SET_ERROR(ENOMEM));
566
567 if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
568 bio_set_flags_failfast(bdev, &flags);
569
570 dr->dr_zio = zio;
571
572 /*
573 * When the IO size exceeds the maximum bio size for the request
574 * queue we are forced to split the IO into multiple bios and wait
575 * for them all to complete. Ideally, all pool users will set
576 * their volume block size to match the maximum request size and
577 * the common case will be one bio per vdev IO request.
578 */
579
580 abd_offset = 0;
581 bio_offset = io_offset;
582 bio_size = io_size;
583 for (i = 0; i <= dr->dr_bio_count; i++) {
584
585 /* Finished constructing bios for the given buffer */
586 if (bio_size <= 0)
587 break;
588
589 /*
590 * By default only 'bio_count' bios per dio are allowed.
591 * However, if we find ourselves in a situation where more
592 * are needed we allocate a larger dio and retry.
593 */
594 if (dr->dr_bio_count == i) {
595 vdev_disk_dio_free(dr);
596 bio_count *= 2;
597 goto retry;
598 }
599
600 /* bio_alloc() with __GFP_WAIT never returns NULL */
601 dr->dr_bio[i] = bio_alloc(GFP_NOIO,
602 MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
603 BIO_MAX_PAGES));
604 if (unlikely(dr->dr_bio[i] == NULL)) {
605 vdev_disk_dio_free(dr);
606 return (SET_ERROR(ENOMEM));
607 }
608
609 /* Matching put called by vdev_disk_physio_completion */
610 vdev_disk_dio_get(dr);
611
612 bio_set_dev(dr->dr_bio[i], bdev);
613 BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
614 dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
615 dr->dr_bio[i]->bi_private = dr;
616 bio_set_op_attrs(dr->dr_bio[i], rw, flags);
617
618 /* Remaining size is returned to become the new size */
619 bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
620 bio_size, abd_offset);
621
622 /* Advance in buffer and construct another bio if needed */
623 abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
624 bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
625 }
626
627 /* Extra reference to protect dio_request during vdev_submit_bio */
628 vdev_disk_dio_get(dr);
629
630 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
631 if (dr->dr_bio_count > 1)
632 blk_start_plug(&plug);
633 #endif
634
635 /* Submit all bios associated with this dio */
636 for (i = 0; i < dr->dr_bio_count; i++)
637 if (dr->dr_bio[i])
638 vdev_submit_bio(dr->dr_bio[i]);
639
640 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
641 if (dr->dr_bio_count > 1)
642 blk_finish_plug(&plug);
643 #endif
644
645 (void) vdev_disk_dio_put(dr);
646
647 return (error);
648 }
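
/*
 * Illustrative sketch (not part of the original file): the minimum bio
 * count for a given IO size, which is what drives the bio_count doubling
 * retry above.  With 4 KiB pages each bio can map up to BIO_MAX_PAGES
 * (256) pages, i.e. 1 MiB, so the initial allocation of 16 bios is
 * typically enough for IOs up to 16 MiB; anything needing more triggers
 * a retry with a larger dio.
 */
static int
vdev_disk_bio_count_sketch(size_t io_size)
{
        return (DIV_ROUND_UP(io_size, (size_t)BIO_MAX_PAGES * PAGE_SIZE));
}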
649
650 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
651 {
652 zio_t *zio = bio->bi_private;
653 #ifdef HAVE_1ARG_BIO_END_IO_T
654 zio->io_error = BIO_END_IO_ERROR(bio);
655 #else
656 zio->io_error = -error;
657 #endif
658
659 if (zio->io_error && (zio->io_error == EOPNOTSUPP))
660 zio->io_vd->vdev_nowritecache = B_TRUE;
661
662 bio_put(bio);
663 ASSERT3S(zio->io_error, >=, 0);
664 if (zio->io_error)
665 vdev_disk_error(zio);
666 zio_interrupt(zio);
667 }
668
669 static int
670 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
671 {
672 struct request_queue *q;
673 struct bio *bio;
674
675 q = bdev_get_queue(bdev);
676 if (!q)
677 return (SET_ERROR(ENXIO));
678
679 bio = bio_alloc(GFP_NOIO, 0);
680 /* bio_alloc() with __GFP_WAIT never returns NULL */
681 if (unlikely(bio == NULL))
682 return (SET_ERROR(ENOMEM));
683
684 bio->bi_end_io = vdev_disk_io_flush_completion;
685 bio->bi_private = zio;
686 bio_set_dev(bio, bdev);
687 bio_set_flush(bio);
688 vdev_submit_bio(bio);
689 invalidate_bdev(bdev);
690
691 return (0);
692 }
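
/*
 * Illustrative sketch (not part of the original file): a simplified view
 * of how vdev_disk_io_start() calls vdev_disk_io_flush() for
 * DKIOCFLUSHWRITECACHE (it omits the vd_lock handling done by the real
 * caller).  A zero return means the flush bio is in flight and the zio
 * will be completed from its end_io callback, not that the flush has
 * already finished.
 */
static void
vdev_disk_flush_call_sketch(vdev_disk_t *vd, zio_t *zio)
{
        int error = vdev_disk_io_flush(vd->vd_bdev, zio);

        if (error == 0)
                return;                 /* completion arrives via end_io */

        zio->io_error = error;          /* fall back to the zio pipeline */
        zio_execute(zio);
}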
693
694 static void
695 vdev_disk_io_start(zio_t *zio)
696 {
697 vdev_t *v = zio->io_vd;
698 vdev_disk_t *vd = v->vdev_tsd;
699 int rw, flags, error;
700
701 /*
702 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
703 * Nothing to be done here but return failure.
704 */
705 if (vd == NULL) {
706 zio->io_error = ENXIO;
707 zio_interrupt(zio);
708 return;
709 }
710
711 rw_enter(&vd->vd_lock, RW_READER);
712
713 /*
714 * If the vdev is closed, it's likely due to a failed reopen and is
715 * in the UNAVAIL state. Nothing to be done here but return failure.
716 */
717 if (vd->vd_bdev == NULL) {
718 rw_exit(&vd->vd_lock);
719 zio->io_error = ENXIO;
720 zio_interrupt(zio);
721 return;
722 }
723
724 switch (zio->io_type) {
725 case ZIO_TYPE_IOCTL:
726
727 if (!vdev_readable(v)) {
728 rw_exit(&vd->vd_lock);
729 zio->io_error = SET_ERROR(ENXIO);
730 zio_interrupt(zio);
731 return;
732 }
733
734 switch (zio->io_cmd) {
735 case DKIOCFLUSHWRITECACHE:
736
737 if (zfs_nocacheflush)
738 break;
739
740 if (v->vdev_nowritecache) {
741 zio->io_error = SET_ERROR(ENOTSUP);
742 break;
743 }
744
745 error = vdev_disk_io_flush(vd->vd_bdev, zio);
746 if (error == 0) {
747 rw_exit(&vd->vd_lock);
748 return;
749 }
750
751 zio->io_error = error;
752
753 break;
754
755 default:
756 zio->io_error = SET_ERROR(ENOTSUP);
757 }
758
759 rw_exit(&vd->vd_lock);
760 zio_execute(zio);
761 return;
762 case ZIO_TYPE_WRITE:
763 rw = WRITE;
764 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
765 flags = (1 << BIO_RW_UNPLUG);
766 #elif defined(REQ_UNPLUG)
767 flags = REQ_UNPLUG;
768 #else
769 flags = 0;
770 #endif
771 break;
772
773 case ZIO_TYPE_READ:
774 rw = READ;
775 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
776 flags = (1 << BIO_RW_UNPLUG);
777 #elif defined(REQ_UNPLUG)
778 flags = REQ_UNPLUG;
779 #else
780 flags = 0;
781 #endif
782 break;
783
784 default:
785 rw_exit(&vd->vd_lock);
786 zio->io_error = SET_ERROR(ENOTSUP);
787 zio_interrupt(zio);
788 return;
789 }
790
791 zio->io_target_timestamp = zio_handle_io_delay(zio);
792 error = __vdev_disk_physio(vd->vd_bdev, zio,
793 zio->io_size, zio->io_offset, rw, flags);
794 rw_exit(&vd->vd_lock);
795
796 if (error) {
797 zio->io_error = error;
798 zio_interrupt(zio);
799 return;
800 }
801 }
802
803 static void
804 vdev_disk_io_done(zio_t *zio)
805 {
806 /*
807 * If the device returned EIO, we revalidate the media. If it is
808 * determined that the media has changed, this triggers the asynchronous
809 * removal of the device from the configuration.
810 */
811 if (zio->io_error == EIO) {
812 vdev_t *v = zio->io_vd;
813 vdev_disk_t *vd = v->vdev_tsd;
814
815 if (check_disk_change(vd->vd_bdev)) {
816 vdev_bdev_invalidate(vd->vd_bdev);
817 v->vdev_remove_wanted = B_TRUE;
818 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
819 }
820 }
821 }
822
823 static void
824 vdev_disk_hold(vdev_t *vd)
825 {
826 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
827
828 /* We must have a pathname, and it must be absolute. */
829 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
830 return;
831
832 /*
833 * Only prefetch path and devid info if the device has
834 * never been opened.
835 */
836 if (vd->vdev_tsd != NULL)
837 return;
838
839 /* XXX: Implement me as a vnode lookup for the device */
840 vd->vdev_name_vp = NULL;
841 vd->vdev_devid_vp = NULL;
842 }
843
844 static void
845 vdev_disk_rele(vdev_t *vd)
846 {
847 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
848
849 /* XXX: Implement me as a vnode rele for the device */
850 }
851
852 static int
853 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
854 {
855 spa_t *spa = NULL;
856 char *p;
857
858 if (val == NULL)
859 return (SET_ERROR(-EINVAL));
860
861 if ((p = strchr(val, '\n')) != NULL)
862 *p = '\0';
863
864 if (spa_mode_global != 0) {
865 mutex_enter(&spa_namespace_lock);
866 while ((spa = spa_next(spa)) != NULL) {
867 if (spa_state(spa) != POOL_STATE_ACTIVE ||
868 !spa_writeable(spa) || spa_suspended(spa))
869 continue;
870
871 spa_open_ref(spa, FTAG);
872 mutex_exit(&spa_namespace_lock);
873 vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
874 mutex_enter(&spa_namespace_lock);
875 spa_close(spa, FTAG);
876 }
877 mutex_exit(&spa_namespace_lock);
878 }
879
880 return (param_set_charp(val, kp));
881 }
882
883 vdev_ops_t vdev_disk_ops = {
884 vdev_disk_open,
885 vdev_disk_close,
886 vdev_default_asize,
887 vdev_disk_io_start,
888 vdev_disk_io_done,
889 NULL,
890 NULL,
891 vdev_disk_hold,
892 vdev_disk_rele,
893 NULL,
894 vdev_default_xlate,
895 VDEV_TYPE_DISK, /* name of this vdev type */
896 B_TRUE /* leaf vdev */
897 };
898
899 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
900 param_get_charp, &zfs_vdev_scheduler, 0644);
901 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
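
/*
 * Illustrative usage notes (not part of the original file): the module
 * parameter registered above can be given at load time or changed at
 * runtime, in which case param_set_vdev_scheduler() re-applies it to the
 * leaf vdevs of every active, writeable pool.  For example ("deadline"
 * here is just a hypothetical scheduler choice):
 *
 *	modprobe zfs zfs_vdev_scheduler=noop
 *	echo deadline > /sys/module/zfs/parameters/zfs_vdev_scheduler
 */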