[mirror_zfs.git] / module / zfs / vdev_disk.c (Linux 5.0 compat: Convert MS_* macros to SB_*)
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
25 * LLNL-CODE-403049.
26 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/abd.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/zio.h>
36 #include <sys/sunldi.h>
37 #include <linux/mod_compat.h>
38 #include <linux/vfs_compat.h>
39
40 char *zfs_vdev_scheduler = VDEV_SCHEDULER;
41 static void *zfs_vdev_holder = VDEV_HOLDER;
42
43 /*
44 * Virtual device vector for disks.
45 */
46 typedef struct dio_request {
47 zio_t *dr_zio; /* Parent ZIO */
48 atomic_t dr_ref; /* References */
49 int dr_error; /* Bio error */
50 int dr_bio_count; /* Count of bio's */
51 struct bio *dr_bio[0]; /* Attached bio's */
52 } dio_request_t;
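/*
 * Note: dr_bio[0] uses the older zero-length-array idiom for a flexible
 * array member, so each dio_request_t is sized for its bio pointers at
 * allocation time. A minimal sketch of the sizing, mirroring
 * vdev_disk_dio_alloc() below (a bio_count of 4 is just an illustrative
 * value):
 *
 *	size_t len = sizeof (dio_request_t) + sizeof (struct bio *) * 4;
 *	dio_request_t *dr = kmem_zalloc(len, KM_SLEEP);
 */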
53
54
55 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
56 static fmode_t
57 vdev_bdev_mode(int smode)
58 {
59 fmode_t mode = 0;
60
61 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
62
63 if (smode & FREAD)
64 mode |= FMODE_READ;
65
66 if (smode & FWRITE)
67 mode |= FMODE_WRITE;
68
69 return (mode);
70 }
71 #else
72 static int
73 vdev_bdev_mode(int smode)
74 {
75 int mode = 0;
76
77 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
78
79 if ((smode & FREAD) && !(smode & FWRITE))
80 mode = SB_RDONLY;
81
82 return (mode);
83 }
84 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
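/*
 * A minimal illustration of the mapping above: a read-write pool open
 * passes FREAD | FWRITE, which becomes FMODE_READ | FMODE_WRITE in the
 * HAVE_OPEN_BDEV_EXCLUSIVE variant; the fallback variant returns
 * SB_RDONLY only for a read-only (FREAD-only) open and 0 otherwise.
 */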
85
86 static uint64_t
87 bdev_capacity(struct block_device *bdev)
88 {
89 struct hd_struct *part = bdev->bd_part;
90
91 /* The partition capacity referenced by the block device */
92 if (part)
93 return (part->nr_sects << 9);
94
95 /* Otherwise assume the full device capacity */
96 return (get_capacity(bdev->bd_disk) << 9);
97 }
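/*
 * The << 9 above converts 512-byte sectors to bytes. As a worked
 * example, a partition of 2097152 sectors reports
 * 2097152 << 9 = 1073741824 bytes (1 GiB).
 */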
98
99 static void
100 vdev_disk_error(zio_t *zio)
101 {
102 #ifdef ZFS_DEBUG
103 printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
104 "flags=%x\n", zio->io_error, zio->io_type,
105 (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
106 zio->io_flags);
107 #endif
108 }
109
110 /*
111 * Use the Linux 'noop' elevator for zfs managed block devices. This
112 * strikes the ideal balance by allowing the zfs elevator to do all
113 * request ordering and prioritization, while allowing the Linux
114 * elevator to do the maximum front/back merging allowed by the
115 * physical device. This yields the largest possible requests for
116 * the device with the lowest total overhead.
117 */
118 static void
119 vdev_elevator_switch(vdev_t *v, char *elevator)
120 {
121 vdev_disk_t *vd = v->vdev_tsd;
122 struct request_queue *q;
123 char *device;
124 int error;
125
126 for (int c = 0; c < v->vdev_children; c++)
127 vdev_elevator_switch(v->vdev_child[c], elevator);
128
129 if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
130 return;
131
132 q = bdev_get_queue(vd->vd_bdev);
133 device = vd->vd_bdev->bd_disk->disk_name;
134
135 /*
136 * Skip devices which are not whole disks (partitions).
137 * Device-mapper devices are excepted since they may be whole
138 * disks despite the vdev_wholedisk flag, in which case we can
139 * and should switch the elevator. If the device-mapper device
140 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
141 * "Skip devices without schedulers" check below will fail.
142 */
143 if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
144 return;
145
146 /* Leave existing scheduler when set to "none" */
147 if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
148 return;
149
150 /*
151 * The elevator_change() function was available in kernels from
152 * 2.6.36 to 4.11. When not available fall back to using the user
153 * mode helper functionality to set the elevator via sysfs. This
154 * requires /bin/echo and sysfs to be mounted which may not be true
155 * early in the boot process.
156 */
157 #ifdef HAVE_ELEVATOR_CHANGE
158 error = elevator_change(q, elevator);
159 #else
160 #define SET_SCHEDULER_CMD \
161 "exec 0</dev/null " \
162 " 1>/sys/block/%s/queue/scheduler " \
163 " 2>/dev/null; " \
164 "echo %s"
165
166 char *argv[] = { "/bin/sh", "-c", NULL, NULL };
167 char *envp[] = { NULL };
168
169 argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
170 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
171 strfree(argv[2]);
172 #endif /* HAVE_ELEVATOR_CHANGE */
173 if (error) {
174 zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
175 elevator, v->vdev_path, device, error);
176 }
177 }
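/*
 * For reference, with the usermodehelper fallback above and illustrative
 * values of "sda" for the device and "noop" for the elevator,
 * SET_SCHEDULER_CMD expands to roughly:
 *
 *	/bin/sh -c "exec 0</dev/null  1>/sys/block/sda/queue/scheduler \
 *	    2>/dev/null; echo noop"
 *
 * i.e. the elevator name is simply written to the device's sysfs node.
 */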
178
179 /*
180 * Expanding a whole disk vdev involves invoking BLKRRPART on the
181 * whole disk device. This poses a problem, because BLKRRPART will
182 * return EBUSY if one of the disk's partitions is open. That's why
183 * we have to do it here, just before opening the data partition.
184 * Unfortunately, BLKRRPART works by dropping all partitions and
185 * recreating them, which means that for a short time window, all
186 * /dev/sdxN device files disappear (until udev recreates them).
187 * This means two things:
188 * - When we open the data partition just after a BLKRRPART, we
189 * can't do it using the normal device file path because of the
190 * obvious race condition with udev. Instead, we use reliable
191 * kernel APIs to get a handle to the new partition device from
192 * the whole disk device.
193 * - Because vdev_disk_open() initially needs to find the device
194 * using its path, multiple vdev_disk_open() invocations in
195 * short succession on the same disk with BLKRRPARTs in the
196 * middle have a high probability of failure (because of the
197 * race condition with udev). A typical situation where this
198 * might happen is when the zpool userspace tool does a
199 * TRYIMPORT immediately followed by an IMPORT. For this
200 * reason, we only invoke BLKRRPART in the module when strictly
201 * necessary (zpool online -e case), and rely on userspace to
202 * do it when possible.
203 */
204 static struct block_device *
205 vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
206 {
207 #if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
208 struct block_device *bdev, *result = ERR_PTR(-ENXIO);
209 struct gendisk *disk;
210 int error, partno;
211
212 bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
213 if (IS_ERR(bdev))
214 return (bdev);
215
216 disk = get_gendisk(bdev->bd_dev, &partno);
217 vdev_bdev_close(bdev, vdev_bdev_mode(mode));
218
219 if (disk) {
220 bdev = bdget(disk_devt(disk));
221 if (bdev) {
222 error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
223 if (error == 0)
224 error = ioctl_by_bdev(bdev, BLKRRPART, 0);
225 vdev_bdev_close(bdev, vdev_bdev_mode(mode));
226 }
227
228 bdev = bdget_disk(disk, partno);
229 if (bdev) {
230 error = blkdev_get(bdev,
231 vdev_bdev_mode(mode) | FMODE_EXCL, vd);
232 if (error == 0)
233 result = bdev;
234 }
235 put_disk(disk);
236 }
237
238 return (result);
239 #else
240 return (ERR_PTR(-EOPNOTSUPP));
241 #endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
242 }
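/*
 * To summarize the sequence above: the whole-disk device is opened by
 * path, its gendisk and partition number are looked up, the disk is
 * reopened by dev_t so BLKRRPART can reread the partition table, and
 * only then is the data partition opened (exclusively) via bdget_disk()
 * rather than by its /dev path, avoiding the udev race described above.
 */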
243
244 static int
245 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
246 uint64_t *ashift)
247 {
248 struct block_device *bdev = ERR_PTR(-ENXIO);
249 vdev_disk_t *vd;
250 int count = 0, mode, block_size;
251
252 /* Must have a pathname and it must be absolute. */
253 if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
254 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
255 return (SET_ERROR(EINVAL));
256 }
257
258 /*
259 * Reopen the device if it's not currently open. Otherwise,
260 * just update the physical size of the device.
261 */
262 if (v->vdev_tsd != NULL) {
263 ASSERT(v->vdev_reopening);
264 vd = v->vdev_tsd;
265 goto skip_open;
266 }
267
268 vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
269 if (vd == NULL)
270 return (SET_ERROR(ENOMEM));
271
272 /*
273 * Devices are always opened by the path provided at configuration
274 * time. This means that if the provided path is a udev by-id path
275 * then drives may be recabled without an issue. If the provided
276 * path is a udev by-path path, then the physical location information
277 * will be preserved. This can be critical for more complicated
278 * configurations where drives are located in specific physical
279 * locations to maximize the system's tolerance to component failure.
280 * Alternatively, you can provide your own udev rule to flexibly map
281 * the drives as you see fit. It is not advised that you use the
282 * /dev/[hs]d devices which may be reordered due to probing order.
283 * Devices in the wrong locations will be detected by the higher
284 * level vdev validation.
285 *
286 * The specified paths may be briefly removed and recreated in
287 * response to udev events. This should be exceptionally unlikely
288 * because the zpool command makes every effort to verify these paths
289 * have already settled prior to reaching this point. Therefore,
290 * a ENOENT failure at this point is highly likely to be transient
291 * and it is reasonable to sleep and retry before giving up. In
292 * practice delays have been observed to be on the order of 100ms.
293 */
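/*
 * As an illustration (hypothetical names), a by-id path such as
 * /dev/disk/by-id/ata-EXAMPLE_SERIAL-part1 or a by-path path such as
 * /dev/disk/by-path/pci-0000:00:1f.2-ata-1-part1 is preferred over a
 * bare /dev/sda1, which can move between boots.
 */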
294 mode = spa_mode(v->vdev_spa);
295 if (v->vdev_wholedisk && v->vdev_expanding)
296 bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
297
298 while (IS_ERR(bdev) && count < 50) {
299 bdev = vdev_bdev_open(v->vdev_path,
300 vdev_bdev_mode(mode), zfs_vdev_holder);
301 if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
302 msleep(10);
303 count++;
304 } else if (IS_ERR(bdev)) {
305 break;
306 }
307 }
308
309 if (IS_ERR(bdev)) {
310 dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
311 v->vdev_path, -PTR_ERR(bdev), count);
312 kmem_free(vd, sizeof (vdev_disk_t));
313 return (SET_ERROR(-PTR_ERR(bdev)));
314 }
315
316 v->vdev_tsd = vd;
317 vd->vd_bdev = bdev;
318
319 skip_open:
320 /* Determine the physical block size */
321 block_size = vdev_bdev_block_size(vd->vd_bdev);
322
323 /* Clear the nowritecache bit; this causes vdev_reopen() to try again. */
324 v->vdev_nowritecache = B_FALSE;
325
326 /* Inform the ZIO pipeline that we are non-rotational */
327 v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
328
329 /* Physical volume size in bytes */
330 *psize = bdev_capacity(vd->vd_bdev);
331
332 /* TODO: report possible expansion size */
333 *max_psize = *psize;
334
335 /* Based on the minimum sector size, set the block size */
336 *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
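/*
 * Worked example: a 4096-byte block_size gives
 * highbit64(4096) - 1 = 13 - 1 = 12, i.e. ashift=12; a 512-byte device
 * yields ashift=9 (SPA_MINBLOCKSIZE is the 512-byte floor).
 */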
337
338 /* Try to set the io scheduler elevator algorithm */
339 (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
340
341 return (0);
342 }
343
344 static void
345 vdev_disk_close(vdev_t *v)
346 {
347 vdev_disk_t *vd = v->vdev_tsd;
348
349 if (v->vdev_reopening || vd == NULL)
350 return;
351
352 if (vd->vd_bdev != NULL)
353 vdev_bdev_close(vd->vd_bdev,
354 vdev_bdev_mode(spa_mode(v->vdev_spa)));
355
356 kmem_free(vd, sizeof (vdev_disk_t));
357 v->vdev_tsd = NULL;
358 }
359
360 static dio_request_t *
361 vdev_disk_dio_alloc(int bio_count)
362 {
363 dio_request_t *dr;
364 int i;
365
366 dr = kmem_zalloc(sizeof (dio_request_t) +
367 sizeof (struct bio *) * bio_count, KM_SLEEP);
368 if (dr) {
369 atomic_set(&dr->dr_ref, 0);
370 dr->dr_bio_count = bio_count;
371 dr->dr_error = 0;
372
373 for (i = 0; i < dr->dr_bio_count; i++)
374 dr->dr_bio[i] = NULL;
375 }
376
377 return (dr);
378 }
379
380 static void
381 vdev_disk_dio_free(dio_request_t *dr)
382 {
383 int i;
384
385 for (i = 0; i < dr->dr_bio_count; i++)
386 if (dr->dr_bio[i])
387 bio_put(dr->dr_bio[i]);
388
389 kmem_free(dr, sizeof (dio_request_t) +
390 sizeof (struct bio *) * dr->dr_bio_count);
391 }
392
393 static void
394 vdev_disk_dio_get(dio_request_t *dr)
395 {
396 atomic_inc(&dr->dr_ref);
397 }
398
399 static int
400 vdev_disk_dio_put(dio_request_t *dr)
401 {
402 int rc = atomic_dec_return(&dr->dr_ref);
403
404 /*
405 * Free the dio_request when the last reference is dropped and
406 * ensure zio_interpret is called only once with the correct zio
407 */
408 if (rc == 0) {
409 zio_t *zio = dr->dr_zio;
410 int error = dr->dr_error;
411
412 vdev_disk_dio_free(dr);
413
414 if (zio) {
415 zio->io_error = error;
416 ASSERT3S(zio->io_error, >=, 0);
417 if (zio->io_error)
418 vdev_disk_error(zio);
419
420 zio_delay_interrupt(zio);
421 }
422 }
423
424 return (rc);
425 }
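/*
 * Reference-count lifecycle, summarized: __vdev_disk_physio() takes one
 * reference per bio it builds plus one extra reference to cover the
 * submission loop itself; vdev_disk_physio_completion() drops one per
 * completed bio and the submitter drops the extra one, so whichever put
 * brings dr_ref to zero frees the dio and completes the zio.
 */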
426
427 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
428 {
429 dio_request_t *dr = bio->bi_private;
430 int rc;
431
432 if (dr->dr_error == 0) {
433 #ifdef HAVE_1ARG_BIO_END_IO_T
434 dr->dr_error = BIO_END_IO_ERROR(bio);
435 #else
436 if (error)
437 dr->dr_error = -(error);
438 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
439 dr->dr_error = EIO;
440 #endif
441 }
442
443 /* Drop reference acquired by __vdev_disk_physio */
444 rc = vdev_disk_dio_put(dr);
445 }
446
447 static unsigned int
448 bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
449 {
450 unsigned int offset, size, i;
451 struct page *page;
452
453 offset = offset_in_page(bio_ptr);
454 for (i = 0; i < bio->bi_max_vecs; i++) {
455 size = PAGE_SIZE - offset;
456
457 if (bio_size <= 0)
458 break;
459
460 if (size > bio_size)
461 size = bio_size;
462
463 if (is_vmalloc_addr(bio_ptr))
464 page = vmalloc_to_page(bio_ptr);
465 else
466 page = virt_to_page(bio_ptr);
467
468 /*
469 * Some network-related block devices use tcp_sendpage, which
470 * doesn't behave well when given a 0-count page; this is a
471 * safety net to catch them.
472 */
473 ASSERT3S(page_count(page), >, 0);
474
475 if (bio_add_page(bio, page, size, offset) != size)
476 break;
477
478 bio_ptr += size;
479 bio_size -= size;
480 offset = 0;
481 }
482
483 return (bio_size);
484 }
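/*
 * Worked example for the loop above: a 12288-byte buffer whose start
 * falls 512 bytes into a 4096-byte page is mapped as four bio_vec
 * segments of 3584 + 4096 + 4096 + 512 bytes; a return value of 0
 * means the whole buffer fit in this bio.
 */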
485
486 static unsigned int
487 bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
488 {
489 if (abd_is_linear(abd))
490 return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
491
492 return (abd_scatter_bio_map_off(bio, abd, size, off));
493 }
494
495 static inline void
496 vdev_submit_bio_impl(struct bio *bio)
497 {
498 #ifdef HAVE_1ARG_SUBMIT_BIO
499 submit_bio(bio);
500 #else
501 submit_bio(0, bio);
502 #endif
503 }
504
505 #ifndef HAVE_BIO_SET_DEV
506 static inline void
507 bio_set_dev(struct bio *bio, struct block_device *bdev)
508 {
509 bio->bi_bdev = bdev;
510 }
511 #endif /* !HAVE_BIO_SET_DEV */
512
513 static inline void
514 vdev_submit_bio(struct bio *bio)
515 {
516 #ifdef HAVE_CURRENT_BIO_TAIL
517 struct bio **bio_tail = current->bio_tail;
518 current->bio_tail = NULL;
519 vdev_submit_bio_impl(bio);
520 current->bio_tail = bio_tail;
521 #else
522 struct bio_list *bio_list = current->bio_list;
523 current->bio_list = NULL;
524 vdev_submit_bio_impl(bio);
525 current->bio_list = bio_list;
526 #endif
527 }
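/*
 * Note on the save/NULL/restore dance above: generic_make_request()
 * queues a bio on current->bio_list (or current->bio_tail on older
 * kernels) instead of submitting it when the list is already active,
 * as a recursion-avoidance measure. Clearing the pointer around
 * submit_bio() ensures the bio is dispatched immediately rather than
 * parked behind whatever bio this task context was already processing.
 */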
528
529 static int
530 __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
531 size_t io_size, uint64_t io_offset, int rw, int flags)
532 {
533 dio_request_t *dr;
534 uint64_t abd_offset;
535 uint64_t bio_offset;
536 int bio_size, bio_count = 16;
537 int i = 0, error = 0;
538 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
539 struct blk_plug plug;
540 #endif
541
542 ASSERT(zio != NULL);
543 ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);
544
545 retry:
546 dr = vdev_disk_dio_alloc(bio_count);
547 if (dr == NULL)
548 return (ENOMEM);
549
550 if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
551 bio_set_flags_failfast(bdev, &flags);
552
553 dr->dr_zio = zio;
554
555 /*
556 * When the IO size exceeds the maximum bio size for the request
557 * queue we are forced to break the IO into multiple bio's and wait
558 * for them all to complete. Ideally, all pool users will set
559 * their volume block size to match the maximum request size and
560 * the common case will be one bio per vdev IO request.
561 */
562
563 abd_offset = 0;
564 bio_offset = io_offset;
565 bio_size = io_size;
566 for (i = 0; i <= dr->dr_bio_count; i++) {
567
568 /* Finished constructing bio's for given buffer */
569 if (bio_size <= 0)
570 break;
571
572 /*
573 * By default only 'bio_count' bio's per dio are allowed.
574 * However, if we find ourselves in a situation where more
575 * are needed we allocate a larger dio and warn the user.
576 */
577 if (dr->dr_bio_count == i) {
578 vdev_disk_dio_free(dr);
579 bio_count *= 2;
580 goto retry;
581 }
582
583 /* bio_alloc() with __GFP_WAIT never returns NULL */
584 dr->dr_bio[i] = bio_alloc(GFP_NOIO,
585 MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
586 BIO_MAX_PAGES));
587 if (unlikely(dr->dr_bio[i] == NULL)) {
588 vdev_disk_dio_free(dr);
589 return (ENOMEM);
590 }
591
592 /* Matching put called by vdev_disk_physio_completion */
593 vdev_disk_dio_get(dr);
594
595 bio_set_dev(dr->dr_bio[i], bdev);
596 BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
597 dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
598 dr->dr_bio[i]->bi_private = dr;
599 bio_set_op_attrs(dr->dr_bio[i], rw, flags);
600
601 /* Remaining size is returned to become the new size */
602 bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
603 bio_size, abd_offset);
604
605 /* Advance in buffer and construct another bio if needed */
606 abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
607 bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
608 }
609
610 /* Extra reference to protect dio_request during vdev_submit_bio */
611 vdev_disk_dio_get(dr);
612
613 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
614 if (dr->dr_bio_count > 1)
615 blk_start_plug(&plug);
616 #endif
617
618 /* Submit all bio's associated with this dio */
619 for (i = 0; i < dr->dr_bio_count; i++)
620 if (dr->dr_bio[i])
621 vdev_submit_bio(dr->dr_bio[i]);
622
623 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
624 if (dr->dr_bio_count > 1)
625 blk_finish_plug(&plug);
626 #endif
627
628 (void) vdev_disk_dio_put(dr);
629
630 return (error);
631 }
632
633 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
634 {
635 zio_t *zio = bio->bi_private;
636 #ifdef HAVE_1ARG_BIO_END_IO_T
637 zio->io_error = BIO_END_IO_ERROR(bio);
638 #else
639 zio->io_error = -error;
640 #endif
641
642 if (zio->io_error && (zio->io_error == EOPNOTSUPP))
643 zio->io_vd->vdev_nowritecache = B_TRUE;
644
645 bio_put(bio);
646 ASSERT3S(zio->io_error, >=, 0);
647 if (zio->io_error)
648 vdev_disk_error(zio);
649 zio_interrupt(zio);
650 }
651
652 static int
653 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
654 {
655 struct request_queue *q;
656 struct bio *bio;
657
658 q = bdev_get_queue(bdev);
659 if (!q)
660 return (ENXIO);
661
662 bio = bio_alloc(GFP_NOIO, 0);
663 /* bio_alloc() with __GFP_WAIT never returns NULL */
664 if (unlikely(bio == NULL))
665 return (ENOMEM);
666
667 bio->bi_end_io = vdev_disk_io_flush_completion;
668 bio->bi_private = zio;
669 bio_set_dev(bio, bdev);
670 bio_set_flush(bio);
671 vdev_submit_bio(bio);
672 invalidate_bdev(bdev);
673
674 return (0);
675 }
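/*
 * The flush bio above carries no data pages; bio_set_flush() marks it so
 * the block layer issues it as a cache-flush request (REQ_PREFLUSH, or
 * the WRITE_FLUSH/barrier equivalent on older kernels, depending on the
 * compat macro). invalidate_bdev() then drops clean cached pages for the
 * device.
 */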
676
677 static void
678 vdev_disk_io_start(zio_t *zio)
679 {
680 vdev_t *v = zio->io_vd;
681 vdev_disk_t *vd = v->vdev_tsd;
682 int rw, flags, error;
683
684 switch (zio->io_type) {
685 case ZIO_TYPE_IOCTL:
686
687 if (!vdev_readable(v)) {
688 zio->io_error = SET_ERROR(ENXIO);
689 zio_interrupt(zio);
690 return;
691 }
692
693 switch (zio->io_cmd) {
694 case DKIOCFLUSHWRITECACHE:
695
696 if (zfs_nocacheflush)
697 break;
698
699 if (v->vdev_nowritecache) {
700 zio->io_error = SET_ERROR(ENOTSUP);
701 break;
702 }
703
704 error = vdev_disk_io_flush(vd->vd_bdev, zio);
705 if (error == 0)
706 return;
707
708 zio->io_error = error;
709
710 break;
711
712 default:
713 zio->io_error = SET_ERROR(ENOTSUP);
714 }
715
716 zio_execute(zio);
717 return;
718 case ZIO_TYPE_WRITE:
719 rw = WRITE;
720 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
721 flags = (1 << BIO_RW_UNPLUG);
722 #elif defined(REQ_UNPLUG)
723 flags = REQ_UNPLUG;
724 #else
725 flags = 0;
726 #endif
727 break;
728
729 case ZIO_TYPE_READ:
730 rw = READ;
731 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
732 flags = (1 << BIO_RW_UNPLUG);
733 #elif defined(REQ_UNPLUG)
734 flags = REQ_UNPLUG;
735 #else
736 flags = 0;
737 #endif
738 break;
739
740 default:
741 zio->io_error = SET_ERROR(ENOTSUP);
742 zio_interrupt(zio);
743 return;
744 }
745
746 zio->io_target_timestamp = zio_handle_io_delay(zio);
747 error = __vdev_disk_physio(vd->vd_bdev, zio,
748 zio->io_size, zio->io_offset, rw, flags);
749 if (error) {
750 zio->io_error = error;
751 zio_interrupt(zio);
752 return;
753 }
754 }
755
756 static void
757 vdev_disk_io_done(zio_t *zio)
758 {
759 /*
760 * If the device returned EIO, we revalidate the media. If it is
761 * determined the media has changed, this triggers the asynchronous
762 * removal of the device from the configuration.
763 */
764 if (zio->io_error == EIO) {
765 vdev_t *v = zio->io_vd;
766 vdev_disk_t *vd = v->vdev_tsd;
767
768 if (check_disk_change(vd->vd_bdev)) {
769 vdev_bdev_invalidate(vd->vd_bdev);
770 v->vdev_remove_wanted = B_TRUE;
771 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
772 }
773 }
774 }
775
776 static void
777 vdev_disk_hold(vdev_t *vd)
778 {
779 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
780
781 /* We must have a pathname, and it must be absolute. */
782 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
783 return;
784
785 /*
786 * Only prefetch path and devid info if the device has
787 * never been opened.
788 */
789 if (vd->vdev_tsd != NULL)
790 return;
791
792 /* XXX: Implement me as a vnode lookup for the device */
793 vd->vdev_name_vp = NULL;
794 vd->vdev_devid_vp = NULL;
795 }
796
797 static void
798 vdev_disk_rele(vdev_t *vd)
799 {
800 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
801
802 /* XXX: Implement me as a vnode rele for the device */
803 }
804
805 static int
806 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
807 {
808 spa_t *spa = NULL;
809 char *p;
810
811 if (val == NULL)
812 return (SET_ERROR(-EINVAL));
813
814 if ((p = strchr(val, '\n')) != NULL)
815 *p = '\0';
816
817 if (spa_mode_global != 0) {
818 mutex_enter(&spa_namespace_lock);
819 while ((spa = spa_next(spa)) != NULL) {
820 if (spa_state(spa) != POOL_STATE_ACTIVE ||
821 !spa_writeable(spa) || spa_suspended(spa))
822 continue;
823
824 spa_open_ref(spa, FTAG);
825 mutex_exit(&spa_namespace_lock);
826 vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
827 mutex_enter(&spa_namespace_lock);
828 spa_close(spa, FTAG);
829 }
830 mutex_exit(&spa_namespace_lock);
831 }
832
833 return (param_set_charp(val, kp));
834 }
835
836 vdev_ops_t vdev_disk_ops = {
837 vdev_disk_open,
838 vdev_disk_close,
839 vdev_default_asize,
840 vdev_disk_io_start,
841 vdev_disk_io_done,
842 NULL,
843 NULL,
844 vdev_disk_hold,
845 vdev_disk_rele,
846 VDEV_TYPE_DISK, /* name of this vdev type */
847 B_TRUE /* leaf vdev */
848 };
849
850 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
851 param_get_charp, &zfs_vdev_scheduler, 0644);
852 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
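/*
 * Usage sketch for the parameter above (values are examples): the
 * scheduler can be chosen at module load time, e.g.
 *
 *	modprobe zfs zfs_vdev_scheduler=noop
 *
 * or changed at runtime through the 0644 sysfs node, e.g.
 *
 *	echo noop > /sys/module/zfs/parameters/zfs_vdev_scheduler
 *
 * which runs param_set_vdev_scheduler() and reapplies the elevator to
 * the leaf vdevs of all active, writable pools.
 */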