/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;


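/*
 * Translate the FREAD/FWRITE bits of the spa open mode into the
 * open mode value expected by vdev_bdev_open() on this kernel.
 */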
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = MS_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

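/*
 * Return the device capacity in bytes: the partition size when the
 * block device refers to a partition, otherwise the whole-disk size.
 */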
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	struct hd_struct *part = bdev->bd_part;

	/* The partition capacity referenced by the block device */
	if (part)
		return (part->nr_sects << 9);

	/* Otherwise assume the full device capacity */
	return (get_capacity(bdev->bd_disk) << 9);
}

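/*
 * Log the details of a failed zio (error, type, offset, size, flags,
 * and delay) when the module is built with ZFS_DEBUG.
 */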
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x delay=%llu\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags, (u_longlong_t)zio->io_delay);
#endif
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static int
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = vd->vd_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	char *device = bdev->bd_disk->disk_name;
	int error;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return (0);

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return (0);

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return (0);

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs.  This requires /bin/sh and sysfs to be
	 * mounted, which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	{
		char *argv[] = { "/bin/sh", "-c", NULL, NULL };
		char *envp[] = { NULL };

		argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
		error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
		strfree(argv[2]);
	}
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);

	return (error);
}

/*
 * Expanding a whole disk vdev involves invoking BLKRRPART on the
 * whole disk device.  This poses a problem, because BLKRRPART will
 * return EBUSY if one of the disk's partitions is open.  That's why
 * we have to do it here, just before opening the data partition.
 * Unfortunately, BLKRRPART works by dropping all partitions and
 * recreating them, which means that for a short time window, all
 * /dev/sdxN device files disappear (until udev recreates them).
 * This means two things:
 *  - When we open the data partition just after a BLKRRPART, we
 *    can't do it using the normal device file path because of the
 *    obvious race condition with udev.  Instead, we use reliable
 *    kernel APIs to get a handle to the new partition device from
 *    the whole disk device.
 *  - Because vdev_disk_open() initially needs to find the device
 *    using its path, multiple vdev_disk_open() invocations in
 *    short succession on the same disk with BLKRRPARTs in the
 *    middle have a high probability of failure (because of the
 *    race condition with udev).  A typical situation where this
 *    might happen is when the zpool userspace tool does a
 *    TRYIMPORT immediately followed by an IMPORT.  For this
 *    reason, we only invoke BLKRRPART in the module when strictly
 *    necessary (zpool online -e case), and rely on userspace to
 *    do it when possible.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
	struct gendisk *disk;
	int error, partno;

	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (bdev);

	disk = get_gendisk(bdev->bd_dev, &partno);
	vdev_bdev_close(bdev, vdev_bdev_mode(mode));

	if (disk) {
		bdev = bdget(disk_devt(disk));
		if (bdev) {
			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
			if (error == 0)
				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
		}

		bdev = bdget_disk(disk, partno);
		if (bdev) {
			error = blkdev_get(bdev,
			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
			if (error == 0)
				result = bdev;
		}
		put_disk(disk);
	}

	return (result);
#else
	return (ERR_PTR(-EOPNOTSUPP));
#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
}

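/*
 * Open the block device backing this vdev by its configured path,
 * retrying briefly on ENOENT to ride out transient udev device node
 * removal, then report the device size and minimum block size
 * (ashift) back to the caller.
 */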
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be recabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, -PTR_ERR(bdev), count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(-PTR_ERR(bdev)));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev);

	/* TODO: report possible expansion size */
	*max_psize = *psize;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

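/*
 * Release the block device and free the per-vdev state, unless the
 * vdev is only being reopened, in which case the handle is kept.
 */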
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL)
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));

	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

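/*
 * Allocate a dio_request with room for 'bio_count' attached bio
 * pointers.  The reference count starts at zero; references are
 * taken as bio's are constructed and submitted.
 */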
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

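/* Take a reference on the dio_request; dropped by vdev_disk_dio_put() */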
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interrupt() is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_delay = jiffies_64 - zio->io_delay;
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);
			zio_interrupt(zio);
		}
	}

	return (rc);
}

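/*
 * Per-bio completion callback: record the first error observed for
 * the dio_request and drop the reference taken when the bio was built.
 */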
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

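/*
 * Number of pages spanned by a buffer of 'bio_size' bytes starting at
 * 'bio_ptr', accounting for the offset into the first page.
 */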
static inline unsigned long
bio_nr_pages(void *bio_ptr, unsigned int bio_size)
{
	return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >>
	    PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT));
}

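/*
 * Map the pages backing a kernel buffer (kmalloc'd or vmalloc'd) into
 * the given bio.  Returns the number of bytes which could not be
 * mapped; a non-zero result means another bio is required.
 */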
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

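/* Submit a bio using whichever submit_bio() signature this kernel provides */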
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}

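/*
 * Issue a read or write of 'kbuf_size' bytes at 'kbuf_offset' against
 * the block device, splitting the buffer across as many bio's as the
 * request queue requires.
 */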
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
    size_t kbuf_size, uint64_t kbuf_offset, int rw, int flags)
{
	dio_request_t *dr;
	caddr_t bio_ptr;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif

	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (ENOMEM);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
	bio_ptr = kbuf_ptr;
	bio_offset = kbuf_offset;
	bio_size = kbuf_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (ENOMEM);
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		dr->dr_bio[i]->bi_bdev = bdev;
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);

		/* Advance in buffer and construct another bio if needed */
		bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);
	if (zio)
		zio->io_delay = jiffies_64;

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}

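/*
 * Completion callback for the cache flush bio.  An EOPNOTSUPP error
 * marks the vdev as having no write cache so further flushes are
 * not issued.
 */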
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	zio->io_delay = jiffies_64 - zio->io_delay;
	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

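/*
 * Submit an empty flush bio to force the device's volatile write
 * cache to stable storage.  Completion is handled asynchronously by
 * vdev_disk_io_flush_completion().
 */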
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (ENXIO);

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (ENOMEM);

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio->bi_bdev = bdev;
	zio->io_delay = jiffies_64;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

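/*
 * ZIO pipeline entry point for disk vdevs: cache flush ioctls are
 * handled here, reads and writes are passed to __vdev_disk_physio().
 */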
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return;

			zio->io_error = error;
			if (error == ENOTSUP)
				v->vdev_nowritecache = B_TRUE;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
	    zio->io_size, zio->io_offset, rw, flags);
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

module_param(zfs_vdev_scheduler, charp, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");