/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;

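/*
 * Map the ZFS FREAD/FWRITE open mode to the representation used by the
 * kernel's block device open interface. When HAVE_OPEN_BDEV_EXCLUSIVE is
 * defined the mode is expressed as FMODE_* flags; otherwise it is expressed
 * as mount-style flags, with MS_RDONLY set for read-only opens.
 */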
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = MS_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

static uint64_t
bdev_capacity(struct block_device *bdev)
{
	struct hd_struct *part = bdev->bd_part;

	/* The partition capacity referenced by the block device */
	if (part)
		return (part->nr_sects << 9);

	/* Otherwise assume the full device capacity */
	return (get_capacity(bdev->bd_disk) << 9);
}

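/*
 * Log the details of a failed zio to the console. This is compiled out
 * unless ZFS_DEBUG is defined.
 */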
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
#endif
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices. This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device. This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static int
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = vd->vd_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	char *device = bdev->bd_disk->disk_name;
	int error;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator. If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return (0);

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return (0);

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return (0);

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs; this requires /bin/echo and sysfs to be
	 * mounted, which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	{
		char *argv[] = { "/bin/sh", "-c", NULL, NULL };
		char *envp[] = { NULL };

		argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
		error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
		strfree(argv[2]);
	}
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);

	return (error);
}

/*
 * Expanding a whole disk vdev involves invoking BLKRRPART on the
 * whole disk device. This poses a problem, because BLKRRPART will
 * return EBUSY if one of the disk's partitions is open. That's why
 * we have to do it here, just before opening the data partition.
 * Unfortunately, BLKRRPART works by dropping all partitions and
 * recreating them, which means that for a short time window, all
 * /dev/sdxN device files disappear (until udev recreates them).
 * This means two things:
 *  - When we open the data partition just after a BLKRRPART, we
 *    can't do it using the normal device file path because of the
 *    obvious race condition with udev. Instead, we use reliable
 *    kernel APIs to get a handle to the new partition device from
 *    the whole disk device.
 *  - Because vdev_disk_open() initially needs to find the device
 *    using its path, multiple vdev_disk_open() invocations in
 *    short succession on the same disk with BLKRRPARTs in the
 *    middle have a high probability of failure (because of the
 *    race condition with udev). A typical situation where this
 *    might happen is when the zpool userspace tool does a
 *    TRYIMPORT immediately followed by an IMPORT. For this
 *    reason, we only invoke BLKRRPART in the module when strictly
 *    necessary (zpool online -e case), and rely on userspace to
 *    do it when possible.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
	struct gendisk *disk;
	int error, partno;

	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (bdev);

	disk = get_gendisk(bdev->bd_dev, &partno);
	vdev_bdev_close(bdev, vdev_bdev_mode(mode));

	if (disk) {
		bdev = bdget(disk_devt(disk));
		if (bdev) {
			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
			if (error == 0)
				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
		}

		bdev = bdget_disk(disk, partno);
		if (bdev) {
			error = blkdev_get(bdev,
			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
			if (error == 0)
				result = bdev;
		}
		put_disk(disk);
	}

	return (result);
#else
	return (ERR_PTR(-EOPNOTSUPP));
#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
}

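/*
 * Open the block device backing this vdev by its configured path, retrying
 * briefly on ENOENT to ride out transient udev renames. On success record
 * the handle in the vdev_disk_t and report the device's size, rotational
 * status, and minimum sector size (as an ashift) back to the caller.
 */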
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be recabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, -PTR_ERR(bdev), count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(-PTR_ERR(bdev)));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, so that vdev_reopen() will try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev);

	/* TODO: report possible expansion size */
	*max_psize = *psize;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL)
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));

	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

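/*
 * Allocate a dio_request with room for 'bio_count' bio pointers in its
 * trailing flexible array. The reference count starts at zero; callers
 * take a reference per attached bio via vdev_disk_dio_get().
 */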
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);
			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

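/*
 * Per-bio completion callback. Record the first error observed for the
 * parent dio_request and drop the reference taken when the bio was
 * constructed; the final put completes the parent zio.
 */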
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = -(bio->bi_error);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

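/*
 * Number of physical pages spanned by a buffer of 'bio_size' bytes starting
 * at 'bio_ptr', accounting for the offset into the first page.
 */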
static inline unsigned long
bio_nr_pages(void *bio_ptr, unsigned int bio_size)
{
	return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >>
	    PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT));
}

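/*
 * Add the pages backing a kernel buffer (kmalloc'd or vmalloc'd) to the
 * given bio, one page-sized segment at a time. Returns the number of bytes
 * which could not be mapped; a non-zero result means the caller must
 * construct an additional bio for the remainder.
 */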
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

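/*
 * Compatibility shim for kernels which do not provide bio_set_op_attrs()
 * (added alongside the REQ_OP interface in Linux 4.8). On those kernels the
 * operation and flags are simply OR'd into bi_rw.
 */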
#ifndef bio_set_op_attrs
#define	bio_set_op_attrs(bio, rw, flags) \
	do { (bio)->bi_rw |= (rw)|(flags); } while (0)
#endif

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

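/*
 * Submit a bio with the calling task's pending bio list (bio_tail or
 * bio_list, depending on kernel version) temporarily cleared, so that
 * generic_make_request() does not defer the bio onto that per-task list
 * and instead dispatches it immediately. The saved pointer is restored
 * once submission returns.
 */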
static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}

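/*
 * Issue a read or write against 'bdev' for the given kernel buffer. The
 * buffer is split across as many bios as the request queue requires; if the
 * initial guess of 16 bios is too small the dio_request is reallocated with
 * twice as many slots and construction is retried. The extra reference taken
 * here keeps the dio_request alive until every bio has been submitted.
 */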
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
    size_t kbuf_size, uint64_t kbuf_offset, int rw, int flags)
{
	dio_request_t *dr;
	caddr_t bio_ptr;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;

	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (ENOMEM);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete. Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
	bio_ptr = kbuf_ptr;
	bio_offset = kbuf_offset;
	bio_size = kbuf_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (ENOMEM);
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		dr->dr_bio[i]->bi_bdev = bdev;
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);

		/* Advance in buffer and construct another bio if needed */
		bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

#ifndef __linux__
int
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
    size_t size, uint64_t offset, int rw, int flags)
{
	bio_set_flags_failfast(bdev, &flags);
	return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, rw, flags));
}
#endif

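/*
 * Completion callback for the cache flush bio. Propagate the result to the
 * zio and, if the device reports EOPNOTSUPP, note that its write cache
 * cannot be flushed so future flushes are skipped.
 */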
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	int rc = bio->bi_error;
#endif

	zio->io_error = -rc;
	if (rc && (rc == -EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

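/*
 * Ask the block device to flush its volatile write cache by submitting an
 * empty bio flagged with the platform's write-flush/FUA attributes. The
 * request completes asynchronously via vdev_disk_io_flush_completion().
 */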
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (ENXIO);

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (ENOMEM);

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio->bi_bdev = bdev;
	bio_set_op_attrs(bio, 0, VDEV_WRITE_FLUSH_FUA);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

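/*
 * Dispatch a zio to the underlying block device: ioctl zios handle cache
 * flush requests, while reads and writes are converted to bios by
 * __vdev_disk_physio(). Errors detected here complete the zio immediately.
 */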
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return;

			zio->io_error = error;
			if (error == ENOTSUP)
				v->vdev_nowritecache = B_TRUE;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
	    zio->io_size, zio->io_offset, rw, flags);
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

#ifndef __linux__
/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
	struct block_device *bdev;
	vdev_label_t *label;
	uint64_t s, size;
	int i;

	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (-PTR_ERR(bdev));

	s = bdev_capacity(bdev);
	if (s == 0) {
		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
		return (EIO);
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = vmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	for (i = 0; i < VDEV_LABELS; i++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, i, 0);
		if (vdev_disk_physio(bdev, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ,
		    REQ_SYNC) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	vmem_free(label, sizeof (vdev_label_t));
	vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));

	return (0);
}
#endif /* __linux__ */

module_param(zfs_vdev_scheduler, charp, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");