module/zfs/vdev_disk.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;

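/*
 * Translate the SPA-level FREAD/FWRITE open mode into the mode expected
 * by the kernel's block device open interface: an fmode_t bitmask on
 * kernels providing open_bdev_exclusive(), or an MS_RDONLY-style flag
 * otherwise.
 */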
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = MS_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

static uint64_t
bdev_capacity(struct block_device *bdev)
{
	struct hd_struct *part = bdev->bd_part;

	/* The partition capacity referenced by the block device */
	if (part)
		return (part->nr_sects << 9);

	/* Otherwise assume the full device capacity */
	return (get_capacity(bdev->bd_disk) << 9);
}

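/*
 * Print the details of a failed zio to the console.  Only compiled in
 * for debug builds (ZFS_DEBUG); called from the I/O completion paths
 * below whenever zio->io_error is set.
 */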
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
#endif
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static int
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = vd->vd_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	char *device = bdev->bd_disk->disk_name;
	int error;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return (0);

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return (0);

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return (0);

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs.  This requires /bin/echo and sysfs to be
	 * mounted which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	{
		char *argv[] = { "/bin/sh", "-c", NULL, NULL };
		char *envp[] = { NULL };

		argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
		error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
		strfree(argv[2]);
	}
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);

	return (error);
}

/*
 * Expanding a whole disk vdev involves invoking BLKRRPART on the
 * whole disk device.  This poses a problem, because BLKRRPART will
 * return EBUSY if one of the disk's partitions is open.  That's why
 * we have to do it here, just before opening the data partition.
 * Unfortunately, BLKRRPART works by dropping all partitions and
 * recreating them, which means that for a short time window, all
 * /dev/sdxN device files disappear (until udev recreates them).
 * This means two things:
 *  - When we open the data partition just after a BLKRRPART, we
 *    can't do it using the normal device file path because of the
 *    obvious race condition with udev.  Instead, we use reliable
 *    kernel APIs to get a handle to the new partition device from
 *    the whole disk device.
 *  - Because vdev_disk_open() initially needs to find the device
 *    using its path, multiple vdev_disk_open() invocations in
 *    short succession on the same disk with BLKRRPARTs in the
 *    middle have a high probability of failure (because of the
 *    race condition with udev).  A typical situation where this
 *    might happen is when the zpool userspace tool does a
 *    TRYIMPORT immediately followed by an IMPORT.  For this
 *    reason, we only invoke BLKRRPART in the module when strictly
 *    necessary (zpool online -e case), and rely on userspace to
 *    do it when possible.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
	struct gendisk *disk;
	int error, partno;

	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (bdev);

	disk = get_gendisk(bdev->bd_dev, &partno);
	vdev_bdev_close(bdev, vdev_bdev_mode(mode));

	if (disk) {
		bdev = bdget(disk_devt(disk));
		if (bdev) {
			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
			if (error == 0)
				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
		}

		bdev = bdget_disk(disk, partno);
		if (bdev) {
			error = blkdev_get(bdev,
			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
			if (error == 0)
				result = bdev;
		}
		put_disk(disk);
	}

	return (result);
#else
	return (ERR_PTR(-EOPNOTSUPP));
#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
}

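/*
 * Open the block device backing this vdev.  The device is located by
 * the configured vdev path, retrying briefly on ENOENT to ride out
 * transient udev renames, and the detected capacity, minimum block
 * size (ashift), and rotational status are reported back to the
 * common vdev code.
 */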
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be recabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, -PTR_ERR(bdev), count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(-PTR_ERR(bdev)));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev);

	/* TODO: report possible expansion size */
	*max_psize = *psize;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

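/*
 * Release the block device and per-vdev state allocated by
 * vdev_disk_open().  Nothing is done for a reopen, where the existing
 * handle is reused.
 */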
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL)
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));

	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

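/*
 * Allocate a dio_request large enough to track 'bio_count' attached
 * bio's.  The structure is reference counted; it is freed by
 * vdev_disk_dio_free() once the final reference is dropped in
 * vdev_disk_dio_put().
 */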
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);
			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

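/*
 * Per-bio completion callback.  Record the first error reported by any
 * bio in the dio_request and drop the reference taken when the bio was
 * constructed; the parent zio completes once every bio has finished.
 */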
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = -(bio->bi_error);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

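/*
 * Return the number of pages spanned by the buffer, accounting for the
 * buffer's offset within its first page.
 */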
static inline unsigned long
bio_nr_pages(void *bio_ptr, unsigned int bio_size)
{
	return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >>
	    PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT));
}

static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

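/*
 * Submit a bio with the calling task's pending bio list temporarily
 * cleared (the field is bio_tail or bio_list depending on kernel
 * version).  This forces submit_bio() to dispatch the bio immediately
 * rather than queue it behind the caller's in-flight bios, which is
 * intended to avoid stalls when ZFS issues I/O from within another
 * block device's make_request path.
 */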
static inline void
vdev_submit_bio(int rw, struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	submit_bio(rw, bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	submit_bio(rw, bio);
	current->bio_list = bio_list;
#endif
}

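/*
 * Map the kernel buffer into one or more bio's and submit them to the
 * block device.  The dio_request holds a reference per submitted bio
 * plus one for the caller, so the parent zio is completed exactly once
 * from vdev_disk_dio_put() when the last bio finishes.
 */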
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
    size_t kbuf_size, uint64_t kbuf_offset, int flags)
{
	dio_request_t *dr;
	caddr_t bio_ptr;
	uint64_t bio_offset;
	int rw, bio_size, bio_count = 16;
	int i = 0, error = 0;

	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (ENOMEM);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	rw = flags;
	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO in multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
	bio_ptr = kbuf_ptr;
	bio_offset = kbuf_offset;
	bio_size = kbuf_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (ENOMEM);
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		dr->dr_bio[i]->bi_bdev = bdev;
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_rw = rw;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;

		/* Remaining size is returned to become the new size */
		bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);

		/* Advance in buffer and construct another bio if needed */
		bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(rw, dr->dr_bio[i]);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

#ifndef __linux__
int
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
    size_t size, uint64_t offset, int flags)
{
	bio_set_flags_failfast(bdev, &flags);
	return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags));
}
#endif

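/*
 * Completion callback for the cache-flush bio issued below.  A return
 * of EOPNOTSUPP indicates the device cannot flush its write cache, so
 * further DKIOCFLUSHWRITECACHE requests are suppressed by setting
 * vdev_nowritecache.
 */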
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	int rc = bio->bi_error;
#endif

	zio->io_error = -rc;
	if (rc && (rc == -EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

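/*
 * Issue an empty write bio with the FLUSH/FUA flags set to force the
 * device's volatile write cache to stable storage.  The request
 * completes asynchronously in vdev_disk_io_flush_completion().
 */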
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (ENXIO);

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (ENOMEM);

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio->bi_bdev = bdev;
	vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
	invalidate_bdev(bdev);

	return (0);
}

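/*
 * Dispatch a zio to the underlying block device.  Cache-flush ioctls
 * are translated into a flush bio, reads and writes are mapped into
 * bio's by __vdev_disk_physio(), and anything else is failed with
 * ENOTSUP.
 */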
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return;

			zio->io_error = error;
			if (error == ENOTSUP)
				v->vdev_nowritecache = B_TRUE;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = WRITE | (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = WRITE | REQ_UNPLUG;
#else
		flags = WRITE;
#endif
		break;

	case ZIO_TYPE_READ:
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = READ | (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = READ | REQ_UNPLUG;
#else
		flags = READ;
#endif
		break;

	default:
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
	    zio->io_size, zio->io_offset, flags);
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

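/*
 * Virtual device operations vector for disk vdevs.  The NULL entry
 * corresponds to the optional state-change callback
 * (vdev_op_state_change), which disk vdevs do not implement.
 */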
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

#ifndef __linux__
/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
	struct block_device *bdev;
	vdev_label_t *label;
	uint64_t s, size;
	int i;

	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (-PTR_ERR(bdev));

	s = bdev_capacity(bdev);
	if (s == 0) {
		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
		return (EIO);
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = vmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	for (i = 0; i < VDEV_LABELS; i++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, i, 0);
		if (vdev_disk_physio(bdev, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	vmem_free(label, sizeof (vdev_label_t));
	vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));

	return (0);
}
#endif /* __linux__ */

module_param(zfs_vdev_scheduler, charp, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");