module/zfs/vdev_disk.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
25 * LLNL-CODE-403049.
26 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/abd.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/zio.h>
36 #include <linux/mod_compat.h>
37 #include <linux/msdos_fs.h>
38
39 char *zfs_vdev_scheduler = VDEV_SCHEDULER;
40 static void *zfs_vdev_holder = VDEV_HOLDER;
41
42 /* size of the "reserved" partition, in blocks */
43 #define EFI_MIN_RESV_SIZE (16 * 1024)
44
45 /*
46 * Virtual device vector for disks.
47 */
48 typedef struct dio_request {
49 zio_t *dr_zio; /* Parent ZIO */
50 atomic_t dr_ref; /* References */
51 int dr_error; /* Bio error */
52 int dr_bio_count; /* Count of bios */
53 struct bio *dr_bio[0]; /* Attached bios */
54 } dio_request_t;
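
/*
 * dr_bio[] is a zero-length (flexible) array member: a dio_request_t is
 * always allocated with room for dr_bio_count trailing bio pointers, e.g.
 *
 *     dr = kmem_zalloc(sizeof (dio_request_t) +
 *         sizeof (struct bio *) * bio_count, KM_SLEEP);
 *
 * as done by vdev_disk_dio_alloc() below.
 */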
55
56
57 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
58 static fmode_t
59 vdev_bdev_mode(int smode)
60 {
61 fmode_t mode = 0;
62
63 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
64
65 if (smode & FREAD)
66 mode |= FMODE_READ;
67
68 if (smode & FWRITE)
69 mode |= FMODE_WRITE;
70
71 return (mode);
72 }
73 #else
74 static int
75 vdev_bdev_mode(int smode)
76 {
77 int mode = 0;
78
79 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
80
81 if ((smode & FREAD) && !(smode & FWRITE))
82 mode = MS_RDONLY;
83
84 return (mode);
85 }
86 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
87
88 /*
89 * Returns the usable capacity (in bytes) for the partition or disk.
90 */
91 static uint64_t
92 bdev_capacity(struct block_device *bdev)
93 {
94 return (i_size_read(bdev->bd_inode));
95 }
96
97 /*
98 * Returns the maximum expansion capacity of the block device (in bytes).
99 *
100 * It is possible to expand a vdev when it has been created as a wholedisk
101 * and the containing block device has increased in capacity. Or when the
102 * partition containing the pool has been manually increased in size.
103 *
104 * This function is only responsible for calculating the potential expansion
105 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
106 * responsible for verifying the expected partition layout in the wholedisk
107 * case, and updating the partition table if appropriate. Once the partition
108 * size has been increased the additional capacity will be visible using
109 * bdev_capacity().
110 */
111 static uint64_t
112 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
113 {
114 uint64_t psize;
115 int64_t available;
116
117 if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
118 /*
119 * When reporting maximum expansion capacity for a wholedisk,
120 * deduct any capacity which is expected to be lost due to
121 * alignment restrictions. Over-reporting this value isn't
122 * harmful and would only result in slightly less capacity
123 * than expected post-expansion.
124 */
125 available = i_size_read(bdev->bd_contains->bd_inode) -
126 ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
127 PARTITION_END_ALIGNMENT) << SECTOR_BITS);
128 if (available > 0)
129 psize = available;
130 else
131 psize = bdev_capacity(bdev);
132 } else {
133 psize = bdev_capacity(bdev);
134 }
135
136 return (psize);
137 }
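
/*
 * Worked example (illustrative figures only): with 512-byte sectors,
 * EFI_MIN_RESV_SIZE (16384 blocks) corresponds to 8 MiB of reserved space.
 * For a wholedisk vdev on a 1 TiB disk whose pool partition is currently
 * smaller, the value reported here is roughly the full 1 TiB minus that
 * reservation and the partition start/end alignment; a plain partition
 * vdev simply reports bdev_capacity().
 */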
138
139 static void
140 vdev_disk_error(zio_t *zio)
141 {
142 zfs_dbgmsg("zio error=%d type=%d offset=%llu size=%llu flags=%x\n",
143 zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset,
144 (u_longlong_t)zio->io_size, zio->io_flags);
145 }
146
147 /*
148 * Use the Linux 'noop' elevator for zfs-managed block devices. This
149 * strikes the ideal balance by allowing the zfs elevator to do all
150 * request ordering and prioritization, while allowing the Linux
151 * elevator to do the maximum front/back merging allowed by the
152 * physical device. This yields the largest possible requests for
153 * the device with the lowest total overhead.
154 */
155 static void
156 vdev_elevator_switch(vdev_t *v, char *elevator)
157 {
158 vdev_disk_t *vd = v->vdev_tsd;
159 struct request_queue *q;
160 char *device;
161 int error;
162
163 for (int c = 0; c < v->vdev_children; c++)
164 vdev_elevator_switch(v->vdev_child[c], elevator);
165
166 if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
167 return;
168
169 q = bdev_get_queue(vd->vd_bdev);
170 device = vd->vd_bdev->bd_disk->disk_name;
171
172 /*
173 * Skip devices which are not whole disks (partitions).
174 * Device-mapper devices are excepted since they may be whole
175 * disks despite the vdev_wholedisk flag, in which case we can
176 * and should switch the elevator. If the device-mapper device
177 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
178 * "Skip devices without schedulers" check below will fail.
179 */
180 if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
181 return;
182
183 /* Leave existing scheduler when set to "none" */
184 if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
185 return;
186
187 /*
188 * The elevator_change() function was available in kernels from
189 * 2.6.36 to 4.11. When not available, fall back to using the user
190 * mode helper functionality to set the elevator via sysfs. This
191 * requires /bin/sh and sysfs to be mounted, which may not be true
192 * early in the boot process.
193 */
194 #ifdef HAVE_ELEVATOR_CHANGE
195 error = elevator_change(q, elevator);
196 #else
197 #define SET_SCHEDULER_CMD \
198 "exec 0</dev/null " \
199 " 1>/sys/block/%s/queue/scheduler " \
200 " 2>/dev/null; " \
201 "echo %s"
202
203 char *argv[] = { "/bin/sh", "-c", NULL, NULL };
204 char *envp[] = { NULL };
205
206 argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
207 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
208 strfree(argv[2]);
209 #endif /* HAVE_ELEVATOR_CHANGE */
210 if (error) {
211 zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
212 elevator, v->vdev_path, device, error);
213 }
214 }
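
/*
 * For example, with a hypothetical device named "sda" and the "noop"
 * elevator, the usermode-helper fallback above effectively runs:
 *
 *     /bin/sh -c "exec 0</dev/null \
 *         1>/sys/block/sda/queue/scheduler 2>/dev/null; echo noop"
 *
 * i.e. it writes the scheduler name into the device's sysfs queue node.
 */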
215
216 static int
217 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
218 uint64_t *ashift)
219 {
220 struct block_device *bdev;
221 fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
222 int count = 0, block_size;
223 int bdev_retry_count = 50;
224 vdev_disk_t *vd;
225
226 /* Must have a pathname and it must be absolute. */
227 if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
228 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
229 vdev_dbgmsg(v, "invalid vdev_path");
230 return (SET_ERROR(EINVAL));
231 }
232
233 /*
234 * Reopen the device if it is currently open. When expanding a
235 * partition, force re-scanning the partition table while closed
236 * in order to get an accurate updated block device size. Then,
237 * since udev may need to recreate the device links, increase the
238 * open retry count before reporting the device as unavailable.
239 */
240 vd = v->vdev_tsd;
241 if (vd) {
242 char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
243 boolean_t reread_part = B_FALSE;
244
245 rw_enter(&vd->vd_lock, RW_WRITER);
246 bdev = vd->vd_bdev;
247 vd->vd_bdev = NULL;
248
249 if (bdev) {
250 if (v->vdev_expanding && bdev != bdev->bd_contains) {
251 bdevname(bdev->bd_contains, disk_name + 5);
252 reread_part = B_TRUE;
253 }
254
255 vdev_bdev_close(bdev, mode);
256 }
257
258 if (reread_part) {
259 bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
260 if (!IS_ERR(bdev)) {
261 int error = vdev_bdev_reread_part(bdev);
262 vdev_bdev_close(bdev, mode);
263 if (error == 0)
264 bdev_retry_count = 100;
265 }
266 }
267 } else {
268 vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
269
270 rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
271 rw_enter(&vd->vd_lock, RW_WRITER);
272 }
273
274 /*
275 * Devices are always opened by the path provided at configuration
276 * time. This means that if the provided path is a udev by-id path
277 * then drives may be re-cabled without an issue. If the provided
278 * path is a udev by-path path, then the physical location information
279 * will be preserved. This can be critical for more complicated
280 * configurations where drives are located in specific physical
281 * locations to maximize the system's tolerance to component failure.
282 *
283 * Alternatively, you can provide your own udev rule to flexibly map
284 * the drives as you see fit. It is not advised that you use the
285 * /dev/[hd]d devices, which may be reordered due to probing order.
286 * Devices in the wrong locations will be detected by the higher
287 * level vdev validation.
288 *
289 * The specified paths may be briefly removed and recreated in
290 * response to udev events. This should be exceptionally unlikely
291 * because the zpool command makes every effort to verify these paths
292 * have already settled prior to reaching this point. Therefore,
293 * a ENOENT failure at this point is highly likely to be transient
294 * and it is reasonable to sleep and retry before giving up. In
295 * practice delays have been observed to be on the order of 100ms.
296 */
297 bdev = ERR_PTR(-ENXIO);
298 while (IS_ERR(bdev) && count < bdev_retry_count) {
299 bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
300 if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
301 schedule_timeout(MSEC_TO_TICK(10));
302 count++;
303 } else if (IS_ERR(bdev)) {
304 break;
305 }
306 }
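
/*
 * With the default bdev_retry_count of 50 and a ~10 ms pause per ENOENT,
 * the loop above is intended to wait up to roughly half a second for udev
 * to recreate the device node, or about one second when the count was
 * raised to 100 after a partition table re-read.
 */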
307
308 if (IS_ERR(bdev)) {
309 int error = -PTR_ERR(bdev);
310 vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
311 vd->vd_bdev = NULL;
312 v->vdev_tsd = vd;
313 rw_exit(&vd->vd_lock);
314 return (SET_ERROR(error));
315 } else {
316 vd->vd_bdev = bdev;
317 v->vdev_tsd = vd;
318 rw_exit(&vd->vd_lock);
319 }
320
321 /* Determine the physical block size */
322 block_size = vdev_bdev_block_size(vd->vd_bdev);
323
324 /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
325 v->vdev_nowritecache = B_FALSE;
326
327 /* Inform the ZIO pipeline that we are non-rotational */
328 v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
329
330 /* Physical volume size in bytes for the partition */
331 *psize = bdev_capacity(vd->vd_bdev);
332
333 /* Physical volume size in bytes including possible expansion space */
334 *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
335
336 /* Based on the minimum sector size, set the block size */
337 *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
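
/*
 * For example, a 4096-byte block_size gives highbit64(4096) - 1 = 12,
 * i.e. ashift=12 (2^12 = 4096); a 512-byte sector device yields ashift=9.
 */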
338
339 /* Try to set the I/O scheduler elevator algorithm */
340 (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
341
342 return (0);
343 }
344
345 static void
346 vdev_disk_close(vdev_t *v)
347 {
348 vdev_disk_t *vd = v->vdev_tsd;
349
350 if (v->vdev_reopening || vd == NULL)
351 return;
352
353 if (vd->vd_bdev != NULL) {
354 vdev_bdev_close(vd->vd_bdev,
355 vdev_bdev_mode(spa_mode(v->vdev_spa)));
356 }
357
358 rw_destroy(&vd->vd_lock);
359 kmem_free(vd, sizeof (vdev_disk_t));
360 v->vdev_tsd = NULL;
361 }
362
363 static dio_request_t *
364 vdev_disk_dio_alloc(int bio_count)
365 {
366 dio_request_t *dr;
367 int i;
368
369 dr = kmem_zalloc(sizeof (dio_request_t) +
370 sizeof (struct bio *) * bio_count, KM_SLEEP);
371 if (dr) {
372 atomic_set(&dr->dr_ref, 0);
373 dr->dr_bio_count = bio_count;
374 dr->dr_error = 0;
375
376 for (i = 0; i < dr->dr_bio_count; i++)
377 dr->dr_bio[i] = NULL;
378 }
379
380 return (dr);
381 }
382
383 static void
384 vdev_disk_dio_free(dio_request_t *dr)
385 {
386 int i;
387
388 for (i = 0; i < dr->dr_bio_count; i++)
389 if (dr->dr_bio[i])
390 bio_put(dr->dr_bio[i]);
391
392 kmem_free(dr, sizeof (dio_request_t) +
393 sizeof (struct bio *) * dr->dr_bio_count);
394 }
395
396 static void
397 vdev_disk_dio_get(dio_request_t *dr)
398 {
399 atomic_inc(&dr->dr_ref);
400 }
401
402 static int
403 vdev_disk_dio_put(dio_request_t *dr)
404 {
405 int rc = atomic_dec_return(&dr->dr_ref);
406
407 /*
408 * Free the dio_request when the last reference is dropped and
409 * ensure zio_delay_interrupt() is called only once with the correct zio.
410 */
411 if (rc == 0) {
412 zio_t *zio = dr->dr_zio;
413 int error = dr->dr_error;
414
415 vdev_disk_dio_free(dr);
416
417 if (zio) {
418 zio->io_error = error;
419 ASSERT3S(zio->io_error, >=, 0);
420 if (zio->io_error)
421 vdev_disk_error(zio);
422
423 zio_delay_interrupt(zio);
424 }
425 }
426
427 return (rc);
428 }
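
/*
 * Reference counting protocol: each bio attached to a dio takes one
 * reference (vdev_disk_dio_get() in __vdev_disk_physio()), which is
 * dropped by the bio's completion callback via vdev_disk_dio_put().
 * The submitter holds one extra reference across bio submission so the
 * dio_request cannot be freed mid-loop; the final put frees the dio and
 * completes the parent zio exactly once.
 */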
429
430 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
431 {
432 dio_request_t *dr = bio->bi_private;
433 int rc;
434
435 if (dr->dr_error == 0) {
436 #ifdef HAVE_1ARG_BIO_END_IO_T
437 dr->dr_error = BIO_END_IO_ERROR(bio);
438 #else
439 if (error)
440 dr->dr_error = -(error);
441 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
442 dr->dr_error = EIO;
443 #endif
444 }
445
446 /* Drop reference acquired by __vdev_disk_physio */
447 rc = vdev_disk_dio_put(dr);
448 }
449
450 static unsigned int
451 bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
452 {
453 unsigned int offset, size, i;
454 struct page *page;
455
456 offset = offset_in_page(bio_ptr);
457 for (i = 0; i < bio->bi_max_vecs; i++) {
458 size = PAGE_SIZE - offset;
459
460 if (bio_size <= 0)
461 break;
462
463 if (size > bio_size)
464 size = bio_size;
465
466 if (is_vmalloc_addr(bio_ptr))
467 page = vmalloc_to_page(bio_ptr);
468 else
469 page = virt_to_page(bio_ptr);
470
471 /*
472 * Some network-related block devices use tcp_sendpage, which
473 * doesn't behave well when given a 0-count page; this is a
474 * safety net to catch them.
475 */
476 ASSERT3S(page_count(page), >, 0);
477
478 if (bio_add_page(bio, page, size, offset) != size)
479 break;
480
481 bio_ptr += size;
482 bio_size -= size;
483 offset = 0;
484 }
485
486 return (bio_size);
487 }
488
489 static unsigned int
490 bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
491 {
492 if (abd_is_linear(abd))
493 return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
494
495 return (abd_scatter_bio_map_off(bio, abd, size, off));
496 }
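
/*
 * Both mapping helpers return the number of bytes that did NOT fit in
 * the bio. A caller is expected to loop until the residual reaches zero,
 * roughly (sketch only):
 *
 *     remaining = bio_map_abd_off(bio, abd, remaining, offset);
 *     if (remaining != 0)
 *         map the leftover bytes into another bio
 *
 * which is how __vdev_disk_physio() below drives bio_size.
 */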
497
498 static inline void
499 vdev_submit_bio_impl(struct bio *bio)
500 {
501 #ifdef HAVE_1ARG_SUBMIT_BIO
502 submit_bio(bio);
503 #else
504 submit_bio(0, bio);
505 #endif
506 }
507
508 #ifndef HAVE_BIO_SET_DEV
509 static inline void
510 bio_set_dev(struct bio *bio, struct block_device *bdev)
511 {
512 bio->bi_bdev = bdev;
513 }
514 #endif /* !HAVE_BIO_SET_DEV */
515
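/*
 * The save/clear/restore of the task's pending bio list below (bio_tail
 * or bio_list depending on kernel version) is a precaution: without it,
 * generic_make_request() may defer the bio onto the caller's own pending
 * list when ZFS I/O is issued from within the block layer (for example a
 * zvol stacked on another device), delaying submission and risking
 * deadlock.
 */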
516 static inline void
517 vdev_submit_bio(struct bio *bio)
518 {
519 #ifdef HAVE_CURRENT_BIO_TAIL
520 struct bio **bio_tail = current->bio_tail;
521 current->bio_tail = NULL;
522 vdev_submit_bio_impl(bio);
523 current->bio_tail = bio_tail;
524 #else
525 struct bio_list *bio_list = current->bio_list;
526 current->bio_list = NULL;
527 vdev_submit_bio_impl(bio);
528 current->bio_list = bio_list;
529 #endif
530 }
531
532 static int
533 __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
534 size_t io_size, uint64_t io_offset, int rw, int flags)
535 {
536 dio_request_t *dr;
537 uint64_t abd_offset;
538 uint64_t bio_offset;
539 int bio_size, bio_count = 16;
540 int i = 0, error = 0;
541 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
542 struct blk_plug plug;
543 #endif
544 /*
545 * Accessing outside the block device is never allowed.
546 */
547 if (io_offset + io_size > bdev->bd_inode->i_size) {
548 vdev_dbgmsg(zio->io_vd,
549 "Illegal access %llu size %llu, device size %llu",
550 io_offset, io_size, i_size_read(bdev->bd_inode));
551 return (SET_ERROR(EIO));
552 }
553
554 retry:
555 dr = vdev_disk_dio_alloc(bio_count);
556 if (dr == NULL)
557 return (SET_ERROR(ENOMEM));
558
559 if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
560 bio_set_flags_failfast(bdev, &flags);
561
562 dr->dr_zio = zio;
563
564 /*
565 * When the IO size exceeds the maximum bio size for the request
566 * queue, we are forced to break the IO into multiple bios and wait
567 * for them all to complete. Ideally, all pool users will set
568 * their volume block size to match the maximum request size and
569 * the common case will be one bio per vdev IO request.
570 */
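
/*
 * For a sense of scale (assuming 4 KiB pages and the common BIO_MAX_PAGES
 * limit of 256), a single bio can map up to 1 MiB, so the initial
 * bio_count of 16 covers I/Os up to about 16 MiB before the dio-doubling
 * retry below is needed.
 */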
571
572 abd_offset = 0;
573 bio_offset = io_offset;
574 bio_size = io_size;
575 for (i = 0; i <= dr->dr_bio_count; i++) {
576
577 /* Finished constructing bios for the given buffer */
578 if (bio_size <= 0)
579 break;
580
581 /*
582 * By default only 'bio_count' bios per dio are allowed.
583 * However, if we find ourselves in a situation where more
584 * are needed, we allocate a larger dio and try again.
585 */
586 if (dr->dr_bio_count == i) {
587 vdev_disk_dio_free(dr);
588 bio_count *= 2;
589 goto retry;
590 }
591
592 /* bio_alloc() with __GFP_WAIT never returns NULL */
593 dr->dr_bio[i] = bio_alloc(GFP_NOIO,
594 MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
595 BIO_MAX_PAGES));
596 if (unlikely(dr->dr_bio[i] == NULL)) {
597 vdev_disk_dio_free(dr);
598 return (SET_ERROR(ENOMEM));
599 }
600
601 /* Matching put called by vdev_disk_physio_completion */
602 vdev_disk_dio_get(dr);
603
604 bio_set_dev(dr->dr_bio[i], bdev);
605 BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
606 dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
607 dr->dr_bio[i]->bi_private = dr;
608 bio_set_op_attrs(dr->dr_bio[i], rw, flags);
609
610 /* Remaining size is returned to become the new size */
611 bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
612 bio_size, abd_offset);
613
614 /* Advance in buffer and construct another bio if needed */
615 abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
616 bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
617 }
618
619 /* Extra reference to protect dio_request during vdev_submit_bio */
620 vdev_disk_dio_get(dr);
621
622 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
623 if (dr->dr_bio_count > 1)
624 blk_start_plug(&plug);
625 #endif
626
627 /* Submit all bios associated with this dio */
628 for (i = 0; i < dr->dr_bio_count; i++)
629 if (dr->dr_bio[i])
630 vdev_submit_bio(dr->dr_bio[i]);
631
632 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
633 if (dr->dr_bio_count > 1)
634 blk_finish_plug(&plug);
635 #endif
636
637 (void) vdev_disk_dio_put(dr);
638
639 return (error);
640 }
641
642 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
643 {
644 zio_t *zio = bio->bi_private;
645 #ifdef HAVE_1ARG_BIO_END_IO_T
646 zio->io_error = BIO_END_IO_ERROR(bio);
647 #else
648 zio->io_error = -error;
649 #endif
650
651 if (zio->io_error && (zio->io_error == EOPNOTSUPP))
652 zio->io_vd->vdev_nowritecache = B_TRUE;
653
654 bio_put(bio);
655 ASSERT3S(zio->io_error, >=, 0);
656 if (zio->io_error)
657 vdev_disk_error(zio);
658 zio_interrupt(zio);
659 }
660
661 static int
662 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
663 {
664 struct request_queue *q;
665 struct bio *bio;
666
667 q = bdev_get_queue(bdev);
668 if (!q)
669 return (SET_ERROR(ENXIO));
670
671 bio = bio_alloc(GFP_NOIO, 0);
672 /* bio_alloc() with __GFP_WAIT never returns NULL */
673 if (unlikely(bio == NULL))
674 return (SET_ERROR(ENOMEM));
675
676 bio->bi_end_io = vdev_disk_io_flush_completion;
677 bio->bi_private = zio;
678 bio_set_dev(bio, bdev);
679 bio_set_flush(bio);
680 vdev_submit_bio(bio);
681 invalidate_bdev(bdev);
682
683 return (0);
684 }
685
686 static void
687 vdev_disk_io_start(zio_t *zio)
688 {
689 vdev_t *v = zio->io_vd;
690 vdev_disk_t *vd = v->vdev_tsd;
691 int rw, flags, error;
692
693 /*
694 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
695 * Nothing to be done here but return failure.
696 */
697 if (vd == NULL) {
698 zio->io_error = ENXIO;
699 zio_interrupt(zio);
700 return;
701 }
702
703 rw_enter(&vd->vd_lock, RW_READER);
704
705 /*
706 * If the vdev is closed, it's likely due to a failed reopen and is
707 * in the UNAVAIL state. Nothing to be done here but return failure.
708 */
709 if (vd->vd_bdev == NULL) {
710 rw_exit(&vd->vd_lock);
711 zio->io_error = ENXIO;
712 zio_interrupt(zio);
713 return;
714 }
715
716 switch (zio->io_type) {
717 case ZIO_TYPE_IOCTL:
718
719 if (!vdev_readable(v)) {
720 rw_exit(&vd->vd_lock);
721 zio->io_error = SET_ERROR(ENXIO);
722 zio_interrupt(zio);
723 return;
724 }
725
726 switch (zio->io_cmd) {
727 case DKIOCFLUSHWRITECACHE:
728
729 if (zfs_nocacheflush)
730 break;
731
732 if (v->vdev_nowritecache) {
733 zio->io_error = SET_ERROR(ENOTSUP);
734 break;
735 }
736
737 error = vdev_disk_io_flush(vd->vd_bdev, zio);
738 if (error == 0) {
739 rw_exit(&vd->vd_lock);
740 return;
741 }
742
743 zio->io_error = error;
744
745 break;
746
747 default:
748 zio->io_error = SET_ERROR(ENOTSUP);
749 }
750
751 rw_exit(&vd->vd_lock);
752 zio_execute(zio);
753 return;
754 case ZIO_TYPE_WRITE:
755 rw = WRITE;
756 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
757 flags = (1 << BIO_RW_UNPLUG);
758 #elif defined(REQ_UNPLUG)
759 flags = REQ_UNPLUG;
760 #else
761 flags = 0;
762 #endif
763 break;
764
765 case ZIO_TYPE_READ:
766 rw = READ;
767 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
768 flags = (1 << BIO_RW_UNPLUG);
769 #elif defined(REQ_UNPLUG)
770 flags = REQ_UNPLUG;
771 #else
772 flags = 0;
773 #endif
774 break;
775
776 default:
777 rw_exit(&vd->vd_lock);
778 zio->io_error = SET_ERROR(ENOTSUP);
779 zio_interrupt(zio);
780 return;
781 }
782
783 zio->io_target_timestamp = zio_handle_io_delay(zio);
784 error = __vdev_disk_physio(vd->vd_bdev, zio,
785 zio->io_size, zio->io_offset, rw, flags);
786 rw_exit(&vd->vd_lock);
787
788 if (error) {
789 zio->io_error = error;
790 zio_interrupt(zio);
791 return;
792 }
793 }
794
795 static void
796 vdev_disk_io_done(zio_t *zio)
797 {
798 /*
799 * If the device returned EIO, we revalidate the media. If it is
800 * determined the media has changed, this triggers the asynchronous
801 * removal of the device from the configuration.
802 */
803 if (zio->io_error == EIO) {
804 vdev_t *v = zio->io_vd;
805 vdev_disk_t *vd = v->vdev_tsd;
806
807 if (check_disk_change(vd->vd_bdev)) {
808 vdev_bdev_invalidate(vd->vd_bdev);
809 v->vdev_remove_wanted = B_TRUE;
810 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
811 }
812 }
813 }
814
815 static void
816 vdev_disk_hold(vdev_t *vd)
817 {
818 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
819
820 /* We must have a pathname, and it must be absolute. */
821 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
822 return;
823
824 /*
825 * Only prefetch path and devid info if the device has
826 * never been opened.
827 */
828 if (vd->vdev_tsd != NULL)
829 return;
830
831 /* XXX: Implement me as a vnode lookup for the device */
832 vd->vdev_name_vp = NULL;
833 vd->vdev_devid_vp = NULL;
834 }
835
836 static void
837 vdev_disk_rele(vdev_t *vd)
838 {
839 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
840
841 /* XXX: Implement me as a vnode rele for the device */
842 }
843
844 static int
845 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
846 {
847 spa_t *spa = NULL;
848 char *p;
849
850 if (val == NULL)
851 return (SET_ERROR(-EINVAL));
852
853 if ((p = strchr(val, '\n')) != NULL)
854 *p = '\0';
855
856 if (spa_mode_global != 0) {
857 mutex_enter(&spa_namespace_lock);
858 while ((spa = spa_next(spa)) != NULL) {
859 if (spa_state(spa) != POOL_STATE_ACTIVE ||
860 !spa_writeable(spa) || spa_suspended(spa))
861 continue;
862
863 spa_open_ref(spa, FTAG);
864 mutex_exit(&spa_namespace_lock);
865 vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
866 mutex_enter(&spa_namespace_lock);
867 spa_close(spa, FTAG);
868 }
869 mutex_exit(&spa_namespace_lock);
870 }
871
872 return (param_set_charp(val, kp));
873 }
874
875 vdev_ops_t vdev_disk_ops = {
876 vdev_disk_open,
877 vdev_disk_close,
878 vdev_default_asize,
879 vdev_disk_io_start,
880 vdev_disk_io_done,
881 NULL,
882 NULL,
883 vdev_disk_hold,
884 vdev_disk_rele,
885 NULL,
886 VDEV_TYPE_DISK, /* name of this vdev type */
887 B_TRUE /* leaf vdev */
888 };
889
890 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
891 param_get_charp, &zfs_vdev_scheduler, 0644);
892 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
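
The scheduler can also be changed on a live system through this module
parameter. As a rough, hypothetical illustration (not part of the module;
it assumes the standard sysfs layout for a module_param named
zfs_vdev_scheduler exposed by the loaded zfs module), a userspace program
could write the new elevator name like this:

#include <stdio.h>

int
main(void)
{
	/* Standard sysfs path for a module parameter of the zfs module. */
	const char *param = "/sys/module/zfs/parameters/zfs_vdev_scheduler";
	FILE *fp = fopen(param, "w");

	if (fp == NULL) {
		perror("fopen");
		return (1);
	}
	/* param_set_vdev_scheduler() strips the trailing newline. */
	fprintf(fp, "noop\n");
	(void) fclose(fp);
	return (0);
}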