module/zfs/vdev_disk.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/mod_compat.h>
#include <linux/msdos_fs.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)
/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bios */
	struct bio	*dr_bio[0];	/* Attached bios */
} dio_request_t;


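/*
 * Map the FREAD/FWRITE mode requested for a vdev onto the form expected
 * by the kernel's block device open interface: FMODE_* flags when
 * open_bdev_exclusive() is available, otherwise the legacy MS_RDONLY
 * style flags.
 */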
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = MS_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and for updating the partition table if appropriate.  Once the
 * partition size has been increased, the additional capacity will be
 * visible using bdev_capacity().
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting the maximum expansion capacity for a
		 * wholedisk, deduct any capacity which is expected to be
		 * lost due to alignment restrictions.  Over-reporting this
		 * value is harmless; it only means the actual capacity
		 * gained post expansion may be slightly less than reported.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		if (available > 0)
			psize = available;
		else
			psize = bdev_capacity(bdev);
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	zfs_dbgmsg(KERN_WARNING "zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

/*
 * Use the Linux 'noop' elevator for zfs-managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * attempt to switch its scheduler below will simply fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

	/*
	 * The elevator_change() function was available in kernels from
	 * 2.6.36 to 4.11.  When not available, fall back to using the user
	 * mode helper functionality to set the elevator via sysfs.  This
	 * requires /bin/echo and sysfs to be mounted, which may not be true
	 * early in the boot process.
	 */
#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	" 1>/sys/block/%s/queue/scheduler " \
	" 2>/dev/null; " \
	"echo %s"

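	/*
	 * For illustration, with a hypothetical device name "sda" and an
	 * elevator string of "noop", the user mode helper below effectively
	 * runs:
	 *   /bin/sh -c "exec 0</dev/null 1>/sys/block/sda/queue/scheduler \
	 *       2>/dev/null; echo noop"
	 */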
	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error) {
		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);
	}
}

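/*
 * Open the block device backing this vdev.  A reopen during expansion
 * forces a partition table re-scan, and ENOENT errors are retried briefly
 * so that udev has time to recreate the device links.  On success the
 * current size, maximum expansion size, and ashift are reported back to
 * the caller.
 */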
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition, force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size.  Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice, delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit so vdev_reopen() will try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size, set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

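/*
 * A dio_request tracks the set of bios issued for a single zio.  It is
 * reference counted: one reference is taken per attached bio plus one
 * held across submission, and the zio completion callback fires when the
 * final reference is dropped in vdev_disk_dio_put().
 */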
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_delay_interrupt() is called only once with the
	 * correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

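/*
 * Per-bio completion callback.  The first error observed is recorded in
 * the dio_request, then the reference taken for this bio in
 * __vdev_disk_physio is dropped, which completes the parent zio once all
 * of its bios have finished.
 */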
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

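/*
 * Add up to bio_size bytes of the (possibly vmalloc'd) buffer at bio_ptr
 * to the bio, one page at a time.  Returns the number of residual bytes
 * which did not fit and must be placed in an additional bio.
 */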
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

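/*
 * Map an abd buffer into the bio starting at the given offset, using the
 * linear fast path when possible and the abd scatter helper otherwise.
 */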
static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));

	return (abd_scatter_bio_map_off(bio, abd, size, off));
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

#ifndef HAVE_BIO_SET_DEV
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* !HAVE_BIO_SET_DEV */

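/*
 * Submit a bio, temporarily clearing the task's bio_list (bio_tail on
 * older kernels) around the call so generic_make_request() processes the
 * bio immediately rather than deferring it onto an in-progress list for
 * the current task; the saved list is restored afterwards.
 */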
static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}

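/*
 * Issue the I/O described by the zio against the block device.  The zio's
 * abd buffer is packed into one or more bios which are tracked by a
 * dio_request and submitted together; the request is retried with a larger
 * bio array if the initial estimate proves too small.
 */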
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif
	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue, we are forced to split the IO into multiple bios and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bios for the given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bios per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed, we allocate a larger dio and try again.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bios associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

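/*
 * Ask the block device to flush its volatile write cache by submitting an
 * empty flush bio.  Completion is handled asynchronously above; a device
 * that reports EOPNOTSUPP is marked nowritecache so further flushes are
 * not attempted against it.
 */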
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

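/*
 * Entry point for issuing I/O on behalf of a zio.  Ioctl-style requests
 * (currently only the write cache flush) are handled here directly, while
 * reads and writes are translated into bios by __vdev_disk_physio().
 */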
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

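/*
 * Module parameter setter for zfs_vdev_scheduler.  The new elevator is
 * applied immediately to every active, writable, non-suspended pool and
 * then stored as the value used for subsequently opened vdevs.
 */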
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	NULL,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
	param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
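/*
 * Because the parameter is registered with mode 0644, the scheduler can
 * also be changed at runtime through sysfs, for example (illustrative
 * value only):
 *   echo noop > /sys/module/zfs/parameters/zfs_vdev_scheduler
 */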