/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/mod_compat.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;
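
/*
 * Note added for clarity (not in the original source): dr_bio[0] is a
 * zero-length (flexible) array; the actual number of bio pointers is fixed
 * at allocation time by vdev_disk_dio_alloc(), which over-allocates the
 * structure by dr_bio_count pointers.
 */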


#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = SB_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk()
 * function is responsible for verifying the expected partition layout in
 * the wholedisk case, and updating the partition table if appropriate.
 * Once the partition size has been increased the additional capacity will
 * be visible using bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout keeps less than PARTITION_END_ALIGNMENT bytes after
		 * the "reserved" EFI partition: in such cases return the
		 * device usable capacity.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}
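
/*
 * Illustrative arithmetic only (not part of the original source): assuming
 * the common values NEW_START_BLOCK == 2048 and PARTITION_END_ALIGNMENT ==
 * 2048 with 512-byte sectors (SECTOR_BITS == 9), the deduction above works
 * out to (16384 + 2048 + 2048) << 9 = 10 MiB of raw capacity set aside for
 * start alignment, end alignment, and the 8 MiB "reserved" partition.
 */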

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices. This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device. This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator. If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

	/*
	 * The elevator_change() function was available in kernels from
	 * 2.6.36 to 4.11. When not available fall back to using the user
	 * mode helper functionality to set the elevator via sysfs. This
	 * requires /bin/echo and sysfs to be mounted which may not be true
	 * early in the boot process.
	 */
#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	" 1>/sys/block/%s/queue/scheduler " \
	" 2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error) {
		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
		    elevator, v->vdev_path, device, error);
	}
}
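
/*
 * Usage illustration (not part of the original source): the usermode helper
 * fallback above is equivalent to running, for example,
 *
 *	echo noop >/sys/block/sdb/queue/scheduler
 *
 * where "sdb" and "noop" stand in for the actual disk name and the
 * configured zfs_vdev_scheduler value.
 */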

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size. Then
	 * since udev may need to recreate the device links increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d", error, count);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct request_queue *q = bdev_get_queue(vd->vd_bdev);

	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = !!blk_queue_discard(q);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(q);

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
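	/*
	 * Worked example (illustrative only, not in the original source): a
	 * device reporting a 4096-byte physical block size yields
	 * highbit64(4096) - 1 = 12, i.e. ashift=12; anything at or below
	 * SPA_MINBLOCKSIZE (512) is rounded up to ashift=9.
	 */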

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

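/*
 * Descriptive note added for clarity (not in the original source):
 * __vdev_disk_physio() takes one reference on the dio_request per submitted
 * bio, plus one extra reference held across bio submission. Each completion
 * callback and the submitter drop their reference via vdev_disk_dio_put();
 * the final put frees the request and completes the parent zio.
 */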
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_delay_interrupt() is called only once with the
	 * correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

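/*
 * Descriptive note added for clarity (not in the original source): bio_map()
 * fills the bio's vector with the pages backing a kernel virtual buffer,
 * handling both kmalloc'd and vmalloc'd addresses, and returns the number of
 * bytes that did not fit in the bio; a non-zero return means the caller must
 * construct an additional bio for the remainder.
 */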
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));

	return (abd_scatter_bio_map_off(bio, abd, size, off));
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol, thus inadvertently converting
 * the entire macro to GPL-only. Provide a minimal version which always
 * assigns the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */

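/*
 * Descriptive note added for clarity (not in the original source): when
 * current->bio_list (or the older current->bio_tail) is non-NULL the kernel's
 * generic_make_request() queues newly submitted bios on that list instead of
 * dispatching them immediately. Temporarily clearing it here, presumably,
 * ensures the bio is issued right away rather than deferred behind whichever
 * bio is currently being processed.
 */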
static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif
	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete. Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	unsigned long trim_flags = 0;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_TRIM:
#if defined(BLKDEV_DISCARD_SECURE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
		    trim_flags);

		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}
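
/*
 * Usage illustration (not part of the original source): the setter above runs
 * when the parameter is changed at runtime, e.g.
 *
 *	echo noop >/sys/module/zfs/parameters/zfs_vdev_scheduler
 *
 * which applies the new elevator to every leaf vdev of all active, writeable,
 * non-suspended pools before storing the new value.
 */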

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	NULL,
	vdev_default_xlate,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");