git.proxmox.com Git - [mirror_zfs.git] / module / zfs / vdev_disk.c (blob at commit c2312e6fa3bf8ec96bea7554b43d6d02aacad7dd)
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
25 * LLNL-CODE-403049.
26 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/vdev_trim.h>
34 #include <sys/abd.h>
35 #include <sys/fs/zfs.h>
36 #include <sys/zio.h>
37 #include <linux/mod_compat.h>
38 #include <linux/msdos_fs.h>
39 #include <linux/vfs_compat.h>
40
41 char *zfs_vdev_scheduler = VDEV_SCHEDULER;
42 static void *zfs_vdev_holder = VDEV_HOLDER;
43
44 /* size of the "reserved" partition, in blocks */
45 #define EFI_MIN_RESV_SIZE (16 * 1024)
46
47 /*
48 * Virtual device vector for disks.
49 */
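/*
 * A dio_request tracks all of the bio's issued on behalf of a single
 * zio.  dr_ref counts the outstanding bio's (plus a reference held
 * during submission) and the parent zio is completed once the count
 * drops to zero.
 */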
50 typedef struct dio_request {
51 zio_t *dr_zio; /* Parent ZIO */
52 atomic_t dr_ref; /* References */
53 int dr_error; /* Bio error */
54 int dr_bio_count; /* Count of bio's */
55 struct bio *dr_bio[0]; /* Attached bio's */
56 } dio_request_t;
57
58
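/*
 * Translate the ZFS open mode (FREAD/FWRITE) into the open mode
 * expected by the block device open interface available on this kernel.
 */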
59 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
60 static fmode_t
61 vdev_bdev_mode(int smode)
62 {
63 fmode_t mode = 0;
64
65 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
66
67 if (smode & FREAD)
68 mode |= FMODE_READ;
69
70 if (smode & FWRITE)
71 mode |= FMODE_WRITE;
72
73 return (mode);
74 }
75 #else
76 static int
77 vdev_bdev_mode(int smode)
78 {
79 int mode = 0;
80
81 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
82
83 if ((smode & FREAD) && !(smode & FWRITE))
84 mode = SB_RDONLY;
85
86 return (mode);
87 }
88 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
89
90 /*
91 * Returns the usable capacity (in bytes) for the partition or disk.
92 */
93 static uint64_t
94 bdev_capacity(struct block_device *bdev)
95 {
96 return (i_size_read(bdev->bd_inode));
97 }
98
99 /*
100 * Returns the maximum expansion capacity of the block device (in bytes).
101 *
102 * It is possible to expand a vdev when it has been created as a wholedisk
103 * and the containing block device has increased in capacity. Or when the
104 * partition containing the pool has been manually increased in size.
105 *
106 * This function is only responsible for calculating the potential expansion
107 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
108 * responsible for verifying the expected partition layout in the wholedisk
109 * case, and updating the partition table if appropriate. Once the partition
110 * size has been increased the additional capacity will be visible using
111 * bdev_capacity().
112 *
113 * The returned maximum expansion capacity is always expected to be larger
114 * than, or at the very least equal to, the usable capacity to prevent
115 * overestimating the pool expandsize.
116 */
117 static uint64_t
118 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
119 {
120 uint64_t psize;
121 int64_t available;
122
123 if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
124 /*
125 * When reporting maximum expansion capacity for a wholedisk
126 * deduct any capacity which is expected to be lost due to
127 * alignment restrictions. Over-reporting this value isn't
128 * harmful and would only result in slightly less capacity
129 * than expected post-expansion.
130 * The estimated available space may be slightly smaller than
131 * bdev_capacity() for devices where the number of sectors is
132 * not a multiple of the alignment size and the partition layout
133 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
134 * "reserved" EFI partition: in such cases return the device
135 * usable capacity.
136 */
137 available = i_size_read(bdev->bd_contains->bd_inode) -
138 ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
139 PARTITION_END_ALIGNMENT) << SECTOR_BITS);
140 psize = MAX(available, bdev_capacity(bdev));
141 } else {
142 psize = bdev_capacity(bdev);
143 }
144
145 return (psize);
146 }
147
148 static void
149 vdev_disk_error(zio_t *zio)
150 {
151 /*
152 * This function can be called in interrupt context, for instance while
153 * handling IRQs coming from a misbehaving disk device; use printk()
154 * which is safe from any context.
155 */
156 printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
157 "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
158 zio->io_vd->vdev_path, zio->io_error, zio->io_type,
159 (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
160 zio->io_flags);
161 }
162
163 /*
164 * Use the Linux 'noop' elevator for zfs managed block devices. This
165 * strikes the ideal balance by allowing the zfs elevator to do all
166 * request ordering and prioritization, while allowing the Linux
167 * elevator to do the maximum front/back merging allowed by the
168 * physical device. This yields the largest possible requests for
169 * the device with the lowest total overhead.
170 */
171 static void
172 vdev_elevator_switch(vdev_t *v, char *elevator)
173 {
174 vdev_disk_t *vd = v->vdev_tsd;
175 struct request_queue *q;
176 char *device;
177 int error;
178
179 for (int c = 0; c < v->vdev_children; c++)
180 vdev_elevator_switch(v->vdev_child[c], elevator);
181
182 if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
183 return;
184
185 q = bdev_get_queue(vd->vd_bdev);
186 device = vd->vd_bdev->bd_disk->disk_name;
187
188 /*
189 * Skip devices which are not whole disks (partitions).
190 * Device-mapper devices are excepted since they may be whole
191 * disks despite the vdev_wholedisk flag, in which case we can
192 * and should switch the elevator. If the device-mapper device
193 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
194 * "Skip devices without schedulers" check below will fail.
195 */
196 if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
197 return;
198
199 /* Leave existing scheduler when set to "none" */
200 if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
201 return;
202
203 /*
204 * The elevator_change() function was available in kernels from
205 * 2.6.36 to 4.11. When not available, fall back to using the user
206 * mode helper functionality to set the elevator via sysfs. This
207 * requires /bin/echo and sysfs to be mounted which may not be true
208 * early in the boot process.
209 */
210 #ifdef HAVE_ELEVATOR_CHANGE
211 error = elevator_change(q, elevator);
212 #else
213 #define SET_SCHEDULER_CMD \
214 "exec 0</dev/null " \
215 " 1>/sys/block/%s/queue/scheduler " \
216 " 2>/dev/null; " \
217 "echo %s"
218
219 char *argv[] = { "/bin/sh", "-c", NULL, NULL };
220 char *envp[] = { NULL };
221
222 argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
223 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
224 strfree(argv[2]);
225 #endif /* HAVE_ELEVATOR_CHANGE */
226 if (error) {
227 zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
228 elevator, v->vdev_path, device, error);
229 }
230 }
231
232 static int
233 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
234 uint64_t *ashift)
235 {
236 struct block_device *bdev;
237 fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
238 int count = 0, block_size;
239 int bdev_retry_count = 50;
240 vdev_disk_t *vd;
241
242 /* Must have a pathname and it must be absolute. */
243 if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
244 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
245 vdev_dbgmsg(v, "invalid vdev_path");
246 return (SET_ERROR(EINVAL));
247 }
248
249 /*
250 * Reopen the device if it is currently open. When expanding a
251 * partition force re-scanning the partition table while closed
252 * in order to get an accurate updated block device size. Then,
253 * since udev may need to recreate the device links, increase the
254 * open retry count before reporting the device as unavailable.
255 */
256 vd = v->vdev_tsd;
257 if (vd) {
258 char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
259 boolean_t reread_part = B_FALSE;
260
261 rw_enter(&vd->vd_lock, RW_WRITER);
262 bdev = vd->vd_bdev;
263 vd->vd_bdev = NULL;
264
265 if (bdev) {
266 if (v->vdev_expanding && bdev != bdev->bd_contains) {
267 bdevname(bdev->bd_contains, disk_name + 5);
268 reread_part = B_TRUE;
269 }
270
271 vdev_bdev_close(bdev, mode);
272 }
273
274 if (reread_part) {
275 bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
276 if (!IS_ERR(bdev)) {
277 int error = vdev_bdev_reread_part(bdev);
278 vdev_bdev_close(bdev, mode);
279 if (error == 0)
280 bdev_retry_count = 100;
281 }
282 }
283 } else {
284 vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
285
286 rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
287 rw_enter(&vd->vd_lock, RW_WRITER);
288 }
289
290 /*
291 * Devices are always opened by the path provided at configuration
292 * time. This means that if the provided path is a udev by-id path
293 * then drives may be re-cabled without an issue. If the provided
294 * path is a udev by-path path, then the physical location information
295 * will be preserved. This can be critical for more complicated
296 * configurations where drives are located in specific physical
297 * locations to maximize the system's tolerance to component failure.
298 *
299 * Alternatively, you can provide your own udev rule to flexibly map
300 * the drives as you see fit. It is not advised that you use the
301 * /dev/[hd]d devices which may be reordered due to probing order.
302 * Devices in the wrong locations will be detected by the higher
303 * level vdev validation.
304 *
305 * The specified paths may be briefly removed and recreated in
306 * response to udev events. This should be exceptionally unlikely
307 * because the zpool command makes every effort to verify these paths
308 * have already settled prior to reaching this point. Therefore,
309 * a ENOENT failure at this point is highly likely to be transient
310 * and it is reasonable to sleep and retry before giving up. In
311 * practice delays have been observed to be on the order of 100ms.
312 */
313 bdev = ERR_PTR(-ENXIO);
314 while (IS_ERR(bdev) && count < bdev_retry_count) {
315 bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
316 if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
317 schedule_timeout(MSEC_TO_TICK(10));
318 count++;
319 } else if (IS_ERR(bdev)) {
320 break;
321 }
322 }
323
324 if (IS_ERR(bdev)) {
325 int error = -PTR_ERR(bdev);
326 vdev_dbgmsg(v, "open error=%d count=%d", error, count);
327 vd->vd_bdev = NULL;
328 v->vdev_tsd = vd;
329 rw_exit(&vd->vd_lock);
330 return (SET_ERROR(error));
331 } else {
332 vd->vd_bdev = bdev;
333 v->vdev_tsd = vd;
334 rw_exit(&vd->vd_lock);
335 }
336
337 struct request_queue *q = bdev_get_queue(vd->vd_bdev);
338
339 /* Determine the physical block size */
340 block_size = vdev_bdev_block_size(vd->vd_bdev);
341
342 /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
343 v->vdev_nowritecache = B_FALSE;
344
345 /* Set when device reports it supports TRIM. */
346 v->vdev_has_trim = !!blk_queue_discard(q);
347
348 /* Set when device reports it supports secure TRIM. */
349 v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
350
351 /* Inform the ZIO pipeline that we are non-rotational */
352 v->vdev_nonrot = blk_queue_nonrot(q);
353
354 /* Physical volume size in bytes for the partition */
355 *psize = bdev_capacity(vd->vd_bdev);
356
357 /* Physical volume size in bytes including possible expansion space */
358 *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
359
360 /* Based on the minimum sector size set the block size */
361 *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
362
363 /* Try to set the io scheduler elevator algorithm */
364 (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
365
366 return (0);
367 }
368
369 static void
370 vdev_disk_close(vdev_t *v)
371 {
372 vdev_disk_t *vd = v->vdev_tsd;
373
374 if (v->vdev_reopening || vd == NULL)
375 return;
376
377 if (vd->vd_bdev != NULL) {
378 vdev_bdev_close(vd->vd_bdev,
379 vdev_bdev_mode(spa_mode(v->vdev_spa)));
380 }
381
382 rw_destroy(&vd->vd_lock);
383 kmem_free(vd, sizeof (vdev_disk_t));
384 v->vdev_tsd = NULL;
385 }
386
387 static dio_request_t *
388 vdev_disk_dio_alloc(int bio_count)
389 {
390 dio_request_t *dr;
391 int i;
392
393 dr = kmem_zalloc(sizeof (dio_request_t) +
394 sizeof (struct bio *) * bio_count, KM_SLEEP);
395 if (dr) {
396 atomic_set(&dr->dr_ref, 0);
397 dr->dr_bio_count = bio_count;
398 dr->dr_error = 0;
399
400 for (i = 0; i < dr->dr_bio_count; i++)
401 dr->dr_bio[i] = NULL;
402 }
403
404 return (dr);
405 }
406
407 static void
408 vdev_disk_dio_free(dio_request_t *dr)
409 {
410 int i;
411
412 for (i = 0; i < dr->dr_bio_count; i++)
413 if (dr->dr_bio[i])
414 bio_put(dr->dr_bio[i]);
415
416 kmem_free(dr, sizeof (dio_request_t) +
417 sizeof (struct bio *) * dr->dr_bio_count);
418 }
419
420 static void
421 vdev_disk_dio_get(dio_request_t *dr)
422 {
423 atomic_inc(&dr->dr_ref);
424 }
425
426 static int
427 vdev_disk_dio_put(dio_request_t *dr)
428 {
429 int rc = atomic_dec_return(&dr->dr_ref);
430
431 /*
432 * Free the dio_request when the last reference is dropped and
433 * ensure zio_delay_interrupt() is called only once with the correct zio
434 */
435 if (rc == 0) {
436 zio_t *zio = dr->dr_zio;
437 int error = dr->dr_error;
438
439 vdev_disk_dio_free(dr);
440
441 if (zio) {
442 zio->io_error = error;
443 ASSERT3S(zio->io_error, >=, 0);
444 if (zio->io_error)
445 vdev_disk_error(zio);
446
447 zio_delay_interrupt(zio);
448 }
449 }
450
451 return (rc);
452 }
453
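/*
 * Per-bio completion callback.  Record the first error observed in the
 * dio_request and drop the reference taken when the bio was created;
 * the parent zio is completed once the final reference is released.
 */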
454 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
455 {
456 dio_request_t *dr = bio->bi_private;
457 int rc;
458
459 if (dr->dr_error == 0) {
460 #ifdef HAVE_1ARG_BIO_END_IO_T
461 dr->dr_error = BIO_END_IO_ERROR(bio);
462 #else
463 if (error)
464 dr->dr_error = -(error);
465 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
466 dr->dr_error = EIO;
467 #endif
468 }
469
470 /* Drop reference acquired by __vdev_disk_physio */
471 rc = vdev_disk_dio_put(dr);
472 }
473
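/*
 * Add as many pages backing the (possibly vmalloc'd) buffer as will
 * fit to the bio.  Returns the number of bytes which could not be
 * mapped; zero means the entire range was added.
 */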
474 static unsigned int
475 bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
476 {
477 unsigned int offset, size, i;
478 struct page *page;
479
480 offset = offset_in_page(bio_ptr);
481 for (i = 0; i < bio->bi_max_vecs; i++) {
482 size = PAGE_SIZE - offset;
483
484 if (bio_size <= 0)
485 break;
486
487 if (size > bio_size)
488 size = bio_size;
489
490 if (is_vmalloc_addr(bio_ptr))
491 page = vmalloc_to_page(bio_ptr);
492 else
493 page = virt_to_page(bio_ptr);
494
495 /*
496 * Some network-related block devices use tcp_sendpage, which
497 * doesn't behave well when given a 0-count page; this is a
498 * safety net to catch them.
499 */
500 ASSERT3S(page_count(page), >, 0);
501
502 if (bio_add_page(bio, page, size, offset) != size)
503 break;
504
505 bio_ptr += size;
506 bio_size -= size;
507 offset = 0;
508 }
509
510 return (bio_size);
511 }
512
513 static unsigned int
514 bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
515 {
516 if (abd_is_linear(abd))
517 return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
518
519 return (abd_scatter_bio_map_off(bio, abd, size, off));
520 }
521
522 static inline void
523 vdev_submit_bio_impl(struct bio *bio)
524 {
525 #ifdef HAVE_1ARG_SUBMIT_BIO
526 submit_bio(bio);
527 #else
528 submit_bio(0, bio);
529 #endif
530 }
531
532 #ifdef HAVE_BIO_SET_DEV
533 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
534 /*
535 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
536 * GPL-only bio_associate_blkg() symbol, inadvertently making the entire
537 * macro GPL-only. Provide a minimal version which always assigns the
538 * request queue's root_blkg to the bio.
539 */
540 static inline void
541 vdev_bio_associate_blkg(struct bio *bio)
542 {
543 struct request_queue *q = bio->bi_disk->queue;
544
545 ASSERT3P(q, !=, NULL);
546 ASSERT3P(q->root_blkg, !=, NULL);
547 ASSERT3P(bio->bi_blkg, ==, NULL);
548
549 if (blkg_tryget(q->root_blkg))
550 bio->bi_blkg = q->root_blkg;
551 }
552 #define bio_associate_blkg vdev_bio_associate_blkg
553 #endif
554 #else
555 /*
556 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
557 */
558 static inline void
559 bio_set_dev(struct bio *bio, struct block_device *bdev)
560 {
561 bio->bi_bdev = bdev;
562 }
563 #endif /* HAVE_BIO_SET_DEV */
564
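/*
 * Submit a bio with current->bio_list (or bio_tail on older kernels)
 * temporarily cleared, presumably so the bio is dispatched immediately
 * rather than queued behind the caller's in-flight bio's.
 */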
565 static inline void
566 vdev_submit_bio(struct bio *bio)
567 {
568 #ifdef HAVE_CURRENT_BIO_TAIL
569 struct bio **bio_tail = current->bio_tail;
570 current->bio_tail = NULL;
571 vdev_submit_bio_impl(bio);
572 current->bio_tail = bio_tail;
573 #else
574 struct bio_list *bio_list = current->bio_list;
575 current->bio_list = NULL;
576 vdev_submit_bio_impl(bio);
577 current->bio_list = bio_list;
578 #endif
579 }
580
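/*
 * Issue the I/O described by the zio to the block device.  The zio's
 * ABD is mapped into one or more bio's, each limited to BIO_MAX_PAGES,
 * which are submitted together (under a blk_plug when supported and
 * more than one bio is needed).  The zio is completed from
 * vdev_disk_physio_completion() once every attached bio has finished.
 */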
581 static int
582 __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
583 size_t io_size, uint64_t io_offset, int rw, int flags)
584 {
585 dio_request_t *dr;
586 uint64_t abd_offset;
587 uint64_t bio_offset;
588 int bio_size, bio_count = 16;
589 int i = 0, error = 0;
590 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
591 struct blk_plug plug;
592 #endif
593 /*
594 * Accessing outside the block device is never allowed.
595 */
596 if (io_offset + io_size > bdev->bd_inode->i_size) {
597 vdev_dbgmsg(zio->io_vd,
598 "Illegal access %llu size %llu, device size %llu",
599 io_offset, io_size, i_size_read(bdev->bd_inode));
600 return (SET_ERROR(EIO));
601 }
602
603 retry:
604 dr = vdev_disk_dio_alloc(bio_count);
605 if (dr == NULL)
606 return (SET_ERROR(ENOMEM));
607
608 if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
609 bio_set_flags_failfast(bdev, &flags);
610
611 dr->dr_zio = zio;
612
613 /*
614 * When the IO size exceeds the maximum bio size for the request
615 * queue we are forced to break the IO into multiple bio's and wait
616 * for them all to complete. Ideally, all pool users will set
617 * their volume block size to match the maximum request size and
618 * the common case will be one bio per vdev IO request.
619 */
620
621 abd_offset = 0;
622 bio_offset = io_offset;
623 bio_size = io_size;
624 for (i = 0; i <= dr->dr_bio_count; i++) {
625
626 /* Finished constructing bio's for given buffer */
627 if (bio_size <= 0)
628 break;
629
630 /*
631 * By default only 'bio_count' bio's per dio are allowed.
632 * However, if we find ourselves in a situation where more
633 * are needed we allocate a larger dio and warn the user.
634 */
635 if (dr->dr_bio_count == i) {
636 vdev_disk_dio_free(dr);
637 bio_count *= 2;
638 goto retry;
639 }
640
641 /* bio_alloc() with __GFP_WAIT never returns NULL */
642 dr->dr_bio[i] = bio_alloc(GFP_NOIO,
643 MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
644 BIO_MAX_PAGES));
645 if (unlikely(dr->dr_bio[i] == NULL)) {
646 vdev_disk_dio_free(dr);
647 return (SET_ERROR(ENOMEM));
648 }
649
650 /* Matching put called by vdev_disk_physio_completion */
651 vdev_disk_dio_get(dr);
652
653 bio_set_dev(dr->dr_bio[i], bdev);
654 BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
655 dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
656 dr->dr_bio[i]->bi_private = dr;
657 bio_set_op_attrs(dr->dr_bio[i], rw, flags);
658
659 /* Remaining size is returned to become the new size */
660 bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
661 bio_size, abd_offset);
662
663 /* Advance in buffer and construct another bio if needed */
664 abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
665 bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
666 }
667
668 /* Extra reference to protect dio_request during vdev_submit_bio */
669 vdev_disk_dio_get(dr);
670
671 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
672 if (dr->dr_bio_count > 1)
673 blk_start_plug(&plug);
674 #endif
675
676 /* Submit all bio's associated with this dio */
677 for (i = 0; i < dr->dr_bio_count; i++)
678 if (dr->dr_bio[i])
679 vdev_submit_bio(dr->dr_bio[i]);
680
681 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
682 if (dr->dr_bio_count > 1)
683 blk_finish_plug(&plug);
684 #endif
685
686 (void) vdev_disk_dio_put(dr);
687
688 return (error);
689 }
690
691 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
692 {
693 zio_t *zio = bio->bi_private;
694 #ifdef HAVE_1ARG_BIO_END_IO_T
695 zio->io_error = BIO_END_IO_ERROR(bio);
696 #else
697 zio->io_error = -error;
698 #endif
699
700 if (zio->io_error && (zio->io_error == EOPNOTSUPP))
701 zio->io_vd->vdev_nowritecache = B_TRUE;
702
703 bio_put(bio);
704 ASSERT3S(zio->io_error, >=, 0);
705 if (zio->io_error)
706 vdev_disk_error(zio);
707 zio_interrupt(zio);
708 }
709
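/*
 * Send an empty flush bio to the device to force its volatile write
 * cache to stable storage.  Completion is handled asynchronously by
 * vdev_disk_io_flush_completion(), which disables future cache flushes
 * for the vdev if the device reports EOPNOTSUPP.
 */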
710 static int
711 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
712 {
713 struct request_queue *q;
714 struct bio *bio;
715
716 q = bdev_get_queue(bdev);
717 if (!q)
718 return (SET_ERROR(ENXIO));
719
720 bio = bio_alloc(GFP_NOIO, 0);
721 /* bio_alloc() with __GFP_WAIT never returns NULL */
722 if (unlikely(bio == NULL))
723 return (SET_ERROR(ENOMEM));
724
725 bio->bi_end_io = vdev_disk_io_flush_completion;
726 bio->bi_private = zio;
727 bio_set_dev(bio, bdev);
728 bio_set_flush(bio);
729 vdev_submit_bio(bio);
730 invalidate_bdev(bdev);
731
732 return (0);
733 }
734
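/*
 * ZIO pipeline entry point for this vdev type.  Cache flush ioctls are
 * handled by vdev_disk_io_flush(), TRIM by blkdev_issue_discard(), and
 * reads/writes are passed to __vdev_disk_physio().
 */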
735 static void
736 vdev_disk_io_start(zio_t *zio)
737 {
738 vdev_t *v = zio->io_vd;
739 vdev_disk_t *vd = v->vdev_tsd;
740 unsigned long trim_flags = 0;
741 int rw, flags, error;
742
743 /*
744 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
745 * Nothing to be done here but return failure.
746 */
747 if (vd == NULL) {
748 zio->io_error = ENXIO;
749 zio_interrupt(zio);
750 return;
751 }
752
753 rw_enter(&vd->vd_lock, RW_READER);
754
755 /*
756 * If the vdev is closed, it's likely due to a failed reopen and is
757 * in the UNAVAIL state. Nothing to be done here but return failure.
758 */
759 if (vd->vd_bdev == NULL) {
760 rw_exit(&vd->vd_lock);
761 zio->io_error = ENXIO;
762 zio_interrupt(zio);
763 return;
764 }
765
766 switch (zio->io_type) {
767 case ZIO_TYPE_IOCTL:
768
769 if (!vdev_readable(v)) {
770 rw_exit(&vd->vd_lock);
771 zio->io_error = SET_ERROR(ENXIO);
772 zio_interrupt(zio);
773 return;
774 }
775
776 switch (zio->io_cmd) {
777 case DKIOCFLUSHWRITECACHE:
778
779 if (zfs_nocacheflush)
780 break;
781
782 if (v->vdev_nowritecache) {
783 zio->io_error = SET_ERROR(ENOTSUP);
784 break;
785 }
786
787 error = vdev_disk_io_flush(vd->vd_bdev, zio);
788 if (error == 0) {
789 rw_exit(&vd->vd_lock);
790 return;
791 }
792
793 zio->io_error = error;
794
795 break;
796
797 default:
798 zio->io_error = SET_ERROR(ENOTSUP);
799 }
800
801 rw_exit(&vd->vd_lock);
802 zio_execute(zio);
803 return;
804 case ZIO_TYPE_WRITE:
805 rw = WRITE;
806 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
807 flags = (1 << BIO_RW_UNPLUG);
808 #elif defined(REQ_UNPLUG)
809 flags = REQ_UNPLUG;
810 #else
811 flags = 0;
812 #endif
813 break;
814
815 case ZIO_TYPE_READ:
816 rw = READ;
817 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
818 flags = (1 << BIO_RW_UNPLUG);
819 #elif defined(REQ_UNPLUG)
820 flags = REQ_UNPLUG;
821 #else
822 flags = 0;
823 #endif
824 break;
825
826 case ZIO_TYPE_TRIM:
827 #if defined(BLKDEV_DISCARD_SECURE)
828 if (zio->io_trim_flags & ZIO_TRIM_SECURE)
829 trim_flags |= BLKDEV_DISCARD_SECURE;
830 #endif
831 zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
832 zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
833 trim_flags);
834
835 rw_exit(&vd->vd_lock);
836 zio_interrupt(zio);
837 return;
838
839 default:
840 rw_exit(&vd->vd_lock);
841 zio->io_error = SET_ERROR(ENOTSUP);
842 zio_interrupt(zio);
843 return;
844 }
845
846 zio->io_target_timestamp = zio_handle_io_delay(zio);
847 error = __vdev_disk_physio(vd->vd_bdev, zio,
848 zio->io_size, zio->io_offset, rw, flags);
849 rw_exit(&vd->vd_lock);
850
851 if (error) {
852 zio->io_error = error;
853 zio_interrupt(zio);
854 return;
855 }
856 }
857
858 static void
859 vdev_disk_io_done(zio_t *zio)
860 {
861 /*
862 * If the device returned EIO, we revalidate the media. If it is
863 * determined the media has changed this triggers the asynchronous
864 * removal of the device from the configuration.
865 */
866 if (zio->io_error == EIO) {
867 vdev_t *v = zio->io_vd;
868 vdev_disk_t *vd = v->vdev_tsd;
869
870 if (check_disk_change(vd->vd_bdev)) {
871 vdev_bdev_invalidate(vd->vd_bdev);
872 v->vdev_remove_wanted = B_TRUE;
873 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
874 }
875 }
876 }
877
878 static void
879 vdev_disk_hold(vdev_t *vd)
880 {
881 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
882
883 /* We must have a pathname, and it must be absolute. */
884 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
885 return;
886
887 /*
888 * Only prefetch path and devid info if the device has
889 * never been opened.
890 */
891 if (vd->vdev_tsd != NULL)
892 return;
893
894 /* XXX: Implement me as a vnode lookup for the device */
895 vd->vdev_name_vp = NULL;
896 vd->vdev_devid_vp = NULL;
897 }
898
899 static void
900 vdev_disk_rele(vdev_t *vd)
901 {
902 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
903
904 /* XXX: Implement me as a vnode rele for the device */
905 }
906
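/*
 * Handler invoked when the zfs_vdev_scheduler module parameter is
 * changed at runtime.  The new elevator is applied to every active,
 * writable, non-suspended pool before the parameter value is stored.
 */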
907 static int
908 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
909 {
910 spa_t *spa = NULL;
911 char *p;
912
913 if (val == NULL)
914 return (SET_ERROR(-EINVAL));
915
916 if ((p = strchr(val, '\n')) != NULL)
917 *p = '\0';
918
919 if (spa_mode_global != 0) {
920 mutex_enter(&spa_namespace_lock);
921 while ((spa = spa_next(spa)) != NULL) {
922 if (spa_state(spa) != POOL_STATE_ACTIVE ||
923 !spa_writeable(spa) || spa_suspended(spa))
924 continue;
925
926 spa_open_ref(spa, FTAG);
927 mutex_exit(&spa_namespace_lock);
928 vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
929 mutex_enter(&spa_namespace_lock);
930 spa_close(spa, FTAG);
931 }
932 mutex_exit(&spa_namespace_lock);
933 }
934
935 return (param_set_charp(val, kp));
936 }
937
938 vdev_ops_t vdev_disk_ops = {
939 vdev_disk_open,
940 vdev_disk_close,
941 vdev_default_asize,
942 vdev_disk_io_start,
943 vdev_disk_io_done,
944 NULL,
945 NULL,
946 vdev_disk_hold,
947 vdev_disk_rele,
948 NULL,
949 vdev_default_xlate,
950 VDEV_TYPE_DISK, /* name of this vdev type */
951 B_TRUE /* leaf vdev */
952 };
953
954 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
955 param_get_charp, &zfs_vdev_scheduler, 0644);
956 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
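
/*
 * For example, the scheduler can typically be changed at runtime via
 * sysfs (module_param_call() with mode 0644 normally exposes it under
 * /sys/module/zfs/parameters), or set at module load time:
 *
 *   echo noop > /sys/module/zfs/parameters/zfs_vdev_scheduler
 *   modprobe zfs zfs_vdev_scheduler=noop
 */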