module/zfs/vdev_disk.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
25 * LLNL-CODE-403049.
26 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/abd.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/zio.h>
36 #include <linux/mod_compat.h>
37 #include <linux/msdos_fs.h>
38 #include <linux/vfs_compat.h>
39
40 char *zfs_vdev_scheduler = VDEV_SCHEDULER;
41 static void *zfs_vdev_holder = VDEV_HOLDER;
42
43 /* size of the "reserved" partition, in blocks */
44 #define EFI_MIN_RESV_SIZE (16 * 1024)
45
46 /*
47 * Virtual device vector for disks.
48 */
49 typedef struct dio_request {
50 zio_t *dr_zio; /* Parent ZIO */
51 atomic_t dr_ref; /* References */
52 int dr_error; /* Bio error */
53 int dr_bio_count; /* Count of bio's */
54 struct bio *dr_bio[0]; /* Attached bio's */
55 } dio_request_t;
56
57
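/*
 * Translate the (FREAD | FWRITE) spa open mode into the mode expected by
 * the underlying block device open interface: fmode_t flags when
 * HAVE_OPEN_BDEV_EXCLUSIVE is defined, otherwise superblock-style flags
 * (SB_RDONLY for read-only opens).
 */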
58 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
59 static fmode_t
60 vdev_bdev_mode(int smode)
61 {
62 fmode_t mode = 0;
63
64 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
65
66 if (smode & FREAD)
67 mode |= FMODE_READ;
68
69 if (smode & FWRITE)
70 mode |= FMODE_WRITE;
71
72 return (mode);
73 }
74 #else
75 static int
76 vdev_bdev_mode(int smode)
77 {
78 int mode = 0;
79
80 ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
81
82 if ((smode & FREAD) && !(smode & FWRITE))
83 mode = SB_RDONLY;
84
85 return (mode);
86 }
87 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
88
89 /*
90 * Returns the usable capacity (in bytes) for the partition or disk.
91 */
92 static uint64_t
93 bdev_capacity(struct block_device *bdev)
94 {
95 return (i_size_read(bdev->bd_inode));
96 }
97
98 /*
99 * Returns the maximum expansion capacity of the block device (in bytes).
100 *
101 * It is possible to expand a vdev when it has been created as a wholedisk
102  * and the containing block device has increased in capacity, or when the
103 * partition containing the pool has been manually increased in size.
104 *
105 * This function is only responsible for calculating the potential expansion
106  * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
107  * responsible for verifying the expected partition layout in the wholedisk
108  * case, and for updating the partition table if appropriate. Once the partition
109 * size has been increased the additional capacity will be visible using
110 * bdev_capacity().
111 */
112 static uint64_t
113 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
114 {
115 uint64_t psize;
116 int64_t available;
117
118 if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
119 /*
120  * When reporting maximum expansion capacity for a wholedisk,
121  * deduct any capacity which is expected to be lost due to
122  * alignment restrictions. Over-reporting this value isn't
123 * harmful and would only result in slightly less capacity
124 * than expected post expansion.
125 */
126 available = i_size_read(bdev->bd_contains->bd_inode) -
127 ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
128 PARTITION_END_ALIGNMENT) << SECTOR_BITS);
129 if (available > 0)
130 psize = available;
131 else
132 psize = bdev_capacity(bdev);
133 } else {
134 psize = bdev_capacity(bdev);
135 }
136
137 return (psize);
138 }
139
140 static void
141 vdev_disk_error(zio_t *zio)
142 {
143 /*
144 * This function can be called in interrupt context, for instance while
145 * handling IRQs coming from a misbehaving disk device; use printk()
146 * which is safe from any context.
147 */
148 printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
149 "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
150 zio->io_vd->vdev_path, zio->io_error, zio->io_type,
151 (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
152 zio->io_flags);
153 }
154
155 /*
156  * Use the Linux 'noop' elevator for zfs-managed block devices. This
157  * strikes the ideal balance by allowing the zfs elevator to do all
158  * request ordering and prioritization, while allowing the Linux
159 * elevator to do the maximum front/back merging allowed by the
160 * physical device. This yields the largest possible requests for
161 * the device with the lowest total overhead.
162 */
163 static void
164 vdev_elevator_switch(vdev_t *v, char *elevator)
165 {
166 vdev_disk_t *vd = v->vdev_tsd;
167 struct request_queue *q;
168 char *device;
169 int error;
170
171 for (int c = 0; c < v->vdev_children; c++)
172 vdev_elevator_switch(v->vdev_child[c], elevator);
173
174 if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
175 return;
176
177 q = bdev_get_queue(vd->vd_bdev);
178 device = vd->vd_bdev->bd_disk->disk_name;
179
180 /*
181 * Skip devices which are not whole disks (partitions).
182 * Device-mapper devices are excepted since they may be whole
183 * disks despite the vdev_wholedisk flag, in which case we can
184 * and should switch the elevator. If the device-mapper device
185 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
186 * "Skip devices without schedulers" check below will fail.
187 */
188 if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
189 return;
190
191 /* Leave existing scheduler when set to "none" */
192 if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
193 return;
194
195 /*
196 * The elevator_change() function was available in kernels from
197  * 2.6.36 to 4.11. When it is not available, fall back to using the
198  * user mode helper functionality to set the elevator via sysfs. This
199  * requires /bin/echo and sysfs to be mounted, which may not be true
200 * early in the boot process.
201 */
202 #ifdef HAVE_ELEVATOR_CHANGE
203 error = elevator_change(q, elevator);
204 #else
205 #define SET_SCHEDULER_CMD \
206 "exec 0</dev/null " \
207 " 1>/sys/block/%s/queue/scheduler " \
208 " 2>/dev/null; " \
209 "echo %s"
210
211 char *argv[] = { "/bin/sh", "-c", NULL, NULL };
212 char *envp[] = { NULL };
213
214 argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
215 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
216 strfree(argv[2]);
217 #endif /* HAVE_ELEVATOR_CHANGE */
218 if (error) {
219 zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
220 elevator, v->vdev_path, device, error);
221 }
222 }
223
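/*
 * Open the block device backing this vdev and collect its geometry.
 * On success vd_bdev is set, vdev_tsd points at the vdev_disk_t, and
 * *psize, *max_psize and *ashift are filled in for the caller.
 */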
224 static int
225 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
226 uint64_t *ashift)
227 {
228 struct block_device *bdev;
229 fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
230 int count = 0, block_size;
231 int bdev_retry_count = 50;
232 vdev_disk_t *vd;
233
234 /* Must have a pathname and it must be absolute. */
235 if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
236 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
237 vdev_dbgmsg(v, "invalid vdev_path");
238 return (SET_ERROR(EINVAL));
239 }
240
241 /*
242 * Reopen the device if it is currently open. When expanding a
243  * partition, force a re-scan of the partition table while closed
244  * in order to get an accurate updated block device size. Then,
245  * since udev may need to recreate the device links, increase the
246 * open retry count before reporting the device as unavailable.
247 */
248 vd = v->vdev_tsd;
249 if (vd) {
250 char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
251 boolean_t reread_part = B_FALSE;
252
253 rw_enter(&vd->vd_lock, RW_WRITER);
254 bdev = vd->vd_bdev;
255 vd->vd_bdev = NULL;
256
257 if (bdev) {
258 if (v->vdev_expanding && bdev != bdev->bd_contains) {
259 bdevname(bdev->bd_contains, disk_name + 5);
260 reread_part = B_TRUE;
261 }
262
263 vdev_bdev_close(bdev, mode);
264 }
265
266 if (reread_part) {
267 bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
268 if (!IS_ERR(bdev)) {
269 int error = vdev_bdev_reread_part(bdev);
270 vdev_bdev_close(bdev, mode);
271 if (error == 0)
272 bdev_retry_count = 100;
273 }
274 }
275 } else {
276 vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
277
278 rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
279 rw_enter(&vd->vd_lock, RW_WRITER);
280 }
281
282 /*
283 * Devices are always opened by the path provided at configuration
284 * time. This means that if the provided path is a udev by-id path
285 * then drives may be re-cabled without an issue. If the provided
286 * path is a udev by-path path, then the physical location information
287 * will be preserved. This can be critical for more complicated
288 * configurations where drives are located in specific physical
289  * locations to maximize the system's tolerance to component failure.
290 *
291 * Alternatively, you can provide your own udev rule to flexibly map
292 * the drives as you see fit. It is not advised that you use the
293  * /dev/[hd]d devices, which may be reordered due to probing order.
294 * Devices in the wrong locations will be detected by the higher
295 * level vdev validation.
296 *
297 * The specified paths may be briefly removed and recreated in
298 * response to udev events. This should be exceptionally unlikely
299 * because the zpool command makes every effort to verify these paths
300 * have already settled prior to reaching this point. Therefore,
301  * an ENOENT failure at this point is highly likely to be transient
302 * and it is reasonable to sleep and retry before giving up. In
303 * practice delays have been observed to be on the order of 100ms.
304 */
305 bdev = ERR_PTR(-ENXIO);
306 while (IS_ERR(bdev) && count < bdev_retry_count) {
307 bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
308 if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
309 schedule_timeout(MSEC_TO_TICK(10));
310 count++;
311 } else if (IS_ERR(bdev)) {
312 break;
313 }
314 }
315
316 if (IS_ERR(bdev)) {
317 int error = -PTR_ERR(bdev);
318 vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
319 vd->vd_bdev = NULL;
320 v->vdev_tsd = vd;
321 rw_exit(&vd->vd_lock);
322 return (SET_ERROR(error));
323 } else {
324 vd->vd_bdev = bdev;
325 v->vdev_tsd = vd;
326 rw_exit(&vd->vd_lock);
327 }
328
329 /* Determine the physical block size */
330 block_size = vdev_bdev_block_size(vd->vd_bdev);
331
332 /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
333 v->vdev_nowritecache = B_FALSE;
334
335 /* Inform the ZIO pipeline that we are non-rotational */
336 v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
337
338 /* Physical volume size in bytes for the partition */
339 *psize = bdev_capacity(vd->vd_bdev);
340
341 /* Physical volume size in bytes including possible expansion space */
342 *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
343
344  /* Derive the ashift from the minimum sector size (e.g. 4096 -> ashift 12) */
345 *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
346
347 /* Try to set the io scheduler elevator algorithm */
348 (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
349
350 return (0);
351 }
352
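/*
 * Release the underlying block device and free the vdev_disk_t. Skipped
 * while the vdev is being reopened so the open handle is preserved.
 */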
353 static void
354 vdev_disk_close(vdev_t *v)
355 {
356 vdev_disk_t *vd = v->vdev_tsd;
357
358 if (v->vdev_reopening || vd == NULL)
359 return;
360
361 if (vd->vd_bdev != NULL) {
362 vdev_bdev_close(vd->vd_bdev,
363 vdev_bdev_mode(spa_mode(v->vdev_spa)));
364 }
365
366 rw_destroy(&vd->vd_lock);
367 kmem_free(vd, sizeof (vdev_disk_t));
368 v->vdev_tsd = NULL;
369 }
370
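/*
 * Allocate a dio_request with room for 'bio_count' bio pointers in its
 * trailing array. The reference count starts at zero; callers take
 * references with vdev_disk_dio_get() as bio's are attached.
 */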
371 static dio_request_t *
372 vdev_disk_dio_alloc(int bio_count)
373 {
374 dio_request_t *dr;
375 int i;
376
377 dr = kmem_zalloc(sizeof (dio_request_t) +
378 sizeof (struct bio *) * bio_count, KM_SLEEP);
379 if (dr) {
380 atomic_set(&dr->dr_ref, 0);
381 dr->dr_bio_count = bio_count;
382 dr->dr_error = 0;
383
384 for (i = 0; i < dr->dr_bio_count; i++)
385 dr->dr_bio[i] = NULL;
386 }
387
388 return (dr);
389 }
390
391 static void
392 vdev_disk_dio_free(dio_request_t *dr)
393 {
394 int i;
395
396 for (i = 0; i < dr->dr_bio_count; i++)
397 if (dr->dr_bio[i])
398 bio_put(dr->dr_bio[i]);
399
400 kmem_free(dr, sizeof (dio_request_t) +
401 sizeof (struct bio *) * dr->dr_bio_count);
402 }
403
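/*
 * Reference counting for the dio_request. A reference is held for each
 * in-flight bio plus one for the submitting thread; the final put in
 * vdev_disk_dio_put() completes the parent zio and frees the request.
 */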
404 static void
405 vdev_disk_dio_get(dio_request_t *dr)
406 {
407 atomic_inc(&dr->dr_ref);
408 }
409
410 static int
411 vdev_disk_dio_put(dio_request_t *dr)
412 {
413 int rc = atomic_dec_return(&dr->dr_ref);
414
415 /*
416 * Free the dio_request when the last reference is dropped and
417  * ensure zio_delay_interrupt() is called only once with the correct zio.
418 */
419 if (rc == 0) {
420 zio_t *zio = dr->dr_zio;
421 int error = dr->dr_error;
422
423 vdev_disk_dio_free(dr);
424
425 if (zio) {
426 zio->io_error = error;
427 ASSERT3S(zio->io_error, >=, 0);
428 if (zio->io_error)
429 vdev_disk_error(zio);
430
431 zio_delay_interrupt(zio);
432 }
433 }
434
435 return (rc);
436 }
437
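/*
 * Per-bio completion callback. Records the first error observed across
 * all bio's attached to the dio_request and drops that bio's reference.
 */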
438 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
439 {
440 dio_request_t *dr = bio->bi_private;
441 int rc;
442
443 if (dr->dr_error == 0) {
444 #ifdef HAVE_1ARG_BIO_END_IO_T
445 dr->dr_error = BIO_END_IO_ERROR(bio);
446 #else
447 if (error)
448 dr->dr_error = -(error);
449 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
450 dr->dr_error = EIO;
451 #endif
452 }
453
454 /* Drop reference acquired by __vdev_disk_physio */
455 rc = vdev_disk_dio_put(dr);
456 }
457
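/*
 * Add pages backing the 'bio_size' bytes at 'bio_ptr' to the bio, up to
 * the bio's bi_max_vecs limit. Returns the number of bytes which could
 * not be mapped; a non-zero result means another bio is required.
 */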
458 static unsigned int
459 bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
460 {
461 unsigned int offset, size, i;
462 struct page *page;
463
464 offset = offset_in_page(bio_ptr);
465 for (i = 0; i < bio->bi_max_vecs; i++) {
466 size = PAGE_SIZE - offset;
467
468 if (bio_size <= 0)
469 break;
470
471 if (size > bio_size)
472 size = bio_size;
473
474 if (is_vmalloc_addr(bio_ptr))
475 page = vmalloc_to_page(bio_ptr);
476 else
477 page = virt_to_page(bio_ptr);
478
479 /*
480  * Some network-related block devices use tcp_sendpage, which
481  * doesn't behave well when given a 0-count page; this is a
482  * safety net to catch them.
483 */
484 ASSERT3S(page_count(page), >, 0);
485
486 if (bio_add_page(bio, page, size, offset) != size)
487 break;
488
489 bio_ptr += size;
490 bio_size -= size;
491 offset = 0;
492 }
493
494 return (bio_size);
495 }
496
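/*
 * Map 'size' bytes of the abd, starting at offset 'off', into the bio.
 * Linear abds are handled directly by bio_map(); scatter abds are mapped
 * page by page via abd_scatter_bio_map_off().
 */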
497 static unsigned int
498 bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
499 {
500 if (abd_is_linear(abd))
501 return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
502
503 return (abd_scatter_bio_map_off(bio, abd, size, off));
504 }
505
506 static inline void
507 vdev_submit_bio_impl(struct bio *bio)
508 {
509 #ifdef HAVE_1ARG_SUBMIT_BIO
510 submit_bio(bio);
511 #else
512 submit_bio(0, bio);
513 #endif
514 }
515
516 #ifdef HAVE_BIO_SET_DEV
517 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
518 /*
519 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
520  * GPL-only bio_associate_blkg() symbol, thus inadvertently converting
521  * the entire macro to GPL-only. Provide a minimal version which always
522  * assigns the request queue's root_blkg to the bio.
523 */
524 static inline void
525 vdev_bio_associate_blkg(struct bio *bio)
526 {
527 struct request_queue *q = bio->bi_disk->queue;
528
529 ASSERT3P(q, !=, NULL);
530 ASSERT3P(q->root_blkg, !=, NULL);
531 ASSERT3P(bio->bi_blkg, ==, NULL);
532
533 if (blkg_tryget(q->root_blkg))
534 bio->bi_blkg = q->root_blkg;
535 }
536 #define bio_associate_blkg vdev_bio_associate_blkg
537 #endif
538 #else
539 /*
540 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
541 */
542 static inline void
543 bio_set_dev(struct bio *bio, struct block_device *bdev)
544 {
545 bio->bi_bdev = bdev;
546 }
547 #endif /* HAVE_BIO_SET_DEV */
548
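/*
 * Submit a bio while temporarily clearing current->bio_list (or the
 * older current->bio_tail) so generic_make_request() does not defer the
 * bio on the calling thread's recursion-avoidance list.
 */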
549 static inline void
550 vdev_submit_bio(struct bio *bio)
551 {
552 #ifdef HAVE_CURRENT_BIO_TAIL
553 struct bio **bio_tail = current->bio_tail;
554 current->bio_tail = NULL;
555 vdev_submit_bio_impl(bio);
556 current->bio_tail = bio_tail;
557 #else
558 struct bio_list *bio_list = current->bio_list;
559 current->bio_list = NULL;
560 vdev_submit_bio_impl(bio);
561 current->bio_list = bio_list;
562 #endif
563 }
564
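/*
 * Issue the zio's I/O to the block device. The request is described by a
 * dio_request holding one or more bio's which together cover the zio's
 * abd buffer; if the initial guess of 16 bio's is too small, the dio is
 * reallocated with twice as many and construction is retried.
 */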
565 static int
566 __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
567 size_t io_size, uint64_t io_offset, int rw, int flags)
568 {
569 dio_request_t *dr;
570 uint64_t abd_offset;
571 uint64_t bio_offset;
572 int bio_size, bio_count = 16;
573 int i = 0, error = 0;
574 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
575 struct blk_plug plug;
576 #endif
577 /*
578 * Accessing outside the block device is never allowed.
579 */
580 if (io_offset + io_size > bdev->bd_inode->i_size) {
581 vdev_dbgmsg(zio->io_vd,
582 "Illegal access %llu size %llu, device size %llu",
583 io_offset, io_size, i_size_read(bdev->bd_inode));
584 return (SET_ERROR(EIO));
585 }
586
587 retry:
588 dr = vdev_disk_dio_alloc(bio_count);
589 if (dr == NULL)
590 return (SET_ERROR(ENOMEM));
591
592 if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
593 bio_set_flags_failfast(bdev, &flags);
594
595 dr->dr_zio = zio;
596
597 /*
598 * When the IO size exceeds the maximum bio size for the request
599  * queue we are forced to break the IO into multiple bio's and wait
600 * for them all to complete. Ideally, all pool users will set
601 * their volume block size to match the maximum request size and
602 * the common case will be one bio per vdev IO request.
603 */
604
605 abd_offset = 0;
606 bio_offset = io_offset;
607 bio_size = io_size;
608 for (i = 0; i <= dr->dr_bio_count; i++) {
609
610 /* Finished constructing bio's for given buffer */
611 if (bio_size <= 0)
612 break;
613
614 /*
615 * By default only 'bio_count' bio's per dio are allowed.
616 * However, if we find ourselves in a situation where more
617  * are needed we double the bio count and retry with a larger dio.
618 */
619 if (dr->dr_bio_count == i) {
620 vdev_disk_dio_free(dr);
621 bio_count *= 2;
622 goto retry;
623 }
624
625 /* bio_alloc() with __GFP_WAIT never returns NULL */
626 dr->dr_bio[i] = bio_alloc(GFP_NOIO,
627 MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
628 BIO_MAX_PAGES));
629 if (unlikely(dr->dr_bio[i] == NULL)) {
630 vdev_disk_dio_free(dr);
631 return (SET_ERROR(ENOMEM));
632 }
633
634 /* Matching put called by vdev_disk_physio_completion */
635 vdev_disk_dio_get(dr);
636
637 bio_set_dev(dr->dr_bio[i], bdev);
638 BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
639 dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
640 dr->dr_bio[i]->bi_private = dr;
641 bio_set_op_attrs(dr->dr_bio[i], rw, flags);
642
643 /* Remaining size is returned to become the new size */
644 bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
645 bio_size, abd_offset);
646
647 /* Advance in buffer and construct another bio if needed */
648 abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
649 bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
650 }
651
652 /* Extra reference to protect dio_request during vdev_submit_bio */
653 vdev_disk_dio_get(dr);
654
655 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
656 if (dr->dr_bio_count > 1)
657 blk_start_plug(&plug);
658 #endif
659
660 /* Submit all bio's associated with this dio */
661 for (i = 0; i < dr->dr_bio_count; i++)
662 if (dr->dr_bio[i])
663 vdev_submit_bio(dr->dr_bio[i]);
664
665 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
666 if (dr->dr_bio_count > 1)
667 blk_finish_plug(&plug);
668 #endif
669
670 (void) vdev_disk_dio_put(dr);
671
672 return (error);
673 }
674
675 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
676 {
677 zio_t *zio = bio->bi_private;
678 #ifdef HAVE_1ARG_BIO_END_IO_T
679 zio->io_error = BIO_END_IO_ERROR(bio);
680 #else
681 zio->io_error = -error;
682 #endif
683
684  if (zio->io_error == EOPNOTSUPP)
685 zio->io_vd->vdev_nowritecache = B_TRUE;
686
687 bio_put(bio);
688 ASSERT3S(zio->io_error, >=, 0);
689 if (zio->io_error)
690 vdev_disk_error(zio);
691 zio_interrupt(zio);
692 }
693
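/*
 * Send an empty flush bio to the device to implement
 * DKIOCFLUSHWRITECACHE. Completion is handled asynchronously by
 * vdev_disk_io_flush_completion(), which sets vdev_nowritecache when
 * the device returns EOPNOTSUPP so further cache flushes are skipped.
 */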
694 static int
695 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
696 {
697 struct request_queue *q;
698 struct bio *bio;
699
700 q = bdev_get_queue(bdev);
701 if (!q)
702 return (SET_ERROR(ENXIO));
703
704 bio = bio_alloc(GFP_NOIO, 0);
705 /* bio_alloc() with __GFP_WAIT never returns NULL */
706 if (unlikely(bio == NULL))
707 return (SET_ERROR(ENOMEM));
708
709 bio->bi_end_io = vdev_disk_io_flush_completion;
710 bio->bi_private = zio;
711 bio_set_dev(bio, bdev);
712 bio_set_flush(bio);
713 vdev_submit_bio(bio);
714 invalidate_bdev(bdev);
715
716 return (0);
717 }
718
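/*
 * Entry point for the ZIO pipeline. Ioctl (cache flush) requests are
 * handled here directly; reads and writes are translated into bio's and
 * submitted via __vdev_disk_physio(). vd_lock is taken as reader so
 * vd_bdev cannot change underneath an in-flight request during a reopen.
 */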
719 static void
720 vdev_disk_io_start(zio_t *zio)
721 {
722 vdev_t *v = zio->io_vd;
723 vdev_disk_t *vd = v->vdev_tsd;
724 int rw, flags, error;
725
726 /*
727 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
728 * Nothing to be done here but return failure.
729 */
730 if (vd == NULL) {
731 zio->io_error = ENXIO;
732 zio_interrupt(zio);
733 return;
734 }
735
736 rw_enter(&vd->vd_lock, RW_READER);
737
738 /*
739 * If the vdev is closed, it's likely due to a failed reopen and is
740 * in the UNAVAIL state. Nothing to be done here but return failure.
741 */
742 if (vd->vd_bdev == NULL) {
743 rw_exit(&vd->vd_lock);
744 zio->io_error = ENXIO;
745 zio_interrupt(zio);
746 return;
747 }
748
749 switch (zio->io_type) {
750 case ZIO_TYPE_IOCTL:
751
752 if (!vdev_readable(v)) {
753 rw_exit(&vd->vd_lock);
754 zio->io_error = SET_ERROR(ENXIO);
755 zio_interrupt(zio);
756 return;
757 }
758
759 switch (zio->io_cmd) {
760 case DKIOCFLUSHWRITECACHE:
761
762 if (zfs_nocacheflush)
763 break;
764
765 if (v->vdev_nowritecache) {
766 zio->io_error = SET_ERROR(ENOTSUP);
767 break;
768 }
769
770 error = vdev_disk_io_flush(vd->vd_bdev, zio);
771 if (error == 0) {
772 rw_exit(&vd->vd_lock);
773 return;
774 }
775
776 zio->io_error = error;
777
778 break;
779
780 default:
781 zio->io_error = SET_ERROR(ENOTSUP);
782 }
783
784 rw_exit(&vd->vd_lock);
785 zio_execute(zio);
786 return;
787 case ZIO_TYPE_WRITE:
788 rw = WRITE;
789 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
790 flags = (1 << BIO_RW_UNPLUG);
791 #elif defined(REQ_UNPLUG)
792 flags = REQ_UNPLUG;
793 #else
794 flags = 0;
795 #endif
796 break;
797
798 case ZIO_TYPE_READ:
799 rw = READ;
800 #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
801 flags = (1 << BIO_RW_UNPLUG);
802 #elif defined(REQ_UNPLUG)
803 flags = REQ_UNPLUG;
804 #else
805 flags = 0;
806 #endif
807 break;
808
809 default:
810 rw_exit(&vd->vd_lock);
811 zio->io_error = SET_ERROR(ENOTSUP);
812 zio_interrupt(zio);
813 return;
814 }
815
816 zio->io_target_timestamp = zio_handle_io_delay(zio);
817 error = __vdev_disk_physio(vd->vd_bdev, zio,
818 zio->io_size, zio->io_offset, rw, flags);
819 rw_exit(&vd->vd_lock);
820
821 if (error) {
822 zio->io_error = error;
823 zio_interrupt(zio);
824 return;
825 }
826 }
827
828 static void
829 vdev_disk_io_done(zio_t *zio)
830 {
831 /*
832 * If the device returned EIO, we revalidate the media. If it is
833  * determined the media has changed, this triggers the asynchronous
834 * removal of the device from the configuration.
835 */
836 if (zio->io_error == EIO) {
837 vdev_t *v = zio->io_vd;
838 vdev_disk_t *vd = v->vdev_tsd;
839
840 if (check_disk_change(vd->vd_bdev)) {
841 vdev_bdev_invalidate(vd->vd_bdev);
842 v->vdev_remove_wanted = B_TRUE;
843 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
844 }
845 }
846 }
847
848 static void
849 vdev_disk_hold(vdev_t *vd)
850 {
851 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
852
853 /* We must have a pathname, and it must be absolute. */
854 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
855 return;
856
857 /*
858 * Only prefetch path and devid info if the device has
859 * never been opened.
860 */
861 if (vd->vdev_tsd != NULL)
862 return;
863
864 /* XXX: Implement me as a vnode lookup for the device */
865 vd->vdev_name_vp = NULL;
866 vd->vdev_devid_vp = NULL;
867 }
868
869 static void
870 vdev_disk_rele(vdev_t *vd)
871 {
872 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
873
874 /* XXX: Implement me as a vnode rele for the device */
875 }
876
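/*
 * Handler for writes to the zfs_vdev_scheduler module parameter (e.g.
 * via /sys/module/zfs/parameters/zfs_vdev_scheduler). The new elevator
 * is applied to every active, writeable, non-suspended pool before the
 * parameter value itself is stored.
 */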
877 static int
878 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
879 {
880 spa_t *spa = NULL;
881 char *p;
882
883 if (val == NULL)
884 return (SET_ERROR(-EINVAL));
885
886 if ((p = strchr(val, '\n')) != NULL)
887 *p = '\0';
888
889 if (spa_mode_global != 0) {
890 mutex_enter(&spa_namespace_lock);
891 while ((spa = spa_next(spa)) != NULL) {
892 if (spa_state(spa) != POOL_STATE_ACTIVE ||
893 !spa_writeable(spa) || spa_suspended(spa))
894 continue;
895
896 spa_open_ref(spa, FTAG);
897 mutex_exit(&spa_namespace_lock);
898 vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
899 mutex_enter(&spa_namespace_lock);
900 spa_close(spa, FTAG);
901 }
902 mutex_exit(&spa_namespace_lock);
903 }
904
905 return (param_set_charp(val, kp));
906 }
907
908 vdev_ops_t vdev_disk_ops = {
909 vdev_disk_open,
910 vdev_disk_close,
911 vdev_default_asize,
912 vdev_disk_io_start,
913 vdev_disk_io_done,
914  NULL, /* vdev_op_state_change */
915  NULL, /* vdev_op_need_resilver */
916 vdev_disk_hold,
917 vdev_disk_rele,
918  NULL, /* vdev_op_remap */
919 vdev_default_xlate,
920 VDEV_TYPE_DISK, /* name of this vdev type */
921 B_TRUE /* leaf vdev */
922 };
923
924 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
925 param_get_charp, &zfs_vdev_scheduler, 0644);
926 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");