/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/mod_compat.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[0];	/* Attached bio's */
} dio_request_t;
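/*
 * Note: dr_bio[] is a zero-length (flexible) array member; each dio_request_t
 * is allocated with room for dr_bio_count trailing bio pointers in a single
 * allocation (see vdev_disk_dio_alloc() below).  dr_ref counts the attached
 * bios plus an extra reference held across submission, so the request is
 * freed exactly once when the last reference is dropped.
 */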


#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = SB_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk,
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		if (available > 0)
			psize = available;
		else
			psize = bdev_capacity(bdev);
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}
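/*
 * Worked example (values are illustrative assumptions, not from this file):
 * with 512-byte sectors and the typical NEW_START_BLOCK and
 * PARTITION_END_ALIGNMENT of 2048 sectors each, the deduction above is
 * (16384 + 2048 + 2048) << 9 bytes, i.e. roughly 10 MiB held back from the
 * whole-disk size before it is reported as expandable capacity.
 */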

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

	/*
	 * The elevator_change() function was available in kernels from
	 * 2.6.36 to 4.11.  When not available fall back to using the user
	 * mode helper functionality to set the elevator via sysfs.  This
	 * requires /bin/echo and sysfs to be mounted which may not be true
	 * early in the boot process.
	 */
#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	" 1>/sys/block/%s/queue/scheduler " \
	" 2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error) {
		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);
	}
}
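/*
 * For reference, with device "sda" and elevator "noop" the usermode helper
 * fallback above is equivalent to running:
 *
 *	/bin/sh -c "exec 0</dev/null 1>/sys/block/sda/queue/scheduler \
 *	    2>/dev/null; echo noop"
 *
 * i.e. writing "noop" to /sys/block/sda/queue/scheduler.
 */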

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size.  Then
	 * since udev may need to recreate the device links increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
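	/*
	 * Example: a device reporting a 4096-byte block size yields
	 * highbit64(4096) - 1 == 12 (ashift=12); anything at or below
	 * SPA_MINBLOCKSIZE (512 bytes) is clamped to ashift=9.
	 */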

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));

	return (abd_scatter_bio_map_off(bio, abd, size, off));
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol, thus inadvertently converting
 * the entire macro to be GPL-only.  Provide a minimal version which
 * always assigns the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(q->root_blkg, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */

static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}
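/*
 * Note: current->bio_list (bio_tail on older kernels) is saved and cleared
 * around the call above so that generic_make_request() dispatches the bio
 * immediately instead of deferring it onto the submitting task's recursion
 * list; this avoids stalls when ZFS is itself running inside a block I/O
 * context (for example a pool layered on zvols or loop devices).
 */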

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif
	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO in multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
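	/*
	 * Construction below starts with the default bio_count of 16.  If
	 * the buffer cannot be fully mapped with that many bios, the
	 * partially built dio_request is freed, bio_count is doubled, and
	 * the work restarts from the 'retry' label above, so arbitrarily
	 * large I/Os are eventually covered.
	 */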

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}
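/*
 * Note: the flush bio above is allocated with zero bio_vecs and carries no
 * data; bio_set_flush() tags it as a cache-flush request (the exact flag,
 * e.g. REQ_PREFLUSH, depends on the kernel version), so its completion
 * indicates the device's volatile write cache has been emptied.  If the
 * device reports EOPNOTSUPP, the completion handler above disables further
 * cache flushes for this vdev.
 */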

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	NULL,
	vdev_default_xlate,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
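/*
 * Example usage: because the parameter is registered with mode 0644, the
 * scheduler can be changed at runtime, e.g.
 *
 *	echo noop > /sys/module/zfs/parameters/zfs_vdev_scheduler
 *
 * which invokes param_set_vdev_scheduler() above and re-applies the elevator
 * to every active, writable, non-suspended pool.
 */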