/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/mod_compat.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[0];	/* Attached bio's */
} dio_request_t;
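
/*
 * Note: dr_bio[] above is a flexible array member. Each dio_request_t is
 * allocated with room for dr_bio_count trailing bio pointers; see
 * vdev_disk_dio_alloc() below for the sizing arithmetic.
 */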


#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = SB_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger, or
 * at the very least equal, to its usable capacity to prevent overestimating
 * the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition layout
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
		 * "reserved" EFI partition: in such cases return the device
		 * usable capacity.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices. This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device. This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator. If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

	/*
	 * The elevator_change() function was available in kernels from
	 * 2.6.36 to 4.11. When not available fall back to using the user
	 * mode helper functionality to set the elevator via sysfs. This
	 * requires /bin/echo and sysfs to be mounted which may not be true
	 * early in the boot process.
	 */
#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error) {
		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
		    elevator, v->vdev_path, device, error);
	}
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size. Then
	 * since udev may need to recreate the device links increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d", error, count);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct request_queue *q = bdev_get_queue(vd->vd_bdev);

	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = !!blk_queue_discard(q);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(q);

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
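	/*
	 * For example, a reported 4096 byte physical block size yields
	 * highbit64(4096) - 1 = 12, i.e. ashift=12 (2^12 byte sectors).
	 */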

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));

	return (abd_scatter_bio_map_off(bio, abd, size, off));
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol, thus inadvertently converting
 * the entire macro to GPL-only. Provide a minimal version which always
 * assigns the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg	vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */

static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif
	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete. Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
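	/*
	 * For example, assuming 4 KiB pages and the common BIO_MAX_PAGES
	 * limit of 256 vectors, a single bio can map at most 1 MiB, so a
	 * 16 MiB zio would be split across 16 bio's and still fit within
	 * the default bio_count of 16.
	 */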

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	unsigned long trim_flags = 0;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_TRIM:
#if defined(BLKDEV_DISCARD_SECURE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
		    trim_flags);

		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	NULL,
	vdev_default_xlate,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
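
/*
 * Example usage (assuming the module is loaded under its usual name "zfs"):
 * the scheduler for all disk vdevs of active pools can be changed at runtime
 * with
 *   echo noop > /sys/module/zfs/parameters/zfs_vdev_scheduler
 * or made persistent with "options zfs zfs_vdev_scheduler=noop" in
 * /etc/modprobe.d/zfs.conf.
 */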