/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <linux/mod_compat.h>
#include <linux/msdos_fs.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[0];	/* Attached bio's */
} dio_request_t;

#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = MS_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
static uint64_t
bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
{
	struct hd_struct *part = bdev->bd_part;
	uint64_t sectors = get_capacity(bdev->bd_disk);
	/* If there are no partitions, return the entire device capacity */
	if (part == NULL)
		return (sectors << SECTOR_BITS);

	/*
	 * If there are partitions, decide if we are using a `wholedisk`
	 * layout (composed of part1 and part9) or just a single partition.
	 */
	if (wholedisk) {
		/* Verify the expected device layout */
		ASSERT3P(bdev, !=, bdev->bd_contains);
		/*
		 * Sectors used by the EFI partition (part9) as well as
		 * partition alignment.
		 */
		uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT;

		/* Space available to the vdev, i.e. the size of part1 */
		if (sectors <= used)
			return (0);
		uint64_t available = sectors - used;
		return (available << SECTOR_BITS);
	} else {
		/* The partition capacity referenced by the block device */
		return (part->nr_sects << SECTOR_BITS);
	}
}

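/*
 * Worked example (disk size and alignment values assumed purely for
 * illustration): with 512-byte sectors (SECTOR_BITS == 9) a 20 GiB
 * whole disk reports get_capacity() == 41943040 sectors.  Assuming
 * NEW_START_BLOCK and PARTITION_END_ALIGNMENT add up to 4096 sectors,
 * the reserved space is 16384 + 4096 = 20480 sectors (10 MiB), so the
 * vdev sees 41943040 - 20480 = 41922560 sectors, i.e.
 * 41922560 << 9 = 21464350720 bytes of usable capacity in part1.
 */
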
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
#endif
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs.  This requires /bin/echo and sysfs to be
	 * mounted, which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	" 1>/sys/block/%s/queue/scheduler " \
	" 2>/dev/null; " \
	"echo %s"

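	/*
	 * For illustration only (device and elevator names assumed): with
	 * device "sda" and elevator "noop", kmem_asprintf() below produces
	 *
	 *	exec 0</dev/null 1>/sys/block/sda/queue/scheduler 2>/dev/null; echo noop
	 *
	 * which /bin/sh -c then runs, echoing the requested elevator into
	 * the block device's scheduler attribute in sysfs.
	 */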
	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);
}

205
b5a28807
ED
206/*
207 * Expanding a whole disk vdev involves invoking BLKRRPART on the
208 * whole disk device. This poses a problem, because BLKRRPART will
209 * return EBUSY if one of the disk's partitions is open. That's why
210 * we have to do it here, just before opening the data partition.
211 * Unfortunately, BLKRRPART works by dropping all partitions and
212 * recreating them, which means that for a short time window, all
213 * /dev/sdxN device files disappear (until udev recreates them).
214 * This means two things:
215 * - When we open the data partition just after a BLKRRPART, we
216 * can't do it using the normal device file path because of the
217 * obvious race condition with udev. Instead, we use reliable
218 * kernel APIs to get a handle to the new partition device from
219 * the whole disk device.
220 * - Because vdev_disk_open() initially needs to find the device
221 * using its path, multiple vdev_disk_open() invocations in
222 * short succession on the same disk with BLKRRPARTs in the
223 * middle have a high probability of failure (because of the
224 * race condition with udev). A typical situation where this
225 * might happen is when the zpool userspace tool does a
226 * TRYIMPORT immediately followed by an IMPORT. For this
227 * reason, we only invoke BLKRRPART in the module when strictly
228 * necessary (zpool online -e case), and rely on userspace to
229 * do it when possible.
230 */
231static struct block_device *
232vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
233{
234#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
235 struct block_device *bdev, *result = ERR_PTR(-ENXIO);
236 struct gendisk *disk;
237 int error, partno;
238
8128bd89 239 bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
b5a28807 240 if (IS_ERR(bdev))
d1d7e268 241 return (bdev);
b5a28807
ED
242
243 disk = get_gendisk(bdev->bd_dev, &partno);
244 vdev_bdev_close(bdev, vdev_bdev_mode(mode));
245
246 if (disk) {
247 bdev = bdget(disk_devt(disk));
248 if (bdev) {
249 error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
250 if (error == 0)
251 error = ioctl_by_bdev(bdev, BLKRRPART, 0);
252 vdev_bdev_close(bdev, vdev_bdev_mode(mode));
253 }
254
255 bdev = bdget_disk(disk, partno);
256 if (bdev) {
257 error = blkdev_get(bdev,
258 vdev_bdev_mode(mode) | FMODE_EXCL, vd);
259 if (error == 0)
260 result = bdev;
261 }
262 put_disk(disk);
263 }
264
d1d7e268 265 return (result);
b5a28807 266#else
d1d7e268 267 return (ERR_PTR(-EOPNOTSUPP));
b5a28807
ED
268#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
269}
270
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be recabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, -PTR_ERR(bdev), count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(-PTR_ERR(bdev)));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit so vdev_reopen() will try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
	*max_psize = *psize;

	/* Based on the minimum sector size, set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
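
	/*
	 * For example (block sizes assumed for illustration): a device with
	 * a 4096-byte physical block size yields highbit64(4096) - 1 = 12,
	 * i.e. ashift=12, while a 512-byte device yields ashift=9.
	 */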

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL)
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));

	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

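/*
 * Reference counting summary: a dio_request starts with dr_ref == 0.
 * __vdev_disk_physio() takes one reference per attached bio plus one
 * extra reference for the submitting thread.  Each bio completion and
 * the submitter then drop one reference via vdev_disk_dio_put(), and
 * whoever drops the last reference frees the dio and completes the
 * parent zio.
 */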
static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_delay_interrupt() is called only once with the
	 * correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));

	return (abd_scatter_bio_map_off(bio, abd, size, off));
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

#ifndef HAVE_BIO_SET_DEV
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* !HAVE_BIO_SET_DEV */

static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif

	ASSERT(zio != NULL);
	ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (ENOMEM);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue, we are forced to break the IO into multiple bio's and
	 * wait for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

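	/*
	 * Worked example (sizes assumed for illustration): a 256 KiB zvol
	 * write against a queue that only accepts 64 KiB per bio is split
	 * into four bio's attached to a single dio_request.  Should a
	 * request ever need more than the current bio_count (initially 16),
	 * the partially built dio is freed, bio_count is doubled, and
	 * construction restarts at the retry label above.
	 */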
	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and retry.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (ENOMEM);
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (ENXIO);

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (ENOMEM);

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return;

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
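
/*
 * Usage note (paths assumed, typical for the zfs module): the scheduler
 * may be set at module load time, e.g.
 *
 *	modprobe zfs zfs_vdev_scheduler=noop
 *
 * or changed at runtime through sysfs, e.g.
 *
 *	echo noop > /sys/module/zfs/parameters/zfs_vdev_scheduler
 *
 * in which case param_set_vdev_scheduler() above re-applies the elevator
 * to every active, writable, non-suspended pool.
 */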