/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[0];	/* Attached bio's */
} dio_request_t;


#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = MS_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

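/*
 * Return the device capacity in bytes.  Both nr_sects and get_capacity()
 * report 512-byte sectors, hence the shift by 9 below.
 */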
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	struct hd_struct *part = bdev->bd_part;

	/* The partition capacity referenced by the block device */
	if (part)
		return (part->nr_sects << 9);

	/* Otherwise assume the full device capacity */
	return (get_capacity(bdev->bd_disk) << 9);
}

static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
#endif
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance: the zfs elevator does all request ordering
 * and prioritization, while the Linux elevator is still free to do the
 * maximum front/back merging allowed by the physical device.  This yields
 * the largest possible requests for the device with the lowest total
 * overhead.
 */
static int
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = vd->vd_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	char *device = bdev->bd_disk->disk_name;
	int error;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return (0);

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return (0);

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return (0);

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs; this requires /bin/echo and sysfs to be
	 * mounted which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	{
		char *argv[] = { "/bin/sh", "-c", NULL, NULL };
		char *envp[] = { NULL };

		argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
		error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
		strfree(argv[2]);
	}
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);

	return (error);
}

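/*
 * Illustrative note: with device "sda" and elevator "noop", the pre-2.6.36
 * usermodehelper fallback above is roughly equivalent to running:
 *
 *	/bin/sh -c "exec 0</dev/null 1>/sys/block/sda/queue/scheduler \
 *	    2>/dev/null; echo noop"
 */
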
/*
 * Expanding a whole disk vdev involves invoking BLKRRPART on the
 * whole disk device. This poses a problem, because BLKRRPART will
 * return EBUSY if one of the disk's partitions is open. That's why
 * we have to do it here, just before opening the data partition.
 * Unfortunately, BLKRRPART works by dropping all partitions and
 * recreating them, which means that for a short time window, all
 * /dev/sdxN device files disappear (until udev recreates them).
 * This means two things:
 *  - When we open the data partition just after a BLKRRPART, we
 *    can't do it using the normal device file path because of the
 *    obvious race condition with udev. Instead, we use reliable
 *    kernel APIs to get a handle to the new partition device from
 *    the whole disk device.
 *  - Because vdev_disk_open() initially needs to find the device
 *    using its path, multiple vdev_disk_open() invocations in
 *    short succession on the same disk with BLKRRPARTs in the
 *    middle have a high probability of failure (because of the
 *    race condition with udev). A typical situation where this
 *    might happen is when the zpool userspace tool does a
 *    TRYIMPORT immediately followed by an IMPORT. For this
 *    reason, we only invoke BLKRRPART in the module when strictly
 *    necessary (zpool online -e case), and rely on userspace to
 *    do it when possible.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
	struct gendisk *disk;
	int error, partno;

	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (bdev);

	disk = get_gendisk(bdev->bd_dev, &partno);
	vdev_bdev_close(bdev, vdev_bdev_mode(mode));

	if (disk) {
		bdev = bdget(disk_devt(disk));
		if (bdev) {
			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
			if (error == 0)
				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
		}

		bdev = bdget_disk(disk, partno);
		if (bdev) {
			error = blkdev_get(bdev,
			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
			if (error == 0)
				result = bdev;
		}
		put_disk(disk);
	}

	return (result);
#else
	return (ERR_PTR(-EOPNOTSUPP));
#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
}
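
/*
 * Note: on kernels lacking the 3-argument blkdev_get() or get_gendisk(),
 * vdev_disk_rrpart() returns ERR_PTR(-EOPNOTSUPP) and vdev_disk_open()
 * below simply falls back to opening the device by its configured path.
 */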

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, -PTR_ERR(bdev), count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(-PTR_ERR(bdev)));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev);

	/* TODO: report possible expansion size */
	*max_psize = *psize;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
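	/*
	 * For example, a 4096-byte block_size gives highbit64(4096) - 1 = 12
	 * (ashift=12), while a 512-byte block_size gives ashift=9.
	 */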

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL)
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));

	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

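/*
 * Reference counting protocol: __vdev_disk_physio() takes one reference per
 * attached bio (each dropped by vdev_disk_physio_completion) plus one extra
 * reference held across bio submission.  When the count reaches zero the
 * dio_request is freed and the parent zio, if any, is completed.
 */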
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);
			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = -(bio->bi_error);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static inline unsigned long
bio_nr_pages(void *bio_ptr, unsigned int bio_size)
{
	return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >>
	    PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT));
}

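/*
 * bio_nr_pages() above returns the number of pages spanned by the buffer;
 * for example, a 16K buffer starting 512 bytes into a 4K page spans five
 * pages.  bio_map() below adds those pages to the bio and returns the
 * number of bytes which could not be mapped (0 if everything fit).
 */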
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network related block devices use tcp_sendpage, which
		 * doesn't behave well when given 0-count pages; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

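/*
 * Submit a bio with the calling thread's pending bio list (current->bio_tail
 * or current->bio_list, depending on the kernel) temporarily cleared, then
 * restore it.  The intent appears to be to keep bios issued here from being
 * queued behind whatever the caller has already deferred on that list.
 */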
static inline void
vdev_submit_bio(int rw, struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	submit_bio(rw, bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	submit_bio(rw, bio);
	current->bio_list = bio_list;
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
    size_t kbuf_size, uint64_t kbuf_offset, int flags)
{
	dio_request_t *dr;
	caddr_t bio_ptr;
	uint64_t bio_offset;
	int rw, bio_size, bio_count = 16;
	int i = 0, error = 0;

	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (ENOMEM);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	rw = flags;
	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
	bio_ptr = kbuf_ptr;
	bio_offset = kbuf_offset;
	bio_size = kbuf_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (ENOMEM);
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		dr->dr_bio[i]->bi_bdev = bdev;
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_rw = rw;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;

		/* Remaining size is returned to become the new size */
		bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);

		/* Advance in buffer and construct another bio if needed */
		bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(rw, dr->dr_bio[i]);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

#ifndef __linux__
int
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
    size_t size, uint64_t offset, int flags)
{
	bio_set_flags_failfast(bdev, &flags);
	return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags));
}
#endif

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	int rc = bio->bi_error;
#endif

	zio->io_error = -rc;
	if (rc && (rc == -EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

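/*
 * Flush the device write cache by submitting an empty bio (no data pages)
 * with the VDEV_WRITE_FLUSH_FUA flags.  Completion is reported
 * asynchronously via vdev_disk_io_flush_completion() above; a device which
 * returns EOPNOTSUPP has vdev_nowritecache set so cache flushes are not
 * retried against it.
 */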
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (ENXIO);

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (ENOMEM);

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio->bi_bdev = bdev;
	vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
	invalidate_bdev(bdev);

	return (0);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return;

			zio->io_error = error;
			if (error == ENOTSUP)
				v->vdev_nowritecache = B_TRUE;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = WRITE | (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = WRITE | REQ_UNPLUG;
#else
		flags = WRITE;
#endif
		break;

	case ZIO_TYPE_READ:
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = READ | (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = READ | REQ_UNPLUG;
#else
		flags = READ;
#endif
		break;

	default:
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
	    zio->io_size, zio->io_offset, flags);
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

#ifndef __linux__
/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
	struct block_device *bdev;
	vdev_label_t *label;
	uint64_t s, size;
	int i;

	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (-PTR_ERR(bdev));

	s = bdev_capacity(bdev);
	if (s == 0) {
		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
		return (EIO);
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = vmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	for (i = 0; i < VDEV_LABELS; i++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, i, 0);
		if (vdev_disk_physio(bdev, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	vmem_free(label, sizeof (vdev_label_t));
	vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));

	return (0);
}
#endif /* __linux__ */

module_param(zfs_vdev_scheduler, charp, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
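
/*
 * Example usage (illustrative): the scheduler may be chosen at module load
 * time or adjusted later through the parameter exported above, e.g.
 *
 *	modprobe zfs zfs_vdev_scheduler=noop
 *	echo noop > /sys/module/zfs/parameters/zfs_vdev_scheduler
 */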