/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 */
#ifndef _ZFS_BLKDEV_H
#define	_ZFS_BLKDEV_H

#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/backing-dev.h>
#include <linux/hdreg.h>
#include <linux/msdos_fs.h>	/* for SECTOR_* */
#ifndef HAVE_FMODE_T
typedef unsigned __bitwise__ fmode_t;
#endif /* HAVE_FMODE_T */
#ifndef HAVE_BLK_QUEUE_FLAG_SET
static inline void
blk_queue_flag_set(unsigned int flag, struct request_queue *q)
{
	queue_flag_set(flag, q);
}
#endif
#ifndef HAVE_BLK_QUEUE_FLAG_CLEAR
static inline void
blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
{
	queue_flag_clear(flag, q);
}
#endif
/*
 * 4.7 - 4.x API,
 * The blk_queue_write_cache() interface has replaced the blk_queue_flush()
 * interface.  However, the new interface is GPL-only, so we implement
 * our own trivial wrapper when the GPL-only version is detected.
 *
 * 2.6.36 - 4.6 API,
 * The blk_queue_flush() interface has replaced the blk_queue_ordered()
 * interface.  However, while the old interface was available to all
 * modules, the new one is GPL-only.  Thus if the GPL-only version is
 * detected we implement our own trivial helper.
 *
 * 2.6.x - 2.6.35 API,
 * Legacy blk_queue_ordered() interface.
 */
static inline void
blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
{
#if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY)
	if (wc)
		blk_queue_flag_set(QUEUE_FLAG_WC, q);
	else
		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
	if (fua)
		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
	else
		blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
#elif defined(HAVE_BLK_QUEUE_WRITE_CACHE)
	blk_queue_write_cache(q, wc, fua);
#elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY)
	if (wc)
		q->flush_flags |= REQ_FLUSH;
	if (fua)
		q->flush_flags |= REQ_FUA;
#elif defined(HAVE_BLK_QUEUE_FLUSH)
	blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0));
#else
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN, NULL);
#endif
}
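
/*
 * Usage sketch (hypothetical caller, not part of this header): enable
 * both the volatile write cache and FUA support on a device's queue,
 * regardless of which kernel interface is compiled in.
 *
 *	struct request_queue *q = bdev_get_queue(bdev);
 *	blk_queue_set_write_cache(q, true, true);
 */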
/*
 * Most of the blk_* macros were removed in 2.6.36.  Ostensibly this was
 * done to improve readability and allow easier grepping.  However, from
 * a portability standpoint the macros are helpful.  Therefore the needed
 * macros are redefined here if they are missing from the kernel.
 */
#ifndef blk_fs_request
#define	blk_fs_request(rq)	((rq)->cmd_type == REQ_TYPE_FS)
#endif
/*
 * 2.6.27 API change,
 * The blk_queue_stackable() queue flag was added in 2.6.27 to handle dm
 * stacking drivers.  Prior to this, request stacking drivers were detected
 * by checking (q->request_fn == NULL); for earlier kernels we revert to
 * this legacy behavior.
 */
#ifndef blk_queue_stackable
#define	blk_queue_stackable(q)	((q)->request_fn == NULL)
#endif
/*
 * 2.6.34 API change,
 * The blk_queue_max_hw_sectors() function replaces blk_queue_max_sectors().
 */
#ifndef HAVE_BLK_QUEUE_MAX_HW_SECTORS
#define	blk_queue_max_hw_sectors __blk_queue_max_hw_sectors
static inline void
__blk_queue_max_hw_sectors(struct request_queue *q,
    unsigned int max_hw_sectors)
{
	blk_queue_max_sectors(q, max_hw_sectors);
}
#endif
/*
 * 2.6.34 API change,
 * The blk_queue_max_segments() function consolidates
 * blk_queue_max_hw_segments() and blk_queue_max_phys_segments().
 */
#ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS
#define	blk_queue_max_segments __blk_queue_max_segments
static inline void
__blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
{
	blk_queue_max_phys_segments(q, max_segments);
	blk_queue_max_hw_segments(q, max_segments);
}
#endif
static inline void
blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
{
#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
	q->backing_dev_info->ra_pages = ra_pages;
#else
	q->backing_dev_info.ra_pages = ra_pages;
#endif
}
#ifndef HAVE_GET_DISK_AND_MODULE
static inline struct kobject *
get_disk_and_module(struct gendisk *disk)
{
	return (get_disk(disk));
}
#endif /* HAVE_GET_DISK_AND_MODULE */
#ifndef HAVE_GET_DISK_RO
static inline int
get_disk_ro(struct gendisk *disk)
{
	int policy = 0;

	if (disk->part[0])
		policy = disk->part[0]->policy;

	return (policy);
}
#endif /* HAVE_GET_DISK_RO */
#ifdef HAVE_BIO_BVEC_ITER
#define	BIO_BI_SECTOR(bio)	(bio)->bi_iter.bi_sector
#define	BIO_BI_SIZE(bio)	(bio)->bi_iter.bi_size
#define	BIO_BI_IDX(bio)		(bio)->bi_iter.bi_idx
#define	BIO_BI_SKIP(bio)	(bio)->bi_iter.bi_bvec_done
#define	bio_for_each_segment4(bv, bvp, b, i)	\
	bio_for_each_segment((bv), (b), (i))
typedef struct bvec_iter bvec_iterator_t;
#else
#define	BIO_BI_SECTOR(bio)	(bio)->bi_sector
#define	BIO_BI_SIZE(bio)	(bio)->bi_size
#define	BIO_BI_IDX(bio)		(bio)->bi_idx
#define	BIO_BI_SKIP(bio)	(0)
#define	bio_for_each_segment4(bv, bvp, b, i)	\
	bio_for_each_segment((bvp), (b), (i))
typedef int bvec_iterator_t;
#endif /* HAVE_BIO_BVEC_ITER */
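
/*
 * Usage sketch (hypothetical, not part of this header): read a bio's
 * starting offset and length in bytes without caring which iterator
 * layout the running kernel provides.
 *
 *	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
 *	uint64_t size = BIO_BI_SIZE(bio);
 */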
/*
 * Portable helper for correctly setting the FAILFAST flags.  The
 * correct usage has changed 3 times from 2.6.12 to 2.6.38.
 */
static inline void
bio_set_flags_failfast(struct block_device *bdev, int *flags)
{
#ifdef CONFIG_BUG
	/*
	 * Disable FAILFAST for loopback devices because of the
	 * following incorrect BUG_ON() in loop_make_request().
	 * This support is also disabled for md devices because the
	 * test suite layers md devices on top of loopback devices.
	 * This may be removed when the loopback driver is fixed.
	 *
	 *   BUG_ON(!lo || (rw != READ && rw != WRITE));
	 */
	if ((MAJOR(bdev->bd_dev) == LOOP_MAJOR) ||
	    (MAJOR(bdev->bd_dev) == MD_MAJOR))
		return;

#ifdef BLOCK_EXT_MAJOR
	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		return;
#endif /* BLOCK_EXT_MAJOR */
#endif /* CONFIG_BUG */

#if defined(HAVE_BIO_RW_FAILFAST_DTD)
	/* BIO_RW_FAILFAST_* preferred interface from 2.6.28 - 2.6.35 */
	*flags |= (
	    (1 << BIO_RW_FAILFAST_DEV) |
	    (1 << BIO_RW_FAILFAST_TRANSPORT) |
	    (1 << BIO_RW_FAILFAST_DRIVER));
#elif defined(HAVE_REQ_FAILFAST_MASK)
	/*
	 * REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx,
	 * the BIO_* and REQ_* flags were unified under REQ_* flags.
	 */
	*flags |= REQ_FAILFAST_MASK;
#else
#error "Undefined block IO FAILFAST interface."
#endif
}
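
/*
 * Usage sketch (hypothetical, not part of this header): compute the
 * FAILFAST flags once for a device and OR them into each bio, where rw
 * is the bio's read/write flag.
 *
 *	int flags = 0;
 *	bio_set_flags_failfast(bdev, &flags);
 *	bio_set_op_attrs(bio, rw, flags);
 */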
/*
 * Maximum disk label length; it may be undefined for some kernels.
 */
#ifndef DISK_NAME_LEN
#define	DISK_NAME_LEN	32
#endif /* DISK_NAME_LEN */
#ifdef HAVE_BIO_BI_STATUS
static inline int
bi_status_to_errno(blk_status_t status)
{
	switch (status) {
	case BLK_STS_OK:
		return (0);
	case BLK_STS_NOTSUPP:
		return (EOPNOTSUPP);
	case BLK_STS_TIMEOUT:
		return (ETIMEDOUT);
	case BLK_STS_NOSPC:
		return (ENOSPC);
	case BLK_STS_TRANSPORT:
		return (ENOLINK);
	case BLK_STS_TARGET:
		return (EREMOTEIO);
	case BLK_STS_NEXUS:
		return (EBADE);
	case BLK_STS_MEDIUM:
		return (ENODATA);
	case BLK_STS_PROTECTION:
		return (EILSEQ);
	case BLK_STS_RESOURCE:
		return (ENOMEM);
	case BLK_STS_AGAIN:
		return (EAGAIN);
	case BLK_STS_IOERR:
		return (EIO);
	default:
		return (EIO);
	}
}

static inline blk_status_t
errno_to_bi_status(int error)
{
	switch (error) {
	case 0:
		return (BLK_STS_OK);
	case EOPNOTSUPP:
		return (BLK_STS_NOTSUPP);
	case ETIMEDOUT:
		return (BLK_STS_TIMEOUT);
	case ENOSPC:
		return (BLK_STS_NOSPC);
	case ENOLINK:
		return (BLK_STS_TRANSPORT);
	case EREMOTEIO:
		return (BLK_STS_TARGET);
	case EBADE:
		return (BLK_STS_NEXUS);
	case ENODATA:
		return (BLK_STS_MEDIUM);
	case EILSEQ:
		return (BLK_STS_PROTECTION);
	case ENOMEM:
		return (BLK_STS_RESOURCE);
	case EAGAIN:
		return (BLK_STS_AGAIN);
	case EIO:
		return (BLK_STS_IOERR);
	default:
		return (BLK_STS_IOERR);
	}
}
#endif /* HAVE_BIO_BI_STATUS */
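
/*
 * Usage sketch (hypothetical, not part of this header): round-trip a
 * bio completion status through an errno value.
 *
 *	int error = bi_status_to_errno(bio->bi_status);
 *	bio->bi_status = errno_to_bi_status(error);
 */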
/*
 * 4.3 API change
 * The bio_endio() prototype changed slightly.  These are helper
 * macros to ensure the prototype and invocation are handled.
 */
#ifdef HAVE_1ARG_BIO_END_IO_T
#ifdef HAVE_BIO_BI_STATUS
#define	BIO_END_IO_ERROR(bio)		bi_status_to_errno(bio->bi_status)
#define	BIO_END_IO_PROTO(fn, x, z)	static void fn(struct bio *x)
#define	BIO_END_IO(bio, error)		bio_set_bi_status(bio, error)
static inline void
bio_set_bi_status(struct bio *bio, int error)
{
	ASSERT3S(error, <=, 0);
	bio->bi_status = errno_to_bi_status(-error);
	bio_endio(bio);
}
#else
#define	BIO_END_IO_ERROR(bio)		(-(bio->bi_error))
#define	BIO_END_IO_PROTO(fn, x, z)	static void fn(struct bio *x)
#define	BIO_END_IO(bio, error)		bio_set_bi_error(bio, error)
static inline void
bio_set_bi_error(struct bio *bio, int error)
{
	ASSERT3S(error, <=, 0);
	bio->bi_error = error;
	bio_endio(bio);
}
#endif /* HAVE_BIO_BI_STATUS */
#else
#define	BIO_END_IO_PROTO(fn, x, z)	static void fn(struct bio *x, int z)
#define	BIO_END_IO(bio, error)		bio_endio(bio, error);
#endif /* HAVE_1ARG_BIO_END_IO_T */
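
/*
 * Usage sketch (hypothetical callback, not part of this header): declare
 * a completion handler and finish the bio with an error, independent of
 * the bio_endio() prototype in use.
 *
 *	BIO_END_IO_PROTO(my_bio_done, bio, error)
 *	{
 *		BIO_END_IO(bio, -EIO);
 *	}
 */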
/*
 * 2.6.38 - 2.6.x API,
 *   blkdev_get_by_path()
 *   blkdev_put()
 *
 * 2.6.28 - 2.6.37 API,
 *   open_bdev_exclusive()
 *   close_bdev_exclusive()
 *
 * 2.6.12 - 2.6.27 API,
 *   open_bdev_excl()
 *   close_bdev_excl()
 *
 * Used to exclusively open a block device from within the kernel.
 */
#if defined(HAVE_BLKDEV_GET_BY_PATH)
#define	vdev_bdev_open(path, md, hld)	blkdev_get_by_path(path, \
					    (md) | FMODE_EXCL, hld)
#define	vdev_bdev_close(bdev, md)	blkdev_put(bdev, (md) | FMODE_EXCL)
#elif defined(HAVE_OPEN_BDEV_EXCLUSIVE)
#define	vdev_bdev_open(path, md, hld)	open_bdev_exclusive(path, md, hld)
#define	vdev_bdev_close(bdev, md)	close_bdev_exclusive(bdev, md)
#else
#define	vdev_bdev_open(path, md, hld)	open_bdev_excl(path, md, hld)
#define	vdev_bdev_close(bdev, md)	close_bdev_excl(bdev)
#endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */
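
/*
 * Usage sketch (hypothetical, not part of this header): exclusively open
 * a disk for a vdev and close it again, using the VDEV_HOLDER cookie
 * defined later in this header.
 *
 *	struct block_device *bdev;
 *
 *	bdev = vdev_bdev_open("/dev/sda", FMODE_READ | FMODE_WRITE,
 *	    VDEV_HOLDER);
 *	if (IS_ERR(bdev))
 *		return (PTR_ERR(bdev));
 *	vdev_bdev_close(bdev, FMODE_READ | FMODE_WRITE);
 */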
/*
 * 2.6.22 API change
 * The function invalidate_bdev() lost its second argument because
 * it was unused.
 */
#ifdef HAVE_1ARG_INVALIDATE_BDEV
#define	vdev_bdev_invalidate(bdev)	invalidate_bdev(bdev)
#else
#define	vdev_bdev_invalidate(bdev)	invalidate_bdev(bdev, 1)
#endif /* HAVE_1ARG_INVALIDATE_BDEV */
/*
 * 2.6.27 API change
 * lookup_bdev() was exported for use; prior to this it existed but the
 * symbol was not exported.
 *
 * 4.4.0-6.21 API change for Ubuntu
 * lookup_bdev() gained a second argument, FMODE_*, to check inode
 * permissions.
 */
#ifdef HAVE_1ARG_LOOKUP_BDEV
#define	vdev_lookup_bdev(path)	lookup_bdev(path)
#else
#ifdef HAVE_2ARGS_LOOKUP_BDEV
#define	vdev_lookup_bdev(path)	lookup_bdev(path, 0)
#else
#define	vdev_lookup_bdev(path)	ERR_PTR(-ENOTSUP)
#endif /* HAVE_2ARGS_LOOKUP_BDEV */
#endif /* HAVE_1ARG_LOOKUP_BDEV */
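
/*
 * Usage sketch (hypothetical, not part of this header): check whether a
 * path names an existing block device, dropping the reference on success.
 *
 *	struct block_device *bdev = vdev_lookup_bdev("/dev/sdb");
 *	if (!IS_ERR(bdev))
 *		bdput(bdev);
 */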
/*
 * 2.6.30 API change
 * To ensure good performance, preferentially use the physical block size
 * for proper alignment.  The physical size is supposed to be the internal
 * sector size used by the device.  This is often 4096 bytes for AF devices,
 * while a smaller 512 byte logical size is supported for compatibility.
 *
 * Unfortunately, many drives still misreport their physical sector size.
 * For devices which are known to lie you may need to manually set this
 * at pool creation time with 'zpool create -o ashift=12 ...'.
 *
 * When the physical block size interface isn't available, we fall back to
 * the logical block size interface and then the older hard sector size.
 */
#ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE
#define	vdev_bdev_block_size(bdev)	bdev_physical_block_size(bdev)
#else
#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE
#define	vdev_bdev_block_size(bdev)	bdev_logical_block_size(bdev)
#else
#define	vdev_bdev_block_size(bdev)	bdev_hardsect_size(bdev)
#endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */
#endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */
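
/*
 * Usage sketch (hypothetical, not part of this header): derive a vdev
 * ashift from the reported block size; a 4096-byte physical sector
 * yields ashift=12.
 *
 *	unsigned int bsize = vdev_bdev_block_size(bdev);
 *	uint64_t ashift = highbit64(MAX(bsize, SPA_MINBLOCKSIZE)) - 1;
 */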
#ifndef HAVE_BIO_SET_OP_ATTRS
/*
 * Kernels without bio_set_op_attrs use bi_rw for the bio flags.
 */
static inline void
bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags)
{
	bio->bi_rw |= rw | flags;
}
#endif
/*
 * bio_set_flush - Set the appropriate flags in a bio to guarantee
 * data are on non-volatile media on completion.
 *
 * 2.6.X - 2.6.36 API,
 *   WRITE_BARRIER - Tells the block layer to commit all previously submitted
 *   writes to stable storage before this one is started and that the current
 *   write is on stable storage upon completion.  Also prevents reordering
 *   on both sides of the current operation.
 *
 * 2.6.37 - 4.8 API,
 *   Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a
 *   replacement for WRITE_BARRIER to allow expressing richer semantics
 *   to the block layer.  It's up to the block layer to implement the
 *   semantics correctly.  Use the WRITE_FLUSH_FUA flag combination.
 *
 * 4.8 - 4.9 API,
 *   REQ_FLUSH was renamed to REQ_PREFLUSH.  For consistency with previous
 *   ZoL releases, prefer the WRITE_FLUSH_FUA flag set if it's available.
 *
 * 4.10 API,
 *   The read/write flags and their modifiers, including WRITE_FLUSH,
 *   WRITE_FUA and WRITE_FLUSH_FUA, were removed from fs.h in
 *   torvalds/linux@70fd7614 and replaced by direct flag modification
 *   of the REQ_ flags in bio->bi_opf.  Use REQ_PREFLUSH.
 */
static inline void
bio_set_flush(struct bio *bio)
{
#if defined(REQ_PREFLUSH)	/* >= 4.10 */
	bio_set_op_attrs(bio, 0, REQ_PREFLUSH);
#elif defined(WRITE_FLUSH_FUA)	/* >= 2.6.37 and <= 4.9 */
	bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA);
#elif defined(WRITE_BARRIER)	/* < 2.6.37 */
	bio_set_op_attrs(bio, 0, WRITE_BARRIER);
#else
#error	"Allowing the build will cause bio_set_flush requests to be ignored."
#endif
}
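
/*
 * Usage sketch (hypothetical, not part of this header): mark an empty
 * bio as a cache flush before submission; my_flush_done and
 * my_submit_bio stand in for a real completion callback and a
 * kernel-version-appropriate submission wrapper.
 *
 *	bio->bi_end_io = my_flush_done;
 *	bio_set_flush(bio);
 *	my_submit_bio(bio);
 */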
/*
 * 4.8 - 4.x API,
 *   REQ_OP_FLUSH
 *
 * 4.8-rc0 - 4.8-rc1,
 *   REQ_PREFLUSH
 *
 * 2.6.36 - 4.7 API,
 *   REQ_FLUSH
 *
 * 2.6.x - 2.6.35 API,
 *   HAVE_BIO_RW_BARRIER
 *
 * Used to determine if a cache flush has been requested.  This check has
 * been left intentionally broad in order to cover both a legacy flush
 * and the new preflush behavior introduced in Linux 4.8.  This is correct
 * in all cases but may have a performance impact for some kernels.  It
 * has the advantage of minimizing kernel specific changes in the zvol code.
 */
static inline boolean_t
bio_is_flush(struct bio *bio)
{
#if defined(HAVE_REQ_OP_FLUSH) && defined(HAVE_BIO_BI_OPF)
	return ((bio_op(bio) == REQ_OP_FLUSH) || (bio->bi_opf & REQ_PREFLUSH));
#elif defined(REQ_PREFLUSH) && defined(HAVE_BIO_BI_OPF)
	return (bio->bi_opf & REQ_PREFLUSH);
#elif defined(REQ_PREFLUSH) && !defined(HAVE_BIO_BI_OPF)
	return (bio->bi_rw & REQ_PREFLUSH);
#elif defined(REQ_FLUSH)
	return (bio->bi_rw & REQ_FLUSH);
#elif defined(HAVE_BIO_RW_BARRIER)
	return (bio->bi_rw & (1 << BIO_RW_BARRIER));
#else
#error	"Allowing the build will cause flush requests to be ignored."
#endif
}
/*
 * 4.8 - 4.x API,
 *   REQ_FUA flag moved to bio->bi_opf
 *
 * 2.6.x - 4.7 API,
 *   REQ_FUA
 */
static inline boolean_t
bio_is_fua(struct bio *bio)
{
#if defined(HAVE_BIO_BI_OPF)
	return (bio->bi_opf & REQ_FUA);
#elif defined(REQ_FUA)
	return (bio->bi_rw & REQ_FUA);
#else
#error	"Allowing the build will cause fua requests to be ignored."
#endif
}
/*
 * 4.8 - 4.x API,
 *   REQ_OP_DISCARD
 *
 * 2.6.36 - 4.7 API,
 *   REQ_DISCARD
 *
 * 2.6.28 - 2.6.35 API,
 *   BIO_RW_DISCARD
 *
 * In all cases the normal I/O path is used for discards.  The only
 * difference is how the kernel tags individual I/Os as discards.
 *
 * Note that 2.6.32 era kernels provide both BIO_RW_DISCARD and REQ_DISCARD,
 * where BIO_RW_DISCARD is the correct interface.  Therefore, it is important
 * that the HAVE_BIO_RW_DISCARD check occur before the REQ_DISCARD check.
 */
static inline boolean_t
bio_is_discard(struct bio *bio)
{
#if defined(HAVE_REQ_OP_DISCARD)
	return (bio_op(bio) == REQ_OP_DISCARD);
#elif defined(HAVE_BIO_RW_DISCARD)
	return (bio->bi_rw & (1 << BIO_RW_DISCARD));
#elif defined(REQ_DISCARD)
	return (bio->bi_rw & REQ_DISCARD);
#else
/* Discards would become writes, potentially triggering the
 * DMU_MAX_ACCESS assertion. */
#error	"Allowing the build will cause discard requests to become writes."
#endif
}
/*
 * 4.8 - 4.x API,
 *   REQ_OP_SECURE_ERASE
 *
 * 2.6.36 - 4.7 API,
 *   REQ_SECURE
 *
 * 2.6.x - 2.6.35 API,
 *   Unsupported by kernel
 */
static inline boolean_t
bio_is_secure_erase(struct bio *bio)
{
#if defined(HAVE_REQ_OP_SECURE_ERASE)
	return (bio_op(bio) == REQ_OP_SECURE_ERASE);
#elif defined(REQ_SECURE)
	return (bio->bi_rw & REQ_SECURE);
#else
	return (0);
#endif
}
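
/*
 * Usage sketch (hypothetical, not part of this header): dispatch an
 * incoming zvol bio using the portable predicates above; the zvol_*
 * handlers are stand-ins.
 *
 *	if (bio_is_flush(bio))
 *		zvol_flush(bio);
 *	else if (bio_is_discard(bio) || bio_is_secure_erase(bio))
 *		zvol_discard(bio);
 *	else
 *		zvol_rw(bio);
 */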
/*
 * 2.6.33 API change
 * Discard granularity and alignment restrictions may now be set.  For
 * older kernels which do not support this it is safe to skip it.
 */
#ifdef HAVE_DISCARD_GRANULARITY
static inline void
blk_queue_discard_granularity(struct request_queue *q, unsigned int dg)
{
	q->limits.discard_granularity = dg;
}
#else
#define	blk_queue_discard_granularity(x, dg)	((void)0)
#endif /* HAVE_DISCARD_GRANULARITY */
/*
 * Default Linux IO Scheduler,
 * Setting the scheduler to noop will allow the Linux IO scheduler to
 * still perform front and back merging, while leaving the request
 * ordering and prioritization to the ZFS IO scheduler.
 */
#define	VDEV_SCHEDULER			"noop"
/*
 * A common holder for vdev_bdev_open() is used to relax the exclusive open
 * semantics slightly.  Internal vdev disk callers may pass VDEV_HOLDER to
 * allow them to open the device multiple times.  Other kernel callers and
 * user space processes which don't pass this value will get EBUSY.  This is
 * currently required for the correct operation of hot spares.
 */
#define	VDEV_HOLDER			((void *)0x2401de7)
static inline void
blk_generic_start_io_acct(struct request_queue *q, int rw,
    unsigned long sectors, struct hd_struct *part)
{
#if defined(HAVE_GENERIC_IO_ACCT_3ARG)
	generic_start_io_acct(rw, sectors, part);
#elif defined(HAVE_GENERIC_IO_ACCT_4ARG)
	generic_start_io_acct(q, rw, sectors, part);
#endif
}
static inline void
blk_generic_end_io_acct(struct request_queue *q, int rw,
    struct hd_struct *part, unsigned long start_time)
{
#if defined(HAVE_GENERIC_IO_ACCT_3ARG)
	generic_end_io_acct(rw, part, start_time);
#elif defined(HAVE_GENERIC_IO_ACCT_4ARG)
	generic_end_io_acct(q, rw, part, start_time);
#endif
}
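
/*
 * Usage sketch (hypothetical, not part of this header): account the
 * service time of a bio around submission and completion.
 *
 *	unsigned long start_time = jiffies;
 *	blk_generic_start_io_acct(q, WRITE, bio_sectors(bio), &disk->part0);
 *	... submit the bio and wait for completion ...
 *	blk_generic_end_io_acct(q, WRITE, &disk->part0, start_time);
 */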
#endif /* _ZFS_BLKDEV_H */