]>
Commit | Line | Data |
---|---|---|
60101509 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
60101509 BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
1eacf2b3 | 26 | * Copyright (c) 2012, 2019 by Delphix. All rights reserved. |
06a19602 | 27 | * Copyright (c) 2023, 2024, Klara Inc. |
60101509 BB |
28 | */ |
29 | ||
30 | #include <sys/zfs_context.h> | |
e771de53 | 31 | #include <sys/spa_impl.h> |
60101509 BB |
32 | #include <sys/vdev_disk.h> |
33 | #include <sys/vdev_impl.h> | |
1b939560 | 34 | #include <sys/vdev_trim.h> |
a6255b7f | 35 | #include <sys/abd.h> |
60101509 BB |
36 | #include <sys/fs/zfs.h> |
37 | #include <sys/zio.h> | |
8e82ffba | 38 | #include <linux/blkpg.h> |
74d42600 | 39 | #include <linux/msdos_fs.h> |
05805494 | 40 | #include <linux/vfs_compat.h> |
1e767532 CK |
41 | #ifdef HAVE_LINUX_BLK_CGROUP_HEADER |
42 | #include <linux/blk-cgroup.h> | |
43 | #endif | |
60101509 | 44 | |
386d6a75 RN |
45 | /* |
46 | * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying | |
47 | * block_device. Since it carries the block_device inside, its convenient to | |
48 | * just use the handle as a proxy. For pre-6.8, we just emulate this with | |
49 | * a cast, since we don't need any of the other fields inside the handle. | |
50 | */ | |
51 | #ifdef HAVE_BDEV_OPEN_BY_PATH | |
52 | typedef struct bdev_handle zfs_bdev_handle_t; | |
53 | #define BDH_BDEV(bdh) ((bdh)->bdev) | |
54 | #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) | |
55 | #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) | |
56 | #define BDH_ERR_PTR(err) (ERR_PTR(err)) | |
57 | #else | |
58 | typedef void zfs_bdev_handle_t; | |
59 | #define BDH_BDEV(bdh) ((struct block_device *)bdh) | |
60 | #define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh))) | |
61 | #define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh))) | |
62 | #define BDH_ERR_PTR(err) (ERR_PTR(err)) | |
63 | #endif | |
64 | ||
d366c8fd | 65 | typedef struct vdev_disk { |
386d6a75 | 66 | zfs_bdev_handle_t *vd_bdh; |
d366c8fd JL |
67 | krwlock_t vd_lock; |
68 | } vdev_disk_t; | |
69 | ||
06a19602 RN |
70 | /* |
71 | * Maximum number of segments to add to a bio (min 4). If this is higher than | |
72 | * the maximum allowed by the device queue or the kernel itself, it will be | |
73 | * clamped. Setting it to zero will cause the kernel's ideal size to be used. | |
74 | */ | |
75 | uint_t zfs_vdev_disk_max_segs = 0; | |
76 | ||
a25861dc BB |
77 | /* |
78 | * Unique identifier for the exclusive vdev holder. | |
79 | */ | |
8128bd89 | 80 | static void *zfs_vdev_holder = VDEV_HOLDER; |
6839eed2 | 81 | |
a25861dc BB |
82 | /* |
83 | * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the | |
84 | * device is missing. The missing path may be transient since the links | |
85 | * can be briefly removed and recreated in response to udev events. | |
86 | */ | |
f66ffe68 | 87 | static uint_t zfs_vdev_open_timeout_ms = 1000; |
a25861dc BB |
88 | |
89 | /* | |
90 | * Size of the "reserved" partition, in blocks. | |
91 | */ | |
74d42600 SH |
92 | #define EFI_MIN_RESV_SIZE (16 * 1024) |
93 | ||
16f0fdad MZ |
94 | /* |
95 | * BIO request failfast mask. | |
96 | */ | |
97 | ||
98 | static unsigned int zfs_vdev_failfast_mask = 1; | |
99 | ||
43e8f6e3 CK |
100 | #ifdef HAVE_BLK_MODE_T |
101 | static blk_mode_t | |
102 | #else | |
60101509 | 103 | static fmode_t |
43e8f6e3 | 104 | #endif |
233d34e4 | 105 | vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) |
60101509 | 106 | { |
43e8f6e3 CK |
107 | #ifdef HAVE_BLK_MODE_T |
108 | blk_mode_t mode = 0; | |
109 | ||
110 | if (spa_mode & SPA_MODE_READ) | |
111 | mode |= BLK_OPEN_READ; | |
112 | ||
113 | if (spa_mode & SPA_MODE_WRITE) | |
114 | mode |= BLK_OPEN_WRITE; | |
233d34e4 BB |
115 | |
116 | if (exclusive) | |
117 | mode |= BLK_OPEN_EXCL; | |
43e8f6e3 | 118 | #else |
60101509 BB |
119 | fmode_t mode = 0; |
120 | ||
da92d5cb | 121 | if (spa_mode & SPA_MODE_READ) |
60101509 BB |
122 | mode |= FMODE_READ; |
123 | ||
da92d5cb | 124 | if (spa_mode & SPA_MODE_WRITE) |
60101509 | 125 | mode |= FMODE_WRITE; |
233d34e4 BB |
126 | |
127 | if (exclusive) | |
128 | mode |= FMODE_EXCL; | |
43e8f6e3 | 129 | #endif |
60101509 | 130 | |
d1d7e268 | 131 | return (mode); |
60101509 | 132 | } |
60101509 | 133 | |
d441e85d BB |
134 | /* |
135 | * Returns the usable capacity (in bytes) for the partition or disk. | |
136 | */ | |
60101509 | 137 | static uint64_t |
d441e85d | 138 | bdev_capacity(struct block_device *bdev) |
60101509 | 139 | { |
d441e85d BB |
140 | return (i_size_read(bdev->bd_inode)); |
141 | } | |
60101509 | 142 | |
72ba4b2a BB |
143 | #if !defined(HAVE_BDEV_WHOLE) |
144 | static inline struct block_device * | |
145 | bdev_whole(struct block_device *bdev) | |
146 | { | |
147 | return (bdev->bd_contains); | |
148 | } | |
149 | #endif | |
150 | ||
bebdf52a BB |
151 | #if defined(HAVE_BDEVNAME) |
152 | #define vdev_bdevname(bdev, name) bdevname(bdev, name) | |
153 | #else | |
154 | static inline void | |
155 | vdev_bdevname(struct block_device *bdev, char *name) | |
156 | { | |
157 | snprintf(name, BDEVNAME_SIZE, "%pg", bdev); | |
158 | } | |
159 | #endif | |
160 | ||
d441e85d BB |
161 | /* |
162 | * Returns the maximum expansion capacity of the block device (in bytes). | |
163 | * | |
164 | * It is possible to expand a vdev when it has been created as a wholedisk | |
165 | * and the containing block device has increased in capacity. Or when the | |
166 | * partition containing the pool has been manually increased in size. | |
167 | * | |
168 | * This function is only responsible for calculating the potential expansion | |
169 | * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is | |
170 | * responsible for verifying the expected partition layout in the wholedisk | |
171 | * case, and updating the partition table if appropriate. Once the partition | |
172 | * size has been increased the additional capacity will be visible using | |
173 | * bdev_capacity(). | |
0c637f31 | 174 | * |
175 | * The returned maximum expansion capacity is always expected to be larger, or | |
176 | * at the very least equal, to its usable capacity to prevent overestimating | |
177 | * the pool expandsize. | |
d441e85d BB |
178 | */ |
179 | static uint64_t | |
180 | bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) | |
181 | { | |
182 | uint64_t psize; | |
183 | int64_t available; | |
184 | ||
72ba4b2a | 185 | if (wholedisk && bdev != bdev_whole(bdev)) { |
74d42600 | 186 | /* |
d441e85d BB |
187 | * When reporting maximum expansion capacity for a wholedisk |
188 | * deduct any capacity which is expected to be lost due to | |
189 | * alignment restrictions. Over reporting this value isn't | |
190 | * harmful and would only result in slightly less capacity | |
191 | * than expected post expansion. | |
0c637f31 | 192 | * The estimated available space may be slightly smaller than |
193 | * bdev_capacity() for devices where the number of sectors is | |
194 | * not a multiple of the alignment size and the partition layout | |
195 | * is keeping less than PARTITION_END_ALIGNMENT bytes after the | |
196 | * "reserved" EFI partition: in such cases return the device | |
197 | * usable capacity. | |
74d42600 | 198 | */ |
72ba4b2a | 199 | available = i_size_read(bdev_whole(bdev)->bd_inode) - |
d441e85d BB |
200 | ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + |
201 | PARTITION_END_ALIGNMENT) << SECTOR_BITS); | |
0c637f31 | 202 | psize = MAX(available, bdev_capacity(bdev)); |
74d42600 | 203 | } else { |
d441e85d | 204 | psize = bdev_capacity(bdev); |
74d42600 | 205 | } |
d441e85d BB |
206 | |
207 | return (psize); | |
60101509 BB |
208 | } |
209 | ||
d148e951 BB |
210 | static void |
211 | vdev_disk_error(zio_t *zio) | |
212 | { | |
c71c8c71 | 213 | /* |
214 | * This function can be called in interrupt context, for instance while | |
215 | * handling IRQs coming from a misbehaving disk device; use printk() | |
216 | * which is safe from any context. | |
217 | */ | |
218 | printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " | |
4938d01d | 219 | "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), |
c71c8c71 | 220 | zio->io_vd->vdev_path, zio->io_error, zio->io_type, |
221 | (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, | |
222 | zio->io_flags); | |
d148e951 BB |
223 | } |
224 | ||
55c12724 AH |
225 | static void |
226 | vdev_disk_kobj_evt_post(vdev_t *v) | |
227 | { | |
228 | vdev_disk_t *vd = v->vdev_tsd; | |
386d6a75 RN |
229 | if (vd && vd->vd_bdh) { |
230 | spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); | |
55c12724 AH |
231 | } else { |
232 | vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", | |
233 | v->vdev_path); | |
234 | } | |
235 | } | |
236 | ||
386d6a75 RN |
237 | static zfs_bdev_handle_t * |
238 | vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) | |
43e8f6e3 | 239 | { |
386d6a75 RN |
240 | #if defined(HAVE_BDEV_OPEN_BY_PATH) |
241 | return (bdev_open_by_path(path, | |
242 | vdev_bdev_mode(mode, B_TRUE), holder, NULL)); | |
243 | #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) | |
43e8f6e3 | 244 | return (blkdev_get_by_path(path, |
386d6a75 | 245 | vdev_bdev_mode(mode, B_TRUE), holder, NULL)); |
43e8f6e3 CK |
246 | #else |
247 | return (blkdev_get_by_path(path, | |
233d34e4 | 248 | vdev_bdev_mode(mode, B_TRUE), holder)); |
43e8f6e3 CK |
249 | #endif |
250 | } | |
251 | ||
252 | static void | |
386d6a75 | 253 | vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder) |
43e8f6e3 | 254 | { |
386d6a75 RN |
255 | #if defined(HAVE_BDEV_RELEASE) |
256 | return (bdev_release(bdh)); | |
257 | #elif defined(HAVE_BLKDEV_PUT_HOLDER) | |
258 | return (blkdev_put(BDH_BDEV(bdh), holder)); | |
43e8f6e3 | 259 | #else |
386d6a75 RN |
260 | return (blkdev_put(BDH_BDEV(bdh), |
261 | vdev_bdev_mode(mode, B_TRUE))); | |
43e8f6e3 CK |
262 | #endif |
263 | } | |
264 | ||
60101509 | 265 | static int |
1bd201e7 | 266 | vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, |
6fe3498c | 267 | uint64_t *logical_ashift, uint64_t *physical_ashift) |
60101509 | 268 | { |
386d6a75 | 269 | zfs_bdev_handle_t *bdh; |
43e8f6e3 | 270 | #ifdef HAVE_BLK_MODE_T |
233d34e4 | 271 | blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); |
43e8f6e3 | 272 | #else |
233d34e4 | 273 | fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); |
43e8f6e3 | 274 | #endif |
a25861dc | 275 | hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); |
60101509 | 276 | vdev_disk_t *vd; |
60101509 BB |
277 | |
278 | /* Must have a pathname and it must be absolute. */ | |
279 | if (v->vdev_path == NULL || v->vdev_path[0] != '/') { | |
280 | v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
d441e85d | 281 | vdev_dbgmsg(v, "invalid vdev_path"); |
2d82ea8b | 282 | return (SET_ERROR(EINVAL)); |
60101509 BB |
283 | } |
284 | ||
0d8103d9 | 285 | /* |
d441e85d | 286 | * Reopen the device if it is currently open. When expanding a |
8e82ffba GW |
287 | * partition force re-scanning the partition table if userland |
288 | * did not take care of this already. We need to do this while closed | |
d441e85d BB |
289 | * in order to get an accurate updated block device size. Then |
290 | * since udev may need to recreate the device links increase the | |
a25861dc | 291 | * open retry timeout before reporting the device as unavailable. |
0d8103d9 | 292 | */ |
d441e85d BB |
293 | vd = v->vdev_tsd; |
294 | if (vd) { | |
295 | char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; | |
296 | boolean_t reread_part = B_FALSE; | |
0d8103d9 | 297 | |
d441e85d | 298 | rw_enter(&vd->vd_lock, RW_WRITER); |
386d6a75 RN |
299 | bdh = vd->vd_bdh; |
300 | vd->vd_bdh = NULL; | |
d441e85d | 301 | |
386d6a75 RN |
302 | if (bdh) { |
303 | struct block_device *bdev = BDH_BDEV(bdh); | |
72ba4b2a | 304 | if (v->vdev_expanding && bdev != bdev_whole(bdev)) { |
bebdf52a | 305 | vdev_bdevname(bdev_whole(bdev), disk_name + 5); |
8e82ffba GW |
306 | /* |
307 | * If userland has BLKPG_RESIZE_PARTITION, | |
308 | * then it should have updated the partition | |
309 | * table already. We can detect this by | |
310 | * comparing our current physical size | |
311 | * with that of the device. If they are | |
312 | * the same, then we must not have | |
313 | * BLKPG_RESIZE_PARTITION or it failed to | |
314 | * update the partition table online. We | |
315 | * fallback to rescanning the partition | |
316 | * table from the kernel below. However, | |
317 | * if the capacity already reflects the | |
318 | * updated partition, then we skip | |
319 | * rescanning the partition table here. | |
320 | */ | |
321 | if (v->vdev_psize == bdev_capacity(bdev)) | |
322 | reread_part = B_TRUE; | |
d441e85d BB |
323 | } |
324 | ||
386d6a75 | 325 | vdev_blkdev_put(bdh, mode, zfs_vdev_holder); |
d441e85d BB |
326 | } |
327 | ||
328 | if (reread_part) { | |
386d6a75 RN |
329 | bdh = vdev_blkdev_get_by_path(disk_name, mode, |
330 | zfs_vdev_holder); | |
331 | if (!BDH_IS_ERR(bdh)) { | |
332 | int error = | |
333 | vdev_bdev_reread_part(BDH_BDEV(bdh)); | |
334 | vdev_blkdev_put(bdh, mode, zfs_vdev_holder); | |
a25861dc BB |
335 | if (error == 0) { |
336 | timeout = MSEC2NSEC( | |
337 | zfs_vdev_open_timeout_ms * 2); | |
338 | } | |
d441e85d BB |
339 | } |
340 | } | |
341 | } else { | |
342 | vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); | |
343 | ||
344 | rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); | |
345 | rw_enter(&vd->vd_lock, RW_WRITER); | |
346 | } | |
60101509 BB |
347 | |
348 | /* | |
349 | * Devices are always opened by the path provided at configuration | |
350 | * time. This means that if the provided path is a udev by-id path | |
d441e85d | 351 | * then drives may be re-cabled without an issue. If the provided |
4e95cc99 | 352 | * path is a udev by-path path, then the physical location information |
60101509 BB |
353 | * will be preserved. This can be critical for more complicated |
354 | * configurations where drives are located in specific physical | |
d441e85d BB |
355 | * locations to maximize the systems tolerance to component failure. |
356 | * | |
4e95cc99 | 357 | * Alternatively, you can provide your own udev rule to flexibly map |
60101509 | 358 | * the drives as you see fit. It is not advised that you use the |
4e95cc99 | 359 | * /dev/[hd]d devices which may be reordered due to probing order. |
60101509 BB |
360 | * Devices in the wrong locations will be detected by the higher |
361 | * level vdev validation. | |
2d82ea8b BB |
362 | * |
363 | * The specified paths may be briefly removed and recreated in | |
364 | * response to udev events. This should be exceptionally unlikely | |
365 | * because the zpool command makes every effort to verify these paths | |
366 | * have already settled prior to reaching this point. Therefore, | |
367 | * a ENOENT failure at this point is highly likely to be transient | |
368 | * and it is reasonable to sleep and retry before giving up. In | |
369 | * practice delays have been observed to be on the order of 100ms. | |
77e2756d BB |
370 | * |
371 | * When ERESTARTSYS is returned it indicates the block device is | |
372 | * a zvol which could not be opened due to the deadlock detection | |
373 | * logic in zvol_open(). Extend the timeout and retry the open | |
374 | * subsequent attempts are expected to eventually succeed. | |
60101509 | 375 | */ |
a25861dc | 376 | hrtime_t start = gethrtime(); |
386d6a75 RN |
377 | bdh = BDH_ERR_PTR(-ENXIO); |
378 | while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { | |
379 | bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, | |
380 | zfs_vdev_holder); | |
381 | if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { | |
55c12724 AH |
382 | /* |
383 | * There is no point of waiting since device is removed | |
384 | * explicitly | |
385 | */ | |
386 | if (v->vdev_removed) | |
387 | break; | |
388 | ||
d441e85d | 389 | schedule_timeout(MSEC_TO_TICK(10)); |
386d6a75 | 390 | } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { |
77e2756d BB |
391 | timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); |
392 | continue; | |
386d6a75 | 393 | } else if (BDH_IS_ERR(bdh)) { |
2d82ea8b BB |
394 | break; |
395 | } | |
396 | } | |
397 | ||
386d6a75 RN |
398 | if (BDH_IS_ERR(bdh)) { |
399 | int error = -BDH_PTR_ERR(bdh); | |
a25861dc BB |
400 | vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, |
401 | (u_longlong_t)(gethrtime() - start), | |
402 | (u_longlong_t)timeout); | |
386d6a75 | 403 | vd->vd_bdh = NULL; |
d441e85d BB |
404 | v->vdev_tsd = vd; |
405 | rw_exit(&vd->vd_lock); | |
406 | return (SET_ERROR(error)); | |
407 | } else { | |
386d6a75 | 408 | vd->vd_bdh = bdh; |
d441e85d BB |
409 | v->vdev_tsd = vd; |
410 | rw_exit(&vd->vd_lock); | |
60101509 BB |
411 | } |
412 | ||
386d6a75 RN |
413 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); |
414 | ||
0d8103d9 | 415 | /* Determine the physical block size */ |
386d6a75 | 416 | int physical_block_size = bdev_physical_block_size(bdev); |
6fe3498c RM |
417 | |
418 | /* Determine the logical block size */ | |
386d6a75 | 419 | int logical_block_size = bdev_logical_block_size(bdev); |
60101509 | 420 | |
60101509 BB |
421 | /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ |
422 | v->vdev_nowritecache = B_FALSE; | |
423 | ||
1b939560 | 424 | /* Set when device reports it supports TRIM. */ |
386d6a75 | 425 | v->vdev_has_trim = bdev_discard_supported(bdev); |
1b939560 BB |
426 | |
427 | /* Set when device reports it supports secure TRIM. */ | |
386d6a75 | 428 | v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); |
1b939560 | 429 | |
fb40095f | 430 | /* Inform the ZIO pipeline that we are non-rotational */ |
386d6a75 | 431 | v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); |
fb40095f | 432 | |
d441e85d | 433 | /* Physical volume size in bytes for the partition */ |
386d6a75 | 434 | *psize = bdev_capacity(bdev); |
d441e85d BB |
435 | |
436 | /* Physical volume size in bytes including possible expansion space */ | |
386d6a75 | 437 | *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); |
1bd201e7 | 438 | |
60101509 | 439 | /* Based on the minimum sector size set the block size */ |
6fe3498c RM |
440 | *physical_ashift = highbit64(MAX(physical_block_size, |
441 | SPA_MINBLOCKSIZE)) - 1; | |
442 | ||
443 | *logical_ashift = highbit64(MAX(logical_block_size, | |
444 | SPA_MINBLOCKSIZE)) - 1; | |
60101509 | 445 | |
d1d7e268 | 446 | return (0); |
60101509 BB |
447 | } |
448 | ||
449 | static void | |
450 | vdev_disk_close(vdev_t *v) | |
451 | { | |
452 | vdev_disk_t *vd = v->vdev_tsd; | |
453 | ||
0d8103d9 | 454 | if (v->vdev_reopening || vd == NULL) |
60101509 BB |
455 | return; |
456 | ||
72fd834c | 457 | if (vd->vd_bdh != NULL) |
386d6a75 | 458 | vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), |
43e8f6e3 | 459 | zfs_vdev_holder); |
60101509 | 460 | |
d441e85d | 461 | rw_destroy(&vd->vd_lock); |
d1d7e268 | 462 | kmem_free(vd, sizeof (vdev_disk_t)); |
60101509 BB |
463 | v->vdev_tsd = NULL; |
464 | } | |
465 | ||
/*
 * Submit a bio to the block layer, handling the one- vs two-argument
 * submit_bio() kernel API difference.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}
475 | ||
/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif
485 | ||
5f264996 BB |
486 | /* |
487 | * As for the Linux 5.18 kernel bio_alloc() expects a block_device struct | |
488 | * as an argument removing the need to set it with bio_set_dev(). This | |
489 | * removes the need for all of the following compatibility code. | |
490 | */ | |
491 | #if !defined(HAVE_BIO_ALLOC_4ARG) | |
492 | ||
26a85659 BB |
493 | #ifdef HAVE_BIO_SET_DEV |
494 | #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) | |
bd0d24e0 BB |
495 | /* |
496 | * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by | |
497 | * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). | |
498 | * As a side effect the function was converted to GPL-only. Define our | |
499 | * own version when needed which uses rcu_read_lock_sched(). | |
036e846a RS |
500 | * |
501 | * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public | |
502 | * part, moving blkg_tryget into the private one. Define our own version. | |
bd0d24e0 | 503 | */ |
036e846a | 504 | #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) |
bd0d24e0 BB |
505 | static inline bool |
506 | vdev_blkg_tryget(struct blkcg_gq *blkg) | |
507 | { | |
508 | struct percpu_ref *ref = &blkg->refcnt; | |
509 | unsigned long __percpu *count; | |
510 | bool rc; | |
511 | ||
512 | rcu_read_lock_sched(); | |
513 | ||
514 | if (__ref_is_percpu(ref, &count)) { | |
515 | this_cpu_inc(*count); | |
516 | rc = true; | |
517 | } else { | |
838a2490 CK |
518 | #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA |
519 | rc = atomic_long_inc_not_zero(&ref->data->count); | |
520 | #else | |
bd0d24e0 | 521 | rc = atomic_long_inc_not_zero(&ref->count); |
838a2490 | 522 | #endif |
bd0d24e0 BB |
523 | } |
524 | ||
525 | rcu_read_unlock_sched(); | |
526 | ||
527 | return (rc); | |
528 | } | |
036e846a | 529 | #else |
bd0d24e0 BB |
530 | #define vdev_blkg_tryget(bg) blkg_tryget(bg) |
531 | #endif | |
d08b99ac | 532 | #ifdef HAVE_BIO_SET_DEV_MACRO |
26a85659 BB |
533 | /* |
534 | * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the | |
535 | * GPL-only bio_associate_blkg() symbol thus inadvertently converting | |
536 | * the entire macro. Provide a minimal version which always assigns the | |
537 | * request queue's root_blkg to the bio. | |
538 | */ | |
539 | static inline void | |
540 | vdev_bio_associate_blkg(struct bio *bio) | |
541 | { | |
d939930f CK |
542 | #if defined(HAVE_BIO_BDEV_DISK) |
543 | struct request_queue *q = bio->bi_bdev->bd_disk->queue; | |
544 | #else | |
26a85659 | 545 | struct request_queue *q = bio->bi_disk->queue; |
d939930f | 546 | #endif |
26a85659 BB |
547 | |
548 | ASSERT3P(q, !=, NULL); | |
26a85659 BB |
549 | ASSERT3P(bio->bi_blkg, ==, NULL); |
550 | ||
bd0d24e0 | 551 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) |
26a85659 BB |
552 | bio->bi_blkg = q->root_blkg; |
553 | } | |
d08b99ac | 554 | |
26a85659 | 555 | #define bio_associate_blkg vdev_bio_associate_blkg |
d08b99ac CK |
556 | #else |
557 | static inline void | |
558 | vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) | |
559 | { | |
560 | #if defined(HAVE_BIO_BDEV_DISK) | |
561 | struct request_queue *q = bdev->bd_disk->queue; | |
562 | #else | |
563 | struct request_queue *q = bio->bi_disk->queue; | |
564 | #endif | |
565 | bio_clear_flag(bio, BIO_REMAPPED); | |
566 | if (bio->bi_bdev != bdev) | |
567 | bio_clear_flag(bio, BIO_THROTTLED); | |
568 | bio->bi_bdev = bdev; | |
569 | ||
570 | ASSERT3P(q, !=, NULL); | |
571 | ASSERT3P(bio->bi_blkg, ==, NULL); | |
572 | ||
573 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) | |
574 | bio->bi_blkg = q->root_blkg; | |
575 | } | |
576 | #define bio_set_dev vdev_bio_set_dev | |
577 | #endif | |
26a85659 BB |
578 | #endif |
579 | #else | |
580 | /* | |
581 | * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. | |
582 | */ | |
787acae0 GDN |
583 | static inline void |
584 | bio_set_dev(struct bio *bio, struct block_device *bdev) | |
585 | { | |
586 | bio->bi_bdev = bdev; | |
587 | } | |
26a85659 | 588 | #endif /* HAVE_BIO_SET_DEV */ |
5f264996 | 589 | #endif /* !HAVE_BIO_ALLOC_4ARG */ |
787acae0 | 590 | |
37f9dac5 | 591 | static inline void |
3b86aeb2 | 592 | vdev_submit_bio(struct bio *bio) |
37f9dac5 | 593 | { |
37f9dac5 RY |
594 | struct bio_list *bio_list = current->bio_list; |
595 | current->bio_list = NULL; | |
3b86aeb2 | 596 | vdev_submit_bio_impl(bio); |
37f9dac5 | 597 | current->bio_list = bio_list; |
37f9dac5 RY |
598 | } |
599 | ||
5f264996 BB |
600 | static inline struct bio * |
601 | vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, | |
602 | unsigned short nr_vecs) | |
603 | { | |
604 | struct bio *bio; | |
605 | ||
d1325b4f | 606 | #ifdef HAVE_BIO_ALLOC_4ARG |
5f264996 BB |
607 | bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); |
608 | #else | |
609 | bio = bio_alloc(gfp_mask, nr_vecs); | |
610 | if (likely(bio != NULL)) | |
611 | bio_set_dev(bio, bdev); | |
d1325b4f AZ |
612 | #endif |
613 | ||
5f264996 BB |
614 | return (bio); |
615 | } | |
616 | ||
06a19602 RN |
617 | static inline uint_t |
618 | vdev_bio_max_segs(struct block_device *bdev) | |
619 | { | |
620 | /* | |
621 | * Smallest of the device max segs and the tuneable max segs. Minimum | |
622 | * 4, so there's room to finish split pages if they come up. | |
623 | */ | |
624 | const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); | |
625 | const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? | |
626 | MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; | |
627 | const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); | |
628 | ||
629 | #ifdef HAVE_BIO_MAX_SEGS | |
630 | return (bio_max_segs(max_segs)); | |
631 | #else | |
632 | return (MIN(max_segs, BIO_MAX_PAGES)); | |
633 | #endif | |
634 | } | |
635 | ||
636 | static inline uint_t | |
637 | vdev_bio_max_bytes(struct block_device *bdev) | |
638 | { | |
639 | return (queue_max_sectors(bdev_get_queue(bdev)) << 9); | |
640 | } | |
641 | ||
642 | ||
643 | /* | |
644 | * Virtual block IO object (VBIO) | |
645 | * | |
646 | * Linux block IO (BIO) objects have a limit on how many data segments (pages) | |
647 | * they can hold. Depending on how they're allocated and structured, a large | |
648 | * ZIO can require more than one BIO to be submitted to the kernel, which then | |
649 | * all have to complete before we can return the completed ZIO back to ZFS. | |
650 | * | |
651 | * A VBIO is a wrapper around multiple BIOs, carrying everything needed to | |
652 | * translate a ZIO down into the kernel block layer and back again. | |
653 | * | |
654 | * Note that these are only used for data ZIOs (read/write). Meta-operations | |
655 | * (flush/trim) don't need multiple BIOs and so can just make the call | |
656 | * directly. | |
657 | */ | |
658 | typedef struct { | |
659 | zio_t *vbio_zio; /* parent zio */ | |
660 | ||
661 | struct block_device *vbio_bdev; /* blockdev to submit bios to */ | |
662 | ||
663 | abd_t *vbio_abd; /* abd carrying borrowed linear buf */ | |
664 | ||
06a19602 RN |
665 | uint_t vbio_max_segs; /* max segs per bio */ |
666 | ||
667 | uint_t vbio_max_bytes; /* max bytes per bio */ | |
668 | uint_t vbio_lbs_mask; /* logical block size mask */ | |
669 | ||
670 | uint64_t vbio_offset; /* start offset of next bio */ | |
671 | ||
672 | struct bio *vbio_bio; /* pointer to the current bio */ | |
72fd834c | 673 | int vbio_flags; /* bio flags */ |
06a19602 RN |
674 | } vbio_t; |
675 | ||
676 | static vbio_t * | |
72fd834c | 677 | vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) |
06a19602 RN |
678 | { |
679 | vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); | |
680 | ||
681 | vbio->vbio_zio = zio; | |
682 | vbio->vbio_bdev = bdev; | |
72fd834c | 683 | vbio->vbio_abd = NULL; |
06a19602 RN |
684 | vbio->vbio_max_segs = vdev_bio_max_segs(bdev); |
685 | vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); | |
686 | vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); | |
687 | vbio->vbio_offset = zio->io_offset; | |
72fd834c RN |
688 | vbio->vbio_bio = NULL; |
689 | vbio->vbio_flags = flags; | |
06a19602 RN |
690 | |
691 | return (vbio); | |
692 | } | |
693 | ||
72fd834c RN |
694 | BIO_END_IO_PROTO(vbio_completion, bio, error); |
695 | ||
06a19602 RN |
696 | static int |
697 | vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) | |
698 | { | |
72fd834c | 699 | struct bio *bio = vbio->vbio_bio; |
06a19602 RN |
700 | uint_t ssize; |
701 | ||
702 | while (size > 0) { | |
06a19602 RN |
703 | if (bio == NULL) { |
704 | /* New BIO, allocate and set up */ | |
705 | bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, | |
706 | vbio->vbio_max_segs); | |
72fd834c RN |
707 | VERIFY(bio); |
708 | ||
06a19602 | 709 | BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; |
72fd834c RN |
710 | bio_set_op_attrs(bio, |
711 | vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? | |
712 | WRITE : READ, vbio->vbio_flags); | |
06a19602 | 713 | |
72fd834c RN |
714 | if (vbio->vbio_bio) { |
715 | bio_chain(vbio->vbio_bio, bio); | |
716 | vdev_submit_bio(vbio->vbio_bio); | |
717 | } | |
718 | vbio->vbio_bio = bio; | |
06a19602 RN |
719 | } |
720 | ||
721 | /* | |
722 | * Only load as much of the current page data as will fit in | |
723 | * the space left in the BIO, respecting lbs alignment. Older | |
724 | * kernels will error if we try to overfill the BIO, while | |
725 | * newer ones will accept it and split the BIO. This ensures | |
726 | * everything works on older kernels, and avoids an additional | |
727 | * overhead on the new. | |
728 | */ | |
729 | ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & | |
730 | vbio->vbio_lbs_mask); | |
731 | if (ssize > 0 && | |
732 | bio_add_page(bio, page, ssize, offset) == ssize) { | |
733 | /* Accepted, adjust and load any remaining. */ | |
734 | size -= ssize; | |
735 | offset += ssize; | |
736 | continue; | |
737 | } | |
738 | ||
739 | /* No room, set up for a new BIO and loop */ | |
740 | vbio->vbio_offset += BIO_BI_SIZE(bio); | |
741 | ||
742 | /* Signal new BIO allocation wanted */ | |
72fd834c | 743 | bio = NULL; |
06a19602 RN |
744 | } |
745 | ||
746 | return (0); | |
747 | } | |
748 | ||
72fd834c RN |
749 | /* Iterator callback to submit ABD pages to the vbio. */ |
750 | static int | |
751 | vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) | |
752 | { | |
753 | vbio_t *vbio = priv; | |
754 | return (vbio_add_page(vbio, page, len, off)); | |
755 | } | |
06a19602 | 756 | |
/*
 * Create some BIOs, fill them with data and submit them.
 *
 * Walks the ABD's pages through vbio_fill_cb(), which builds (and, via
 * vbio_add_page(), submits) BIOs as they fill up.  The final BIO is left in
 * vbio->vbio_bio; we attach the completion callback to it and submit it here.
 * The vbio itself is freed by vbio_completion().
 */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
	ASSERT(vbio->vbio_bdev);

	/*
	 * We plug so we can submit the BIOs as we go and only unplug them when
	 * they are fully created and submitted. This is important; if we don't
	 * plug, then the kernel may start executing earlier BIOs while we're
	 * still creating and executing later ones, and if the device goes
	 * away while that's happening, older kernels can get confused and
	 * trample memory.
	 */
	struct blk_plug plug;
	blk_start_plug(&plug);

	/* Iterate the data pages; vbio_add_page() submits all but the last. */
	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
	ASSERT(vbio->vbio_bio);

	/* Only the last BIO carries the completion; it fires once per vbio. */
	vbio->vbio_bio->bi_end_io = vbio_completion;
	vbio->vbio_bio->bi_private = vbio;

	vdev_submit_bio(vbio->vbio_bio);

	blk_finish_plug(&plug);

	/* Ownership of the BIO has passed to the kernel; drop our references. */
	vbio->vbio_bio = NULL;
	vbio->vbio_bdev = NULL;
}
787 | ||
72fd834c RN |
/*
 * IO completion callback for the last BIO of a vbio.  Propagates the BIO
 * status into the zio, returns any borrowed linear buffer to the original
 * ABD, frees the vbio, and hands the zio back to the ZIO pipeline.
 */
BIO_END_IO_PROTO(vbio_completion, bio, error)
{
	vbio_t *vbio = bio->bi_private;
	zio_t *zio = vbio->vbio_zio;

	ASSERT(zio);

	/* Capture and log any errors */
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = 0;
	if (error)
		zio->io_error = -(error);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		zio->io_error = EIO;
#endif
	ASSERT3U(zio->io_error, >=, 0);

	if (zio->io_error)
		vdev_disk_error(zio);

	/* Return the BIO to the kernel */
	bio_put(bio);

	/*
	 * If we copied the ABD before issuing it, clean up and return the copy
	 * to the ABD, with changes if appropriate.
	 */
	if (vbio->vbio_abd != NULL) {
		void *buf = abd_to_buf(vbio->vbio_abd);
		abd_free(vbio->vbio_abd);
		vbio->vbio_abd = NULL;

		/* Reads must copy the data back into the caller's ABD. */
		if (zio->io_type == ZIO_TYPE_READ)
			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
		else
			abd_return_buf(zio->io_abd, buf, zio->io_size);
	}

	/* Final cleanup */
	kmem_free(vbio, sizeof (vbio_t));

	/* All done, submit for processing */
	zio_delay_interrupt(zio);
}
835 | ||
/*
 * Iterator callback to count ABD pages and check their size & alignment.
 *
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
 * the data within that page. A page can be arbitrarily large ("compound"
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
 */
typedef struct {
	uint_t	bmask;		/* logical block size - 1, for alignment tests */
	uint_t	npages;		/* pages seen so far */
	uint_t	end;		/* nonzero if the last page ended mid-block */
} vdev_disk_check_pages_t;
850 | ||
851 | static int | |
852 | vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) | |
853 | { | |
854 | vdev_disk_check_pages_t *s = priv; | |
855 | ||
856 | /* | |
857 | * If we didn't finish on a block size boundary last time, then there | |
858 | * would be a gap if we tried to use this ABD as-is, so abort. | |
859 | */ | |
860 | if (s->end != 0) | |
861 | return (1); | |
862 | ||
863 | /* | |
864 | * Note if we're taking less than a full block, so we can check it | |
865 | * above on the next call. | |
866 | */ | |
867 | s->end = len & s->bmask; | |
868 | ||
869 | /* All blocks after the first must start on a block size boundary. */ | |
870 | if (s->npages != 0 && (off & s->bmask) != 0) | |
871 | return (1); | |
872 | ||
873 | s->npages++; | |
874 | return (0); | |
875 | } | |
876 | ||
/*
 * Check if we can submit the pages in this ABD to the kernel as-is.
 * Returns B_TRUE if every page is correctly sized and aligned to the
 * device's logical block size, B_FALSE if the ABD would need to be copied
 * into an aligned linear buffer first.
 */
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
	vdev_disk_check_pages_t s = {
	    .bmask = bdev_logical_block_size(bdev)-1,
	    .npages = 0,
	    .end = 0,
	};

	/* Nonzero from the callback means an unusable page was found. */
	if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
		return (B_FALSE);

	return (B_TRUE);
}
895 | ||
06a19602 RN |
/*
 * New-style read/write submission: build one or more BIOs directly over the
 * ABD's pages (copying to an aligned buffer only when necessary) and submit
 * them via a vbio.  Returns 0 on successful submission (completion happens
 * asynchronously in vbio_completion()) or an errno on setup failure.
 */
static int
vdev_disk_io_rw(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	int flags = 0;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)zio->io_offset,
		    (u_longlong_t)zio->io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    v->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	/*
	 * Check alignment of the incoming ABD. If any part of it would require
	 * submitting a page that is not aligned to the logical block size,
	 * then we take a copy into a linear buffer and submit that instead.
	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
	 * usually requiring abnormally-small data blocks (eg gang blocks)
	 * mixed into the same ABD as larger ones (eg aggregated).
	 */
	abd_t *abd = zio->io_abd;
	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
		void *buf;
		if (zio->io_type == ZIO_TYPE_READ)
			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
		else
			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);

		/*
		 * Wrap the copy in an abd_t, so we can use the same iterators
		 * to count and fill the vbio later.
		 */
		abd = abd_get_from_buf(buf, zio->io_size);

		/*
		 * False here would mean the borrowed copy has an invalid
		 * alignment too, which would mean we've somehow been passed a
		 * linear ABD with an interior page that has a non-zero offset
		 * or a size not a multiple of PAGE_SIZE. This is not possible.
		 * It would mean either zio_buf_alloc() or its underlying
		 * allocators have done something extremely strange, or our
		 * math in vdev_disk_check_pages() is wrong. In either case,
		 * something is seriously wrong and it's not safe to continue.
		 */
		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
	}

	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
	if (abd != zio->io_abd)
		vbio->vbio_abd = abd;

	/* Fill it with data pages and submit it to the kernel */
	vbio_submit(vbio, abd, zio->io_size);
	return (0);
}
966 | ||
f3b85d70 RN |
967 | /* ========== */ |
968 | ||
969 | /* | |
06a19602 RN |
970 | * This is the classic, battle-tested BIO submission code. Until we're totally |
971 | * sure that the new code is safe and correct in all cases, this will remain | |
972 | * available and can be enabled by setting zfs_vdev_disk_classic=1 at module | |
973 | * load time. | |
f3b85d70 RN |
974 | * |
975 | * These functions have been renamed to vdev_classic_* to make it clear what | |
976 | * they belong to, but their implementations are unchanged. | |
977 | */ | |
978 | ||
/*
 * Virtual device vector for disks (classic path).
 *
 * A dio_request tracks all the BIOs needed to service one zio; it is
 * reference-counted so the zio completes exactly once, after the last BIO.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[];	/* Attached bio's */
} dio_request_t;
989 | ||
990 | static dio_request_t * | |
991 | vdev_classic_dio_alloc(int bio_count) | |
992 | { | |
993 | dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + | |
994 | sizeof (struct bio *) * bio_count, KM_SLEEP); | |
995 | atomic_set(&dr->dr_ref, 0); | |
996 | dr->dr_bio_count = bio_count; | |
997 | dr->dr_error = 0; | |
998 | ||
999 | for (int i = 0; i < dr->dr_bio_count; i++) | |
1000 | dr->dr_bio[i] = NULL; | |
1001 | ||
1002 | return (dr); | |
1003 | } | |
1004 | ||
1005 | static void | |
1006 | vdev_classic_dio_free(dio_request_t *dr) | |
1007 | { | |
1008 | int i; | |
1009 | ||
1010 | for (i = 0; i < dr->dr_bio_count; i++) | |
1011 | if (dr->dr_bio[i]) | |
1012 | bio_put(dr->dr_bio[i]); | |
1013 | ||
1014 | kmem_free(dr, sizeof (dio_request_t) + | |
1015 | sizeof (struct bio *) * dr->dr_bio_count); | |
1016 | } | |
1017 | ||
/* Take a reference on the dio_request. */
static void
vdev_classic_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}
1023 | ||
/* Drop a reference on the dio_request; the last put completes the zio. */
static void
vdev_classic_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_classic_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}
1049 | ||
/*
 * Per-BIO completion callback for the classic path.  Records the first
 * error seen across the dio's BIOs and drops this BIO's reference.
 */
BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	/* Only keep the first error; later BIOs must not overwrite it. */
	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by vdev_classic_physio */
	vdev_classic_dio_put(dr);
}
1068 | ||
5f264996 | 1069 | static inline unsigned int |
f3b85d70 | 1070 | vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) |
5f264996 BB |
1071 | { |
1072 | unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, | |
1073 | bio_size, abd_offset); | |
1074 | ||
1075 | #ifdef HAVE_BIO_MAX_SEGS | |
1076 | return (bio_max_segs(nr_segs)); | |
1077 | #else | |
1078 | return (MIN(nr_segs, BIO_MAX_PAGES)); | |
1079 | #endif | |
1080 | } | |
1081 | ||
/*
 * Classic read/write submission: map the zio's ABD into as many BIOs as
 * needed (growing the dio_request and retrying if the initial estimate is
 * too small) and submit them all under one plug.  Completion is driven by
 * vdev_classic_physio_completion() via the dio refcount.
 */
static int
vdev_classic_physio(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	size_t io_size = zio->io_size;
	uint64_t io_offset = zio->io_offset;
	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
	int flags = 0;

	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_classic_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB. When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete. This is likely if the
	 * recordsize property is increased beyond 1MB. The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	/* Note: <= so that i == dr_bio_count triggers the retry path below. */
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_classic_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_classic_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_classic_physio_completion */
		vdev_classic_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_classic_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_classic_dio_put(dr);

	return (error);
}
1198 | ||
f3b85d70 RN |
1199 | /* ========== */ |
1200 | ||
36ba27e9 | 1201 | BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) |
60101509 BB |
1202 | { |
1203 | zio_t *zio = bio->bi_private; | |
784a7fe5 | 1204 | #ifdef HAVE_1ARG_BIO_END_IO_T |
36ba27e9 BB |
1205 | zio->io_error = BIO_END_IO_ERROR(bio); |
1206 | #else | |
1207 | zio->io_error = -error; | |
784a7fe5 | 1208 | #endif |
60101509 | 1209 | |
36ba27e9 | 1210 | if (zio->io_error && (zio->io_error == EOPNOTSUPP)) |
60101509 BB |
1211 | zio->io_vd->vdev_nowritecache = B_TRUE; |
1212 | ||
1213 | bio_put(bio); | |
d148e951 BB |
1214 | ASSERT3S(zio->io_error, >=, 0); |
1215 | if (zio->io_error) | |
1216 | vdev_disk_error(zio); | |
60101509 | 1217 | zio_interrupt(zio); |
60101509 BB |
1218 | } |
1219 | ||
/*
 * Issue an asynchronous write-cache flush BIO for bdev.  On success (0) the
 * zio is completed later by vdev_disk_io_flush_completion(); on error the
 * errno is returned and the zio is left untouched for the caller.
 */
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}
60101509 | 1242 | |
06e25f9c US |
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
/* Completion callback for discard BIOs issued by vdev_issue_discard_trim(). */
BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif
	bio_put(bio);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}
1258 | ||
/*
 * Issue an asynchronous discard covering the zio's range, via the kernel's
 * __blkdev_issue_discard().  Flags are only honored on kernels that define
 * BLKDEV_DISCARD_SECURE.  If a BIO is returned, it is wired to the zio and
 * submitted; completion then happens in vdev_disk_discard_end_io().
 * Returns 0 or a positive errno.
 */
static int
vdev_issue_discard_trim(zio_t *zio, unsigned long flags)
{
	int ret;
	struct bio *bio = NULL;

#if defined(BLKDEV_DISCARD_SECURE)
	ret = - __blkdev_issue_discard(
	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio);
#else
	(void) flags;
	ret = - __blkdev_issue_discard(
	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio);
#endif
	if (!ret && bio) {
		bio->bi_private = zio;
		bio->bi_end_io = vdev_disk_discard_end_io;
		vdev_submit_bio(bio);
	}
	return (ret);
}
#endif
1283 | ||
/*
 * Dispatch a TRIM zio to the best interface the running kernel offers:
 * secure erase when requested and available, otherwise (possibly async)
 * discard.  Returns 0 or a positive errno; whether the zio still needs to
 * be completed by the caller depends on which path ran (see
 * vdev_disk_io_start()).
 */
static int
vdev_disk_io_trim(zio_t *zio)
{
	unsigned long trim_flags = 0;
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
		/* Synchronous secure erase; caller completes the zio. */
		return (-blkdev_issue_secure_erase(
		    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
#elif defined(BLKDEV_DISCARD_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	}
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
	return (vdev_issue_discard_trim(zio, trim_flags));
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	return (-blkdev_issue_discard(
	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}
1308 | ||
c4a13ba4 RN |
/*
 * Active read/write submission function (classic or new).  Chosen at first
 * vdev init from zfs_vdev_disk_classic, or forced via the module parameter.
 */
int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
1310 | ||
/*
 * vdev I/O entry point: dispatch the zio to the flush, trim, or read/write
 * path, or fail it if the device is unavailable.  vd_lock is held for read
 * across submission to keep the device open underneath us.
 */
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdh == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
			if (error == 0) {
				/* Flush in flight; its end_io completes
				 * the zio. */
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		/*
		 * Synchronous trim paths must complete the zio here; the
		 * async discard path completes it in its end_io callback.
		 */
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			zio_interrupt(zio);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
		zio_interrupt(zio);
#endif
		return;

	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		error = vdev_disk_io_rw_fn(zio);
		rw_exit(&vd->vd_lock);
		if (error) {
			zio->io_error = error;
			zio_interrupt(zio);
		}
		return;

	default:
		/*
		 * Getting here means our parent vdev has made a very strange
		 * request of us, and shouldn't happen. Assert here to force a
		 * crash in dev builds, but in production return the IO
		 * unhandled. The pool will likely suspend anyway but that's
		 * nicer than crashing the kernel.
		 */
		ASSERT3S(zio->io_type, ==, -1);

		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	__builtin_unreachable();
}
1420 | ||
1421 | static void | |
1422 | vdev_disk_io_done(zio_t *zio) | |
1423 | { | |
1424 | /* | |
1425 | * If the device returned EIO, we revalidate the media. If it is | |
1426 | * determined the media has changed this triggers the asynchronous | |
1427 | * removal of the device from the configuration. | |
1428 | */ | |
1429 | if (zio->io_error == EIO) { | |
d1d7e268 | 1430 | vdev_t *v = zio->io_vd; |
60101509 BB |
1431 | vdev_disk_t *vd = v->vdev_tsd; |
1432 | ||
386d6a75 RN |
1433 | if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { |
1434 | invalidate_bdev(BDH_BDEV(vd->vd_bdh)); | |
60101509 BB |
1435 | v->vdev_remove_wanted = B_TRUE; |
1436 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
1437 | } | |
1438 | } | |
1439 | } | |
1440 | ||
/*
 * Hold hook.  Currently a no-op beyond validation: any path/devid prefetch
 * for never-opened devices would go here.
 */
static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}
1458 | ||
/* Release hook; currently unimplemented on Linux. */
static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}
1466 | ||
df2169d1 RN |
/*
 * BIO submission method. See comment above about vdev_classic.
 * Set zfs_vdev_disk_classic=0 for new, =1 for classic
 */
static uint_t zfs_vdev_disk_classic = 0;	/* default new */

/*
 * Set submission function from module parameter.  Takes effect immediately,
 * even for pools that are already imported.
 */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
	int err = param_set_uint(buf, kp);
	if (err < 0)
		return (SET_ERROR(err));

	vdev_disk_io_rw_fn =
	    zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;

	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
	    zfs_vdev_disk_classic ? "classic" : "new");

	return (0);
}
1489 | ||
c4a13ba4 RN |
1490 | /* |
1491 | * At first use vdev use, set the submission function from the default value if | |
1492 | * it hasn't been set already. | |
1493 | */ | |
1494 | static int | |
1495 | vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) | |
1496 | { | |
1497 | (void) spa; | |
1498 | (void) nv; | |
1499 | (void) tsd; | |
1500 | ||
1501 | if (vdev_disk_io_rw_fn == NULL) | |
df2169d1 RN |
1502 | vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? |
1503 | vdev_classic_physio : vdev_disk_io_rw; | |
c4a13ba4 RN |
1504 | |
1505 | return (0); | |
1506 | } | |
1507 | ||
/* Virtual device operations vector for disk-backed leaf vdevs. */
vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = vdev_disk_init,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};
1533 | ||
9e17e6f2 BB |
/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
	param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
6fe3498c RM |
1556 | |
1557 | int | |
1558 | param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1559 | { | |
ab8d9c17 | 1560 | uint_t val; |
6fe3498c RM |
1561 | int error; |
1562 | ||
ab8d9c17 | 1563 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1564 | if (error < 0) |
1565 | return (SET_ERROR(error)); | |
1566 | ||
1567 | if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) | |
1568 | return (SET_ERROR(-EINVAL)); | |
1569 | ||
ab8d9c17 | 1570 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1571 | if (error < 0) |
1572 | return (SET_ERROR(error)); | |
1573 | ||
1574 | return (0); | |
1575 | } | |
1576 | ||
1577 | int | |
1578 | param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1579 | { | |
ab8d9c17 | 1580 | uint_t val; |
6fe3498c RM |
1581 | int error; |
1582 | ||
ab8d9c17 | 1583 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1584 | if (error < 0) |
1585 | return (SET_ERROR(error)); | |
1586 | ||
1587 | if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) | |
1588 | return (SET_ERROR(-EINVAL)); | |
1589 | ||
ab8d9c17 | 1590 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1591 | if (error < 0) |
1592 | return (SET_ERROR(error)); | |
1593 | ||
1594 | return (0); | |
1595 | } | |
f66ffe68 SD |
1596 | |
/* Tunable module parameters for the disk vdev layer. */
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");

ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
	"Maximum number of data segments to add to an IO request (min 4)");

ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
	vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
	"Use classic BIO submission method");