]>
Commit | Line | Data |
---|---|---|
60101509 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
60101509 BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
1eacf2b3 | 26 | * Copyright (c) 2012, 2019 by Delphix. All rights reserved. |
06a19602 | 27 | * Copyright (c) 2023, 2024, Klara Inc. |
60101509 BB |
28 | */ |
29 | ||
30 | #include <sys/zfs_context.h> | |
e771de53 | 31 | #include <sys/spa_impl.h> |
60101509 BB |
32 | #include <sys/vdev_disk.h> |
33 | #include <sys/vdev_impl.h> | |
1b939560 | 34 | #include <sys/vdev_trim.h> |
a6255b7f | 35 | #include <sys/abd.h> |
60101509 BB |
36 | #include <sys/fs/zfs.h> |
37 | #include <sys/zio.h> | |
8e82ffba | 38 | #include <linux/blkpg.h> |
74d42600 | 39 | #include <linux/msdos_fs.h> |
05805494 | 40 | #include <linux/vfs_compat.h> |
1e767532 CK |
41 | #ifdef HAVE_LINUX_BLK_CGROUP_HEADER |
42 | #include <linux/blk-cgroup.h> | |
43 | #endif | |
60101509 | 44 | |
386d6a75 RN |
45 | /* |
46 | * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying | |
47 | * block_device. Since it carries the block_device inside, it's convenient to | |
e3120f73 RN |
48 | * just use the handle as a proxy. |
49 | * | |
50 | * Linux 6.9.x uses a file for the same purpose. | |
51 | * | |
52 | * For pre-6.8, we just emulate this with a cast, since we don't need any of | |
53 | * the other fields inside the handle. | |
386d6a75 | 54 | */ |
/*
 * zfs_bdev_handle_t and the BDH_* accessors hide which object the kernel's
 * open-by-path call hands back:
 *
 *   BDH_BDEV(bdh)	the struct block_device behind the handle
 *   BDH_IS_ERR(bdh)	true if the handle encodes an error pointer
 *   BDH_PTR_ERR(bdh)	the error value encoded in the handle
 *   BDH_ERR_PTR(err)	build an error handle from an error value
 *
 * Every macro argument is parenthesised so that any expression may be passed.
 */
#if defined(HAVE_BDEV_OPEN_BY_PATH)
/* The handle is a struct bdev_handle which embeds the block_device. */
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
/* The handle is a struct file; file_bdev() yields the block_device. */
typedef struct file zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		(file_bdev(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
/* No handle type: the "handle" is the block_device pointer itself. */
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif
74 | ||
/*
 * Per-vdev private state for a disk vdev, stored in vdev_t.vdev_tsd.
 */
typedef struct vdev_disk {
	/* Handle of the opened block device; NULL while it is not open. */
	zfs_bdev_handle_t		*vd_bdh;
	/* Held as writer by vdev_disk_open() while vd_bdh is replaced. */
	krwlock_t			vd_lock;
} vdev_disk_t;
79 | ||
06a19602 RN |
/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder. It is passed as the
 * "holder" argument whenever a block device is opened or released.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing.  The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks. Used when estimating how much
 * of a whole disk is available for expansion.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * BIO request failfast mask.
 * NOTE(review): the meaning of the individual bits is not visible in this
 * part of the file; confirm against the code that consumes the mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;
109 | ||
cfb96c77 RN |
110 | /* |
111 | * Convert SPA mode flags into bdev open mode flags. | |
112 | */ | |
43e8f6e3 | 113 | #ifdef HAVE_BLK_MODE_T |
cfb96c77 RN |
114 | typedef blk_mode_t vdev_bdev_mode_t; |
115 | #define VDEV_BDEV_MODE_READ BLK_OPEN_READ | |
116 | #define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE | |
117 | #define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL | |
118 | #define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) | |
43e8f6e3 | 119 | #else |
cfb96c77 RN |
120 | typedef fmode_t vdev_bdev_mode_t; |
121 | #define VDEV_BDEV_MODE_READ FMODE_READ | |
122 | #define VDEV_BDEV_MODE_WRITE FMODE_WRITE | |
123 | #define VDEV_BDEV_MODE_EXCL FMODE_EXCL | |
124 | #define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) | |
43e8f6e3 | 125 | #endif |
43e8f6e3 | 126 | |
cfb96c77 RN |
127 | static vdev_bdev_mode_t |
128 | vdev_bdev_mode(spa_mode_t smode) | |
129 | { | |
130 | ASSERT3U(smode, !=, SPA_MODE_UNINIT); | |
131 | ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); | |
233d34e4 | 132 | |
cfb96c77 | 133 | vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; |
60101509 | 134 | |
cfb96c77 RN |
135 | if (smode & SPA_MODE_READ) |
136 | bmode |= VDEV_BDEV_MODE_READ; | |
60101509 | 137 | |
cfb96c77 RN |
138 | if (smode & SPA_MODE_WRITE) |
139 | bmode |= VDEV_BDEV_MODE_WRITE; | |
233d34e4 | 140 | |
cfb96c77 RN |
141 | ASSERT(bmode & VDEV_BDEV_MODE_MASK); |
142 | ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); | |
60101509 | 143 | |
cfb96c77 | 144 | return (bmode); |
60101509 | 145 | } |
60101509 | 146 | |
d441e85d BB |
147 | /* |
148 | * Returns the usable capacity (in bytes) for the partition or disk. | |
149 | */ | |
60101509 | 150 | static uint64_t |
d441e85d | 151 | bdev_capacity(struct block_device *bdev) |
60101509 | 152 | { |
d441e85d BB |
153 | return (i_size_read(bdev->bd_inode)); |
154 | } | |
60101509 | 155 | |
72ba4b2a BB |
156 | #if !defined(HAVE_BDEV_WHOLE) |
157 | static inline struct block_device * | |
158 | bdev_whole(struct block_device *bdev) | |
159 | { | |
160 | return (bdev->bd_contains); | |
161 | } | |
162 | #endif | |
163 | ||
bebdf52a BB |
164 | #if defined(HAVE_BDEVNAME) |
165 | #define vdev_bdevname(bdev, name) bdevname(bdev, name) | |
166 | #else | |
167 | static inline void | |
168 | vdev_bdevname(struct block_device *bdev, char *name) | |
169 | { | |
170 | snprintf(name, BDEVNAME_SIZE, "%pg", bdev); | |
171 | } | |
172 | #endif | |
173 | ||
d441e85d BB |
174 | /* |
175 | * Returns the maximum expansion capacity of the block device (in bytes). | |
176 | * | |
177 | * It is possible to expand a vdev when it has been created as a wholedisk | |
178 | * and the containing block device has increased in capacity. Or when the | |
179 | * partition containing the pool has been manually increased in size. | |
180 | * | |
181 | * This function is only responsible for calculating the potential expansion | |
182 | * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is | |
183 | * responsible for verifying the expected partition layout in the wholedisk | |
184 | * case, and updating the partition table if appropriate. Once the partition | |
185 | * size has been increased the additional capacity will be visible using | |
186 | * bdev_capacity(). | |
0c637f31 | 187 | * |
188 | * The returned maximum expansion capacity is always expected to be larger, or | |
189 | * at the very least equal, to its usable capacity to prevent overestimating | |
190 | * the pool expandsize. | |
d441e85d BB |
191 | */ |
192 | static uint64_t | |
193 | bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) | |
194 | { | |
195 | uint64_t psize; | |
196 | int64_t available; | |
197 | ||
72ba4b2a | 198 | if (wholedisk && bdev != bdev_whole(bdev)) { |
74d42600 | 199 | /* |
d441e85d BB |
200 | * When reporting maximum expansion capacity for a wholedisk |
201 | * deduct any capacity which is expected to be lost due to | |
202 | * alignment restrictions. Over reporting this value isn't | |
203 | * harmful and would only result in slightly less capacity | |
204 | * than expected post expansion. | |
0c637f31 | 205 | * The estimated available space may be slightly smaller than |
206 | * bdev_capacity() for devices where the number of sectors is | |
207 | * not a multiple of the alignment size and the partition layout | |
208 | * is keeping less than PARTITION_END_ALIGNMENT bytes after the | |
209 | * "reserved" EFI partition: in such cases return the device | |
210 | * usable capacity. | |
74d42600 | 211 | */ |
72ba4b2a | 212 | available = i_size_read(bdev_whole(bdev)->bd_inode) - |
d441e85d BB |
213 | ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + |
214 | PARTITION_END_ALIGNMENT) << SECTOR_BITS); | |
0c637f31 | 215 | psize = MAX(available, bdev_capacity(bdev)); |
74d42600 | 216 | } else { |
d441e85d | 217 | psize = bdev_capacity(bdev); |
74d42600 | 218 | } |
d441e85d BB |
219 | |
220 | return (psize); | |
60101509 BB |
221 | } |
222 | ||
d148e951 BB |
223 | static void |
224 | vdev_disk_error(zio_t *zio) | |
225 | { | |
c71c8c71 | 226 | /* |
227 | * This function can be called in interrupt context, for instance while | |
228 | * handling IRQs coming from a misbehaving disk device; use printk() | |
229 | * which is safe from any context. | |
230 | */ | |
231 | printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " | |
4938d01d | 232 | "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), |
c71c8c71 | 233 | zio->io_vd->vdev_path, zio->io_error, zio->io_type, |
234 | (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, | |
235 | zio->io_flags); | |
d148e951 BB |
236 | } |
237 | ||
55c12724 AH |
238 | static void |
239 | vdev_disk_kobj_evt_post(vdev_t *v) | |
240 | { | |
241 | vdev_disk_t *vd = v->vdev_tsd; | |
386d6a75 RN |
242 | if (vd && vd->vd_bdh) { |
243 | spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); | |
55c12724 AH |
244 | } else { |
245 | vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", | |
246 | v->vdev_path); | |
247 | } | |
248 | } | |
249 | ||
386d6a75 | 250 | static zfs_bdev_handle_t * |
cfb96c77 | 251 | vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) |
43e8f6e3 | 252 | { |
cfb96c77 RN |
253 | vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); |
254 | ||
e3120f73 RN |
255 | #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) |
256 | return (bdev_file_open_by_path(path, bmode, holder, NULL)); | |
257 | #elif defined(HAVE_BDEV_OPEN_BY_PATH) | |
cfb96c77 | 258 | return (bdev_open_by_path(path, bmode, holder, NULL)); |
386d6a75 | 259 | #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) |
cfb96c77 | 260 | return (blkdev_get_by_path(path, bmode, holder, NULL)); |
43e8f6e3 | 261 | #else |
cfb96c77 | 262 | return (blkdev_get_by_path(path, bmode, holder)); |
43e8f6e3 CK |
263 | #endif |
264 | } | |
265 | ||
266 | static void | |
cfb96c77 | 267 | vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) |
43e8f6e3 | 268 | { |
386d6a75 RN |
269 | #if defined(HAVE_BDEV_RELEASE) |
270 | return (bdev_release(bdh)); | |
271 | #elif defined(HAVE_BLKDEV_PUT_HOLDER) | |
272 | return (blkdev_put(BDH_BDEV(bdh), holder)); | |
e3120f73 | 273 | #elif defined(HAVE_BLKDEV_PUT) |
cfb96c77 | 274 | return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); |
e3120f73 RN |
275 | #else |
276 | fput(bdh); | |
43e8f6e3 CK |
277 | #endif |
278 | } | |
279 | ||
60101509 | 280 | static int |
1bd201e7 | 281 | vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, |
6fe3498c | 282 | uint64_t *logical_ashift, uint64_t *physical_ashift) |
60101509 | 283 | { |
386d6a75 | 284 | zfs_bdev_handle_t *bdh; |
cfb96c77 | 285 | spa_mode_t smode = spa_mode(v->vdev_spa); |
a25861dc | 286 | hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); |
60101509 | 287 | vdev_disk_t *vd; |
60101509 BB |
288 | |
289 | /* Must have a pathname and it must be absolute. */ | |
290 | if (v->vdev_path == NULL || v->vdev_path[0] != '/') { | |
291 | v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
d441e85d | 292 | vdev_dbgmsg(v, "invalid vdev_path"); |
2d82ea8b | 293 | return (SET_ERROR(EINVAL)); |
60101509 BB |
294 | } |
295 | ||
0d8103d9 | 296 | /* |
d441e85d | 297 | * Reopen the device if it is currently open. When expanding a |
8e82ffba GW |
298 | * partition force re-scanning the partition table if userland |
299 | * did not take care of this already. We need to do this while closed | |
d441e85d BB |
300 | * in order to get an accurate updated block device size. Then |
301 | * since udev may need to recreate the device links increase the | |
a25861dc | 302 | * open retry timeout before reporting the device as unavailable. |
0d8103d9 | 303 | */ |
d441e85d BB |
304 | vd = v->vdev_tsd; |
305 | if (vd) { | |
306 | char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; | |
307 | boolean_t reread_part = B_FALSE; | |
0d8103d9 | 308 | |
d441e85d | 309 | rw_enter(&vd->vd_lock, RW_WRITER); |
386d6a75 RN |
310 | bdh = vd->vd_bdh; |
311 | vd->vd_bdh = NULL; | |
d441e85d | 312 | |
386d6a75 RN |
313 | if (bdh) { |
314 | struct block_device *bdev = BDH_BDEV(bdh); | |
72ba4b2a | 315 | if (v->vdev_expanding && bdev != bdev_whole(bdev)) { |
bebdf52a | 316 | vdev_bdevname(bdev_whole(bdev), disk_name + 5); |
8e82ffba GW |
317 | /* |
318 | * If userland has BLKPG_RESIZE_PARTITION, | |
319 | * then it should have updated the partition | |
320 | * table already. We can detect this by | |
321 | * comparing our current physical size | |
322 | * with that of the device. If they are | |
323 | * the same, then we must not have | |
324 | * BLKPG_RESIZE_PARTITION or it failed to | |
325 | * update the partition table online. We | |
326 | * fallback to rescanning the partition | |
327 | * table from the kernel below. However, | |
328 | * if the capacity already reflects the | |
329 | * updated partition, then we skip | |
330 | * rescanning the partition table here. | |
331 | */ | |
332 | if (v->vdev_psize == bdev_capacity(bdev)) | |
333 | reread_part = B_TRUE; | |
d441e85d BB |
334 | } |
335 | ||
cfb96c77 | 336 | vdev_blkdev_put(bdh, smode, zfs_vdev_holder); |
d441e85d BB |
337 | } |
338 | ||
339 | if (reread_part) { | |
cfb96c77 | 340 | bdh = vdev_blkdev_get_by_path(disk_name, smode, |
386d6a75 RN |
341 | zfs_vdev_holder); |
342 | if (!BDH_IS_ERR(bdh)) { | |
343 | int error = | |
344 | vdev_bdev_reread_part(BDH_BDEV(bdh)); | |
cfb96c77 | 345 | vdev_blkdev_put(bdh, smode, zfs_vdev_holder); |
a25861dc BB |
346 | if (error == 0) { |
347 | timeout = MSEC2NSEC( | |
348 | zfs_vdev_open_timeout_ms * 2); | |
349 | } | |
d441e85d BB |
350 | } |
351 | } | |
352 | } else { | |
353 | vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); | |
354 | ||
355 | rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); | |
356 | rw_enter(&vd->vd_lock, RW_WRITER); | |
357 | } | |
60101509 BB |
358 | |
359 | /* | |
360 | * Devices are always opened by the path provided at configuration | |
361 | * time. This means that if the provided path is a udev by-id path | |
d441e85d | 362 | * then drives may be re-cabled without an issue. If the provided |
4e95cc99 | 363 | * path is a udev by-path path, then the physical location information |
60101509 BB |
364 | * will be preserved. This can be critical for more complicated |
365 | * configurations where drives are located in specific physical | |
d441e85d BB |
366 | * locations to maximize the systems tolerance to component failure. |
367 | * | |
4e95cc99 | 368 | * Alternatively, you can provide your own udev rule to flexibly map |
60101509 | 369 | * the drives as you see fit. It is not advised that you use the |
4e95cc99 | 370 | * /dev/[hd]d devices which may be reordered due to probing order. |
60101509 BB |
371 | * Devices in the wrong locations will be detected by the higher |
372 | * level vdev validation. | |
2d82ea8b BB |
373 | * |
374 | * The specified paths may be briefly removed and recreated in | |
375 | * response to udev events. This should be exceptionally unlikely | |
376 | * because the zpool command makes every effort to verify these paths | |
377 | * have already settled prior to reaching this point. Therefore, | |
378 | * a ENOENT failure at this point is highly likely to be transient | |
379 | * and it is reasonable to sleep and retry before giving up. In | |
380 | * practice delays have been observed to be on the order of 100ms. | |
77e2756d BB |
381 | * |
382 | * When ERESTARTSYS is returned it indicates the block device is | |
383 | * a zvol which could not be opened due to the deadlock detection | |
384 | * logic in zvol_open(). Extend the timeout and retry the open | |
385 | * subsequent attempts are expected to eventually succeed. | |
60101509 | 386 | */ |
a25861dc | 387 | hrtime_t start = gethrtime(); |
386d6a75 RN |
388 | bdh = BDH_ERR_PTR(-ENXIO); |
389 | while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { | |
cfb96c77 | 390 | bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, |
386d6a75 RN |
391 | zfs_vdev_holder); |
392 | if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { | |
55c12724 AH |
393 | /* |
394 | * There is no point of waiting since device is removed | |
395 | * explicitly | |
396 | */ | |
397 | if (v->vdev_removed) | |
398 | break; | |
399 | ||
d441e85d | 400 | schedule_timeout(MSEC_TO_TICK(10)); |
386d6a75 | 401 | } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { |
77e2756d BB |
402 | timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); |
403 | continue; | |
386d6a75 | 404 | } else if (BDH_IS_ERR(bdh)) { |
2d82ea8b BB |
405 | break; |
406 | } | |
407 | } | |
408 | ||
386d6a75 RN |
409 | if (BDH_IS_ERR(bdh)) { |
410 | int error = -BDH_PTR_ERR(bdh); | |
a25861dc BB |
411 | vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, |
412 | (u_longlong_t)(gethrtime() - start), | |
413 | (u_longlong_t)timeout); | |
386d6a75 | 414 | vd->vd_bdh = NULL; |
d441e85d BB |
415 | v->vdev_tsd = vd; |
416 | rw_exit(&vd->vd_lock); | |
417 | return (SET_ERROR(error)); | |
418 | } else { | |
386d6a75 | 419 | vd->vd_bdh = bdh; |
d441e85d BB |
420 | v->vdev_tsd = vd; |
421 | rw_exit(&vd->vd_lock); | |
60101509 BB |
422 | } |
423 | ||
386d6a75 RN |
424 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); |
425 | ||
0d8103d9 | 426 | /* Determine the physical block size */ |
386d6a75 | 427 | int physical_block_size = bdev_physical_block_size(bdev); |
6fe3498c RM |
428 | |
429 | /* Determine the logical block size */ | |
386d6a75 | 430 | int logical_block_size = bdev_logical_block_size(bdev); |
60101509 | 431 | |
60101509 BB |
432 | /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ |
433 | v->vdev_nowritecache = B_FALSE; | |
434 | ||
1b939560 | 435 | /* Set when device reports it supports TRIM. */ |
386d6a75 | 436 | v->vdev_has_trim = bdev_discard_supported(bdev); |
1b939560 BB |
437 | |
438 | /* Set when device reports it supports secure TRIM. */ | |
386d6a75 | 439 | v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); |
1b939560 | 440 | |
fb40095f | 441 | /* Inform the ZIO pipeline that we are non-rotational */ |
386d6a75 | 442 | v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); |
fb40095f | 443 | |
d441e85d | 444 | /* Physical volume size in bytes for the partition */ |
386d6a75 | 445 | *psize = bdev_capacity(bdev); |
d441e85d BB |
446 | |
447 | /* Physical volume size in bytes including possible expansion space */ | |
386d6a75 | 448 | *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); |
1bd201e7 | 449 | |
60101509 | 450 | /* Based on the minimum sector size set the block size */ |
6fe3498c RM |
451 | *physical_ashift = highbit64(MAX(physical_block_size, |
452 | SPA_MINBLOCKSIZE)) - 1; | |
453 | ||
454 | *logical_ashift = highbit64(MAX(logical_block_size, | |
455 | SPA_MINBLOCKSIZE)) - 1; | |
60101509 | 456 | |
d1d7e268 | 457 | return (0); |
60101509 BB |
458 | } |
459 | ||
460 | static void | |
461 | vdev_disk_close(vdev_t *v) | |
462 | { | |
463 | vdev_disk_t *vd = v->vdev_tsd; | |
464 | ||
0d8103d9 | 465 | if (v->vdev_reopening || vd == NULL) |
60101509 BB |
466 | return; |
467 | ||
72fd834c | 468 | if (vd->vd_bdh != NULL) |
386d6a75 | 469 | vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), |
43e8f6e3 | 470 | zfs_vdev_holder); |
60101509 | 471 | |
d441e85d | 472 | rw_destroy(&vd->vd_lock); |
d1d7e268 | 473 | kmem_free(vd, sizeof (vdev_disk_t)); |
60101509 BB |
474 | v->vdev_tsd = NULL; |
475 | } | |
476 | ||
/*
 * Hand a bio to the kernel with submit_bio(), using either its one-argument
 * form or the older form which also takes the data direction. Any return
 * value is ignored.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifndef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio_data_dir(bio), bio);
#else
	(void) submit_bio(bio);
#endif
}
486 | ||
2e407941 BB |
/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition: the
 * kernel targets ARM64 and has both CONFIG_PREEMPTION and CONFIG_BLK_CGROUP
 * enabled.
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif
496 | ||
5f264996 BB |
497 | /* |
498 | * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct |
499 | * as an argument removing the need to set it with bio_set_dev(). This | |
500 | * removes the need for all of the following compatibility code. | |
501 | */ | |
502 | #if !defined(HAVE_BIO_ALLOC_4ARG) | |
503 | ||
26a85659 BB |
504 | #ifdef HAVE_BIO_SET_DEV |
505 | #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) | |
bd0d24e0 BB |
506 | /* |
507 | * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by | |
508 | * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). | |
509 | * As a side effect the function was converted to GPL-only. Define our | |
510 | * own version when needed which uses rcu_read_lock_sched(). | |
036e846a RS |
511 | * |
512 | * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public | |
513 | * part, moving blkg_tryget into the private one. Define our own version. | |
bd0d24e0 | 514 | */ |
036e846a | 515 | #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) |
bd0d24e0 BB |
/*
 * Local replacement for the kernel's blkg_tryget(): try to take a reference
 * on blkg.
 *
 * When __ref_is_percpu() reports that the reference count is in per-CPU
 * mode, the current CPU's counter is incremented and the attempt succeeds.
 * Otherwise the shared atomic counter is incremented only if it is not
 * already zero. The whole attempt runs inside an rcu_read_lock_sched()
 * section.
 *
 * Returns true if a reference was taken, false otherwise.
 */
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
		/* The atomic counter lives in ref->data on some kernels. */
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
036e846a | 540 | #else |
bd0d24e0 BB |
541 | #define vdev_blkg_tryget(bg) blkg_tryget(bg) |
542 | #endif | |
d08b99ac | 543 | #ifdef HAVE_BIO_SET_DEV_MACRO |
26a85659 BB |
544 | /* |
545 | * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the | |
546 | * GPL-only bio_associate_blkg() symbol thus inadvertently converting | |
547 | * the entire macro. Provide a minimal version which always assigns the | |
548 | * request queue's root_blkg to the bio. | |
549 | */ | |
550 | static inline void | |
551 | vdev_bio_associate_blkg(struct bio *bio) | |
552 | { | |
d939930f CK |
553 | #if defined(HAVE_BIO_BDEV_DISK) |
554 | struct request_queue *q = bio->bi_bdev->bd_disk->queue; | |
555 | #else | |
26a85659 | 556 | struct request_queue *q = bio->bi_disk->queue; |
d939930f | 557 | #endif |
26a85659 BB |
558 | |
559 | ASSERT3P(q, !=, NULL); | |
26a85659 BB |
560 | ASSERT3P(bio->bi_blkg, ==, NULL); |
561 | ||
bd0d24e0 | 562 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) |
26a85659 BB |
563 | bio->bi_blkg = q->root_blkg; |
564 | } | |
d08b99ac | 565 | |
26a85659 | 566 | #define bio_associate_blkg vdev_bio_associate_blkg |
d08b99ac CK |
567 | #else |
/*
 * Local replacement for bio_set_dev(): point bio at bdev and associate it
 * with the root blkg of the request queue.
 *
 * BIO_REMAPPED is always cleared; BIO_THROTTLED is cleared only when the
 * bio is moved to a different block device. The root blkg is attached only
 * if the queue has one and a reference on it can be taken.
 */
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	/* Compare against the old device before it is overwritten below. */
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
587 | #define bio_set_dev vdev_bio_set_dev | |
588 | #endif | |
26a85659 BB |
589 | #endif |
590 | #else | |
591 | /* | |
592 | * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. | |
593 | */ | |
787acae0 GDN |
/*
 * Fallback bio_set_dev() for kernels which do not provide one: simply
 * record the target block device on the bio.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
26a85659 | 599 | #endif /* HAVE_BIO_SET_DEV */ |
5f264996 | 600 | #endif /* !HAVE_BIO_ALLOC_4ARG */ |
787acae0 | 601 | |
37f9dac5 | 602 | static inline void |
3b86aeb2 | 603 | vdev_submit_bio(struct bio *bio) |
37f9dac5 | 604 | { |
37f9dac5 RY |
605 | struct bio_list *bio_list = current->bio_list; |
606 | current->bio_list = NULL; | |
3b86aeb2 | 607 | vdev_submit_bio_impl(bio); |
37f9dac5 | 608 | current->bio_list = bio_list; |
37f9dac5 RY |
609 | } |
610 | ||
5f264996 BB |
611 | static inline struct bio * |
612 | vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, | |
613 | unsigned short nr_vecs) | |
614 | { | |
615 | struct bio *bio; | |
616 | ||
d1325b4f | 617 | #ifdef HAVE_BIO_ALLOC_4ARG |
5f264996 BB |
618 | bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); |
619 | #else | |
620 | bio = bio_alloc(gfp_mask, nr_vecs); | |
621 | if (likely(bio != NULL)) | |
622 | bio_set_dev(bio, bdev); | |
d1325b4f AZ |
623 | #endif |
624 | ||
5f264996 BB |
625 | return (bio); |
626 | } | |
627 | ||
06a19602 RN |
628 | static inline uint_t |
629 | vdev_bio_max_segs(struct block_device *bdev) | |
630 | { | |
631 | /* | |
632 | * Smallest of the device max segs and the tuneable max segs. Minimum | |
633 | * 4, so there's room to finish split pages if they come up. | |
634 | */ | |
635 | const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); | |
636 | const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? | |
637 | MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; | |
638 | const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); | |
639 | ||
640 | #ifdef HAVE_BIO_MAX_SEGS | |
641 | return (bio_max_segs(max_segs)); | |
642 | #else | |
643 | return (MIN(max_segs, BIO_MAX_PAGES)); | |
644 | #endif | |
645 | } | |
646 | ||
647 | static inline uint_t | |
648 | vdev_bio_max_bytes(struct block_device *bdev) | |
649 | { | |
650 | return (queue_max_sectors(bdev_get_queue(bdev)) << 9); | |
651 | } | |
652 | ||
653 | ||
654 | /* | |
655 | * Virtual block IO object (VBIO) | |
656 | * | |
657 | * Linux block IO (BIO) objects have a limit on how many data segments (pages) | |
658 | * they can hold. Depending on how they're allocated and structured, a large | |
659 | * ZIO can require more than one BIO to be submitted to the kernel, which then | |
660 | * all have to complete before we can return the completed ZIO back to ZFS. | |
661 | * | |
662 | * A VBIO is a wrapper around multiple BIOs, carrying everything needed to | |
663 | * translate a ZIO down into the kernel block layer and back again. | |
664 | * | |
665 | * Note that these are only used for data ZIOs (read/write). Meta-operations | |
666 | * (flush/trim) don't need multiple BIOs and so can just make the call | |
667 | * directly. | |
668 | */ | |
typedef struct {
	zio_t		*vbio_zio;	/* parent zio */

	struct block_device *vbio_bdev;	/* blockdev to submit bios to */

	/*
	 * abd carrying borrowed linear buf; vbio_alloc() leaves it NULL and
	 * its use is not visible in this part of the file.
	 */
	abd_t		*vbio_abd;

	uint_t		vbio_max_segs;	/* max segs per bio */

	uint_t		vbio_max_bytes;	/* max bytes per bio */
	/* logical block size mask: ~(logical block size - 1) */
	uint_t		vbio_lbs_mask;

	/* start offset of next bio, in bytes; begins at the zio's io_offset */
	uint64_t	vbio_offset;

	struct bio	*vbio_bio;	/* pointer to the current bio */
	int		vbio_flags;	/* bio flags, applied to every bio */
} vbio_t;
686 | ||
687 | static vbio_t * | |
72fd834c | 688 | vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) |
06a19602 RN |
689 | { |
690 | vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); | |
691 | ||
692 | vbio->vbio_zio = zio; | |
693 | vbio->vbio_bdev = bdev; | |
72fd834c | 694 | vbio->vbio_abd = NULL; |
06a19602 RN |
695 | vbio->vbio_max_segs = vdev_bio_max_segs(bdev); |
696 | vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); | |
697 | vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); | |
698 | vbio->vbio_offset = zio->io_offset; | |
72fd834c RN |
699 | vbio->vbio_bio = NULL; |
700 | vbio->vbio_flags = flags; | |
06a19602 RN |
701 | |
702 | return (vbio); | |
703 | } | |
704 | ||
72fd834c RN |
/*
 * Forward declaration of the bio completion callback; vbio_submit() installs
 * it as bi_end_io before the callback itself is defined.
 */
BIO_END_IO_PROTO(vbio_completion, bio, error);
706 | ||
06a19602 RN |
/*
 * Add size bytes of page, starting at offset within the page, to the vbio.
 *
 * Data goes into the vbio's current bio. When nothing more fits, a new bio
 * is allocated, the previous bio is chained to it and submitted, and the
 * new bio becomes current. The last bio created is left unsubmitted in
 * vbio->vbio_bio for the caller to finish.
 *
 * Always returns 0.
 */
static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
	struct bio *bio = vbio->vbio_bio;
	uint_t ssize;

	while (size > 0) {
		if (bio == NULL) {
			/* New BIO, allocate and set up */
			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
			    vbio->vbio_max_segs);
			VERIFY(bio);

			/* vbio_offset is in bytes; the bio wants sectors. */
			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
			bio_set_op_attrs(bio,
			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
			    WRITE : READ, vbio->vbio_flags);

			/*
			 * Chain the previous bio (if any) to the new one
			 * before submitting it, then make the new bio
			 * current.
			 */
			if (vbio->vbio_bio) {
				bio_chain(vbio->vbio_bio, bio);
				vdev_submit_bio(vbio->vbio_bio);
			}
			vbio->vbio_bio = bio;
		}

		/*
		 * Only load as much of the current page data as will fit in
		 * the space left in the BIO, respecting lbs alignment. Older
		 * kernels will error if we try to overfill the BIO, while
		 * newer ones will accept it and split the BIO. This ensures
		 * everything works on older kernels, and avoids an additional
		 * overhead on the new.
		 */
		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
		    vbio->vbio_lbs_mask);
		if (ssize > 0 &&
		    bio_add_page(bio, page, ssize, offset) == ssize) {
			/* Accepted, adjust and load any remaining. */
			size -= ssize;
			offset += ssize;
			continue;
		}

		/*
		 * No room, set up for a new BIO and loop. The next bio
		 * starts where the data already loaded into this one ends.
		 */
		vbio->vbio_offset += BIO_BI_SIZE(bio);

		/* Signal new BIO allocation wanted */
		bio = NULL;
	}

	return (0);
}
759 | ||
72fd834c RN |
760 | /* Iterator callback to submit ABD pages to the vbio. */ |
761 | static int | |
762 | vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) | |
763 | { | |
764 | vbio_t *vbio = priv; | |
765 | return (vbio_add_page(vbio, page, len, off)); | |
766 | } | |
06a19602 | 767 | |
72fd834c | 768 | /* Create some BIOs, fill them with data and submit them */ |
06a19602 | 769 | static void |
72fd834c | 770 | vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) |
06a19602 | 771 | { |
06a19602 | 772 | /* |
72fd834c RN |
773 | * We plug so we can submit the BIOs as we go and only unplug them when |
774 | * they are fully created and submitted. This is important; if we don't | |
775 | * plug, then the kernel may start executing earlier BIOs while we're | |
776 | * still creating and executing later ones, and if the device goes | |
777 | * away while that's happening, older kernels can get confused and | |
778 | * trample memory. | |
06a19602 RN |
779 | */ |
780 | struct blk_plug plug; | |
72fd834c | 781 | blk_start_plug(&plug); |
06a19602 | 782 | |
72fd834c RN |
783 | (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); |
784 | ASSERT(vbio->vbio_bio); | |
06a19602 | 785 | |
72fd834c RN |
786 | vbio->vbio_bio->bi_end_io = vbio_completion; |
787 | vbio->vbio_bio->bi_private = vbio; | |
06a19602 | 788 | |
917ff75e RN |
789 | /* |
790 | * Once submitted, vbio_bio now owns vbio (through bi_private) and we | |
791 | * can't touch it again. The bio may complete and vbio_completion() be | |
792 | * called and free the vbio before this task is run again, so we must | |
793 | * consider it invalid from this point. | |
794 | */ | |
72fd834c | 795 | vdev_submit_bio(vbio->vbio_bio); |
06a19602 | 796 | |
72fd834c | 797 | blk_finish_plug(&plug); |
06a19602 RN |
798 | } |
799 | ||
72fd834c RN |
800 | /* IO completion callback */ |
801 | BIO_END_IO_PROTO(vbio_completion, bio, error) | |
06a19602 | 802 | { |
72fd834c | 803 | vbio_t *vbio = bio->bi_private; |
06a19602 | 804 | zio_t *zio = vbio->vbio_zio; |
06a19602 | 805 | |
72fd834c | 806 | ASSERT(zio); |
06a19602 | 807 | |
72fd834c RN |
808 | /* Capture and log any errors */ |
809 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
810 | zio->io_error = BIO_END_IO_ERROR(bio); | |
811 | #else | |
812 | zio->io_error = 0; | |
813 | if (error) | |
814 | zio->io_error = -(error); | |
815 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
816 | zio->io_error = EIO; | |
817 | #endif | |
818 | ASSERT3U(zio->io_error, >=, 0); | |
06a19602 | 819 | |
72fd834c RN |
820 | if (zio->io_error) |
821 | vdev_disk_error(zio); | |
06a19602 | 822 | |
72fd834c RN |
823 | /* Return the BIO to the kernel */ |
824 | bio_put(bio); | |
06a19602 RN |
825 | |
826 | /* | |
72fd834c RN |
827 | * If we copied the ABD before issuing it, clean up and return the copy |
828 | * to the ABD, with changes if appropriate. |
06a19602 | 829 | */ |
72fd834c RN |
830 | if (vbio->vbio_abd != NULL) { |
831 | void *buf = abd_to_buf(vbio->vbio_abd); | |
832 | abd_free(vbio->vbio_abd); | |
833 | vbio->vbio_abd = NULL; | |
06a19602 | 834 | |
72fd834c RN |
835 | if (zio->io_type == ZIO_TYPE_READ) |
836 | abd_return_buf_copy(zio->io_abd, buf, zio->io_size); | |
837 | else | |
838 | abd_return_buf(zio->io_abd, buf, zio->io_size); | |
839 | } | |
06a19602 | 840 | |
72fd834c RN |
841 | /* Final cleanup */ |
842 | kmem_free(vbio, sizeof (vbio_t)); | |
06a19602 RN |
843 | |
844 | /* All done, submit for processing */ | |
845 | zio_delay_interrupt(zio); | |
06a19602 RN |
846 | } |
847 | ||
848 | /* | |
849 | * Iterator callback to count ABD pages and check their size & alignment. | |
850 | * | |
851 | * On Linux, each BIO segment can take a page pointer, and an offset+length of | |
852 | * the data within that page. A page can be arbitrarily large ("compound" | |
853 | * pages) but we still have to ensure the data portion is correctly sized and | |
854 | * aligned to the logical block size, to ensure that if the kernel wants to | |
855 | * split the BIO, the two halves will still be properly aligned. | |
856 | */ | |
857 | typedef struct { | |
858 | uint_t bmask; | |
859 | uint_t npages; | |
860 | uint_t end; | |
861 | } vdev_disk_check_pages_t; | |
862 | ||
863 | static int | |
864 | vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) | |
865 | { | |
866 | vdev_disk_check_pages_t *s = priv; | |
867 | ||
868 | /* | |
869 | * If we didn't finish on a block size boundary last time, then there | |
870 | * would be a gap if we tried to use this ABD as-is, so abort. | |
871 | */ | |
872 | if (s->end != 0) | |
873 | return (1); | |
874 | ||
875 | /* | |
876 | * Note if we're taking less than a full block, so we can check it | |
877 | * above on the next call. | |
878 | */ | |
879 | s->end = len & s->bmask; | |
880 | ||
881 | /* All blocks after the first must start on a block size boundary. */ | |
882 | if (s->npages != 0 && (off & s->bmask) != 0) | |
883 | return (1); | |
884 | ||
885 | s->npages++; | |
886 | return (0); | |
887 | } | |
888 | ||
889 | /* | |
890 | * Check if we can submit the pages in this ABD to the kernel as-is. Returns | |
891 | * B_TRUE if so, or B_FALSE if it can't be submitted like this. | |
892 | */ | |
893 | static boolean_t | |
894 | vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) | |
895 | { | |
896 | vdev_disk_check_pages_t s = { | |
897 | .bmask = bdev_logical_block_size(bdev)-1, | |
898 | .npages = 0, | |
899 | .end = 0, | |
900 | }; | |
901 | ||
902 | if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) | |
903 | return (B_FALSE); | |
904 | ||
905 | return (B_TRUE); | |
906 | } | |
907 | ||
06a19602 RN |
908 | static int |
909 | vdev_disk_io_rw(zio_t *zio) | |
910 | { | |
911 | vdev_t *v = zio->io_vd; | |
912 | vdev_disk_t *vd = v->vdev_tsd; | |
913 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); | |
914 | int flags = 0; | |
915 | ||
916 | /* | |
917 | * Accessing outside the block device is never allowed. | |
918 | */ | |
919 | if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { | |
920 | vdev_dbgmsg(zio->io_vd, | |
921 | "Illegal access %llu size %llu, device size %llu", | |
922 | (u_longlong_t)zio->io_offset, | |
923 | (u_longlong_t)zio->io_size, | |
924 | (u_longlong_t)i_size_read(bdev->bd_inode)); | |
925 | return (SET_ERROR(EIO)); | |
926 | } | |
927 | ||
928 | if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && | |
929 | v->vdev_failfast == B_TRUE) { | |
930 | bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, | |
931 | zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); | |
932 | } | |
933 | ||
934 | /* | |
935 | * Check alignment of the incoming ABD. If any part of it would require | |
936 | * submitting a page that is not aligned to the logical block size, | |
937 | * then we take a copy into a linear buffer and submit that instead. | |
938 | * This should be impossible on a 512b LBS, and fairly rare on 4K, | |
939 | * usually requiring abnormally-small data blocks (eg gang blocks) | |
940 | * mixed into the same ABD as larger ones (eg aggregated). | |
941 | */ | |
942 | abd_t *abd = zio->io_abd; | |
943 | if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { | |
944 | void *buf; | |
945 | if (zio->io_type == ZIO_TYPE_READ) | |
946 | buf = abd_borrow_buf(zio->io_abd, zio->io_size); | |
947 | else | |
948 | buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); | |
949 | ||
950 | /* | |
951 | * Wrap the copy in an abd_t, so we can use the same iterators | |
952 | * to count and fill the vbio later. | |
953 | */ | |
954 | abd = abd_get_from_buf(buf, zio->io_size); | |
955 | ||
956 | /* | |
957 | * False here would mean the borrowed copy has an invalid | |
958 | * alignment too, which would mean we've somehow been passed a | |
959 | * linear ABD with an interior page that has a non-zero offset | |
960 | * or a size not a multiple of PAGE_SIZE. This is not possible. | |
961 | * It would mean either zio_buf_alloc() or its underlying | |
962 | * allocators have done something extremely strange, or our | |
963 | * math in vdev_disk_check_pages() is wrong. In either case, | |
964 | * something is seriously wrong and it's not safe to continue. | |
965 | */ | |
966 | VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); | |
967 | } | |
968 | ||
969 | /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ | |
72fd834c | 970 | vbio_t *vbio = vbio_alloc(zio, bdev, flags); |
06a19602 RN |
971 | if (abd != zio->io_abd) |
972 | vbio->vbio_abd = abd; | |
973 | ||
72fd834c RN |
974 | /* Fill it with data pages and submit it to the kernel */ |
975 | vbio_submit(vbio, abd, zio->io_size); | |
06a19602 RN |
976 | return (0); |
977 | } | |
978 | ||
f3b85d70 RN |
979 | /* ========== */ |
980 | ||
981 | /* | |
06a19602 RN |
982 | * This is the classic, battle-tested BIO submission code. Until we're totally |
983 | * sure that the new code is safe and correct in all cases, this will remain | |
984 | * available and can be enabled by setting zfs_vdev_disk_classic=1 at module | |
985 | * load time. | |
f3b85d70 RN |
986 | * |
987 | * These functions have been renamed to vdev_classic_* to make it clear what | |
988 | * they belong to, but their implementations are unchanged. | |
989 | */ | |
990 | ||
991 | /* | |
992 | * Request state tracking the bio's issued on behalf of a single zio. | |
993 | */ | |
994 | typedef struct dio_request { | |
995 | zio_t *dr_zio; /* Parent ZIO */ | |
996 | atomic_t dr_ref; /* References */ | |
997 | int dr_error; /* Bio error */ | |
998 | int dr_bio_count; /* Count of bio's */ | |
999 | struct bio *dr_bio[]; /* Attached bio's */ | |
1000 | } dio_request_t; | |
1001 | ||
1002 | static dio_request_t * | |
1003 | vdev_classic_dio_alloc(int bio_count) | |
1004 | { | |
1005 | dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + | |
1006 | sizeof (struct bio *) * bio_count, KM_SLEEP); | |
1007 | atomic_set(&dr->dr_ref, 0); | |
1008 | dr->dr_bio_count = bio_count; | |
1009 | dr->dr_error = 0; | |
1010 | ||
1011 | for (int i = 0; i < dr->dr_bio_count; i++) | |
1012 | dr->dr_bio[i] = NULL; | |
1013 | ||
1014 | return (dr); | |
1015 | } | |
1016 | ||
1017 | static void | |
1018 | vdev_classic_dio_free(dio_request_t *dr) | |
1019 | { | |
1020 | int i; | |
1021 | ||
1022 | for (i = 0; i < dr->dr_bio_count; i++) | |
1023 | if (dr->dr_bio[i]) | |
1024 | bio_put(dr->dr_bio[i]); | |
1025 | ||
1026 | kmem_free(dr, sizeof (dio_request_t) + | |
1027 | sizeof (struct bio *) * dr->dr_bio_count); | |
1028 | } | |
1029 | ||
1030 | static void | |
1031 | vdev_classic_dio_get(dio_request_t *dr) | |
1032 | { | |
1033 | atomic_inc(&dr->dr_ref); | |
1034 | } | |
1035 | ||
1036 | static void | |
1037 | vdev_classic_dio_put(dio_request_t *dr) | |
1038 | { | |
1039 | int rc = atomic_dec_return(&dr->dr_ref); | |
1040 | ||
1041 | /* | |
1042 | * Free the dio_request when the last reference is dropped and | |
1043 | * call zio_delay_interrupt() exactly once with the correct zio | |
1044 | */ | |
1045 | if (rc == 0) { | |
1046 | zio_t *zio = dr->dr_zio; | |
1047 | int error = dr->dr_error; | |
1048 | ||
1049 | vdev_classic_dio_free(dr); | |
1050 | ||
1051 | if (zio) { | |
1052 | zio->io_error = error; | |
1053 | ASSERT3S(zio->io_error, >=, 0); | |
1054 | if (zio->io_error) | |
1055 | vdev_disk_error(zio); | |
1056 | ||
1057 | zio_delay_interrupt(zio); | |
1058 | } | |
1059 | } | |
1060 | } | |
1061 | ||
1062 | BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) | |
1063 | { | |
1064 | dio_request_t *dr = bio->bi_private; | |
1065 | ||
1066 | if (dr->dr_error == 0) { | |
1067 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
1068 | dr->dr_error = BIO_END_IO_ERROR(bio); | |
1069 | #else | |
1070 | if (error) | |
1071 | dr->dr_error = -(error); | |
1072 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
1073 | dr->dr_error = EIO; | |
1074 | #endif | |
1075 | } | |
1076 | ||
1077 | /* Drop reference acquired by vdev_classic_physio */ | |
1078 | vdev_classic_dio_put(dr); | |
1079 | } | |
1080 | ||
5f264996 | 1081 | static inline unsigned int |
f3b85d70 | 1082 | vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) |
5f264996 BB |
1083 | { |
1084 | unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, | |
1085 | bio_size, abd_offset); | |
1086 | ||
1087 | #ifdef HAVE_BIO_MAX_SEGS | |
1088 | return (bio_max_segs(nr_segs)); | |
1089 | #else | |
1090 | return (MIN(nr_segs, BIO_MAX_PAGES)); | |
1091 | #endif | |
1092 | } | |
1093 | ||
60101509 | 1094 | static int |
867178ae | 1095 | vdev_classic_physio(zio_t *zio) |
60101509 | 1096 | { |
867178ae RN |
1097 | vdev_t *v = zio->io_vd; |
1098 | vdev_disk_t *vd = v->vdev_tsd; | |
1099 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); | |
1100 | size_t io_size = zio->io_size; | |
1101 | uint64_t io_offset = zio->io_offset; | |
1102 | int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; | |
1103 | int flags = 0; | |
1104 | ||
d1d7e268 | 1105 | dio_request_t *dr; |
b0be93e8 | 1106 | uint64_t abd_offset; |
60101509 | 1107 | uint64_t bio_offset; |
f8c0d7e1 MA |
1108 | int bio_size; |
1109 | int bio_count = 16; | |
1110 | int error = 0; | |
e8ac4557 | 1111 | struct blk_plug plug; |
5f264996 | 1112 | unsigned short nr_vecs; |
066e8252 | 1113 | |
d441e85d BB |
1114 | /* |
1115 | * Accessing outside the block device is never allowed. | |
1116 | */ | |
1117 | if (io_offset + io_size > bdev->bd_inode->i_size) { | |
1118 | vdev_dbgmsg(zio->io_vd, | |
1119 | "Illegal access %llu size %llu, device size %llu", | |
5dbf6c5a AZ |
1120 | (u_longlong_t)io_offset, |
1121 | (u_longlong_t)io_size, | |
1122 | (u_longlong_t)i_size_read(bdev->bd_inode)); | |
d441e85d BB |
1123 | return (SET_ERROR(EIO)); |
1124 | } | |
e06be586 | 1125 | |
60101509 | 1126 | retry: |
f3b85d70 | 1127 | dr = vdev_classic_dio_alloc(bio_count); |
60101509 | 1128 | |
f1100863 | 1129 | if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && |
16f0fdad MZ |
1130 | zio->io_vd->vdev_failfast == B_TRUE) { |
1131 | bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, | |
1132 | zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); | |
1133 | } | |
2959d94a | 1134 | |
60101509 | 1135 | dr->dr_zio = zio; |
60101509 | 1136 | |
60101509 | 1137 | /* |
f8c0d7e1 MA |
1138 | * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which |
1139 | * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio | |
1140 | * can cover at least 128KB and at most 1MB. When the required number | |
1141 | * of iovec's exceeds this, we are forced to break the IO in multiple | |
1142 | * bio's and wait for them all to complete. This is likely if the | |
1143 | * recordsize property is increased beyond 1MB. The default | |
1144 | * bio_count=16 should typically accommodate the maximum-size zio of | |
1145 | * 16MB. | |
60101509 | 1146 | */ |
a6255b7f | 1147 | |
b0be93e8 IH |
1148 | abd_offset = 0; |
1149 | bio_offset = io_offset; | |
f8c0d7e1 MA |
1150 | bio_size = io_size; |
1151 | for (int i = 0; i <= dr->dr_bio_count; i++) { | |
60101509 BB |
1152 | |
1153 | /* Finished constructing bio's for given buffer */ | |
1154 | if (bio_size <= 0) | |
1155 | break; | |
1156 | ||
1157 | /* | |
f8c0d7e1 MA |
1158 | * If additional bio's are required, we have to retry, but |
1159 | * this should be rare - see the comment above. | |
60101509 BB |
1160 | */ |
1161 | if (dr->dr_bio_count == i) { | |
f3b85d70 | 1162 | vdev_classic_dio_free(dr); |
60101509 | 1163 | bio_count *= 2; |
60101509 BB |
1164 | goto retry; |
1165 | } | |
1166 | ||
f3b85d70 | 1167 | nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); |
5f264996 | 1168 | dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); |
1086f542 | 1169 | if (unlikely(dr->dr_bio[i] == NULL)) { |
f3b85d70 | 1170 | vdev_classic_dio_free(dr); |
ecb2b7dc | 1171 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
1172 | } |
1173 | ||
f3b85d70 RN |
1174 | /* Matching put called by vdev_classic_physio_completion */ |
1175 | vdev_classic_dio_get(dr); | |
60101509 | 1176 | |
d4541210 | 1177 | BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; |
f3b85d70 | 1178 | dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; |
60101509 | 1179 | dr->dr_bio[i]->bi_private = dr; |
3b86aeb2 | 1180 | bio_set_op_attrs(dr->dr_bio[i], rw, flags); |
60101509 BB |
1181 | |
1182 | /* Remaining size is returned to become the new size */ | |
fb822260 | 1183 | bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, |
02730c33 | 1184 | bio_size, abd_offset); |
60101509 BB |
1185 | |
1186 | /* Advance in buffer and construct another bio if needed */ | |
b0be93e8 | 1187 | abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
d4541210 | 1188 | bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
60101509 BB |
1189 | } |
1190 | ||
37f9dac5 | 1191 | /* Extra reference to protect dio_request during vdev_submit_bio */ |
f3b85d70 | 1192 | vdev_classic_dio_get(dr); |
60101509 | 1193 | |
e8ac4557 IH |
1194 | if (dr->dr_bio_count > 1) |
1195 | blk_start_plug(&plug); | |
e8ac4557 | 1196 | |
60101509 | 1197 | /* Submit all bio's associated with this dio */ |
f8c0d7e1 | 1198 | for (int i = 0; i < dr->dr_bio_count; i++) { |
60101509 | 1199 | if (dr->dr_bio[i]) |
3b86aeb2 | 1200 | vdev_submit_bio(dr->dr_bio[i]); |
f8c0d7e1 | 1201 | } |
60101509 | 1202 | |
e8ac4557 IH |
1203 | if (dr->dr_bio_count > 1) |
1204 | blk_finish_plug(&plug); | |
e8ac4557 | 1205 | |
f3b85d70 | 1206 | vdev_classic_dio_put(dr); |
60101509 | 1207 | |
d1d7e268 | 1208 | return (error); |
60101509 BB |
1209 | } |
1210 | ||
f3b85d70 RN |
1211 | /* ========== */ |
1212 | ||
36ba27e9 | 1213 | BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) |
60101509 BB |
1214 | { |
1215 | zio_t *zio = bio->bi_private; | |
784a7fe5 | 1216 | #ifdef HAVE_1ARG_BIO_END_IO_T |
36ba27e9 BB |
1217 | zio->io_error = BIO_END_IO_ERROR(bio); |
1218 | #else | |
1219 | zio->io_error = -error; | |
784a7fe5 | 1220 | #endif |
60101509 | 1221 | |
36ba27e9 | 1222 | if (zio->io_error && (zio->io_error == EOPNOTSUPP)) |
60101509 BB |
1223 | zio->io_vd->vdev_nowritecache = B_TRUE; |
1224 | ||
1225 | bio_put(bio); | |
d148e951 BB |
1226 | ASSERT3S(zio->io_error, >=, 0); |
1227 | if (zio->io_error) | |
1228 | vdev_disk_error(zio); | |
60101509 | 1229 | zio_interrupt(zio); |
60101509 BB |
1230 | } |
1231 | ||
1232 | static int | |
1233 | vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) | |
1234 | { | |
1235 | struct request_queue *q; | |
1236 | struct bio *bio; | |
1237 | ||
1238 | q = bdev_get_queue(bdev); | |
1239 | if (!q) | |
ecb2b7dc | 1240 | return (SET_ERROR(ENXIO)); |
60101509 | 1241 | |
5f264996 | 1242 | bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); |
29b763cd | 1243 | if (unlikely(bio == NULL)) |
ecb2b7dc | 1244 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
1245 | |
1246 | bio->bi_end_io = vdev_disk_io_flush_completion; | |
1247 | bio->bi_private = zio; | |
a5e046ea | 1248 | bio_set_flush(bio); |
3b86aeb2 | 1249 | vdev_submit_bio(bio); |
cecb7487 | 1250 | invalidate_bdev(bdev); |
60101509 | 1251 | |
d1d7e268 | 1252 | return (0); |
60101509 | 1253 | } |
60101509 | 1254 | |
06e25f9c US |
1255 | BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) |
1256 | { | |
1257 | zio_t *zio = bio->bi_private; | |
1258 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
1259 | zio->io_error = BIO_END_IO_ERROR(bio); | |
1260 | #else | |
1261 | zio->io_error = -error; | |
1262 | #endif | |
1263 | bio_put(bio); | |
1264 | if (zio->io_error) | |
1265 | vdev_disk_error(zio); | |
1266 | zio_interrupt(zio); | |
1267 | } | |
1268 | ||
ba9f587a RN |
1269 | /* |
1270 | * Wrappers for the different secure erase and discard APIs. We use async | |
1271 | * when available; in this case, *biop is set to the last bio in the chain. | |
1272 | */ | |
a12a5cb5 | 1273 | static int |
ba9f587a RN |
1274 | vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, |
1275 | sector_t nsect, struct bio **biop) | |
a12a5cb5 | 1276 | { |
ba9f587a RN |
1277 | *biop = NULL; |
1278 | int error; | |
a12a5cb5 | 1279 | |
ba9f587a RN |
1280 | #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) |
1281 | error = blkdev_issue_secure_erase(BDH_BDEV(bdh), | |
1282 | sector, nsect, GFP_NOFS); | |
1283 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) | |
1284 | error = __blkdev_issue_discard(BDH_BDEV(bdh), | |
1285 | sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); | |
1286 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) | |
1287 | error = blkdev_issue_discard(BDH_BDEV(bdh), | |
1288 | sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); | |
06e25f9c | 1289 | #else |
ba9f587a | 1290 | #error "unsupported kernel" |
06e25f9c | 1291 | #endif |
ba9f587a RN |
1292 | |
1293 | return (error); | |
06e25f9c | 1294 | } |
ba9f587a RN |
1295 | |
1296 | static int | |
1297 | vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, | |
1298 | sector_t nsect, struct bio **biop) | |
1299 | { | |
1300 | *biop = NULL; | |
1301 | int error; | |
1302 | ||
1303 | #if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) | |
1304 | error = __blkdev_issue_discard(BDH_BDEV(bdh), | |
1305 | sector, nsect, GFP_NOFS, 0, biop); | |
1306 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) | |
1307 | error = __blkdev_issue_discard(BDH_BDEV(bdh), | |
1308 | sector, nsect, GFP_NOFS, biop); | |
1309 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) | |
1310 | error = blkdev_issue_discard(BDH_BDEV(bdh), | |
1311 | sector, nsect, GFP_NOFS, 0); | |
1312 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) | |
1313 | error = blkdev_issue_discard(BDH_BDEV(bdh), | |
1314 | sector, nsect, GFP_NOFS); | |
1315 | #else | |
1316 | #error "unsupported kernel" | |
06e25f9c US |
1317 | #endif |
1318 | ||
ba9f587a RN |
1319 | return (error); |
1320 | } | |
1321 | ||
1322 | /* | |
1323 | * Entry point for TRIM ops. This calls the right wrapper for secure erase or | |
1324 | * discard, and then does the appropriate finishing work for error vs success | |
1325 | * and async vs sync. | |
1326 | */ | |
06e25f9c US |
1327 | static int |
1328 | vdev_disk_io_trim(zio_t *zio) | |
1329 | { | |
ba9f587a RN |
1330 | int error; |
1331 | struct bio *bio; | |
1332 | ||
1333 | zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; | |
1334 | sector_t sector = zio->io_offset >> 9; | |
1335 | sector_t nsects = zio->io_size >> 9; | |
1336 | ||
1337 | if (zio->io_trim_flags & ZIO_TRIM_SECURE) | |
1338 | error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); | |
1339 | else | |
1340 | error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); | |
1341 | ||
1342 | if (error != 0) | |
1343 | return (SET_ERROR(-error)); | |
1344 | ||
1345 | if (bio == NULL) { | |
1346 | /* | |
1347 | * This was a synchronous op that completed successfully, so | |
1348 | * return it to ZFS immediately. | |
1349 | */ | |
1350 | zio_interrupt(zio); | |
1351 | } else { | |
1352 | /* | |
1353 | * This was an asynchronous op; set up completion callback and | |
1354 | * issue it. | |
1355 | */ | |
1356 | bio->bi_private = zio; | |
1357 | bio->bi_end_io = vdev_disk_discard_end_io; | |
1358 | vdev_submit_bio(bio); | |
06e25f9c | 1359 | } |
ba9f587a RN |
1360 | |
1361 | return (0); | |
a12a5cb5 BB |
1362 | } |
1363 | ||
c4a13ba4 RN |
1364 | int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; |
1365 | ||
98b25418 | 1366 | static void |
60101509 BB |
1367 | vdev_disk_io_start(zio_t *zio) |
1368 | { | |
1369 | vdev_t *v = zio->io_vd; | |
1370 | vdev_disk_t *vd = v->vdev_tsd; | |
867178ae | 1371 | int error; |
60101509 | 1372 | |
d441e85d BB |
1373 | /* |
1374 | * If the vdev is closed, it's likely in the REMOVED or FAULTED state. | |
1375 | * Nothing to be done here but return failure. | |
1376 | */ | |
1377 | if (vd == NULL) { | |
1378 | zio->io_error = ENXIO; | |
1379 | zio_interrupt(zio); | |
1380 | return; | |
1381 | } | |
1382 | ||
1383 | rw_enter(&vd->vd_lock, RW_READER); | |
1384 | ||
1385 | /* | |
1386 | * If the vdev is closed, it's likely due to a failed reopen and is | |
1387 | * in the UNAVAIL state. Nothing to be done here but return failure. | |
1388 | */ | |
386d6a75 | 1389 | if (vd->vd_bdh == NULL) { |
d441e85d BB |
1390 | rw_exit(&vd->vd_lock); |
1391 | zio->io_error = ENXIO; | |
1392 | zio_interrupt(zio); | |
1393 | return; | |
1394 | } | |
1395 | ||
60101509 BB |
1396 | switch (zio->io_type) { |
1397 | case ZIO_TYPE_IOCTL: | |
1398 | ||
1399 | if (!vdev_readable(v)) { | |
d441e85d | 1400 | rw_exit(&vd->vd_lock); |
2e528b49 | 1401 | zio->io_error = SET_ERROR(ENXIO); |
98b25418 GW |
1402 | zio_interrupt(zio); |
1403 | return; | |
60101509 BB |
1404 | } |
1405 | ||
1406 | switch (zio->io_cmd) { | |
1407 | case DKIOCFLUSHWRITECACHE: | |
1408 | ||
1409 | if (zfs_nocacheflush) | |
1410 | break; | |
1411 | ||
1412 | if (v->vdev_nowritecache) { | |
2e528b49 | 1413 | zio->io_error = SET_ERROR(ENOTSUP); |
60101509 BB |
1414 | break; |
1415 | } | |
1416 | ||
386d6a75 | 1417 | error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); |
d441e85d BB |
1418 | if (error == 0) { |
1419 | rw_exit(&vd->vd_lock); | |
98b25418 | 1420 | return; |
d441e85d | 1421 | } |
60101509 BB |
1422 | |
1423 | zio->io_error = error; | |
60101509 BB |
1424 | |
1425 | break; | |
1426 | ||
1427 | default: | |
2e528b49 | 1428 | zio->io_error = SET_ERROR(ENOTSUP); |
60101509 BB |
1429 | } |
1430 | ||
d441e85d | 1431 | rw_exit(&vd->vd_lock); |
98b25418 GW |
1432 | zio_execute(zio); |
1433 | return; | |
60101509 | 1434 | |
1b939560 | 1435 | case ZIO_TYPE_TRIM: |
ba9f587a | 1436 | error = vdev_disk_io_trim(zio); |
1b939560 | 1437 | rw_exit(&vd->vd_lock); |
ba9f587a RN |
1438 | if (error) { |
1439 | zio->io_error = error; | |
1440 | zio_execute(zio); | |
1441 | } | |
1b939560 BB |
1442 | return; |
1443 | ||
867178ae RN |
1444 | case ZIO_TYPE_READ: |
1445 | case ZIO_TYPE_WRITE: | |
1446 | zio->io_target_timestamp = zio_handle_io_delay(zio); | |
c4a13ba4 | 1447 | error = vdev_disk_io_rw_fn(zio); |
d441e85d | 1448 | rw_exit(&vd->vd_lock); |
867178ae RN |
1449 | if (error) { |
1450 | zio->io_error = error; | |
1451 | zio_interrupt(zio); | |
1452 | } | |
98b25418 | 1453 | return; |
60101509 | 1454 | |
867178ae RN |
1455 | default: |
1456 | /* | |
1457 | * Getting here means our parent vdev has made a very strange | |
1458 | * request of us, and shouldn't happen. Assert here to force a | |
1459 | * crash in dev builds, but in production return the IO | |
1460 | * unhandled. The pool will likely suspend anyway but that's | |
1461 | * nicer than crashing the kernel. | |
1462 | */ | |
1463 | ASSERT3S(zio->io_type, ==, -1); | |
d441e85d | 1464 | |
867178ae RN |
1465 | rw_exit(&vd->vd_lock); |
1466 | zio->io_error = SET_ERROR(ENOTSUP); | |
98b25418 GW |
1467 | zio_interrupt(zio); |
1468 | return; | |
60101509 | 1469 | } |
867178ae RN |
1470 | |
1471 | __builtin_unreachable(); | |
60101509 BB |
1472 | } |
1473 | ||
1474 | static void | |
1475 | vdev_disk_io_done(zio_t *zio) | |
1476 | { | |
1477 | /* | |
1478 | * If the device returned EIO, we revalidate the media. If it is | |
1479 | * determined the media has changed this triggers the asynchronous | |
1480 | * removal of the device from the configuration. | |
1481 | */ | |
1482 | if (zio->io_error == EIO) { | |
d1d7e268 | 1483 | vdev_t *v = zio->io_vd; |
60101509 BB |
1484 | vdev_disk_t *vd = v->vdev_tsd; |
1485 | ||
386d6a75 RN |
1486 | if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { |
1487 | invalidate_bdev(BDH_BDEV(vd->vd_bdh)); | |
60101509 BB |
1488 | v->vdev_remove_wanted = B_TRUE; |
1489 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
1490 | } | |
1491 | } | |
1492 | } | |
1493 | ||
1494 | static void | |
1495 | vdev_disk_hold(vdev_t *vd) | |
1496 | { | |
1497 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
1498 | ||
1499 | /* We must have a pathname, and it must be absolute. */ | |
1500 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
1501 | return; | |
1502 | ||
1503 | /* | |
1504 | * Only prefetch path and devid info if the device has | |
1505 | * never been opened. | |
1506 | */ | |
1507 | if (vd->vdev_tsd != NULL) | |
1508 | return; | |
1509 | ||
60101509 BB |
1510 | } |
1511 | ||
1512 | static void | |
1513 | vdev_disk_rele(vdev_t *vd) | |
1514 | { | |
1515 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
1516 | ||
1517 | /* XXX: Implement me as a vnode rele for the device */ | |
1518 | } | |
1519 | ||
df2169d1 RN |
1520 | /* |
1521 | * BIO submission method. See comment above about vdev_classic. | |
1522 | * Set zfs_vdev_disk_classic=0 for new, =1 for classic | |
1523 | */ | |
1524 | static uint_t zfs_vdev_disk_classic = 0; /* default new */ | |
1525 | ||
1526 | /* Set submission function from module parameter */ | |
1527 | static int | |
1528 | vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) | |
1529 | { | |
1530 | int err = param_set_uint(buf, kp); | |
1531 | if (err < 0) | |
1532 | return (SET_ERROR(err)); | |
1533 | ||
1534 | vdev_disk_io_rw_fn = | |
1535 | zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; | |
1536 | ||
1537 | printk(KERN_INFO "ZFS: forcing %s BIO submission\n", | |
1538 | zfs_vdev_disk_classic ? "classic" : "new"); | |
1539 | ||
1540 | return (0); | |
1541 | } | |
1542 | ||
c4a13ba4 RN |
1543 | /* |
1544 | * At first vdev use, set the submission function from the default value if | |
1545 | * it hasn't been set already. | |
1546 | */ | |
1547 | static int | |
1548 | vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) | |
1549 | { | |
1550 | (void) spa; | |
1551 | (void) nv; | |
1552 | (void) tsd; | |
1553 | ||
1554 | if (vdev_disk_io_rw_fn == NULL) | |
df2169d1 RN |
1555 | vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? |
1556 | vdev_classic_physio : vdev_disk_io_rw; | |
c4a13ba4 RN |
1557 | |
1558 | return (0); | |
1559 | } | |
1560 | ||
60101509 | 1561 | vdev_ops_t vdev_disk_ops = { |
c4a13ba4 | 1562 | .vdev_op_init = vdev_disk_init, |
b2255edc | 1563 | .vdev_op_fini = NULL, |
a64f8276 I |
1564 | .vdev_op_open = vdev_disk_open, |
1565 | .vdev_op_close = vdev_disk_close, | |
1566 | .vdev_op_asize = vdev_default_asize, | |
b2255edc BB |
1567 | .vdev_op_min_asize = vdev_default_min_asize, |
1568 | .vdev_op_min_alloc = NULL, | |
a64f8276 I |
1569 | .vdev_op_io_start = vdev_disk_io_start, |
1570 | .vdev_op_io_done = vdev_disk_io_done, | |
1571 | .vdev_op_state_change = NULL, | |
1572 | .vdev_op_need_resilver = NULL, | |
1573 | .vdev_op_hold = vdev_disk_hold, | |
1574 | .vdev_op_rele = vdev_disk_rele, | |
1575 | .vdev_op_remap = NULL, | |
1576 | .vdev_op_xlate = vdev_default_xlate, | |
b2255edc BB |
1577 | .vdev_op_rebuild_asize = NULL, |
1578 | .vdev_op_metaslab_init = NULL, | |
1579 | .vdev_op_config_generate = NULL, | |
1580 | .vdev_op_nparity = NULL, | |
1581 | .vdev_op_ndisks = NULL, | |
a64f8276 | 1582 | .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ |
55c12724 AH |
1583 | .vdev_op_leaf = B_TRUE, /* leaf vdev */ |
1584 | .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post | |
60101509 BB |
1585 | }; |
1586 | ||
9e17e6f2 BB |
1587 | /* |
1588 | * The zfs_vdev_scheduler module option has been deprecated. Setting this | |
1589 | * value no longer has any effect. It has not yet been entirely removed | |
1590 | * to allow the module to be loaded if this option is specified in the | |
1591 | * /etc/modprobe.d/zfs.conf file. The following warning will be logged. | |
1592 | */ | |
1593 | static int | |
1594 | param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) | |
1595 | { | |
1596 | int error = param_set_charp(val, kp); | |
1597 | if (error == 0) { | |
1598 | printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " | |
1599 | "is not supported.\n"); | |
1600 | } | |
1601 | ||
1602 | return (error); | |
1603 | } | |
1604 | ||
18168da7 | 1605 | static const char *zfs_vdev_scheduler = "unused"; |
e771de53 BB |
1606 | module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, |
1607 | param_get_charp, &zfs_vdev_scheduler, 0644); | |
c409e464 | 1608 | MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); |
6fe3498c RM |
1609 | |
1610 | int | |
1611 | param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1612 | { | |
ab8d9c17 | 1613 | uint_t val; |
6fe3498c RM |
1614 | int error; |
1615 | ||
ab8d9c17 | 1616 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1617 | if (error < 0) |
1618 | return (SET_ERROR(error)); | |
1619 | ||
1620 | if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) | |
1621 | return (SET_ERROR(-EINVAL)); | |
1622 | ||
ab8d9c17 | 1623 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1624 | if (error < 0) |
1625 | return (SET_ERROR(error)); | |
1626 | ||
1627 | return (0); | |
1628 | } | |
1629 | ||
1630 | int | |
1631 | param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1632 | { | |
ab8d9c17 | 1633 | uint_t val; |
6fe3498c RM |
1634 | int error; |
1635 | ||
ab8d9c17 | 1636 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1637 | if (error < 0) |
1638 | return (SET_ERROR(error)); | |
1639 | ||
1640 | if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) | |
1641 | return (SET_ERROR(-EINVAL)); | |
1642 | ||
ab8d9c17 | 1643 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1644 | if (error < 0) |
1645 | return (SET_ERROR(error)); | |
1646 | ||
1647 | return (0); | |
1648 | } | |
f66ffe68 SD |
1649 | |
1650 | ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, | |
1651 | "Timeout before determining that a device is missing"); | |
16f0fdad MZ |
1652 | |
1653 | ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, | |
1654 | "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); | |
06a19602 RN |
1655 | |
1656 | ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, | |
1657 | "Maximum number of data segments to add to an IO request (min 4)"); | |
df2169d1 RN |
1658 | |
1659 | ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, | |
1660 | vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, | |
1661 | "Use classic BIO submission method"); |