]>
Commit | Line | Data |
---|---|---|
60101509 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
60101509 BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
1eacf2b3 | 26 | * Copyright (c) 2012, 2019 by Delphix. All rights reserved. |
06a19602 | 27 | * Copyright (c) 2023, 2024, Klara Inc. |
60101509 BB |
28 | */ |
29 | ||
30 | #include <sys/zfs_context.h> | |
e771de53 | 31 | #include <sys/spa_impl.h> |
60101509 BB |
32 | #include <sys/vdev_disk.h> |
33 | #include <sys/vdev_impl.h> | |
1b939560 | 34 | #include <sys/vdev_trim.h> |
a6255b7f | 35 | #include <sys/abd.h> |
60101509 BB |
36 | #include <sys/fs/zfs.h> |
37 | #include <sys/zio.h> | |
8e82ffba | 38 | #include <linux/blkpg.h> |
74d42600 | 39 | #include <linux/msdos_fs.h> |
05805494 | 40 | #include <linux/vfs_compat.h> |
1e767532 CK |
41 | #ifdef HAVE_LINUX_BLK_CGROUP_HEADER |
42 | #include <linux/blk-cgroup.h> | |
43 | #endif | |
60101509 | 44 | |
386d6a75 RN |
45 | /* |
46 | * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying | |
47 | * block_device. Since it carries the block_device inside, it's convenient to | |
e3120f73 RN |
48 | * just use the handle as a proxy. |
49 | * | |
50 | * Linux 6.9.x uses a file for the same purpose. | |
51 | * | |
52 | * For pre-6.8, we just emulate this with a cast, since we don't need any of | |
53 | * the other fields inside the handle. | |
386d6a75 | 54 | */ |
e3120f73 | 55 | #if defined(HAVE_BDEV_OPEN_BY_PATH) |
386d6a75 RN |
56 | typedef struct bdev_handle zfs_bdev_handle_t; |
57 | #define BDH_BDEV(bdh) ((bdh)->bdev) | |
58 | #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) | |
59 | #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) | |
60 | #define BDH_ERR_PTR(err) (ERR_PTR(err)) | |
e3120f73 RN |
61 | #elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH) |
62 | typedef struct file zfs_bdev_handle_t; | |
63 | #define BDH_BDEV(bdh) (file_bdev(bdh)) | |
64 | #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) | |
65 | #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) | |
66 | #define BDH_ERR_PTR(err) (ERR_PTR(err)) | |
386d6a75 RN |
67 | #else |
68 | typedef void zfs_bdev_handle_t; | |
69 | #define BDH_BDEV(bdh) ((struct block_device *)bdh) | |
70 | #define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh))) | |
71 | #define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh))) | |
72 | #define BDH_ERR_PTR(err) (ERR_PTR(err)) | |
73 | #endif | |
74 | ||
/*
 * Private per-vdev state for a disk vdev. vdev_disk_open() allocates it and
 * stores it in the vdev's vdev_tsd; vdev_disk_close() frees it.
 */
d366c8fd | 75 | typedef struct vdev_disk { |
/* handle of the currently opened block device, or NULL if none is open */
386d6a75 | 76 | zfs_bdev_handle_t *vd_bdh; |
d366c8fd JL |
/* held as writer by vdev_disk_open() while vd_bdh is being replaced */
77 | krwlock_t vd_lock; |
78 | } vdev_disk_t; | |
79 | ||
06a19602 RN |
80 | /* |
81 | * Maximum number of segments to add to a bio (min 4). If this is higher than | |
82 | * the maximum allowed by the device queue or the kernel itself, it will be | |
83 | * clamped. Setting it to zero will cause the kernel's ideal size to be used. | |
84 | */ | |
85 | uint_t zfs_vdev_disk_max_segs = 0; | |
86 | ||
a25861dc BB |
87 | /* |
88 | * Unique identifier for the exclusive vdev holder. | |
89 | */ | |
8128bd89 | 90 | static void *zfs_vdev_holder = VDEV_HOLDER; |
6839eed2 | 91 | |
a25861dc BB |
92 | /* |
93 | * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the | |
94 | * device is missing. The missing path may be transient since the links | |
95 | * can be briefly removed and recreated in response to udev events. | |
96 | */ | |
f66ffe68 | 97 | static uint_t zfs_vdev_open_timeout_ms = 1000; |
a25861dc BB |
98 | |
99 | /* | |
100 | * Size of the "reserved" partition, in blocks. | |
101 | */ | |
74d42600 SH |
102 | #define EFI_MIN_RESV_SIZE (16 * 1024) |
103 | ||
16f0fdad MZ |
104 | /* |
105 | * BIO request failfast mask. | |
106 | */ | |
107 | ||
108 | static unsigned int zfs_vdev_failfast_mask = 1; | |
109 | ||
cfb96c77 RN |
110 | /* |
111 | * Convert SPA mode flags into bdev open mode flags. | |
112 | */ | |
43e8f6e3 | 113 | #ifdef HAVE_BLK_MODE_T |
cfb96c77 RN |
114 | typedef blk_mode_t vdev_bdev_mode_t; |
115 | #define VDEV_BDEV_MODE_READ BLK_OPEN_READ | |
116 | #define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE | |
117 | #define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL | |
118 | #define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) | |
43e8f6e3 | 119 | #else |
cfb96c77 RN |
120 | typedef fmode_t vdev_bdev_mode_t; |
121 | #define VDEV_BDEV_MODE_READ FMODE_READ | |
122 | #define VDEV_BDEV_MODE_WRITE FMODE_WRITE | |
123 | #define VDEV_BDEV_MODE_EXCL FMODE_EXCL | |
124 | #define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) | |
43e8f6e3 | 125 | #endif |
43e8f6e3 | 126 | |
cfb96c77 RN |
127 | static vdev_bdev_mode_t |
128 | vdev_bdev_mode(spa_mode_t smode) | |
129 | { | |
130 | ASSERT3U(smode, !=, SPA_MODE_UNINIT); | |
131 | ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); | |
233d34e4 | 132 | |
cfb96c77 | 133 | vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; |
60101509 | 134 | |
cfb96c77 RN |
135 | if (smode & SPA_MODE_READ) |
136 | bmode |= VDEV_BDEV_MODE_READ; | |
60101509 | 137 | |
cfb96c77 RN |
138 | if (smode & SPA_MODE_WRITE) |
139 | bmode |= VDEV_BDEV_MODE_WRITE; | |
233d34e4 | 140 | |
cfb96c77 RN |
141 | ASSERT(bmode & VDEV_BDEV_MODE_MASK); |
142 | ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); | |
60101509 | 143 | |
cfb96c77 | 144 | return (bmode); |
60101509 | 145 | } |
60101509 | 146 | |
d441e85d BB |
147 | /* |
148 | * Returns the usable capacity (in bytes) for the partition or disk. | |
149 | */ | |
60101509 | 150 | static uint64_t |
d441e85d | 151 | bdev_capacity(struct block_device *bdev) |
60101509 | 152 | { |
d441e85d BB |
153 | return (i_size_read(bdev->bd_inode)); |
154 | } | |
60101509 | 155 | |
72ba4b2a BB |
156 | #if !defined(HAVE_BDEV_WHOLE) |
157 | static inline struct block_device * | |
158 | bdev_whole(struct block_device *bdev) | |
159 | { | |
160 | return (bdev->bd_contains); | |
161 | } | |
162 | #endif | |
163 | ||
bebdf52a BB |
164 | #if defined(HAVE_BDEVNAME) |
165 | #define vdev_bdevname(bdev, name) bdevname(bdev, name) | |
166 | #else | |
167 | static inline void | |
168 | vdev_bdevname(struct block_device *bdev, char *name) | |
169 | { | |
170 | snprintf(name, BDEVNAME_SIZE, "%pg", bdev); | |
171 | } | |
172 | #endif | |
173 | ||
d441e85d BB |
174 | /* |
175 | * Returns the maximum expansion capacity of the block device (in bytes). | |
176 | * | |
177 | * It is possible to expand a vdev when it has been created as a wholedisk | |
178 | * and the containing block device has increased in capacity. Or when the | |
179 | * partition containing the pool has been manually increased in size. | |
180 | * | |
181 | * This function is only responsible for calculating the potential expansion | |
182 | * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is | |
183 | * responsible for verifying the expected partition layout in the wholedisk | |
184 | * case, and updating the partition table if appropriate. Once the partition | |
185 | * size has been increased the additional capacity will be visible using | |
186 | * bdev_capacity(). | |
0c637f31 | 187 | * |
188 | * The returned maximum expansion capacity is always expected to be larger, or | |
189 | * at the very least equal, to its usable capacity to prevent overestimating | |
190 | * the pool expandsize. | |
d441e85d BB |
191 | */ |
192 | static uint64_t | |
193 | bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) | |
194 | { | |
195 | uint64_t psize; | |
196 | int64_t available; | |
197 | ||
72ba4b2a | 198 | if (wholedisk && bdev != bdev_whole(bdev)) { |
74d42600 | 199 | /* |
d441e85d BB |
200 | * When reporting maximum expansion capacity for a wholedisk |
201 | * deduct any capacity which is expected to be lost due to | |
202 | * alignment restrictions. Over reporting this value isn't | |
203 | * harmful and would only result in slightly less capacity | |
204 | * than expected post expansion. | |
0c637f31 | 205 | * The estimated available space may be slightly smaller than |
206 | * bdev_capacity() for devices where the number of sectors is | |
207 | * not a multiple of the alignment size and the partition layout | |
208 | * is keeping less than PARTITION_END_ALIGNMENT bytes after the | |
209 | * "reserved" EFI partition: in such cases return the device | |
210 | * usable capacity. | |
74d42600 | 211 | */ |
72ba4b2a | 212 | available = i_size_read(bdev_whole(bdev)->bd_inode) - |
d441e85d BB |
213 | ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + |
214 | PARTITION_END_ALIGNMENT) << SECTOR_BITS); | |
0c637f31 | 215 | psize = MAX(available, bdev_capacity(bdev)); |
74d42600 | 216 | } else { |
d441e85d | 217 | psize = bdev_capacity(bdev); |
74d42600 | 218 | } |
d441e85d BB |
219 | |
220 | return (psize); | |
60101509 BB |
221 | } |
222 | ||
d148e951 BB |
223 | static void |
224 | vdev_disk_error(zio_t *zio) | |
225 | { | |
c71c8c71 | 226 | /* |
227 | * This function can be called in interrupt context, for instance while | |
228 | * handling IRQs coming from a misbehaving disk device; use printk() | |
229 | * which is safe from any context. | |
230 | */ | |
231 | printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " | |
4938d01d | 232 | "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), |
c71c8c71 | 233 | zio->io_vd->vdev_path, zio->io_error, zio->io_type, |
234 | (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, | |
235 | zio->io_flags); | |
d148e951 BB |
236 | } |
237 | ||
55c12724 AH |
238 | static void |
239 | vdev_disk_kobj_evt_post(vdev_t *v) | |
240 | { | |
241 | vdev_disk_t *vd = v->vdev_tsd; | |
386d6a75 RN |
242 | if (vd && vd->vd_bdh) { |
243 | spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); | |
55c12724 AH |
244 | } else { |
245 | vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", | |
246 | v->vdev_path); | |
247 | } | |
248 | } | |
249 | ||
386d6a75 | 250 | static zfs_bdev_handle_t * |
cfb96c77 | 251 | vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) |
43e8f6e3 | 252 | { |
cfb96c77 RN |
253 | vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); |
254 | ||
e3120f73 RN |
255 | #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) |
256 | return (bdev_file_open_by_path(path, bmode, holder, NULL)); | |
257 | #elif defined(HAVE_BDEV_OPEN_BY_PATH) | |
cfb96c77 | 258 | return (bdev_open_by_path(path, bmode, holder, NULL)); |
386d6a75 | 259 | #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) |
cfb96c77 | 260 | return (blkdev_get_by_path(path, bmode, holder, NULL)); |
43e8f6e3 | 261 | #else |
cfb96c77 | 262 | return (blkdev_get_by_path(path, bmode, holder)); |
43e8f6e3 CK |
263 | #endif |
264 | } | |
265 | ||
266 | static void | |
cfb96c77 | 267 | vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) |
43e8f6e3 | 268 | { |
386d6a75 RN |
269 | #if defined(HAVE_BDEV_RELEASE) |
270 | return (bdev_release(bdh)); | |
271 | #elif defined(HAVE_BLKDEV_PUT_HOLDER) | |
272 | return (blkdev_put(BDH_BDEV(bdh), holder)); | |
e3120f73 | 273 | #elif defined(HAVE_BLKDEV_PUT) |
cfb96c77 | 274 | return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); |
e3120f73 RN |
275 | #else |
276 | fput(bdh); | |
43e8f6e3 CK |
277 | #endif |
278 | } | |
279 | ||
/*
 * Open (or reopen) the block device that backs disk vdev v.
 *
 * On success returns 0 and fills in:
 *   *psize            current capacity of the device in bytes
 *   *max_psize        capacity including possible expansion space
 *   *logical_ashift   ashift derived from the logical block size
 *   *physical_ashift  ashift derived from the physical block size
 * and updates v's vdev_nowritecache, vdev_has_trim, vdev_has_securetrim and
 * vdev_nonrot fields from the opened device.
 *
 * Returns EINVAL (with vs_aux set to VDEV_AUX_BAD_LABEL) when vdev_path is
 * missing or not absolute. If the device cannot be opened, returns the
 * positive errno from the last open attempt; in that case v->vdev_tsd still
 * points at a vdev_disk_t whose vd_bdh is NULL.
 *
 * When v->vdev_tsd is already set the call is treated as a reopen: the old
 * handle is released first and, if the vdev is expanding and the recorded
 * size has not changed, the partition table of the containing disk is reread.
 */
60101509 | 280 | static int |
1bd201e7 | 281 | vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, |
6fe3498c | 282 | uint64_t *logical_ashift, uint64_t *physical_ashift) |
60101509 | 283 | { |
386d6a75 | 284 | zfs_bdev_handle_t *bdh; |
cfb96c77 | 285 | spa_mode_t smode = spa_mode(v->vdev_spa); |
a25861dc | 286 | hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); |
60101509 | 287 | vdev_disk_t *vd; |
60101509 BB |
288 | |
289 | /* Must have a pathname and it must be absolute. */ | |
290 | if (v->vdev_path == NULL || v->vdev_path[0] != '/') { | |
291 | v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
d441e85d | 292 | vdev_dbgmsg(v, "invalid vdev_path"); |
2d82ea8b | 293 | return (SET_ERROR(EINVAL)); |
60101509 BB |
294 | } |
295 | ||
0d8103d9 | 296 | /* |
d441e85d | 297 | * Reopen the device if it is currently open. When expanding a |
8e82ffba GW |
298 | * partition force re-scanning the partition table if userland |
299 | * did not take care of this already. We need to do this while closed | |
d441e85d BB |
300 | * in order to get an accurate updated block device size. Then |
301 | * since udev may need to recreate the device links increase the | |
a25861dc | 302 | * open retry timeout before reporting the device as unavailable. |
0d8103d9 | 303 | */ |
d441e85d BB |
304 | vd = v->vdev_tsd; |
305 | if (vd) { | |
306 | char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; | |
307 | boolean_t reread_part = B_FALSE; | |
0d8103d9 | 308 | |
d441e85d | 309 | rw_enter(&vd->vd_lock, RW_WRITER); |
386d6a75 RN |
310 | bdh = vd->vd_bdh; |
311 | vd->vd_bdh = NULL; | |
d441e85d | 312 | |
386d6a75 RN |
313 | if (bdh) { |
314 | struct block_device *bdev = BDH_BDEV(bdh); | |
72ba4b2a | 315 | if (v->vdev_expanding && bdev != bdev_whole(bdev)) { |
bebdf52a | 316 | vdev_bdevname(bdev_whole(bdev), disk_name + 5); |
8e82ffba GW |
317 | /* |
318 | * If userland has BLKPG_RESIZE_PARTITION, | |
319 | * then it should have updated the partition | |
320 | * table already. We can detect this by | |
321 | * comparing our current physical size | |
322 | * with that of the device. If they are | |
323 | * the same, then we must not have | |
324 | * BLKPG_RESIZE_PARTITION or it failed to | |
325 | * update the partition table online. We | |
326 | * fallback to rescanning the partition | |
327 | * table from the kernel below. However, | |
328 | * if the capacity already reflects the | |
329 | * updated partition, then we skip | |
330 | * rescanning the partition table here. | |
331 | */ | |
332 | if (v->vdev_psize == bdev_capacity(bdev)) | |
333 | reread_part = B_TRUE; | |
d441e85d BB |
334 | } |
335 | ||
cfb96c77 | 336 | vdev_blkdev_put(bdh, smode, zfs_vdev_holder); |
d441e85d BB |
337 | } |
338 | ||
339 | if (reread_part) { | |
cfb96c77 | 340 | bdh = vdev_blkdev_get_by_path(disk_name, smode, |
386d6a75 RN |
341 | zfs_vdev_holder); |
342 | if (!BDH_IS_ERR(bdh)) { | |
343 | int error = | |
344 | vdev_bdev_reread_part(BDH_BDEV(bdh)); | |
cfb96c77 | 345 | vdev_blkdev_put(bdh, smode, zfs_vdev_holder); |
a25861dc BB |
346 | if (error == 0) { |
347 | timeout = MSEC2NSEC( | |
348 | zfs_vdev_open_timeout_ms * 2); | |
349 | } | |
d441e85d BB |
350 | } |
351 | } | |
352 | } else { | |
353 | vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); | |
354 | ||
355 | rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); | |
356 | rw_enter(&vd->vd_lock, RW_WRITER); | |
357 | } | |
60101509 BB |
358 | |
359 | /* | |
360 | * Devices are always opened by the path provided at configuration | |
361 | * time. This means that if the provided path is a udev by-id path | |
d441e85d | 362 | * then drives may be re-cabled without an issue. If the provided |
4e95cc99 | 363 | * path is a udev by-path path, then the physical location information |
60101509 BB |
364 | * will be preserved. This can be critical for more complicated |
365 | * configurations where drives are located in specific physical | |
d441e85d BB |
366 | * locations to maximize the system's tolerance to component failure. |
367 | * | |
4e95cc99 | 368 | * Alternatively, you can provide your own udev rule to flexibly map |
60101509 | 369 | * the drives as you see fit. It is not advised that you use the |
4e95cc99 | 370 | * /dev/[hd]d devices which may be reordered due to probing order. |
60101509 BB |
371 | * Devices in the wrong locations will be detected by the higher |
372 | * level vdev validation. | |
2d82ea8b BB |
373 | * |
374 | * The specified paths may be briefly removed and recreated in | |
375 | * response to udev events. This should be exceptionally unlikely | |
376 | * because the zpool command makes every effort to verify these paths | |
377 | * have already settled prior to reaching this point. Therefore, | |
378 | * an ENOENT failure at this point is highly likely to be transient | |
379 | * and it is reasonable to sleep and retry before giving up. In | |
380 | * practice delays have been observed to be on the order of 100ms. | |
77e2756d BB |
381 | * |
382 | * When ERESTARTSYS is returned it indicates the block device is | |
383 | * a zvol which could not be opened due to the deadlock detection | |
384 | * logic in zvol_open(). Extend the timeout and retry the open; | |
385 | * subsequent attempts are expected to eventually succeed. | |
60101509 | 386 | */ |
a25861dc | 387 | hrtime_t start = gethrtime(); |
386d6a75 RN |
/* Seed with an error so the loop is entered; if it never runs, ENXIO is reported. */
388 | bdh = BDH_ERR_PTR(-ENXIO); |
389 | while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { | |
cfb96c77 | 390 | bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, |
386d6a75 RN |
391 | zfs_vdev_holder); |
392 | if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { | |
55c12724 AH |
393 | /* |
394 | * There is no point of waiting since device is removed | |
395 | * explicitly | |
396 | */ | |
397 | if (v->vdev_removed) | |
398 | break; | |
399 | ||
2dff7527 | 400 | schedule_timeout_interruptible(MSEC_TO_TICK(10)); |
386d6a75 | 401 | } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { |
77e2756d BB |
402 | timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); |
403 | continue; | |
386d6a75 | 404 | } else if (BDH_IS_ERR(bdh)) { |
2d82ea8b BB |
405 | break; |
406 | } | |
407 | } | |
408 | ||
386d6a75 RN |
409 | if (BDH_IS_ERR(bdh)) { |
410 | int error = -BDH_PTR_ERR(bdh); | |
a25861dc BB |
411 | vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, |
412 | (u_longlong_t)(gethrtime() - start), | |
413 | (u_longlong_t)timeout); | |
386d6a75 | 414 | vd->vd_bdh = NULL; |
d441e85d BB |
415 | v->vdev_tsd = vd; |
416 | rw_exit(&vd->vd_lock); | |
417 | return (SET_ERROR(error)); | |
418 | } else { | |
386d6a75 | 419 | vd->vd_bdh = bdh; |
d441e85d BB |
420 | v->vdev_tsd = vd; |
421 | rw_exit(&vd->vd_lock); | |
60101509 BB |
422 | } |
423 | ||
386d6a75 RN |
424 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); |
425 | ||
0d8103d9 | 426 | /* Determine the physical block size */ |
386d6a75 | 427 | int physical_block_size = bdev_physical_block_size(bdev); |
6fe3498c RM |
428 | |
429 | /* Determine the logical block size */ | |
386d6a75 | 430 | int logical_block_size = bdev_logical_block_size(bdev); |
60101509 | 431 | |
8f1b7a6f RN |
432 | /* |
433 | * If the device has a write cache, clear the nowritecache flag, | |
434 | * so that we start issuing flush requests again. | |
435 | */ | |
436 | v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev); | |
60101509 | 437 | |
1b939560 | 438 | /* Set when device reports it supports TRIM. */ |
386d6a75 | 439 | v->vdev_has_trim = bdev_discard_supported(bdev); |
1b939560 BB |
440 | |
441 | /* Set when device reports it supports secure TRIM. */ | |
386d6a75 | 442 | v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); |
1b939560 | 443 | |
fb40095f | 444 | /* Inform the ZIO pipeline that we are non-rotational */ |
386d6a75 | 445 | v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); |
fb40095f | 446 | |
d441e85d | 447 | /* Physical volume size in bytes for the partition */ |
386d6a75 | 448 | *psize = bdev_capacity(bdev); |
d441e85d BB |
449 | |
450 | /* Physical volume size in bytes including possible expansion space */ | |
386d6a75 | 451 | *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); |
1bd201e7 | 452 | |
60101509 | 453 | /* Based on the minimum sector size set the block size */ |
6fe3498c RM |
454 | *physical_ashift = highbit64(MAX(physical_block_size, |
455 | SPA_MINBLOCKSIZE)) - 1; | |
456 | ||
457 | *logical_ashift = highbit64(MAX(logical_block_size, | |
458 | SPA_MINBLOCKSIZE)) - 1; | |
60101509 | 459 | |
d1d7e268 | 460 | return (0); |
60101509 BB |
461 | } |
462 | ||
463 | static void | |
464 | vdev_disk_close(vdev_t *v) | |
465 | { | |
466 | vdev_disk_t *vd = v->vdev_tsd; | |
467 | ||
0d8103d9 | 468 | if (v->vdev_reopening || vd == NULL) |
60101509 BB |
469 | return; |
470 | ||
72fd834c | 471 | if (vd->vd_bdh != NULL) |
386d6a75 | 472 | vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), |
43e8f6e3 | 473 | zfs_vdev_holder); |
60101509 | 474 | |
d441e85d | 475 | rw_destroy(&vd->vd_lock); |
d1d7e268 | 476 | kmem_free(vd, sizeof (vdev_disk_t)); |
60101509 BB |
477 | v->vdev_tsd = NULL; |
478 | } | |
479 | ||
/*
 * Hand a bio to the kernel, papering over the two submit_bio() signatures:
 * the one-argument form, and the older form that also takes the data
 * direction. Any return value is ignored.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifndef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio_data_dir(bio), bio);
#else
	(void) submit_bio(bio);
#endif
}
489 | ||
2e407941 BB |
490 | /* |
491 | * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so | |
492 | * replace it with preempt_schedule under the following condition: | |
493 | */ | |
494 | #if defined(CONFIG_ARM64) && \ | |
495 | defined(CONFIG_PREEMPTION) && \ | |
496 | defined(CONFIG_BLK_CGROUP) | |
497 | #define preempt_schedule_notrace(x) preempt_schedule(x) | |
498 | #endif | |
499 | ||
5f264996 BB |
500 | /* |
501 | * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct | |
502 | * as an argument removing the need to set it with bio_set_dev(). This | |
503 | * removes the need for all of the following compatibility code. | |
504 | */ | |
505 | #if !defined(HAVE_BIO_ALLOC_4ARG) | |
506 | ||
26a85659 BB |
507 | #ifdef HAVE_BIO_SET_DEV |
508 | #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) | |
bd0d24e0 BB |
509 | /* |
510 | * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by | |
511 | * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). | |
512 | * As a side effect the function was converted to GPL-only. Define our | |
513 | * own version when needed which uses rcu_read_lock_sched(). | |
036e846a RS |
514 | * |
515 | * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public | |
516 | * part, moving blkg_tryget into the private one. Define our own version. | |
bd0d24e0 | 517 | */ |
036e846a | 518 | #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) |
bd0d24e0 BB |
519 | static inline bool |
520 | vdev_blkg_tryget(struct blkcg_gq *blkg) | |
521 | { | |
522 | struct percpu_ref *ref = &blkg->refcnt; | |
523 | unsigned long __percpu *count; | |
524 | bool rc; | |
525 | ||
526 | rcu_read_lock_sched(); | |
527 | ||
528 | if (__ref_is_percpu(ref, &count)) { | |
529 | this_cpu_inc(*count); | |
530 | rc = true; | |
531 | } else { | |
838a2490 CK |
532 | #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA |
533 | rc = atomic_long_inc_not_zero(&ref->data->count); | |
534 | #else | |
bd0d24e0 | 535 | rc = atomic_long_inc_not_zero(&ref->count); |
838a2490 | 536 | #endif |
bd0d24e0 BB |
537 | } |
538 | ||
539 | rcu_read_unlock_sched(); | |
540 | ||
541 | return (rc); | |
542 | } | |
036e846a | 543 | #else |
bd0d24e0 BB |
544 | #define vdev_blkg_tryget(bg) blkg_tryget(bg) |
545 | #endif | |
d08b99ac | 546 | #ifdef HAVE_BIO_SET_DEV_MACRO |
26a85659 BB |
547 | /* |
548 | * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the | |
549 | * GPL-only bio_associate_blkg() symbol thus inadvertently converting | |
550 | * the entire macro. Provide a minimal version which always assigns the | |
551 | * request queue's root_blkg to the bio. | |
552 | */ | |
553 | static inline void | |
554 | vdev_bio_associate_blkg(struct bio *bio) | |
555 | { | |
d939930f CK |
556 | #if defined(HAVE_BIO_BDEV_DISK) |
557 | struct request_queue *q = bio->bi_bdev->bd_disk->queue; | |
558 | #else | |
26a85659 | 559 | struct request_queue *q = bio->bi_disk->queue; |
d939930f | 560 | #endif |
26a85659 BB |
561 | |
562 | ASSERT3P(q, !=, NULL); | |
26a85659 BB |
563 | ASSERT3P(bio->bi_blkg, ==, NULL); |
564 | ||
bd0d24e0 | 565 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) |
26a85659 BB |
566 | bio->bi_blkg = q->root_blkg; |
567 | } | |
d08b99ac | 568 | |
26a85659 | 569 | #define bio_associate_blkg vdev_bio_associate_blkg |
d08b99ac CK |
570 | #else |
571 | static inline void | |
572 | vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) | |
573 | { | |
574 | #if defined(HAVE_BIO_BDEV_DISK) | |
575 | struct request_queue *q = bdev->bd_disk->queue; | |
576 | #else | |
577 | struct request_queue *q = bio->bi_disk->queue; | |
578 | #endif | |
579 | bio_clear_flag(bio, BIO_REMAPPED); | |
580 | if (bio->bi_bdev != bdev) | |
581 | bio_clear_flag(bio, BIO_THROTTLED); | |
582 | bio->bi_bdev = bdev; | |
583 | ||
584 | ASSERT3P(q, !=, NULL); | |
585 | ASSERT3P(bio->bi_blkg, ==, NULL); | |
586 | ||
587 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) | |
588 | bio->bi_blkg = q->root_blkg; | |
589 | } | |
590 | #define bio_set_dev vdev_bio_set_dev | |
591 | #endif | |
26a85659 BB |
592 | #endif |
593 | #else | |
594 | /* | |
595 | * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. | |
596 | */ | |
787acae0 GDN |
597 | static inline void |
598 | bio_set_dev(struct bio *bio, struct block_device *bdev) | |
599 | { | |
600 | bio->bi_bdev = bdev; | |
601 | } | |
26a85659 | 602 | #endif /* HAVE_BIO_SET_DEV */ |
5f264996 | 603 | #endif /* !HAVE_BIO_ALLOC_4ARG */ |
787acae0 | 604 | |
37f9dac5 | 605 | static inline void |
3b86aeb2 | 606 | vdev_submit_bio(struct bio *bio) |
37f9dac5 | 607 | { |
37f9dac5 RY |
608 | struct bio_list *bio_list = current->bio_list; |
609 | current->bio_list = NULL; | |
3b86aeb2 | 610 | vdev_submit_bio_impl(bio); |
37f9dac5 | 611 | current->bio_list = bio_list; |
37f9dac5 RY |
612 | } |
613 | ||
5f264996 BB |
614 | static inline struct bio * |
615 | vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, | |
616 | unsigned short nr_vecs) | |
617 | { | |
618 | struct bio *bio; | |
619 | ||
d1325b4f | 620 | #ifdef HAVE_BIO_ALLOC_4ARG |
5f264996 BB |
621 | bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); |
622 | #else | |
623 | bio = bio_alloc(gfp_mask, nr_vecs); | |
624 | if (likely(bio != NULL)) | |
625 | bio_set_dev(bio, bdev); | |
d1325b4f AZ |
626 | #endif |
627 | ||
5f264996 BB |
628 | return (bio); |
629 | } | |
630 | ||
06a19602 RN |
631 | static inline uint_t |
632 | vdev_bio_max_segs(struct block_device *bdev) | |
633 | { | |
634 | /* | |
635 | * Smallest of the device max segs and the tuneable max segs. Minimum | |
636 | * 4, so there's room to finish split pages if they come up. | |
637 | */ | |
638 | const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); | |
639 | const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? | |
640 | MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; | |
641 | const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); | |
642 | ||
643 | #ifdef HAVE_BIO_MAX_SEGS | |
644 | return (bio_max_segs(max_segs)); | |
645 | #else | |
646 | return (MIN(max_segs, BIO_MAX_PAGES)); | |
647 | #endif | |
648 | } | |
649 | ||
650 | static inline uint_t | |
651 | vdev_bio_max_bytes(struct block_device *bdev) | |
652 | { | |
653 | return (queue_max_sectors(bdev_get_queue(bdev)) << 9); | |
654 | } | |
655 | ||
656 | ||
657 | /* | |
658 | * Virtual block IO object (VBIO) | |
659 | * | |
660 | * Linux block IO (BIO) objects have a limit on how many data segments (pages) | |
661 | * they can hold. Depending on how they're allocated and structured, a large | |
662 | * ZIO can require more than one BIO to be submitted to the kernel, which then | |
663 | * all have to complete before we can return the completed ZIO back to ZFS. | |
664 | * | |
665 | * A VBIO is a wrapper around multiple BIOs, carrying everything needed to | |
666 | * translate a ZIO down into the kernel block layer and back again. | |
667 | * | |
668 | * Note that these are only used for data ZIOs (read/write). Meta-operations | |
669 | * (flush/trim) don't need multiple BIOs and so can just make the call | |
670 | * directly. | |
671 | */ | |
/* State carried while one read/write zio is turned into a chain of BIOs. */
672 | typedef struct { |
673 | zio_t *vbio_zio; /* parent zio */ |
674 | |
675 | struct block_device *vbio_bdev; /* blockdev to submit bios to */ |
676 | |
677 | abd_t *vbio_abd; /* abd carrying borrowed linear buf */ |
678 | |
06a19602 RN |
679 | uint_t vbio_max_segs; /* max segs per bio, from vdev_bio_max_segs() */ |
680 | |
681 | uint_t vbio_max_bytes; /* max bytes per bio, from vdev_bio_max_bytes() */ |
682 | uint_t vbio_lbs_mask; /* ~(logical block size - 1) */ |
683 | |
684 | uint64_t vbio_offset; /* start offset of next bio; begins at zio's io_offset */ |
685 | |
686 | struct bio *vbio_bio; /* pointer to the current bio */ |
72fd834c | 687 | int vbio_flags; /* flags handed to bio_set_op_attrs() for each bio */ |
06a19602 RN |
688 | } vbio_t; |
689 | ||
690 | static vbio_t * | |
72fd834c | 691 | vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) |
06a19602 RN |
692 | { |
693 | vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); | |
694 | ||
695 | vbio->vbio_zio = zio; | |
696 | vbio->vbio_bdev = bdev; | |
72fd834c | 697 | vbio->vbio_abd = NULL; |
06a19602 RN |
698 | vbio->vbio_max_segs = vdev_bio_max_segs(bdev); |
699 | vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); | |
700 | vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); | |
701 | vbio->vbio_offset = zio->io_offset; | |
72fd834c RN |
702 | vbio->vbio_bio = NULL; |
703 | vbio->vbio_flags = flags; | |
06a19602 RN |
704 | |
705 | return (vbio); | |
706 | } | |
707 | ||
72fd834c RN |
708 | BIO_END_IO_PROTO(vbio_completion, bio, error); |
709 | ||
06a19602 RN |
/*
 * Add `size` bytes of `page`, starting `offset` bytes into the page, to the
 * vbio. Data goes into the vbio's current BIO while it has room; when it
 * does not, a new BIO is allocated at the next device offset, the previous
 * BIO is chained to it and submitted, and the new BIO becomes current.
 * Always returns 0.
 */
710 | static int |
711 | vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) | |
712 | { | |
72fd834c | 713 | struct bio *bio = vbio->vbio_bio; |
06a19602 RN |
714 | uint_t ssize; |
715 | ||
716 | while (size > 0) { | |
06a19602 RN |
717 | if (bio == NULL) { |
718 | /* New BIO, allocate and set up */ | |
719 | bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, | |
720 | vbio->vbio_max_segs); | |
72fd834c RN |
721 | VERIFY(bio); |
722 | ||
06a19602 | 723 | BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; |
72fd834c RN |
724 | bio_set_op_attrs(bio, |
725 | vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? | |
726 | WRITE : READ, vbio->vbio_flags); | |
06a19602 | 727 | |
/* Chain the filled BIO to the new one and submit it before switching over. */
72fd834c RN |
728 | if (vbio->vbio_bio) { |
729 | bio_chain(vbio->vbio_bio, bio); | |
730 | vdev_submit_bio(vbio->vbio_bio); | |
731 | } | |
732 | vbio->vbio_bio = bio; | |
06a19602 RN |
733 | } |
734 | ||
735 | /* | |
736 | * Only load as much of the current page data as will fit in | |
737 | * the space left in the BIO, respecting lbs alignment. Older | |
738 | * kernels will error if we try to overfill the BIO, while | |
739 | * newer ones will accept it and split the BIO. This ensures | |
740 | * everything works on older kernels, and avoids an additional | |
741 | * overhead on the new. | |
742 | */ | |
743 | ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & | |
744 | vbio->vbio_lbs_mask); | |
745 | if (ssize > 0 && | |
746 | bio_add_page(bio, page, ssize, offset) == ssize) { | |
747 | /* Accepted, adjust and load any remaining. */ | |
748 | size -= ssize; | |
749 | offset += ssize; | |
750 | continue; | |
751 | } | |
752 | ||
753 | /* No room, set up for a new BIO and loop */ | |
754 | vbio->vbio_offset += BIO_BI_SIZE(bio); | |
755 | ||
756 | /* Signal new BIO allocation wanted */ | |
72fd834c | 757 | bio = NULL; |
06a19602 RN |
758 | } |
759 | ||
760 | return (0); | |
761 | } | |
762 | ||
72fd834c RN |
763 | /* Iterator callback to submit ABD pages to the vbio. */ |
764 | static int | |
765 | vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) | |
766 | { | |
767 | vbio_t *vbio = priv; | |
768 | return (vbio_add_page(vbio, page, len, off)); | |
769 | } | |
06a19602 | 770 | |
72fd834c | 771 | /* Create some BIOs, fill them with data and submit them */ |
06a19602 | 772 | static void |
72fd834c | 773 | vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) |
06a19602 | 774 | { |
06a19602 | 775 | /* |
72fd834c RN |
776 | * We plug so we can submit the BIOs as we go and only unplug them when |
777 | * they are fully created and submitted. This is important; if we don't | |
778 | * plug, then the kernel may start executing earlier BIOs while we're | |
779 | * still creating and executing later ones, and if the device goes | |
780 | * away while that's happening, older kernels can get confused and | |
781 | * trample memory. | |
06a19602 RN |
782 | */ |
783 | struct blk_plug plug; | |
72fd834c | 784 | blk_start_plug(&plug); |
06a19602 | 785 | |
72fd834c RN |
786 | (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); |
787 | ASSERT(vbio->vbio_bio); | |
06a19602 | 788 | |
72fd834c RN |
789 | vbio->vbio_bio->bi_end_io = vbio_completion; |
790 | vbio->vbio_bio->bi_private = vbio; | |
06a19602 | 791 | |
917ff75e RN |
792 | /* |
793 | * Once submitted, vbio_bio now owns vbio (through bi_private) and we | |
794 | * can't touch it again. The bio may complete and vbio_completion() be | |
795 | * called and free the vbio before this task is run again, so we must | |
796 | * consider it invalid from this point. | |
797 | */ | |
72fd834c | 798 | vdev_submit_bio(vbio->vbio_bio); |
06a19602 | 799 | |
72fd834c | 800 | blk_finish_plug(&plug); |
06a19602 RN |
801 | } |
802 | ||
72fd834c RN |
803 | /* IO completion callback */ |
804 | BIO_END_IO_PROTO(vbio_completion, bio, error) | |
06a19602 | 805 | { |
72fd834c | 806 | vbio_t *vbio = bio->bi_private; |
06a19602 | 807 | zio_t *zio = vbio->vbio_zio; |
06a19602 | 808 | |
72fd834c | 809 | ASSERT(zio); |
06a19602 | 810 | |
72fd834c RN |
811 | /* Capture and log any errors */ |
812 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
813 | zio->io_error = BIO_END_IO_ERROR(bio); | |
814 | #else | |
815 | zio->io_error = 0; | |
816 | if (error) | |
817 | zio->io_error = -(error); | |
818 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
819 | zio->io_error = EIO; | |
820 | #endif | |
821 | ASSERT3U(zio->io_error, >=, 0); | |
06a19602 | 822 | |
72fd834c RN |
823 | if (zio->io_error) |
824 | vdev_disk_error(zio); | |
06a19602 | 825 | |
72fd834c RN |
826 | /* Return the BIO to the kernel */ |
827 | bio_put(bio); | |
06a19602 RN |
828 | |
829 | /* | |
72fd834c RN |
830 | * If we copied the ABD before issuing it, clean up and return the copy |
831 | * to the ADB, with changes if appropriate. | |
06a19602 | 832 | */ |
72fd834c RN |
833 | if (vbio->vbio_abd != NULL) { |
834 | void *buf = abd_to_buf(vbio->vbio_abd); | |
835 | abd_free(vbio->vbio_abd); | |
836 | vbio->vbio_abd = NULL; | |
06a19602 | 837 | |
72fd834c RN |
838 | if (zio->io_type == ZIO_TYPE_READ) |
839 | abd_return_buf_copy(zio->io_abd, buf, zio->io_size); | |
840 | else | |
841 | abd_return_buf(zio->io_abd, buf, zio->io_size); | |
842 | } | |
06a19602 | 843 | |
72fd834c RN |
844 | /* Final cleanup */ |
845 | kmem_free(vbio, sizeof (vbio_t)); | |
06a19602 RN |
846 | |
847 | /* All done, submit for processing */ | |
848 | zio_delay_interrupt(zio); | |
06a19602 RN |
849 | } |
850 | ||
851 | /* | |
852 | * Iterator callback to count ABD pages and check their size & alignment. | |
853 | * | |
854 | * On Linux, each BIO segment can take a page pointer, and an offset+length of | |
855 | * the data within that page. A page can be arbitrarily large ("compound" | |
856 | * pages) but we still have to ensure the data portion is correctly sized and | |
857 | * aligned to the logical block size, to ensure that if the kernel wants to | |
858 | * split the BIO, the two halves will still be properly aligned. | |
bc27c494 RN |
859 | * |
860 | * NOTE: if you change this function, change the copy in | |
861 | * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test | |
862 | * data there to validate the change you're making. | |
863 | * | |
06a19602 RN |
864 | */ |
865 | typedef struct { | |
866 | uint_t bmask; | |
867 | uint_t npages; | |
868 | uint_t end; | |
869 | } vdev_disk_check_pages_t; | |
870 | ||
871 | static int | |
872 | vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) | |
873 | { | |
bc27c494 | 874 | (void) page; |
06a19602 RN |
875 | vdev_disk_check_pages_t *s = priv; |
876 | ||
877 | /* | |
878 | * If we didn't finish on a block size boundary last time, then there | |
879 | * would be a gap if we tried to use this ABD as-is, so abort. | |
880 | */ | |
881 | if (s->end != 0) | |
882 | return (1); | |
883 | ||
884 | /* | |
885 | * Note if we're taking less than a full block, so we can check it | |
886 | * above on the next call. | |
887 | */ | |
1bf649cb | 888 | s->end = (off+len) & s->bmask; |
06a19602 RN |
889 | |
890 | /* All blocks after the first must start on a block size boundary. */ | |
891 | if (s->npages != 0 && (off & s->bmask) != 0) | |
892 | return (1); | |
893 | ||
894 | s->npages++; | |
895 | return (0); | |
896 | } | |
897 | ||
898 | /* | |
899 | * Check if we can submit the pages in this ABD to the kernel as-is. Returns | |
900 | * the number of pages, or 0 if it can't be submitted like this. | |
901 | */ | |
902 | static boolean_t | |
903 | vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) | |
904 | { | |
905 | vdev_disk_check_pages_t s = { | |
906 | .bmask = bdev_logical_block_size(bdev)-1, | |
907 | .npages = 0, | |
908 | .end = 0, | |
909 | }; | |
910 | ||
911 | if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) | |
912 | return (B_FALSE); | |
913 | ||
914 | return (B_TRUE); | |
915 | } | |
916 | ||
06a19602 RN |
917 | static int |
918 | vdev_disk_io_rw(zio_t *zio) | |
919 | { | |
920 | vdev_t *v = zio->io_vd; | |
921 | vdev_disk_t *vd = v->vdev_tsd; | |
922 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); | |
923 | int flags = 0; | |
924 | ||
925 | /* | |
926 | * Accessing outside the block device is never allowed. | |
927 | */ | |
928 | if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { | |
929 | vdev_dbgmsg(zio->io_vd, | |
930 | "Illegal access %llu size %llu, device size %llu", | |
931 | (u_longlong_t)zio->io_offset, | |
932 | (u_longlong_t)zio->io_size, | |
933 | (u_longlong_t)i_size_read(bdev->bd_inode)); | |
934 | return (SET_ERROR(EIO)); | |
935 | } | |
936 | ||
937 | if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && | |
938 | v->vdev_failfast == B_TRUE) { | |
939 | bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, | |
940 | zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); | |
941 | } | |
942 | ||
943 | /* | |
944 | * Check alignment of the incoming ABD. If any part of it would require | |
945 | * submitting a page that is not aligned to the logical block size, | |
946 | * then we take a copy into a linear buffer and submit that instead. | |
947 | * This should be impossible on a 512b LBS, and fairly rare on 4K, | |
948 | * usually requiring abnormally-small data blocks (eg gang blocks) | |
949 | * mixed into the same ABD as larger ones (eg aggregated). | |
950 | */ | |
951 | abd_t *abd = zio->io_abd; | |
952 | if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { | |
953 | void *buf; | |
954 | if (zio->io_type == ZIO_TYPE_READ) | |
955 | buf = abd_borrow_buf(zio->io_abd, zio->io_size); | |
956 | else | |
957 | buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); | |
958 | ||
959 | /* | |
960 | * Wrap the copy in an abd_t, so we can use the same iterators | |
961 | * to count and fill the vbio later. | |
962 | */ | |
963 | abd = abd_get_from_buf(buf, zio->io_size); | |
964 | ||
965 | /* | |
966 | * False here would mean the borrowed copy has an invalid | |
967 | * alignment too, which would mean we've somehow been passed a | |
968 | * linear ABD with an interior page that has a non-zero offset | |
969 | * or a size not a multiple of PAGE_SIZE. This is not possible. | |
970 | * It would mean either zio_buf_alloc() or its underlying | |
971 | * allocators have done something extremely strange, or our | |
972 | * math in vdev_disk_check_pages() is wrong. In either case, | |
973 | * something in seriously wrong and its not safe to continue. | |
974 | */ | |
975 | VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); | |
976 | } | |
977 | ||
978 | /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ | |
72fd834c | 979 | vbio_t *vbio = vbio_alloc(zio, bdev, flags); |
06a19602 RN |
980 | if (abd != zio->io_abd) |
981 | vbio->vbio_abd = abd; | |
982 | ||
72fd834c RN |
983 | /* Fill it with data pages and submit it to the kernel */ |
984 | vbio_submit(vbio, abd, zio->io_size); | |
06a19602 RN |
985 | return (0); |
986 | } | |
987 | ||
f3b85d70 RN |
988 | /* ========== */ |
989 | ||
990 | /* | |
06a19602 RN |
991 | * This is the classic, battle-tested BIO submission code. Until we're totally |
992 | * sure that the new code is safe and correct in all cases, this will remain | |
993 | * available and can be enabled by setting zfs_vdev_disk_classic=1 at module | |
994 | * load time. | |
f3b85d70 RN |
995 | * |
996 | * These functions have been renamed to vdev_classic_* to make it clear what | |
997 | * they belong to, but their implementations are unchanged. | |
998 | */ | |
999 | ||
1000 | /* | |
1001 | * Virtual device vector for disks. | |
1002 | */ | |
1003 | typedef struct dio_request { | |
1004 | zio_t *dr_zio; /* Parent ZIO */ | |
1005 | atomic_t dr_ref; /* References */ | |
1006 | int dr_error; /* Bio error */ | |
1007 | int dr_bio_count; /* Count of bio's */ | |
1008 | struct bio *dr_bio[]; /* Attached bio's */ | |
1009 | } dio_request_t; | |
1010 | ||
1011 | static dio_request_t * | |
1012 | vdev_classic_dio_alloc(int bio_count) | |
1013 | { | |
1014 | dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + | |
1015 | sizeof (struct bio *) * bio_count, KM_SLEEP); | |
1016 | atomic_set(&dr->dr_ref, 0); | |
1017 | dr->dr_bio_count = bio_count; | |
1018 | dr->dr_error = 0; | |
1019 | ||
1020 | for (int i = 0; i < dr->dr_bio_count; i++) | |
1021 | dr->dr_bio[i] = NULL; | |
1022 | ||
1023 | return (dr); | |
1024 | } | |
1025 | ||
1026 | static void | |
1027 | vdev_classic_dio_free(dio_request_t *dr) | |
1028 | { | |
1029 | int i; | |
1030 | ||
1031 | for (i = 0; i < dr->dr_bio_count; i++) | |
1032 | if (dr->dr_bio[i]) | |
1033 | bio_put(dr->dr_bio[i]); | |
1034 | ||
1035 | kmem_free(dr, sizeof (dio_request_t) + | |
1036 | sizeof (struct bio *) * dr->dr_bio_count); | |
1037 | } | |
1038 | ||
1039 | static void | |
1040 | vdev_classic_dio_get(dio_request_t *dr) | |
1041 | { | |
1042 | atomic_inc(&dr->dr_ref); | |
1043 | } | |
1044 | ||
1045 | static void | |
1046 | vdev_classic_dio_put(dio_request_t *dr) | |
1047 | { | |
1048 | int rc = atomic_dec_return(&dr->dr_ref); | |
1049 | ||
1050 | /* | |
1051 | * Free the dio_request when the last reference is dropped and | |
1052 | * ensure zio_interpret is called only once with the correct zio | |
1053 | */ | |
1054 | if (rc == 0) { | |
1055 | zio_t *zio = dr->dr_zio; | |
1056 | int error = dr->dr_error; | |
1057 | ||
1058 | vdev_classic_dio_free(dr); | |
1059 | ||
1060 | if (zio) { | |
1061 | zio->io_error = error; | |
1062 | ASSERT3S(zio->io_error, >=, 0); | |
1063 | if (zio->io_error) | |
1064 | vdev_disk_error(zio); | |
1065 | ||
1066 | zio_delay_interrupt(zio); | |
1067 | } | |
1068 | } | |
1069 | } | |
1070 | ||
1071 | BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) | |
1072 | { | |
1073 | dio_request_t *dr = bio->bi_private; | |
1074 | ||
1075 | if (dr->dr_error == 0) { | |
1076 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
1077 | dr->dr_error = BIO_END_IO_ERROR(bio); | |
1078 | #else | |
1079 | if (error) | |
1080 | dr->dr_error = -(error); | |
1081 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
1082 | dr->dr_error = EIO; | |
1083 | #endif | |
1084 | } | |
1085 | ||
1086 | /* Drop reference acquired by vdev_classic_physio */ | |
1087 | vdev_classic_dio_put(dr); | |
1088 | } | |
1089 | ||
5f264996 | 1090 | static inline unsigned int |
f3b85d70 | 1091 | vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) |
5f264996 BB |
1092 | { |
1093 | unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, | |
1094 | bio_size, abd_offset); | |
1095 | ||
1096 | #ifdef HAVE_BIO_MAX_SEGS | |
1097 | return (bio_max_segs(nr_segs)); | |
1098 | #else | |
1099 | return (MIN(nr_segs, BIO_MAX_PAGES)); | |
1100 | #endif | |
1101 | } | |
1102 | ||
60101509 | 1103 | static int |
867178ae | 1104 | vdev_classic_physio(zio_t *zio) |
60101509 | 1105 | { |
867178ae RN |
1106 | vdev_t *v = zio->io_vd; |
1107 | vdev_disk_t *vd = v->vdev_tsd; | |
1108 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); | |
1109 | size_t io_size = zio->io_size; | |
1110 | uint64_t io_offset = zio->io_offset; | |
1111 | int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; | |
1112 | int flags = 0; | |
1113 | ||
d1d7e268 | 1114 | dio_request_t *dr; |
b0be93e8 | 1115 | uint64_t abd_offset; |
60101509 | 1116 | uint64_t bio_offset; |
f8c0d7e1 MA |
1117 | int bio_size; |
1118 | int bio_count = 16; | |
1119 | int error = 0; | |
e8ac4557 | 1120 | struct blk_plug plug; |
5f264996 | 1121 | unsigned short nr_vecs; |
066e8252 | 1122 | |
d441e85d BB |
1123 | /* |
1124 | * Accessing outside the block device is never allowed. | |
1125 | */ | |
1126 | if (io_offset + io_size > bdev->bd_inode->i_size) { | |
1127 | vdev_dbgmsg(zio->io_vd, | |
1128 | "Illegal access %llu size %llu, device size %llu", | |
5dbf6c5a AZ |
1129 | (u_longlong_t)io_offset, |
1130 | (u_longlong_t)io_size, | |
1131 | (u_longlong_t)i_size_read(bdev->bd_inode)); | |
d441e85d BB |
1132 | return (SET_ERROR(EIO)); |
1133 | } | |
e06be586 | 1134 | |
60101509 | 1135 | retry: |
f3b85d70 | 1136 | dr = vdev_classic_dio_alloc(bio_count); |
60101509 | 1137 | |
f1100863 | 1138 | if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && |
16f0fdad MZ |
1139 | zio->io_vd->vdev_failfast == B_TRUE) { |
1140 | bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, | |
1141 | zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); | |
1142 | } | |
2959d94a | 1143 | |
60101509 | 1144 | dr->dr_zio = zio; |
60101509 | 1145 | |
60101509 | 1146 | /* |
f8c0d7e1 MA |
1147 | * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which |
1148 | * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio | |
1149 | * can cover at least 128KB and at most 1MB. When the required number | |
1150 | * of iovec's exceeds this, we are forced to break the IO in multiple | |
1151 | * bio's and wait for them all to complete. This is likely if the | |
1152 | * recordsize property is increased beyond 1MB. The default | |
1153 | * bio_count=16 should typically accommodate the maximum-size zio of | |
1154 | * 16MB. | |
60101509 | 1155 | */ |
a6255b7f | 1156 | |
b0be93e8 IH |
1157 | abd_offset = 0; |
1158 | bio_offset = io_offset; | |
f8c0d7e1 MA |
1159 | bio_size = io_size; |
1160 | for (int i = 0; i <= dr->dr_bio_count; i++) { | |
60101509 BB |
1161 | |
1162 | /* Finished constructing bio's for given buffer */ | |
1163 | if (bio_size <= 0) | |
1164 | break; | |
1165 | ||
1166 | /* | |
f8c0d7e1 MA |
1167 | * If additional bio's are required, we have to retry, but |
1168 | * this should be rare - see the comment above. | |
60101509 BB |
1169 | */ |
1170 | if (dr->dr_bio_count == i) { | |
f3b85d70 | 1171 | vdev_classic_dio_free(dr); |
60101509 | 1172 | bio_count *= 2; |
60101509 BB |
1173 | goto retry; |
1174 | } | |
1175 | ||
f3b85d70 | 1176 | nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); |
5f264996 | 1177 | dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); |
1086f542 | 1178 | if (unlikely(dr->dr_bio[i] == NULL)) { |
f3b85d70 | 1179 | vdev_classic_dio_free(dr); |
ecb2b7dc | 1180 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
1181 | } |
1182 | ||
f3b85d70 RN |
1183 | /* Matching put called by vdev_classic_physio_completion */ |
1184 | vdev_classic_dio_get(dr); | |
60101509 | 1185 | |
d4541210 | 1186 | BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; |
f3b85d70 | 1187 | dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; |
60101509 | 1188 | dr->dr_bio[i]->bi_private = dr; |
3b86aeb2 | 1189 | bio_set_op_attrs(dr->dr_bio[i], rw, flags); |
60101509 BB |
1190 | |
1191 | /* Remaining size is returned to become the new size */ | |
fb822260 | 1192 | bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, |
02730c33 | 1193 | bio_size, abd_offset); |
60101509 BB |
1194 | |
1195 | /* Advance in buffer and construct another bio if needed */ | |
b0be93e8 | 1196 | abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
d4541210 | 1197 | bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
60101509 BB |
1198 | } |
1199 | ||
37f9dac5 | 1200 | /* Extra reference to protect dio_request during vdev_submit_bio */ |
f3b85d70 | 1201 | vdev_classic_dio_get(dr); |
60101509 | 1202 | |
e8ac4557 IH |
1203 | if (dr->dr_bio_count > 1) |
1204 | blk_start_plug(&plug); | |
e8ac4557 | 1205 | |
60101509 | 1206 | /* Submit all bio's associated with this dio */ |
f8c0d7e1 | 1207 | for (int i = 0; i < dr->dr_bio_count; i++) { |
60101509 | 1208 | if (dr->dr_bio[i]) |
3b86aeb2 | 1209 | vdev_submit_bio(dr->dr_bio[i]); |
f8c0d7e1 | 1210 | } |
60101509 | 1211 | |
e8ac4557 IH |
1212 | if (dr->dr_bio_count > 1) |
1213 | blk_finish_plug(&plug); | |
e8ac4557 | 1214 | |
f3b85d70 | 1215 | vdev_classic_dio_put(dr); |
60101509 | 1216 | |
d1d7e268 | 1217 | return (error); |
60101509 BB |
1218 | } |
1219 | ||
f3b85d70 RN |
1220 | /* ========== */ |
1221 | ||
36ba27e9 | 1222 | BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) |
60101509 BB |
1223 | { |
1224 | zio_t *zio = bio->bi_private; | |
784a7fe5 | 1225 | #ifdef HAVE_1ARG_BIO_END_IO_T |
36ba27e9 BB |
1226 | zio->io_error = BIO_END_IO_ERROR(bio); |
1227 | #else | |
1228 | zio->io_error = -error; | |
784a7fe5 | 1229 | #endif |
60101509 | 1230 | |
36ba27e9 | 1231 | if (zio->io_error && (zio->io_error == EOPNOTSUPP)) |
60101509 BB |
1232 | zio->io_vd->vdev_nowritecache = B_TRUE; |
1233 | ||
1234 | bio_put(bio); | |
d148e951 BB |
1235 | ASSERT3S(zio->io_error, >=, 0); |
1236 | if (zio->io_error) | |
1237 | vdev_disk_error(zio); | |
60101509 | 1238 | zio_interrupt(zio); |
60101509 BB |
1239 | } |
1240 | ||
1241 | static int | |
1242 | vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) | |
1243 | { | |
1244 | struct request_queue *q; | |
1245 | struct bio *bio; | |
1246 | ||
1247 | q = bdev_get_queue(bdev); | |
1248 | if (!q) | |
ecb2b7dc | 1249 | return (SET_ERROR(ENXIO)); |
60101509 | 1250 | |
5f264996 | 1251 | bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); |
29b763cd | 1252 | if (unlikely(bio == NULL)) |
ecb2b7dc | 1253 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
1254 | |
1255 | bio->bi_end_io = vdev_disk_io_flush_completion; | |
1256 | bio->bi_private = zio; | |
a5e046ea | 1257 | bio_set_flush(bio); |
3b86aeb2 | 1258 | vdev_submit_bio(bio); |
cecb7487 | 1259 | invalidate_bdev(bdev); |
60101509 | 1260 | |
d1d7e268 | 1261 | return (0); |
60101509 | 1262 | } |
60101509 | 1263 | |
06e25f9c US |
1264 | BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) |
1265 | { | |
1266 | zio_t *zio = bio->bi_private; | |
1267 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
1268 | zio->io_error = BIO_END_IO_ERROR(bio); | |
1269 | #else | |
1270 | zio->io_error = -error; | |
1271 | #endif | |
1272 | bio_put(bio); | |
1273 | if (zio->io_error) | |
1274 | vdev_disk_error(zio); | |
1275 | zio_interrupt(zio); | |
1276 | } | |
1277 | ||
ba9f587a RN |
1278 | /* |
1279 | * Wrappers for the different secure erase and discard APIs. We use async | |
1280 | * when available; in this case, *biop is set to the last bio in the chain. | |
1281 | */ | |
a12a5cb5 | 1282 | static int |
ba9f587a RN |
1283 | vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, |
1284 | sector_t nsect, struct bio **biop) | |
a12a5cb5 | 1285 | { |
ba9f587a RN |
1286 | *biop = NULL; |
1287 | int error; | |
a12a5cb5 | 1288 | |
ba9f587a RN |
1289 | #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) |
1290 | error = blkdev_issue_secure_erase(BDH_BDEV(bdh), | |
1291 | sector, nsect, GFP_NOFS); | |
1292 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) | |
1293 | error = __blkdev_issue_discard(BDH_BDEV(bdh), | |
1294 | sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); | |
1295 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) | |
1296 | error = blkdev_issue_discard(BDH_BDEV(bdh), | |
1297 | sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); | |
06e25f9c | 1298 | #else |
ba9f587a | 1299 | #error "unsupported kernel" |
06e25f9c | 1300 | #endif |
ba9f587a RN |
1301 | |
1302 | return (error); | |
06e25f9c | 1303 | } |
ba9f587a RN |
1304 | |
1305 | static int | |
1306 | vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, | |
1307 | sector_t nsect, struct bio **biop) | |
1308 | { | |
1309 | *biop = NULL; | |
1310 | int error; | |
1311 | ||
1312 | #if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) | |
1313 | error = __blkdev_issue_discard(BDH_BDEV(bdh), | |
1314 | sector, nsect, GFP_NOFS, 0, biop); | |
1315 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) | |
1316 | error = __blkdev_issue_discard(BDH_BDEV(bdh), | |
1317 | sector, nsect, GFP_NOFS, biop); | |
1318 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) | |
1319 | error = blkdev_issue_discard(BDH_BDEV(bdh), | |
1320 | sector, nsect, GFP_NOFS, 0); | |
1321 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) | |
1322 | error = blkdev_issue_discard(BDH_BDEV(bdh), | |
1323 | sector, nsect, GFP_NOFS); | |
1324 | #else | |
1325 | #error "unsupported kernel" | |
06e25f9c US |
1326 | #endif |
1327 | ||
ba9f587a RN |
1328 | return (error); |
1329 | } | |
1330 | ||
1331 | /* | |
1332 | * Entry point for TRIM ops. This calls the right wrapper for secure erase or | |
1333 | * discard, and then does the appropriate finishing work for error vs success | |
1334 | * and async vs sync. | |
1335 | */ | |
06e25f9c US |
1336 | static int |
1337 | vdev_disk_io_trim(zio_t *zio) | |
1338 | { | |
ba9f587a RN |
1339 | int error; |
1340 | struct bio *bio; | |
1341 | ||
1342 | zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; | |
1343 | sector_t sector = zio->io_offset >> 9; | |
1344 | sector_t nsects = zio->io_size >> 9; | |
1345 | ||
1346 | if (zio->io_trim_flags & ZIO_TRIM_SECURE) | |
1347 | error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); | |
1348 | else | |
1349 | error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); | |
1350 | ||
1351 | if (error != 0) | |
1352 | return (SET_ERROR(-error)); | |
1353 | ||
1354 | if (bio == NULL) { | |
1355 | /* | |
1356 | * This was a synchronous op that completed successfully, so | |
1357 | * return it to ZFS immediately. | |
1358 | */ | |
1359 | zio_interrupt(zio); | |
1360 | } else { | |
1361 | /* | |
1362 | * This was an asynchronous op; set up completion callback and | |
1363 | * issue it. | |
1364 | */ | |
1365 | bio->bi_private = zio; | |
1366 | bio->bi_end_io = vdev_disk_discard_end_io; | |
1367 | vdev_submit_bio(bio); | |
06e25f9c | 1368 | } |
ba9f587a RN |
1369 | |
1370 | return (0); | |
a12a5cb5 BB |
1371 | } |
1372 | ||
c4a13ba4 RN |
1373 | int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; |
1374 | ||
98b25418 | 1375 | static void |
60101509 BB |
1376 | vdev_disk_io_start(zio_t *zio) |
1377 | { | |
1378 | vdev_t *v = zio->io_vd; | |
1379 | vdev_disk_t *vd = v->vdev_tsd; | |
867178ae | 1380 | int error; |
60101509 | 1381 | |
d441e85d BB |
1382 | /* |
1383 | * If the vdev is closed, it's likely in the REMOVED or FAULTED state. | |
1384 | * Nothing to be done here but return failure. | |
1385 | */ | |
1386 | if (vd == NULL) { | |
1387 | zio->io_error = ENXIO; | |
1388 | zio_interrupt(zio); | |
1389 | return; | |
1390 | } | |
1391 | ||
1392 | rw_enter(&vd->vd_lock, RW_READER); | |
1393 | ||
1394 | /* | |
1395 | * If the vdev is closed, it's likely due to a failed reopen and is | |
1396 | * in the UNAVAIL state. Nothing to be done here but return failure. | |
1397 | */ | |
386d6a75 | 1398 | if (vd->vd_bdh == NULL) { |
d441e85d BB |
1399 | rw_exit(&vd->vd_lock); |
1400 | zio->io_error = ENXIO; | |
1401 | zio_interrupt(zio); | |
1402 | return; | |
1403 | } | |
1404 | ||
60101509 | 1405 | switch (zio->io_type) { |
d7605ae7 | 1406 | case ZIO_TYPE_FLUSH: |
60101509 BB |
1407 | |
1408 | if (!vdev_readable(v)) { | |
c9c838aa RN |
1409 | /* Drive not there, can't flush */ |
1410 | error = SET_ERROR(ENXIO); | |
1411 | } else if (zfs_nocacheflush) { | |
1412 | /* Flushing disabled by operator, declare success */ | |
1413 | error = 0; | |
1414 | } else if (v->vdev_nowritecache) { | |
1415 | /* This vdev not capable of flushing */ | |
1416 | error = SET_ERROR(ENOTSUP); | |
1417 | } else { | |
1418 | /* | |
1419 | * Issue the flush. If successful, the response will | |
1420 | * be handled in the completion callback, so we're done. | |
1421 | */ | |
386d6a75 | 1422 | error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); |
d441e85d BB |
1423 | if (error == 0) { |
1424 | rw_exit(&vd->vd_lock); | |
98b25418 | 1425 | return; |
d441e85d | 1426 | } |
60101509 BB |
1427 | } |
1428 | ||
c9c838aa | 1429 | /* Couldn't issue the flush, so set the error and return it */ |
d441e85d | 1430 | rw_exit(&vd->vd_lock); |
c9c838aa | 1431 | zio->io_error = error; |
98b25418 GW |
1432 | zio_execute(zio); |
1433 | return; | |
60101509 | 1434 | |
1b939560 | 1435 | case ZIO_TYPE_TRIM: |
ba9f587a | 1436 | error = vdev_disk_io_trim(zio); |
1b939560 | 1437 | rw_exit(&vd->vd_lock); |
ba9f587a RN |
1438 | if (error) { |
1439 | zio->io_error = error; | |
1440 | zio_execute(zio); | |
1441 | } | |
1b939560 BB |
1442 | return; |
1443 | ||
867178ae RN |
1444 | case ZIO_TYPE_READ: |
1445 | case ZIO_TYPE_WRITE: | |
1446 | zio->io_target_timestamp = zio_handle_io_delay(zio); | |
c4a13ba4 | 1447 | error = vdev_disk_io_rw_fn(zio); |
d441e85d | 1448 | rw_exit(&vd->vd_lock); |
867178ae RN |
1449 | if (error) { |
1450 | zio->io_error = error; | |
1451 | zio_interrupt(zio); | |
1452 | } | |
98b25418 | 1453 | return; |
60101509 | 1454 | |
867178ae RN |
1455 | default: |
1456 | /* | |
1457 | * Getting here means our parent vdev has made a very strange | |
1458 | * request of us, and shouldn't happen. Assert here to force a | |
1459 | * crash in dev builds, but in production return the IO | |
1460 | * unhandled. The pool will likely suspend anyway but that's | |
1461 | * nicer than crashing the kernel. | |
1462 | */ | |
1463 | ASSERT3S(zio->io_type, ==, -1); | |
d441e85d | 1464 | |
867178ae RN |
1465 | rw_exit(&vd->vd_lock); |
1466 | zio->io_error = SET_ERROR(ENOTSUP); | |
98b25418 GW |
1467 | zio_interrupt(zio); |
1468 | return; | |
60101509 | 1469 | } |
867178ae RN |
1470 | |
1471 | __builtin_unreachable(); | |
60101509 BB |
1472 | } |
1473 | ||
1474 | static void | |
1475 | vdev_disk_io_done(zio_t *zio) | |
1476 | { | |
1477 | /* | |
1478 | * If the device returned EIO, we revalidate the media. If it is | |
1479 | * determined the media has changed this triggers the asynchronous | |
1480 | * removal of the device from the configuration. | |
1481 | */ | |
1482 | if (zio->io_error == EIO) { | |
d1d7e268 | 1483 | vdev_t *v = zio->io_vd; |
60101509 BB |
1484 | vdev_disk_t *vd = v->vdev_tsd; |
1485 | ||
386d6a75 RN |
1486 | if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { |
1487 | invalidate_bdev(BDH_BDEV(vd->vd_bdh)); | |
60101509 BB |
1488 | v->vdev_remove_wanted = B_TRUE; |
1489 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
1490 | } | |
1491 | } | |
1492 | } | |
1493 | ||
1494 | static void | |
1495 | vdev_disk_hold(vdev_t *vd) | |
1496 | { | |
1497 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
1498 | ||
1499 | /* We must have a pathname, and it must be absolute. */ | |
1500 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
1501 | return; | |
1502 | ||
1503 | /* | |
1504 | * Only prefetch path and devid info if the device has | |
1505 | * never been opened. | |
1506 | */ | |
1507 | if (vd->vdev_tsd != NULL) | |
1508 | return; | |
1509 | ||
60101509 BB |
1510 | } |
1511 | ||
1512 | static void | |
1513 | vdev_disk_rele(vdev_t *vd) | |
1514 | { | |
1515 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
1516 | ||
1517 | /* XXX: Implement me as a vnode rele for the device */ | |
1518 | } | |
1519 | ||
df2169d1 RN |
1520 | /* |
1521 | * BIO submission method. See comment above about vdev_classic. | |
1522 | * Set zfs_vdev_disk_classic=0 for new, =1 for classic | |
1523 | */ | |
1524 | static uint_t zfs_vdev_disk_classic = 0; /* default new */ | |
1525 | ||
1526 | /* Set submission function from module parameter */ | |
1527 | static int | |
1528 | vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) | |
1529 | { | |
1530 | int err = param_set_uint(buf, kp); | |
1531 | if (err < 0) | |
1532 | return (SET_ERROR(err)); | |
1533 | ||
1534 | vdev_disk_io_rw_fn = | |
1535 | zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; | |
1536 | ||
1537 | printk(KERN_INFO "ZFS: forcing %s BIO submission\n", | |
1538 | zfs_vdev_disk_classic ? "classic" : "new"); | |
1539 | ||
1540 | return (0); | |
1541 | } | |
1542 | ||
c4a13ba4 RN |
1543 | /* |
1544 | * At first use vdev use, set the submission function from the default value if | |
1545 | * it hasn't been set already. | |
1546 | */ | |
1547 | static int | |
1548 | vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) | |
1549 | { | |
1550 | (void) spa; | |
1551 | (void) nv; | |
1552 | (void) tsd; | |
1553 | ||
1554 | if (vdev_disk_io_rw_fn == NULL) | |
df2169d1 RN |
1555 | vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? |
1556 | vdev_classic_physio : vdev_disk_io_rw; | |
c4a13ba4 RN |
1557 | |
1558 | return (0); | |
1559 | } | |
1560 | ||
60101509 | 1561 | vdev_ops_t vdev_disk_ops = { |
c4a13ba4 | 1562 | .vdev_op_init = vdev_disk_init, |
b2255edc | 1563 | .vdev_op_fini = NULL, |
a64f8276 I |
1564 | .vdev_op_open = vdev_disk_open, |
1565 | .vdev_op_close = vdev_disk_close, | |
1566 | .vdev_op_asize = vdev_default_asize, | |
b2255edc BB |
1567 | .vdev_op_min_asize = vdev_default_min_asize, |
1568 | .vdev_op_min_alloc = NULL, | |
a64f8276 I |
1569 | .vdev_op_io_start = vdev_disk_io_start, |
1570 | .vdev_op_io_done = vdev_disk_io_done, | |
1571 | .vdev_op_state_change = NULL, | |
1572 | .vdev_op_need_resilver = NULL, | |
1573 | .vdev_op_hold = vdev_disk_hold, | |
1574 | .vdev_op_rele = vdev_disk_rele, | |
1575 | .vdev_op_remap = NULL, | |
1576 | .vdev_op_xlate = vdev_default_xlate, | |
b2255edc BB |
1577 | .vdev_op_rebuild_asize = NULL, |
1578 | .vdev_op_metaslab_init = NULL, | |
1579 | .vdev_op_config_generate = NULL, | |
1580 | .vdev_op_nparity = NULL, | |
1581 | .vdev_op_ndisks = NULL, | |
a64f8276 | 1582 | .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ |
55c12724 AH |
1583 | .vdev_op_leaf = B_TRUE, /* leaf vdev */ |
1584 | .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post | |
60101509 BB |
1585 | }; |
1586 | ||
9e17e6f2 BB |
1587 | /* |
1588 | * The zfs_vdev_scheduler module option has been deprecated. Setting this | |
1589 | * value no longer has any effect. It has not yet been entirely removed | |
1590 | * to allow the module to be loaded if this option is specified in the | |
1591 | * /etc/modprobe.d/zfs.conf file. The following warning will be logged. | |
1592 | */ | |
1593 | static int | |
1594 | param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) | |
1595 | { | |
1596 | int error = param_set_charp(val, kp); | |
1597 | if (error == 0) { | |
1598 | printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " | |
1599 | "is not supported.\n"); | |
1600 | } | |
1601 | ||
1602 | return (error); | |
1603 | } | |
1604 | ||
18168da7 | 1605 | static const char *zfs_vdev_scheduler = "unused"; |
e771de53 BB |
1606 | module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, |
1607 | param_get_charp, &zfs_vdev_scheduler, 0644); | |
c409e464 | 1608 | MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); |
6fe3498c RM |
1609 | |
1610 | int | |
1611 | param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1612 | { | |
ab8d9c17 | 1613 | uint_t val; |
6fe3498c RM |
1614 | int error; |
1615 | ||
ab8d9c17 | 1616 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1617 | if (error < 0) |
1618 | return (SET_ERROR(error)); | |
1619 | ||
1620 | if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) | |
1621 | return (SET_ERROR(-EINVAL)); | |
1622 | ||
ab8d9c17 | 1623 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1624 | if (error < 0) |
1625 | return (SET_ERROR(error)); | |
1626 | ||
1627 | return (0); | |
1628 | } | |
1629 | ||
1630 | int | |
1631 | param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1632 | { | |
ab8d9c17 | 1633 | uint_t val; |
6fe3498c RM |
1634 | int error; |
1635 | ||
ab8d9c17 | 1636 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1637 | if (error < 0) |
1638 | return (SET_ERROR(error)); | |
1639 | ||
1640 | if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) | |
1641 | return (SET_ERROR(-EINVAL)); | |
1642 | ||
ab8d9c17 | 1643 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1644 | if (error < 0) |
1645 | return (SET_ERROR(error)); | |
1646 | ||
1647 | return (0); | |
1648 | } | |
f66ffe68 SD |
1649 | |
1650 | ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, | |
1651 | "Timeout before determining that a device is missing"); | |
16f0fdad MZ |
1652 | |
1653 | ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, | |
1654 | "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); | |
06a19602 RN |
1655 | |
1656 | ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, | |
1657 | "Maximum number of data segments to add to an IO request (min 4)"); | |
df2169d1 RN |
1658 | |
1659 | ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, | |
1660 | vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, | |
1661 | "Use classic BIO submission method"); |