]>
Commit | Line | Data |
---|---|---|
60101509 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
60101509 BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
1eacf2b3 | 26 | * Copyright (c) 2012, 2019 by Delphix. All rights reserved. |
60101509 BB |
27 | */ |
28 | ||
29 | #include <sys/zfs_context.h> | |
e771de53 | 30 | #include <sys/spa_impl.h> |
60101509 BB |
31 | #include <sys/vdev_disk.h> |
32 | #include <sys/vdev_impl.h> | |
1b939560 | 33 | #include <sys/vdev_trim.h> |
a6255b7f | 34 | #include <sys/abd.h> |
60101509 BB |
35 | #include <sys/fs/zfs.h> |
36 | #include <sys/zio.h> | |
8e82ffba | 37 | #include <linux/blkpg.h> |
74d42600 | 38 | #include <linux/msdos_fs.h> |
05805494 | 39 | #include <linux/vfs_compat.h> |
1e767532 CK |
40 | #ifdef HAVE_LINUX_BLK_CGROUP_HEADER |
41 | #include <linux/blk-cgroup.h> | |
42 | #endif | |
60101509 | 43 | |
386d6a75 RN |
/*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, it's convenient to
 * just use the handle as a proxy. For pre-6.8, we just emulate this with
 * a cast, since we don't need any of the other fields inside the handle.
 *
 *   BDH_BDEV(bdh)	block_device behind the handle
 *   BDH_IS_ERR(bdh)	non-zero when the handle encodes an error
 *   BDH_PTR_ERR(bdh)	error value encoded in the handle
 *   BDH_ERR_PTR(err)	handle value encoding the error 'err'
 *
 * Every macro argument is parenthesized so that an expression argument
 * (e.g. a pointer offset) is evaluated before the cast or member access.
 */
#ifdef HAVE_BDEV_OPEN_BY_PATH
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif
63 | ||
d366c8fd | 64 | typedef struct vdev_disk { |
386d6a75 | 65 | zfs_bdev_handle_t *vd_bdh; |
d366c8fd JL |
66 | krwlock_t vd_lock; |
67 | } vdev_disk_t; | |
68 | ||
a25861dc BB |
69 | /* |
70 | * Unique identifier for the exclusive vdev holder. | |
71 | */ | |
8128bd89 | 72 | static void *zfs_vdev_holder = VDEV_HOLDER; |
6839eed2 | 73 | |
a25861dc BB |
74 | /* |
75 | * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the | |
76 | * device is missing. The missing path may be transient since the links | |
77 | * can be briefly removed and recreated in response to udev events. | |
78 | */ | |
f66ffe68 | 79 | static uint_t zfs_vdev_open_timeout_ms = 1000; |
a25861dc BB |
80 | |
81 | /* | |
82 | * Size of the "reserved" partition, in blocks. | |
83 | */ | |
74d42600 SH |
84 | #define EFI_MIN_RESV_SIZE (16 * 1024) |
85 | ||
16f0fdad MZ |
86 | /* |
87 | * BIO request failfast mask. | |
88 | */ | |
89 | ||
90 | static unsigned int zfs_vdev_failfast_mask = 1; | |
91 | ||
43e8f6e3 CK |
92 | #ifdef HAVE_BLK_MODE_T |
93 | static blk_mode_t | |
94 | #else | |
60101509 | 95 | static fmode_t |
43e8f6e3 | 96 | #endif |
233d34e4 | 97 | vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) |
60101509 | 98 | { |
43e8f6e3 CK |
99 | #ifdef HAVE_BLK_MODE_T |
100 | blk_mode_t mode = 0; | |
101 | ||
102 | if (spa_mode & SPA_MODE_READ) | |
103 | mode |= BLK_OPEN_READ; | |
104 | ||
105 | if (spa_mode & SPA_MODE_WRITE) | |
106 | mode |= BLK_OPEN_WRITE; | |
233d34e4 BB |
107 | |
108 | if (exclusive) | |
109 | mode |= BLK_OPEN_EXCL; | |
43e8f6e3 | 110 | #else |
60101509 BB |
111 | fmode_t mode = 0; |
112 | ||
da92d5cb | 113 | if (spa_mode & SPA_MODE_READ) |
60101509 BB |
114 | mode |= FMODE_READ; |
115 | ||
da92d5cb | 116 | if (spa_mode & SPA_MODE_WRITE) |
60101509 | 117 | mode |= FMODE_WRITE; |
233d34e4 BB |
118 | |
119 | if (exclusive) | |
120 | mode |= FMODE_EXCL; | |
43e8f6e3 | 121 | #endif |
60101509 | 122 | |
d1d7e268 | 123 | return (mode); |
60101509 | 124 | } |
60101509 | 125 | |
d441e85d BB |
126 | /* |
127 | * Returns the usable capacity (in bytes) for the partition or disk. | |
128 | */ | |
60101509 | 129 | static uint64_t |
d441e85d | 130 | bdev_capacity(struct block_device *bdev) |
60101509 | 131 | { |
d441e85d BB |
132 | return (i_size_read(bdev->bd_inode)); |
133 | } | |
60101509 | 134 | |
72ba4b2a BB |
135 | #if !defined(HAVE_BDEV_WHOLE) |
136 | static inline struct block_device * | |
137 | bdev_whole(struct block_device *bdev) | |
138 | { | |
139 | return (bdev->bd_contains); | |
140 | } | |
141 | #endif | |
142 | ||
bebdf52a BB |
143 | #if defined(HAVE_BDEVNAME) |
144 | #define vdev_bdevname(bdev, name) bdevname(bdev, name) | |
145 | #else | |
146 | static inline void | |
147 | vdev_bdevname(struct block_device *bdev, char *name) | |
148 | { | |
149 | snprintf(name, BDEVNAME_SIZE, "%pg", bdev); | |
150 | } | |
151 | #endif | |
152 | ||
d441e85d BB |
153 | /* |
154 | * Returns the maximum expansion capacity of the block device (in bytes). | |
155 | * | |
156 | * It is possible to expand a vdev when it has been created as a wholedisk | |
157 | * and the containing block device has increased in capacity. Or when the | |
158 | * partition containing the pool has been manually increased in size. | |
159 | * | |
160 | * This function is only responsible for calculating the potential expansion | |
161 | * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is | |
162 | * responsible for verifying the expected partition layout in the wholedisk | |
163 | * case, and updating the partition table if appropriate. Once the partition | |
164 | * size has been increased the additional capacity will be visible using | |
165 | * bdev_capacity(). | |
0c637f31 | 166 | * |
167 | * The returned maximum expansion capacity is always expected to be larger, or | |
168 | * at the very least equal, to its usable capacity to prevent overestimating | |
169 | * the pool expandsize. | |
d441e85d BB |
170 | */ |
171 | static uint64_t | |
172 | bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) | |
173 | { | |
174 | uint64_t psize; | |
175 | int64_t available; | |
176 | ||
72ba4b2a | 177 | if (wholedisk && bdev != bdev_whole(bdev)) { |
74d42600 | 178 | /* |
d441e85d BB |
179 | * When reporting maximum expansion capacity for a wholedisk |
180 | * deduct any capacity which is expected to be lost due to | |
181 | * alignment restrictions. Over reporting this value isn't | |
182 | * harmful and would only result in slightly less capacity | |
183 | * than expected post expansion. | |
0c637f31 | 184 | * The estimated available space may be slightly smaller than |
185 | * bdev_capacity() for devices where the number of sectors is | |
186 | * not a multiple of the alignment size and the partition layout | |
187 | * is keeping less than PARTITION_END_ALIGNMENT bytes after the | |
188 | * "reserved" EFI partition: in such cases return the device | |
189 | * usable capacity. | |
74d42600 | 190 | */ |
72ba4b2a | 191 | available = i_size_read(bdev_whole(bdev)->bd_inode) - |
d441e85d BB |
192 | ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + |
193 | PARTITION_END_ALIGNMENT) << SECTOR_BITS); | |
0c637f31 | 194 | psize = MAX(available, bdev_capacity(bdev)); |
74d42600 | 195 | } else { |
d441e85d | 196 | psize = bdev_capacity(bdev); |
74d42600 | 197 | } |
d441e85d BB |
198 | |
199 | return (psize); | |
60101509 BB |
200 | } |
201 | ||
d148e951 BB |
202 | static void |
203 | vdev_disk_error(zio_t *zio) | |
204 | { | |
c71c8c71 | 205 | /* |
206 | * This function can be called in interrupt context, for instance while | |
207 | * handling IRQs coming from a misbehaving disk device; use printk() | |
208 | * which is safe from any context. | |
209 | */ | |
210 | printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " | |
4938d01d | 211 | "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), |
c71c8c71 | 212 | zio->io_vd->vdev_path, zio->io_error, zio->io_type, |
213 | (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, | |
214 | zio->io_flags); | |
d148e951 BB |
215 | } |
216 | ||
55c12724 AH |
217 | static void |
218 | vdev_disk_kobj_evt_post(vdev_t *v) | |
219 | { | |
220 | vdev_disk_t *vd = v->vdev_tsd; | |
386d6a75 RN |
221 | if (vd && vd->vd_bdh) { |
222 | spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); | |
55c12724 AH |
223 | } else { |
224 | vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", | |
225 | v->vdev_path); | |
226 | } | |
227 | } | |
228 | ||
386d6a75 RN |
229 | static zfs_bdev_handle_t * |
230 | vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) | |
43e8f6e3 | 231 | { |
386d6a75 RN |
232 | #if defined(HAVE_BDEV_OPEN_BY_PATH) |
233 | return (bdev_open_by_path(path, | |
234 | vdev_bdev_mode(mode, B_TRUE), holder, NULL)); | |
235 | #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) | |
43e8f6e3 | 236 | return (blkdev_get_by_path(path, |
386d6a75 | 237 | vdev_bdev_mode(mode, B_TRUE), holder, NULL)); |
43e8f6e3 CK |
238 | #else |
239 | return (blkdev_get_by_path(path, | |
233d34e4 | 240 | vdev_bdev_mode(mode, B_TRUE), holder)); |
43e8f6e3 CK |
241 | #endif |
242 | } | |
243 | ||
244 | static void | |
386d6a75 | 245 | vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder) |
43e8f6e3 | 246 | { |
386d6a75 RN |
247 | #if defined(HAVE_BDEV_RELEASE) |
248 | return (bdev_release(bdh)); | |
249 | #elif defined(HAVE_BLKDEV_PUT_HOLDER) | |
250 | return (blkdev_put(BDH_BDEV(bdh), holder)); | |
43e8f6e3 | 251 | #else |
386d6a75 RN |
252 | return (blkdev_put(BDH_BDEV(bdh), |
253 | vdev_bdev_mode(mode, B_TRUE))); | |
43e8f6e3 CK |
254 | #endif |
255 | } | |
256 | ||
60101509 | 257 | static int |
1bd201e7 | 258 | vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, |
6fe3498c | 259 | uint64_t *logical_ashift, uint64_t *physical_ashift) |
60101509 | 260 | { |
386d6a75 | 261 | zfs_bdev_handle_t *bdh; |
43e8f6e3 | 262 | #ifdef HAVE_BLK_MODE_T |
233d34e4 | 263 | blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); |
43e8f6e3 | 264 | #else |
233d34e4 | 265 | fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); |
43e8f6e3 | 266 | #endif |
a25861dc | 267 | hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); |
60101509 | 268 | vdev_disk_t *vd; |
60101509 BB |
269 | |
270 | /* Must have a pathname and it must be absolute. */ | |
271 | if (v->vdev_path == NULL || v->vdev_path[0] != '/') { | |
272 | v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
d441e85d | 273 | vdev_dbgmsg(v, "invalid vdev_path"); |
2d82ea8b | 274 | return (SET_ERROR(EINVAL)); |
60101509 BB |
275 | } |
276 | ||
0d8103d9 | 277 | /* |
d441e85d | 278 | * Reopen the device if it is currently open. When expanding a |
8e82ffba GW |
279 | * partition force re-scanning the partition table if userland |
280 | * did not take care of this already. We need to do this while closed | |
d441e85d BB |
281 | * in order to get an accurate updated block device size. Then |
282 | * since udev may need to recreate the device links increase the | |
a25861dc | 283 | * open retry timeout before reporting the device as unavailable. |
0d8103d9 | 284 | */ |
d441e85d BB |
285 | vd = v->vdev_tsd; |
286 | if (vd) { | |
287 | char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; | |
288 | boolean_t reread_part = B_FALSE; | |
0d8103d9 | 289 | |
d441e85d | 290 | rw_enter(&vd->vd_lock, RW_WRITER); |
386d6a75 RN |
291 | bdh = vd->vd_bdh; |
292 | vd->vd_bdh = NULL; | |
d441e85d | 293 | |
386d6a75 RN |
294 | if (bdh) { |
295 | struct block_device *bdev = BDH_BDEV(bdh); | |
72ba4b2a | 296 | if (v->vdev_expanding && bdev != bdev_whole(bdev)) { |
bebdf52a | 297 | vdev_bdevname(bdev_whole(bdev), disk_name + 5); |
8e82ffba GW |
298 | /* |
299 | * If userland has BLKPG_RESIZE_PARTITION, | |
300 | * then it should have updated the partition | |
301 | * table already. We can detect this by | |
302 | * comparing our current physical size | |
303 | * with that of the device. If they are | |
304 | * the same, then we must not have | |
305 | * BLKPG_RESIZE_PARTITION or it failed to | |
306 | * update the partition table online. We | |
307 | * fallback to rescanning the partition | |
308 | * table from the kernel below. However, | |
309 | * if the capacity already reflects the | |
310 | * updated partition, then we skip | |
311 | * rescanning the partition table here. | |
312 | */ | |
313 | if (v->vdev_psize == bdev_capacity(bdev)) | |
314 | reread_part = B_TRUE; | |
d441e85d BB |
315 | } |
316 | ||
386d6a75 | 317 | vdev_blkdev_put(bdh, mode, zfs_vdev_holder); |
d441e85d BB |
318 | } |
319 | ||
320 | if (reread_part) { | |
386d6a75 RN |
321 | bdh = vdev_blkdev_get_by_path(disk_name, mode, |
322 | zfs_vdev_holder); | |
323 | if (!BDH_IS_ERR(bdh)) { | |
324 | int error = | |
325 | vdev_bdev_reread_part(BDH_BDEV(bdh)); | |
326 | vdev_blkdev_put(bdh, mode, zfs_vdev_holder); | |
a25861dc BB |
327 | if (error == 0) { |
328 | timeout = MSEC2NSEC( | |
329 | zfs_vdev_open_timeout_ms * 2); | |
330 | } | |
d441e85d BB |
331 | } |
332 | } | |
333 | } else { | |
334 | vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); | |
335 | ||
336 | rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); | |
337 | rw_enter(&vd->vd_lock, RW_WRITER); | |
338 | } | |
60101509 BB |
339 | |
340 | /* | |
341 | * Devices are always opened by the path provided at configuration | |
342 | * time. This means that if the provided path is a udev by-id path | |
d441e85d | 343 | * then drives may be re-cabled without an issue. If the provided |
4e95cc99 | 344 | * path is a udev by-path path, then the physical location information |
60101509 BB |
345 | * will be preserved. This can be critical for more complicated |
346 | * configurations where drives are located in specific physical | |
d441e85d BB |
347 | * locations to maximize the systems tolerance to component failure. |
348 | * | |
4e95cc99 | 349 | * Alternatively, you can provide your own udev rule to flexibly map |
60101509 | 350 | * the drives as you see fit. It is not advised that you use the |
4e95cc99 | 351 | * /dev/[hd]d devices which may be reordered due to probing order. |
60101509 BB |
352 | * Devices in the wrong locations will be detected by the higher |
353 | * level vdev validation. | |
2d82ea8b BB |
354 | * |
355 | * The specified paths may be briefly removed and recreated in | |
356 | * response to udev events. This should be exceptionally unlikely | |
357 | * because the zpool command makes every effort to verify these paths | |
358 | * have already settled prior to reaching this point. Therefore, | |
359 | * a ENOENT failure at this point is highly likely to be transient | |
360 | * and it is reasonable to sleep and retry before giving up. In | |
361 | * practice delays have been observed to be on the order of 100ms. | |
77e2756d BB |
362 | * |
363 | * When ERESTARTSYS is returned it indicates the block device is | |
364 | * a zvol which could not be opened due to the deadlock detection | |
365 | * logic in zvol_open(). Extend the timeout and retry the open | |
366 | * subsequent attempts are expected to eventually succeed. | |
60101509 | 367 | */ |
a25861dc | 368 | hrtime_t start = gethrtime(); |
386d6a75 RN |
369 | bdh = BDH_ERR_PTR(-ENXIO); |
370 | while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { | |
371 | bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, | |
372 | zfs_vdev_holder); | |
373 | if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { | |
55c12724 AH |
374 | /* |
375 | * There is no point of waiting since device is removed | |
376 | * explicitly | |
377 | */ | |
378 | if (v->vdev_removed) | |
379 | break; | |
380 | ||
d441e85d | 381 | schedule_timeout(MSEC_TO_TICK(10)); |
386d6a75 | 382 | } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { |
77e2756d BB |
383 | timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); |
384 | continue; | |
386d6a75 | 385 | } else if (BDH_IS_ERR(bdh)) { |
2d82ea8b BB |
386 | break; |
387 | } | |
388 | } | |
389 | ||
386d6a75 RN |
390 | if (BDH_IS_ERR(bdh)) { |
391 | int error = -BDH_PTR_ERR(bdh); | |
a25861dc BB |
392 | vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, |
393 | (u_longlong_t)(gethrtime() - start), | |
394 | (u_longlong_t)timeout); | |
386d6a75 | 395 | vd->vd_bdh = NULL; |
d441e85d BB |
396 | v->vdev_tsd = vd; |
397 | rw_exit(&vd->vd_lock); | |
398 | return (SET_ERROR(error)); | |
399 | } else { | |
386d6a75 | 400 | vd->vd_bdh = bdh; |
d441e85d BB |
401 | v->vdev_tsd = vd; |
402 | rw_exit(&vd->vd_lock); | |
60101509 BB |
403 | } |
404 | ||
386d6a75 RN |
405 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); |
406 | ||
0d8103d9 | 407 | /* Determine the physical block size */ |
386d6a75 | 408 | int physical_block_size = bdev_physical_block_size(bdev); |
6fe3498c RM |
409 | |
410 | /* Determine the logical block size */ | |
386d6a75 | 411 | int logical_block_size = bdev_logical_block_size(bdev); |
60101509 | 412 | |
60101509 BB |
413 | /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ |
414 | v->vdev_nowritecache = B_FALSE; | |
415 | ||
1b939560 | 416 | /* Set when device reports it supports TRIM. */ |
386d6a75 | 417 | v->vdev_has_trim = bdev_discard_supported(bdev); |
1b939560 BB |
418 | |
419 | /* Set when device reports it supports secure TRIM. */ | |
386d6a75 | 420 | v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); |
1b939560 | 421 | |
fb40095f | 422 | /* Inform the ZIO pipeline that we are non-rotational */ |
386d6a75 | 423 | v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); |
fb40095f | 424 | |
d441e85d | 425 | /* Physical volume size in bytes for the partition */ |
386d6a75 | 426 | *psize = bdev_capacity(bdev); |
d441e85d BB |
427 | |
428 | /* Physical volume size in bytes including possible expansion space */ | |
386d6a75 | 429 | *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); |
1bd201e7 | 430 | |
60101509 | 431 | /* Based on the minimum sector size set the block size */ |
6fe3498c RM |
432 | *physical_ashift = highbit64(MAX(physical_block_size, |
433 | SPA_MINBLOCKSIZE)) - 1; | |
434 | ||
435 | *logical_ashift = highbit64(MAX(logical_block_size, | |
436 | SPA_MINBLOCKSIZE)) - 1; | |
60101509 | 437 | |
d1d7e268 | 438 | return (0); |
60101509 BB |
439 | } |
440 | ||
441 | static void | |
442 | vdev_disk_close(vdev_t *v) | |
443 | { | |
444 | vdev_disk_t *vd = v->vdev_tsd; | |
445 | ||
0d8103d9 | 446 | if (v->vdev_reopening || vd == NULL) |
60101509 BB |
447 | return; |
448 | ||
386d6a75 RN |
449 | if (vd->vd_bdh != NULL) { |
450 | vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), | |
43e8f6e3 | 451 | zfs_vdev_holder); |
d441e85d | 452 | } |
60101509 | 453 | |
d441e85d | 454 | rw_destroy(&vd->vd_lock); |
d1d7e268 | 455 | kmem_free(vd, sizeof (vdev_disk_t)); |
60101509 BB |
456 | v->vdev_tsd = NULL; |
457 | } | |
458 | ||
/*
 * Hand 'bio' to the kernel, hiding the difference between the one-argument
 * and two-argument forms of submit_bio().  Any return value is ignored.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifndef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio_data_dir(bio), bio);
#else
	(void) submit_bio(bio);
#endif
}
468 | ||
2e407941 BB |
/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build.  On
 * ARM64 kernels built with both CONFIG_PREEMPTION and CONFIG_BLK_CGROUP,
 * redirect it to preempt_schedule instead.
 */
#if defined(CONFIG_ARM64) && defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x)	preempt_schedule(x)
#endif
478 | ||
5f264996 BB |
479 | /* |
480 | * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct | |
481 | * as an argument removing the need to set it with bio_set_dev(). This | |
482 | * removes the need for all of the following compatibility code. | |
483 | */ | |
484 | #if !defined(HAVE_BIO_ALLOC_4ARG) | |
485 | ||
26a85659 BB |
486 | #ifdef HAVE_BIO_SET_DEV |
487 | #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) | |
bd0d24e0 BB |
488 | /* |
489 | * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by | |
490 | * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). | |
491 | * As a side effect the function was converted to GPL-only. Define our | |
492 | * own version when needed which uses rcu_read_lock_sched(). | |
036e846a RS |
493 | * |
494 | * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public | |
495 | * part, moving blkg_tryget into the private one. Define our own version. | |
bd0d24e0 | 496 | */ |
036e846a | 497 | #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) |
bd0d24e0 BB |
498 | static inline bool |
499 | vdev_blkg_tryget(struct blkcg_gq *blkg) | |
500 | { | |
501 | struct percpu_ref *ref = &blkg->refcnt; | |
502 | unsigned long __percpu *count; | |
503 | bool rc; | |
504 | ||
505 | rcu_read_lock_sched(); | |
506 | ||
507 | if (__ref_is_percpu(ref, &count)) { | |
508 | this_cpu_inc(*count); | |
509 | rc = true; | |
510 | } else { | |
838a2490 CK |
511 | #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA |
512 | rc = atomic_long_inc_not_zero(&ref->data->count); | |
513 | #else | |
bd0d24e0 | 514 | rc = atomic_long_inc_not_zero(&ref->count); |
838a2490 | 515 | #endif |
bd0d24e0 BB |
516 | } |
517 | ||
518 | rcu_read_unlock_sched(); | |
519 | ||
520 | return (rc); | |
521 | } | |
036e846a | 522 | #else |
bd0d24e0 BB |
523 | #define vdev_blkg_tryget(bg) blkg_tryget(bg) |
524 | #endif | |
d08b99ac | 525 | #ifdef HAVE_BIO_SET_DEV_MACRO |
26a85659 BB |
526 | /* |
527 | * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the | |
528 | * GPL-only bio_associate_blkg() symbol thus inadvertently converting | |
529 | * the entire macro. Provide a minimal version which always assigns the | |
530 | * request queue's root_blkg to the bio. | |
531 | */ | |
532 | static inline void | |
533 | vdev_bio_associate_blkg(struct bio *bio) | |
534 | { | |
d939930f CK |
535 | #if defined(HAVE_BIO_BDEV_DISK) |
536 | struct request_queue *q = bio->bi_bdev->bd_disk->queue; | |
537 | #else | |
26a85659 | 538 | struct request_queue *q = bio->bi_disk->queue; |
d939930f | 539 | #endif |
26a85659 BB |
540 | |
541 | ASSERT3P(q, !=, NULL); | |
26a85659 BB |
542 | ASSERT3P(bio->bi_blkg, ==, NULL); |
543 | ||
bd0d24e0 | 544 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) |
26a85659 BB |
545 | bio->bi_blkg = q->root_blkg; |
546 | } | |
d08b99ac | 547 | |
26a85659 | 548 | #define bio_associate_blkg vdev_bio_associate_blkg |
d08b99ac CK |
549 | #else |
550 | static inline void | |
551 | vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) | |
552 | { | |
553 | #if defined(HAVE_BIO_BDEV_DISK) | |
554 | struct request_queue *q = bdev->bd_disk->queue; | |
555 | #else | |
556 | struct request_queue *q = bio->bi_disk->queue; | |
557 | #endif | |
558 | bio_clear_flag(bio, BIO_REMAPPED); | |
559 | if (bio->bi_bdev != bdev) | |
560 | bio_clear_flag(bio, BIO_THROTTLED); | |
561 | bio->bi_bdev = bdev; | |
562 | ||
563 | ASSERT3P(q, !=, NULL); | |
564 | ASSERT3P(bio->bi_blkg, ==, NULL); | |
565 | ||
566 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) | |
567 | bio->bi_blkg = q->root_blkg; | |
568 | } | |
569 | #define bio_set_dev vdev_bio_set_dev | |
570 | #endif | |
26a85659 BB |
571 | #endif |
572 | #else | |
573 | /* | |
574 | * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. | |
575 | */ | |
787acae0 GDN |
576 | static inline void |
577 | bio_set_dev(struct bio *bio, struct block_device *bdev) | |
578 | { | |
579 | bio->bi_bdev = bdev; | |
580 | } | |
26a85659 | 581 | #endif /* HAVE_BIO_SET_DEV */ |
5f264996 | 582 | #endif /* !HAVE_BIO_ALLOC_4ARG */ |
787acae0 | 583 | |
37f9dac5 | 584 | static inline void |
3b86aeb2 | 585 | vdev_submit_bio(struct bio *bio) |
37f9dac5 | 586 | { |
37f9dac5 RY |
587 | struct bio_list *bio_list = current->bio_list; |
588 | current->bio_list = NULL; | |
3b86aeb2 | 589 | vdev_submit_bio_impl(bio); |
37f9dac5 | 590 | current->bio_list = bio_list; |
37f9dac5 RY |
591 | } |
592 | ||
5f264996 BB |
593 | static inline struct bio * |
594 | vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, | |
595 | unsigned short nr_vecs) | |
596 | { | |
597 | struct bio *bio; | |
598 | ||
d1325b4f | 599 | #ifdef HAVE_BIO_ALLOC_4ARG |
5f264996 BB |
600 | bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); |
601 | #else | |
602 | bio = bio_alloc(gfp_mask, nr_vecs); | |
603 | if (likely(bio != NULL)) | |
604 | bio_set_dev(bio, bdev); | |
d1325b4f AZ |
605 | #endif |
606 | ||
5f264996 BB |
607 | return (bio); |
608 | } | |
609 | ||
f3b85d70 RN |
610 | /* ========== */ |
611 | ||
612 | /* | |
613 | * This is the classic, battle-tested BIO submission code. | |
614 | * | |
615 | * These functions have been renamed to vdev_classic_* to make it clear what | |
616 | * they belong to, but their implementations are unchanged. | |
617 | */ | |
618 | ||
619 | /* | |
620 | * Virtual device vector for disks. | |
621 | */ | |
622 | typedef struct dio_request { | |
623 | zio_t *dr_zio; /* Parent ZIO */ | |
624 | atomic_t dr_ref; /* References */ | |
625 | int dr_error; /* Bio error */ | |
626 | int dr_bio_count; /* Count of bio's */ | |
627 | struct bio *dr_bio[]; /* Attached bio's */ | |
628 | } dio_request_t; | |
629 | ||
630 | static dio_request_t * | |
631 | vdev_classic_dio_alloc(int bio_count) | |
632 | { | |
633 | dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + | |
634 | sizeof (struct bio *) * bio_count, KM_SLEEP); | |
635 | atomic_set(&dr->dr_ref, 0); | |
636 | dr->dr_bio_count = bio_count; | |
637 | dr->dr_error = 0; | |
638 | ||
639 | for (int i = 0; i < dr->dr_bio_count; i++) | |
640 | dr->dr_bio[i] = NULL; | |
641 | ||
642 | return (dr); | |
643 | } | |
644 | ||
645 | static void | |
646 | vdev_classic_dio_free(dio_request_t *dr) | |
647 | { | |
648 | int i; | |
649 | ||
650 | for (i = 0; i < dr->dr_bio_count; i++) | |
651 | if (dr->dr_bio[i]) | |
652 | bio_put(dr->dr_bio[i]); | |
653 | ||
654 | kmem_free(dr, sizeof (dio_request_t) + | |
655 | sizeof (struct bio *) * dr->dr_bio_count); | |
656 | } | |
657 | ||
658 | static void | |
659 | vdev_classic_dio_get(dio_request_t *dr) | |
660 | { | |
661 | atomic_inc(&dr->dr_ref); | |
662 | } | |
663 | ||
664 | static void | |
665 | vdev_classic_dio_put(dio_request_t *dr) | |
666 | { | |
667 | int rc = atomic_dec_return(&dr->dr_ref); | |
668 | ||
669 | /* | |
670 | * Free the dio_request when the last reference is dropped and | |
671 | * ensure zio_interpret is called only once with the correct zio | |
672 | */ | |
673 | if (rc == 0) { | |
674 | zio_t *zio = dr->dr_zio; | |
675 | int error = dr->dr_error; | |
676 | ||
677 | vdev_classic_dio_free(dr); | |
678 | ||
679 | if (zio) { | |
680 | zio->io_error = error; | |
681 | ASSERT3S(zio->io_error, >=, 0); | |
682 | if (zio->io_error) | |
683 | vdev_disk_error(zio); | |
684 | ||
685 | zio_delay_interrupt(zio); | |
686 | } | |
687 | } | |
688 | } | |
689 | ||
690 | BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) | |
691 | { | |
692 | dio_request_t *dr = bio->bi_private; | |
693 | ||
694 | if (dr->dr_error == 0) { | |
695 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
696 | dr->dr_error = BIO_END_IO_ERROR(bio); | |
697 | #else | |
698 | if (error) | |
699 | dr->dr_error = -(error); | |
700 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
701 | dr->dr_error = EIO; | |
702 | #endif | |
703 | } | |
704 | ||
705 | /* Drop reference acquired by vdev_classic_physio */ | |
706 | vdev_classic_dio_put(dr); | |
707 | } | |
708 | ||
5f264996 | 709 | static inline unsigned int |
f3b85d70 | 710 | vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) |
5f264996 BB |
711 | { |
712 | unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, | |
713 | bio_size, abd_offset); | |
714 | ||
715 | #ifdef HAVE_BIO_MAX_SEGS | |
716 | return (bio_max_segs(nr_segs)); | |
717 | #else | |
718 | return (MIN(nr_segs, BIO_MAX_PAGES)); | |
719 | #endif | |
720 | } | |
721 | ||
60101509 | 722 | static int |
867178ae | 723 | vdev_classic_physio(zio_t *zio) |
60101509 | 724 | { |
867178ae RN |
725 | vdev_t *v = zio->io_vd; |
726 | vdev_disk_t *vd = v->vdev_tsd; | |
727 | struct block_device *bdev = BDH_BDEV(vd->vd_bdh); | |
728 | size_t io_size = zio->io_size; | |
729 | uint64_t io_offset = zio->io_offset; | |
730 | int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; | |
731 | int flags = 0; | |
732 | ||
d1d7e268 | 733 | dio_request_t *dr; |
b0be93e8 | 734 | uint64_t abd_offset; |
60101509 | 735 | uint64_t bio_offset; |
f8c0d7e1 MA |
736 | int bio_size; |
737 | int bio_count = 16; | |
738 | int error = 0; | |
e8ac4557 | 739 | struct blk_plug plug; |
5f264996 | 740 | unsigned short nr_vecs; |
066e8252 | 741 | |
d441e85d BB |
742 | /* |
743 | * Accessing outside the block device is never allowed. | |
744 | */ | |
745 | if (io_offset + io_size > bdev->bd_inode->i_size) { | |
746 | vdev_dbgmsg(zio->io_vd, | |
747 | "Illegal access %llu size %llu, device size %llu", | |
5dbf6c5a AZ |
748 | (u_longlong_t)io_offset, |
749 | (u_longlong_t)io_size, | |
750 | (u_longlong_t)i_size_read(bdev->bd_inode)); | |
d441e85d BB |
751 | return (SET_ERROR(EIO)); |
752 | } | |
e06be586 | 753 | |
60101509 | 754 | retry: |
f3b85d70 | 755 | dr = vdev_classic_dio_alloc(bio_count); |
60101509 | 756 | |
f1100863 | 757 | if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && |
16f0fdad MZ |
758 | zio->io_vd->vdev_failfast == B_TRUE) { |
759 | bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, | |
760 | zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); | |
761 | } | |
2959d94a | 762 | |
60101509 | 763 | dr->dr_zio = zio; |
60101509 | 764 | |
60101509 | 765 | /* |
f8c0d7e1 MA |
766 | * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which |
767 | * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio | |
768 | * can cover at least 128KB and at most 1MB. When the required number | |
769 | * of iovec's exceeds this, we are forced to break the IO in multiple | |
770 | * bio's and wait for them all to complete. This is likely if the | |
771 | * recordsize property is increased beyond 1MB. The default | |
772 | * bio_count=16 should typically accommodate the maximum-size zio of | |
773 | * 16MB. | |
60101509 | 774 | */ |
a6255b7f | 775 | |
b0be93e8 IH |
776 | abd_offset = 0; |
777 | bio_offset = io_offset; | |
f8c0d7e1 MA |
778 | bio_size = io_size; |
779 | for (int i = 0; i <= dr->dr_bio_count; i++) { | |
60101509 BB |
780 | |
781 | /* Finished constructing bio's for given buffer */ | |
782 | if (bio_size <= 0) | |
783 | break; | |
784 | ||
785 | /* | |
f8c0d7e1 MA |
786 | * If additional bio's are required, we have to retry, but |
787 | * this should be rare - see the comment above. | |
60101509 BB |
788 | */ |
789 | if (dr->dr_bio_count == i) { | |
f3b85d70 | 790 | vdev_classic_dio_free(dr); |
60101509 | 791 | bio_count *= 2; |
60101509 BB |
792 | goto retry; |
793 | } | |
794 | ||
f3b85d70 | 795 | nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); |
5f264996 | 796 | dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); |
1086f542 | 797 | if (unlikely(dr->dr_bio[i] == NULL)) { |
f3b85d70 | 798 | vdev_classic_dio_free(dr); |
ecb2b7dc | 799 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
800 | } |
801 | ||
f3b85d70 RN |
802 | /* Matching put called by vdev_classic_physio_completion */ |
803 | vdev_classic_dio_get(dr); | |
60101509 | 804 | |
d4541210 | 805 | BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; |
f3b85d70 | 806 | dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; |
60101509 | 807 | dr->dr_bio[i]->bi_private = dr; |
3b86aeb2 | 808 | bio_set_op_attrs(dr->dr_bio[i], rw, flags); |
60101509 BB |
809 | |
810 | /* Remaining size is returned to become the new size */ | |
fb822260 | 811 | bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, |
02730c33 | 812 | bio_size, abd_offset); |
60101509 BB |
813 | |
814 | /* Advance in buffer and construct another bio if needed */ | |
b0be93e8 | 815 | abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
d4541210 | 816 | bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
60101509 BB |
817 | } |
818 | ||
37f9dac5 | 819 | /* Extra reference to protect dio_request during vdev_submit_bio */ |
f3b85d70 | 820 | vdev_classic_dio_get(dr); |
60101509 | 821 | |
e8ac4557 IH |
822 | if (dr->dr_bio_count > 1) |
823 | blk_start_plug(&plug); | |
e8ac4557 | 824 | |
60101509 | 825 | /* Submit all bio's associated with this dio */ |
f8c0d7e1 | 826 | for (int i = 0; i < dr->dr_bio_count; i++) { |
60101509 | 827 | if (dr->dr_bio[i]) |
3b86aeb2 | 828 | vdev_submit_bio(dr->dr_bio[i]); |
f8c0d7e1 | 829 | } |
60101509 | 830 | |
e8ac4557 IH |
831 | if (dr->dr_bio_count > 1) |
832 | blk_finish_plug(&plug); | |
e8ac4557 | 833 | |
f3b85d70 | 834 | vdev_classic_dio_put(dr); |
60101509 | 835 | |
d1d7e268 | 836 | return (error); |
60101509 BB |
837 | } |
838 | ||
f3b85d70 RN |
839 | /* ========== */ |
840 | ||
36ba27e9 | 841 | BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) |
60101509 BB |
842 | { |
843 | zio_t *zio = bio->bi_private; | |
784a7fe5 | 844 | #ifdef HAVE_1ARG_BIO_END_IO_T |
36ba27e9 BB |
845 | zio->io_error = BIO_END_IO_ERROR(bio); |
846 | #else | |
847 | zio->io_error = -error; | |
784a7fe5 | 848 | #endif |
60101509 | 849 | |
36ba27e9 | 850 | if (zio->io_error && (zio->io_error == EOPNOTSUPP)) |
60101509 BB |
851 | zio->io_vd->vdev_nowritecache = B_TRUE; |
852 | ||
853 | bio_put(bio); | |
d148e951 BB |
854 | ASSERT3S(zio->io_error, >=, 0); |
855 | if (zio->io_error) | |
856 | vdev_disk_error(zio); | |
60101509 | 857 | zio_interrupt(zio); |
60101509 BB |
858 | } |
859 | ||
860 | static int | |
861 | vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) | |
862 | { | |
863 | struct request_queue *q; | |
864 | struct bio *bio; | |
865 | ||
866 | q = bdev_get_queue(bdev); | |
867 | if (!q) | |
ecb2b7dc | 868 | return (SET_ERROR(ENXIO)); |
60101509 | 869 | |
5f264996 | 870 | bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); |
29b763cd | 871 | if (unlikely(bio == NULL)) |
ecb2b7dc | 872 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
873 | |
874 | bio->bi_end_io = vdev_disk_io_flush_completion; | |
875 | bio->bi_private = zio; | |
a5e046ea | 876 | bio_set_flush(bio); |
3b86aeb2 | 877 | vdev_submit_bio(bio); |
cecb7487 | 878 | invalidate_bdev(bdev); |
60101509 | 879 | |
d1d7e268 | 880 | return (0); |
60101509 | 881 | } |
60101509 | 882 | |
06e25f9c US |
883 | #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ |
884 | defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) | |
885 | BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) | |
886 | { | |
887 | zio_t *zio = bio->bi_private; | |
888 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
889 | zio->io_error = BIO_END_IO_ERROR(bio); | |
890 | #else | |
891 | zio->io_error = -error; | |
892 | #endif | |
893 | bio_put(bio); | |
894 | if (zio->io_error) | |
895 | vdev_disk_error(zio); | |
896 | zio_interrupt(zio); | |
897 | } | |
898 | ||
a12a5cb5 | 899 | static int |
06e25f9c | 900 | vdev_issue_discard_trim(zio_t *zio, unsigned long flags) |
a12a5cb5 | 901 | { |
06e25f9c US |
902 | int ret; |
903 | struct bio *bio = NULL; | |
a12a5cb5 | 904 | |
06e25f9c US |
905 | #if defined(BLKDEV_DISCARD_SECURE) |
906 | ret = - __blkdev_issue_discard( | |
907 | BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), | |
908 | zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); | |
909 | #else | |
910 | (void) flags; | |
911 | ret = - __blkdev_issue_discard( | |
912 | BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), | |
913 | zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); | |
914 | #endif | |
915 | if (!ret && bio) { | |
916 | bio->bi_private = zio; | |
917 | bio->bi_end_io = vdev_disk_discard_end_io; | |
918 | vdev_submit_bio(bio); | |
a12a5cb5 | 919 | } |
06e25f9c US |
920 | return (ret); |
921 | } | |
922 | #endif | |
923 | ||
924 | static int | |
925 | vdev_disk_io_trim(zio_t *zio) | |
926 | { | |
a12a5cb5 | 927 | unsigned long trim_flags = 0; |
06e25f9c US |
928 | if (zio->io_trim_flags & ZIO_TRIM_SECURE) { |
929 | #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) | |
930 | return (-blkdev_issue_secure_erase( | |
931 | BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), | |
932 | zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); | |
933 | #elif defined(BLKDEV_DISCARD_SECURE) | |
a12a5cb5 BB |
934 | trim_flags |= BLKDEV_DISCARD_SECURE; |
935 | #endif | |
06e25f9c US |
936 | } |
937 | #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ | |
938 | defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) | |
939 | return (vdev_issue_discard_trim(zio, trim_flags)); | |
940 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD) | |
941 | return (-blkdev_issue_discard( | |
942 | BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), | |
a12a5cb5 BB |
943 | zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); |
944 | #else | |
945 | #error "Unsupported kernel" | |
946 | #endif | |
947 | } | |
948 | ||
c4a13ba4 RN |
949 | int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; |
950 | ||
98b25418 | 951 | static void |
60101509 BB |
952 | vdev_disk_io_start(zio_t *zio) |
953 | { | |
954 | vdev_t *v = zio->io_vd; | |
955 | vdev_disk_t *vd = v->vdev_tsd; | |
867178ae | 956 | int error; |
60101509 | 957 | |
d441e85d BB |
958 | /* |
959 | * If the vdev is closed, it's likely in the REMOVED or FAULTED state. | |
960 | * Nothing to be done here but return failure. | |
961 | */ | |
962 | if (vd == NULL) { | |
963 | zio->io_error = ENXIO; | |
964 | zio_interrupt(zio); | |
965 | return; | |
966 | } | |
967 | ||
968 | rw_enter(&vd->vd_lock, RW_READER); | |
969 | ||
970 | /* | |
971 | * If the vdev is closed, it's likely due to a failed reopen and is | |
972 | * in the UNAVAIL state. Nothing to be done here but return failure. | |
973 | */ | |
386d6a75 | 974 | if (vd->vd_bdh == NULL) { |
d441e85d BB |
975 | rw_exit(&vd->vd_lock); |
976 | zio->io_error = ENXIO; | |
977 | zio_interrupt(zio); | |
978 | return; | |
979 | } | |
980 | ||
60101509 BB |
981 | switch (zio->io_type) { |
982 | case ZIO_TYPE_IOCTL: | |
983 | ||
984 | if (!vdev_readable(v)) { | |
d441e85d | 985 | rw_exit(&vd->vd_lock); |
2e528b49 | 986 | zio->io_error = SET_ERROR(ENXIO); |
98b25418 GW |
987 | zio_interrupt(zio); |
988 | return; | |
60101509 BB |
989 | } |
990 | ||
991 | switch (zio->io_cmd) { | |
992 | case DKIOCFLUSHWRITECACHE: | |
993 | ||
994 | if (zfs_nocacheflush) | |
995 | break; | |
996 | ||
997 | if (v->vdev_nowritecache) { | |
2e528b49 | 998 | zio->io_error = SET_ERROR(ENOTSUP); |
60101509 BB |
999 | break; |
1000 | } | |
1001 | ||
386d6a75 | 1002 | error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); |
d441e85d BB |
1003 | if (error == 0) { |
1004 | rw_exit(&vd->vd_lock); | |
98b25418 | 1005 | return; |
d441e85d | 1006 | } |
60101509 BB |
1007 | |
1008 | zio->io_error = error; | |
60101509 BB |
1009 | |
1010 | break; | |
1011 | ||
1012 | default: | |
2e528b49 | 1013 | zio->io_error = SET_ERROR(ENOTSUP); |
60101509 BB |
1014 | } |
1015 | ||
d441e85d | 1016 | rw_exit(&vd->vd_lock); |
98b25418 GW |
1017 | zio_execute(zio); |
1018 | return; | |
60101509 | 1019 | |
1b939560 | 1020 | case ZIO_TYPE_TRIM: |
a12a5cb5 | 1021 | zio->io_error = vdev_disk_io_trim(zio); |
1b939560 | 1022 | rw_exit(&vd->vd_lock); |
06e25f9c US |
1023 | #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) |
1024 | if (zio->io_trim_flags & ZIO_TRIM_SECURE) | |
1025 | zio_interrupt(zio); | |
1026 | #elif defined(HAVE_BLKDEV_ISSUE_DISCARD) | |
1b939560 | 1027 | zio_interrupt(zio); |
06e25f9c | 1028 | #endif |
1b939560 BB |
1029 | return; |
1030 | ||
867178ae RN |
1031 | case ZIO_TYPE_READ: |
1032 | case ZIO_TYPE_WRITE: | |
1033 | zio->io_target_timestamp = zio_handle_io_delay(zio); | |
c4a13ba4 | 1034 | error = vdev_disk_io_rw_fn(zio); |
d441e85d | 1035 | rw_exit(&vd->vd_lock); |
867178ae RN |
1036 | if (error) { |
1037 | zio->io_error = error; | |
1038 | zio_interrupt(zio); | |
1039 | } | |
98b25418 | 1040 | return; |
60101509 | 1041 | |
867178ae RN |
1042 | default: |
1043 | /* | |
1044 | * Getting here means our parent vdev has made a very strange | |
1045 | * request of us, and shouldn't happen. Assert here to force a | |
1046 | * crash in dev builds, but in production return the IO | |
1047 | * unhandled. The pool will likely suspend anyway but that's | |
1048 | * nicer than crashing the kernel. | |
1049 | */ | |
1050 | ASSERT3S(zio->io_type, ==, -1); | |
d441e85d | 1051 | |
867178ae RN |
1052 | rw_exit(&vd->vd_lock); |
1053 | zio->io_error = SET_ERROR(ENOTSUP); | |
98b25418 GW |
1054 | zio_interrupt(zio); |
1055 | return; | |
60101509 | 1056 | } |
867178ae RN |
1057 | |
1058 | __builtin_unreachable(); | |
60101509 BB |
1059 | } |
1060 | ||
1061 | static void | |
1062 | vdev_disk_io_done(zio_t *zio) | |
1063 | { | |
1064 | /* | |
1065 | * If the device returned EIO, we revalidate the media. If it is | |
1066 | * determined the media has changed this triggers the asynchronous | |
1067 | * removal of the device from the configuration. | |
1068 | */ | |
1069 | if (zio->io_error == EIO) { | |
d1d7e268 | 1070 | vdev_t *v = zio->io_vd; |
60101509 BB |
1071 | vdev_disk_t *vd = v->vdev_tsd; |
1072 | ||
386d6a75 RN |
1073 | if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { |
1074 | invalidate_bdev(BDH_BDEV(vd->vd_bdh)); | |
60101509 BB |
1075 | v->vdev_remove_wanted = B_TRUE; |
1076 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
1077 | } | |
1078 | } | |
1079 | } | |
1080 | ||
1081 | static void | |
1082 | vdev_disk_hold(vdev_t *vd) | |
1083 | { | |
1084 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
1085 | ||
1086 | /* We must have a pathname, and it must be absolute. */ | |
1087 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
1088 | return; | |
1089 | ||
1090 | /* | |
1091 | * Only prefetch path and devid info if the device has | |
1092 | * never been opened. | |
1093 | */ | |
1094 | if (vd->vdev_tsd != NULL) | |
1095 | return; | |
1096 | ||
60101509 BB |
1097 | } |
1098 | ||
1099 | static void | |
1100 | vdev_disk_rele(vdev_t *vd) | |
1101 | { | |
1102 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
1103 | ||
1104 | /* XXX: Implement me as a vnode rele for the device */ | |
1105 | } | |
1106 | ||
c4a13ba4 RN |
1107 | /* |
1108 | * At first use vdev use, set the submission function from the default value if | |
1109 | * it hasn't been set already. | |
1110 | */ | |
1111 | static int | |
1112 | vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) | |
1113 | { | |
1114 | (void) spa; | |
1115 | (void) nv; | |
1116 | (void) tsd; | |
1117 | ||
1118 | if (vdev_disk_io_rw_fn == NULL) | |
1119 | vdev_disk_io_rw_fn = vdev_classic_physio; | |
1120 | ||
1121 | return (0); | |
1122 | } | |
1123 | ||
60101509 | 1124 | vdev_ops_t vdev_disk_ops = { |
c4a13ba4 | 1125 | .vdev_op_init = vdev_disk_init, |
b2255edc | 1126 | .vdev_op_fini = NULL, |
a64f8276 I |
1127 | .vdev_op_open = vdev_disk_open, |
1128 | .vdev_op_close = vdev_disk_close, | |
1129 | .vdev_op_asize = vdev_default_asize, | |
b2255edc BB |
1130 | .vdev_op_min_asize = vdev_default_min_asize, |
1131 | .vdev_op_min_alloc = NULL, | |
a64f8276 I |
1132 | .vdev_op_io_start = vdev_disk_io_start, |
1133 | .vdev_op_io_done = vdev_disk_io_done, | |
1134 | .vdev_op_state_change = NULL, | |
1135 | .vdev_op_need_resilver = NULL, | |
1136 | .vdev_op_hold = vdev_disk_hold, | |
1137 | .vdev_op_rele = vdev_disk_rele, | |
1138 | .vdev_op_remap = NULL, | |
1139 | .vdev_op_xlate = vdev_default_xlate, | |
b2255edc BB |
1140 | .vdev_op_rebuild_asize = NULL, |
1141 | .vdev_op_metaslab_init = NULL, | |
1142 | .vdev_op_config_generate = NULL, | |
1143 | .vdev_op_nparity = NULL, | |
1144 | .vdev_op_ndisks = NULL, | |
a64f8276 | 1145 | .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ |
55c12724 AH |
1146 | .vdev_op_leaf = B_TRUE, /* leaf vdev */ |
1147 | .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post | |
60101509 BB |
1148 | }; |
1149 | ||
9e17e6f2 BB |
1150 | /* |
1151 | * The zfs_vdev_scheduler module option has been deprecated. Setting this | |
1152 | * value no longer has any effect. It has not yet been entirely removed | |
1153 | * to allow the module to be loaded if this option is specified in the | |
1154 | * /etc/modprobe.d/zfs.conf file. The following warning will be logged. | |
1155 | */ | |
1156 | static int | |
1157 | param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) | |
1158 | { | |
1159 | int error = param_set_charp(val, kp); | |
1160 | if (error == 0) { | |
1161 | printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " | |
1162 | "is not supported.\n"); | |
1163 | } | |
1164 | ||
1165 | return (error); | |
1166 | } | |
1167 | ||
18168da7 | 1168 | static const char *zfs_vdev_scheduler = "unused"; |
e771de53 BB |
1169 | module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, |
1170 | param_get_charp, &zfs_vdev_scheduler, 0644); | |
c409e464 | 1171 | MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); |
6fe3498c RM |
1172 | |
1173 | int | |
1174 | param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1175 | { | |
ab8d9c17 | 1176 | uint_t val; |
6fe3498c RM |
1177 | int error; |
1178 | ||
ab8d9c17 | 1179 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1180 | if (error < 0) |
1181 | return (SET_ERROR(error)); | |
1182 | ||
1183 | if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) | |
1184 | return (SET_ERROR(-EINVAL)); | |
1185 | ||
ab8d9c17 | 1186 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1187 | if (error < 0) |
1188 | return (SET_ERROR(error)); | |
1189 | ||
1190 | return (0); | |
1191 | } | |
1192 | ||
1193 | int | |
1194 | param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
1195 | { | |
ab8d9c17 | 1196 | uint_t val; |
6fe3498c RM |
1197 | int error; |
1198 | ||
ab8d9c17 | 1199 | error = kstrtouint(buf, 0, &val); |
6fe3498c RM |
1200 | if (error < 0) |
1201 | return (SET_ERROR(error)); | |
1202 | ||
1203 | if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) | |
1204 | return (SET_ERROR(-EINVAL)); | |
1205 | ||
ab8d9c17 | 1206 | error = param_set_uint(buf, kp); |
6fe3498c RM |
1207 | if (error < 0) |
1208 | return (SET_ERROR(error)); | |
1209 | ||
1210 | return (0); | |
1211 | } | |
f66ffe68 SD |
1212 | |
1213 | ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, | |
1214 | "Timeout before determining that a device is missing"); | |
16f0fdad MZ |
1215 | |
1216 | ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, | |
1217 | "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); |