]>
Commit | Line | Data |
---|---|---|
60101509 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
1eacf2b3 | 26 | * Copyright (c) 2012, 2019 by Delphix. All rights reserved. |
60101509 BB |
27 | */ |
28 | ||
29 | #include <sys/zfs_context.h> | |
e771de53 | 30 | #include <sys/spa_impl.h> |
60101509 BB |
31 | #include <sys/vdev_disk.h> |
32 | #include <sys/vdev_impl.h> | |
1b939560 | 33 | #include <sys/vdev_trim.h> |
a6255b7f | 34 | #include <sys/abd.h> |
60101509 BB |
35 | #include <sys/fs/zfs.h> |
36 | #include <sys/zio.h> | |
8e82ffba | 37 | #include <linux/blkpg.h> |
74d42600 | 38 | #include <linux/msdos_fs.h> |
05805494 | 39 | #include <linux/vfs_compat.h> |
60101509 | 40 | |
d366c8fd JL |
41 | typedef struct vdev_disk { |
42 | struct block_device *vd_bdev; | |
43 | krwlock_t vd_lock; | |
44 | } vdev_disk_t; | |
45 | ||
a25861dc BB |
46 | /* |
47 | * Unique identifier for the exclusive vdev holder. | |
48 | */ | |
8128bd89 | 49 | static void *zfs_vdev_holder = VDEV_HOLDER; |
6839eed2 | 50 | |
a25861dc BB |
51 | /* |
52 | * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the | |
53 | * device is missing. The missing path may be transient since the links | |
54 | * can be briefly removed and recreated in response to udev events. | |
55 | */ | |
56 | static unsigned zfs_vdev_open_timeout_ms = 1000; | |
57 | ||
58 | /* | |
59 | * Size of the "reserved" partition, in blocks. | |
60 | */ | |
74d42600 SH |
61 | #define EFI_MIN_RESV_SIZE (16 * 1024) |
62 | ||
60101509 BB |
63 | /* |
64 | * Virtual device vector for disks. | |
65 | */ | |
66 | typedef struct dio_request { | |
60101509 | 67 | zio_t *dr_zio; /* Parent ZIO */ |
aa159afb | 68 | atomic_t dr_ref; /* References */ |
60101509 BB |
69 | int dr_error; /* Bio error */ |
70 | int dr_bio_count; /* Count of bio's */ | |
d1d7e268 | 71 | struct bio *dr_bio[0]; /* Attached bio's */ |
60101509 BB |
72 | } dio_request_t; |
73 | ||
60101509 | 74 | static fmode_t |
da92d5cb | 75 | vdev_bdev_mode(spa_mode_t spa_mode) |
60101509 BB |
76 | { |
77 | fmode_t mode = 0; | |
78 | ||
da92d5cb | 79 | if (spa_mode & SPA_MODE_READ) |
60101509 BB |
80 | mode |= FMODE_READ; |
81 | ||
da92d5cb | 82 | if (spa_mode & SPA_MODE_WRITE) |
60101509 BB |
83 | mode |= FMODE_WRITE; |
84 | ||
d1d7e268 | 85 | return (mode); |
60101509 | 86 | } |
60101509 | 87 | |
d441e85d BB |
88 | /* |
89 | * Returns the usable capacity (in bytes) for the partition or disk. | |
90 | */ | |
60101509 | 91 | static uint64_t |
d441e85d | 92 | bdev_capacity(struct block_device *bdev) |
60101509 | 93 | { |
d441e85d BB |
94 | return (i_size_read(bdev->bd_inode)); |
95 | } | |
60101509 | 96 | |
72ba4b2a BB |
97 | #if !defined(HAVE_BDEV_WHOLE) |
98 | static inline struct block_device * | |
99 | bdev_whole(struct block_device *bdev) | |
100 | { | |
101 | return (bdev->bd_contains); | |
102 | } | |
103 | #endif | |
104 | ||
d441e85d BB |
105 | /* |
106 | * Returns the maximum expansion capacity of the block device (in bytes). | |
107 | * | |
108 | * It is possible to expand a vdev when it has been created as a wholedisk | |
109 | * and the containing block device has increased in capacity. Or when the | |
110 | * partition containing the pool has been manually increased in size. | |
111 | * | |
112 | * This function is only responsible for calculating the potential expansion | |
113 | * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is | |
114 | * responsible for verifying the expected partition layout in the wholedisk | |
115 | * case, and updating the partition table if appropriate. Once the partition | |
116 | * size has been increased the additional capacity will be visible using | |
117 | * bdev_capacity(). | |
0c637f31 | 118 | * |
119 | * The returned maximum expansion capacity is always expected to be larger, or | |
120 | * at the very least equal, to its usable capacity to prevent overestimating | |
121 | * the pool expandsize. | |
d441e85d BB |
122 | */ |
123 | static uint64_t | |
124 | bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) | |
125 | { | |
126 | uint64_t psize; | |
127 | int64_t available; | |
128 | ||
72ba4b2a | 129 | if (wholedisk && bdev != bdev_whole(bdev)) { |
74d42600 | 130 | /* |
d441e85d BB |
131 | * When reporting maximum expansion capacity for a wholedisk |
132 | * deduct any capacity which is expected to be lost due to | |
133 | * alignment restrictions. Over reporting this value isn't | |
134 | * harmful and would only result in slightly less capacity | |
135 | * than expected post expansion. | |
0c637f31 | 136 | * The estimated available space may be slightly smaller than |
137 | * bdev_capacity() for devices where the number of sectors is | |
138 | * not a multiple of the alignment size and the partition layout | |
139 | * is keeping less than PARTITION_END_ALIGNMENT bytes after the | |
140 | * "reserved" EFI partition: in such cases return the device | |
141 | * usable capacity. | |
74d42600 | 142 | */ |
72ba4b2a | 143 | available = i_size_read(bdev_whole(bdev)->bd_inode) - |
d441e85d BB |
144 | ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + |
145 | PARTITION_END_ALIGNMENT) << SECTOR_BITS); | |
0c637f31 | 146 | psize = MAX(available, bdev_capacity(bdev)); |
74d42600 | 147 | } else { |
d441e85d | 148 | psize = bdev_capacity(bdev); |
74d42600 | 149 | } |
d441e85d BB |
150 | |
151 | return (psize); | |
60101509 BB |
152 | } |
153 | ||
d148e951 BB |
154 | static void |
155 | vdev_disk_error(zio_t *zio) | |
156 | { | |
c71c8c71 | 157 | /* |
158 | * This function can be called in interrupt context, for instance while | |
159 | * handling IRQs coming from a misbehaving disk device; use printk() | |
160 | * which is safe from any context. | |
161 | */ | |
162 | printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " | |
163 | "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), | |
164 | zio->io_vd->vdev_path, zio->io_error, zio->io_type, | |
165 | (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, | |
166 | zio->io_flags); | |
d148e951 BB |
167 | } |
168 | ||
60101509 | 169 | static int |
1bd201e7 | 170 | vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, |
6fe3498c | 171 | uint64_t *logical_ashift, uint64_t *physical_ashift) |
60101509 | 172 | { |
d441e85d BB |
173 | struct block_device *bdev; |
174 | fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); | |
a25861dc | 175 | hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); |
60101509 | 176 | vdev_disk_t *vd; |
60101509 BB |
177 | |
178 | /* Must have a pathname and it must be absolute. */ | |
179 | if (v->vdev_path == NULL || v->vdev_path[0] != '/') { | |
180 | v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
d441e85d | 181 | vdev_dbgmsg(v, "invalid vdev_path"); |
2d82ea8b | 182 | return (SET_ERROR(EINVAL)); |
60101509 BB |
183 | } |
184 | ||
0d8103d9 | 185 | /* |
d441e85d | 186 | * Reopen the device if it is currently open. When expanding a |
8e82ffba GW |
187 | * partition force re-scanning the partition table if userland |
188 | * did not take care of this already. We need to do this while closed | |
d441e85d BB |
189 | * in order to get an accurate updated block device size. Then |
190 | * since udev may need to recreate the device links increase the | |
a25861dc | 191 | * open retry timeout before reporting the device as unavailable. |
0d8103d9 | 192 | */ |
d441e85d BB |
193 | vd = v->vdev_tsd; |
194 | if (vd) { | |
195 | char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; | |
196 | boolean_t reread_part = B_FALSE; | |
0d8103d9 | 197 | |
d441e85d BB |
198 | rw_enter(&vd->vd_lock, RW_WRITER); |
199 | bdev = vd->vd_bdev; | |
200 | vd->vd_bdev = NULL; | |
201 | ||
202 | if (bdev) { | |
72ba4b2a BB |
203 | if (v->vdev_expanding && bdev != bdev_whole(bdev)) { |
204 | bdevname(bdev_whole(bdev), disk_name + 5); | |
8e82ffba GW |
205 | /* |
206 | * If userland has BLKPG_RESIZE_PARTITION, | |
207 | * then it should have updated the partition | |
208 | * table already. We can detect this by | |
209 | * comparing our current physical size | |
210 | * with that of the device. If they are | |
211 | * the same, then we must not have | |
212 | * BLKPG_RESIZE_PARTITION or it failed to | |
213 | * update the partition table online. We | |
214 | * fallback to rescanning the partition | |
215 | * table from the kernel below. However, | |
216 | * if the capacity already reflects the | |
217 | * updated partition, then we skip | |
218 | * rescanning the partition table here. | |
219 | */ | |
220 | if (v->vdev_psize == bdev_capacity(bdev)) | |
221 | reread_part = B_TRUE; | |
d441e85d BB |
222 | } |
223 | ||
066e8252 | 224 | blkdev_put(bdev, mode | FMODE_EXCL); |
d441e85d BB |
225 | } |
226 | ||
227 | if (reread_part) { | |
066e8252 BB |
228 | bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, |
229 | zfs_vdev_holder); | |
d441e85d BB |
230 | if (!IS_ERR(bdev)) { |
231 | int error = vdev_bdev_reread_part(bdev); | |
066e8252 | 232 | blkdev_put(bdev, mode | FMODE_EXCL); |
a25861dc BB |
233 | if (error == 0) { |
234 | timeout = MSEC2NSEC( | |
235 | zfs_vdev_open_timeout_ms * 2); | |
236 | } | |
d441e85d BB |
237 | } |
238 | } | |
239 | } else { | |
240 | vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); | |
241 | ||
242 | rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); | |
243 | rw_enter(&vd->vd_lock, RW_WRITER); | |
244 | } | |
60101509 BB |
245 | |
246 | /* | |
247 | * Devices are always opened by the path provided at configuration | |
248 | * time. This means that if the provided path is a udev by-id path | |
d441e85d | 249 | * then drives may be re-cabled without an issue. If the provided |
4e95cc99 | 250 | * path is a udev by-path path, then the physical location information |
60101509 BB |
251 | * will be preserved. This can be critical for more complicated |
252 | * configurations where drives are located in specific physical | |
d441e85d BB |
253 | * locations to maximize the systems tolerance to component failure. |
254 | * | |
4e95cc99 | 255 | * Alternatively, you can provide your own udev rule to flexibly map |
60101509 | 256 | * the drives as you see fit. It is not advised that you use the |
4e95cc99 | 257 | * /dev/[hd]d devices which may be reordered due to probing order. |
60101509 BB |
258 | * Devices in the wrong locations will be detected by the higher |
259 | * level vdev validation. | |
2d82ea8b BB |
260 | * |
261 | * The specified paths may be briefly removed and recreated in | |
262 | * response to udev events. This should be exceptionally unlikely | |
263 | * because the zpool command makes every effort to verify these paths | |
264 | * have already settled prior to reaching this point. Therefore, | |
265 | * a ENOENT failure at this point is highly likely to be transient | |
266 | * and it is reasonable to sleep and retry before giving up. In | |
267 | * practice delays have been observed to be on the order of 100ms. | |
60101509 | 268 | */ |
a25861dc | 269 | hrtime_t start = gethrtime(); |
d441e85d | 270 | bdev = ERR_PTR(-ENXIO); |
a25861dc | 271 | while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { |
066e8252 BB |
272 | bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, |
273 | zfs_vdev_holder); | |
2d82ea8b | 274 | if (unlikely(PTR_ERR(bdev) == -ENOENT)) { |
d441e85d | 275 | schedule_timeout(MSEC_TO_TICK(10)); |
2d82ea8b BB |
276 | } else if (IS_ERR(bdev)) { |
277 | break; | |
278 | } | |
279 | } | |
280 | ||
60101509 | 281 | if (IS_ERR(bdev)) { |
d441e85d | 282 | int error = -PTR_ERR(bdev); |
a25861dc BB |
283 | vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, |
284 | (u_longlong_t)(gethrtime() - start), | |
285 | (u_longlong_t)timeout); | |
d441e85d BB |
286 | vd->vd_bdev = NULL; |
287 | v->vdev_tsd = vd; | |
288 | rw_exit(&vd->vd_lock); | |
289 | return (SET_ERROR(error)); | |
290 | } else { | |
291 | vd->vd_bdev = bdev; | |
292 | v->vdev_tsd = vd; | |
293 | rw_exit(&vd->vd_lock); | |
60101509 BB |
294 | } |
295 | ||
1b939560 BB |
296 | struct request_queue *q = bdev_get_queue(vd->vd_bdev); |
297 | ||
0d8103d9 | 298 | /* Determine the physical block size */ |
6fe3498c RM |
299 | int physical_block_size = bdev_physical_block_size(vd->vd_bdev); |
300 | ||
301 | /* Determine the logical block size */ | |
302 | int logical_block_size = bdev_logical_block_size(vd->vd_bdev); | |
60101509 | 303 | |
60101509 BB |
304 | /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ |
305 | v->vdev_nowritecache = B_FALSE; | |
306 | ||
1b939560 BB |
307 | /* Set when device reports it supports TRIM. */ |
308 | v->vdev_has_trim = !!blk_queue_discard(q); | |
309 | ||
310 | /* Set when device reports it supports secure TRIM. */ | |
311 | v->vdev_has_securetrim = !!blk_queue_discard_secure(q); | |
312 | ||
fb40095f | 313 | /* Inform the ZIO pipeline that we are non-rotational */ |
1b939560 | 314 | v->vdev_nonrot = blk_queue_nonrot(q); |
fb40095f | 315 | |
d441e85d BB |
316 | /* Physical volume size in bytes for the partition */ |
317 | *psize = bdev_capacity(vd->vd_bdev); | |
318 | ||
319 | /* Physical volume size in bytes including possible expansion space */ | |
320 | *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); | |
1bd201e7 | 321 | |
60101509 | 322 | /* Based on the minimum sector size set the block size */ |
6fe3498c RM |
323 | *physical_ashift = highbit64(MAX(physical_block_size, |
324 | SPA_MINBLOCKSIZE)) - 1; | |
325 | ||
326 | *logical_ashift = highbit64(MAX(logical_block_size, | |
327 | SPA_MINBLOCKSIZE)) - 1; | |
60101509 | 328 | |
d1d7e268 | 329 | return (0); |
60101509 BB |
330 | } |
331 | ||
332 | static void | |
333 | vdev_disk_close(vdev_t *v) | |
334 | { | |
335 | vdev_disk_t *vd = v->vdev_tsd; | |
336 | ||
0d8103d9 | 337 | if (v->vdev_reopening || vd == NULL) |
60101509 BB |
338 | return; |
339 | ||
d441e85d | 340 | if (vd->vd_bdev != NULL) { |
066e8252 BB |
341 | blkdev_put(vd->vd_bdev, |
342 | vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); | |
d441e85d | 343 | } |
60101509 | 344 | |
d441e85d | 345 | rw_destroy(&vd->vd_lock); |
d1d7e268 | 346 | kmem_free(vd, sizeof (vdev_disk_t)); |
60101509 BB |
347 | v->vdev_tsd = NULL; |
348 | } | |
349 | ||
350 | static dio_request_t * | |
351 | vdev_disk_dio_alloc(int bio_count) | |
352 | { | |
353 | dio_request_t *dr; | |
354 | int i; | |
355 | ||
d1d7e268 | 356 | dr = kmem_zalloc(sizeof (dio_request_t) + |
79c76d5b | 357 | sizeof (struct bio *) * bio_count, KM_SLEEP); |
60101509 | 358 | if (dr) { |
60101509 BB |
359 | atomic_set(&dr->dr_ref, 0); |
360 | dr->dr_bio_count = bio_count; | |
361 | dr->dr_error = 0; | |
362 | ||
363 | for (i = 0; i < dr->dr_bio_count; i++) | |
364 | dr->dr_bio[i] = NULL; | |
365 | } | |
366 | ||
d1d7e268 | 367 | return (dr); |
60101509 BB |
368 | } |
369 | ||
370 | static void | |
371 | vdev_disk_dio_free(dio_request_t *dr) | |
372 | { | |
373 | int i; | |
374 | ||
375 | for (i = 0; i < dr->dr_bio_count; i++) | |
376 | if (dr->dr_bio[i]) | |
377 | bio_put(dr->dr_bio[i]); | |
378 | ||
d1d7e268 MK |
379 | kmem_free(dr, sizeof (dio_request_t) + |
380 | sizeof (struct bio *) * dr->dr_bio_count); | |
60101509 BB |
381 | } |
382 | ||
383 | static void | |
384 | vdev_disk_dio_get(dio_request_t *dr) | |
385 | { | |
386 | atomic_inc(&dr->dr_ref); | |
387 | } | |
388 | ||
389 | static int | |
390 | vdev_disk_dio_put(dio_request_t *dr) | |
391 | { | |
392 | int rc = atomic_dec_return(&dr->dr_ref); | |
393 | ||
394 | /* | |
395 | * Free the dio_request when the last reference is dropped and | |
396 | * ensure zio_interpret is called only once with the correct zio | |
397 | */ | |
398 | if (rc == 0) { | |
399 | zio_t *zio = dr->dr_zio; | |
400 | int error = dr->dr_error; | |
401 | ||
402 | vdev_disk_dio_free(dr); | |
403 | ||
404 | if (zio) { | |
405 | zio->io_error = error; | |
d148e951 BB |
406 | ASSERT3S(zio->io_error, >=, 0); |
407 | if (zio->io_error) | |
408 | vdev_disk_error(zio); | |
a6255b7f | 409 | |
26ef0cc7 | 410 | zio_delay_interrupt(zio); |
60101509 BB |
411 | } |
412 | } | |
413 | ||
d1d7e268 | 414 | return (rc); |
60101509 BB |
415 | } |
416 | ||
784a7fe5 | 417 | BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) |
60101509 BB |
418 | { |
419 | dio_request_t *dr = bio->bi_private; | |
420 | int rc; | |
421 | ||
784a7fe5 LW |
422 | if (dr->dr_error == 0) { |
423 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
36ba27e9 | 424 | dr->dr_error = BIO_END_IO_ERROR(bio); |
784a7fe5 LW |
425 | #else |
426 | if (error) | |
427 | dr->dr_error = -(error); | |
428 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
429 | dr->dr_error = EIO; | |
430 | #endif | |
431 | } | |
60101509 | 432 | |
b0be93e8 | 433 | /* Drop reference acquired by __vdev_disk_physio */ |
60101509 | 434 | rc = vdev_disk_dio_put(dr); |
60101509 BB |
435 | } |
436 | ||
/*
 * Call submit_bio() using whichever signature the kernel provides: the
 * single argument form when HAVE_1ARG_SUBMIT_BIO is defined, otherwise
 * the two argument form with a leading 0.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifndef HAVE_1ARG_SUBMIT_BIO
	submit_bio(0, bio);
#else
	submit_bio(bio);
#endif
}
446 | ||
2e407941 BB |
/*
 * preempt_schedule_notrace() is a GPL-only symbol and referencing it breaks
 * the ZFS build.  When all three of the configuration options below are
 * enabled, redirect it to preempt_schedule() instead.
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif
456 | ||
26a85659 BB |
457 | #ifdef HAVE_BIO_SET_DEV |
458 | #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) | |
bd0d24e0 BB |
459 | /* |
460 | * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by | |
461 | * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). | |
462 | * As a side effect the function was converted to GPL-only. Define our | |
463 | * own version when needed which uses rcu_read_lock_sched(). | |
464 | */ | |
465 | #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) | |
466 | static inline bool | |
467 | vdev_blkg_tryget(struct blkcg_gq *blkg) | |
468 | { | |
469 | struct percpu_ref *ref = &blkg->refcnt; | |
470 | unsigned long __percpu *count; | |
471 | bool rc; | |
472 | ||
473 | rcu_read_lock_sched(); | |
474 | ||
475 | if (__ref_is_percpu(ref, &count)) { | |
476 | this_cpu_inc(*count); | |
477 | rc = true; | |
478 | } else { | |
838a2490 CK |
479 | #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA |
480 | rc = atomic_long_inc_not_zero(&ref->data->count); | |
481 | #else | |
bd0d24e0 | 482 | rc = atomic_long_inc_not_zero(&ref->count); |
838a2490 | 483 | #endif |
bd0d24e0 BB |
484 | } |
485 | ||
486 | rcu_read_unlock_sched(); | |
487 | ||
488 | return (rc); | |
489 | } | |
490 | #elif defined(HAVE_BLKG_TRYGET) | |
491 | #define vdev_blkg_tryget(bg) blkg_tryget(bg) | |
492 | #endif | |
26a85659 BB |
493 | /* |
494 | * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the | |
495 | * GPL-only bio_associate_blkg() symbol thus inadvertently converting | |
496 | * the entire macro. Provide a minimal version which always assigns the | |
497 | * request queue's root_blkg to the bio. | |
498 | */ | |
499 | static inline void | |
500 | vdev_bio_associate_blkg(struct bio *bio) | |
501 | { | |
502 | struct request_queue *q = bio->bi_disk->queue; | |
503 | ||
504 | ASSERT3P(q, !=, NULL); | |
26a85659 BB |
505 | ASSERT3P(bio->bi_blkg, ==, NULL); |
506 | ||
bd0d24e0 | 507 | if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) |
26a85659 BB |
508 | bio->bi_blkg = q->root_blkg; |
509 | } | |
510 | #define bio_associate_blkg vdev_bio_associate_blkg | |
511 | #endif | |
512 | #else | |
513 | /* | |
514 | * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. | |
515 | */ | |
787acae0 GDN |
516 | static inline void |
517 | bio_set_dev(struct bio *bio, struct block_device *bdev) | |
518 | { | |
519 | bio->bi_bdev = bdev; | |
520 | } | |
26a85659 | 521 | #endif /* HAVE_BIO_SET_DEV */ |
787acae0 | 522 | |
37f9dac5 | 523 | static inline void |
3b86aeb2 | 524 | vdev_submit_bio(struct bio *bio) |
37f9dac5 | 525 | { |
37f9dac5 RY |
526 | struct bio_list *bio_list = current->bio_list; |
527 | current->bio_list = NULL; | |
3b86aeb2 | 528 | vdev_submit_bio_impl(bio); |
37f9dac5 | 529 | current->bio_list = bio_list; |
37f9dac5 RY |
530 | } |
531 | ||
60101509 | 532 | static int |
b0be93e8 IH |
533 | __vdev_disk_physio(struct block_device *bdev, zio_t *zio, |
534 | size_t io_size, uint64_t io_offset, int rw, int flags) | |
60101509 | 535 | { |
d1d7e268 | 536 | dio_request_t *dr; |
b0be93e8 | 537 | uint64_t abd_offset; |
60101509 | 538 | uint64_t bio_offset; |
3b86aeb2 | 539 | int bio_size, bio_count = 16; |
f74fae8b | 540 | int i = 0, error = 0; |
e8ac4557 | 541 | struct blk_plug plug; |
066e8252 | 542 | |
d441e85d BB |
543 | /* |
544 | * Accessing outside the block device is never allowed. | |
545 | */ | |
546 | if (io_offset + io_size > bdev->bd_inode->i_size) { | |
547 | vdev_dbgmsg(zio->io_vd, | |
548 | "Illegal access %llu size %llu, device size %llu", | |
549 | io_offset, io_size, i_size_read(bdev->bd_inode)); | |
550 | return (SET_ERROR(EIO)); | |
551 | } | |
e06be586 | 552 | |
60101509 BB |
553 | retry: |
554 | dr = vdev_disk_dio_alloc(bio_count); | |
555 | if (dr == NULL) | |
ecb2b7dc | 556 | return (SET_ERROR(ENOMEM)); |
60101509 | 557 | |
2959d94a | 558 | if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) |
29b763cd | 559 | bio_set_flags_failfast(bdev, &flags); |
2959d94a | 560 | |
60101509 | 561 | dr->dr_zio = zio; |
60101509 | 562 | |
60101509 BB |
563 | /* |
564 | * When the IO size exceeds the maximum bio size for the request | |
565 | * queue we are forced to break the IO in multiple bio's and wait | |
566 | * for them all to complete. Ideally, all pool users will set | |
567 | * their volume block size to match the maximum request size and | |
568 | * the common case will be one bio per vdev IO request. | |
569 | */ | |
a6255b7f | 570 | |
b0be93e8 IH |
571 | abd_offset = 0; |
572 | bio_offset = io_offset; | |
573 | bio_size = io_size; | |
60101509 BB |
574 | for (i = 0; i <= dr->dr_bio_count; i++) { |
575 | ||
576 | /* Finished constructing bio's for given buffer */ | |
577 | if (bio_size <= 0) | |
578 | break; | |
579 | ||
580 | /* | |
581 | * By default only 'bio_count' bio's per dio are allowed. | |
582 | * However, if we find ourselves in a situation where more | |
583 | * are needed we allocate a larger dio and warn the user. | |
584 | */ | |
585 | if (dr->dr_bio_count == i) { | |
586 | vdev_disk_dio_free(dr); | |
587 | bio_count *= 2; | |
60101509 BB |
588 | goto retry; |
589 | } | |
590 | ||
29b763cd | 591 | /* bio_alloc() with __GFP_WAIT never returns NULL */ |
1086f542 BB |
592 | dr->dr_bio[i] = bio_alloc(GFP_NOIO, |
593 | MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), | |
594 | BIO_MAX_PAGES)); | |
595 | if (unlikely(dr->dr_bio[i] == NULL)) { | |
60101509 | 596 | vdev_disk_dio_free(dr); |
ecb2b7dc | 597 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
598 | } |
599 | ||
600 | /* Matching put called by vdev_disk_physio_completion */ | |
601 | vdev_disk_dio_get(dr); | |
602 | ||
787acae0 | 603 | bio_set_dev(dr->dr_bio[i], bdev); |
d4541210 | 604 | BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; |
60101509 BB |
605 | dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; |
606 | dr->dr_bio[i]->bi_private = dr; | |
3b86aeb2 | 607 | bio_set_op_attrs(dr->dr_bio[i], rw, flags); |
60101509 BB |
608 | |
609 | /* Remaining size is returned to become the new size */ | |
fb822260 | 610 | bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, |
02730c33 | 611 | bio_size, abd_offset); |
60101509 BB |
612 | |
613 | /* Advance in buffer and construct another bio if needed */ | |
b0be93e8 | 614 | abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
d4541210 | 615 | bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
60101509 BB |
616 | } |
617 | ||
37f9dac5 | 618 | /* Extra reference to protect dio_request during vdev_submit_bio */ |
60101509 BB |
619 | vdev_disk_dio_get(dr); |
620 | ||
e8ac4557 IH |
621 | if (dr->dr_bio_count > 1) |
622 | blk_start_plug(&plug); | |
e8ac4557 | 623 | |
60101509 BB |
624 | /* Submit all bio's associated with this dio */ |
625 | for (i = 0; i < dr->dr_bio_count; i++) | |
626 | if (dr->dr_bio[i]) | |
3b86aeb2 | 627 | vdev_submit_bio(dr->dr_bio[i]); |
60101509 | 628 | |
e8ac4557 IH |
629 | if (dr->dr_bio_count > 1) |
630 | blk_finish_plug(&plug); | |
e8ac4557 | 631 | |
d1d7e268 | 632 | (void) vdev_disk_dio_put(dr); |
60101509 | 633 | |
d1d7e268 | 634 | return (error); |
60101509 BB |
635 | } |
636 | ||
36ba27e9 | 637 | BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) |
60101509 BB |
638 | { |
639 | zio_t *zio = bio->bi_private; | |
784a7fe5 | 640 | #ifdef HAVE_1ARG_BIO_END_IO_T |
36ba27e9 BB |
641 | zio->io_error = BIO_END_IO_ERROR(bio); |
642 | #else | |
643 | zio->io_error = -error; | |
784a7fe5 | 644 | #endif |
60101509 | 645 | |
36ba27e9 | 646 | if (zio->io_error && (zio->io_error == EOPNOTSUPP)) |
60101509 BB |
647 | zio->io_vd->vdev_nowritecache = B_TRUE; |
648 | ||
649 | bio_put(bio); | |
d148e951 BB |
650 | ASSERT3S(zio->io_error, >=, 0); |
651 | if (zio->io_error) | |
652 | vdev_disk_error(zio); | |
60101509 | 653 | zio_interrupt(zio); |
60101509 BB |
654 | } |
655 | ||
656 | static int | |
657 | vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) | |
658 | { | |
659 | struct request_queue *q; | |
660 | struct bio *bio; | |
661 | ||
662 | q = bdev_get_queue(bdev); | |
663 | if (!q) | |
ecb2b7dc | 664 | return (SET_ERROR(ENXIO)); |
60101509 | 665 | |
abc41ac7 | 666 | bio = bio_alloc(GFP_NOIO, 0); |
29b763cd IH |
667 | /* bio_alloc() with __GFP_WAIT never returns NULL */ |
668 | if (unlikely(bio == NULL)) | |
ecb2b7dc | 669 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
670 | |
671 | bio->bi_end_io = vdev_disk_io_flush_completion; | |
672 | bio->bi_private = zio; | |
787acae0 | 673 | bio_set_dev(bio, bdev); |
a5e046ea | 674 | bio_set_flush(bio); |
3b86aeb2 | 675 | vdev_submit_bio(bio); |
cecb7487 | 676 | invalidate_bdev(bdev); |
60101509 | 677 | |
d1d7e268 | 678 | return (0); |
60101509 | 679 | } |
60101509 | 680 | |
98b25418 | 681 | static void |
60101509 BB |
682 | vdev_disk_io_start(zio_t *zio) |
683 | { | |
684 | vdev_t *v = zio->io_vd; | |
685 | vdev_disk_t *vd = v->vdev_tsd; | |
1b939560 | 686 | unsigned long trim_flags = 0; |
066e8252 | 687 | int rw, error; |
60101509 | 688 | |
d441e85d BB |
689 | /* |
690 | * If the vdev is closed, it's likely in the REMOVED or FAULTED state. | |
691 | * Nothing to be done here but return failure. | |
692 | */ | |
693 | if (vd == NULL) { | |
694 | zio->io_error = ENXIO; | |
695 | zio_interrupt(zio); | |
696 | return; | |
697 | } | |
698 | ||
699 | rw_enter(&vd->vd_lock, RW_READER); | |
700 | ||
701 | /* | |
702 | * If the vdev is closed, it's likely due to a failed reopen and is | |
703 | * in the UNAVAIL state. Nothing to be done here but return failure. | |
704 | */ | |
705 | if (vd->vd_bdev == NULL) { | |
706 | rw_exit(&vd->vd_lock); | |
707 | zio->io_error = ENXIO; | |
708 | zio_interrupt(zio); | |
709 | return; | |
710 | } | |
711 | ||
60101509 BB |
712 | switch (zio->io_type) { |
713 | case ZIO_TYPE_IOCTL: | |
714 | ||
715 | if (!vdev_readable(v)) { | |
d441e85d | 716 | rw_exit(&vd->vd_lock); |
2e528b49 | 717 | zio->io_error = SET_ERROR(ENXIO); |
98b25418 GW |
718 | zio_interrupt(zio); |
719 | return; | |
60101509 BB |
720 | } |
721 | ||
722 | switch (zio->io_cmd) { | |
723 | case DKIOCFLUSHWRITECACHE: | |
724 | ||
725 | if (zfs_nocacheflush) | |
726 | break; | |
727 | ||
728 | if (v->vdev_nowritecache) { | |
2e528b49 | 729 | zio->io_error = SET_ERROR(ENOTSUP); |
60101509 BB |
730 | break; |
731 | } | |
732 | ||
733 | error = vdev_disk_io_flush(vd->vd_bdev, zio); | |
d441e85d BB |
734 | if (error == 0) { |
735 | rw_exit(&vd->vd_lock); | |
98b25418 | 736 | return; |
d441e85d | 737 | } |
60101509 BB |
738 | |
739 | zio->io_error = error; | |
60101509 BB |
740 | |
741 | break; | |
742 | ||
743 | default: | |
2e528b49 | 744 | zio->io_error = SET_ERROR(ENOTSUP); |
60101509 BB |
745 | } |
746 | ||
d441e85d | 747 | rw_exit(&vd->vd_lock); |
98b25418 GW |
748 | zio_execute(zio); |
749 | return; | |
60101509 | 750 | case ZIO_TYPE_WRITE: |
3b86aeb2 | 751 | rw = WRITE; |
60101509 BB |
752 | break; |
753 | ||
754 | case ZIO_TYPE_READ: | |
3b86aeb2 | 755 | rw = READ; |
60101509 BB |
756 | break; |
757 | ||
1b939560 BB |
758 | case ZIO_TYPE_TRIM: |
759 | #if defined(BLKDEV_DISCARD_SECURE) | |
760 | if (zio->io_trim_flags & ZIO_TRIM_SECURE) | |
761 | trim_flags |= BLKDEV_DISCARD_SECURE; | |
762 | #endif | |
763 | zio->io_error = -blkdev_issue_discard(vd->vd_bdev, | |
764 | zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, | |
765 | trim_flags); | |
766 | ||
767 | rw_exit(&vd->vd_lock); | |
768 | zio_interrupt(zio); | |
769 | return; | |
770 | ||
60101509 | 771 | default: |
d441e85d | 772 | rw_exit(&vd->vd_lock); |
2e528b49 | 773 | zio->io_error = SET_ERROR(ENOTSUP); |
98b25418 GW |
774 | zio_interrupt(zio); |
775 | return; | |
60101509 BB |
776 | } |
777 | ||
26ef0cc7 | 778 | zio->io_target_timestamp = zio_handle_io_delay(zio); |
b0be93e8 | 779 | error = __vdev_disk_physio(vd->vd_bdev, zio, |
066e8252 | 780 | zio->io_size, zio->io_offset, rw, 0); |
d441e85d BB |
781 | rw_exit(&vd->vd_lock); |
782 | ||
60101509 BB |
783 | if (error) { |
784 | zio->io_error = error; | |
98b25418 GW |
785 | zio_interrupt(zio); |
786 | return; | |
60101509 | 787 | } |
60101509 BB |
788 | } |
789 | ||
790 | static void | |
791 | vdev_disk_io_done(zio_t *zio) | |
792 | { | |
793 | /* | |
794 | * If the device returned EIO, we revalidate the media. If it is | |
795 | * determined the media has changed this triggers the asynchronous | |
796 | * removal of the device from the configuration. | |
797 | */ | |
798 | if (zio->io_error == EIO) { | |
d1d7e268 | 799 | vdev_t *v = zio->io_vd; |
60101509 BB |
800 | vdev_disk_t *vd = v->vdev_tsd; |
801 | ||
ae15f1c1 | 802 | if (zfs_check_media_change(vd->vd_bdev)) { |
066e8252 | 803 | invalidate_bdev(vd->vd_bdev); |
60101509 BB |
804 | v->vdev_remove_wanted = B_TRUE; |
805 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
806 | } | |
807 | } | |
808 | } | |
809 | ||
810 | static void | |
811 | vdev_disk_hold(vdev_t *vd) | |
812 | { | |
813 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
814 | ||
815 | /* We must have a pathname, and it must be absolute. */ | |
816 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
817 | return; | |
818 | ||
819 | /* | |
820 | * Only prefetch path and devid info if the device has | |
821 | * never been opened. | |
822 | */ | |
823 | if (vd->vdev_tsd != NULL) | |
824 | return; | |
825 | ||
60101509 BB |
826 | } |
827 | ||
828 | static void | |
829 | vdev_disk_rele(vdev_t *vd) | |
830 | { | |
831 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
832 | ||
833 | /* XXX: Implement me as a vnode rele for the device */ | |
834 | } | |
835 | ||
836 | vdev_ops_t vdev_disk_ops = { | |
b2255edc BB |
837 | .vdev_op_init = NULL, |
838 | .vdev_op_fini = NULL, | |
a64f8276 I |
839 | .vdev_op_open = vdev_disk_open, |
840 | .vdev_op_close = vdev_disk_close, | |
841 | .vdev_op_asize = vdev_default_asize, | |
b2255edc BB |
842 | .vdev_op_min_asize = vdev_default_min_asize, |
843 | .vdev_op_min_alloc = NULL, | |
a64f8276 I |
844 | .vdev_op_io_start = vdev_disk_io_start, |
845 | .vdev_op_io_done = vdev_disk_io_done, | |
846 | .vdev_op_state_change = NULL, | |
847 | .vdev_op_need_resilver = NULL, | |
848 | .vdev_op_hold = vdev_disk_hold, | |
849 | .vdev_op_rele = vdev_disk_rele, | |
850 | .vdev_op_remap = NULL, | |
851 | .vdev_op_xlate = vdev_default_xlate, | |
b2255edc BB |
852 | .vdev_op_rebuild_asize = NULL, |
853 | .vdev_op_metaslab_init = NULL, | |
854 | .vdev_op_config_generate = NULL, | |
855 | .vdev_op_nparity = NULL, | |
856 | .vdev_op_ndisks = NULL, | |
a64f8276 I |
857 | .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ |
858 | .vdev_op_leaf = B_TRUE /* leaf vdev */ | |
60101509 BB |
859 | }; |
860 | ||
9e17e6f2 BB |
861 | /* |
862 | * The zfs_vdev_scheduler module option has been deprecated. Setting this | |
863 | * value no longer has any effect. It has not yet been entirely removed | |
864 | * to allow the module to be loaded if this option is specified in the | |
865 | * /etc/modprobe.d/zfs.conf file. The following warning will be logged. | |
866 | */ | |
867 | static int | |
868 | param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) | |
869 | { | |
870 | int error = param_set_charp(val, kp); | |
871 | if (error == 0) { | |
872 | printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " | |
873 | "is not supported.\n"); | |
874 | } | |
875 | ||
876 | return (error); | |
877 | } | |
878 | ||
879 | char *zfs_vdev_scheduler = "unused"; | |
e771de53 BB |
880 | module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, |
881 | param_get_charp, &zfs_vdev_scheduler, 0644); | |
c409e464 | 882 | MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); |
6fe3498c RM |
883 | |
884 | int | |
885 | param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
886 | { | |
887 | uint64_t val; | |
888 | int error; | |
889 | ||
890 | error = kstrtoull(buf, 0, &val); | |
891 | if (error < 0) | |
892 | return (SET_ERROR(error)); | |
893 | ||
894 | if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) | |
895 | return (SET_ERROR(-EINVAL)); | |
896 | ||
897 | error = param_set_ulong(buf, kp); | |
898 | if (error < 0) | |
899 | return (SET_ERROR(error)); | |
900 | ||
901 | return (0); | |
902 | } | |
903 | ||
904 | int | |
905 | param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) | |
906 | { | |
907 | uint64_t val; | |
908 | int error; | |
909 | ||
910 | error = kstrtoull(buf, 0, &val); | |
911 | if (error < 0) | |
912 | return (SET_ERROR(error)); | |
913 | ||
914 | if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) | |
915 | return (SET_ERROR(-EINVAL)); | |
916 | ||
917 | error = param_set_ulong(buf, kp); | |
918 | if (error < 0) | |
919 | return (SET_ERROR(error)); | |
920 | ||
921 | return (0); | |
922 | } |