]>
Commit | Line | Data |
---|---|---|
60101509 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
74d42600 | 26 | * Copyright (c) 2012, 2018 by Delphix. All rights reserved. |
60101509 BB |
27 | */ |
28 | ||
29 | #include <sys/zfs_context.h> | |
e771de53 | 30 | #include <sys/spa_impl.h> |
60101509 BB |
31 | #include <sys/vdev_disk.h> |
32 | #include <sys/vdev_impl.h> | |
1b939560 | 33 | #include <sys/vdev_trim.h> |
a6255b7f | 34 | #include <sys/abd.h> |
60101509 BB |
35 | #include <sys/fs/zfs.h> |
36 | #include <sys/zio.h> | |
e771de53 | 37 | #include <linux/mod_compat.h> |
74d42600 | 38 | #include <linux/msdos_fs.h> |
05805494 | 39 | #include <linux/vfs_compat.h> |
60101509 | 40 | |
6839eed2 | 41 | char *zfs_vdev_scheduler = VDEV_SCHEDULER; |
8128bd89 | 42 | static void *zfs_vdev_holder = VDEV_HOLDER; |
6839eed2 | 43 | |
74d42600 SH |
44 | /* size of the "reserved" partition, in blocks */ |
45 | #define EFI_MIN_RESV_SIZE (16 * 1024) | |
46 | ||
60101509 BB |
/*
 * Virtual device vector for disks.
 *
 * A dio_request tracks one logical vdev I/O that may have been split
 * into multiple bio's due to request queue size limits.  The request
 * is freed (and the parent zio completed) when dr_ref drops to zero.
 */
typedef struct dio_request {
	zio_t *dr_zio;		/* Parent ZIO */
	atomic_t dr_ref;	/* References */
	int dr_error;		/* Bio error */
	int dr_bio_count;	/* Count of bio's */
	struct bio *dr_bio[0];	/* Attached bio's (trailing variable-length array) */
} dio_request_t;
57 | ||
58 | ||
59 | #ifdef HAVE_OPEN_BDEV_EXCLUSIVE | |
60 | static fmode_t | |
61 | vdev_bdev_mode(int smode) | |
62 | { | |
63 | fmode_t mode = 0; | |
64 | ||
65 | ASSERT3S(smode & (FREAD | FWRITE), !=, 0); | |
66 | ||
67 | if (smode & FREAD) | |
68 | mode |= FMODE_READ; | |
69 | ||
70 | if (smode & FWRITE) | |
71 | mode |= FMODE_WRITE; | |
72 | ||
d1d7e268 | 73 | return (mode); |
60101509 BB |
74 | } |
75 | #else | |
76 | static int | |
77 | vdev_bdev_mode(int smode) | |
78 | { | |
79 | int mode = 0; | |
80 | ||
81 | ASSERT3S(smode & (FREAD | FWRITE), !=, 0); | |
82 | ||
83 | if ((smode & FREAD) && !(smode & FWRITE)) | |
05805494 | 84 | mode = SB_RDONLY; |
60101509 | 85 | |
d1d7e268 | 86 | return (mode); |
60101509 BB |
87 | } |
88 | #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ | |
89 | ||
d441e85d BB |
90 | /* |
91 | * Returns the usable capacity (in bytes) for the partition or disk. | |
92 | */ | |
60101509 | 93 | static uint64_t |
d441e85d | 94 | bdev_capacity(struct block_device *bdev) |
60101509 | 95 | { |
d441e85d BB |
96 | return (i_size_read(bdev->bd_inode)); |
97 | } | |
60101509 | 98 | |
d441e85d BB |
/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger,
 * or at the very least equal, to its usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	/*
	 * Only the wholedisk-partition case needs special handling; bdev
	 * must be a partition (bd_part set, and distinct from the whole
	 * disk bd_contains) for the expansion estimate to make sense.
	 */
	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition layout
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
		 * "reserved" EFI partition: in such cases return the device
		 * usable capacity.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		/* Never report less than the current usable capacity. */
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}
147 | ||
d148e951 BB |
/*
 * Log a zio error to the kernel log, identifying the pool, vdev, and
 * failing I/O so the failure can be correlated with device errors.
 */
static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}
162 | ||
6839eed2 BB |
/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization.  While allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 *
 * Recurses over all children so an entire vdev tree can be switched
 * with one call on the root.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	/*
	 * NOTE(review): vd is dereferenced below for leaf vdevs without a
	 * NULL check; presumably callers only reach this after vdev_tsd has
	 * been assigned by vdev_disk_open() -- confirm for never-opened leaves.
	 */
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	/* Recursively apply the scheduler to all child vdevs first. */
	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	/* Only open leaf vdevs have a request queue to configure. */
	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

	/*
	 * The elevator_change() function was available in kernels from
	 * 2.6.36 to 4.11.  When not available fall back to using the user
	 * mode helper functionality to set the elevator via sysfs.  This
	 * requires /bin/echo and sysfs to be mounted which may not be true
	 * early in the boot process.
	 */
#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	" 1>/sys/block/%s/queue/scheduler " \
	" 2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	/* Build the shell command, run it synchronously, then free it. */
	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error) {
		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
		    elevator, v->vdev_path, device, error);
	}
}
231 | ||
/*
 * Open the backing block device for a leaf vdev.
 *
 * On success *psize/*max_psize are set to the current and maximum
 * expandable capacity in bytes, *ashift to the log2 of the physical
 * block size, and v->vdev_tsd holds the vdev_disk_t.  Returns 0 or a
 * positive errno (via SET_ERROR).  vd_lock is held as WRITER for the
 * duration of the open so in-flight I/O cannot observe a NULL vd_bdev.
 */
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size.  Then
	 * since udev may need to recreate the device links increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		/* "/dev/" prefix plus room for a bdevname(). */
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			/*
			 * Expanding a partition: remember the containing
			 * whole disk so its partition table can be re-read
			 * after the partition is closed.
			 */
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				/*
				 * A successful rescan may trigger udev link
				 * churn; allow extra open retries below.
				 */
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		/* First open: allocate and lock the per-vdev state. */
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the systems tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * a ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			/* Transient ENOENT: wait 10ms and retry. */
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d", error, count);
		/* Keep vd attached so a later reopen can reuse the lock. */
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct request_queue *q = bdev_get_queue(vd->vd_bdev);

	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = !!blk_queue_discard(q);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(q);

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}
368 | ||
369 | static void | |
370 | vdev_disk_close(vdev_t *v) | |
371 | { | |
372 | vdev_disk_t *vd = v->vdev_tsd; | |
373 | ||
0d8103d9 | 374 | if (v->vdev_reopening || vd == NULL) |
60101509 BB |
375 | return; |
376 | ||
d441e85d | 377 | if (vd->vd_bdev != NULL) { |
60101509 | 378 | vdev_bdev_close(vd->vd_bdev, |
d1d7e268 | 379 | vdev_bdev_mode(spa_mode(v->vdev_spa))); |
d441e85d | 380 | } |
60101509 | 381 | |
d441e85d | 382 | rw_destroy(&vd->vd_lock); |
d1d7e268 | 383 | kmem_free(vd, sizeof (vdev_disk_t)); |
60101509 BB |
384 | v->vdev_tsd = NULL; |
385 | } | |
386 | ||
387 | static dio_request_t * | |
388 | vdev_disk_dio_alloc(int bio_count) | |
389 | { | |
390 | dio_request_t *dr; | |
391 | int i; | |
392 | ||
d1d7e268 | 393 | dr = kmem_zalloc(sizeof (dio_request_t) + |
79c76d5b | 394 | sizeof (struct bio *) * bio_count, KM_SLEEP); |
60101509 | 395 | if (dr) { |
60101509 BB |
396 | atomic_set(&dr->dr_ref, 0); |
397 | dr->dr_bio_count = bio_count; | |
398 | dr->dr_error = 0; | |
399 | ||
400 | for (i = 0; i < dr->dr_bio_count; i++) | |
401 | dr->dr_bio[i] = NULL; | |
402 | } | |
403 | ||
d1d7e268 | 404 | return (dr); |
60101509 BB |
405 | } |
406 | ||
407 | static void | |
408 | vdev_disk_dio_free(dio_request_t *dr) | |
409 | { | |
410 | int i; | |
411 | ||
412 | for (i = 0; i < dr->dr_bio_count; i++) | |
413 | if (dr->dr_bio[i]) | |
414 | bio_put(dr->dr_bio[i]); | |
415 | ||
d1d7e268 MK |
416 | kmem_free(dr, sizeof (dio_request_t) + |
417 | sizeof (struct bio *) * dr->dr_bio_count); | |
60101509 BB |
418 | } |
419 | ||
/* Take a reference on the dio_request. */
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}
425 | ||
/*
 * Drop a reference on the dio_request.  When the last reference is
 * dropped the request is freed and the parent zio (if any) is
 * completed with the accumulated error.  Returns the new reference
 * count.
 */
static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		/* Capture fields before dr is freed. */
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}
453 | ||
784a7fe5 | 454 | BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) |
60101509 BB |
455 | { |
456 | dio_request_t *dr = bio->bi_private; | |
457 | int rc; | |
458 | ||
784a7fe5 LW |
459 | if (dr->dr_error == 0) { |
460 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
36ba27e9 | 461 | dr->dr_error = BIO_END_IO_ERROR(bio); |
784a7fe5 LW |
462 | #else |
463 | if (error) | |
464 | dr->dr_error = -(error); | |
465 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
466 | dr->dr_error = EIO; | |
467 | #endif | |
468 | } | |
60101509 | 469 | |
b0be93e8 | 470 | /* Drop reference acquired by __vdev_disk_physio */ |
60101509 | 471 | rc = vdev_disk_dio_put(dr); |
60101509 BB |
472 | } |
473 | ||
60101509 BB |
/*
 * Attach as much of the linear buffer bio_ptr/bio_size to the bio as
 * its vector table allows, one page fragment at a time.  Returns the
 * number of bytes that did NOT fit (0 when fully mapped); the caller
 * uses the remainder to build the next bio.
 */
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	/* The buffer may start mid-page; first fragment runs to page end. */
	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		/* vmalloc memory needs its page looked up differently. */
		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network related block device uses tcp_sendpage, which
		 * doesn't behave well when using 0-count page, this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		/* Stop if the bio refuses the fragment (queue limits). */
		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}
512 | ||
b0be93e8 IH |
513 | static unsigned int |
514 | bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) | |
515 | { | |
516 | if (abd_is_linear(abd)) | |
517 | return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); | |
518 | ||
519 | return (abd_scatter_bio_map_off(bio, abd, size, off)); | |
520 | } | |
521 | ||
/*
 * Submit a bio to the block layer, papering over the kernel API change
 * where submit_bio() lost its (unused) rw argument.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}
531 | ||
26a85659 BB |
#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	/* Only assign the blkg if a reference could be taken on it. */
	if (blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
787acae0 | 563 | |
/*
 * Submit a bio with the task's bio accumulation list temporarily
 * cleared, so the submission cannot be deferred onto the current
 * task's plugged list (which could deadlock recursive submissions).
 * The saved list is restored afterwards.
 */
static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}
579 | ||
60101509 | 580 | static int |
b0be93e8 IH |
581 | __vdev_disk_physio(struct block_device *bdev, zio_t *zio, |
582 | size_t io_size, uint64_t io_offset, int rw, int flags) | |
60101509 | 583 | { |
d1d7e268 | 584 | dio_request_t *dr; |
b0be93e8 | 585 | uint64_t abd_offset; |
60101509 | 586 | uint64_t bio_offset; |
3b86aeb2 | 587 | int bio_size, bio_count = 16; |
f74fae8b | 588 | int i = 0, error = 0; |
e8ac4557 IH |
589 | #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) |
590 | struct blk_plug plug; | |
591 | #endif | |
d441e85d BB |
592 | /* |
593 | * Accessing outside the block device is never allowed. | |
594 | */ | |
595 | if (io_offset + io_size > bdev->bd_inode->i_size) { | |
596 | vdev_dbgmsg(zio->io_vd, | |
597 | "Illegal access %llu size %llu, device size %llu", | |
598 | io_offset, io_size, i_size_read(bdev->bd_inode)); | |
599 | return (SET_ERROR(EIO)); | |
600 | } | |
e06be586 | 601 | |
60101509 BB |
602 | retry: |
603 | dr = vdev_disk_dio_alloc(bio_count); | |
604 | if (dr == NULL) | |
ecb2b7dc | 605 | return (SET_ERROR(ENOMEM)); |
60101509 | 606 | |
2959d94a | 607 | if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) |
29b763cd | 608 | bio_set_flags_failfast(bdev, &flags); |
2959d94a | 609 | |
60101509 | 610 | dr->dr_zio = zio; |
60101509 | 611 | |
60101509 BB |
612 | /* |
613 | * When the IO size exceeds the maximum bio size for the request | |
614 | * queue we are forced to break the IO in multiple bio's and wait | |
615 | * for them all to complete. Ideally, all pool users will set | |
616 | * their volume block size to match the maximum request size and | |
617 | * the common case will be one bio per vdev IO request. | |
618 | */ | |
a6255b7f | 619 | |
b0be93e8 IH |
620 | abd_offset = 0; |
621 | bio_offset = io_offset; | |
622 | bio_size = io_size; | |
60101509 BB |
623 | for (i = 0; i <= dr->dr_bio_count; i++) { |
624 | ||
625 | /* Finished constructing bio's for given buffer */ | |
626 | if (bio_size <= 0) | |
627 | break; | |
628 | ||
629 | /* | |
630 | * By default only 'bio_count' bio's per dio are allowed. | |
631 | * However, if we find ourselves in a situation where more | |
632 | * are needed we allocate a larger dio and warn the user. | |
633 | */ | |
634 | if (dr->dr_bio_count == i) { | |
635 | vdev_disk_dio_free(dr); | |
636 | bio_count *= 2; | |
60101509 BB |
637 | goto retry; |
638 | } | |
639 | ||
29b763cd | 640 | /* bio_alloc() with __GFP_WAIT never returns NULL */ |
f1512ee6 | 641 | dr->dr_bio[i] = bio_alloc(GFP_NOIO, |
b0be93e8 | 642 | MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), |
02730c33 | 643 | BIO_MAX_PAGES)); |
29b763cd | 644 | if (unlikely(dr->dr_bio[i] == NULL)) { |
60101509 | 645 | vdev_disk_dio_free(dr); |
ecb2b7dc | 646 | return (SET_ERROR(ENOMEM)); |
60101509 BB |
647 | } |
648 | ||
649 | /* Matching put called by vdev_disk_physio_completion */ | |
650 | vdev_disk_dio_get(dr); | |
651 | ||
787acae0 | 652 | bio_set_dev(dr->dr_bio[i], bdev); |
d4541210 | 653 | BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; |
60101509 BB |
654 | dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; |
655 | dr->dr_bio[i]->bi_private = dr; | |
3b86aeb2 | 656 | bio_set_op_attrs(dr->dr_bio[i], rw, flags); |
60101509 BB |
657 | |
658 | /* Remaining size is returned to become the new size */ | |
b0be93e8 | 659 | bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, |
02730c33 | 660 | bio_size, abd_offset); |
60101509 BB |
661 | |
662 | /* Advance in buffer and construct another bio if needed */ | |
b0be93e8 | 663 | abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
d4541210 | 664 | bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
60101509 BB |
665 | } |
666 | ||
37f9dac5 | 667 | /* Extra reference to protect dio_request during vdev_submit_bio */ |
60101509 BB |
668 | vdev_disk_dio_get(dr); |
669 | ||
e8ac4557 IH |
670 | #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) |
671 | if (dr->dr_bio_count > 1) | |
672 | blk_start_plug(&plug); | |
673 | #endif | |
674 | ||
60101509 BB |
675 | /* Submit all bio's associated with this dio */ |
676 | for (i = 0; i < dr->dr_bio_count; i++) | |
677 | if (dr->dr_bio[i]) | |
3b86aeb2 | 678 | vdev_submit_bio(dr->dr_bio[i]); |
60101509 | 679 | |
e8ac4557 IH |
680 | #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) |
681 | if (dr->dr_bio_count > 1) | |
682 | blk_finish_plug(&plug); | |
683 | #endif | |
684 | ||
d1d7e268 | 685 | (void) vdev_disk_dio_put(dr); |
60101509 | 686 | |
d1d7e268 | 687 | return (error); |
60101509 BB |
688 | } |
689 | ||
/*
 * Completion callback for the cache-flush bio issued by
 * vdev_disk_io_flush().  EOPNOTSUPP marks the vdev as having no write
 * cache so future flushes are skipped.
 */
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	/* Device has no write cache; remember so flushes are not retried. */
	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}
708 | ||
/*
 * Issue an asynchronous write-cache flush to the device.  On success
 * (return 0) the zio is completed later by
 * vdev_disk_io_flush_completion(); a non-zero errno means no bio was
 * submitted and the caller still owns the zio.
 */
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	/* Zero-segment bio: carries only the flush request. */
	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	/* Drop any stale cached pages for this device. */
	invalidate_bdev(bdev);

	return (0);
}
60101509 | 733 | |
/*
 * ZIO pipeline entry point for disk vdevs.  Dispatches the zio by type:
 * IOCTL (cache flush), READ/WRITE (via __vdev_disk_physio), and TRIM
 * (via blkdev_issue_discard).  vd_lock is held as READER across the
 * device access so a concurrent close/reopen cannot yank vd_bdev.
 */
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	unsigned long trim_flags = 0;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			/* Tunable: administrator disabled cache flushes. */
			if (zfs_nocacheflush)
				break;

			/* Device previously reported no write cache. */
			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			/* On success the flush completes asynchronously. */
			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		/* Request immediate dispatch where the kernel supports it. */
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_TRIM:
#if defined(BLKDEV_DISCARD_SECURE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
		/* Discard takes 512-byte sector units, hence the >> 9. */
		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
		    trim_flags);

		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	/* READ/WRITE fall through to the common physio path. */
	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}
856 | ||
857 | static void | |
858 | vdev_disk_io_done(zio_t *zio) | |
859 | { | |
860 | /* | |
861 | * If the device returned EIO, we revalidate the media. If it is | |
862 | * determined the media has changed this triggers the asynchronous | |
863 | * removal of the device from the configuration. | |
864 | */ | |
865 | if (zio->io_error == EIO) { | |
d1d7e268 | 866 | vdev_t *v = zio->io_vd; |
60101509 BB |
867 | vdev_disk_t *vd = v->vdev_tsd; |
868 | ||
869 | if (check_disk_change(vd->vd_bdev)) { | |
870 | vdev_bdev_invalidate(vd->vd_bdev); | |
871 | v->vdev_remove_wanted = B_TRUE; | |
872 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
873 | } | |
874 | } | |
875 | } | |
876 | ||
877 | static void | |
878 | vdev_disk_hold(vdev_t *vd) | |
879 | { | |
880 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
881 | ||
882 | /* We must have a pathname, and it must be absolute. */ | |
883 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
884 | return; | |
885 | ||
886 | /* | |
887 | * Only prefetch path and devid info if the device has | |
888 | * never been opened. | |
889 | */ | |
890 | if (vd->vdev_tsd != NULL) | |
891 | return; | |
892 | ||
893 | /* XXX: Implement me as a vnode lookup for the device */ | |
894 | vd->vdev_name_vp = NULL; | |
895 | vd->vdev_devid_vp = NULL; | |
896 | } | |
897 | ||
/*
 * Release the hold taken by vdev_disk_hold().  Currently a no-op on
 * Linux (no vnode to release); the caller must still hold the
 * SCL_STATE config lock as writer.
 */
static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}
905 | ||
/*
 * Module parameter setter for "zfs_vdev_scheduler".  Strips a trailing
 * newline from the incoming value, applies the new elevator to every
 * active, writeable, non-suspended pool, then stores the string via the
 * stock charp handler.  Returns 0 on success or a negative errno.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	/*
	 * Values written via sysfs arrive with a trailing newline;
	 * terminate the string there.
	 * NOTE(review): this writes through the const-qualified buffer —
	 * kernel param setters conventionally receive a writable copy,
	 * but confirm against the mod_compat wrappers in use.
	 */
	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	/*
	 * spa_mode_global != 0 indicates pools may be imported; otherwise
	 * just record the value for use at the next pool open.
	 */
	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			/*
			 * Hold a reference so the pool cannot be exported
			 * while we drop the namespace lock to perform the
			 * elevator switch, which may sleep.  The lock is
			 * re-taken before spa_close() and before the next
			 * spa_next() iteration.
			 */
			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}
936 | ||
/*
 * Operations vector for disk-backed leaf vdevs.  Entries are positional
 * and must match the member order of vdev_ops_t in vdev_impl.h; NULL
 * slots use the generic/default handling for that operation.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,		/* open the backing block device */
	vdev_disk_close,	/* close the backing block device */
	vdev_default_asize,	/* default allocatable-size calculation */
	vdev_disk_io_start,	/* issue read/write/ioctl/trim I/O */
	vdev_disk_io_done,	/* I/O completion, media-change check */
	NULL,
	NULL,
	vdev_disk_hold,		/* hold (path/devid prefetch stub) */
	vdev_disk_rele,		/* release hold (no-op on Linux) */
	NULL,
	vdev_default_xlate,	/* default offset translation */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
952 | ||
/*
 * Expose "zfs_vdev_scheduler" as a writable (0644) module parameter.
 * Writes are routed through param_set_vdev_scheduler() so the new
 * elevator is applied to already-imported pools immediately; reads use
 * the stock charp getter.
 */
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");