/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/mod_compat.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;

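/*
 * Translate the SPA open mode flags (FREAD/FWRITE) into the open mode
 * expected by the block device open interface available in this kernel.
 */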
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = SB_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		if (available > 0)
			psize = available;
		else
			psize = bdev_capacity(bdev);
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

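/*
 * Report a failed ZIO on the console, including the pool, vdev path,
 * and I/O details.
 */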
static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

	/*
	 * The elevator_change() function was available in kernels from
	 * 2.6.36 to 4.11.  When not available fall back to using the user
	 * mode helper functionality to set the elevator via sysfs.  This
	 * requires /bin/echo and sysfs to be mounted which may not be true
	 * early in the boot process.
	 */
#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	" 1>/sys/block/%s/queue/scheduler " \
	" 2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error) {
		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);
	}
}

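/*
 * Open the vdev's backing block device by path, retrying briefly while
 * udev settles, and report its block size, current capacity, and
 * maximum expandable capacity.
 */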
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size.  Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

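/*
 * A dio_request tracks all of the bio's issued on behalf of a single
 * ZIO.  A reference is taken for each outstanding bio plus one for the
 * submitter; the parent ZIO is completed when the final reference is
 * dropped in vdev_disk_dio_put().
 */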
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure the zio completion callback is invoked only once with
	 * the correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

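/*
 * Add as much of the buffer as will fit to the bio, one page at a time.
 * The number of bytes which could not be mapped is returned so the
 * caller can construct an additional bio for the remainder.
 */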
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));

	return (abd_scatter_bio_map_off(bio, abd, size, off));
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol, thus inadvertently making the
 * entire macro GPL-only.  Provide a minimal version which always assigns
 * the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(q->root_blkg, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg	vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */

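/*
 * Submit a bio for I/O.  The calling task's per-task bio list (or tail
 * pointer on older kernels) is temporarily cleared so the bio is
 * dispatched immediately rather than deferred behind the caller's own
 * in-flight submission.
 */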
static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}

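/*
 * Map the ZIO's data buffer into one or more bio's and submit them to
 * the block device.  When the I/O cannot be described by the initial
 * allocation of bio's, the dio_request is enlarged and construction is
 * retried.
 */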
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif
	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

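/*
 * Issue an asynchronous write cache flush to the device; the result is
 * reported back through vdev_disk_io_flush_completion().
 */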
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

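/*
 * ZIO pipeline entry point: translate the ZIO into asynchronous block
 * device I/O (reads, writes, or a write cache flush for the
 * DKIOCFLUSHWRITECACHE ioctl).
 */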
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

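/*
 * Module parameter handler for zfs_vdev_scheduler: apply the new default
 * elevator to every leaf vdev of each active, writeable pool before
 * storing the value.
 */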
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	NULL,
	vdev_default_xlate,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
	param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");