]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
26 | * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | |
27 | */ | |
28 | ||
29 | #include <sys/zfs_context.h> | |
30 | #include <sys/spa_impl.h> | |
31 | #include <sys/vdev_disk.h> | |
32 | #include <sys/vdev_impl.h> | |
33 | #include <sys/vdev_trim.h> | |
34 | #include <sys/abd.h> | |
35 | #include <sys/fs/zfs.h> | |
36 | #include <sys/zio.h> | |
37 | #include <linux/mod_compat.h> | |
38 | #include <linux/msdos_fs.h> | |
39 | #include <linux/vfs_compat.h> | |
40 | ||
/* I/O scheduler name applied to whole-disk vdevs (module parameter). */
char *zfs_vdev_scheduler = VDEV_SCHEDULER;

/* Opaque holder token identifying ZFS as the exclusive bdev owner. */
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)
46 | ||
47 | /* | |
48 | * Virtual device vector for disks. | |
49 | */ | |
50 | typedef struct dio_request { | |
51 | zio_t *dr_zio; /* Parent ZIO */ | |
52 | atomic_t dr_ref; /* References */ | |
53 | int dr_error; /* Bio error */ | |
54 | int dr_bio_count; /* Count of bio's */ | |
55 | struct bio *dr_bio[0]; /* Attached bio's */ | |
56 | } dio_request_t; | |
57 | ||
58 | ||
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
/*
 * Translate a spa_mode() FREAD/FWRITE mask into the fmode_t flags
 * expected by the open_bdev_exclusive() family of interfaces.
 */
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	/* At least one of read or write access must be requested. */
	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
/*
 * Translate a spa_mode() FREAD/FWRITE mask into super-block open flags
 * for kernels without open_bdev_exclusive().  Only a read-only open
 * needs flagging (SB_RDONLY); read-write access is the default.
 */
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	/* At least one of read or write access must be requested. */
	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = SB_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
89 | ||
/*
 * Returns the usable capacity (in bytes) for the partition or disk,
 * as reported by the size of the block device's backing inode.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}
98 | ||
/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger,
 * or at the very least equal, to its usable capacity to prevent
 * overestimating the pool expandsize.
 *
 * 'wholedisk' is non-zero when ZFS owns the entire device rather than a
 * single partition of it.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	/* Whole-disk case: 'bdev' is a partition of a larger device. */
	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition layout
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
		 * "reserved" EFI partition: in such cases return the device
		 * usable capacity.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}
147 | ||
/*
 * Log a warning describing a failed zio (pool, vdev path, error, type,
 * offset, size and flags).
 *
 * This function can be called in interrupt context, for instance while
 * handling IRQs coming from a misbehaving disk device; use printk()
 * which is safe from any context.
 */
static void
vdev_disk_error(zio_t *zio)
{
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}
162 | ||
163 | /* | |
164 | * Use the Linux 'noop' elevator for zfs managed block devices. This | |
165 | * strikes the ideal balance by allowing the zfs elevator to do all | |
166 | * request ordering and prioritization. While allowing the Linux | |
167 | * elevator to do the maximum front/back merging allowed by the | |
168 | * physical device. This yields the largest possible requests for | |
169 | * the device with the lowest total overhead. | |
170 | */ | |
171 | static void | |
172 | vdev_elevator_switch(vdev_t *v, char *elevator) | |
173 | { | |
174 | vdev_disk_t *vd = v->vdev_tsd; | |
175 | struct request_queue *q; | |
176 | char *device; | |
177 | int error; | |
178 | ||
179 | for (int c = 0; c < v->vdev_children; c++) | |
180 | vdev_elevator_switch(v->vdev_child[c], elevator); | |
181 | ||
182 | if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL) | |
183 | return; | |
184 | ||
185 | q = bdev_get_queue(vd->vd_bdev); | |
186 | device = vd->vd_bdev->bd_disk->disk_name; | |
187 | ||
188 | /* | |
189 | * Skip devices which are not whole disks (partitions). | |
190 | * Device-mapper devices are excepted since they may be whole | |
191 | * disks despite the vdev_wholedisk flag, in which case we can | |
192 | * and should switch the elevator. If the device-mapper device | |
193 | * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the | |
194 | * "Skip devices without schedulers" check below will fail. | |
195 | */ | |
196 | if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0) | |
197 | return; | |
198 | ||
199 | /* Leave existing scheduler when set to "none" */ | |
200 | if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4)) | |
201 | return; | |
202 | ||
203 | /* | |
204 | * The elevator_change() function was available in kernels from | |
205 | * 2.6.36 to 4.11. When not available fall back to using the user | |
206 | * mode helper functionality to set the elevator via sysfs. This | |
207 | * requires /bin/echo and sysfs to be mounted which may not be true | |
208 | * early in the boot process. | |
209 | */ | |
210 | #ifdef HAVE_ELEVATOR_CHANGE | |
211 | error = elevator_change(q, elevator); | |
212 | #else | |
213 | #define SET_SCHEDULER_CMD \ | |
214 | "exec 0</dev/null " \ | |
215 | " 1>/sys/block/%s/queue/scheduler " \ | |
216 | " 2>/dev/null; " \ | |
217 | "echo %s" | |
218 | ||
219 | char *argv[] = { "/bin/sh", "-c", NULL, NULL }; | |
220 | char *envp[] = { NULL }; | |
221 | ||
222 | argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator); | |
223 | error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | |
224 | strfree(argv[2]); | |
225 | #endif /* HAVE_ELEVATOR_CHANGE */ | |
226 | if (error) { | |
227 | zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d", | |
228 | elevator, v->vdev_path, device, error); | |
229 | } | |
230 | } | |
231 | ||
/*
 * vdev operation: open the block device backing this leaf vdev.
 *
 * On success returns 0 and reports, via the out parameters, the usable
 * size (*psize), the maximum expansion size (*max_psize), and the
 * ashift derived from the device's block size (*ashift).  On failure a
 * positive errno is returned and the vdev is left with vd_bdev == NULL.
 * Holds vd_lock as writer while vd_bdev is being (re)assigned.
 */
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size.  Then
	 * since udev may need to recreate the device links increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			/* Expanding a partition: remember the whole disk. */
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		/* Briefly open the whole disk to re-read its partitions. */
		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				/* Allow udev extra time to recreate links. */
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the systems tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * a ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			/* Likely transient: wait 10ms and retry. */
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d", error, count);
		/* Keep the (empty) vdev_disk_t so a later reopen can retry. */
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct request_queue *q = bdev_get_queue(vd->vd_bdev);

	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = !!blk_queue_discard(q);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(q);

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}
368 | ||
/*
 * vdev operation: close the backing block device and free the per-vdev
 * state.  A no-op while vdev_reopen() is in progress or if the vdev
 * was never opened (vdev_tsd == NULL).
 */
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		/* Release the exclusive open taken in vdev_disk_open(). */
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}
386 | ||
387 | static dio_request_t * | |
388 | vdev_disk_dio_alloc(int bio_count) | |
389 | { | |
390 | dio_request_t *dr; | |
391 | int i; | |
392 | ||
393 | dr = kmem_zalloc(sizeof (dio_request_t) + | |
394 | sizeof (struct bio *) * bio_count, KM_SLEEP); | |
395 | if (dr) { | |
396 | atomic_set(&dr->dr_ref, 0); | |
397 | dr->dr_bio_count = bio_count; | |
398 | dr->dr_error = 0; | |
399 | ||
400 | for (i = 0; i < dr->dr_bio_count; i++) | |
401 | dr->dr_bio[i] = NULL; | |
402 | } | |
403 | ||
404 | return (dr); | |
405 | } | |
406 | ||
407 | static void | |
408 | vdev_disk_dio_free(dio_request_t *dr) | |
409 | { | |
410 | int i; | |
411 | ||
412 | for (i = 0; i < dr->dr_bio_count; i++) | |
413 | if (dr->dr_bio[i]) | |
414 | bio_put(dr->dr_bio[i]); | |
415 | ||
416 | kmem_free(dr, sizeof (dio_request_t) + | |
417 | sizeof (struct bio *) * dr->dr_bio_count); | |
418 | } | |
419 | ||
/* Take a reference on the dio_request. */
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}
425 | ||
/*
 * Drop a reference on the dio_request.  When the last reference is
 * released the accumulated error is propagated to the parent ZIO, the
 * request is freed, and the ZIO completion is scheduled.  Returns the
 * remaining reference count.
 */
static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		/* Capture fields before the request is freed below. */
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}
453 | ||
454 | BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) | |
455 | { | |
456 | dio_request_t *dr = bio->bi_private; | |
457 | int rc; | |
458 | ||
459 | if (dr->dr_error == 0) { | |
460 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
461 | dr->dr_error = BIO_END_IO_ERROR(bio); | |
462 | #else | |
463 | if (error) | |
464 | dr->dr_error = -(error); | |
465 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
466 | dr->dr_error = EIO; | |
467 | #endif | |
468 | } | |
469 | ||
470 | /* Drop reference acquired by __vdev_disk_physio */ | |
471 | rc = vdev_disk_dio_put(dr); | |
472 | } | |
473 | ||
/*
 * Attach the pages backing [bio_ptr, bio_ptr + bio_size) to the bio,
 * handling both vmalloc'd and directly-mapped buffers.  Returns the
 * number of bytes which could NOT be mapped; a non-zero remainder
 * means the caller must build an additional bio for the rest.
 */
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		/* Map at most the remainder of one page per iteration. */
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		/* Translate the virtual address to its backing page. */
		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network related block device uses tcp_sendpage, which
		 * doesn't behave well when using 0-count page, this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		/* Stop early if the bio will not accept the full chunk. */
		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}
512 | ||
513 | static unsigned int | |
514 | bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) | |
515 | { | |
516 | if (abd_is_linear(abd)) | |
517 | return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); | |
518 | ||
519 | return (abd_scatter_bio_map_off(bio, abd, size, off)); | |
520 | } | |
521 | ||
/*
 * Submit a bio, papering over the kernel interface change between the
 * one-argument and two-argument forms of submit_bio().
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}
531 | ||
#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	/* Only assign root_blkg when a reference can be taken on it. */
	if (blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg	vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
563 | ||
/*
 * Submit a bio with the current task's bio_list (bio_tail on older
 * kernels) temporarily cleared, then restored.  NOTE(review): this
 * presumably forces immediate dispatch instead of queueing the bio on
 * an in-progress submission list of the current task — confirm the
 * intended deadlock-avoidance semantics against the block layer docs.
 */
static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}
579 | ||
/*
 * Build and submit the bio's needed to service a read or write ZIO
 * against 'bdev'.  Large IOs are split across multiple bio's; if the
 * initial bio_count guess (16) proves too small the request is freed
 * and rebuilt with twice the capacity.  Completion is asynchronous via
 * vdev_disk_physio_completion().  Returns 0 when all bio's were
 * submitted, or a positive errno when the request could not be built.
 */
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif
	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	/* Allow fast-fail unless the zio is a retry or try-hard request. */
	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO in multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	/* Iterate one past dr_bio_count so exhaustion can be detected. */
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		/* Byte offset converted to 512-byte sectors. */
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	/* Plug the queue so consecutive bio's may be merged on dispatch. */
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	/* Drop the submission reference; may complete the zio inline. */
	(void) vdev_disk_dio_put(dr);

	return (error);
}
689 | ||
/*
 * Completion callback for the cache-flush bio issued by
 * vdev_disk_io_flush().  An EOPNOTSUPP result marks the vdev as having
 * no write cache so subsequent flushes are skipped.
 */
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	/* Flush unsupported by this device; disable future attempts. */
	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}
708 | ||
/*
 * Issue an asynchronous write-cache flush (an empty flush bio) to the
 * device.  Completion is reported through
 * vdev_disk_io_flush_completion().  Returns 0 once the bio has been
 * submitted, or a positive errno when it could not be.
 */
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	/* Drop any cached pages for this device after queueing the flush. */
	invalidate_bdev(bdev);

	return (0);
}
733 | ||
/*
 * vdev operation: dispatch a ZIO to the backing block device.
 *
 * Flush requests and unsupported ioctls are resolved inline; reads
 * and writes are translated to bio's via __vdev_disk_physio(); TRIM
 * is issued through blkdev_issue_discard().  vd_lock is held as
 * reader across the dispatch to serialize against vdev_disk_open()
 * swapping out vd_bdev, and every exit path releases it.
 */
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	unsigned long trim_flags = 0;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			/* On success, completion is fully asynchronous. */
			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_TRIM:
#if defined(BLKDEV_DISCARD_SECURE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
		/* Byte offset/size converted to 512-byte sectors. */
		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
		    trim_flags);

		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	/* Read/write path: build and submit the bio's. */
	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}
856 | ||
/*
 * vdev operation: post-IO hook.  On EIO, revalidate the media and, if
 * it has changed, request asynchronous removal of the device.
 */
static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		/*
		 * NOTE(review): vd and vd->vd_bdev are assumed non-NULL
		 * because the zio was issued through an open vdev; confirm
		 * a concurrent vdev_disk_close() cannot race this path.
		 */
		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}
876 | ||
877 | static void | |
878 | vdev_disk_hold(vdev_t *vd) | |
879 | { | |
880 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
881 | ||
882 | /* We must have a pathname, and it must be absolute. */ | |
883 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
884 | return; | |
885 | ||
886 | /* | |
887 | * Only prefetch path and devid info if the device has | |
888 | * never been opened. | |
889 | */ | |
890 | if (vd->vdev_tsd != NULL) | |
891 | return; | |
892 | ||
893 | /* XXX: Implement me as a vnode lookup for the device */ | |
894 | vd->vdev_name_vp = NULL; | |
895 | vd->vdev_devid_vp = NULL; | |
896 | } | |
897 | ||
/*
 * vdev operation: rele hook matching vdev_disk_hold().  No vnode
 * reference is currently taken, so there is nothing to release.
 */
static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}
905 | ||
/*
 * Module parameter setter for 'zfs_vdev_scheduler'.  Applies the new
 * elevator to every active, writable, non-suspended pool before
 * storing the value.  The namespace lock is dropped around the
 * per-pool work (with a hold on the pool) so the elevator switch can
 * run without holding it.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	/* Strip the trailing newline appended by sysfs writes. */
	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	/* Only walk the pools once the module is fully initialized. */
	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			/* Hold the pool while operating without the lock. */
			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}
936 | ||
/*
 * Operations vector registered for disk-backed leaf vdevs.  NULL
 * entries are optional hooks not implemented for disk vdevs.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,		/* open */
	vdev_disk_close,	/* close */
	vdev_default_asize,	/* asize */
	vdev_disk_io_start,	/* io_start */
	vdev_disk_io_done,	/* io_done */
	NULL,
	NULL,
	vdev_disk_hold,		/* hold */
	vdev_disk_rele,		/* rele */
	NULL,
	vdev_default_xlate,	/* xlate */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
952 | ||
/* Expose 'zfs_vdev_scheduler' as a writable (0644) module parameter. */
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");