]>
Commit | Line | Data |
---|---|---|
70e083d2 TG |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
86e3c28a | 26 | * Copyright (c) 2012, 2015 by Delphix. All rights reserved. |
70e083d2 TG |
27 | */ |
28 | ||
29 | #include <sys/zfs_context.h> | |
30 | #include <sys/spa.h> | |
31 | #include <sys/vdev_disk.h> | |
32 | #include <sys/vdev_impl.h> | |
86e3c28a | 33 | #include <sys/abd.h> |
70e083d2 TG |
34 | #include <sys/fs/zfs.h> |
35 | #include <sys/zio.h> | |
36 | #include <sys/sunldi.h> | |
37 | ||
38 | char *zfs_vdev_scheduler = VDEV_SCHEDULER; | |
39 | static void *zfs_vdev_holder = VDEV_HOLDER; | |
40 | ||
41 | /* | |
42 | * Virtual device vector for disks. | |
43 | */ | |
44 | typedef struct dio_request { | |
45 | zio_t *dr_zio; /* Parent ZIO */ | |
46 | atomic_t dr_ref; /* References */ | |
47 | int dr_error; /* Bio error */ | |
48 | int dr_bio_count; /* Count of bio's */ | |
49 | struct bio *dr_bio[0]; /* Attached bio's */ | |
50 | } dio_request_t; | |
51 | ||
52 | ||
53 | #ifdef HAVE_OPEN_BDEV_EXCLUSIVE | |
54 | static fmode_t | |
55 | vdev_bdev_mode(int smode) | |
56 | { | |
57 | fmode_t mode = 0; | |
58 | ||
59 | ASSERT3S(smode & (FREAD | FWRITE), !=, 0); | |
60 | ||
61 | if (smode & FREAD) | |
62 | mode |= FMODE_READ; | |
63 | ||
64 | if (smode & FWRITE) | |
65 | mode |= FMODE_WRITE; | |
66 | ||
67 | return (mode); | |
68 | } | |
69 | #else | |
70 | static int | |
71 | vdev_bdev_mode(int smode) | |
72 | { | |
73 | int mode = 0; | |
74 | ||
75 | ASSERT3S(smode & (FREAD | FWRITE), !=, 0); | |
76 | ||
77 | if ((smode & FREAD) && !(smode & FWRITE)) | |
78 | mode = MS_RDONLY; | |
79 | ||
80 | return (mode); | |
81 | } | |
82 | #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ | |
83 | ||
84 | static uint64_t | |
85 | bdev_capacity(struct block_device *bdev) | |
86 | { | |
87 | struct hd_struct *part = bdev->bd_part; | |
88 | ||
89 | /* The partition capacity referenced by the block device */ | |
90 | if (part) | |
91 | return (part->nr_sects << 9); | |
92 | ||
93 | /* Otherwise assume the full device capacity */ | |
94 | return (get_capacity(bdev->bd_disk) << 9); | |
95 | } | |
96 | ||
97 | static void | |
98 | vdev_disk_error(zio_t *zio) | |
99 | { | |
100 | #ifdef ZFS_DEBUG | |
b49151d6 | 101 | printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu " |
86e3c28a | 102 | "flags=%x\n", zio->io_error, zio->io_type, |
70e083d2 | 103 | (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, |
86e3c28a | 104 | zio->io_flags); |
70e083d2 TG |
105 | #endif |
106 | } | |
107 | ||
108 | /* | |
109 | * Use the Linux 'noop' elevator for zfs managed block devices. This | |
110 | * strikes the ideal balance by allowing the zfs elevator to do all | |
111 | * request ordering and prioritization. While allowing the Linux | |
112 | * elevator to do the maximum front/back merging allowed by the | |
113 | * physical device. This yields the largest possible requests for | |
114 | * the device with the lowest total overhead. | |
115 | */ | |
116 | static int | |
117 | vdev_elevator_switch(vdev_t *v, char *elevator) | |
118 | { | |
119 | vdev_disk_t *vd = v->vdev_tsd; | |
120 | struct block_device *bdev = vd->vd_bdev; | |
121 | struct request_queue *q = bdev_get_queue(bdev); | |
122 | char *device = bdev->bd_disk->disk_name; | |
123 | int error; | |
124 | ||
125 | /* | |
126 | * Skip devices which are not whole disks (partitions). | |
127 | * Device-mapper devices are excepted since they may be whole | |
128 | * disks despite the vdev_wholedisk flag, in which case we can | |
129 | * and should switch the elevator. If the device-mapper device | |
130 | * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the | |
131 | * "Skip devices without schedulers" check below will fail. | |
132 | */ | |
133 | if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0) | |
134 | return (0); | |
135 | ||
136 | /* Skip devices without schedulers (loop, ram, dm, etc) */ | |
137 | if (!q->elevator || !blk_queue_stackable(q)) | |
138 | return (0); | |
139 | ||
140 | /* Leave existing scheduler when set to "none" */ | |
141 | if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4)) | |
142 | return (0); | |
143 | ||
144 | #ifdef HAVE_ELEVATOR_CHANGE | |
145 | error = elevator_change(q, elevator); | |
146 | #else | |
147 | /* | |
148 | * For pre-2.6.36 kernels elevator_change() is not available. | |
149 | * Therefore we fall back to using a usermodehelper to echo the | |
150 | * elevator into sysfs; This requires /bin/echo and sysfs to be | |
151 | * mounted which may not be true early in the boot process. | |
152 | */ | |
153 | #define SET_SCHEDULER_CMD \ | |
154 | "exec 0</dev/null " \ | |
155 | " 1>/sys/block/%s/queue/scheduler " \ | |
156 | " 2>/dev/null; " \ | |
157 | "echo %s" | |
158 | ||
159 | { | |
160 | char *argv[] = { "/bin/sh", "-c", NULL, NULL }; | |
161 | char *envp[] = { NULL }; | |
162 | ||
163 | argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator); | |
164 | error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | |
165 | strfree(argv[2]); | |
166 | } | |
167 | #endif /* HAVE_ELEVATOR_CHANGE */ | |
168 | if (error) | |
169 | printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n", | |
170 | elevator, v->vdev_path, device, error); | |
171 | ||
172 | return (error); | |
173 | } | |
174 | ||
175 | /* | |
176 | * Expanding a whole disk vdev involves invoking BLKRRPART on the | |
177 | * whole disk device. This poses a problem, because BLKRRPART will | |
178 | * return EBUSY if one of the disk's partitions is open. That's why | |
179 | * we have to do it here, just before opening the data partition. | |
180 | * Unfortunately, BLKRRPART works by dropping all partitions and | |
181 | * recreating them, which means that for a short time window, all | |
182 | * /dev/sdxN device files disappear (until udev recreates them). | |
183 | * This means two things: | |
184 | * - When we open the data partition just after a BLKRRPART, we | |
185 | * can't do it using the normal device file path because of the | |
186 | * obvious race condition with udev. Instead, we use reliable | |
187 | * kernel APIs to get a handle to the new partition device from | |
188 | * the whole disk device. | |
189 | * - Because vdev_disk_open() initially needs to find the device | |
190 | * using its path, multiple vdev_disk_open() invocations in | |
191 | * short succession on the same disk with BLKRRPARTs in the | |
192 | * middle have a high probability of failure (because of the | |
193 | * race condition with udev). A typical situation where this | |
194 | * might happen is when the zpool userspace tool does a | |
195 | * TRYIMPORT immediately followed by an IMPORT. For this | |
196 | * reason, we only invoke BLKRRPART in the module when strictly | |
197 | * necessary (zpool online -e case), and rely on userspace to | |
198 | * do it when possible. | |
199 | */ | |
200 | static struct block_device * | |
201 | vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) | |
202 | { | |
203 | #if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) | |
204 | struct block_device *bdev, *result = ERR_PTR(-ENXIO); | |
205 | struct gendisk *disk; | |
206 | int error, partno; | |
207 | ||
208 | bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder); | |
209 | if (IS_ERR(bdev)) | |
210 | return (bdev); | |
211 | ||
212 | disk = get_gendisk(bdev->bd_dev, &partno); | |
213 | vdev_bdev_close(bdev, vdev_bdev_mode(mode)); | |
214 | ||
215 | if (disk) { | |
216 | bdev = bdget(disk_devt(disk)); | |
217 | if (bdev) { | |
218 | error = blkdev_get(bdev, vdev_bdev_mode(mode), vd); | |
219 | if (error == 0) | |
220 | error = ioctl_by_bdev(bdev, BLKRRPART, 0); | |
221 | vdev_bdev_close(bdev, vdev_bdev_mode(mode)); | |
222 | } | |
223 | ||
224 | bdev = bdget_disk(disk, partno); | |
225 | if (bdev) { | |
226 | error = blkdev_get(bdev, | |
227 | vdev_bdev_mode(mode) | FMODE_EXCL, vd); | |
228 | if (error == 0) | |
229 | result = bdev; | |
230 | } | |
231 | put_disk(disk); | |
232 | } | |
233 | ||
234 | return (result); | |
235 | #else | |
236 | return (ERR_PTR(-EOPNOTSUPP)); | |
237 | #endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */ | |
238 | } | |
239 | ||
240 | static int | |
241 | vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, | |
242 | uint64_t *ashift) | |
243 | { | |
244 | struct block_device *bdev = ERR_PTR(-ENXIO); | |
245 | vdev_disk_t *vd; | |
246 | int count = 0, mode, block_size; | |
247 | ||
248 | /* Must have a pathname and it must be absolute. */ | |
249 | if (v->vdev_path == NULL || v->vdev_path[0] != '/') { | |
250 | v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
251 | return (SET_ERROR(EINVAL)); | |
252 | } | |
253 | ||
254 | /* | |
255 | * Reopen the device if it's not currently open. Otherwise, | |
256 | * just update the physical size of the device. | |
257 | */ | |
258 | if (v->vdev_tsd != NULL) { | |
259 | ASSERT(v->vdev_reopening); | |
260 | vd = v->vdev_tsd; | |
261 | goto skip_open; | |
262 | } | |
263 | ||
264 | vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); | |
265 | if (vd == NULL) | |
266 | return (SET_ERROR(ENOMEM)); | |
267 | ||
268 | /* | |
269 | * Devices are always opened by the path provided at configuration | |
270 | * time. This means that if the provided path is a udev by-id path | |
271 | * then drives may be recabled without an issue. If the provided | |
272 | * path is a udev by-path path, then the physical location information | |
273 | * will be preserved. This can be critical for more complicated | |
274 | * configurations where drives are located in specific physical | |
275 | * locations to maximize the systems tolerence to component failure. | |
276 | * Alternatively, you can provide your own udev rule to flexibly map | |
277 | * the drives as you see fit. It is not advised that you use the | |
278 | * /dev/[hd]d devices which may be reordered due to probing order. | |
279 | * Devices in the wrong locations will be detected by the higher | |
280 | * level vdev validation. | |
281 | * | |
282 | * The specified paths may be briefly removed and recreated in | |
283 | * response to udev events. This should be exceptionally unlikely | |
284 | * because the zpool command makes every effort to verify these paths | |
285 | * have already settled prior to reaching this point. Therefore, | |
286 | * a ENOENT failure at this point is highly likely to be transient | |
287 | * and it is reasonable to sleep and retry before giving up. In | |
288 | * practice delays have been observed to be on the order of 100ms. | |
289 | */ | |
290 | mode = spa_mode(v->vdev_spa); | |
291 | if (v->vdev_wholedisk && v->vdev_expanding) | |
292 | bdev = vdev_disk_rrpart(v->vdev_path, mode, vd); | |
293 | ||
294 | while (IS_ERR(bdev) && count < 50) { | |
295 | bdev = vdev_bdev_open(v->vdev_path, | |
296 | vdev_bdev_mode(mode), zfs_vdev_holder); | |
297 | if (unlikely(PTR_ERR(bdev) == -ENOENT)) { | |
298 | msleep(10); | |
299 | count++; | |
300 | } else if (IS_ERR(bdev)) { | |
301 | break; | |
302 | } | |
303 | } | |
304 | ||
305 | if (IS_ERR(bdev)) { | |
306 | dprintf("failed open v->vdev_path=%s, error=%d count=%d\n", | |
307 | v->vdev_path, -PTR_ERR(bdev), count); | |
308 | kmem_free(vd, sizeof (vdev_disk_t)); | |
309 | return (SET_ERROR(-PTR_ERR(bdev))); | |
310 | } | |
311 | ||
312 | v->vdev_tsd = vd; | |
313 | vd->vd_bdev = bdev; | |
314 | ||
315 | skip_open: | |
316 | /* Determine the physical block size */ | |
317 | block_size = vdev_bdev_block_size(vd->vd_bdev); | |
318 | ||
319 | /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ | |
320 | v->vdev_nowritecache = B_FALSE; | |
321 | ||
322 | /* Inform the ZIO pipeline that we are non-rotational */ | |
323 | v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); | |
324 | ||
325 | /* Physical volume size in bytes */ | |
326 | *psize = bdev_capacity(vd->vd_bdev); | |
327 | ||
328 | /* TODO: report possible expansion size */ | |
329 | *max_psize = *psize; | |
330 | ||
331 | /* Based on the minimum sector size set the block size */ | |
332 | *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1; | |
333 | ||
334 | /* Try to set the io scheduler elevator algorithm */ | |
335 | (void) vdev_elevator_switch(v, zfs_vdev_scheduler); | |
336 | ||
337 | return (0); | |
338 | } | |
339 | ||
340 | static void | |
341 | vdev_disk_close(vdev_t *v) | |
342 | { | |
343 | vdev_disk_t *vd = v->vdev_tsd; | |
344 | ||
345 | if (v->vdev_reopening || vd == NULL) | |
346 | return; | |
347 | ||
348 | if (vd->vd_bdev != NULL) | |
349 | vdev_bdev_close(vd->vd_bdev, | |
350 | vdev_bdev_mode(spa_mode(v->vdev_spa))); | |
351 | ||
352 | kmem_free(vd, sizeof (vdev_disk_t)); | |
353 | v->vdev_tsd = NULL; | |
354 | } | |
355 | ||
356 | static dio_request_t * | |
357 | vdev_disk_dio_alloc(int bio_count) | |
358 | { | |
359 | dio_request_t *dr; | |
360 | int i; | |
361 | ||
362 | dr = kmem_zalloc(sizeof (dio_request_t) + | |
363 | sizeof (struct bio *) * bio_count, KM_SLEEP); | |
364 | if (dr) { | |
365 | atomic_set(&dr->dr_ref, 0); | |
366 | dr->dr_bio_count = bio_count; | |
367 | dr->dr_error = 0; | |
368 | ||
369 | for (i = 0; i < dr->dr_bio_count; i++) | |
370 | dr->dr_bio[i] = NULL; | |
371 | } | |
372 | ||
373 | return (dr); | |
374 | } | |
375 | ||
376 | static void | |
377 | vdev_disk_dio_free(dio_request_t *dr) | |
378 | { | |
379 | int i; | |
380 | ||
381 | for (i = 0; i < dr->dr_bio_count; i++) | |
382 | if (dr->dr_bio[i]) | |
383 | bio_put(dr->dr_bio[i]); | |
384 | ||
385 | kmem_free(dr, sizeof (dio_request_t) + | |
386 | sizeof (struct bio *) * dr->dr_bio_count); | |
387 | } | |
388 | ||
389 | static void | |
390 | vdev_disk_dio_get(dio_request_t *dr) | |
391 | { | |
392 | atomic_inc(&dr->dr_ref); | |
393 | } | |
394 | ||
395 | static int | |
396 | vdev_disk_dio_put(dio_request_t *dr) | |
397 | { | |
398 | int rc = atomic_dec_return(&dr->dr_ref); | |
399 | ||
400 | /* | |
401 | * Free the dio_request when the last reference is dropped and | |
402 | * ensure zio_interpret is called only once with the correct zio | |
403 | */ | |
404 | if (rc == 0) { | |
405 | zio_t *zio = dr->dr_zio; | |
406 | int error = dr->dr_error; | |
407 | ||
408 | vdev_disk_dio_free(dr); | |
409 | ||
410 | if (zio) { | |
70e083d2 TG |
411 | zio->io_error = error; |
412 | ASSERT3S(zio->io_error, >=, 0); | |
413 | if (zio->io_error) | |
414 | vdev_disk_error(zio); | |
86e3c28a CIK |
415 | |
416 | zio_delay_interrupt(zio); | |
70e083d2 TG |
417 | } |
418 | } | |
419 | ||
420 | return (rc); | |
421 | } | |
422 | ||
423 | BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) | |
424 | { | |
425 | dio_request_t *dr = bio->bi_private; | |
426 | int rc; | |
427 | ||
428 | if (dr->dr_error == 0) { | |
429 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
3ab1144a | 430 | dr->dr_error = BIO_END_IO_ERROR(bio); |
70e083d2 TG |
431 | #else |
432 | if (error) | |
433 | dr->dr_error = -(error); | |
434 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
435 | dr->dr_error = EIO; | |
436 | #endif | |
437 | } | |
438 | ||
86e3c28a | 439 | /* Drop reference acquired by __vdev_disk_physio */ |
70e083d2 TG |
440 | rc = vdev_disk_dio_put(dr); |
441 | } | |
442 | ||
70e083d2 TG |
443 | static unsigned int |
444 | bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) | |
445 | { | |
446 | unsigned int offset, size, i; | |
447 | struct page *page; | |
448 | ||
449 | offset = offset_in_page(bio_ptr); | |
450 | for (i = 0; i < bio->bi_max_vecs; i++) { | |
451 | size = PAGE_SIZE - offset; | |
452 | ||
453 | if (bio_size <= 0) | |
454 | break; | |
455 | ||
456 | if (size > bio_size) | |
457 | size = bio_size; | |
458 | ||
459 | if (is_vmalloc_addr(bio_ptr)) | |
460 | page = vmalloc_to_page(bio_ptr); | |
461 | else | |
462 | page = virt_to_page(bio_ptr); | |
463 | ||
464 | /* | |
465 | * Some network related block device uses tcp_sendpage, which | |
466 | * doesn't behave well when using 0-count page, this is a | |
467 | * safety net to catch them. | |
468 | */ | |
469 | ASSERT3S(page_count(page), >, 0); | |
470 | ||
471 | if (bio_add_page(bio, page, size, offset) != size) | |
472 | break; | |
473 | ||
474 | bio_ptr += size; | |
475 | bio_size -= size; | |
476 | offset = 0; | |
477 | } | |
478 | ||
479 | return (bio_size); | |
480 | } | |
481 | ||
86e3c28a CIK |
482 | static unsigned int |
483 | bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) | |
484 | { | |
485 | if (abd_is_linear(abd)) | |
486 | return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); | |
487 | ||
488 | return (abd_scatter_bio_map_off(bio, abd, size, off)); | |
489 | } | |
490 | ||
70e083d2 TG |
/*
 * Call submit_bio() with whichever argument list the kernel provides:
 * the one-argument form when HAVE_1ARG_SUBMIT_BIO is defined, otherwise
 * the two-argument form with 0 as the first argument.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifndef HAVE_1ARG_SUBMIT_BIO
	submit_bio(0, bio);
#else
	submit_bio(bio);
#endif
}
500 | ||
86e3c28a CIK |
501 | #ifndef HAVE_BIO_SET_DEV |
502 | static inline void | |
503 | bio_set_dev(struct bio *bio, struct block_device *bdev) | |
504 | { | |
505 | bio->bi_bdev = bdev; | |
506 | } | |
507 | #endif /* !HAVE_BIO_SET_DEV */ | |
508 | ||
70e083d2 TG |
509 | static inline void |
510 | vdev_submit_bio(struct bio *bio) | |
511 | { | |
512 | #ifdef HAVE_CURRENT_BIO_TAIL | |
513 | struct bio **bio_tail = current->bio_tail; | |
514 | current->bio_tail = NULL; | |
515 | vdev_submit_bio_impl(bio); | |
516 | current->bio_tail = bio_tail; | |
517 | #else | |
518 | struct bio_list *bio_list = current->bio_list; | |
519 | current->bio_list = NULL; | |
520 | vdev_submit_bio_impl(bio); | |
521 | current->bio_list = bio_list; | |
522 | #endif | |
523 | } | |
524 | ||
525 | static int | |
86e3c28a CIK |
526 | __vdev_disk_physio(struct block_device *bdev, zio_t *zio, |
527 | size_t io_size, uint64_t io_offset, int rw, int flags) | |
70e083d2 TG |
528 | { |
529 | dio_request_t *dr; | |
86e3c28a | 530 | uint64_t abd_offset; |
70e083d2 TG |
531 | uint64_t bio_offset; |
532 | int bio_size, bio_count = 16; | |
533 | int i = 0, error = 0; | |
534 | #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) | |
535 | struct blk_plug plug; | |
536 | #endif | |
537 | ||
86e3c28a CIK |
538 | ASSERT(zio != NULL); |
539 | ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size); | |
70e083d2 TG |
540 | |
541 | retry: | |
542 | dr = vdev_disk_dio_alloc(bio_count); | |
543 | if (dr == NULL) | |
544 | return (ENOMEM); | |
545 | ||
546 | if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) | |
547 | bio_set_flags_failfast(bdev, &flags); | |
548 | ||
549 | dr->dr_zio = zio; | |
550 | ||
551 | /* | |
552 | * When the IO size exceeds the maximum bio size for the request | |
553 | * queue we are forced to break the IO in multiple bio's and wait | |
554 | * for them all to complete. Ideally, all pool users will set | |
555 | * their volume block size to match the maximum request size and | |
556 | * the common case will be one bio per vdev IO request. | |
557 | */ | |
86e3c28a CIK |
558 | |
559 | abd_offset = 0; | |
560 | bio_offset = io_offset; | |
561 | bio_size = io_size; | |
70e083d2 TG |
562 | for (i = 0; i <= dr->dr_bio_count; i++) { |
563 | ||
564 | /* Finished constructing bio's for given buffer */ | |
565 | if (bio_size <= 0) | |
566 | break; | |
567 | ||
568 | /* | |
569 | * By default only 'bio_count' bio's per dio are allowed. | |
570 | * However, if we find ourselves in a situation where more | |
571 | * are needed we allocate a larger dio and warn the user. | |
572 | */ | |
573 | if (dr->dr_bio_count == i) { | |
574 | vdev_disk_dio_free(dr); | |
575 | bio_count *= 2; | |
576 | goto retry; | |
577 | } | |
578 | ||
579 | /* bio_alloc() with __GFP_WAIT never returns NULL */ | |
580 | dr->dr_bio[i] = bio_alloc(GFP_NOIO, | |
86e3c28a CIK |
581 | MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), |
582 | BIO_MAX_PAGES)); | |
70e083d2 TG |
583 | if (unlikely(dr->dr_bio[i] == NULL)) { |
584 | vdev_disk_dio_free(dr); | |
585 | return (ENOMEM); | |
586 | } | |
587 | ||
588 | /* Matching put called by vdev_disk_physio_completion */ | |
589 | vdev_disk_dio_get(dr); | |
590 | ||
93f7b346 | 591 | bio_set_dev(dr->dr_bio[i], bdev); |
70e083d2 TG |
592 | BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; |
593 | dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; | |
594 | dr->dr_bio[i]->bi_private = dr; | |
595 | bio_set_op_attrs(dr->dr_bio[i], rw, flags); | |
596 | ||
597 | /* Remaining size is returned to become the new size */ | |
86e3c28a CIK |
598 | bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, |
599 | bio_size, abd_offset); | |
70e083d2 TG |
600 | |
601 | /* Advance in buffer and construct another bio if needed */ | |
86e3c28a | 602 | abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
70e083d2 TG |
603 | bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); |
604 | } | |
605 | ||
606 | /* Extra reference to protect dio_request during vdev_submit_bio */ | |
607 | vdev_disk_dio_get(dr); | |
70e083d2 TG |
608 | |
609 | #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) | |
610 | if (dr->dr_bio_count > 1) | |
611 | blk_start_plug(&plug); | |
612 | #endif | |
613 | ||
614 | /* Submit all bio's associated with this dio */ | |
615 | for (i = 0; i < dr->dr_bio_count; i++) | |
616 | if (dr->dr_bio[i]) | |
617 | vdev_submit_bio(dr->dr_bio[i]); | |
618 | ||
619 | #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) | |
620 | if (dr->dr_bio_count > 1) | |
621 | blk_finish_plug(&plug); | |
622 | #endif | |
623 | ||
624 | (void) vdev_disk_dio_put(dr); | |
625 | ||
626 | return (error); | |
627 | } | |
628 | ||
3ab1144a | 629 | BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) |
70e083d2 TG |
630 | { |
631 | zio_t *zio = bio->bi_private; | |
632 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
3ab1144a CIK |
633 | zio->io_error = BIO_END_IO_ERROR(bio); |
634 | #else | |
635 | zio->io_error = -error; | |
70e083d2 TG |
636 | #endif |
637 | ||
3ab1144a | 638 | if (zio->io_error && (zio->io_error == EOPNOTSUPP)) |
70e083d2 TG |
639 | zio->io_vd->vdev_nowritecache = B_TRUE; |
640 | ||
641 | bio_put(bio); | |
642 | ASSERT3S(zio->io_error, >=, 0); | |
643 | if (zio->io_error) | |
644 | vdev_disk_error(zio); | |
645 | zio_interrupt(zio); | |
646 | } | |
647 | ||
648 | static int | |
649 | vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) | |
650 | { | |
651 | struct request_queue *q; | |
652 | struct bio *bio; | |
653 | ||
654 | q = bdev_get_queue(bdev); | |
655 | if (!q) | |
656 | return (ENXIO); | |
657 | ||
658 | bio = bio_alloc(GFP_NOIO, 0); | |
659 | /* bio_alloc() with __GFP_WAIT never returns NULL */ | |
660 | if (unlikely(bio == NULL)) | |
661 | return (ENOMEM); | |
662 | ||
663 | bio->bi_end_io = vdev_disk_io_flush_completion; | |
664 | bio->bi_private = zio; | |
93f7b346 | 665 | bio_set_dev(bio, bdev); |
70e083d2 TG |
666 | bio_set_flush(bio); |
667 | vdev_submit_bio(bio); | |
668 | invalidate_bdev(bdev); | |
669 | ||
670 | return (0); | |
671 | } | |
672 | ||
673 | static void | |
674 | vdev_disk_io_start(zio_t *zio) | |
675 | { | |
676 | vdev_t *v = zio->io_vd; | |
677 | vdev_disk_t *vd = v->vdev_tsd; | |
678 | int rw, flags, error; | |
679 | ||
680 | switch (zio->io_type) { | |
681 | case ZIO_TYPE_IOCTL: | |
682 | ||
683 | if (!vdev_readable(v)) { | |
684 | zio->io_error = SET_ERROR(ENXIO); | |
685 | zio_interrupt(zio); | |
686 | return; | |
687 | } | |
688 | ||
689 | switch (zio->io_cmd) { | |
690 | case DKIOCFLUSHWRITECACHE: | |
691 | ||
692 | if (zfs_nocacheflush) | |
693 | break; | |
694 | ||
695 | if (v->vdev_nowritecache) { | |
696 | zio->io_error = SET_ERROR(ENOTSUP); | |
697 | break; | |
698 | } | |
699 | ||
700 | error = vdev_disk_io_flush(vd->vd_bdev, zio); | |
701 | if (error == 0) | |
702 | return; | |
703 | ||
704 | zio->io_error = error; | |
70e083d2 TG |
705 | |
706 | break; | |
707 | ||
708 | default: | |
709 | zio->io_error = SET_ERROR(ENOTSUP); | |
710 | } | |
711 | ||
712 | zio_execute(zio); | |
713 | return; | |
714 | case ZIO_TYPE_WRITE: | |
715 | rw = WRITE; | |
716 | #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) | |
717 | flags = (1 << BIO_RW_UNPLUG); | |
718 | #elif defined(REQ_UNPLUG) | |
719 | flags = REQ_UNPLUG; | |
720 | #else | |
721 | flags = 0; | |
722 | #endif | |
723 | break; | |
724 | ||
725 | case ZIO_TYPE_READ: | |
726 | rw = READ; | |
727 | #if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) | |
728 | flags = (1 << BIO_RW_UNPLUG); | |
729 | #elif defined(REQ_UNPLUG) | |
730 | flags = REQ_UNPLUG; | |
731 | #else | |
732 | flags = 0; | |
733 | #endif | |
734 | break; | |
735 | ||
736 | default: | |
737 | zio->io_error = SET_ERROR(ENOTSUP); | |
738 | zio_interrupt(zio); | |
739 | return; | |
740 | } | |
741 | ||
86e3c28a CIK |
742 | zio->io_target_timestamp = zio_handle_io_delay(zio); |
743 | error = __vdev_disk_physio(vd->vd_bdev, zio, | |
70e083d2 TG |
744 | zio->io_size, zio->io_offset, rw, flags); |
745 | if (error) { | |
746 | zio->io_error = error; | |
747 | zio_interrupt(zio); | |
748 | return; | |
749 | } | |
750 | } | |
751 | ||
752 | static void | |
753 | vdev_disk_io_done(zio_t *zio) | |
754 | { | |
755 | /* | |
756 | * If the device returned EIO, we revalidate the media. If it is | |
757 | * determined the media has changed this triggers the asynchronous | |
758 | * removal of the device from the configuration. | |
759 | */ | |
760 | if (zio->io_error == EIO) { | |
761 | vdev_t *v = zio->io_vd; | |
762 | vdev_disk_t *vd = v->vdev_tsd; | |
763 | ||
764 | if (check_disk_change(vd->vd_bdev)) { | |
765 | vdev_bdev_invalidate(vd->vd_bdev); | |
766 | v->vdev_remove_wanted = B_TRUE; | |
767 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
768 | } | |
769 | } | |
770 | } | |
771 | ||
772 | static void | |
773 | vdev_disk_hold(vdev_t *vd) | |
774 | { | |
775 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
776 | ||
777 | /* We must have a pathname, and it must be absolute. */ | |
778 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
779 | return; | |
780 | ||
781 | /* | |
782 | * Only prefetch path and devid info if the device has | |
783 | * never been opened. | |
784 | */ | |
785 | if (vd->vdev_tsd != NULL) | |
786 | return; | |
787 | ||
788 | /* XXX: Implement me as a vnode lookup for the device */ | |
789 | vd->vdev_name_vp = NULL; | |
790 | vd->vdev_devid_vp = NULL; | |
791 | } | |
792 | ||
793 | static void | |
794 | vdev_disk_rele(vdev_t *vd) | |
795 | { | |
796 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
797 | ||
798 | /* XXX: Implement me as a vnode rele for the device */ | |
799 | } | |
800 | ||
801 | vdev_ops_t vdev_disk_ops = { | |
802 | vdev_disk_open, | |
803 | vdev_disk_close, | |
804 | vdev_default_asize, | |
805 | vdev_disk_io_start, | |
806 | vdev_disk_io_done, | |
807 | NULL, | |
86e3c28a | 808 | NULL, |
70e083d2 TG |
809 | vdev_disk_hold, |
810 | vdev_disk_rele, | |
811 | VDEV_TYPE_DISK, /* name of this vdev type */ | |
812 | B_TRUE /* leaf vdev */ | |
813 | }; | |
814 | ||
815 | module_param(zfs_vdev_scheduler, charp, 0644); | |
816 | MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); |