/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[0];	/* Attached bio's */
} dio_request_t;

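/*
 * Translate the SPA open mode (FREAD/FWRITE) into the mode argument
 * expected by vdev_bdev_open().  When open_bdev_exclusive() is available
 * the mode is expressed as an fmode_t, otherwise as an MS_RDONLY style
 * flag.
 */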
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (smode & FREAD)
		mode |= FMODE_READ;

	if (smode & FWRITE)
		mode |= FMODE_WRITE;

	return (mode);
}
#else
static int
vdev_bdev_mode(int smode)
{
	int mode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if ((smode & FREAD) && !(smode & FWRITE))
		mode = MS_RDONLY;

	return (mode);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

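/*
 * Return the usable capacity of the block device in bytes.  When the
 * device refers to a partition the partition size is used, otherwise
 * the capacity of the whole disk is returned.
 */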
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	struct hd_struct *part = bdev->bd_part;

	/* The partition capacity referenced by the block device */
	if (part)
		return (part->nr_sects << 9);

	/* Otherwise assume the full device capacity */
	return (get_capacity(bdev->bd_disk) << 9);
}

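/*
 * Log the details of a failed zio to the console.  This is only
 * compiled in for debug (ZFS_DEBUG) builds.
 */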
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
#endif
}

/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance: the zfs elevator does all request ordering
 * and prioritization, while the Linux elevator is left to do the maximum
 * front/back merging allowed by the physical device.  This yields the
 * largest possible requests for the device with the lowest total overhead.
 */
static int
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = vd->vd_bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	char *device = bdev->bd_disk->disk_name;
	int error;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return (0);

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return (0);

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return (0);

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs; This requires /bin/echo and sysfs to be
	 * mounted which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	" 1>/sys/block/%s/queue/scheduler " \
	" 2>/dev/null; " \
	"echo %s"

	{
		char *argv[] = { "/bin/sh", "-c", NULL, NULL };
		char *envp[] = { NULL };

		argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
		error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
		strfree(argv[2]);
	}
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);

	return (error);
}

/*
 * Expanding a whole disk vdev involves invoking BLKRRPART on the
 * whole disk device. This poses a problem, because BLKRRPART will
 * return EBUSY if one of the disk's partitions is open. That's why
 * we have to do it here, just before opening the data partition.
 * Unfortunately, BLKRRPART works by dropping all partitions and
 * recreating them, which means that for a short time window, all
 * /dev/sdxN device files disappear (until udev recreates them).
 * This means two things:
 *  - When we open the data partition just after a BLKRRPART, we
 *    can't do it using the normal device file path because of the
 *    obvious race condition with udev. Instead, we use reliable
 *    kernel APIs to get a handle to the new partition device from
 *    the whole disk device.
 *  - Because vdev_disk_open() initially needs to find the device
 *    using its path, multiple vdev_disk_open() invocations in
 *    short succession on the same disk with BLKRRPARTs in the
 *    middle have a high probability of failure (because of the
 *    race condition with udev). A typical situation where this
 *    might happen is when the zpool userspace tool does a
 *    TRYIMPORT immediately followed by an IMPORT. For this
 *    reason, we only invoke BLKRRPART in the module when strictly
 *    necessary (zpool online -e case), and rely on userspace to
 *    do it when possible.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
	struct gendisk *disk;
	int error, partno;

	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (bdev);

	disk = get_gendisk(bdev->bd_dev, &partno);
	vdev_bdev_close(bdev, vdev_bdev_mode(mode));

	if (disk) {
		bdev = bdget(disk_devt(disk));
		if (bdev) {
			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
			if (error == 0)
				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
		}

		bdev = bdget_disk(disk, partno);
		if (bdev) {
			error = blkdev_get(bdev,
			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
			if (error == 0)
				result = bdev;
		}
		put_disk(disk);
	}

	return (result);
#else
	return (ERR_PTR(-EOPNOTSUPP));
#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be recabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, -PTR_ERR(bdev), count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(-PTR_ERR(bdev)));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev);

	/* TODO: report possible expansion size */
	*max_psize = *psize;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL)
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));

	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

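/*
 * A dio_request is allocated with room for 'bio_count' attached bios
 * and is reference counted.  Each attached bio holds a reference for
 * the duration of its I/O; the request is freed, and its parent zio
 * completed, when the final reference is dropped in vdev_disk_dio_put().
 */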
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);
			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

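/*
 * Per-bio completion callback.  Record the first error observed on the
 * dio_request and drop the reference taken in __vdev_disk_physio(); the
 * parent zio completes once all attached bios have finished.
 */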
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = -(bio->bi_error);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

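/*
 * Number of physical pages spanned by a buffer of 'bio_size' bytes
 * starting at 'bio_ptr', used to size bio allocations.
 */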
static inline unsigned long
bio_nr_pages(void *bio_ptr, unsigned int bio_size)
{
	return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >>
	    PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT));
}

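/*
 * Map as much of the buffer as possible into the supplied bio by adding
 * its pages one at a time.  Returns the number of bytes which did not
 * fit; a non-zero result means the caller must construct another bio
 * for the remainder.
 */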
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

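/*
 * Wrapper around submit_bio() which hides the difference between the
 * one argument and two argument kernel interfaces.
 */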
static inline void
vdev_submit_bio_impl(int rw, struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	bio->bi_rw |= rw;
	submit_bio(bio);
#else
	submit_bio(rw, bio);
#endif
}

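/*
 * Save and clear the calling task's queued bio list (bio_tail or
 * bio_list depending on the kernel) around the submission, so the bio
 * is dispatched immediately rather than being deferred onto the list
 * the task is already processing.
 */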
static inline void
vdev_submit_bio(int rw, struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(rw, bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(rw, bio);
	current->bio_list = bio_list;
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
    size_t kbuf_size, uint64_t kbuf_offset, int flags)
{
	dio_request_t *dr;
	caddr_t bio_ptr;
	uint64_t bio_offset;
	int rw, bio_size, bio_count = 16;
	int i = 0, error = 0;

	ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (ENOMEM);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	rw = flags;
	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
	bio_ptr = kbuf_ptr;
	bio_offset = kbuf_offset;
	bio_size = kbuf_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (ENOMEM);
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		dr->dr_bio[i]->bi_bdev = bdev;
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_rw = rw;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;

		/* Remaining size is returned to become the new size */
		bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);

		/* Advance in buffer and construct another bio if needed */
		bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(rw, dr->dr_bio[i]);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

#ifndef __linux__
int
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
    size_t size, uint64_t offset, int flags)
{
	bio_set_flags_failfast(bdev, &flags);
	return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags));
}
#endif

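/*
 * Completion callback for the cache flush bio.  An EOPNOTSUPP result
 * means the device does not support cache flushes, so the vdev is
 * flagged with vdev_nowritecache and future flushes are skipped.
 */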
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	int rc = bio->bi_error;
#endif

	zio->io_error = -rc;
	if (rc && (rc == -EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

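/*
 * Issue an empty write barrier bio (VDEV_WRITE_FLUSH_FUA) to flush the
 * device's volatile write cache.  Completion is reported asynchronously
 * through vdev_disk_io_flush_completion().
 */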
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (ENXIO);

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (ENOMEM);

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio->bi_bdev = bdev;
	vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
	invalidate_bdev(bdev);

	return (0);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return;

			zio->io_error = error;
			if (error == ENOTSUP)
				v->vdev_nowritecache = B_TRUE;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = WRITE | (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = WRITE | REQ_UNPLUG;
#else
		flags = WRITE;
#endif
		break;

	case ZIO_TYPE_READ:
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = READ | (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = READ | REQ_UNPLUG;
#else
		flags = READ;
#endif
		break;

	default:
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
	    zio->io_size, zio->io_offset, flags);
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

#ifndef __linux__
/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
	struct block_device *bdev;
	vdev_label_t *label;
	uint64_t s, size;
	int i;

	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (-PTR_ERR(bdev));

	s = bdev_capacity(bdev);
	if (s == 0) {
		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
		return (EIO);
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = vmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	for (i = 0; i < VDEV_LABELS; i++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, i, 0);
		if (vdev_disk_physio(bdev, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	vmem_free(label, sizeof (vdev_label_t));
	vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));

	return (0);
}
#endif /* __linux__ */

module_param(zfs_vdev_scheduler, charp, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");