]>
Commit | Line | Data |
---|---|---|
60101509 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. | |
23 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
24 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
25 | * LLNL-CODE-403049. | |
2a16d4cf | 26 | * Copyright (c) 2012, 2018 by Delphix. All rights reserved. |
60101509 BB |
27 | */ |
28 | ||
29 | #include <sys/zfs_context.h> | |
4805781c | 30 | #include <sys/spa_impl.h> |
60101509 BB |
31 | #include <sys/vdev_disk.h> |
32 | #include <sys/vdev_impl.h> | |
a6255b7f | 33 | #include <sys/abd.h> |
60101509 BB |
34 | #include <sys/fs/zfs.h> |
35 | #include <sys/zio.h> | |
36 | #include <sys/sunldi.h> | |
4805781c | 37 | #include <linux/mod_compat.h> |
2a16d4cf | 38 | #include <linux/msdos_fs.h> |
60101509 | 39 | |
6839eed2 | 40 | char *zfs_vdev_scheduler = VDEV_SCHEDULER; |
8128bd89 | 41 | static void *zfs_vdev_holder = VDEV_HOLDER; |
6839eed2 | 42 | |
2a16d4cf SH |
43 | /* size of the "reserved" partition, in blocks */ |
44 | #define EFI_MIN_RESV_SIZE (16 * 1024) | |
45 | ||
60101509 BB |
46 | /* |
47 | * Virtual device vector for disks. | |
48 | */ | |
49 | typedef struct dio_request { | |
60101509 | 50 | zio_t *dr_zio; /* Parent ZIO */ |
aa159afb | 51 | atomic_t dr_ref; /* References */ |
60101509 BB |
52 | int dr_error; /* Bio error */ |
53 | int dr_bio_count; /* Count of bio's */ | |
d1d7e268 | 54 | struct bio *dr_bio[0]; /* Attached bio's */ |
60101509 BB |
55 | } dio_request_t; |
56 | ||
57 | ||
58 | #ifdef HAVE_OPEN_BDEV_EXCLUSIVE | |
59 | static fmode_t | |
60 | vdev_bdev_mode(int smode) | |
61 | { | |
62 | fmode_t mode = 0; | |
63 | ||
64 | ASSERT3S(smode & (FREAD | FWRITE), !=, 0); | |
65 | ||
66 | if (smode & FREAD) | |
67 | mode |= FMODE_READ; | |
68 | ||
69 | if (smode & FWRITE) | |
70 | mode |= FMODE_WRITE; | |
71 | ||
d1d7e268 | 72 | return (mode); |
60101509 BB |
73 | } |
74 | #else | |
75 | static int | |
76 | vdev_bdev_mode(int smode) | |
77 | { | |
78 | int mode = 0; | |
79 | ||
80 | ASSERT3S(smode & (FREAD | FWRITE), !=, 0); | |
81 | ||
82 | if ((smode & FREAD) && !(smode & FWRITE)) | |
83 | mode = MS_RDONLY; | |
84 | ||
d1d7e268 | 85 | return (mode); |
60101509 BB |
86 | } |
87 | #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ | |
88 | ||
/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
static uint64_t
bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
{
	struct hd_struct *part = bdev->bd_part;
	/* Capacity of the underlying disk, in 512-byte sectors */
	uint64_t sectors = get_capacity(bdev->bd_disk);
	/* If there are no partitions, return the entire device capacity */
	if (part == NULL)
		return (sectors << SECTOR_BITS);

	/*
	 * If there are partitions, decide if we are using a `wholedisk`
	 * layout (composed of part1 and part9) or just a single partition.
	 */
	if (wholedisk) {
		/* Verify the expected device layout */
		ASSERT3P(bdev, !=, bdev->bd_contains);
		/*
		 * Sectors used by the EFI partition (part9) as well as
		 * partition alignment.
		 */
		uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT;

		/* Space available to the vdev, i.e. the size of part1 */
		if (sectors <= used)
			return (0);
		uint64_t available = sectors - used;
		return (available << SECTOR_BITS);
	} else {
		/* The partition capacity referenced by the block device */
		return (part->nr_sects << SECTOR_BITS);
	}
}
123 | ||
/*
 * Log the details of a failed zio to the kernel log.  Compiled to a
 * no-op unless ZFS_DEBUG is defined.
 */
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk("ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
#endif
}
134 | ||
/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization.  While allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 *
 * Recurses over all children of 'v', so it may be called with the
 * root vdev to switch the scheduler for an entire pool.  'elevator'
 * is the scheduler name; "none" leaves the current scheduler alone.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	/* Apply the setting to every child vdev first */
	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	/* Only leaf vdevs with an open block device have a queue to tune */
	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs;  This requires /bin/echo and sysfs to be
	 * mounted which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);
}
205 | ||
/*
 * Expanding a whole disk vdev involves invoking BLKRRPART on the
 * whole disk device. This poses a problem, because BLKRRPART will
 * return EBUSY if one of the disk's partitions is open. That's why
 * we have to do it here, just before opening the data partition.
 * Unfortunately, BLKRRPART works by dropping all partitions and
 * recreating them, which means that for a short time window, all
 * /dev/sdxN device files disappear (until udev recreates them).
 * This means two things:
 *  - When we open the data partition just after a BLKRRPART, we
 *    can't do it using the normal device file path because of the
 *    obvious race condition with udev. Instead, we use reliable
 *    kernel APIs to get a handle to the new partition device from
 *    the whole disk device.
 *  - Because vdev_disk_open() initially needs to find the device
 *    using its path, multiple vdev_disk_open() invocations in
 *    short succession on the same disk with BLKRRPARTs in the
 *    middle have a high probability of failure (because of the
 *    race condition with udev). A typical situation where this
 *    might happen is when the zpool userspace tool does a
 *    TRYIMPORT immediately followed by an IMPORT. For this
 *    reason, we only invoke BLKRRPART in the module when strictly
 *    necessary (zpool online -e case), and rely on userspace to
 *    do it when possible.
 *
 * Returns the exclusively-opened data partition bdev on success, or
 * an ERR_PTR on failure.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
	struct block_device *bdev, *result = ERR_PTR(-ENXIO);
	struct gendisk *disk;
	int error, partno;

	/* Briefly open by path only to resolve the gendisk and partno */
	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
	if (IS_ERR(bdev))
		return (bdev);

	disk = get_gendisk(bdev->bd_dev, &partno);
	vdev_bdev_close(bdev, vdev_bdev_mode(mode));

	if (disk) {
		/* Re-read the partition table via the whole-disk device */
		bdev = bdget(disk_devt(disk));
		if (bdev) {
			error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
			if (error == 0)
				error = ioctl_by_bdev(bdev, BLKRRPART, 0);
			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
		}

		/* Open the data partition through the kernel, not its path */
		bdev = bdget_disk(disk, partno);
		if (bdev) {
			error = blkdev_get(bdev,
			    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
			if (error == 0)
				result = bdev;
		}
		put_disk(disk);
	}

	return (result);
#else
	return (ERR_PTR(-EOPNOTSUPP));
#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
}
270 | ||
/*
 * Open the block device backing leaf vdev 'v' and report its geometry.
 * On success fills in *psize/*max_psize (usable bytes) and *ashift
 * (log2 of the physical block size) and returns 0; on failure returns
 * a positive errno via SET_ERROR().
 */
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be recabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the systems tolerence to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * a ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	/* Retry transient ENOENT for up to ~500ms (50 x 10ms) */
	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, -PTR_ERR(bdev), count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(-PTR_ERR(bdev)));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
	*max_psize = *psize;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}
368 | ||
369 | static void | |
370 | vdev_disk_close(vdev_t *v) | |
371 | { | |
372 | vdev_disk_t *vd = v->vdev_tsd; | |
373 | ||
0d8103d9 | 374 | if (v->vdev_reopening || vd == NULL) |
60101509 BB |
375 | return; |
376 | ||
377 | if (vd->vd_bdev != NULL) | |
378 | vdev_bdev_close(vd->vd_bdev, | |
d1d7e268 | 379 | vdev_bdev_mode(spa_mode(v->vdev_spa))); |
60101509 | 380 | |
d1d7e268 | 381 | kmem_free(vd, sizeof (vdev_disk_t)); |
60101509 BB |
382 | v->vdev_tsd = NULL; |
383 | } | |
384 | ||
385 | static dio_request_t * | |
386 | vdev_disk_dio_alloc(int bio_count) | |
387 | { | |
388 | dio_request_t *dr; | |
389 | int i; | |
390 | ||
d1d7e268 | 391 | dr = kmem_zalloc(sizeof (dio_request_t) + |
79c76d5b | 392 | sizeof (struct bio *) * bio_count, KM_SLEEP); |
60101509 | 393 | if (dr) { |
60101509 BB |
394 | atomic_set(&dr->dr_ref, 0); |
395 | dr->dr_bio_count = bio_count; | |
396 | dr->dr_error = 0; | |
397 | ||
398 | for (i = 0; i < dr->dr_bio_count; i++) | |
399 | dr->dr_bio[i] = NULL; | |
400 | } | |
401 | ||
d1d7e268 | 402 | return (dr); |
60101509 BB |
403 | } |
404 | ||
405 | static void | |
406 | vdev_disk_dio_free(dio_request_t *dr) | |
407 | { | |
408 | int i; | |
409 | ||
410 | for (i = 0; i < dr->dr_bio_count; i++) | |
411 | if (dr->dr_bio[i]) | |
412 | bio_put(dr->dr_bio[i]); | |
413 | ||
d1d7e268 MK |
414 | kmem_free(dr, sizeof (dio_request_t) + |
415 | sizeof (struct bio *) * dr->dr_bio_count); | |
60101509 BB |
416 | } |
417 | ||
/* Take a reference on 'dr'; paired with vdev_disk_dio_put(). */
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}
423 | ||
/*
 * Drop a reference on 'dr'.  When the last reference is released the
 * request is freed and the parent zio (if any) is completed with the
 * accumulated error.  Returns the remaining reference count.
 */
static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		/* Grab zio/error first: dr is invalid after this free */
		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}
451 | ||
784a7fe5 | 452 | BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) |
60101509 BB |
453 | { |
454 | dio_request_t *dr = bio->bi_private; | |
455 | int rc; | |
456 | ||
784a7fe5 LW |
457 | if (dr->dr_error == 0) { |
458 | #ifdef HAVE_1ARG_BIO_END_IO_T | |
36ba27e9 | 459 | dr->dr_error = BIO_END_IO_ERROR(bio); |
784a7fe5 LW |
460 | #else |
461 | if (error) | |
462 | dr->dr_error = -(error); | |
463 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | |
464 | dr->dr_error = EIO; | |
465 | #endif | |
466 | } | |
60101509 | 467 | |
b0be93e8 | 468 | /* Drop reference acquired by __vdev_disk_physio */ |
60101509 | 469 | rc = vdev_disk_dio_put(dr); |
60101509 BB |
470 | } |
471 | ||
60101509 BB |
472 | static unsigned int |
473 | bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) | |
474 | { | |
475 | unsigned int offset, size, i; | |
476 | struct page *page; | |
477 | ||
478 | offset = offset_in_page(bio_ptr); | |
479 | for (i = 0; i < bio->bi_max_vecs; i++) { | |
480 | size = PAGE_SIZE - offset; | |
481 | ||
482 | if (bio_size <= 0) | |
483 | break; | |
484 | ||
485 | if (size > bio_size) | |
486 | size = bio_size; | |
487 | ||
71f8548e | 488 | if (is_vmalloc_addr(bio_ptr)) |
60101509 BB |
489 | page = vmalloc_to_page(bio_ptr); |
490 | else | |
491 | page = virt_to_page(bio_ptr); | |
492 | ||
17584980 CC |
493 | /* |
494 | * Some network related block device uses tcp_sendpage, which | |
495 | * doesn't behave well when using 0-count page, this is a | |
496 | * safety net to catch them. | |
497 | */ | |
498 | ASSERT3S(page_count(page), >, 0); | |
499 | ||
60101509 BB |
500 | if (bio_add_page(bio, page, size, offset) != size) |
501 | break; | |
502 | ||
503 | bio_ptr += size; | |
504 | bio_size -= size; | |
505 | offset = 0; | |
506 | } | |
507 | ||
d1d7e268 | 508 | return (bio_size); |
60101509 BB |
509 | } |
510 | ||
b0be93e8 IH |
511 | static unsigned int |
512 | bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) | |
513 | { | |
514 | if (abd_is_linear(abd)) | |
515 | return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); | |
516 | ||
517 | return (abd_scatter_bio_map_off(bio, abd, size, off)); | |
518 | } | |
519 | ||
/*
 * Compatibility wrapper: newer kernels take just the bio, older ones
 * also take an (unused here) rw argument.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}
529 | ||
#ifndef HAVE_BIO_SET_DEV
/* Fallback for kernels that predate the bio_set_dev() helper */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* !HAVE_BIO_SET_DEV */
537 | ||
/*
 * Submit a bio with the task's queued-bio list temporarily detached.
 * NOTE(review): this appears intended to force immediate dispatch
 * rather than queueing on the caller's bio list (current->bio_tail on
 * old kernels, current->bio_list on newer ones) — confirm against the
 * kernel's generic_make_request() recursion handling.
 */
static inline void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}
553 | ||
/*
 * Issue the read/write described by 'zio' against 'bdev', splitting it
 * into as many bios as the request queue requires.  'rw' is READ or
 * WRITE and 'flags' are bio op flags.  Completion is asynchronous via
 * vdev_disk_physio_completion(); a 0 return means the bios were
 * submitted, not that the I/O succeeded.
 */
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif

	ASSERT(zio != NULL);
	ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (ENOMEM);

	/* Fail fast unless the zio explicitly asks us to try hard */
	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO in multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	/* '<=' is deliberate: index == count triggers the enlarge-and-retry */
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (ENOMEM);
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}
657 | ||
36ba27e9 | 658 | BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) |
60101509 BB |
659 | { |
660 | zio_t *zio = bio->bi_private; | |
784a7fe5 | 661 | #ifdef HAVE_1ARG_BIO_END_IO_T |
36ba27e9 BB |
662 | zio->io_error = BIO_END_IO_ERROR(bio); |
663 | #else | |
664 | zio->io_error = -error; | |
784a7fe5 | 665 | #endif |
60101509 | 666 | |
36ba27e9 | 667 | if (zio->io_error && (zio->io_error == EOPNOTSUPP)) |
60101509 BB |
668 | zio->io_vd->vdev_nowritecache = B_TRUE; |
669 | ||
670 | bio_put(bio); | |
d148e951 BB |
671 | ASSERT3S(zio->io_error, >=, 0); |
672 | if (zio->io_error) | |
673 | vdev_disk_error(zio); | |
60101509 | 674 | zio_interrupt(zio); |
60101509 BB |
675 | } |
676 | ||
/*
 * Issue an asynchronous write-cache flush to 'bdev' on behalf of
 * 'zio'.  Returns 0 when the flush bio was submitted (completion is
 * reported via vdev_disk_io_flush_completion), or a positive errno.
 */
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (ENXIO);

	/* Zero-segment bio: a pure flush carries no data */
	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (ENOMEM);

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}
60101509 | 701 | |
/*
 * vdev_op_io_start entry point.  Dispatches the zio: ioctls
 * (currently only DKIOCFLUSHWRITECACHE) are handled inline or via
 * vdev_disk_io_flush(); reads and writes are translated to bios by
 * __vdev_disk_physio().  This function never returns a value — all
 * outcomes are reported through the zio pipeline (zio_execute /
 * zio_interrupt / async completion).
 */
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			/* Device already known to lack a write cache */
			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return;	/* completes asynchronously */

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		/* Ask the block layer to dispatch immediately where supported */
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}
780 | ||
781 | static void | |
782 | vdev_disk_io_done(zio_t *zio) | |
783 | { | |
784 | /* | |
785 | * If the device returned EIO, we revalidate the media. If it is | |
786 | * determined the media has changed this triggers the asynchronous | |
787 | * removal of the device from the configuration. | |
788 | */ | |
789 | if (zio->io_error == EIO) { | |
d1d7e268 | 790 | vdev_t *v = zio->io_vd; |
60101509 BB |
791 | vdev_disk_t *vd = v->vdev_tsd; |
792 | ||
793 | if (check_disk_change(vd->vd_bdev)) { | |
794 | vdev_bdev_invalidate(vd->vd_bdev); | |
795 | v->vdev_remove_wanted = B_TRUE; | |
796 | spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); | |
797 | } | |
798 | } | |
799 | } | |
800 | ||
801 | static void | |
802 | vdev_disk_hold(vdev_t *vd) | |
803 | { | |
804 | ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); | |
805 | ||
806 | /* We must have a pathname, and it must be absolute. */ | |
807 | if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') | |
808 | return; | |
809 | ||
810 | /* | |
811 | * Only prefetch path and devid info if the device has | |
812 | * never been opened. | |
813 | */ | |
814 | if (vd->vdev_tsd != NULL) | |
815 | return; | |
816 | ||
817 | /* XXX: Implement me as a vnode lookup for the device */ | |
818 | vd->vdev_name_vp = NULL; | |
819 | vd->vdev_devid_vp = NULL; | |
820 | } | |
821 | ||
/* vdev_op_rele entry point — counterpart of vdev_disk_hold(); a stub. */
static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}
829 | ||
4805781c BB |
830 | static int |
831 | param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) | |
832 | { | |
833 | spa_t *spa = NULL; | |
834 | char *p; | |
835 | ||
836 | if (val == NULL) | |
837 | return (SET_ERROR(-EINVAL)); | |
838 | ||
839 | if ((p = strchr(val, '\n')) != NULL) | |
840 | *p = '\0'; | |
841 | ||
3eef58c9 | 842 | if (spa_mode_global != 0) { |
4805781c | 843 | mutex_enter(&spa_namespace_lock); |
3eef58c9 OF |
844 | while ((spa = spa_next(spa)) != NULL) { |
845 | if (spa_state(spa) != POOL_STATE_ACTIVE || | |
846 | !spa_writeable(spa) || spa_suspended(spa)) | |
847 | continue; | |
848 | ||
849 | spa_open_ref(spa, FTAG); | |
850 | mutex_exit(&spa_namespace_lock); | |
851 | vdev_elevator_switch(spa->spa_root_vdev, (char *)val); | |
852 | mutex_enter(&spa_namespace_lock); | |
853 | spa_close(spa, FTAG); | |
854 | } | |
855 | mutex_exit(&spa_namespace_lock); | |
4805781c | 856 | } |
4805781c BB |
857 | |
858 | return (param_set_charp(val, kp)); | |
859 | } | |
860 | ||
60101509 BB |
861 | vdev_ops_t vdev_disk_ops = { |
862 | vdev_disk_open, | |
863 | vdev_disk_close, | |
864 | vdev_default_asize, | |
865 | vdev_disk_io_start, | |
866 | vdev_disk_io_done, | |
867 | NULL, | |
3d6da72d | 868 | NULL, |
60101509 BB |
869 | vdev_disk_hold, |
870 | vdev_disk_rele, | |
871 | VDEV_TYPE_DISK, /* name of this vdev type */ | |
872 | B_TRUE /* leaf vdev */ | |
873 | }; | |
874 | ||
4805781c BB |
875 | module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, |
876 | param_get_charp, &zfs_vdev_scheduler, 0644); | |
c409e464 | 877 | MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); |