]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
d164b209 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f | 26 | #include <sys/zfs_context.h> |
34dc7c2f BB |
27 | #include <sys/vdev_impl.h> |
28 | #include <sys/zio.h> | |
29 | #include <sys/avl.h> | |
30 | ||
31 | /* | |
32 | * These tunables are for performance analysis. | |
33 | */ | |
34 | /* | |
35 | * zfs_vdev_max_pending is the maximum number of i/os concurrently | |
36 | * pending to each device. zfs_vdev_min_pending is the initial number | |
37 | * of i/os pending to each device (before it starts ramping up to | |
38 | * max_pending). | |
39 | */ | |
428870ff | 40 | int zfs_vdev_max_pending = 10; |
34dc7c2f BB |
41 | int zfs_vdev_min_pending = 4; |
42 | ||
428870ff | 43 | /* deadline = pri + ddi_get_lbolt64() >> time_shift) */ |
34dc7c2f BB |
44 | int zfs_vdev_time_shift = 6; |
45 | ||
46 | /* exponential I/O issue ramp-up rate */ | |
47 | int zfs_vdev_ramp_rate = 2; | |
48 | ||
49 | /* | |
45d1cae3 BB |
50 | * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. |
51 | * For read I/Os, we also aggregate across small adjacency gaps; for writes | |
52 | * we include spans of optional I/Os to aid aggregation at the disk even when | |
53 | * they aren't able to help us aggregate at this level. | |
34dc7c2f BB |
54 | */ |
55 | int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; | |
9babb374 | 56 | int zfs_vdev_read_gap_limit = 32 << 10; |
45d1cae3 | 57 | int zfs_vdev_write_gap_limit = 4 << 10; |
34dc7c2f BB |
58 | |
59 | /* | |
60 | * Virtual device vector for disk I/O scheduling. | |
61 | */ | |
62 | int | |
63 | vdev_queue_deadline_compare(const void *x1, const void *x2) | |
64 | { | |
65 | const zio_t *z1 = x1; | |
66 | const zio_t *z2 = x2; | |
67 | ||
68 | if (z1->io_deadline < z2->io_deadline) | |
69 | return (-1); | |
70 | if (z1->io_deadline > z2->io_deadline) | |
71 | return (1); | |
72 | ||
73 | if (z1->io_offset < z2->io_offset) | |
74 | return (-1); | |
75 | if (z1->io_offset > z2->io_offset) | |
76 | return (1); | |
77 | ||
78 | if (z1 < z2) | |
79 | return (-1); | |
80 | if (z1 > z2) | |
81 | return (1); | |
82 | ||
83 | return (0); | |
84 | } | |
85 | ||
86 | int | |
87 | vdev_queue_offset_compare(const void *x1, const void *x2) | |
88 | { | |
89 | const zio_t *z1 = x1; | |
90 | const zio_t *z2 = x2; | |
91 | ||
92 | if (z1->io_offset < z2->io_offset) | |
93 | return (-1); | |
94 | if (z1->io_offset > z2->io_offset) | |
95 | return (1); | |
96 | ||
97 | if (z1 < z2) | |
98 | return (-1); | |
99 | if (z1 > z2) | |
100 | return (1); | |
101 | ||
102 | return (0); | |
103 | } | |
104 | ||
105 | void | |
106 | vdev_queue_init(vdev_t *vd) | |
107 | { | |
108 | vdev_queue_t *vq = &vd->vdev_queue; | |
109 | ||
110 | mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); | |
111 | ||
112 | avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, | |
113 | sizeof (zio_t), offsetof(struct zio, io_deadline_node)); | |
114 | ||
115 | avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, | |
116 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
117 | ||
118 | avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, | |
119 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
120 | ||
121 | avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, | |
122 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
123 | } | |
124 | ||
125 | void | |
126 | vdev_queue_fini(vdev_t *vd) | |
127 | { | |
128 | vdev_queue_t *vq = &vd->vdev_queue; | |
129 | ||
130 | avl_destroy(&vq->vq_deadline_tree); | |
131 | avl_destroy(&vq->vq_read_tree); | |
132 | avl_destroy(&vq->vq_write_tree); | |
133 | avl_destroy(&vq->vq_pending_tree); | |
134 | ||
135 | mutex_destroy(&vq->vq_lock); | |
136 | } | |
137 | ||
138 | static void | |
139 | vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) | |
140 | { | |
141 | avl_add(&vq->vq_deadline_tree, zio); | |
142 | avl_add(zio->io_vdev_tree, zio); | |
143 | } | |
144 | ||
145 | static void | |
146 | vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) | |
147 | { | |
148 | avl_remove(&vq->vq_deadline_tree, zio); | |
149 | avl_remove(zio->io_vdev_tree, zio); | |
150 | } | |
151 | ||
152 | static void | |
153 | vdev_queue_agg_io_done(zio_t *aio) | |
154 | { | |
d164b209 | 155 | zio_t *pio; |
34dc7c2f | 156 | |
d164b209 | 157 | while ((pio = zio_walk_parents(aio)) != NULL) |
34dc7c2f | 158 | if (aio->io_type == ZIO_TYPE_READ) |
d164b209 BB |
159 | bcopy((char *)aio->io_data + (pio->io_offset - |
160 | aio->io_offset), pio->io_data, pio->io_size); | |
34dc7c2f BB |
161 | |
162 | zio_buf_free(aio->io_data, aio->io_size); | |
163 | } | |
164 | ||
9babb374 BB |
/*
 * IO_SPAN(fio, lio) is the range covered by two i/os: the end of the last
 * one (lio->io_offset + lio->io_size) minus the start of the first one
 * (fio->io_offset).
 *
 * IO_GAP(fio, lio) is the distance between the end of fio and the start
 * of lio, which is simply -IO_SPAN(lio, fio).  The two i/os are adjacent
 * exactly when that value is 0.
 */
#define	IO_SPAN(fio, lio)	\
	((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio)	(-IO_SPAN(lio, fio))
34dc7c2f BB |
173 | |
174 | static zio_t * | |
175 | vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) | |
176 | { | |
45d1cae3 | 177 | zio_t *fio, *lio, *aio, *dio, *nio, *mio; |
d164b209 | 178 | avl_tree_t *t; |
fb5f0bc8 | 179 | int flags; |
9babb374 BB |
180 | uint64_t maxspan = zfs_vdev_aggregation_limit; |
181 | uint64_t maxgap; | |
45d1cae3 | 182 | int stretch; |
34dc7c2f | 183 | |
45d1cae3 | 184 | again: |
34dc7c2f BB |
185 | ASSERT(MUTEX_HELD(&vq->vq_lock)); |
186 | ||
187 | if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || | |
188 | avl_numnodes(&vq->vq_deadline_tree) == 0) | |
189 | return (NULL); | |
190 | ||
191 | fio = lio = avl_first(&vq->vq_deadline_tree); | |
192 | ||
d164b209 | 193 | t = fio->io_vdev_tree; |
fb5f0bc8 | 194 | flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; |
9babb374 | 195 | maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; |
fb5f0bc8 BB |
196 | |
197 | if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { | |
198 | /* | |
45d1cae3 BB |
199 | * We can aggregate I/Os that are sufficiently adjacent and of |
200 | * the same flavor, as expressed by the AGG_INHERIT flags. | |
201 | * The latter requirement is necessary so that certain | |
202 | * attributes of the I/O, such as whether it's a normal I/O | |
203 | * or a scrub/resilver, can be preserved in the aggregate. | |
204 | * We can include optional I/Os, but don't allow them | |
205 | * to begin a range as they add no benefit in that situation. | |
206 | */ | |
207 | ||
208 | /* | |
209 | * We keep track of the last non-optional I/O. | |
210 | */ | |
211 | mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; | |
212 | ||
213 | /* | |
214 | * Walk backwards through sufficiently contiguous I/Os | |
215 | * recording the last non-option I/O. | |
fb5f0bc8 | 216 | */ |
d164b209 | 217 | while ((dio = AVL_PREV(t, fio)) != NULL && |
fb5f0bc8 | 218 | (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && |
45d1cae3 BB |
219 | IO_SPAN(dio, lio) <= maxspan && |
220 | IO_GAP(dio, fio) <= maxgap) { | |
fb5f0bc8 | 221 | fio = dio; |
45d1cae3 BB |
222 | if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) |
223 | mio = fio; | |
224 | } | |
225 | ||
226 | /* | |
227 | * Skip any initial optional I/Os. | |
228 | */ | |
229 | while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { | |
230 | fio = AVL_NEXT(t, fio); | |
231 | ASSERT(fio != NULL); | |
232 | } | |
9babb374 | 233 | |
45d1cae3 BB |
234 | /* |
235 | * Walk forward through sufficiently contiguous I/Os. | |
236 | */ | |
d164b209 | 237 | while ((dio = AVL_NEXT(t, lio)) != NULL && |
fb5f0bc8 | 238 | (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && |
45d1cae3 BB |
239 | IO_SPAN(fio, dio) <= maxspan && |
240 | IO_GAP(lio, dio) <= maxgap) { | |
fb5f0bc8 | 241 | lio = dio; |
45d1cae3 BB |
242 | if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) |
243 | mio = lio; | |
244 | } | |
245 | ||
246 | /* | |
247 | * Now that we've established the range of the I/O aggregation | |
248 | * we must decide what to do with trailing optional I/Os. | |
249 | * For reads, there's nothing to do. While we are unable to | |
250 | * aggregate further, it's possible that a trailing optional | |
251 | * I/O would allow the underlying device to aggregate with | |
252 | * subsequent I/Os. We must therefore determine if the next | |
253 | * non-optional I/O is close enough to make aggregation | |
254 | * worthwhile. | |
255 | */ | |
256 | stretch = B_FALSE; | |
257 | if (t != &vq->vq_read_tree && mio != NULL) { | |
258 | nio = lio; | |
259 | while ((dio = AVL_NEXT(t, nio)) != NULL && | |
260 | IO_GAP(nio, dio) == 0 && | |
261 | IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { | |
262 | nio = dio; | |
263 | if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { | |
264 | stretch = B_TRUE; | |
265 | break; | |
266 | } | |
267 | } | |
268 | } | |
269 | ||
270 | if (stretch) { | |
271 | /* This may be a no-op. */ | |
272 | VERIFY((dio = AVL_NEXT(t, lio)) != NULL); | |
273 | dio->io_flags &= ~ZIO_FLAG_OPTIONAL; | |
274 | } else { | |
275 | while (lio != mio && lio != fio) { | |
276 | ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); | |
277 | lio = AVL_PREV(t, lio); | |
278 | ASSERT(lio != NULL); | |
279 | } | |
280 | } | |
34dc7c2f BB |
281 | } |
282 | ||
283 | if (fio != lio) { | |
9babb374 | 284 | uint64_t size = IO_SPAN(fio, lio); |
34dc7c2f BB |
285 | ASSERT(size <= zfs_vdev_aggregation_limit); |
286 | ||
b128c09f | 287 | aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, |
428870ff | 288 | zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, |
fb5f0bc8 | 289 | flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, |
34dc7c2f BB |
290 | vdev_queue_agg_io_done, NULL); |
291 | ||
9babb374 BB |
292 | nio = fio; |
293 | do { | |
294 | dio = nio; | |
295 | nio = AVL_NEXT(t, dio); | |
34dc7c2f | 296 | ASSERT(dio->io_type == aio->io_type); |
d164b209 BB |
297 | ASSERT(dio->io_vdev_tree == t); |
298 | ||
45d1cae3 BB |
299 | if (dio->io_flags & ZIO_FLAG_NODATA) { |
300 | ASSERT(dio->io_type == ZIO_TYPE_WRITE); | |
301 | bzero((char *)aio->io_data + (dio->io_offset - | |
302 | aio->io_offset), dio->io_size); | |
303 | } else if (dio->io_type == ZIO_TYPE_WRITE) { | |
d164b209 BB |
304 | bcopy(dio->io_data, (char *)aio->io_data + |
305 | (dio->io_offset - aio->io_offset), | |
306 | dio->io_size); | |
45d1cae3 | 307 | } |
d164b209 BB |
308 | |
309 | zio_add_child(dio, aio); | |
34dc7c2f BB |
310 | vdev_queue_io_remove(vq, dio); |
311 | zio_vdev_io_bypass(dio); | |
d164b209 | 312 | zio_execute(dio); |
9babb374 | 313 | } while (dio != lio); |
34dc7c2f | 314 | |
34dc7c2f BB |
315 | avl_add(&vq->vq_pending_tree, aio); |
316 | ||
317 | return (aio); | |
318 | } | |
319 | ||
d164b209 | 320 | ASSERT(fio->io_vdev_tree == t); |
34dc7c2f BB |
321 | vdev_queue_io_remove(vq, fio); |
322 | ||
45d1cae3 BB |
323 | /* |
324 | * If the I/O is or was optional and therefore has no data, we need to | |
325 | * simply discard it. We need to drop the vdev queue's lock to avoid a | |
326 | * deadlock that we could encounter since this I/O will complete | |
327 | * immediately. | |
328 | */ | |
329 | if (fio->io_flags & ZIO_FLAG_NODATA) { | |
330 | mutex_exit(&vq->vq_lock); | |
331 | zio_vdev_io_bypass(fio); | |
332 | zio_execute(fio); | |
333 | mutex_enter(&vq->vq_lock); | |
334 | goto again; | |
335 | } | |
336 | ||
34dc7c2f BB |
337 | avl_add(&vq->vq_pending_tree, fio); |
338 | ||
339 | return (fio); | |
340 | } | |
341 | ||
342 | zio_t * | |
343 | vdev_queue_io(zio_t *zio) | |
344 | { | |
345 | vdev_queue_t *vq = &zio->io_vd->vdev_queue; | |
346 | zio_t *nio; | |
347 | ||
348 | ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); | |
349 | ||
350 | if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) | |
351 | return (zio); | |
352 | ||
353 | zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; | |
354 | ||
355 | if (zio->io_type == ZIO_TYPE_READ) | |
356 | zio->io_vdev_tree = &vq->vq_read_tree; | |
357 | else | |
358 | zio->io_vdev_tree = &vq->vq_write_tree; | |
359 | ||
360 | mutex_enter(&vq->vq_lock); | |
361 | ||
428870ff BB |
362 | zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + |
363 | zio->io_priority; | |
34dc7c2f BB |
364 | |
365 | vdev_queue_io_add(vq, zio); | |
366 | ||
367 | nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); | |
368 | ||
369 | mutex_exit(&vq->vq_lock); | |
370 | ||
371 | if (nio == NULL) | |
372 | return (NULL); | |
373 | ||
374 | if (nio->io_done == vdev_queue_agg_io_done) { | |
375 | zio_nowait(nio); | |
376 | return (NULL); | |
377 | } | |
378 | ||
379 | return (nio); | |
380 | } | |
381 | ||
382 | void | |
383 | vdev_queue_io_done(zio_t *zio) | |
384 | { | |
385 | vdev_queue_t *vq = &zio->io_vd->vdev_queue; | |
d6320ddb | 386 | int i; |
34dc7c2f BB |
387 | |
388 | mutex_enter(&vq->vq_lock); | |
389 | ||
390 | avl_remove(&vq->vq_pending_tree, zio); | |
391 | ||
d6320ddb | 392 | for (i = 0; i < zfs_vdev_ramp_rate; i++) { |
b128c09f | 393 | zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); |
34dc7c2f BB |
394 | if (nio == NULL) |
395 | break; | |
396 | mutex_exit(&vq->vq_lock); | |
397 | if (nio->io_done == vdev_queue_agg_io_done) { | |
398 | zio_nowait(nio); | |
399 | } else { | |
400 | zio_vdev_io_reissue(nio); | |
401 | zio_execute(nio); | |
402 | } | |
403 | mutex_enter(&vq->vq_lock); | |
404 | } | |
405 | ||
406 | mutex_exit(&vq->vq_lock); | |
407 | } | |
c28b2279 BB |
408 | |
#if defined(_KERNEL) && defined(HAVE_SPL)
/*
 * Expose the queue tunables defined at the top of this file as module
 * parameters (mode 0644).
 */
module_param(zfs_vdev_max_pending, int, 0644);
MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os");

module_param(zfs_vdev_min_pending, int, 0644);
MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os");

module_param(zfs_vdev_aggregation_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");

module_param(zfs_vdev_time_shift, int, 0644);
MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O");

module_param(zfs_vdev_ramp_rate, int, 0644);
MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate");

module_param(zfs_vdev_read_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");

module_param(zfs_vdev_write_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap");
#endif /* _KERNEL && HAVE_SPL */