/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/kstat.h>

/*
 * These tunables are for performance analysis.
 */

/* The maximum number of I/Os concurrently pending to each device. */
int zfs_vdev_max_pending = 10;

/*
 * The initial number of I/Os pending to each device, before it starts ramping
 * up to zfs_vdev_max_pending.
 */
int zfs_vdev_min_pending = 4;

/*
 * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
 * deadline = pri + (gethrtime() >> time_shift)
 */
int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
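/*
 * Worked example (illustrative values): with zfs_vdev_time_shift = 29 each
 * bucket spans 2^29 ns, about 0.537 s.  An I/O queued at t = 3.0 s with
 * priority 0 gets deadline (3000000000 >> 29) + 0 = 5, while a priority-4
 * I/O queued at the same instant gets deadline 5 + 4 = 9.  Each unit of
 * priority therefore costs one bucket (~0.5 s) of queueing, which bounds how
 * long a low-priority I/O can be starved by a stream of higher-priority ones.
 */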

/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;

/*
 * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
 * For read I/Os, we also aggregate across small adjacency gaps; for writes
 * we include spans of optional I/Os to aid aggregation at the disk even when
 * they aren't able to help us aggregate at this level.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
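/*
 * A note on the gap limits: for reads, two queued I/Os separated by up to
 * zfs_vdev_read_gap_limit bytes (32 KiB by default) may still be combined
 * into one aggregate that simply reads and discards the unwanted bytes in
 * between.  For writes, only I/Os flagged ZIO_FLAG_OPTIONAL are used to
 * bridge holes, and a trailing run of them is kept only while it stays
 * within zfs_vdev_write_gap_limit bytes (4 KiB by default) of the last
 * mandatory write; see the "stretch" logic in vdev_queue_io_to_issue().
 */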

/*
 * Virtual device vector for disk I/O scheduling.
 */
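/*
 * AVL comparator for the per-vdev deadline tree: order primarily by
 * io_deadline, then by io_offset, and finally by the zio pointer itself so
 * that no two queued I/Os ever compare as equal.
 */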
int
vdev_queue_deadline_compare(const void *x1, const void *x2)
{
        const zio_t *z1 = x1;
        const zio_t *z2 = x2;

        if (z1->io_deadline < z2->io_deadline)
                return (-1);
        if (z1->io_deadline > z2->io_deadline)
                return (1);

        if (z1->io_offset < z2->io_offset)
                return (-1);
        if (z1->io_offset > z2->io_offset)
                return (1);

        if (z1 < z2)
                return (-1);
        if (z1 > z2)
                return (1);

        return (0);
}

int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
        const zio_t *z1 = x1;
        const zio_t *z2 = x2;

        if (z1->io_offset < z2->io_offset)
                return (-1);
        if (z1->io_offset > z2->io_offset)
                return (1);

        if (z1 < z2)
                return (-1);
        if (z1 > z2)
                return (1);

        return (0);
}

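/*
 * Set up the per-vdev queue: one deadline tree ordering all queued I/Os, a
 * read tree and a write tree ordered by offset (used for aggregation), a
 * pending tree of I/Os already issued to the device, and a pre-allocated
 * list of zfs_vdev_max_pending aggregation buffers so aggregate I/Os never
 * need a fresh allocation when memory is tight.
 */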
void
vdev_queue_init(vdev_t *vd)
{
        vdev_queue_t *vq = &vd->vdev_queue;
        int i;

        mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);

        avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
            sizeof (zio_t), offsetof(struct zio, io_deadline_node));

        avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_offset_node));

        avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_offset_node));

        avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_offset_node));

        /*
         * A list of buffers which can be used for aggregate I/O; this
         * avoids the need to allocate them on demand when memory is low.
         */
        list_create(&vq->vq_io_list, sizeof (vdev_io_t),
            offsetof(vdev_io_t, vi_node));

        for (i = 0; i < zfs_vdev_max_pending; i++)
                list_insert_tail(&vq->vq_io_list, zio_vdev_alloc());
}

void
vdev_queue_fini(vdev_t *vd)
{
        vdev_queue_t *vq = &vd->vdev_queue;
        vdev_io_t *vi;

        avl_destroy(&vq->vq_deadline_tree);
        avl_destroy(&vq->vq_read_tree);
        avl_destroy(&vq->vq_write_tree);
        avl_destroy(&vq->vq_pending_tree);

        while ((vi = list_head(&vq->vq_io_list)) != NULL) {
                list_remove(&vq->vq_io_list, vi);
                zio_vdev_free(vi);
        }

        list_destroy(&vq->vq_io_list);

        mutex_destroy(&vq->vq_lock);
}

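/*
 * vdev_queue_io_add() and vdev_queue_io_remove() keep a queued zio in two
 * trees at once: the deadline tree (which decides what to issue next) and
 * the per-type offset tree pointed to by io_vdev_tree (which is what the
 * aggregation code walks).  They also maintain the kstat wait-queue counters
 * exported under the pool's I/O statistics.
 */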
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;

        avl_add(&vq->vq_deadline_tree, zio);
        avl_add(zio->io_vdev_tree, zio);

        if (ssh->kstat != NULL) {
                mutex_enter(&ssh->lock);
                kstat_waitq_enter(ssh->kstat->ks_data);
                mutex_exit(&ssh->lock);
        }
}

static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;

        avl_remove(&vq->vq_deadline_tree, zio);
        avl_remove(zio->io_vdev_tree, zio);

        if (ssh->kstat != NULL) {
                mutex_enter(&ssh->lock);
                kstat_waitq_exit(ssh->kstat->ks_data);
                mutex_exit(&ssh->lock);
        }
}

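/*
 * The pending tree tracks I/Os that have actually been handed to the device.
 * Adding to it marks the zio as running in the kstat run queue; removing it
 * on completion updates the run queue and the cumulative read/write
 * operation and byte counters.
 */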
static void
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;

        avl_add(&vq->vq_pending_tree, zio);

        if (ssh->kstat != NULL) {
                mutex_enter(&ssh->lock);
                kstat_runq_enter(ssh->kstat->ks_data);
                mutex_exit(&ssh->lock);
        }
}

static void
vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
        spa_t *spa = zio->io_spa;
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;

        avl_remove(&vq->vq_pending_tree, zio);

        if (ssh->kstat != NULL) {
                kstat_io_t *ksio = ssh->kstat->ks_data;

                mutex_enter(&ssh->lock);
                kstat_runq_exit(ksio);
                if (zio->io_type == ZIO_TYPE_READ) {
                        ksio->reads++;
                        ksio->nread += zio->io_size;
                } else if (zio->io_type == ZIO_TYPE_WRITE) {
                        ksio->writes++;
                        ksio->nwritten += zio->io_size;
                }
                mutex_exit(&ssh->lock);
        }
}

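/*
 * Completion callback for aggregate I/Os.  For an aggregate read, copy each
 * parent's slice of the shared buffer back into that parent's own buffer;
 * in all cases return the vdev_io_t buffer to the queue's free list.
 */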
static void
vdev_queue_agg_io_done(zio_t *aio)
{
        vdev_queue_t *vq = &aio->io_vd->vdev_queue;
        vdev_io_t *vi = aio->io_data;
        zio_t *pio;

        while ((pio = zio_walk_parents(aio)) != NULL)
                if (aio->io_type == ZIO_TYPE_READ)
                        bcopy((char *)aio->io_data + (pio->io_offset -
                            aio->io_offset), pio->io_data, pio->io_size);

        mutex_enter(&vq->vq_lock);
        list_insert_tail(&vq->vq_io_list, vi);
        mutex_exit(&vq->vq_lock);
}

/*
 * Compute the range spanned by two i/os, which is the endpoint of the last
 * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
 * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 */
#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))

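/*
 * Quick example of the macros above: if fio covers [0, 8K) and lio covers
 * [24K, 32K), then IO_SPAN(fio, lio) = 32K and IO_GAP(fio, lio) =
 * 24K - 8K = 16K.
 *
 * vdev_queue_io_to_issue() picks the next I/O to send to the device (or
 * returns NULL if the pending limit has been reached or nothing is queued).
 * Starting from the I/O with the earliest deadline, it walks the offset tree
 * in both directions collecting I/Os that may legally be aggregated, turns
 * them into a single delegated "aggregate" zio when it finds more than one,
 * and otherwise issues the deadline winner by itself.
 */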
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
        zio_t *fio, *lio, *aio, *dio, *nio, *mio;
        avl_tree_t *t;
        vdev_io_t *vi;
        int flags;
        uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
        uint64_t maxgap;
        int stretch;

again:
        ASSERT(MUTEX_HELD(&vq->vq_lock));

        if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
            avl_numnodes(&vq->vq_deadline_tree) == 0)
                return (NULL);

        fio = lio = avl_first(&vq->vq_deadline_tree);

        t = fio->io_vdev_tree;
        flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
        maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

        vi = list_head(&vq->vq_io_list);
        if (vi == NULL) {
                vi = zio_vdev_alloc();
                list_insert_head(&vq->vq_io_list, vi);
        }

        if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
                /*
                 * We can aggregate I/Os that are sufficiently adjacent and of
                 * the same flavor, as expressed by the AGG_INHERIT flags.
                 * The latter requirement is necessary so that certain
                 * attributes of the I/O, such as whether it's a normal I/O
                 * or a scrub/resilver, can be preserved in the aggregate.
                 * We can include optional I/Os, but don't allow them
                 * to begin a range as they add no benefit in that situation.
                 */

                /*
                 * We keep track of the last non-optional I/O.
                 */
                mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;

                /*
                 * Walk backwards through sufficiently contiguous I/Os
                 * recording the last non-optional I/O.
                 */
                while ((dio = AVL_PREV(t, fio)) != NULL &&
                    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
                    IO_SPAN(dio, lio) <= maxspan &&
                    IO_GAP(dio, fio) <= maxgap) {
                        fio = dio;
                        if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
                                mio = fio;
                }

                /*
                 * Skip any initial optional I/Os.
                 */
                while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
                        fio = AVL_NEXT(t, fio);
                        ASSERT(fio != NULL);
                }

                /*
                 * Walk forward through sufficiently contiguous I/Os.
                 */
                while ((dio = AVL_NEXT(t, lio)) != NULL &&
                    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
                    IO_SPAN(fio, dio) <= maxspan &&
                    IO_GAP(lio, dio) <= maxgap) {
                        lio = dio;
                        if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
                                mio = lio;
                }

                /*
                 * Now that we've established the range of the I/O aggregation
                 * we must decide what to do with trailing optional I/Os.
                 * For reads, there's nothing to do. While we are unable to
                 * aggregate further, it's possible that a trailing optional
                 * I/O would allow the underlying device to aggregate with
                 * subsequent I/Os. We must therefore determine if the next
                 * non-optional I/O is close enough to make aggregation
                 * worthwhile.
                 */
                stretch = B_FALSE;
                if (t != &vq->vq_read_tree && mio != NULL) {
                        nio = lio;
                        while ((dio = AVL_NEXT(t, nio)) != NULL &&
                            IO_GAP(nio, dio) == 0 &&
                            IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
                                nio = dio;
                                if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
                                        stretch = B_TRUE;
                                        break;
                                }
                        }
                }

                if (stretch) {
                        /* This may be a no-op. */
                        VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
                        dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
                } else {
                        while (lio != mio && lio != fio) {
                                ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
                                lio = AVL_PREV(t, lio);
                                ASSERT(lio != NULL);
                        }
                }
        }

        if (fio != lio) {
                uint64_t size = IO_SPAN(fio, lio);
                ASSERT(size <= maxspan);
                ASSERT(vi != NULL);

                aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
                    vi, size, fio->io_type, ZIO_PRIORITY_AGG,
                    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
                    vdev_queue_agg_io_done, NULL);
                aio->io_timestamp = fio->io_timestamp;

                nio = fio;
                do {
                        dio = nio;
                        nio = AVL_NEXT(t, dio);
                        ASSERT(dio->io_type == aio->io_type);
                        ASSERT(dio->io_vdev_tree == t);

                        if (dio->io_flags & ZIO_FLAG_NODATA) {
                                ASSERT(dio->io_type == ZIO_TYPE_WRITE);
                                bzero((char *)aio->io_data + (dio->io_offset -
                                    aio->io_offset), dio->io_size);
                        } else if (dio->io_type == ZIO_TYPE_WRITE) {
                                bcopy(dio->io_data, (char *)aio->io_data +
                                    (dio->io_offset - aio->io_offset),
                                    dio->io_size);
                        }

                        zio_add_child(dio, aio);
                        vdev_queue_io_remove(vq, dio);
                        zio_vdev_io_bypass(dio);
                        zio_execute(dio);
                } while (dio != lio);

                vdev_queue_pending_add(vq, aio);
                list_remove(&vq->vq_io_list, vi);

                return (aio);
        }

        ASSERT(fio->io_vdev_tree == t);
        vdev_queue_io_remove(vq, fio);

        /*
         * If the I/O is or was optional and therefore has no data, we need to
         * simply discard it. We need to drop the vdev queue's lock to avoid a
         * deadlock that we could encounter since this I/O will complete
         * immediately.
         */
        if (fio->io_flags & ZIO_FLAG_NODATA) {
                mutex_exit(&vq->vq_lock);
                zio_vdev_io_bypass(fio);
                zio_execute(fio);
                mutex_enter(&vq->vq_lock);
                goto again;
        }

        vdev_queue_pending_add(vq, fio);

        return (fio);
}

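/*
 * Pipeline entry point: queue a read or write zio on its vdev.  The zio is
 * stamped with its arrival time and deadline, placed on the queue, and then
 * we try to issue an I/O while staying under zfs_vdev_min_pending.  Returns
 * the zio the caller should issue next, or NULL if nothing should be issued
 * (including the case where an aggregate I/O was dispatched here directly
 * via zio_nowait()).
 */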
zio_t *
vdev_queue_io(zio_t *zio)
{
        vdev_queue_t *vq = &zio->io_vd->vdev_queue;
        zio_t *nio;

        ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

        if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
                return (zio);

        zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

        if (zio->io_type == ZIO_TYPE_READ)
                zio->io_vdev_tree = &vq->vq_read_tree;
        else
                zio->io_vdev_tree = &vq->vq_write_tree;

        mutex_enter(&vq->vq_lock);

        zio->io_timestamp = gethrtime();
        zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
            zio->io_priority;

        vdev_queue_io_add(vq, zio);

        nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);

        mutex_exit(&vq->vq_lock);

        if (nio == NULL)
                return (NULL);

        if (nio->io_done == vdev_queue_agg_io_done) {
                zio_nowait(nio);
                return (NULL);
        }

        return (nio);
}

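/*
 * Called when an I/O issued from this queue completes.  Remove it from the
 * pending tree, record the service time, and then issue up to
 * zfs_vdev_ramp_rate new I/Os (this time against the full
 * zfs_vdev_max_pending limit), so the number of in-flight I/Os can grow
 * exponentially from zfs_vdev_min_pending toward zfs_vdev_max_pending.
 */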
void
vdev_queue_io_done(zio_t *zio)
{
        vdev_queue_t *vq = &zio->io_vd->vdev_queue;
        int i;

        if (zio_injection_enabled)
                delay(SEC_TO_TICK(zio_handle_io_delay(zio)));

        mutex_enter(&vq->vq_lock);

        vdev_queue_pending_remove(vq, zio);

        zio->io_delta = gethrtime() - zio->io_timestamp;
        vq->vq_io_complete_ts = gethrtime();
        vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;

        for (i = 0; i < zfs_vdev_ramp_rate; i++) {
                zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
                if (nio == NULL)
                        break;
                mutex_exit(&vq->vq_lock);
                if (nio->io_done == vdev_queue_agg_io_done) {
                        zio_nowait(nio);
                } else {
                        zio_vdev_io_reissue(nio);
                        zio_execute(nio);
                }
                mutex_enter(&vq->vq_lock);
        }

        mutex_exit(&vq->vq_lock);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_max_pending, int, 0644);
MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os");

module_param(zfs_vdev_min_pending, int, 0644);
MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os");

module_param(zfs_vdev_aggregation_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");

module_param(zfs_vdev_time_shift, int, 0644);
MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O");

module_param(zfs_vdev_ramp_rate, int, 0644);
MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate");

module_param(zfs_vdev_read_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");

module_param(zfs_vdev_write_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap");
#endif