4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "@(#)vdev_queue.c 1.6 07/11/27 SMI"
28 #include <sys/zfs_context.h>
30 #include <sys/vdev_impl.h>
35 * These tunables are for performance analysis.
38 * zfs_vdev_max_pending is the maximum number of i/os concurrently
39 * pending to each device. zfs_vdev_min_pending is the initial number
40 * of i/os pending to each device (before it starts ramping up to
41 * zfs_vdev_max_pending).
/* Upper bound and initial value for the per-device pending i/o count. */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/* deadline = pri + (lbolt >> time_shift) */
int zfs_vdev_time_shift = 6;

/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;
53 * i/os will be aggregated into a single large i/o up to
54 * zfs_vdev_aggregation_limit bytes long.
56 int zfs_vdev_aggregation_limit
= SPA_MAXBLOCKSIZE
;
59 * Virtual device vector for disk I/O scheduling.
/*
 * Ordering callback for vq_deadline_tree: it is the comparator handed to
 * avl_create() for that tree in vdev_queue_init().  The visible tests
 * compare the two elements by io_deadline first and then by io_offset.
 *
 * NOTE(review): the return type, the declarations of z1/z2 (presumably
 * x1 and x2 viewed as zio_t pointers), every return statement and any
 * tie-break that follows the io_offset tests are not present in this
 * extract; confirm them against the complete source.
 */
62 vdev_queue_deadline_compare(const void *x1
, const void *x2
)
/* First key: io_deadline. */
67 if (z1
->io_deadline
< z2
->io_deadline
)
69 if (z1
->io_deadline
> z2
->io_deadline
)
/* Second key: io_offset. */
72 if (z1
->io_offset
< z2
->io_offset
)
74 if (z1
->io_offset
> z2
->io_offset
)
/*
 * Ordering callback for vq_read_tree, vq_write_tree and vq_pending_tree:
 * vdev_queue_init() passes it to avl_create() for all three.  The visible
 * tests compare the two elements by io_offset.
 *
 * NOTE(review): the return type, the declarations of z1/z2 (presumably
 * x1 and x2 viewed as zio_t pointers), the return statements and any
 * tie-break after the io_offset tests are missing from this extract.
 */
86 vdev_queue_offset_compare(const void *x1
, const void *x2
)
91 if (z1
->io_offset
< z2
->io_offset
)
93 if (z1
->io_offset
> z2
->io_offset
)
/*
 * Set up the I/O queue embedded in vd (vd->vdev_queue): initialise its
 * lock and create its four AVL trees of zio_t elements.
 *
 *   vq_deadline_tree  ordered by vdev_queue_deadline_compare, linked
 *                     through zio.io_deadline_node
 *   vq_read_tree      ordered by vdev_queue_offset_compare, linked
 *   vq_write_tree     through zio.io_offset_node
 *   vq_pending_tree
 *
 * NOTE(review): the return type and the function braces are not present
 * in this extract.
 */
105 vdev_queue_init(vdev_t
*vd
)
107 vdev_queue_t
*vq
= &vd
->vdev_queue
;
109 mutex_init(&vq
->vq_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
111 avl_create(&vq
->vq_deadline_tree
, vdev_queue_deadline_compare
,
112 sizeof (zio_t
), offsetof(struct zio
, io_deadline_node
));
/*
 * The three offset-ordered trees below all link through io_offset_node,
 * so presumably a zio sits on at most one of them at a time -- confirm.
 */
114 avl_create(&vq
->vq_read_tree
, vdev_queue_offset_compare
,
115 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
117 avl_create(&vq
->vq_write_tree
, vdev_queue_offset_compare
,
118 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
120 avl_create(&vq
->vq_pending_tree
, vdev_queue_offset_compare
,
121 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
/*
 * Tear down the I/O queue embedded in vd: destroy the four AVL trees
 * that vdev_queue_init() created, then destroy the queue lock.
 *
 * NOTE(review): the return type and the function braces are not present
 * in this extract.
 */
125 vdev_queue_fini(vdev_t
*vd
)
127 vdev_queue_t
*vq
= &vd
->vdev_queue
;
129 avl_destroy(&vq
->vq_deadline_tree
);
130 avl_destroy(&vq
->vq_read_tree
);
131 avl_destroy(&vq
->vq_write_tree
);
132 avl_destroy(&vq
->vq_pending_tree
);
134 mutex_destroy(&vq
->vq_lock
);
/*
 * Queue zio on vq: insert it into the deadline-ordered tree and into the
 * tree zio->io_vdev_tree points at (vdev_queue_io() sets that pointer to
 * vq_read_tree or vq_write_tree before calling here).
 *
 * NOTE(review): the return type and the function braces are not present
 * in this extract.
 */
138 vdev_queue_io_add(vdev_queue_t
*vq
, zio_t
*zio
)
140 avl_add(&vq
->vq_deadline_tree
, zio
);
141 avl_add(zio
->io_vdev_tree
, zio
);
/*
 * Undo vdev_queue_io_add(): take zio out of the deadline-ordered tree
 * and out of the tree zio->io_vdev_tree points at.
 *
 * NOTE(review): the return type and the function braces are not present
 * in this extract.
 */
145 vdev_queue_io_remove(vdev_queue_t
*vq
, zio_t
*zio
)
147 avl_remove(&vq
->vq_deadline_tree
, zio
);
148 avl_remove(zio
->io_vdev_tree
, zio
);
/*
 * Done callback installed on aggregate i/os: vdev_queue_io_to_issue()
 * passes it to zio_vdev_child_io(), and callers recognise an aggregate by
 * comparing io_done against this function.
 *
 * Walks aio->io_delegate_list.  For a read, data is copied from the
 * aggregate buffer at the running offset into the delegate's io_data.
 * Each delegate is unlinked from the list and inherits aio->io_error.
 * Once the list is empty the running offset is asserted to equal
 * aio->io_size and the aggregate buffer is released with zio_buf_free().
 *
 * NOTE(review): the return type, the declarations of dio/offset, and
 * whatever is done with each delegate after io_error is set are missing
 * from this extract.
 */
152 vdev_queue_agg_io_done(zio_t
*aio
)
/* Pop delegates off the head of the list until none remain. */
157 while ((dio
= aio
->io_delegate_list
) != NULL
) {
158 if (aio
->io_type
== ZIO_TYPE_READ
)
159 bcopy((char *)aio
->io_data
+ offset
, dio
->io_data
,
/*
 * NOTE(review): the bcopy() length argument is missing from this extract;
 * presumably dio->io_size -- confirm.
 */
161 offset
+= dio
->io_size
;
/* Unlink dio from the delegate list. */
162 aio
->io_delegate_list
= dio
->io_delegate_next
;
163 dio
->io_delegate_next
= NULL
;
/* Every delegate reports the aggregate's error status. */
164 dio
->io_error
= aio
->io_error
;
/* The delegates together must account for the whole aggregate. */
167 ASSERT3U(offset
, ==, aio
->io_size
);
169 zio_buf_free(aio
->io_data
, aio
->io_size
);
/*
 * True when nio starts at the exact byte offset where io ends, i.e.
 * io->io_offset + io->io_size equals nio->io_offset.  Both arguments are
 * pointers to objects with io_offset/io_size members; io is evaluated
 * twice, so pass expressions without side effects.
 */
#define	IS_ADJACENT(io, nio) \
	((io)->io_offset + (io)->io_size == (nio)->io_offset)
/*
 * Pick the next i/o to issue from vq, aggregating offset-adjacent
 * neighbours into one larger i/o when possible.  The caller must hold
 * vq->vq_lock (asserted below).
 *
 * Visible steps:
 *   - do nothing further if vq_pending_tree already holds pending_limit
 *     entries or vq_deadline_tree is empty;
 *   - start from the first entry of vq_deadline_tree;
 *   - extend the run backwards and forwards through the entry's
 *     io_vdev_tree while neighbours are IS_ADJACENT and the combined
 *     size stays within zfs_vdev_aggregation_limit;
 *   - aggregate path: allocate a buffer, build an aggregate i/o whose
 *     done callback is vdev_queue_agg_io_done, gather write data, remove
 *     the delegates from the queue and bypass them, and add the
 *     aggregate to vq_pending_tree;
 *   - single path: remove fio from the queue and add it to
 *     vq_pending_tree.
 *
 * NOTE(review): the return type, several local declarations and
 * initialisations (tree, size, offset, nagg), the loop-body statements
 * that advance fio/lio, the condition selecting the aggregate path, and
 * all return statements are missing from this extract.
 */
176 vdev_queue_io_to_issue(vdev_queue_t
*vq
, uint64_t pending_limit
)
178 zio_t
*fio
, *lio
, *aio
, *dio
;
182 ASSERT(MUTEX_HELD(&vq
->vq_lock
));
/*
 * Nothing to issue when the device is already at its pending limit or
 * nothing is queued.  (The statement taken in that case is not visible.)
 */
184 if (avl_numnodes(&vq
->vq_pending_tree
) >= pending_limit
||
185 avl_numnodes(&vq
->vq_deadline_tree
) == 0)
/* fio and lio mark the first and last i/o of the candidate run. */
188 fio
= lio
= avl_first(&vq
->vq_deadline_tree
);
/* The offset-ordered tree (read or write) this i/o was queued on. */
190 tree
= fio
->io_vdev_tree
;
/*
 * Look backwards in offset order: an adjacent predecessor that still
 * fits under the aggregation limit is chained in front of fio.
 */
193 while ((dio
= AVL_PREV(tree
, fio
)) != NULL
&& IS_ADJACENT(dio
, fio
) &&
194 size
+ dio
->io_size
<= zfs_vdev_aggregation_limit
) {
195 dio
->io_delegate_next
= fio
;
197 size
+= dio
->io_size
;
/*
 * Look forwards in offset order: an adjacent successor that still fits
 * under the aggregation limit is chained behind lio.
 */
200 while ((dio
= AVL_NEXT(tree
, lio
)) != NULL
&& IS_ADJACENT(lio
, dio
) &&
201 size
+ dio
->io_size
<= zfs_vdev_aggregation_limit
) {
202 lio
->io_delegate_next
= dio
;
204 size
+= dio
->io_size
;
/*
 * Aggregate path: one buffer covers the whole run.
 * NOTE(review): the condition guarding this path is not visible here.
 */
208 char *buf
= zio_buf_alloc(size
);
212 ASSERT(size
<= zfs_vdev_aggregation_limit
);
/*
 * Build the aggregate i/o via zio_vdev_child_io(), starting at
 * fio->io_offset, spanning size bytes of buf, with
 * vdev_queue_agg_io_done as its done callback.
 */
214 aio
= zio_vdev_child_io(fio
, NULL
, fio
->io_vd
,
215 fio
->io_offset
, buf
, size
, fio
->io_type
,
216 ZIO_PRIORITY_NOW
, ZIO_FLAG_DONT_QUEUE
|
217 ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_PROPAGATE
|
219 vdev_queue_agg_io_done
, NULL
);
/* The chain headed by fio becomes the aggregate's delegate list. */
221 aio
->io_delegate_list
= fio
;
/*
 * For each delegate: gather its payload into buf when writing, take it
 * off the queue trees, and hand it to zio_vdev_io_bypass().
 */
223 for (dio
= fio
; dio
!= NULL
; dio
= dio
->io_delegate_next
) {
224 ASSERT(dio
->io_type
== aio
->io_type
);
225 ASSERT(dio
->io_vdev_tree
== tree
);
226 if (dio
->io_type
== ZIO_TYPE_WRITE
)
227 bcopy(dio
->io_data
, buf
+ offset
, dio
->io_size
);
228 offset
+= dio
->io_size
;
229 vdev_queue_io_remove(vq
, dio
);
230 zio_vdev_io_bypass(dio
);
/* The delegates must fill the aggregate buffer exactly. */
234 ASSERT(offset
== size
);
236 dprintf("%5s T=%llu off=%8llx agg=%3d "
237 "old=%5llx new=%5llx\n",
238 zio_type_name
[fio
->io_type
],
239 fio
->io_deadline
, fio
->io_offset
, nagg
, fio
->io_size
, size
);
/* The aggregate is now tracked as pending. */
241 avl_add(&vq
->vq_pending_tree
, aio
);
/* Single i/o path: move fio from the queue trees to the pending tree. */
246 ASSERT(fio
->io_vdev_tree
== tree
);
247 vdev_queue_io_remove(vq
, fio
);
249 avl_add(&vq
->vq_pending_tree
, fio
);
/*
 * Submit a read or write zio to the queue of its vdev (zio->io_vd).
 *
 * Visible steps: test ZIO_FLAG_DONT_QUEUE; set ZIO_FLAG_DONT_CACHE and
 * ZIO_FLAG_DONT_QUEUE on the zio; point io_vdev_tree at vq_read_tree for
 * reads or vq_write_tree otherwise; then, under vq_lock, compute
 * io_deadline, add the zio to the queue and ask
 * vdev_queue_io_to_issue() for an i/o with zfs_vdev_min_pending as the
 * pending limit.  After dropping the lock the result is checked for being
 * an aggregate (io_done == vdev_queue_agg_io_done).
 *
 * NOTE(review): the return type, the declaration of nio, the bodies of
 * the conditionals, the second operand of the deadline sum and all
 * return statements are missing from this extract.
 */
255 vdev_queue_io(zio_t
*zio
)
257 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
260 ASSERT(zio
->io_type
== ZIO_TYPE_READ
|| zio
->io_type
== ZIO_TYPE_WRITE
);
/*
 * NOTE(review): the action taken for an i/o that already carries
 * ZIO_FLAG_DONT_QUEUE is not visible in this extract.
 */
262 if (zio
->io_flags
& ZIO_FLAG_DONT_QUEUE
)
265 zio
->io_flags
|= ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_QUEUE
;
/* Choose the offset-ordered tree matching the i/o type. */
267 if (zio
->io_type
== ZIO_TYPE_READ
)
268 zio
->io_vdev_tree
= &vq
->vq_read_tree
;
270 zio
->io_vdev_tree
= &vq
->vq_write_tree
;
272 mutex_enter(&vq
->vq_lock
);
/*
 * Deadline is the timestamp scaled down by zfs_vdev_time_shift plus a
 * second term that is cut off in this extract (the tunable's comment,
 * "deadline = pri + (lbolt >> time_shift)", suggests a priority).
 */
274 zio
->io_deadline
= (zio
->io_timestamp
>> zfs_vdev_time_shift
) +
277 vdev_queue_io_add(vq
, zio
);
279 nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_min_pending
);
281 mutex_exit(&vq
->vq_lock
);
/* Aggregate i/os are identified by their done callback. */
286 if (nio
->io_done
== vdev_queue_agg_io_done
) {
/*
 * Completion hook for an i/o that was pending on its vdev's queue.
 *
 * Under vq_lock, removes zio from vq_pending_tree and then makes up to
 * zfs_vdev_ramp_rate requests to vdev_queue_io_to_issue(), each with
 * zfs_vdev_max_pending as the pending limit.  The lock is dropped while
 * each returned i/o is handled and re-taken before the next iteration.
 *
 * NOTE(review): the return type, the declarations of nio/i, the loop
 * exit taken when nothing is returned, and the bodies of the aggregate
 * and non-aggregate branches are missing from this extract.
 */
295 vdev_queue_io_done(zio_t
*zio
)
297 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
301 mutex_enter(&vq
->vq_lock
);
303 avl_remove(&vq
->vq_pending_tree
, zio
);
/* Each completion may start up to zfs_vdev_ramp_rate further i/os. */
305 for (i
= 0; i
< zfs_vdev_ramp_rate
; i
++) {
306 nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_max_pending
);
/* The queue lock is not held while nio is handled. */
309 mutex_exit(&vq
->vq_lock
);
/* Aggregate i/os are identified by their done callback. */
310 if (nio
->io_done
== vdev_queue_agg_io_done
) {
/*
 * NOTE(review): which branch the reissue call below belongs to is not
 * visible in this extract.
 */
313 zio_vdev_io_reissue(nio
);
316 mutex_enter(&vq
->vq_lock
);
319 mutex_exit(&vq
->vq_lock
);