4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/zfs_context.h>
28 #include <sys/vdev_impl.h>
33 * These tunables are for performance analysis.
36 * zfs_vdev_max_pending is the maximum number of i/os concurrently
37 * pending to each device. zfs_vdev_min_pending is the initial number
38 * of i/os pending to each device (before it starts ramping up to
41 int zfs_vdev_max_pending
= 35;
42 int zfs_vdev_min_pending
= 4;
44 /* deadline = pri + (lbolt >> time_shift) */
45 int zfs_vdev_time_shift
= 6;
47 /* exponential I/O issue ramp-up rate */
48 int zfs_vdev_ramp_rate
= 2;
51 * i/os will be aggregated into a single large i/o up to
52 * zfs_vdev_aggregation_limit bytes long.
54 int zfs_vdev_aggregation_limit
= SPA_MAXBLOCKSIZE
;
57 * Virtual device vector for disk I/O scheduling.
/*
 * Ordering callback that vdev_queue_init() passes to avl_create() for
 * vq_deadline_tree.  The comparisons that are visible order z1 and z2 by
 * io_deadline first and, when the deadlines are equal, by io_offset.
 *
 * NOTE(review): this text is an incomplete extraction of the function.
 * The leading number on each statement is the upstream line number, and
 * the sequence jumps from 60 to 65 and from 72 to 84.  The return type,
 * the declarations of z1 and z2, every return statement and whatever
 * follows the io_offset comparison are therefore not present here.
 * Restore the missing lines from the upstream file before building or
 * changing this function.
 */
60 vdev_queue_deadline_compare(const void *x1
, const void *x2
)
65 if (z1
->io_deadline
< z2
->io_deadline
)
67 if (z1
->io_deadline
> z2
->io_deadline
)
70 if (z1
->io_offset
< z2
->io_offset
)
72 if (z1
->io_offset
> z2
->io_offset
)
/*
 * Ordering callback that vdev_queue_init() passes to avl_create() for
 * vq_read_tree, vq_write_tree and vq_pending_tree.  The comparisons that
 * are visible order z1 and z2 by io_offset.
 *
 * NOTE(review): this text is an incomplete extraction of the function.
 * The leading number on each statement is the upstream line number, and
 * the sequence jumps from 84 to 89 and from 91 to 103.  The return type,
 * the declarations of z1 and z2, every return statement and whatever
 * follows the io_offset comparison are therefore not present here.
 * Restore the missing lines from the upstream file before building or
 * changing this function.
 */
84 vdev_queue_offset_compare(const void *x1
, const void *x2
)
89 if (z1
->io_offset
< z2
->io_offset
)
91 if (z1
->io_offset
> z2
->io_offset
)
103 vdev_queue_init(vdev_t
*vd
)
105 vdev_queue_t
*vq
= &vd
->vdev_queue
;
107 mutex_init(&vq
->vq_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
109 avl_create(&vq
->vq_deadline_tree
, vdev_queue_deadline_compare
,
110 sizeof (zio_t
), offsetof(struct zio
, io_deadline_node
));
112 avl_create(&vq
->vq_read_tree
, vdev_queue_offset_compare
,
113 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
115 avl_create(&vq
->vq_write_tree
, vdev_queue_offset_compare
,
116 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
118 avl_create(&vq
->vq_pending_tree
, vdev_queue_offset_compare
,
119 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
123 vdev_queue_fini(vdev_t
*vd
)
125 vdev_queue_t
*vq
= &vd
->vdev_queue
;
127 avl_destroy(&vq
->vq_deadline_tree
);
128 avl_destroy(&vq
->vq_read_tree
);
129 avl_destroy(&vq
->vq_write_tree
);
130 avl_destroy(&vq
->vq_pending_tree
);
132 mutex_destroy(&vq
->vq_lock
);
136 vdev_queue_io_add(vdev_queue_t
*vq
, zio_t
*zio
)
138 avl_add(&vq
->vq_deadline_tree
, zio
);
139 avl_add(zio
->io_vdev_tree
, zio
);
143 vdev_queue_io_remove(vdev_queue_t
*vq
, zio_t
*zio
)
145 avl_remove(&vq
->vq_deadline_tree
, zio
);
146 avl_remove(zio
->io_vdev_tree
, zio
);
150 vdev_queue_agg_io_done(zio_t
*aio
)
154 while ((pio
= zio_walk_parents(aio
)) != NULL
)
155 if (aio
->io_type
== ZIO_TYPE_READ
)
156 bcopy((char *)aio
->io_data
+ (pio
->io_offset
-
157 aio
->io_offset
), pio
->io_data
, pio
->io_size
);
159 zio_buf_free(aio
->io_data
, aio
->io_size
);
/*
 * True when nio starts at the byte immediately after the end of io, i.e.
 * io->io_offset + io->io_size == nio->io_offset.  Both arguments are
 * pointers; io is evaluated twice, so do not pass an expression with side
 * effects.
 */
#define	IS_ADJACENT(io, nio) \
	((io)->io_offset + (io)->io_size == (nio)->io_offset)
/*
 * Choose the next I/O to hand to the device and move it from the queue
 * trees to vq_pending_tree.  The caller must hold vq_lock (asserted).
 *
 * What the visible statements do:
 *  - test whether vq_pending_tree already holds pending_limit or more
 *    entries, or vq_deadline_tree is empty;
 *  - start from the first entry of vq_deadline_tree (fio == lio), taking
 *    its io_vdev_tree as t and its io_flags masked with
 *    ZIO_FLAG_AGG_INHERIT as flags;
 *  - unless ZIO_FLAG_DONT_AGGREGATE is set in flags, scan backwards from
 *    fio and forwards from lio in t for I/Os that are IS_ADJACENT, carry
 *    the same AGG_INHERIT flags and keep the running size within
 *    zfs_vdev_aggregation_limit, adding each one's io_size to size;
 *  - on the aggregate path, create a delegated I/O (aio) covering the
 *    run, with a zio_buf_alloc(size) buffer and vdev_queue_agg_io_done as
 *    its done callback; then for each member from fio up to and including
 *    the original lio: for writes copy its data into the aggregate
 *    buffer, make it a child of aio via zio_add_child(dio, aio), remove
 *    it from the queue trees and call zio_vdev_io_bypass(); finally add
 *    aio to vq_pending_tree;
 *  - otherwise remove fio from the queue trees and add it to
 *    vq_pending_tree.
 *
 * NOTE(review): this text is an incomplete extraction of the function.
 * The leading number on each statement is the upstream line number, and
 * many numbers are skipped (for example 169-172, 177-178, 182, 197, 204,
 * 209, 226, 232, 237 and 245).  The return type, the declarations of t,
 * flags and size, the statement guarded by the first test, the parts of
 * the two scan loops that advance fio and lio, the condition that selects
 * the aggregate path, the last bcopy argument and every return statement
 * are not present here.  Restore the missing lines from the upstream file
 * before building or changing this function.
 */
166 vdev_queue_io_to_issue(vdev_queue_t
*vq
, uint64_t pending_limit
)
168 zio_t
*fio
, *lio
, *aio
, *dio
, *nio
;
173 ASSERT(MUTEX_HELD(&vq
->vq_lock
));
175 if (avl_numnodes(&vq
->vq_pending_tree
) >= pending_limit
||
176 avl_numnodes(&vq
->vq_deadline_tree
) == 0)
179 fio
= lio
= avl_first(&vq
->vq_deadline_tree
);
181 t
= fio
->io_vdev_tree
;
183 flags
= fio
->io_flags
& ZIO_FLAG_AGG_INHERIT
;
185 if (!(flags
& ZIO_FLAG_DONT_AGGREGATE
)) {
187 * We can aggregate I/Os that are adjacent and of the
188 * same flavor, as expressed by the AGG_INHERIT flags.
189 * The latter is necessary so that certain attributes
190 * of the I/O, such as whether it's a normal I/O or a
191 * scrub/resilver, can be preserved in the aggregate.
193 while ((dio
= AVL_PREV(t
, fio
)) != NULL
&&
194 IS_ADJACENT(dio
, fio
) &&
195 (dio
->io_flags
& ZIO_FLAG_AGG_INHERIT
) == flags
&&
196 size
+ dio
->io_size
<= zfs_vdev_aggregation_limit
) {
198 size
+= dio
->io_size
;
200 while ((dio
= AVL_NEXT(t
, lio
)) != NULL
&&
201 IS_ADJACENT(lio
, dio
) &&
202 (dio
->io_flags
& ZIO_FLAG_AGG_INHERIT
) == flags
&&
203 size
+ dio
->io_size
<= zfs_vdev_aggregation_limit
) {
205 size
+= dio
->io_size
;
210 ASSERT(size
<= zfs_vdev_aggregation_limit
);
212 aio
= zio_vdev_delegated_io(fio
->io_vd
, fio
->io_offset
,
213 zio_buf_alloc(size
), size
, fio
->io_type
, ZIO_PRIORITY_NOW
,
214 flags
| ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_QUEUE
,
215 vdev_queue_agg_io_done
, NULL
);
217 /* We want to process lio, then stop */
218 lio
= AVL_NEXT(t
, lio
);
219 for (dio
= fio
; dio
!= lio
; dio
= nio
) {
220 ASSERT(dio
->io_type
== aio
->io_type
);
221 ASSERT(dio
->io_vdev_tree
== t
);
223 if (dio
->io_type
== ZIO_TYPE_WRITE
)
224 bcopy(dio
->io_data
, (char *)aio
->io_data
+
225 (dio
->io_offset
- aio
->io_offset
),
227 nio
= AVL_NEXT(t
, dio
);
229 zio_add_child(dio
, aio
);
230 vdev_queue_io_remove(vq
, dio
);
231 zio_vdev_io_bypass(dio
);
235 avl_add(&vq
->vq_pending_tree
, aio
);
240 ASSERT(fio
->io_vdev_tree
== t
);
241 vdev_queue_io_remove(vq
, fio
);
243 avl_add(&vq
->vq_pending_tree
, fio
);
/*
 * Entry point for submitting a read or write zio to the queue of its
 * vdev (zio->io_vd->vdev_queue).
 *
 * What the visible statements do:
 *  - assert that io_type is ZIO_TYPE_READ or ZIO_TYPE_WRITE;
 *  - test ZIO_FLAG_DONT_QUEUE in io_flags;
 *  - set ZIO_FLAG_DONT_CACHE and ZIO_FLAG_DONT_QUEUE on the zio;
 *  - point io_vdev_tree at vq_read_tree or vq_write_tree according to
 *    io_type;
 *  - with vq_lock held: set io_deadline to
 *    (lbolt64 >> zfs_vdev_time_shift) + io_priority, add the zio to the
 *    queue with vdev_queue_io_add(), and ask vdev_queue_io_to_issue() for
 *    an I/O using zfs_vdev_min_pending as the pending limit;
 *  - after dropping the lock, test whether the I/O that came back has
 *    vdev_queue_agg_io_done as its io_done callback, which is how an
 *    aggregate built by vdev_queue_io_to_issue() is recognised.
 *
 * NOTE(review): this text is an incomplete extraction of the function.
 * The leading number on each statement is the upstream line number, and
 * numbers are skipped (for example 252, 257, 263, 275-278 and 280-286).
 * The return type, the declaration of nio, the statement guarded by the
 * DONT_QUEUE test, the else keyword between the two io_vdev_tree
 * assignments, the handling of the value returned by
 * vdev_queue_io_to_issue() and every return statement are not present
 * here.  Restore the missing lines from the upstream file before building
 * or changing this function.
 */
249 vdev_queue_io(zio_t
*zio
)
251 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
254 ASSERT(zio
->io_type
== ZIO_TYPE_READ
|| zio
->io_type
== ZIO_TYPE_WRITE
);
256 if (zio
->io_flags
& ZIO_FLAG_DONT_QUEUE
)
259 zio
->io_flags
|= ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_QUEUE
;
261 if (zio
->io_type
== ZIO_TYPE_READ
)
262 zio
->io_vdev_tree
= &vq
->vq_read_tree
;
264 zio
->io_vdev_tree
= &vq
->vq_write_tree
;
266 mutex_enter(&vq
->vq_lock
);
268 zio
->io_deadline
= (lbolt64
>> zfs_vdev_time_shift
) + zio
->io_priority
;
270 vdev_queue_io_add(vq
, zio
);
272 nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_min_pending
);
274 mutex_exit(&vq
->vq_lock
);
279 if (nio
->io_done
== vdev_queue_agg_io_done
) {
/*
 * Completion hook for a zio that was issued through the queue of its
 * vdev (zio->io_vd->vdev_queue).
 *
 * What the visible statements do:
 *  - take vq_lock and remove the zio from vq_pending_tree;
 *  - loop up to zfs_vdev_ramp_rate times, each time asking
 *    vdev_queue_io_to_issue() for another I/O with zfs_vdev_max_pending
 *    as the pending limit;
 *  - drop vq_lock while the selected I/O is dispatched and take it again
 *    afterwards; the dispatch distinguishes aggregates (io_done ==
 *    vdev_queue_agg_io_done) from other I/Os, and zio_vdev_io_reissue()
 *    is called on one of the two paths;
 *  - release vq_lock after the loop.
 *
 * NOTE(review): this text is an incomplete extraction of the function.
 * The leading number on each statement is the upstream line number, and
 * numbers are skipped (for example 298-299, 302-303 and 305-306).  The
 * return type, whatever is done with nio before the lock is dropped, the
 * body of the aggregate branch, the else keyword and the rest of the
 * other branch, and the closing braces are not present here.  Restore the
 * missing lines from the upstream file before building or changing this
 * function.
 */
288 vdev_queue_io_done(zio_t
*zio
)
290 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
292 mutex_enter(&vq
->vq_lock
);
294 avl_remove(&vq
->vq_pending_tree
, zio
);
296 for (int i
= 0; i
< zfs_vdev_ramp_rate
; i
++) {
297 zio_t
*nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_max_pending
);
300 mutex_exit(&vq
->vq_lock
);
301 if (nio
->io_done
== vdev_queue_agg_io_done
) {
304 zio_vdev_io_reissue(nio
);
307 mutex_enter(&vq
->vq_lock
);
310 mutex_exit(&vq
->vq_lock
);