4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/zfs_context.h>
28 #include <sys/vdev_impl.h>
/*
 * These tunables are for performance analysis.
 */

/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * zfs_vdev_max_pending).
 *
 * vdev_queue_io() issues against the min limit when an i/o is first
 * queued; vdev_queue_io_done() issues against the max limit each time
 * an i/o completes.
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/* deadline = pri + (lbolt >> time_shift) */
int zfs_vdev_time_shift = 6;

/*
 * exponential I/O issue ramp-up rate: the number of issue attempts
 * vdev_queue_io_done() makes for every i/o that completes.
 */
int zfs_vdev_ramp_rate = 2;
51 * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
52 * For read i/os, we also aggregate across small adjacency gaps.
54 int zfs_vdev_aggregation_limit
= SPA_MAXBLOCKSIZE
;
55 int zfs_vdev_read_gap_limit
= 32 << 10;
/*
 * Virtual device vector for disk I/O scheduling.
 */
/*
 * Comparison callback handed to avl_create() for vq_deadline_tree, whose
 * nodes are zio_t (see vdev_queue_init()).  The visible tests compare the
 * two entries by io_deadline first and by io_offset second.
 *
 * NOTE(review): the declarations of z1/z2 and the statement guarded by
 * each test are not visible in this excerpt; presumably z1/z2 are x1/x2
 * viewed as zio_t and each test returns -1 or 1 -- confirm against the
 * complete source before relying on this.
 */
61 vdev_queue_deadline_compare(const void *x1
, const void *x2
)
/* Primary key: io_deadline. */
66 if (z1
->io_deadline
< z2
->io_deadline
)
68 if (z1
->io_deadline
> z2
->io_deadline
)
/* Secondary key: io_offset. */
71 if (z1
->io_offset
< z2
->io_offset
)
73 if (z1
->io_offset
> z2
->io_offset
)
/*
 * Comparison callback handed to avl_create() for vq_read_tree,
 * vq_write_tree and vq_pending_tree (see vdev_queue_init()).  The visible
 * tests compare the two entries by io_offset.
 *
 * NOTE(review): the declarations of z1/z2 and the statement guarded by
 * each test are not visible in this excerpt; presumably each test returns
 * -1 or 1 -- confirm against the complete source.
 */
85 vdev_queue_offset_compare(const void *x1
, const void *x2
)
/* Sole visible key: io_offset. */
90 if (z1
->io_offset
< z2
->io_offset
)
92 if (z1
->io_offset
> z2
->io_offset
)
104 vdev_queue_init(vdev_t
*vd
)
106 vdev_queue_t
*vq
= &vd
->vdev_queue
;
108 mutex_init(&vq
->vq_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
110 avl_create(&vq
->vq_deadline_tree
, vdev_queue_deadline_compare
,
111 sizeof (zio_t
), offsetof(struct zio
, io_deadline_node
));
113 avl_create(&vq
->vq_read_tree
, vdev_queue_offset_compare
,
114 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
116 avl_create(&vq
->vq_write_tree
, vdev_queue_offset_compare
,
117 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
119 avl_create(&vq
->vq_pending_tree
, vdev_queue_offset_compare
,
120 sizeof (zio_t
), offsetof(struct zio
, io_offset_node
));
124 vdev_queue_fini(vdev_t
*vd
)
126 vdev_queue_t
*vq
= &vd
->vdev_queue
;
128 avl_destroy(&vq
->vq_deadline_tree
);
129 avl_destroy(&vq
->vq_read_tree
);
130 avl_destroy(&vq
->vq_write_tree
);
131 avl_destroy(&vq
->vq_pending_tree
);
133 mutex_destroy(&vq
->vq_lock
);
137 vdev_queue_io_add(vdev_queue_t
*vq
, zio_t
*zio
)
139 avl_add(&vq
->vq_deadline_tree
, zio
);
140 avl_add(zio
->io_vdev_tree
, zio
);
144 vdev_queue_io_remove(vdev_queue_t
*vq
, zio_t
*zio
)
146 avl_remove(&vq
->vq_deadline_tree
, zio
);
147 avl_remove(zio
->io_vdev_tree
, zio
);
/*
 * Done callback of an aggregate i/o; vdev_queue_io_to_issue() passes it
 * to zio_vdev_delegated_io() when it builds the aggregate.
 *
 * For each parent returned by zio_walk_parents(aio), and only when the
 * aggregate is a read, the parent's slice of the aggregate buffer is
 * copied into the parent's own io_data.  The buffer of the aggregate is
 * then released with zio_buf_free().
 *
 * NOTE(review): the declaration of pio and the function's return type are
 * not visible in this excerpt.
 */
151 vdev_queue_agg_io_done(zio_t
*aio
)
155 while ((pio
= zio_walk_parents(aio
)) != NULL
)
156 if (aio
->io_type
== ZIO_TYPE_READ
)
/*
 * Source address is the aggregate buffer advanced by the parent's offset
 * relative to the start of the aggregate; length is the parent's io_size.
 */
157 bcopy((char *)aio
->io_data
+ (pio
->io_offset
-
158 aio
->io_offset
), pio
->io_data
, pio
->io_size
);
/* Free the buffer obtained from zio_buf_alloc() when the aggregate was built. */
160 zio_buf_free(aio
->io_data
, aio
->io_size
);
/*
 * Compute the range spanned by two i/os, which is the endpoint of the last
 * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
 * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 *
 * Both macros evaluate each argument more than once, so pass plain
 * pointers without side effects.
 */
#define	IO_SPAN(fio, lio) \
	((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
/*
 * Choose the next i/o to hand to the device, merging neighbouring queued
 * i/os into one aggregate where the visible conditions allow it.
 *
 *	vq		queue to pick from; vq_lock must be held (asserted)
 *	pending_limit	compared against the size of vq_pending_tree
 *
 * Whatever is selected (the aggregate, or the single i/o fio) is added to
 * vq_pending_tree.
 *
 * NOTE(review): several lines of this function are not visible in this
 * excerpt -- the declarations of t, flags and maxgap, the statement under
 * the first test, the bodies of the two scan loops, the test that selects
 * the aggregate path, the head of the do/while loop and the return
 * statements.  The comments below describe only what is visible.
 */
173 vdev_queue_io_to_issue(vdev_queue_t
*vq
, uint64_t pending_limit
)
175 zio_t
*fio
, *lio
, *aio
, *dio
, *nio
;
178 uint64_t maxspan
= zfs_vdev_aggregation_limit
;
181 ASSERT(MUTEX_HELD(&vq
->vq_lock
));
/* Pending tree already at the limit, or nothing queued at all. */
183 if (avl_numnodes(&vq
->vq_pending_tree
) >= pending_limit
||
184 avl_numnodes(&vq
->vq_deadline_tree
) == 0)
/* Start from the first entry of the deadline tree. */
187 fio
= lio
= avl_first(&vq
->vq_deadline_tree
);
/*
 * t is the offset-sorted tree that i/o lives on; flags keeps only the
 * AGG_INHERIT bits; gaps are tolerated only on the read tree.
 */
189 t
= fio
->io_vdev_tree
;
190 flags
= fio
->io_flags
& ZIO_FLAG_AGG_INHERIT
;
191 maxgap
= (t
== &vq
->vq_read_tree
) ? zfs_vdev_read_gap_limit
: 0;
193 if (!(flags
& ZIO_FLAG_DONT_AGGREGATE
)) {
195 * We can aggregate I/Os that are adjacent and of the
196 * same flavor, as expressed by the AGG_INHERIT flags.
197 * The latter is necessary so that certain attributes
198 * of the I/O, such as whether it's a normal I/O or a
199 * scrub/resilver, can be preserved in the aggregate.
/*
 * Scan towards lower offsets: continue while a predecessor exists, has the
 * same inherited flags, keeps the span within maxspan and the gap to fio
 * within maxgap.
 */
201 while ((dio
= AVL_PREV(t
, fio
)) != NULL
&&
202 (dio
->io_flags
& ZIO_FLAG_AGG_INHERIT
) == flags
&&
203 IO_SPAN(dio
, lio
) <= maxspan
&& IO_GAP(dio
, fio
) <= maxgap
)
/* Same scan towards higher offsets, starting from lio. */
206 while ((dio
= AVL_NEXT(t
, lio
)) != NULL
&&
207 (dio
->io_flags
& ZIO_FLAG_AGG_INHERIT
) == flags
&&
208 IO_SPAN(fio
, dio
) <= maxspan
&& IO_GAP(lio
, dio
) <= maxgap
)
/* Byte span covered by fio..lio; asserted to fit the aggregation limit. */
213 uint64_t size
= IO_SPAN(fio
, lio
);
214 ASSERT(size
<= zfs_vdev_aggregation_limit
);
/*
 * Build the aggregate: same vdev, offset and type as fio, a freshly
 * allocated buffer of the full span, and vdev_queue_agg_io_done as its
 * done callback.
 */
216 aio
= zio_vdev_delegated_io(fio
->io_vd
, fio
->io_offset
,
217 zio_buf_alloc(size
), size
, fio
->io_type
, ZIO_PRIORITY_NOW
,
218 flags
| ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_QUEUE
,
219 vdev_queue_agg_io_done
, NULL
);
/* Remember dio's successor; dio is taken off the tree further down. */
224 nio
= AVL_NEXT(t
, dio
);
225 ASSERT(dio
->io_type
== aio
->io_type
);
226 ASSERT(dio
->io_vdev_tree
== t
);
/*
 * For writes, copy this i/o's data into the aggregate buffer at its
 * offset relative to the start of the aggregate.
 */
228 if (dio
->io_type
== ZIO_TYPE_WRITE
)
229 bcopy(dio
->io_data
, (char *)aio
->io_data
+
230 (dio
->io_offset
- aio
->io_offset
),
/* Link dio to the aggregate, take it off the queue trees, then bypass it. */
233 zio_add_child(dio
, aio
);
234 vdev_queue_io_remove(vq
, dio
);
235 zio_vdev_io_bypass(dio
);
237 } while (dio
!= lio
);
/* The aggregate itself is what goes on the pending tree. */
239 avl_add(&vq
->vq_pending_tree
, aio
);
/* Single i/o path: move fio from the queue trees to the pending tree. */
244 ASSERT(fio
->io_vdev_tree
== t
);
245 vdev_queue_io_remove(vq
, fio
);
247 avl_add(&vq
->vq_pending_tree
, fio
);
/*
 * Entry point for queueing a read or write zio on its vdev's queue.
 *
 * The zio is marked DONT_CACHE | DONT_QUEUE, pointed at the queue's read
 * or write tree, given a deadline and inserted under vq_lock; one
 * vdev_queue_io_to_issue() attempt is then made against
 * zfs_vdev_min_pending.
 *
 * NOTE(review): the declaration of nio, the statement under the
 * DONT_QUEUE test, the `else` between the two io_vdev_tree assignments,
 * the handling of nio after the lock is dropped and the return statements
 * are not visible in this excerpt.
 */
253 vdev_queue_io(zio_t
*zio
)
255 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
/* Only reads and writes are expected here. */
258 ASSERT(zio
->io_type
== ZIO_TYPE_READ
|| zio
->io_type
== ZIO_TYPE_WRITE
);
/* The flag tested here is the one this function sets just below. */
260 if (zio
->io_flags
& ZIO_FLAG_DONT_QUEUE
)
263 zio
->io_flags
|= ZIO_FLAG_DONT_CACHE
| ZIO_FLAG_DONT_QUEUE
;
/* Select the offset-sorted tree this zio is queued on. */
265 if (zio
->io_type
== ZIO_TYPE_READ
)
266 zio
->io_vdev_tree
= &vq
->vq_read_tree
;
268 zio
->io_vdev_tree
= &vq
->vq_write_tree
;
270 mutex_enter(&vq
->vq_lock
);
/* deadline = (lbolt64 >> zfs_vdev_time_shift) + priority */
272 zio
->io_deadline
= (lbolt64
>> zfs_vdev_time_shift
) + zio
->io_priority
;
274 vdev_queue_io_add(vq
, zio
);
/* Initial issue attempt uses the min-pending limit. */
276 nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_min_pending
);
278 mutex_exit(&vq
->vq_lock
);
/* An aggregate is recognised by its done callback. */
283 if (nio
->io_done
== vdev_queue_agg_io_done
) {
/*
 * Called when a zio that was on the pending tree completes.  Under
 * vq_lock the zio is removed from vq_pending_tree; then up to
 * zfs_vdev_ramp_rate times another i/o is requested from
 * vdev_queue_io_to_issue() against zfs_vdev_max_pending.  The lock is
 * dropped while each returned i/o is handled and re-taken before the next
 * iteration.
 *
 * NOTE(review): the handling of a NULL result from
 * vdev_queue_io_to_issue(), the bodies of both branches of the aggregate
 * test and the closing lines of the function are not visible in this
 * excerpt.
 */
292 vdev_queue_io_done(zio_t
*zio
)
294 vdev_queue_t
*vq
= &zio
->io_vd
->vdev_queue
;
296 mutex_enter(&vq
->vq_lock
);
298 avl_remove(&vq
->vq_pending_tree
, zio
);
/* Each completion makes up to zfs_vdev_ramp_rate issue attempts. */
300 for (int i
= 0; i
< zfs_vdev_ramp_rate
; i
++) {
301 zio_t
*nio
= vdev_queue_io_to_issue(vq
, zfs_vdev_max_pending
);
/* Drop the queue lock while nio is handled. */
304 mutex_exit(&vq
->vq_lock
);
/* An aggregate is recognised by its done callback. */
305 if (nio
->io_done
== vdev_queue_agg_io_done
) {
308 zio_vdev_io_reissue(nio
);
/* Re-take the lock for the next iteration. */
311 mutex_enter(&vq
->vq_lock
);
314 mutex_exit(&vq
->vq_lock
);