]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
d164b209 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/zfs_context.h> |
27 | #include <sys/spa.h> | |
28 | #include <sys/vdev_impl.h> | |
29 | #include <sys/zio.h> | |
30 | #include <sys/avl.h> | |
31 | ||
/*
 * Tunables.  These exist for performance analysis.
 *
 * zfs_vdev_max_pending
 *	Upper bound on the number of i/os concurrently pending to each
 *	device.
 *
 * zfs_vdev_min_pending
 *	Number of i/os initially pending to each device, before the queue
 *	starts ramping up towards zfs_vdev_max_pending.
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/*
 * Shift applied to the clock when computing an i/o's deadline:
 *	deadline = pri + (lbolt >> time_shift)
 */
int zfs_vdev_time_shift = 6;

/*
 * Exponential i/o issue ramp-up rate: the maximum number of queued i/os
 * that are issued each time a pending i/o completes.
 */
int zfs_vdev_ramp_rate = 2;
49 | ||
50 | /* | |
9babb374 BB |
51 | * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. |
52 | * For read i/os, we also aggregate across small adjacency gaps. | |
34dc7c2f BB |
53 | */ |
54 | int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; | |
9babb374 | 55 | int zfs_vdev_read_gap_limit = 32 << 10; |
34dc7c2f BB |
56 | |
57 | /* | |
58 | * Virtual device vector for disk I/O scheduling. | |
59 | */ | |
60 | int | |
61 | vdev_queue_deadline_compare(const void *x1, const void *x2) | |
62 | { | |
63 | const zio_t *z1 = x1; | |
64 | const zio_t *z2 = x2; | |
65 | ||
66 | if (z1->io_deadline < z2->io_deadline) | |
67 | return (-1); | |
68 | if (z1->io_deadline > z2->io_deadline) | |
69 | return (1); | |
70 | ||
71 | if (z1->io_offset < z2->io_offset) | |
72 | return (-1); | |
73 | if (z1->io_offset > z2->io_offset) | |
74 | return (1); | |
75 | ||
76 | if (z1 < z2) | |
77 | return (-1); | |
78 | if (z1 > z2) | |
79 | return (1); | |
80 | ||
81 | return (0); | |
82 | } | |
83 | ||
84 | int | |
85 | vdev_queue_offset_compare(const void *x1, const void *x2) | |
86 | { | |
87 | const zio_t *z1 = x1; | |
88 | const zio_t *z2 = x2; | |
89 | ||
90 | if (z1->io_offset < z2->io_offset) | |
91 | return (-1); | |
92 | if (z1->io_offset > z2->io_offset) | |
93 | return (1); | |
94 | ||
95 | if (z1 < z2) | |
96 | return (-1); | |
97 | if (z1 > z2) | |
98 | return (1); | |
99 | ||
100 | return (0); | |
101 | } | |
102 | ||
103 | void | |
104 | vdev_queue_init(vdev_t *vd) | |
105 | { | |
106 | vdev_queue_t *vq = &vd->vdev_queue; | |
107 | ||
108 | mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); | |
109 | ||
110 | avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, | |
111 | sizeof (zio_t), offsetof(struct zio, io_deadline_node)); | |
112 | ||
113 | avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, | |
114 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
115 | ||
116 | avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, | |
117 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
118 | ||
119 | avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, | |
120 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
121 | } | |
122 | ||
123 | void | |
124 | vdev_queue_fini(vdev_t *vd) | |
125 | { | |
126 | vdev_queue_t *vq = &vd->vdev_queue; | |
127 | ||
128 | avl_destroy(&vq->vq_deadline_tree); | |
129 | avl_destroy(&vq->vq_read_tree); | |
130 | avl_destroy(&vq->vq_write_tree); | |
131 | avl_destroy(&vq->vq_pending_tree); | |
132 | ||
133 | mutex_destroy(&vq->vq_lock); | |
134 | } | |
135 | ||
136 | static void | |
137 | vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) | |
138 | { | |
139 | avl_add(&vq->vq_deadline_tree, zio); | |
140 | avl_add(zio->io_vdev_tree, zio); | |
141 | } | |
142 | ||
143 | static void | |
144 | vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) | |
145 | { | |
146 | avl_remove(&vq->vq_deadline_tree, zio); | |
147 | avl_remove(zio->io_vdev_tree, zio); | |
148 | } | |
149 | ||
150 | static void | |
151 | vdev_queue_agg_io_done(zio_t *aio) | |
152 | { | |
d164b209 | 153 | zio_t *pio; |
34dc7c2f | 154 | |
d164b209 | 155 | while ((pio = zio_walk_parents(aio)) != NULL) |
34dc7c2f | 156 | if (aio->io_type == ZIO_TYPE_READ) |
d164b209 BB |
157 | bcopy((char *)aio->io_data + (pio->io_offset - |
158 | aio->io_offset), pio->io_data, pio->io_size); | |
34dc7c2f BB |
159 | |
160 | zio_buf_free(aio->io_data, aio->io_size); | |
161 | } | |
162 | ||
9babb374 BB |
/*
 * IO_SPAN(fio, lio) is the range covered by two i/os: the end of the last
 * one (lio->io_offset + lio->io_size) minus the start of the first one
 * (fio->io_offset).
 *
 * IO_GAP(fio, lio) is the hole between the end of fio and the start of
 * lio.  It is simply -IO_SPAN(lio, fio), so fio and lio are adjacent if
 * and only if IO_SPAN(lio, fio) == 0.
 */
#define	IO_SPAN(fio, lio)	\
	((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio)	(-IO_SPAN(lio, fio))
34dc7c2f BB |
171 | |
172 | static zio_t * | |
173 | vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) | |
174 | { | |
d164b209 BB |
175 | zio_t *fio, *lio, *aio, *dio, *nio; |
176 | avl_tree_t *t; | |
fb5f0bc8 | 177 | int flags; |
9babb374 BB |
178 | uint64_t maxspan = zfs_vdev_aggregation_limit; |
179 | uint64_t maxgap; | |
34dc7c2f BB |
180 | |
181 | ASSERT(MUTEX_HELD(&vq->vq_lock)); | |
182 | ||
183 | if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || | |
184 | avl_numnodes(&vq->vq_deadline_tree) == 0) | |
185 | return (NULL); | |
186 | ||
187 | fio = lio = avl_first(&vq->vq_deadline_tree); | |
188 | ||
d164b209 | 189 | t = fio->io_vdev_tree; |
fb5f0bc8 | 190 | flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; |
9babb374 | 191 | maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; |
fb5f0bc8 BB |
192 | |
193 | if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { | |
194 | /* | |
195 | * We can aggregate I/Os that are adjacent and of the | |
196 | * same flavor, as expressed by the AGG_INHERIT flags. | |
197 | * The latter is necessary so that certain attributes | |
198 | * of the I/O, such as whether it's a normal I/O or a | |
199 | * scrub/resilver, can be preserved in the aggregate. | |
200 | */ | |
d164b209 | 201 | while ((dio = AVL_PREV(t, fio)) != NULL && |
fb5f0bc8 | 202 | (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && |
9babb374 | 203 | IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) |
fb5f0bc8 | 204 | fio = dio; |
9babb374 | 205 | |
d164b209 | 206 | while ((dio = AVL_NEXT(t, lio)) != NULL && |
fb5f0bc8 | 207 | (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && |
9babb374 | 208 | IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) |
fb5f0bc8 | 209 | lio = dio; |
34dc7c2f BB |
210 | } |
211 | ||
212 | if (fio != lio) { | |
9babb374 | 213 | uint64_t size = IO_SPAN(fio, lio); |
34dc7c2f BB |
214 | ASSERT(size <= zfs_vdev_aggregation_limit); |
215 | ||
b128c09f | 216 | aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, |
d164b209 | 217 | zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW, |
fb5f0bc8 | 218 | flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, |
34dc7c2f BB |
219 | vdev_queue_agg_io_done, NULL); |
220 | ||
9babb374 BB |
221 | nio = fio; |
222 | do { | |
223 | dio = nio; | |
224 | nio = AVL_NEXT(t, dio); | |
34dc7c2f | 225 | ASSERT(dio->io_type == aio->io_type); |
d164b209 BB |
226 | ASSERT(dio->io_vdev_tree == t); |
227 | ||
34dc7c2f | 228 | if (dio->io_type == ZIO_TYPE_WRITE) |
d164b209 BB |
229 | bcopy(dio->io_data, (char *)aio->io_data + |
230 | (dio->io_offset - aio->io_offset), | |
231 | dio->io_size); | |
d164b209 BB |
232 | |
233 | zio_add_child(dio, aio); | |
34dc7c2f BB |
234 | vdev_queue_io_remove(vq, dio); |
235 | zio_vdev_io_bypass(dio); | |
d164b209 | 236 | zio_execute(dio); |
9babb374 | 237 | } while (dio != lio); |
34dc7c2f | 238 | |
34dc7c2f BB |
239 | avl_add(&vq->vq_pending_tree, aio); |
240 | ||
241 | return (aio); | |
242 | } | |
243 | ||
d164b209 | 244 | ASSERT(fio->io_vdev_tree == t); |
34dc7c2f BB |
245 | vdev_queue_io_remove(vq, fio); |
246 | ||
247 | avl_add(&vq->vq_pending_tree, fio); | |
248 | ||
249 | return (fio); | |
250 | } | |
251 | ||
252 | zio_t * | |
253 | vdev_queue_io(zio_t *zio) | |
254 | { | |
255 | vdev_queue_t *vq = &zio->io_vd->vdev_queue; | |
256 | zio_t *nio; | |
257 | ||
258 | ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); | |
259 | ||
260 | if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) | |
261 | return (zio); | |
262 | ||
263 | zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; | |
264 | ||
265 | if (zio->io_type == ZIO_TYPE_READ) | |
266 | zio->io_vdev_tree = &vq->vq_read_tree; | |
267 | else | |
268 | zio->io_vdev_tree = &vq->vq_write_tree; | |
269 | ||
270 | mutex_enter(&vq->vq_lock); | |
271 | ||
b128c09f | 272 | zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; |
34dc7c2f BB |
273 | |
274 | vdev_queue_io_add(vq, zio); | |
275 | ||
276 | nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); | |
277 | ||
278 | mutex_exit(&vq->vq_lock); | |
279 | ||
280 | if (nio == NULL) | |
281 | return (NULL); | |
282 | ||
283 | if (nio->io_done == vdev_queue_agg_io_done) { | |
284 | zio_nowait(nio); | |
285 | return (NULL); | |
286 | } | |
287 | ||
288 | return (nio); | |
289 | } | |
290 | ||
291 | void | |
292 | vdev_queue_io_done(zio_t *zio) | |
293 | { | |
294 | vdev_queue_t *vq = &zio->io_vd->vdev_queue; | |
34dc7c2f BB |
295 | |
296 | mutex_enter(&vq->vq_lock); | |
297 | ||
298 | avl_remove(&vq->vq_pending_tree, zio); | |
299 | ||
b128c09f BB |
300 | for (int i = 0; i < zfs_vdev_ramp_rate; i++) { |
301 | zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); | |
34dc7c2f BB |
302 | if (nio == NULL) |
303 | break; | |
304 | mutex_exit(&vq->vq_lock); | |
305 | if (nio->io_done == vdev_queue_agg_io_done) { | |
306 | zio_nowait(nio); | |
307 | } else { | |
308 | zio_vdev_io_reissue(nio); | |
309 | zio_execute(nio); | |
310 | } | |
311 | mutex_enter(&vq->vq_lock); | |
312 | } | |
313 | ||
314 | mutex_exit(&vq->vq_lock); | |
315 | } |