]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
b128c09f | 22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/zfs_context.h> |
27 | #include <sys/spa.h> | |
28 | #include <sys/vdev_impl.h> | |
29 | #include <sys/zio.h> | |
30 | #include <sys/avl.h> | |
31 | ||
/*
 * These tunables are for performance analysis.
 */
/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device. zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * max_pending).
 *
 * In this file, min_pending is the pending-tree limit applied when a new
 * i/o is queued by vdev_queue_io(), and max_pending is the limit applied
 * when vdev_queue_io_done() issues follow-on i/os after a completion.
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/*
 * deadline = pri + (lbolt >> time_shift)
 *
 * vdev_queue_io() computes io_deadline as the current tick count shifted
 * right by this amount plus the zio's io_priority, so a larger shift makes
 * priority weigh more heavily against arrival time.
 */
int zfs_vdev_time_shift = 6;

/*
 * exponential I/O issue ramp-up rate
 *
 * vdev_queue_io_done() tries to issue up to this many queued i/os for
 * every i/o that completes.
 */
int zfs_vdev_ramp_rate = 2;

/*
 * i/os will be aggregated into a single large i/o up to
 * zfs_vdev_aggregation_limit bytes long.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
55 | ||
56 | /* | |
57 | * Virtual device vector for disk I/O scheduling. | |
58 | */ | |
59 | int | |
60 | vdev_queue_deadline_compare(const void *x1, const void *x2) | |
61 | { | |
62 | const zio_t *z1 = x1; | |
63 | const zio_t *z2 = x2; | |
64 | ||
65 | if (z1->io_deadline < z2->io_deadline) | |
66 | return (-1); | |
67 | if (z1->io_deadline > z2->io_deadline) | |
68 | return (1); | |
69 | ||
70 | if (z1->io_offset < z2->io_offset) | |
71 | return (-1); | |
72 | if (z1->io_offset > z2->io_offset) | |
73 | return (1); | |
74 | ||
75 | if (z1 < z2) | |
76 | return (-1); | |
77 | if (z1 > z2) | |
78 | return (1); | |
79 | ||
80 | return (0); | |
81 | } | |
82 | ||
83 | int | |
84 | vdev_queue_offset_compare(const void *x1, const void *x2) | |
85 | { | |
86 | const zio_t *z1 = x1; | |
87 | const zio_t *z2 = x2; | |
88 | ||
89 | if (z1->io_offset < z2->io_offset) | |
90 | return (-1); | |
91 | if (z1->io_offset > z2->io_offset) | |
92 | return (1); | |
93 | ||
94 | if (z1 < z2) | |
95 | return (-1); | |
96 | if (z1 > z2) | |
97 | return (1); | |
98 | ||
99 | return (0); | |
100 | } | |
101 | ||
102 | void | |
103 | vdev_queue_init(vdev_t *vd) | |
104 | { | |
105 | vdev_queue_t *vq = &vd->vdev_queue; | |
106 | ||
107 | mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); | |
108 | ||
109 | avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, | |
110 | sizeof (zio_t), offsetof(struct zio, io_deadline_node)); | |
111 | ||
112 | avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, | |
113 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
114 | ||
115 | avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, | |
116 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
117 | ||
118 | avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, | |
119 | sizeof (zio_t), offsetof(struct zio, io_offset_node)); | |
120 | } | |
121 | ||
122 | void | |
123 | vdev_queue_fini(vdev_t *vd) | |
124 | { | |
125 | vdev_queue_t *vq = &vd->vdev_queue; | |
126 | ||
127 | avl_destroy(&vq->vq_deadline_tree); | |
128 | avl_destroy(&vq->vq_read_tree); | |
129 | avl_destroy(&vq->vq_write_tree); | |
130 | avl_destroy(&vq->vq_pending_tree); | |
131 | ||
132 | mutex_destroy(&vq->vq_lock); | |
133 | } | |
134 | ||
135 | static void | |
136 | vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) | |
137 | { | |
138 | avl_add(&vq->vq_deadline_tree, zio); | |
139 | avl_add(zio->io_vdev_tree, zio); | |
140 | } | |
141 | ||
142 | static void | |
143 | vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) | |
144 | { | |
145 | avl_remove(&vq->vq_deadline_tree, zio); | |
146 | avl_remove(zio->io_vdev_tree, zio); | |
147 | } | |
148 | ||
149 | static void | |
150 | vdev_queue_agg_io_done(zio_t *aio) | |
151 | { | |
152 | zio_t *dio; | |
153 | uint64_t offset = 0; | |
154 | ||
155 | while ((dio = aio->io_delegate_list) != NULL) { | |
156 | if (aio->io_type == ZIO_TYPE_READ) | |
157 | bcopy((char *)aio->io_data + offset, dio->io_data, | |
158 | dio->io_size); | |
159 | offset += dio->io_size; | |
160 | aio->io_delegate_list = dio->io_delegate_next; | |
161 | dio->io_delegate_next = NULL; | |
162 | dio->io_error = aio->io_error; | |
163 | zio_execute(dio); | |
164 | } | |
165 | ASSERT3U(offset, ==, aio->io_size); | |
166 | ||
167 | zio_buf_free(aio->io_data, aio->io_size); | |
168 | } | |
169 | ||
/*
 * True when 'nio' begins at exactly the offset where 'io' ends, i.e. the
 * two i/os cover one contiguous range with 'io' first.  Note that 'io' is
 * evaluated twice.
 */
#define	IS_ADJACENT(io, nio) \
	((io)->io_offset + (io)->io_size == (nio)->io_offset)
172 | ||
173 | static zio_t * | |
174 | vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) | |
175 | { | |
176 | zio_t *fio, *lio, *aio, *dio; | |
177 | avl_tree_t *tree; | |
178 | uint64_t size; | |
179 | ||
180 | ASSERT(MUTEX_HELD(&vq->vq_lock)); | |
181 | ||
182 | if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || | |
183 | avl_numnodes(&vq->vq_deadline_tree) == 0) | |
184 | return (NULL); | |
185 | ||
186 | fio = lio = avl_first(&vq->vq_deadline_tree); | |
187 | ||
188 | tree = fio->io_vdev_tree; | |
189 | size = fio->io_size; | |
190 | ||
191 | while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && | |
b128c09f | 192 | !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && |
34dc7c2f BB |
193 | size + dio->io_size <= zfs_vdev_aggregation_limit) { |
194 | dio->io_delegate_next = fio; | |
195 | fio = dio; | |
196 | size += dio->io_size; | |
197 | } | |
198 | ||
199 | while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && | |
b128c09f | 200 | !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && |
34dc7c2f BB |
201 | size + dio->io_size <= zfs_vdev_aggregation_limit) { |
202 | lio->io_delegate_next = dio; | |
203 | lio = dio; | |
204 | size += dio->io_size; | |
205 | } | |
206 | ||
207 | if (fio != lio) { | |
208 | char *buf = zio_buf_alloc(size); | |
209 | uint64_t offset = 0; | |
34dc7c2f BB |
210 | |
211 | ASSERT(size <= zfs_vdev_aggregation_limit); | |
212 | ||
b128c09f BB |
213 | aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, |
214 | buf, size, fio->io_type, ZIO_PRIORITY_NOW, | |
215 | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, | |
34dc7c2f BB |
216 | vdev_queue_agg_io_done, NULL); |
217 | ||
218 | aio->io_delegate_list = fio; | |
219 | ||
220 | for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { | |
221 | ASSERT(dio->io_type == aio->io_type); | |
222 | ASSERT(dio->io_vdev_tree == tree); | |
223 | if (dio->io_type == ZIO_TYPE_WRITE) | |
224 | bcopy(dio->io_data, buf + offset, dio->io_size); | |
225 | offset += dio->io_size; | |
226 | vdev_queue_io_remove(vq, dio); | |
227 | zio_vdev_io_bypass(dio); | |
34dc7c2f BB |
228 | } |
229 | ||
230 | ASSERT(offset == size); | |
231 | ||
34dc7c2f BB |
232 | avl_add(&vq->vq_pending_tree, aio); |
233 | ||
234 | return (aio); | |
235 | } | |
236 | ||
237 | ASSERT(fio->io_vdev_tree == tree); | |
238 | vdev_queue_io_remove(vq, fio); | |
239 | ||
240 | avl_add(&vq->vq_pending_tree, fio); | |
241 | ||
242 | return (fio); | |
243 | } | |
244 | ||
245 | zio_t * | |
246 | vdev_queue_io(zio_t *zio) | |
247 | { | |
248 | vdev_queue_t *vq = &zio->io_vd->vdev_queue; | |
249 | zio_t *nio; | |
250 | ||
251 | ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); | |
252 | ||
253 | if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) | |
254 | return (zio); | |
255 | ||
256 | zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; | |
257 | ||
258 | if (zio->io_type == ZIO_TYPE_READ) | |
259 | zio->io_vdev_tree = &vq->vq_read_tree; | |
260 | else | |
261 | zio->io_vdev_tree = &vq->vq_write_tree; | |
262 | ||
263 | mutex_enter(&vq->vq_lock); | |
264 | ||
b128c09f | 265 | zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; |
34dc7c2f BB |
266 | |
267 | vdev_queue_io_add(vq, zio); | |
268 | ||
269 | nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); | |
270 | ||
271 | mutex_exit(&vq->vq_lock); | |
272 | ||
273 | if (nio == NULL) | |
274 | return (NULL); | |
275 | ||
276 | if (nio->io_done == vdev_queue_agg_io_done) { | |
277 | zio_nowait(nio); | |
278 | return (NULL); | |
279 | } | |
280 | ||
281 | return (nio); | |
282 | } | |
283 | ||
284 | void | |
285 | vdev_queue_io_done(zio_t *zio) | |
286 | { | |
287 | vdev_queue_t *vq = &zio->io_vd->vdev_queue; | |
34dc7c2f BB |
288 | |
289 | mutex_enter(&vq->vq_lock); | |
290 | ||
291 | avl_remove(&vq->vq_pending_tree, zio); | |
292 | ||
b128c09f BB |
293 | for (int i = 0; i < zfs_vdev_ramp_rate; i++) { |
294 | zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); | |
34dc7c2f BB |
295 | if (nio == NULL) |
296 | break; | |
297 | mutex_exit(&vq->vq_lock); | |
298 | if (nio->io_done == vdev_queue_agg_io_done) { | |
299 | zio_nowait(nio); | |
300 | } else { | |
301 | zio_vdev_io_reissue(nio); | |
302 | zio_execute(nio); | |
303 | } | |
304 | mutex_enter(&vq->vq_lock); | |
305 | } | |
306 | ||
307 | mutex_exit(&vq->vq_lock); | |
308 | } |