]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/zfs_context.h> |
27 | #include <sys/spa.h> | |
28 | #include <sys/vdev_impl.h> | |
29 | #include <sys/zio.h> | |
30 | #include <sys/kstat.h> | |
31 | ||
32 | /* | |
33 | * Virtual device read-ahead caching. | |
34 | * | |
35 | * This file implements a simple LRU read-ahead cache. When the DMU reads | |
36 | * a given block, it will often want other, nearby blocks soon thereafter. | |
37 | * We take advantage of this by reading a larger disk region and caching | |
38 | * the result. In the best case, this can turn 128 back-to-back 512-byte | |
39 | * reads into a single 64k read followed by 127 cache hits; this reduces | |
40 | * latency dramatically. In the worst case, it can turn an isolated 512-byte | |
41 | * read into a 64k read, which doesn't affect latency all that much but is | |
42 | * terribly wasteful of bandwidth. A more intelligent version of the cache | |
43 | * could keep track of access patterns and not do read-ahead unless it sees | |
44 | * at least two temporally close I/Os to the same region. Currently, only | |
45 | * metadata I/O is inflated. A further enhancement could take advantage of | |
46 | * more semantic information about the I/O. And it could use something | |
47 | * faster than an AVL tree; that was chosen solely for convenience. | |
48 | * | |
49 | * There are five cache operations: allocate, fill, read, write, evict. | |
50 | * | |
51 | * (1) Allocate. This reserves a cache entry for the specified region. | |
52 | * We separate the allocate and fill operations so that multiple threads | |
53 | * don't generate I/O for the same cache miss. | |
54 | * | |
55 | * (2) Fill. When the I/O for a cache miss completes, the fill routine | |
56 | * places the data in the previously allocated cache entry. | |
57 | * | |
58 | * (3) Read. Read data from the cache. | |
59 | * | |
60 | * (4) Write. Update cache contents after write completion. | |
61 | * | |
62 | * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry | |
63 | * if the total cache size exceeds zfs_vdev_cache_size. | |
64 | */ | |
65 | ||
66 | /* | |
67 | * These tunables are for performance analysis. | |
68 | */ | |
69 | /* | |
70 | * All i/os smaller than zfs_vdev_cache_max will be turned into | |
71 | * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software | |
72 | * track buffer). At most zfs_vdev_cache_size bytes will be kept in each | |
73 | * vdev's vdev_cache. | |
74 | */ | |
75 | int zfs_vdev_cache_max = 1<<14; /* 16KB */ | |
76 | int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */ | |
77 | int zfs_vdev_cache_bshift = 16; | |
78 | ||
79 | #define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ | |
80 | ||
81 | kstat_t *vdc_ksp = NULL; | |
82 | ||
83 | typedef struct vdc_stats { | |
84 | kstat_named_t vdc_stat_delegations; | |
85 | kstat_named_t vdc_stat_hits; | |
86 | kstat_named_t vdc_stat_misses; | |
87 | } vdc_stats_t; | |
88 | ||
89 | static vdc_stats_t vdc_stats = { | |
90 | { "delegations", KSTAT_DATA_UINT64 }, | |
91 | { "hits", KSTAT_DATA_UINT64 }, | |
92 | { "misses", KSTAT_DATA_UINT64 } | |
93 | }; | |
94 | ||
95 | #define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); | |
96 | ||
97 | static int | |
98 | vdev_cache_offset_compare(const void *a1, const void *a2) | |
99 | { | |
100 | const vdev_cache_entry_t *ve1 = a1; | |
101 | const vdev_cache_entry_t *ve2 = a2; | |
102 | ||
103 | if (ve1->ve_offset < ve2->ve_offset) | |
104 | return (-1); | |
105 | if (ve1->ve_offset > ve2->ve_offset) | |
106 | return (1); | |
107 | return (0); | |
108 | } | |
109 | ||
110 | static int | |
111 | vdev_cache_lastused_compare(const void *a1, const void *a2) | |
112 | { | |
113 | const vdev_cache_entry_t *ve1 = a1; | |
114 | const vdev_cache_entry_t *ve2 = a2; | |
115 | ||
116 | if (ve1->ve_lastused < ve2->ve_lastused) | |
117 | return (-1); | |
118 | if (ve1->ve_lastused > ve2->ve_lastused) | |
119 | return (1); | |
120 | ||
121 | /* | |
122 | * Among equally old entries, sort by offset to ensure uniqueness. | |
123 | */ | |
124 | return (vdev_cache_offset_compare(a1, a2)); | |
125 | } | |
126 | ||
127 | /* | |
128 | * Evict the specified entry from the cache. | |
129 | */ | |
130 | static void | |
131 | vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) | |
132 | { | |
133 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
134 | ASSERT(ve->ve_fill_io == NULL); | |
135 | ASSERT(ve->ve_data != NULL); | |
136 | ||
34dc7c2f BB |
137 | avl_remove(&vc->vc_lastused_tree, ve); |
138 | avl_remove(&vc->vc_offset_tree, ve); | |
139 | zio_buf_free(ve->ve_data, VCBS); | |
140 | kmem_free(ve, sizeof (vdev_cache_entry_t)); | |
141 | } | |
142 | ||
143 | /* | |
144 | * Allocate an entry in the cache. At the point we don't have the data, | |
145 | * we're just creating a placeholder so that multiple threads don't all | |
146 | * go off and read the same blocks. | |
147 | */ | |
148 | static vdev_cache_entry_t * | |
149 | vdev_cache_allocate(zio_t *zio) | |
150 | { | |
151 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
152 | uint64_t offset = P2ALIGN(zio->io_offset, VCBS); | |
153 | vdev_cache_entry_t *ve; | |
154 | ||
155 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
156 | ||
157 | if (zfs_vdev_cache_size == 0) | |
158 | return (NULL); | |
159 | ||
160 | /* | |
161 | * If adding a new entry would exceed the cache size, | |
162 | * evict the oldest entry (LRU). | |
163 | */ | |
164 | if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > | |
165 | zfs_vdev_cache_size) { | |
166 | ve = avl_first(&vc->vc_lastused_tree); | |
b128c09f | 167 | if (ve->ve_fill_io != NULL) |
34dc7c2f | 168 | return (NULL); |
34dc7c2f BB |
169 | ASSERT(ve->ve_hits != 0); |
170 | vdev_cache_evict(vc, ve); | |
171 | } | |
172 | ||
173 | ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); | |
174 | ve->ve_offset = offset; | |
175 | ve->ve_lastused = lbolt; | |
176 | ve->ve_data = zio_buf_alloc(VCBS); | |
177 | ||
178 | avl_add(&vc->vc_offset_tree, ve); | |
179 | avl_add(&vc->vc_lastused_tree, ve); | |
180 | ||
181 | return (ve); | |
182 | } | |
183 | ||
184 | static void | |
185 | vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) | |
186 | { | |
187 | uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); | |
188 | ||
189 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
190 | ASSERT(ve->ve_fill_io == NULL); | |
191 | ||
192 | if (ve->ve_lastused != lbolt) { | |
193 | avl_remove(&vc->vc_lastused_tree, ve); | |
194 | ve->ve_lastused = lbolt; | |
195 | avl_add(&vc->vc_lastused_tree, ve); | |
196 | } | |
197 | ||
198 | ve->ve_hits++; | |
199 | bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); | |
200 | } | |
201 | ||
202 | /* | |
203 | * Fill a previously allocated cache entry with data. | |
204 | */ | |
205 | static void | |
206 | vdev_cache_fill(zio_t *zio) | |
207 | { | |
208 | vdev_t *vd = zio->io_vd; | |
209 | vdev_cache_t *vc = &vd->vdev_cache; | |
210 | vdev_cache_entry_t *ve = zio->io_private; | |
211 | zio_t *dio; | |
212 | ||
213 | ASSERT(zio->io_size == VCBS); | |
214 | ||
215 | /* | |
216 | * Add data to the cache. | |
217 | */ | |
218 | mutex_enter(&vc->vc_lock); | |
219 | ||
220 | ASSERT(ve->ve_fill_io == zio); | |
221 | ASSERT(ve->ve_offset == zio->io_offset); | |
222 | ASSERT(ve->ve_data == zio->io_data); | |
223 | ||
224 | ve->ve_fill_io = NULL; | |
225 | ||
226 | /* | |
227 | * Even if this cache line was invalidated by a missed write update, | |
228 | * any reads that were queued up before the missed update are still | |
229 | * valid, so we can satisfy them from this line before we evict it. | |
230 | */ | |
231 | for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) | |
232 | vdev_cache_hit(vc, ve, dio); | |
233 | ||
234 | if (zio->io_error || ve->ve_missed_update) | |
235 | vdev_cache_evict(vc, ve); | |
236 | ||
237 | mutex_exit(&vc->vc_lock); | |
238 | ||
239 | while ((dio = zio->io_delegate_list) != NULL) { | |
240 | zio->io_delegate_list = dio->io_delegate_next; | |
241 | dio->io_delegate_next = NULL; | |
242 | dio->io_error = zio->io_error; | |
243 | zio_execute(dio); | |
244 | } | |
245 | } | |
246 | ||
247 | /* | |
248 | * Read data from the cache. Returns 0 on cache hit, errno on a miss. | |
249 | */ | |
250 | int | |
251 | vdev_cache_read(zio_t *zio) | |
252 | { | |
253 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
254 | vdev_cache_entry_t *ve, ve_search; | |
255 | uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); | |
256 | uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); | |
257 | zio_t *fio; | |
258 | ||
259 | ASSERT(zio->io_type == ZIO_TYPE_READ); | |
260 | ||
261 | if (zio->io_flags & ZIO_FLAG_DONT_CACHE) | |
262 | return (EINVAL); | |
263 | ||
264 | if (zio->io_size > zfs_vdev_cache_max) | |
265 | return (EOVERFLOW); | |
266 | ||
267 | /* | |
268 | * If the I/O straddles two or more cache blocks, don't cache it. | |
269 | */ | |
b128c09f | 270 | if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) |
34dc7c2f BB |
271 | return (EXDEV); |
272 | ||
273 | ASSERT(cache_phase + zio->io_size <= VCBS); | |
274 | ||
275 | mutex_enter(&vc->vc_lock); | |
276 | ||
277 | ve_search.ve_offset = cache_offset; | |
278 | ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); | |
279 | ||
280 | if (ve != NULL) { | |
281 | if (ve->ve_missed_update) { | |
282 | mutex_exit(&vc->vc_lock); | |
283 | return (ESTALE); | |
284 | } | |
285 | ||
286 | if ((fio = ve->ve_fill_io) != NULL) { | |
287 | zio->io_delegate_next = fio->io_delegate_list; | |
288 | fio->io_delegate_list = zio; | |
289 | zio_vdev_io_bypass(zio); | |
290 | mutex_exit(&vc->vc_lock); | |
291 | VDCSTAT_BUMP(vdc_stat_delegations); | |
292 | return (0); | |
293 | } | |
294 | ||
295 | vdev_cache_hit(vc, ve, zio); | |
296 | zio_vdev_io_bypass(zio); | |
297 | ||
298 | mutex_exit(&vc->vc_lock); | |
299 | zio_execute(zio); | |
300 | VDCSTAT_BUMP(vdc_stat_hits); | |
301 | return (0); | |
302 | } | |
303 | ||
304 | ve = vdev_cache_allocate(zio); | |
305 | ||
306 | if (ve == NULL) { | |
307 | mutex_exit(&vc->vc_lock); | |
308 | return (ENOMEM); | |
309 | } | |
310 | ||
b128c09f | 311 | fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, |
34dc7c2f | 312 | ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, |
b128c09f | 313 | ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); |
34dc7c2f BB |
314 | |
315 | ve->ve_fill_io = fio; | |
316 | fio->io_delegate_list = zio; | |
317 | zio_vdev_io_bypass(zio); | |
318 | ||
319 | mutex_exit(&vc->vc_lock); | |
320 | zio_nowait(fio); | |
321 | VDCSTAT_BUMP(vdc_stat_misses); | |
322 | ||
323 | return (0); | |
324 | } | |
325 | ||
326 | /* | |
327 | * Update cache contents upon write completion. | |
328 | */ | |
329 | void | |
330 | vdev_cache_write(zio_t *zio) | |
331 | { | |
332 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
333 | vdev_cache_entry_t *ve, ve_search; | |
334 | uint64_t io_start = zio->io_offset; | |
335 | uint64_t io_end = io_start + zio->io_size; | |
336 | uint64_t min_offset = P2ALIGN(io_start, VCBS); | |
337 | uint64_t max_offset = P2ROUNDUP(io_end, VCBS); | |
338 | avl_index_t where; | |
339 | ||
340 | ASSERT(zio->io_type == ZIO_TYPE_WRITE); | |
341 | ||
342 | mutex_enter(&vc->vc_lock); | |
343 | ||
344 | ve_search.ve_offset = min_offset; | |
345 | ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); | |
346 | ||
347 | if (ve == NULL) | |
348 | ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); | |
349 | ||
350 | while (ve != NULL && ve->ve_offset < max_offset) { | |
351 | uint64_t start = MAX(ve->ve_offset, io_start); | |
352 | uint64_t end = MIN(ve->ve_offset + VCBS, io_end); | |
353 | ||
354 | if (ve->ve_fill_io != NULL) { | |
355 | ve->ve_missed_update = 1; | |
356 | } else { | |
357 | bcopy((char *)zio->io_data + start - io_start, | |
358 | ve->ve_data + start - ve->ve_offset, end - start); | |
359 | } | |
360 | ve = AVL_NEXT(&vc->vc_offset_tree, ve); | |
361 | } | |
362 | mutex_exit(&vc->vc_lock); | |
363 | } | |
364 | ||
365 | void | |
366 | vdev_cache_purge(vdev_t *vd) | |
367 | { | |
368 | vdev_cache_t *vc = &vd->vdev_cache; | |
369 | vdev_cache_entry_t *ve; | |
370 | ||
371 | mutex_enter(&vc->vc_lock); | |
372 | while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) | |
373 | vdev_cache_evict(vc, ve); | |
374 | mutex_exit(&vc->vc_lock); | |
375 | } | |
376 | ||
377 | void | |
378 | vdev_cache_init(vdev_t *vd) | |
379 | { | |
380 | vdev_cache_t *vc = &vd->vdev_cache; | |
381 | ||
382 | mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); | |
383 | ||
384 | avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, | |
385 | sizeof (vdev_cache_entry_t), | |
386 | offsetof(struct vdev_cache_entry, ve_offset_node)); | |
387 | ||
388 | avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, | |
389 | sizeof (vdev_cache_entry_t), | |
390 | offsetof(struct vdev_cache_entry, ve_lastused_node)); | |
391 | } | |
392 | ||
393 | void | |
394 | vdev_cache_fini(vdev_t *vd) | |
395 | { | |
396 | vdev_cache_t *vc = &vd->vdev_cache; | |
397 | ||
398 | vdev_cache_purge(vd); | |
399 | ||
400 | avl_destroy(&vc->vc_offset_tree); | |
401 | avl_destroy(&vc->vc_lastused_tree); | |
402 | ||
403 | mutex_destroy(&vc->vc_lock); | |
404 | } | |
405 | ||
406 | void | |
407 | vdev_cache_stat_init(void) | |
408 | { | |
409 | vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", | |
410 | KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), | |
411 | KSTAT_FLAG_VIRTUAL); | |
412 | if (vdc_ksp != NULL) { | |
413 | vdc_ksp->ks_data = &vdc_stats; | |
414 | kstat_install(vdc_ksp); | |
415 | } | |
416 | } | |
417 | ||
418 | void | |
419 | vdev_cache_stat_fini(void) | |
420 | { | |
421 | if (vdc_ksp != NULL) { | |
422 | kstat_delete(vdc_ksp); | |
423 | vdc_ksp = NULL; | |
424 | } | |
425 | } |