]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
d164b209 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/zfs_context.h> |
27 | #include <sys/spa.h> | |
28 | #include <sys/vdev_impl.h> | |
29 | #include <sys/zio.h> | |
30 | #include <sys/kstat.h> | |
31 | ||
32 | /* | |
33 | * Virtual device read-ahead caching. | |
34 | * | |
35 | * This file implements a simple LRU read-ahead cache. When the DMU reads | |
36 | * a given block, it will often want other, nearby blocks soon thereafter. | |
37 | * We take advantage of this by reading a larger disk region and caching | |
38 | * the result. In the best case, this can turn 128 back-to-back 512-byte | |
39 | * reads into a single 64k read followed by 127 cache hits; this reduces | |
40 | * latency dramatically. In the worst case, it can turn an isolated 512-byte | |
41 | * read into a 64k read, which doesn't affect latency all that much but is | |
42 | * terribly wasteful of bandwidth. A more intelligent version of the cache | |
43 | * could keep track of access patterns and not do read-ahead unless it sees | |
44 | * at least two temporally close I/Os to the same region. Currently, only | |
45 | * metadata I/O is inflated. A further enhancement could take advantage of | |
46 | * more semantic information about the I/O. And it could use something | |
47 | * faster than an AVL tree; that was chosen solely for convenience. | |
48 | * | |
49 | * There are five cache operations: allocate, fill, read, write, evict. | |
50 | * | |
51 | * (1) Allocate. This reserves a cache entry for the specified region. | |
52 | * We separate the allocate and fill operations so that multiple threads | |
53 | * don't generate I/O for the same cache miss. | |
54 | * | |
55 | * (2) Fill. When the I/O for a cache miss completes, the fill routine | |
56 | * places the data in the previously allocated cache entry. | |
57 | * | |
58 | * (3) Read. Read data from the cache. | |
59 | * | |
60 | * (4) Write. Update cache contents after write completion. | |
61 | * | |
62 | * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry | |
63 | * if the total cache size exceeds zfs_vdev_cache_size. | |
64 | */ | |
65 | ||
66 | /* | |
67 | * These tunables are for performance analysis. | |
68 | */ | |
69 | /* | |
70 | * All i/os smaller than zfs_vdev_cache_max will be turned into | |
71 | * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software | |
72 | * track buffer). At most zfs_vdev_cache_size bytes will be kept in each | |
73 | * vdev's vdev_cache. | |
2cc6c8db GA |
74 | * |
75 | * TODO: Note that with the current ZFS code, it turns out that the | |
76 | * vdev cache is not helpful, and in some cases actually harmful. It | |
77 | * is better if we disable this. Once some time has passed, we should | |
78 | * actually remove this to simplify the code. For now we just disable | |
79 | * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 | |
80 | * has made these same changes. | |
34dc7c2f BB |
81 | */ |
82 | int zfs_vdev_cache_max = 1<<14; /* 16KB */ | |
2cc6c8db | 83 | int zfs_vdev_cache_size = 0; |
34dc7c2f BB |
84 | int zfs_vdev_cache_bshift = 16; |
85 | ||
86 | #define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ | |
87 | ||
88 | kstat_t *vdc_ksp = NULL; | |
89 | ||
90 | typedef struct vdc_stats { | |
91 | kstat_named_t vdc_stat_delegations; | |
92 | kstat_named_t vdc_stat_hits; | |
93 | kstat_named_t vdc_stat_misses; | |
94 | } vdc_stats_t; | |
95 | ||
96 | static vdc_stats_t vdc_stats = { | |
97 | { "delegations", KSTAT_DATA_UINT64 }, | |
98 | { "hits", KSTAT_DATA_UINT64 }, | |
99 | { "misses", KSTAT_DATA_UINT64 } | |
100 | }; | |
101 | ||
102 | #define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); | |
103 | ||
104 | static int | |
105 | vdev_cache_offset_compare(const void *a1, const void *a2) | |
106 | { | |
107 | const vdev_cache_entry_t *ve1 = a1; | |
108 | const vdev_cache_entry_t *ve2 = a2; | |
109 | ||
110 | if (ve1->ve_offset < ve2->ve_offset) | |
111 | return (-1); | |
112 | if (ve1->ve_offset > ve2->ve_offset) | |
113 | return (1); | |
114 | return (0); | |
115 | } | |
116 | ||
117 | static int | |
118 | vdev_cache_lastused_compare(const void *a1, const void *a2) | |
119 | { | |
120 | const vdev_cache_entry_t *ve1 = a1; | |
121 | const vdev_cache_entry_t *ve2 = a2; | |
122 | ||
123 | if (ve1->ve_lastused < ve2->ve_lastused) | |
124 | return (-1); | |
125 | if (ve1->ve_lastused > ve2->ve_lastused) | |
126 | return (1); | |
127 | ||
128 | /* | |
129 | * Among equally old entries, sort by offset to ensure uniqueness. | |
130 | */ | |
131 | return (vdev_cache_offset_compare(a1, a2)); | |
132 | } | |
133 | ||
134 | /* | |
135 | * Evict the specified entry from the cache. | |
136 | */ | |
137 | static void | |
138 | vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) | |
139 | { | |
140 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
141 | ASSERT(ve->ve_fill_io == NULL); | |
142 | ASSERT(ve->ve_data != NULL); | |
143 | ||
34dc7c2f BB |
144 | avl_remove(&vc->vc_lastused_tree, ve); |
145 | avl_remove(&vc->vc_offset_tree, ve); | |
146 | zio_buf_free(ve->ve_data, VCBS); | |
147 | kmem_free(ve, sizeof (vdev_cache_entry_t)); | |
148 | } | |
149 | ||
150 | /* | |
151 | * Allocate an entry in the cache. At the point we don't have the data, | |
152 | * we're just creating a placeholder so that multiple threads don't all | |
153 | * go off and read the same blocks. | |
154 | */ | |
155 | static vdev_cache_entry_t * | |
156 | vdev_cache_allocate(zio_t *zio) | |
157 | { | |
158 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
159 | uint64_t offset = P2ALIGN(zio->io_offset, VCBS); | |
160 | vdev_cache_entry_t *ve; | |
161 | ||
162 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
163 | ||
164 | if (zfs_vdev_cache_size == 0) | |
165 | return (NULL); | |
166 | ||
167 | /* | |
168 | * If adding a new entry would exceed the cache size, | |
169 | * evict the oldest entry (LRU). | |
170 | */ | |
171 | if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > | |
172 | zfs_vdev_cache_size) { | |
173 | ve = avl_first(&vc->vc_lastused_tree); | |
b128c09f | 174 | if (ve->ve_fill_io != NULL) |
34dc7c2f | 175 | return (NULL); |
34dc7c2f BB |
176 | ASSERT(ve->ve_hits != 0); |
177 | vdev_cache_evict(vc, ve); | |
178 | } | |
179 | ||
180 | ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); | |
181 | ve->ve_offset = offset; | |
428870ff | 182 | ve->ve_lastused = ddi_get_lbolt(); |
34dc7c2f BB |
183 | ve->ve_data = zio_buf_alloc(VCBS); |
184 | ||
185 | avl_add(&vc->vc_offset_tree, ve); | |
186 | avl_add(&vc->vc_lastused_tree, ve); | |
187 | ||
188 | return (ve); | |
189 | } | |
190 | ||
191 | static void | |
192 | vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) | |
193 | { | |
194 | uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); | |
195 | ||
196 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
197 | ASSERT(ve->ve_fill_io == NULL); | |
198 | ||
428870ff | 199 | if (ve->ve_lastused != ddi_get_lbolt()) { |
34dc7c2f | 200 | avl_remove(&vc->vc_lastused_tree, ve); |
428870ff | 201 | ve->ve_lastused = ddi_get_lbolt(); |
34dc7c2f BB |
202 | avl_add(&vc->vc_lastused_tree, ve); |
203 | } | |
204 | ||
205 | ve->ve_hits++; | |
206 | bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); | |
207 | } | |
208 | ||
209 | /* | |
210 | * Fill a previously allocated cache entry with data. | |
211 | */ | |
212 | static void | |
d164b209 | 213 | vdev_cache_fill(zio_t *fio) |
34dc7c2f | 214 | { |
d164b209 | 215 | vdev_t *vd = fio->io_vd; |
34dc7c2f | 216 | vdev_cache_t *vc = &vd->vdev_cache; |
d164b209 BB |
217 | vdev_cache_entry_t *ve = fio->io_private; |
218 | zio_t *pio; | |
34dc7c2f | 219 | |
d164b209 | 220 | ASSERT(fio->io_size == VCBS); |
34dc7c2f BB |
221 | |
222 | /* | |
223 | * Add data to the cache. | |
224 | */ | |
225 | mutex_enter(&vc->vc_lock); | |
226 | ||
d164b209 BB |
227 | ASSERT(ve->ve_fill_io == fio); |
228 | ASSERT(ve->ve_offset == fio->io_offset); | |
229 | ASSERT(ve->ve_data == fio->io_data); | |
34dc7c2f BB |
230 | |
231 | ve->ve_fill_io = NULL; | |
232 | ||
233 | /* | |
234 | * Even if this cache line was invalidated by a missed write update, | |
235 | * any reads that were queued up before the missed update are still | |
236 | * valid, so we can satisfy them from this line before we evict it. | |
237 | */ | |
d164b209 BB |
238 | while ((pio = zio_walk_parents(fio)) != NULL) |
239 | vdev_cache_hit(vc, ve, pio); | |
34dc7c2f | 240 | |
d164b209 | 241 | if (fio->io_error || ve->ve_missed_update) |
34dc7c2f BB |
242 | vdev_cache_evict(vc, ve); |
243 | ||
244 | mutex_exit(&vc->vc_lock); | |
34dc7c2f BB |
245 | } |
246 | ||
247 | /* | |
248 | * Read data from the cache. Returns 0 on cache hit, errno on a miss. | |
249 | */ | |
250 | int | |
251 | vdev_cache_read(zio_t *zio) | |
252 | { | |
253 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
5fed499d | 254 | vdev_cache_entry_t *ve, *ve_search; |
34dc7c2f | 255 | uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); |
1fde1e37 | 256 | ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);) |
34dc7c2f BB |
257 | zio_t *fio; |
258 | ||
259 | ASSERT(zio->io_type == ZIO_TYPE_READ); | |
260 | ||
261 | if (zio->io_flags & ZIO_FLAG_DONT_CACHE) | |
262 | return (EINVAL); | |
263 | ||
264 | if (zio->io_size > zfs_vdev_cache_max) | |
265 | return (EOVERFLOW); | |
266 | ||
267 | /* | |
268 | * If the I/O straddles two or more cache blocks, don't cache it. | |
269 | */ | |
b128c09f | 270 | if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) |
34dc7c2f BB |
271 | return (EXDEV); |
272 | ||
273 | ASSERT(cache_phase + zio->io_size <= VCBS); | |
274 | ||
275 | mutex_enter(&vc->vc_lock); | |
276 | ||
5fed499d BB |
277 | ve_search = kmem_alloc(sizeof(vdev_cache_entry_t), KM_SLEEP); |
278 | ve_search->ve_offset = cache_offset; | |
279 | ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); | |
280 | kmem_free(ve_search, sizeof(vdev_cache_entry_t)); | |
34dc7c2f BB |
281 | |
282 | if (ve != NULL) { | |
283 | if (ve->ve_missed_update) { | |
284 | mutex_exit(&vc->vc_lock); | |
285 | return (ESTALE); | |
286 | } | |
287 | ||
288 | if ((fio = ve->ve_fill_io) != NULL) { | |
34dc7c2f | 289 | zio_vdev_io_bypass(zio); |
d164b209 | 290 | zio_add_child(zio, fio); |
34dc7c2f BB |
291 | mutex_exit(&vc->vc_lock); |
292 | VDCSTAT_BUMP(vdc_stat_delegations); | |
293 | return (0); | |
294 | } | |
295 | ||
296 | vdev_cache_hit(vc, ve, zio); | |
297 | zio_vdev_io_bypass(zio); | |
298 | ||
299 | mutex_exit(&vc->vc_lock); | |
34dc7c2f BB |
300 | VDCSTAT_BUMP(vdc_stat_hits); |
301 | return (0); | |
302 | } | |
303 | ||
304 | ve = vdev_cache_allocate(zio); | |
305 | ||
306 | if (ve == NULL) { | |
307 | mutex_exit(&vc->vc_lock); | |
308 | return (ENOMEM); | |
309 | } | |
310 | ||
b128c09f | 311 | fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, |
34dc7c2f | 312 | ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, |
b128c09f | 313 | ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); |
34dc7c2f BB |
314 | |
315 | ve->ve_fill_io = fio; | |
34dc7c2f | 316 | zio_vdev_io_bypass(zio); |
d164b209 | 317 | zio_add_child(zio, fio); |
34dc7c2f BB |
318 | |
319 | mutex_exit(&vc->vc_lock); | |
320 | zio_nowait(fio); | |
321 | VDCSTAT_BUMP(vdc_stat_misses); | |
322 | ||
323 | return (0); | |
324 | } | |
325 | ||
326 | /* | |
327 | * Update cache contents upon write completion. | |
328 | */ | |
329 | void | |
330 | vdev_cache_write(zio_t *zio) | |
331 | { | |
332 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
333 | vdev_cache_entry_t *ve, ve_search; | |
334 | uint64_t io_start = zio->io_offset; | |
335 | uint64_t io_end = io_start + zio->io_size; | |
336 | uint64_t min_offset = P2ALIGN(io_start, VCBS); | |
337 | uint64_t max_offset = P2ROUNDUP(io_end, VCBS); | |
338 | avl_index_t where; | |
339 | ||
340 | ASSERT(zio->io_type == ZIO_TYPE_WRITE); | |
341 | ||
342 | mutex_enter(&vc->vc_lock); | |
343 | ||
344 | ve_search.ve_offset = min_offset; | |
345 | ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); | |
346 | ||
347 | if (ve == NULL) | |
348 | ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); | |
349 | ||
350 | while (ve != NULL && ve->ve_offset < max_offset) { | |
351 | uint64_t start = MAX(ve->ve_offset, io_start); | |
352 | uint64_t end = MIN(ve->ve_offset + VCBS, io_end); | |
353 | ||
354 | if (ve->ve_fill_io != NULL) { | |
355 | ve->ve_missed_update = 1; | |
356 | } else { | |
357 | bcopy((char *)zio->io_data + start - io_start, | |
358 | ve->ve_data + start - ve->ve_offset, end - start); | |
359 | } | |
360 | ve = AVL_NEXT(&vc->vc_offset_tree, ve); | |
361 | } | |
362 | mutex_exit(&vc->vc_lock); | |
363 | } | |
364 | ||
365 | void | |
366 | vdev_cache_purge(vdev_t *vd) | |
367 | { | |
368 | vdev_cache_t *vc = &vd->vdev_cache; | |
369 | vdev_cache_entry_t *ve; | |
370 | ||
371 | mutex_enter(&vc->vc_lock); | |
372 | while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) | |
373 | vdev_cache_evict(vc, ve); | |
374 | mutex_exit(&vc->vc_lock); | |
375 | } | |
376 | ||
377 | void | |
378 | vdev_cache_init(vdev_t *vd) | |
379 | { | |
380 | vdev_cache_t *vc = &vd->vdev_cache; | |
381 | ||
382 | mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); | |
383 | ||
384 | avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, | |
385 | sizeof (vdev_cache_entry_t), | |
386 | offsetof(struct vdev_cache_entry, ve_offset_node)); | |
387 | ||
388 | avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, | |
389 | sizeof (vdev_cache_entry_t), | |
390 | offsetof(struct vdev_cache_entry, ve_lastused_node)); | |
391 | } | |
392 | ||
393 | void | |
394 | vdev_cache_fini(vdev_t *vd) | |
395 | { | |
396 | vdev_cache_t *vc = &vd->vdev_cache; | |
397 | ||
398 | vdev_cache_purge(vd); | |
399 | ||
400 | avl_destroy(&vc->vc_offset_tree); | |
401 | avl_destroy(&vc->vc_lastused_tree); | |
402 | ||
403 | mutex_destroy(&vc->vc_lock); | |
404 | } | |
405 | ||
406 | void | |
407 | vdev_cache_stat_init(void) | |
408 | { | |
409 | vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", | |
410 | KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), | |
411 | KSTAT_FLAG_VIRTUAL); | |
412 | if (vdc_ksp != NULL) { | |
413 | vdc_ksp->ks_data = &vdc_stats; | |
414 | kstat_install(vdc_ksp); | |
415 | } | |
416 | } | |
417 | ||
418 | void | |
419 | vdev_cache_stat_fini(void) | |
420 | { | |
421 | if (vdc_ksp != NULL) { | |
422 | kstat_delete(vdc_ksp); | |
423 | vdc_ksp = NULL; | |
424 | } | |
425 | } | |
c409e464 BB |
426 | |
/*
 * Expose the cache tunables as module parameters.  zfs_vdev_cache_size is
 * registered read-only (0444); the other two are writable by root (0644).
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_cache_max, int, 0644);
MODULE_PARM_DESC(zfs_vdev_cache_max, "Inflate reads smaller than max");

module_param(zfs_vdev_cache_size, int, 0444);
MODULE_PARM_DESC(zfs_vdev_cache_size, "Total size of the per-disk cache");

module_param(zfs_vdev_cache_bshift, int, 0644);
MODULE_PARM_DESC(zfs_vdev_cache_bshift, "Shift size to inflate reads to");
#endif