/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/kstat.h>

/*
 * Virtual device read-ahead caching.
 *
 * This file implements a simple LRU read-ahead cache. When the DMU reads
 * a given block, it will often want other, nearby blocks soon thereafter.
 * We take advantage of this by reading a larger disk region and caching
 * the result. In the best case, this can turn 128 back-to-back 512-byte
 * reads into a single 64k read followed by 127 cache hits; this reduces
 * latency dramatically. In the worst case, it can turn an isolated 512-byte
 * read into a 64k read, which doesn't affect latency all that much but is
 * terribly wasteful of bandwidth. A more intelligent version of the cache
 * could keep track of access patterns and not do read-ahead unless it sees
 * at least two temporally close I/Os to the same region. Currently, only
 * metadata I/O is inflated. A further enhancement could take advantage of
 * more semantic information about the I/O. And it could use something
 * faster than an AVL tree; that was chosen solely for convenience.
 *
 * There are five cache operations: allocate, fill, read, write, evict.
 *
 * (1) Allocate. This reserves a cache entry for the specified region.
 *     We separate the allocate and fill operations so that multiple threads
 *     don't generate I/O for the same cache miss.
 *
 * (2) Fill. When the I/O for a cache miss completes, the fill routine
 *     places the data in the previously allocated cache entry.
 *
 * (3) Read. Read data from the cache.
 *
 * (4) Write. Update cache contents after write completion.
 *
 * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
 *     if the total cache size exceeds zfs_vdev_cache_size.
 */

/*
 * These tunables are for performance analysis.
 */
/*
 * All i/os smaller than zfs_vdev_cache_max will be turned into
 * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
 * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
 * vdev's vdev_cache.
 */
int zfs_vdev_cache_max = 1<<14;			/* 16KB */
int zfs_vdev_cache_size = 10ULL << 20;		/* 10MB */
int zfs_vdev_cache_bshift = 16;

#define	VCBS (1 << zfs_vdev_cache_bshift)	/* 64KB */
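
/*
 * For example, with the defaults above, a 512-byte read at offset 0x11200
 * falls in the 64KB cache line starting at P2ALIGN(0x11200, VCBS) ==
 * 0x10000, at byte P2PHASE(0x11200, VCBS) == 0x1200 within that line.
 */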
80 | ||
81 | kstat_t *vdc_ksp = NULL; | |
82 | ||
83 | typedef struct vdc_stats { | |
84 | kstat_named_t vdc_stat_delegations; | |
85 | kstat_named_t vdc_stat_hits; | |
86 | kstat_named_t vdc_stat_misses; | |
87 | } vdc_stats_t; | |
88 | ||
89 | static vdc_stats_t vdc_stats = { | |
90 | { "delegations", KSTAT_DATA_UINT64 }, | |
91 | { "hits", KSTAT_DATA_UINT64 }, | |
92 | { "misses", KSTAT_DATA_UINT64 } | |
93 | }; | |
94 | ||
95 | #define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); | |
96 | ||
static int
vdev_cache_offset_compare(const void *a1, const void *a2)
{
	const vdev_cache_entry_t *ve1 = a1;
	const vdev_cache_entry_t *ve2 = a2;

	if (ve1->ve_offset < ve2->ve_offset)
		return (-1);
	if (ve1->ve_offset > ve2->ve_offset)
		return (1);
	return (0);
}

static int
vdev_cache_lastused_compare(const void *a1, const void *a2)
{
	const vdev_cache_entry_t *ve1 = a1;
	const vdev_cache_entry_t *ve2 = a2;

	if (ve1->ve_lastused < ve2->ve_lastused)
		return (-1);
	if (ve1->ve_lastused > ve2->ve_lastused)
		return (1);

	/*
	 * Among equally old entries, sort by offset to ensure uniqueness.
	 */
	return (vdev_cache_offset_compare(a1, a2));
}

/*
 * Evict the specified entry from the cache.
 */
static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{
	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT(ve->ve_fill_io == NULL);
	ASSERT(ve->ve_data != NULL);

	avl_remove(&vc->vc_lastused_tree, ve);
	avl_remove(&vc->vc_offset_tree, ve);
	zio_buf_free(ve->ve_data, VCBS);
	kmem_free(ve, sizeof (vdev_cache_entry_t));
}

/*
 * Allocate an entry in the cache. At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t *ve;

	ASSERT(MUTEX_HELD(&vc->vc_lock));

	if (zfs_vdev_cache_size == 0)
		return (NULL);

	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT(ve->ve_hits != 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = ddi_get_lbolt();
	ve->ve_data = zio_buf_alloc(VCBS);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}

static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);

	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT(ve->ve_fill_io == NULL);

	if (ve->ve_lastused != ddi_get_lbolt()) {
		avl_remove(&vc->vc_lastused_tree, ve);
		ve->ve_lastused = ddi_get_lbolt();
		avl_add(&vc->vc_lastused_tree, ve);
	}

	ve->ve_hits++;
	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
}

/*
 * Fill a previously allocated cache entry with data.
 */
static void
vdev_cache_fill(zio_t *fio)
{
	vdev_t *vd = fio->io_vd;
	vdev_cache_t *vc = &vd->vdev_cache;
	vdev_cache_entry_t *ve = fio->io_private;
	zio_t *pio;

	ASSERT(fio->io_size == VCBS);

	/*
	 * Add data to the cache.
	 */
	mutex_enter(&vc->vc_lock);

	ASSERT(ve->ve_fill_io == fio);
	ASSERT(ve->ve_offset == fio->io_offset);
	ASSERT(ve->ve_data == fio->io_data);

	ve->ve_fill_io = NULL;

	/*
	 * Even if this cache line was invalidated by a missed write update,
	 * any reads that were queued up before the missed update are still
	 * valid, so we can satisfy them from this line before we evict it.
	 */
	while ((pio = zio_walk_parents(fio)) != NULL)
		vdev_cache_hit(vc, ve, pio);

	if (fio->io_error || ve->ve_missed_update)
		vdev_cache_evict(vc, ve);

	mutex_exit(&vc->vc_lock);
}

/*
 * Read data from the cache. Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, *ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);)
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve_search->ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, ve_search, NULL);
	kmem_free(ve_search, sizeof (vdev_cache_entry_t));

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio_vdev_io_bypass(zio);
			zio_add_child(zio, fio);
			mutex_exit(&vc->vc_lock);
			VDCSTAT_BUMP(vdc_stat_delegations);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_hits);
		return (0);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	zio_vdev_io_bypass(zio);
	zio_add_child(zio, fio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);
	VDCSTAT_BUMP(vdc_stat_misses);

	return (0);
}

/*
 * Update cache contents upon write completion.
 */
void
vdev_cache_write(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t io_start = zio->io_offset;
	uint64_t io_end = io_start + zio->io_size;
	uint64_t min_offset = P2ALIGN(io_start, VCBS);
	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
	avl_index_t where;

	ASSERT(zio->io_type == ZIO_TYPE_WRITE);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = min_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);

	if (ve == NULL)
		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);

	while (ve != NULL && ve->ve_offset < max_offset) {
		uint64_t start = MAX(ve->ve_offset, io_start);
		uint64_t end = MIN(ve->ve_offset + VCBS, io_end);

		if (ve->ve_fill_io != NULL) {
			ve->ve_missed_update = 1;
		} else {
			bcopy((char *)zio->io_data + start - io_start,
			    ve->ve_data + start - ve->ve_offset, end - start);
		}
		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
	}
	mutex_exit(&vc->vc_lock);
}

void
vdev_cache_purge(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;
	vdev_cache_entry_t *ve;

	mutex_enter(&vc->vc_lock);
	while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
		vdev_cache_evict(vc, ve);
	mutex_exit(&vc->vc_lock);
}

void
vdev_cache_init(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;

	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
	    sizeof (vdev_cache_entry_t),
	    offsetof(struct vdev_cache_entry, ve_offset_node));

	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
	    sizeof (vdev_cache_entry_t),
	    offsetof(struct vdev_cache_entry, ve_lastused_node));
}

void
vdev_cache_fini(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;

	vdev_cache_purge(vd);

	avl_destroy(&vc->vc_offset_tree);
	avl_destroy(&vc->vc_lastused_tree);

	mutex_destroy(&vc->vc_lock);
}

void
vdev_cache_stat_init(void)
{
	vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (vdc_ksp != NULL) {
		vdc_ksp->ks_data = &vdc_stats;
		kstat_install(vdc_ksp);
	}
}

void
vdev_cache_stat_fini(void)
{
	if (vdc_ksp != NULL) {
		kstat_delete(vdc_ksp);
		vdc_ksp = NULL;
	}
}

#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_cache_max, int, 0644);
MODULE_PARM_DESC(zfs_vdev_cache_max, "Inflate reads smaller than max");

module_param(zfs_vdev_cache_size, int, 0444);
MODULE_PARM_DESC(zfs_vdev_cache_size, "Total size of the per-disk cache");

module_param(zfs_vdev_cache_bshift, int, 0644);
MODULE_PARM_DESC(zfs_vdev_cache_bshift, "Shift size to inflate reads to");
#endif