/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/kstat.h>
#include <sys/abd.h>

/*
 * Virtual device read-ahead caching.
 *
 * This file implements a simple LRU read-ahead cache.  When the DMU reads
 * a given block, it will often want other, nearby blocks soon thereafter.
 * We take advantage of this by reading a larger disk region and caching
 * the result.  In the best case, this can turn 128 back-to-back 512-byte
 * reads into a single 64k read followed by 127 cache hits; this reduces
 * latency dramatically.  In the worst case, it can turn an isolated 512-byte
 * read into a 64k read, which doesn't affect latency all that much but is
 * terribly wasteful of bandwidth.  A more intelligent version of the cache
 * could keep track of access patterns and not do read-ahead unless it sees
 * at least two temporally close I/Os to the same region.  Currently, only
 * metadata I/O is inflated.  A further enhancement could take advantage of
 * more semantic information about the I/O.  And it could use something
 * faster than an AVL tree; that was chosen solely for convenience.
 *
 * There are five cache operations: allocate, fill, read, write, evict.
 *
 * (1) Allocate.  This reserves a cache entry for the specified region.
 *     We separate the allocate and fill operations so that multiple threads
 *     don't generate I/O for the same cache miss.
 *
 * (2) Fill.  When the I/O for a cache miss completes, the fill routine
 *     places the data in the previously allocated cache entry.
 *
 * (3) Read.  Read data from the cache.
 *
 * (4) Write.  Update cache contents after write completion.
 *
 * (5) Evict.  When allocating a new entry, we evict the oldest (LRU) entry
 *     if the total cache size exceeds zfs_vdev_cache_size.
 */
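
/*
 * All cache state -- the two AVL trees and the entries they index -- is
 * protected by the per-vdev vc_lock mutex; the MUTEX_HELD() assertions
 * in the routines below document this invariant.
 */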

/*
 * These tunables are for performance analysis.
 */
/*
 * All i/os smaller than zfs_vdev_cache_max will be turned into
 * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
 * track buffer).  At most zfs_vdev_cache_size bytes will be kept in each
 * vdev's vdev_cache.
 *
 * TODO: Note that with the current ZFS code, it turns out that the
 * vdev cache is not helpful, and in some cases actually harmful.  It
 * is better if we disable this.  Once some time has passed, we should
 * actually remove this to simplify the code.  For now we just disable
 * it by setting the zfs_vdev_cache_size to zero.  Note that Solaris 11
 * has made these same changes.
 */
int zfs_vdev_cache_max = 1<<14;			/* 16KB */
int zfs_vdev_cache_size = 0;
int zfs_vdev_cache_bshift = 16;

#define	VCBS (1 << zfs_vdev_cache_bshift)	/* 64KB */
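
/*
 * Worked example (hypothetical offsets, for illustration only): with the
 * default zfs_vdev_cache_bshift of 16, VCBS is 64KB.  If the cache were
 * enabled (zfs_vdev_cache_size > 0), a 512-byte read at offset 0x12345600
 * would map to:
 *
 *	cache_offset = P2ALIGN(0x12345600, VCBS) = 0x12340000
 *	cache_phase  = P2PHASE(0x12345600, VCBS) = 0x5600
 *
 * i.e. one inflated 64KB read covering [0x12340000, 0x12350000), with the
 * caller's 512 bytes copied out starting at phase 0x5600.
 */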

kstat_t *vdc_ksp = NULL;

typedef struct vdc_stats {
	kstat_named_t vdc_stat_delegations;
	kstat_named_t vdc_stat_hits;
	kstat_named_t vdc_stat_misses;
} vdc_stats_t;

static vdc_stats_t vdc_stats = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "hits",		KSTAT_DATA_UINT64 },
	{ "misses",		KSTAT_DATA_UINT64 }
};

#define	VDCSTAT_BUMP(stat)	atomic_inc_64(&vdc_stats.stat.value.ui64);
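
/*
 * These counters are exported through the "vdev_cache_stats" kstat
 * registered in vdev_cache_stat_init() below; on Linux builds they
 * typically appear under /proc/spl/kstat/zfs/vdev_cache_stats.
 */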

static inline int
vdev_cache_offset_compare(const void *a1, const void *a2)
{
	const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
	const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;

	return (AVL_CMP(ve1->ve_offset, ve2->ve_offset));
}

static int
vdev_cache_lastused_compare(const void *a1, const void *a2)
{
	const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
	const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;

	int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused);
	if (likely(cmp))
		return (cmp);

	/*
	 * Among equally old entries, sort by offset to ensure uniqueness.
	 */
	return (vdev_cache_offset_compare(a1, a2));
}

/*
 * Evict the specified entry from the cache.
 */
static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{
	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT3P(ve->ve_fill_io, ==, NULL);
	ASSERT3P(ve->ve_abd, !=, NULL);

	avl_remove(&vc->vc_lastused_tree, ve);
	avl_remove(&vc->vc_offset_tree, ve);
	abd_free(ve->ve_abd);
	kmem_free(ve, sizeof (vdev_cache_entry_t));
}

/*
 * Allocate an entry in the cache.  At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t *ve;

	ASSERT(MUTEX_HELD(&vc->vc_lock));

	if (zfs_vdev_cache_size == 0)
		return (NULL);

	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT3U(ve->ve_hits, !=, 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = ddi_get_lbolt();
	ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}
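
/*
 * Satisfy a read zio from an existing cache entry: bump the entry to the
 * most-recently-used position in the LRU tree, count the hit, and copy
 * the requested range out of the cached line.  Callers hold vc_lock and
 * guarantee no fill is still in flight for this entry.
 */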
static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);

	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT3P(ve->ve_fill_io, ==, NULL);

	if (ve->ve_lastused != ddi_get_lbolt()) {
		avl_remove(&vc->vc_lastused_tree, ve);
		ve->ve_lastused = ddi_get_lbolt();
		avl_add(&vc->vc_lastused_tree, ve);
	}

	ve->ve_hits++;
	abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
}

/*
 * Fill a previously allocated cache entry with data.
 */
static void
vdev_cache_fill(zio_t *fio)
{
	vdev_t *vd = fio->io_vd;
	vdev_cache_t *vc = &vd->vdev_cache;
	vdev_cache_entry_t *ve = fio->io_private;
	zio_t *pio;
	zio_link_t *zl;

	ASSERT3U(fio->io_size, ==, VCBS);

	/*
	 * Add data to the cache.
	 */
	mutex_enter(&vc->vc_lock);

	ASSERT3P(ve->ve_fill_io, ==, fio);
	ASSERT3U(ve->ve_offset, ==, fio->io_offset);
	ASSERT3P(ve->ve_abd, ==, fio->io_abd);

	ve->ve_fill_io = NULL;

	/*
	 * Even if this cache line was invalidated by a missed write update,
	 * any reads that were queued up before the missed update are still
	 * valid, so we can satisfy them from this line before we evict it.
	 */
	zl = NULL;
	while ((pio = zio_walk_parents(fio, &zl)) != NULL)
		vdev_cache_hit(vc, ve, pio);

	if (fio->io_error || ve->ve_missed_update)
		vdev_cache_evict(vc, ve);

	mutex_exit(&vc->vc_lock);
}

/*
 * Read data from the cache.  Returns B_TRUE on cache hit, B_FALSE on miss.
 */
boolean_t
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, *ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	zio_t *fio;
	ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS));

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (B_FALSE);

	if (zio->io_size > zfs_vdev_cache_max)
		return (B_FALSE);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (B_FALSE);

	ASSERT3U(cache_phase + zio->io_size, <=, VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve_search->ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, ve_search, NULL);
	kmem_free(ve_search, sizeof (vdev_cache_entry_t));

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (B_FALSE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio_vdev_io_bypass(zio);
			zio_add_child(zio, fio);
			mutex_exit(&vc->vc_lock);
			VDCSTAT_BUMP(vdc_stat_delegations);
			return (B_TRUE);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_hits);
		return (B_TRUE);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (B_FALSE);
	}

	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
	    ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	zio_vdev_io_bypass(zio);
	zio_add_child(zio, fio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);
	VDCSTAT_BUMP(vdc_stat_misses);

	return (B_TRUE);
}

/*
 * Update cache contents upon write completion.
 */
void
vdev_cache_write(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t io_start = zio->io_offset;
	uint64_t io_end = io_start + zio->io_size;
	uint64_t min_offset = P2ALIGN(io_start, VCBS);
	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
	avl_index_t where;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = min_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);

	if (ve == NULL)
		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
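
	/*
	 * Walk every cache line overlapping [io_start, io_end).  Lines
	 * with a fill still in flight are flagged as missed updates and
	 * evicted when the fill completes; idle lines are patched in
	 * place with the overlapping portion of the new data.
	 */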
	while (ve != NULL && ve->ve_offset < max_offset) {
		uint64_t start = MAX(ve->ve_offset, io_start);
		uint64_t end = MIN(ve->ve_offset + VCBS, io_end);

		if (ve->ve_fill_io != NULL) {
			ve->ve_missed_update = 1;
		} else {
			abd_copy_off(ve->ve_abd, zio->io_abd,
			    start - ve->ve_offset, start - io_start,
			    end - start);
		}
		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
	}
	mutex_exit(&vc->vc_lock);
}
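
/*
 * Empty the cache by evicting every entry; called when a vdev is torn
 * down (see vdev_cache_fini() below).
 */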
void
vdev_cache_purge(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;
	vdev_cache_entry_t *ve;

	mutex_enter(&vc->vc_lock);
	while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
		vdev_cache_evict(vc, ve);
	mutex_exit(&vc->vc_lock);
}
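
/*
 * Per-vdev constructor: initialize the lock and the two AVL indexes,
 * one ordered by offset for lookups and one ordered by last use for
 * LRU eviction.
 */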
void
vdev_cache_init(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;

	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
	    sizeof (vdev_cache_entry_t),
	    offsetof(struct vdev_cache_entry, ve_offset_node));

	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
	    sizeof (vdev_cache_entry_t),
	    offsetof(struct vdev_cache_entry, ve_lastused_node));
}
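
/*
 * Per-vdev destructor: purge any remaining entries, then tear down the
 * trees and the lock created in vdev_cache_init().
 */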
void
vdev_cache_fini(vdev_t *vd)
{
	vdev_cache_t *vc = &vd->vdev_cache;

	vdev_cache_purge(vd);

	avl_destroy(&vc->vc_offset_tree);
	avl_destroy(&vc->vc_lastused_tree);

	mutex_destroy(&vc->vc_lock);
}
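
/*
 * Create and install the named kstat that exports the vdc_stats
 * counters; undone in vdev_cache_stat_fini().
 */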
void
vdev_cache_stat_init(void)
{
	vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (vdc_ksp != NULL) {
		vdc_ksp->ks_data = &vdc_stats;
		kstat_install(vdc_ksp);
	}
}

void
vdev_cache_stat_fini(void)
{
	if (vdc_ksp != NULL) {
		kstat_delete(vdc_ksp);
		vdc_ksp = NULL;
	}
}

#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_cache_max, int, 0644);
MODULE_PARM_DESC(zfs_vdev_cache_max, "Inflate reads smaller than max");

module_param(zfs_vdev_cache_size, int, 0444);
MODULE_PARM_DESC(zfs_vdev_cache_size, "Total size of the per-disk cache");

module_param(zfs_vdev_cache_bshift, int, 0644);
MODULE_PARM_DESC(zfs_vdev_cache_bshift, "Shift size to inflate reads to");
#endif